1c030f2e4Sxinhui pan /*
2c030f2e4Sxinhui pan  * Copyright 2018 Advanced Micro Devices, Inc.
3c030f2e4Sxinhui pan  *
4c030f2e4Sxinhui pan  * Permission is hereby granted, free of charge, to any person obtaining a
5c030f2e4Sxinhui pan  * copy of this software and associated documentation files (the "Software"),
6c030f2e4Sxinhui pan  * to deal in the Software without restriction, including without limitation
7c030f2e4Sxinhui pan  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8c030f2e4Sxinhui pan  * and/or sell copies of the Software, and to permit persons to whom the
9c030f2e4Sxinhui pan  * Software is furnished to do so, subject to the following conditions:
10c030f2e4Sxinhui pan  *
11c030f2e4Sxinhui pan  * The above copyright notice and this permission notice shall be included in
12c030f2e4Sxinhui pan  * all copies or substantial portions of the Software.
13c030f2e4Sxinhui pan  *
14c030f2e4Sxinhui pan  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15c030f2e4Sxinhui pan  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16c030f2e4Sxinhui pan  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17c030f2e4Sxinhui pan  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18c030f2e4Sxinhui pan  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19c030f2e4Sxinhui pan  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20c030f2e4Sxinhui pan  * OTHER DEALINGS IN THE SOFTWARE.
21c030f2e4Sxinhui pan  *
22c030f2e4Sxinhui pan  *
23c030f2e4Sxinhui pan  */
24c030f2e4Sxinhui pan #include <linux/debugfs.h>
25c030f2e4Sxinhui pan #include <linux/list.h>
26c030f2e4Sxinhui pan #include <linux/module.h>
27f867723bSSam Ravnborg #include <linux/uaccess.h>
287c6e68c7SAndrey Grodzovsky #include <linux/reboot.h>
297c6e68c7SAndrey Grodzovsky #include <linux/syscalls.h>
3005adfd80SLuben Tuikov #include <linux/pm_runtime.h>
31dbf3850dSYang Wang #include <linux/list_sort.h>
32f867723bSSam Ravnborg 
33c030f2e4Sxinhui pan #include "amdgpu.h"
34c030f2e4Sxinhui pan #include "amdgpu_ras.h"
35b404ae82Sxinhui pan #include "amdgpu_atomfirmware.h"
3619744f5fSHawking Zhang #include "amdgpu_xgmi.h"
374e644fffSHawking Zhang #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
389af357bcSHawking Zhang #include "nbio_v4_3.h"
39ecd1191eSCandice Li #include "nbif_v6_3_1.h"
407692e1eeSTao Zhou #include "nbio_v7_9.h"
41f50160cfSStanley.Yang #include "atom.h"
4225a2b22eSAndrey Grodzovsky #include "amdgpu_reset.h"
434e2965bdSHawking Zhang #include "amdgpu_psp.h"
4425a2b22eSAndrey Grodzovsky 
4512b2cab7SMukul Joshi #ifdef CONFIG_X86_MCE_AMD
4612b2cab7SMukul Joshi #include <asm/mce.h>
47c030f2e4Sxinhui pan 
4812b2cab7SMukul Joshi static bool notifier_registered;
4912b2cab7SMukul Joshi #endif
50eb0c3cd4SGuchun Chen static const char *RAS_FS_NAME = "ras";
51eb0c3cd4SGuchun Chen 
52c030f2e4Sxinhui pan const char *ras_error_string[] = {
53c030f2e4Sxinhui pan 	"none",
54c030f2e4Sxinhui pan 	"parity",
55c030f2e4Sxinhui pan 	"single_correctable",
56c030f2e4Sxinhui pan 	"multi_uncorrectable",
57c030f2e4Sxinhui pan 	"poison",
58c030f2e4Sxinhui pan };
59c030f2e4Sxinhui pan 
60c030f2e4Sxinhui pan const char *ras_block_string[] = {
61c030f2e4Sxinhui pan 	"umc",
62c030f2e4Sxinhui pan 	"sdma",
63c030f2e4Sxinhui pan 	"gfx",
64c030f2e4Sxinhui pan 	"mmhub",
65c030f2e4Sxinhui pan 	"athub",
66c030f2e4Sxinhui pan 	"pcie_bif",
67c030f2e4Sxinhui pan 	"hdp",
68c030f2e4Sxinhui pan 	"xgmi_wafl",
69c030f2e4Sxinhui pan 	"df",
70c030f2e4Sxinhui pan 	"smn",
71c030f2e4Sxinhui pan 	"sem",
72c030f2e4Sxinhui pan 	"mp0",
73c030f2e4Sxinhui pan 	"mp1",
74c030f2e4Sxinhui pan 	"fuse",
75640ae42eSJohn Clements 	"mca",
76a3d63c62SMohammad Zafar Ziya 	"vcn",
77a3d63c62SMohammad Zafar Ziya 	"jpeg",
787ed97155SYang Wang 	"ih",
797ed97155SYang Wang 	"mpio",
80*cc11dffcSStanley.Yang 	"mmsch",
81c030f2e4Sxinhui pan };
82c030f2e4Sxinhui pan 
83640ae42eSJohn Clements const char *ras_mca_block_string[] = {
84640ae42eSJohn Clements 	"mca_mp0",
85640ae42eSJohn Clements 	"mca_mp1",
86640ae42eSJohn Clements 	"mca_mpio",
87640ae42eSJohn Clements 	"mca_iohc",
88640ae42eSJohn Clements };
89640ae42eSJohn Clements 
90d5e8ff5fSyipechai struct amdgpu_ras_block_list {
91d5e8ff5fSyipechai 	/* ras block link */
92d5e8ff5fSyipechai 	struct list_head node;
93d5e8ff5fSyipechai 
94d5e8ff5fSyipechai 	struct amdgpu_ras_block_object *ras_obj;
95d5e8ff5fSyipechai };
96d5e8ff5fSyipechai 
get_ras_block_str(struct ras_common_if * ras_block)97640ae42eSJohn Clements const char *get_ras_block_str(struct ras_common_if *ras_block)
98640ae42eSJohn Clements {
99640ae42eSJohn Clements 	if (!ras_block)
100640ae42eSJohn Clements 		return "NULL";
101640ae42eSJohn Clements 
1027ed97155SYang Wang 	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT ||
1037ed97155SYang Wang 	    ras_block->block >= ARRAY_SIZE(ras_block_string))
104640ae42eSJohn Clements 		return "OUT OF RANGE";
105640ae42eSJohn Clements 
106640ae42eSJohn Clements 	if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
107640ae42eSJohn Clements 		return ras_mca_block_string[ras_block->sub_block_index];
108640ae42eSJohn Clements 
109640ae42eSJohn Clements 	return ras_block_string[ras_block->block];
110640ae42eSJohn Clements }
111640ae42eSJohn Clements 
112954ea6aaSyipechai #define ras_block_str(_BLOCK_) \
113954ea6aaSyipechai 	(((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")
1148b0fb0e9Syipechai 
115c030f2e4Sxinhui pan #define ras_err_str(i) (ras_error_string[ffs(i)])
116c030f2e4Sxinhui pan 
117108c6a63Sxinhui pan #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
118108c6a63Sxinhui pan 
1197cdc2ee3STao Zhou /* inject address is 52 bits */
1207cdc2ee3STao Zhou #define	RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)
1217cdc2ee3STao Zhou 
122e4e6a589SLuben Tuikov /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
123e4e6a589SLuben Tuikov #define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)
124c84d4670SGuchun Chen 
12578146c1dSYiPeng Chai #define MAX_UMC_POISON_POLLING_TIME_ASYNC  300  //ms
1266c23f3d1SYiPeng Chai 
1272cf8e50eSYiPeng Chai #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
1282cf8e50eSYiPeng Chai 
129c0470691SYiPeng Chai #define MAX_FLUSH_RETIRE_DWORK_TIMES  100
130c0470691SYiPeng Chai 
13152dd95f2SGuchun Chen enum amdgpu_ras_retire_page_reservation {
13252dd95f2SGuchun Chen 	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
13352dd95f2SGuchun Chen 	AMDGPU_RAS_RETIRE_PAGE_PENDING,
13452dd95f2SGuchun Chen 	AMDGPU_RAS_RETIRE_PAGE_FAULT,
13552dd95f2SGuchun Chen };
1367c6e68c7SAndrey Grodzovsky 
1377c6e68c7SAndrey Grodzovsky atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
1387c6e68c7SAndrey Grodzovsky 
139676deb38SDennis Li static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
140676deb38SDennis Li 				uint64_t addr);
1416e4be987STao Zhou static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
1426e4be987STao Zhou 				uint64_t addr);
14312b2cab7SMukul Joshi #ifdef CONFIG_X86_MCE_AMD
14491a1a52dSMukul Joshi static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
14591a1a52dSMukul Joshi struct mce_notifier_adev_list {
14691a1a52dSMukul Joshi 	struct amdgpu_device *devs[MAX_GPU_INSTANCE];
14791a1a52dSMukul Joshi 	int num_gpu;
14891a1a52dSMukul Joshi };
14991a1a52dSMukul Joshi static struct mce_notifier_adev_list mce_adev_list;
15012b2cab7SMukul Joshi #endif
1516e4be987STao Zhou 
amdgpu_ras_set_error_query_ready(struct amdgpu_device * adev,bool ready)15261380faaSJohn Clements void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
15361380faaSJohn Clements {
154a9d82d2fSEvan Quan 	if (adev && amdgpu_ras_get_context(adev))
15561380faaSJohn Clements 		amdgpu_ras_get_context(adev)->error_query_ready = ready;
15661380faaSJohn Clements }
15761380faaSJohn Clements 
amdgpu_ras_get_error_query_ready(struct amdgpu_device * adev)158f3167919SNirmoy Das static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
15961380faaSJohn Clements {
160a9d82d2fSEvan Quan 	if (adev && amdgpu_ras_get_context(adev))
16161380faaSJohn Clements 		return amdgpu_ras_get_context(adev)->error_query_ready;
16261380faaSJohn Clements 
16361380faaSJohn Clements 	return false;
16461380faaSJohn Clements }
16561380faaSJohn Clements 
amdgpu_reserve_page_direct(struct amdgpu_device * adev,uint64_t address)166cbb8f989SJohn Clements static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
167cbb8f989SJohn Clements {
1685b1270beSYang Wang 	struct ras_err_data err_data;
169cbb8f989SJohn Clements 	struct eeprom_table_record err_rec;
1705b1270beSYang Wang 	int ret;
171cbb8f989SJohn Clements 
172cbb8f989SJohn Clements 	if ((address >= adev->gmc.mc_vram_size) ||
173cbb8f989SJohn Clements 	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
174cbb8f989SJohn Clements 		dev_warn(adev->dev,
175cbb8f989SJohn Clements 		         "RAS WARN: input address 0x%llx is invalid.\n",
176cbb8f989SJohn Clements 		         address);
177cbb8f989SJohn Clements 		return -EINVAL;
178cbb8f989SJohn Clements 	}
179cbb8f989SJohn Clements 
180cbb8f989SJohn Clements 	if (amdgpu_ras_check_bad_page(adev, address)) {
181cbb8f989SJohn Clements 		dev_warn(adev->dev,
18280b0cd0fSLuben Tuikov 			 "RAS WARN: 0x%llx has already been marked as bad page!\n",
183cbb8f989SJohn Clements 			 address);
184cbb8f989SJohn Clements 		return 0;
185cbb8f989SJohn Clements 	}
186cbb8f989SJohn Clements 
1875b1270beSYang Wang 	ret = amdgpu_ras_error_data_init(&err_data);
1885b1270beSYang Wang 	if (ret)
1895b1270beSYang Wang 		return ret;
1905b1270beSYang Wang 
191cbb8f989SJohn Clements 	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
192cbb8f989SJohn Clements 	err_data.err_addr = &err_rec;
19371344a71SLuben Tuikov 	amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);
194cbb8f989SJohn Clements 
195cbb8f989SJohn Clements 	if (amdgpu_bad_page_threshold != 0) {
196cbb8f989SJohn Clements 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
197a8d133e6STao Zhou 					 err_data.err_addr_cnt, false);
1984d33e0f1STao Zhou 		amdgpu_ras_save_bad_pages(adev, NULL);
199cbb8f989SJohn Clements 	}
200cbb8f989SJohn Clements 
2015b1270beSYang Wang 	amdgpu_ras_error_data_fini(&err_data);
2025b1270beSYang Wang 
203cbb8f989SJohn Clements 	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
204cbb8f989SJohn Clements 	dev_warn(adev->dev, "Clear EEPROM:\n");
205cbb8f989SJohn Clements 	dev_warn(adev->dev, "    echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
206cbb8f989SJohn Clements 
207cbb8f989SJohn Clements 	return 0;
208cbb8f989SJohn Clements }
209cbb8f989SJohn Clements 
amdgpu_ras_debugfs_read(struct file * f,char __user * buf,size_t size,loff_t * pos)210c030f2e4Sxinhui pan static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
211c030f2e4Sxinhui pan 					size_t size, loff_t *pos)
212c030f2e4Sxinhui pan {
213c030f2e4Sxinhui pan 	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
214c030f2e4Sxinhui pan 	struct ras_query_if info = {
215c030f2e4Sxinhui pan 		.head = obj->head,
216c030f2e4Sxinhui pan 	};
217c030f2e4Sxinhui pan 	ssize_t s;
218c030f2e4Sxinhui pan 	char val[128];
219c030f2e4Sxinhui pan 
220761d86d3SDennis Li 	if (amdgpu_ras_query_error_status(obj->adev, &info))
221c030f2e4Sxinhui pan 		return -EINVAL;
222c030f2e4Sxinhui pan 
2232a460963SCandice Li 	/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
2244e8303cfSLijo Lazar 	if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
2254e8303cfSLijo Lazar 	    amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
2262a460963SCandice Li 		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
2272a460963SCandice Li 			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
2282a460963SCandice Li 	}
2292a460963SCandice Li 
230c030f2e4Sxinhui pan 	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
231c030f2e4Sxinhui pan 			"ue", info.ue_count,
232c030f2e4Sxinhui pan 			"ce", info.ce_count);
233c030f2e4Sxinhui pan 	if (*pos >= s)
234c030f2e4Sxinhui pan 		return 0;
235c030f2e4Sxinhui pan 
236c030f2e4Sxinhui pan 	s -= *pos;
237c030f2e4Sxinhui pan 	s = min_t(u64, s, size);
238c030f2e4Sxinhui pan 
239c030f2e4Sxinhui pan 
240c030f2e4Sxinhui pan 	if (copy_to_user(buf, &val[*pos], s))
241c030f2e4Sxinhui pan 		return -EINVAL;
242c030f2e4Sxinhui pan 
243c030f2e4Sxinhui pan 	*pos += s;
244c030f2e4Sxinhui pan 
245c030f2e4Sxinhui pan 	return s;
246c030f2e4Sxinhui pan }
247c030f2e4Sxinhui pan 
248c030f2e4Sxinhui pan static const struct file_operations amdgpu_ras_debugfs_ops = {
249c030f2e4Sxinhui pan 	.owner = THIS_MODULE,
250c030f2e4Sxinhui pan 	.read = amdgpu_ras_debugfs_read,
251190211abSxinhui pan 	.write = NULL,
252c030f2e4Sxinhui pan 	.llseek = default_llseek
253c030f2e4Sxinhui pan };
254c030f2e4Sxinhui pan 
amdgpu_ras_find_block_id_by_name(const char * name,int * block_id)25596ebb307Sxinhui pan static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
25696ebb307Sxinhui pan {
25796ebb307Sxinhui pan 	int i;
25896ebb307Sxinhui pan 
25996ebb307Sxinhui pan 	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
26096ebb307Sxinhui pan 		*block_id = i;
261640ae42eSJohn Clements 		if (strcmp(name, ras_block_string[i]) == 0)
26296ebb307Sxinhui pan 			return 0;
26396ebb307Sxinhui pan 	}
26496ebb307Sxinhui pan 	return -EINVAL;
26596ebb307Sxinhui pan }
26696ebb307Sxinhui pan 
amdgpu_ras_debugfs_ctrl_parse_data(struct file * f,const char __user * buf,size_t size,loff_t * pos,struct ras_debug_if * data)26796ebb307Sxinhui pan static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
26896ebb307Sxinhui pan 		const char __user *buf, size_t size,
26996ebb307Sxinhui pan 		loff_t *pos, struct ras_debug_if *data)
27096ebb307Sxinhui pan {
27196ebb307Sxinhui pan 	ssize_t s = min_t(u64, 64, size);
27296ebb307Sxinhui pan 	char str[65];
27396ebb307Sxinhui pan 	char block_name[33];
27496ebb307Sxinhui pan 	char err[9] = "ue";
27596ebb307Sxinhui pan 	int op = -1;
27696ebb307Sxinhui pan 	int block_id;
27744494f96STao Zhou 	uint32_t sub_block;
27896ebb307Sxinhui pan 	u64 address, value;
2792c22ed0bSTao Zhou 	/* default value is 0 if the mask is not set by user */
2802c22ed0bSTao Zhou 	u32 instance_mask = 0;
28196ebb307Sxinhui pan 
28296ebb307Sxinhui pan 	if (*pos)
28396ebb307Sxinhui pan 		return -EINVAL;
28496ebb307Sxinhui pan 	*pos = size;
28596ebb307Sxinhui pan 
28696ebb307Sxinhui pan 	memset(str, 0, sizeof(str));
28796ebb307Sxinhui pan 	memset(data, 0, sizeof(*data));
28896ebb307Sxinhui pan 
28996ebb307Sxinhui pan 	if (copy_from_user(str, buf, s))
29096ebb307Sxinhui pan 		return -EINVAL;
29196ebb307Sxinhui pan 
29296ebb307Sxinhui pan 	if (sscanf(str, "disable %32s", block_name) == 1)
29396ebb307Sxinhui pan 		op = 0;
29496ebb307Sxinhui pan 	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
29596ebb307Sxinhui pan 		op = 1;
29696ebb307Sxinhui pan 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
29796ebb307Sxinhui pan 		op = 2;
2986df23f4cSDennis Li 	else if (strstr(str, "retire_page") != NULL)
299cbb8f989SJohn Clements 		op = 3;
300b076296bSxinhui pan 	else if (str[0] && str[1] && str[2] && str[3])
30196ebb307Sxinhui pan 		/* ascii string, but commands are not matched. */
30296ebb307Sxinhui pan 		return -EINVAL;
30396ebb307Sxinhui pan 
30496ebb307Sxinhui pan 	if (op != -1) {
305cbb8f989SJohn Clements 		if (op == 3) {
306546aa546SLuben Tuikov 			if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
307546aa546SLuben Tuikov 			    sscanf(str, "%*s %llu", &address) != 1)
308cbb8f989SJohn Clements 				return -EINVAL;
309cbb8f989SJohn Clements 
310cbb8f989SJohn Clements 			data->op = op;
311cbb8f989SJohn Clements 			data->inject.address = address;
312cbb8f989SJohn Clements 
313cbb8f989SJohn Clements 			return 0;
314cbb8f989SJohn Clements 		}
315cbb8f989SJohn Clements 
31696ebb307Sxinhui pan 		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
31796ebb307Sxinhui pan 			return -EINVAL;
31896ebb307Sxinhui pan 
31996ebb307Sxinhui pan 		data->head.block = block_id;
320fb1e9171SCandice Li 		/* only ue, ce and poison errors are supported */
321e1063493STao Zhou 		if (!memcmp("ue", err, 2))
322e1063493STao Zhou 			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
323e1063493STao Zhou 		else if (!memcmp("ce", err, 2))
324e1063493STao Zhou 			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
325fb1e9171SCandice Li 		else if (!memcmp("poison", err, 6))
326fb1e9171SCandice Li 			data->head.type = AMDGPU_RAS_ERROR__POISON;
327e1063493STao Zhou 		else
328e1063493STao Zhou 			return -EINVAL;
329e1063493STao Zhou 
33096ebb307Sxinhui pan 		data->op = op;
33196ebb307Sxinhui pan 
33296ebb307Sxinhui pan 		if (op == 2) {
3332c22ed0bSTao Zhou 			if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
3342c22ed0bSTao Zhou 				   &sub_block, &address, &value, &instance_mask) != 4 &&
3352c22ed0bSTao Zhou 			    sscanf(str, "%*s %*s %*s %u %llu %llu %u",
3362c22ed0bSTao Zhou 				   &sub_block, &address, &value, &instance_mask) != 4 &&
3372c22ed0bSTao Zhou 				sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
338546aa546SLuben Tuikov 				   &sub_block, &address, &value) != 3 &&
339546aa546SLuben Tuikov 			    sscanf(str, "%*s %*s %*s %u %llu %llu",
34044494f96STao Zhou 				   &sub_block, &address, &value) != 3)
34196ebb307Sxinhui pan 				return -EINVAL;
34244494f96STao Zhou 			data->head.sub_block_index = sub_block;
34396ebb307Sxinhui pan 			data->inject.address = address;
34496ebb307Sxinhui pan 			data->inject.value = value;
3452c22ed0bSTao Zhou 			data->inject.instance_mask = instance_mask;
34696ebb307Sxinhui pan 		}
34796ebb307Sxinhui pan 	} else {
34873aa8e1aSxinhui pan 		if (size < sizeof(*data))
34996ebb307Sxinhui pan 			return -EINVAL;
35096ebb307Sxinhui pan 
35196ebb307Sxinhui pan 		if (copy_from_user(data, buf, sizeof(*data)))
35296ebb307Sxinhui pan 			return -EINVAL;
35396ebb307Sxinhui pan 	}
35496ebb307Sxinhui pan 
35596ebb307Sxinhui pan 	return 0;
35696ebb307Sxinhui pan }
3577c6e68c7SAndrey Grodzovsky 
amdgpu_ras_instance_mask_check(struct amdgpu_device * adev,struct ras_debug_if * data)358f464c5ddSTao Zhou static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev,
359f464c5ddSTao Zhou 				struct ras_debug_if *data)
360f464c5ddSTao Zhou {
361f464c5ddSTao Zhou 	int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
362f464c5ddSTao Zhou 	uint32_t mask, inst_mask = data->inject.instance_mask;
363f464c5ddSTao Zhou 
364f464c5ddSTao Zhou 	/* no need to set instance mask if there is only one instance */
365f464c5ddSTao Zhou 	if (num_xcc <= 1 && inst_mask) {
366f464c5ddSTao Zhou 		data->inject.instance_mask = 0;
367f464c5ddSTao Zhou 		dev_dbg(adev->dev,
368f464c5ddSTao Zhou 			"RAS inject mask(0x%x) isn't supported and force it to 0.\n",
369f464c5ddSTao Zhou 			inst_mask);
370f464c5ddSTao Zhou 
371f464c5ddSTao Zhou 		return;
372f464c5ddSTao Zhou 	}
373f464c5ddSTao Zhou 
374f464c5ddSTao Zhou 	switch (data->head.block) {
375f464c5ddSTao Zhou 	case AMDGPU_RAS_BLOCK__GFX:
376f464c5ddSTao Zhou 		mask = GENMASK(num_xcc - 1, 0);
377f464c5ddSTao Zhou 		break;
378f464c5ddSTao Zhou 	case AMDGPU_RAS_BLOCK__SDMA:
379f464c5ddSTao Zhou 		mask = GENMASK(adev->sdma.num_instances - 1, 0);
380f464c5ddSTao Zhou 		break;
381e3959cb5SStanley.Yang 	case AMDGPU_RAS_BLOCK__VCN:
382e3959cb5SStanley.Yang 	case AMDGPU_RAS_BLOCK__JPEG:
383e3959cb5SStanley.Yang 		mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0);
384e3959cb5SStanley.Yang 		break;
385f464c5ddSTao Zhou 	default:
386e3959cb5SStanley.Yang 		mask = inst_mask;
387f464c5ddSTao Zhou 		break;
388f464c5ddSTao Zhou 	}
389f464c5ddSTao Zhou 
390f464c5ddSTao Zhou 	/* remove invalid bits in instance mask */
391f464c5ddSTao Zhou 	data->inject.instance_mask &= mask;
392f464c5ddSTao Zhou 	if (inst_mask != data->inject.instance_mask)
393f464c5ddSTao Zhou 		dev_dbg(adev->dev,
394f464c5ddSTao Zhou 			"Adjust RAS inject mask 0x%x to 0x%x\n",
395f464c5ddSTao Zhou 			inst_mask, data->inject.instance_mask);
396f464c5ddSTao Zhou }
397f464c5ddSTao Zhou 
39874abc221STom St Denis /**
39974abc221STom St Denis  * DOC: AMDGPU RAS debugfs control interface
40036ea1bd2Sxinhui pan  *
401737c375bSLuben Tuikov  * The control interface accepts struct ras_debug_if which has two members.
40236ea1bd2Sxinhui pan  *
40336ea1bd2Sxinhui pan  * First member: ras_debug_if::head or ras_debug_if::inject.
40496ebb307Sxinhui pan  *
40596ebb307Sxinhui pan  * head is used to indicate which IP block will be under control.
40636ea1bd2Sxinhui pan  *
40736ea1bd2Sxinhui pan  * head has four members, they are block, type, sub_block_index, name.
40836ea1bd2Sxinhui pan  * block: which IP will be under control.
40936ea1bd2Sxinhui pan  * type: what kind of error will be enabled/disabled/injected.
41036ea1bd2Sxinhui pan  * sub_block_index: some IPs have subcomponets. say, GFX, sDMA.
41136ea1bd2Sxinhui pan  * name: the name of IP.
41236ea1bd2Sxinhui pan  *
4132c22ed0bSTao Zhou  * inject has three more members than head, they are address, value and mask.
41436ea1bd2Sxinhui pan  * As their names indicate, inject operation will write the
41536ea1bd2Sxinhui pan  * value to the address.
41636ea1bd2Sxinhui pan  *
417ef177d11SAlex Deucher  * The second member: struct ras_debug_if::op.
418c688a06bSGuchun Chen  * It has three kinds of operations.
419879e723dSAdam Zerella  *
420879e723dSAdam Zerella  * - 0: disable RAS on the block. Take ::head as its data.
421879e723dSAdam Zerella  * - 1: enable RAS on the block. Take ::head as its data.
422879e723dSAdam Zerella  * - 2: inject errors on the block. Take ::inject as its data.
42336ea1bd2Sxinhui pan  *
42496ebb307Sxinhui pan  * How to use the interface?
425ef177d11SAlex Deucher  *
426737c375bSLuben Tuikov  * In a program
427ef177d11SAlex Deucher  *
428737c375bSLuben Tuikov  * Copy the struct ras_debug_if in your code and initialize it.
429737c375bSLuben Tuikov  * Write the struct to the control interface.
430ef177d11SAlex Deucher  *
431737c375bSLuben Tuikov  * From shell
43296ebb307Sxinhui pan  *
433879e723dSAdam Zerella  * .. code-block:: bash
434879e723dSAdam Zerella  *
435737c375bSLuben Tuikov  *	echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
436737c375bSLuben Tuikov  *	echo "enable  <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
4372c22ed0bSTao Zhou  *	echo "inject  <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
438879e723dSAdam Zerella  *
439737c375bSLuben Tuikov  * Where N, is the card which you want to affect.
440ef177d11SAlex Deucher  *
441737c375bSLuben Tuikov  * "disable" requires only the block.
442737c375bSLuben Tuikov  * "enable" requires the block and error type.
443737c375bSLuben Tuikov  * "inject" requires the block, error type, address, and value.
444c666bbf0SDwaipayan Ray  *
445737c375bSLuben Tuikov  * The block is one of: umc, sdma, gfx, etc.
44696ebb307Sxinhui pan  *	see ras_block_string[] for details
447c666bbf0SDwaipayan Ray  *
448fb1e9171SCandice Li  * The error type is one of: ue, ce and poison where,
449737c375bSLuben Tuikov  *	ue is multi-uncorrectable
450737c375bSLuben Tuikov  *	ce is single-correctable
451fb1e9171SCandice Li  *	poison is poison
452c666bbf0SDwaipayan Ray  *
453737c375bSLuben Tuikov  * The sub-block is a the sub-block index, pass 0 if there is no sub-block.
454737c375bSLuben Tuikov  * The address and value are hexadecimal numbers, leading 0x is optional.
4552c22ed0bSTao Zhou  * The mask means instance mask, is optional, default value is 0x1.
45696ebb307Sxinhui pan  *
457737c375bSLuben Tuikov  * For instance,
458879e723dSAdam Zerella  *
459879e723dSAdam Zerella  * .. code-block:: bash
460879e723dSAdam Zerella  *
46144494f96STao Zhou  *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
4622c22ed0bSTao Zhou  *	echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
46396ebb307Sxinhui pan  *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
46496ebb307Sxinhui pan  *
465737c375bSLuben Tuikov  * How to check the result of the operation?
46636ea1bd2Sxinhui pan  *
467737c375bSLuben Tuikov  * To check disable/enable, see "ras" features at,
46836ea1bd2Sxinhui pan  * /sys/class/drm/card[0/1/2...]/device/ras/features
46936ea1bd2Sxinhui pan  *
470737c375bSLuben Tuikov  * To check inject, see the corresponding error count at,
471737c375bSLuben Tuikov  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
47236ea1bd2Sxinhui pan  *
473879e723dSAdam Zerella  * .. note::
474ef177d11SAlex Deucher  *	Operations are only allowed on blocks which are supported.
475737c375bSLuben Tuikov  *	Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
476ef177d11SAlex Deucher  *	to see which blocks support RAS on a particular asic.
477ef177d11SAlex Deucher  *
47836ea1bd2Sxinhui pan  */
amdgpu_ras_debugfs_ctrl_write(struct file * f,const char __user * buf,size_t size,loff_t * pos)479cf696091SLuben Tuikov static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
480cf696091SLuben Tuikov 					     const char __user *buf,
48136ea1bd2Sxinhui pan 					     size_t size, loff_t *pos)
48236ea1bd2Sxinhui pan {
48336ea1bd2Sxinhui pan 	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
48436ea1bd2Sxinhui pan 	struct ras_debug_if data;
48536ea1bd2Sxinhui pan 	int ret = 0;
48636ea1bd2Sxinhui pan 
48761380faaSJohn Clements 	if (!amdgpu_ras_get_error_query_ready(adev)) {
4886952e99cSGuchun Chen 		dev_warn(adev->dev, "RAS WARN: error injection "
4896952e99cSGuchun Chen 				"currently inaccessible\n");
49043c4d576SJohn Clements 		return size;
49143c4d576SJohn Clements 	}
49243c4d576SJohn Clements 
49396ebb307Sxinhui pan 	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
49496ebb307Sxinhui pan 	if (ret)
495cf696091SLuben Tuikov 		return ret;
49636ea1bd2Sxinhui pan 
49780b0cd0fSLuben Tuikov 	if (data.op == 3) {
498cbb8f989SJohn Clements 		ret = amdgpu_reserve_page_direct(adev, data.inject.address);
49980b0cd0fSLuben Tuikov 		if (!ret)
500cbb8f989SJohn Clements 			return size;
501cbb8f989SJohn Clements 		else
502cbb8f989SJohn Clements 			return ret;
503cbb8f989SJohn Clements 	}
504cbb8f989SJohn Clements 
50536ea1bd2Sxinhui pan 	if (!amdgpu_ras_is_supported(adev, data.head.block))
50636ea1bd2Sxinhui pan 		return -EINVAL;
50736ea1bd2Sxinhui pan 
50836ea1bd2Sxinhui pan 	switch (data.op) {
50936ea1bd2Sxinhui pan 	case 0:
51036ea1bd2Sxinhui pan 		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
51136ea1bd2Sxinhui pan 		break;
51236ea1bd2Sxinhui pan 	case 1:
51336ea1bd2Sxinhui pan 		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
51436ea1bd2Sxinhui pan 		break;
51536ea1bd2Sxinhui pan 	case 2:
51643aedbf4SStanley.Yang 		if ((data.inject.address >= adev->gmc.mc_vram_size &&
51743aedbf4SStanley.Yang 		    adev->gmc.mc_vram_size) ||
5187cdc2ee3STao Zhou 		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
519b0d4783aSGuchun Chen 			dev_warn(adev->dev, "RAS WARN: input address "
520b0d4783aSGuchun Chen 					"0x%llx is invalid.",
521b0d4783aSGuchun Chen 					data.inject.address);
5227cdc2ee3STao Zhou 			ret = -EINVAL;
523efb426d5Sxinhui pan 			break;
5247cdc2ee3STao Zhou 		}
5257cdc2ee3STao Zhou 
5266e4be987STao Zhou 		/* umc ce/ue error injection for a bad page is not allowed */
5276e4be987STao Zhou 		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
5286e4be987STao Zhou 		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
529c65b0805SLuben Tuikov 			dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
530c65b0805SLuben Tuikov 				 "already been marked as bad!\n",
5316e4be987STao Zhou 				 data.inject.address);
5326e4be987STao Zhou 			break;
5336e4be987STao Zhou 		}
5346e4be987STao Zhou 
535f464c5ddSTao Zhou 		amdgpu_ras_instance_mask_check(adev, &data);
536f464c5ddSTao Zhou 
5377cdc2ee3STao Zhou 		/* data.inject.address is offset instead of absolute gpu address */
53836ea1bd2Sxinhui pan 		ret = amdgpu_ras_error_inject(adev, &data.inject);
53936ea1bd2Sxinhui pan 		break;
54096ebb307Sxinhui pan 	default:
54196ebb307Sxinhui pan 		ret = -EINVAL;
54296ebb307Sxinhui pan 		break;
543374bf7bdSzhengbin 	}
54436ea1bd2Sxinhui pan 
54536ea1bd2Sxinhui pan 	if (ret)
54679c04621SStanley.Yang 		return ret;
54736ea1bd2Sxinhui pan 
54836ea1bd2Sxinhui pan 	return size;
54936ea1bd2Sxinhui pan }
55036ea1bd2Sxinhui pan 
551084fe13bSAndrey Grodzovsky /**
552084fe13bSAndrey Grodzovsky  * DOC: AMDGPU RAS debugfs EEPROM table reset interface
553084fe13bSAndrey Grodzovsky  *
554f77c7109SAlex Deucher  * Some boards contain an EEPROM which is used to persistently store a list of
555ef177d11SAlex Deucher  * bad pages which experiences ECC errors in vram.  This interface provides
556f77c7109SAlex Deucher  * a way to reset the EEPROM, e.g., after testing error injection.
557f77c7109SAlex Deucher  *
558f77c7109SAlex Deucher  * Usage:
559f77c7109SAlex Deucher  *
560f77c7109SAlex Deucher  * .. code-block:: bash
561f77c7109SAlex Deucher  *
562f77c7109SAlex Deucher  *	echo 1 > ../ras/ras_eeprom_reset
563f77c7109SAlex Deucher  *
564f77c7109SAlex Deucher  * will reset EEPROM table to 0 entries.
565f77c7109SAlex Deucher  *
566084fe13bSAndrey Grodzovsky  */
amdgpu_ras_debugfs_eeprom_write(struct file * f,const char __user * buf,size_t size,loff_t * pos)567cf696091SLuben Tuikov static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
568cf696091SLuben Tuikov 					       const char __user *buf,
569084fe13bSAndrey Grodzovsky 					       size_t size, loff_t *pos)
570084fe13bSAndrey Grodzovsky {
571bf0b91b7SGuchun Chen 	struct amdgpu_device *adev =
572bf0b91b7SGuchun Chen 		(struct amdgpu_device *)file_inode(f)->i_private;
573084fe13bSAndrey Grodzovsky 	int ret;
574084fe13bSAndrey Grodzovsky 
575bf0b91b7SGuchun Chen 	ret = amdgpu_ras_eeprom_reset_table(
576bf0b91b7SGuchun Chen 		&(amdgpu_ras_get_context(adev)->eeprom_control));
577084fe13bSAndrey Grodzovsky 
57863d4c081SLuben Tuikov 	if (!ret) {
579cf696091SLuben Tuikov 		/* Something was written to EEPROM.
580cf696091SLuben Tuikov 		 */
581bf0b91b7SGuchun Chen 		amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
582bf0b91b7SGuchun Chen 		return size;
583bf0b91b7SGuchun Chen 	} else {
584cf696091SLuben Tuikov 		return ret;
585bf0b91b7SGuchun Chen 	}
586084fe13bSAndrey Grodzovsky }
587084fe13bSAndrey Grodzovsky 
58836ea1bd2Sxinhui pan static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
58936ea1bd2Sxinhui pan 	.owner = THIS_MODULE,
59036ea1bd2Sxinhui pan 	.read = NULL,
59136ea1bd2Sxinhui pan 	.write = amdgpu_ras_debugfs_ctrl_write,
59236ea1bd2Sxinhui pan 	.llseek = default_llseek
59336ea1bd2Sxinhui pan };
59436ea1bd2Sxinhui pan 
595084fe13bSAndrey Grodzovsky static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
596084fe13bSAndrey Grodzovsky 	.owner = THIS_MODULE,
597084fe13bSAndrey Grodzovsky 	.read = NULL,
598084fe13bSAndrey Grodzovsky 	.write = amdgpu_ras_debugfs_eeprom_write,
599084fe13bSAndrey Grodzovsky 	.llseek = default_llseek
600084fe13bSAndrey Grodzovsky };
601084fe13bSAndrey Grodzovsky 
602f77c7109SAlex Deucher /**
603f77c7109SAlex Deucher  * DOC: AMDGPU RAS sysfs Error Count Interface
604f77c7109SAlex Deucher  *
605ef177d11SAlex Deucher  * It allows the user to read the error count for each IP block on the gpu through
606f77c7109SAlex Deucher  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
607f77c7109SAlex Deucher  *
608f77c7109SAlex Deucher  * It outputs the multiple lines which report the uncorrected (ue) and corrected
609f77c7109SAlex Deucher  * (ce) error counts.
610f77c7109SAlex Deucher  *
611f77c7109SAlex Deucher  * The format of one line is below,
612f77c7109SAlex Deucher  *
613f77c7109SAlex Deucher  * [ce|ue]: count
614f77c7109SAlex Deucher  *
615f77c7109SAlex Deucher  * Example:
616f77c7109SAlex Deucher  *
617f77c7109SAlex Deucher  * .. code-block:: bash
618f77c7109SAlex Deucher  *
619f77c7109SAlex Deucher  *	ue: 0
620f77c7109SAlex Deucher  *	ce: 1
621f77c7109SAlex Deucher  *
622f77c7109SAlex Deucher  */
amdgpu_ras_sysfs_read(struct device * dev,struct device_attribute * attr,char * buf)623c030f2e4Sxinhui pan static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
624c030f2e4Sxinhui pan 		struct device_attribute *attr, char *buf)
625c030f2e4Sxinhui pan {
626c030f2e4Sxinhui pan 	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
627c030f2e4Sxinhui pan 	struct ras_query_if info = {
628c030f2e4Sxinhui pan 		.head = obj->head,
629c030f2e4Sxinhui pan 	};
630c030f2e4Sxinhui pan 
63161380faaSJohn Clements 	if (!amdgpu_ras_get_error_query_ready(obj->adev))
63236000c7aSTian Tao 		return sysfs_emit(buf, "Query currently inaccessible\n");
63343c4d576SJohn Clements 
634761d86d3SDennis Li 	if (amdgpu_ras_query_error_status(obj->adev, &info))
635c030f2e4Sxinhui pan 		return -EINVAL;
636c030f2e4Sxinhui pan 
6374e8303cfSLijo Lazar 	if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
6384e8303cfSLijo Lazar 	    amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
6391f0d8e37SMukul Joshi 		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
6402a460963SCandice Li 			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
6411f0d8e37SMukul Joshi 	}
6421f0d8e37SMukul Joshi 
6432c7a1560SStanley.Yang 	if (info.head.block == AMDGPU_RAS_BLOCK__UMC)
6442c7a1560SStanley.Yang 		return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
6452c7a1560SStanley.Yang 				"ce", info.ce_count, "de", info.de_count);
6462c7a1560SStanley.Yang 	else
64736000c7aSTian Tao 		return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
648c030f2e4Sxinhui pan 				"ce", info.ce_count);
649c030f2e4Sxinhui pan }
650c030f2e4Sxinhui pan 
651c030f2e4Sxinhui pan /* obj begin */
652c030f2e4Sxinhui pan 
653c030f2e4Sxinhui pan #define get_obj(obj) do { (obj)->use++; } while (0)
654c030f2e4Sxinhui pan #define alive_obj(obj) ((obj)->use)
655c030f2e4Sxinhui pan 
put_obj(struct ras_manager * obj)656c030f2e4Sxinhui pan static inline void put_obj(struct ras_manager *obj)
657c030f2e4Sxinhui pan {
658ec3e0a91SYang Wang 	if (obj && (--obj->use == 0)) {
659c030f2e4Sxinhui pan 		list_del(&obj->node);
660ec3e0a91SYang Wang 		amdgpu_ras_error_data_fini(&obj->err_data);
661ec3e0a91SYang Wang 	}
662ec3e0a91SYang Wang 
663f0872686SBernard Zhao 	if (obj && (obj->use < 0))
664640ae42eSJohn Clements 		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
665c030f2e4Sxinhui pan }
666c030f2e4Sxinhui pan 
667c030f2e4Sxinhui pan /* make one obj and return it. */
amdgpu_ras_create_obj(struct amdgpu_device * adev,struct ras_common_if * head)668c030f2e4Sxinhui pan static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
669c030f2e4Sxinhui pan 		struct ras_common_if *head)
670c030f2e4Sxinhui pan {
671c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
672c030f2e4Sxinhui pan 	struct ras_manager *obj;
673c030f2e4Sxinhui pan 
6748ab0d6f0SLuben Tuikov 	if (!adev->ras_enabled || !con)
675c030f2e4Sxinhui pan 		return NULL;
676c030f2e4Sxinhui pan 
677c030f2e4Sxinhui pan 	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
678c030f2e4Sxinhui pan 		return NULL;
679c030f2e4Sxinhui pan 
680640ae42eSJohn Clements 	if (head->block == AMDGPU_RAS_BLOCK__MCA) {
681640ae42eSJohn Clements 		if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
682640ae42eSJohn Clements 			return NULL;
683640ae42eSJohn Clements 
684640ae42eSJohn Clements 		obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
685640ae42eSJohn Clements 	} else
686c030f2e4Sxinhui pan 		obj = &con->objs[head->block];
687640ae42eSJohn Clements 
688c030f2e4Sxinhui pan 	/* already exist. return obj? */
689c030f2e4Sxinhui pan 	if (alive_obj(obj))
690c030f2e4Sxinhui pan 		return NULL;
691c030f2e4Sxinhui pan 
692ec3e0a91SYang Wang 	if (amdgpu_ras_error_data_init(&obj->err_data))
693ec3e0a91SYang Wang 		return NULL;
694ec3e0a91SYang Wang 
695c030f2e4Sxinhui pan 	obj->head = *head;
696c030f2e4Sxinhui pan 	obj->adev = adev;
697c030f2e4Sxinhui pan 	list_add(&obj->node, &con->head);
698c030f2e4Sxinhui pan 	get_obj(obj);
699c030f2e4Sxinhui pan 
700c030f2e4Sxinhui pan 	return obj;
701c030f2e4Sxinhui pan }
702c030f2e4Sxinhui pan 
703c030f2e4Sxinhui pan /* return an obj equal to head, or the first when head is NULL */
amdgpu_ras_find_obj(struct amdgpu_device * adev,struct ras_common_if * head)704f2a79be1SLe Ma struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
705c030f2e4Sxinhui pan 		struct ras_common_if *head)
706c030f2e4Sxinhui pan {
707c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
708c030f2e4Sxinhui pan 	struct ras_manager *obj;
709c030f2e4Sxinhui pan 	int i;
710c030f2e4Sxinhui pan 
7118ab0d6f0SLuben Tuikov 	if (!adev->ras_enabled || !con)
712c030f2e4Sxinhui pan 		return NULL;
713c030f2e4Sxinhui pan 
714c030f2e4Sxinhui pan 	if (head) {
715c030f2e4Sxinhui pan 		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
716c030f2e4Sxinhui pan 			return NULL;
717c030f2e4Sxinhui pan 
718640ae42eSJohn Clements 		if (head->block == AMDGPU_RAS_BLOCK__MCA) {
719640ae42eSJohn Clements 			if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
720640ae42eSJohn Clements 				return NULL;
721640ae42eSJohn Clements 
722640ae42eSJohn Clements 			obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
723640ae42eSJohn Clements 		} else
724c030f2e4Sxinhui pan 			obj = &con->objs[head->block];
725c030f2e4Sxinhui pan 
726640ae42eSJohn Clements 		if (alive_obj(obj))
727c030f2e4Sxinhui pan 			return obj;
728c030f2e4Sxinhui pan 	} else {
729640ae42eSJohn Clements 		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
730c030f2e4Sxinhui pan 			obj = &con->objs[i];
731640ae42eSJohn Clements 			if (alive_obj(obj))
732c030f2e4Sxinhui pan 				return obj;
733c030f2e4Sxinhui pan 		}
734c030f2e4Sxinhui pan 	}
735c030f2e4Sxinhui pan 
736c030f2e4Sxinhui pan 	return NULL;
737c030f2e4Sxinhui pan }
738c030f2e4Sxinhui pan /* obj end */
739c030f2e4Sxinhui pan 
740c030f2e4Sxinhui pan /* feature ctl begin */
amdgpu_ras_is_feature_allowed(struct amdgpu_device * adev,struct ras_common_if * head)741c030f2e4Sxinhui pan static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
742c030f2e4Sxinhui pan 					 struct ras_common_if *head)
743c030f2e4Sxinhui pan {
7448ab0d6f0SLuben Tuikov 	return adev->ras_hw_enabled & BIT(head->block);
745c030f2e4Sxinhui pan }
746c030f2e4Sxinhui pan 
amdgpu_ras_is_feature_enabled(struct amdgpu_device * adev,struct ras_common_if * head)747c030f2e4Sxinhui pan static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
748c030f2e4Sxinhui pan 		struct ras_common_if *head)
749c030f2e4Sxinhui pan {
750c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
751c030f2e4Sxinhui pan 
752c030f2e4Sxinhui pan 	return con->features & BIT(head->block);
753c030f2e4Sxinhui pan }
754c030f2e4Sxinhui pan 
755c030f2e4Sxinhui pan /*
756c030f2e4Sxinhui pan  * if obj is not created, then create one.
757c030f2e4Sxinhui pan  * set feature enable flag.
758c030f2e4Sxinhui pan  */
__amdgpu_ras_feature_enable(struct amdgpu_device * adev,struct ras_common_if * head,int enable)759c030f2e4Sxinhui pan static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
760c030f2e4Sxinhui pan 		struct ras_common_if *head, int enable)
761c030f2e4Sxinhui pan {
762c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
763c030f2e4Sxinhui pan 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
764c030f2e4Sxinhui pan 
7655caf466aSxinhui pan 	/* If hardware does not support ras, then do not create obj.
7665caf466aSxinhui pan 	 * But if hardware support ras, we can create the obj.
7675caf466aSxinhui pan 	 * Ras framework checks con->hw_supported to see if it need do
7685caf466aSxinhui pan 	 * corresponding initialization.
7695caf466aSxinhui pan 	 * IP checks con->support to see if it need disable ras.
7705caf466aSxinhui pan 	 */
771c030f2e4Sxinhui pan 	if (!amdgpu_ras_is_feature_allowed(adev, head))
772c030f2e4Sxinhui pan 		return 0;
773c030f2e4Sxinhui pan 
774c030f2e4Sxinhui pan 	if (enable) {
775c030f2e4Sxinhui pan 		if (!obj) {
776c030f2e4Sxinhui pan 			obj = amdgpu_ras_create_obj(adev, head);
777c030f2e4Sxinhui pan 			if (!obj)
778c030f2e4Sxinhui pan 				return -EINVAL;
779c030f2e4Sxinhui pan 		} else {
780c030f2e4Sxinhui pan 			/* In case we create obj somewhere else */
781c030f2e4Sxinhui pan 			get_obj(obj);
782c030f2e4Sxinhui pan 		}
783c030f2e4Sxinhui pan 		con->features |= BIT(head->block);
784c030f2e4Sxinhui pan 	} else {
785c030f2e4Sxinhui pan 		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
786c030f2e4Sxinhui pan 			con->features &= ~BIT(head->block);
787c030f2e4Sxinhui pan 			put_obj(obj);
788c030f2e4Sxinhui pan 		}
789c030f2e4Sxinhui pan 	}
790c030f2e4Sxinhui pan 
791c030f2e4Sxinhui pan 	return 0;
792c030f2e4Sxinhui pan }
793c030f2e4Sxinhui pan 
794c030f2e4Sxinhui pan /* wrapper of psp_ras_enable_features */
amdgpu_ras_feature_enable(struct amdgpu_device * adev,struct ras_common_if * head,bool enable)795c030f2e4Sxinhui pan int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
796c030f2e4Sxinhui pan 		struct ras_common_if *head, bool enable)
797c030f2e4Sxinhui pan {
798c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
7997fcffecfSArnd Bergmann 	union ta_ras_cmd_input *info;
800bf7aa8beSHawking Zhang 	int ret;
801c030f2e4Sxinhui pan 
802c030f2e4Sxinhui pan 	if (!con)
803c030f2e4Sxinhui pan 		return -EINVAL;
804c030f2e4Sxinhui pan 
805ec70578cSHawking Zhang 	/* For non-gfx ip, do not enable ras feature if it is not allowed */
806ec70578cSHawking Zhang 	/* For gfx ip, regardless of feature support status, */
807ec70578cSHawking Zhang 	/* Force issue enable or disable ras feature commands */
808ec70578cSHawking Zhang 	if (head->block != AMDGPU_RAS_BLOCK__GFX &&
8096fc9d92cSHawking Zhang 	    !amdgpu_ras_is_feature_allowed(adev, head))
810bf7aa8beSHawking Zhang 		return 0;
8116fc9d92cSHawking Zhang 
8126fc9d92cSHawking Zhang 	/* Only enable gfx ras feature from host side */
8136fc9d92cSHawking Zhang 	if (head->block == AMDGPU_RAS_BLOCK__GFX &&
8146fc9d92cSHawking Zhang 	    !amdgpu_sriov_vf(adev) &&
8156fc9d92cSHawking Zhang 	    !amdgpu_ras_intr_triggered()) {
8167fcffecfSArnd Bergmann 		info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
8177fcffecfSArnd Bergmann 		if (!info)
8187fcffecfSArnd Bergmann 			return -ENOMEM;
8197fcffecfSArnd Bergmann 
820c030f2e4Sxinhui pan 		if (!enable) {
8217fcffecfSArnd Bergmann 			info->disable_features = (struct ta_ras_disable_features_input) {
822828cfa29Sxinhui pan 				.block_id =  amdgpu_ras_block_to_ta(head->block),
823828cfa29Sxinhui pan 				.error_type = amdgpu_ras_error_to_ta(head->type),
824c030f2e4Sxinhui pan 			};
825c030f2e4Sxinhui pan 		} else {
8267fcffecfSArnd Bergmann 			info->enable_features = (struct ta_ras_enable_features_input) {
827828cfa29Sxinhui pan 				.block_id =  amdgpu_ras_block_to_ta(head->block),
828828cfa29Sxinhui pan 				.error_type = amdgpu_ras_error_to_ta(head->type),
829c030f2e4Sxinhui pan 			};
830c030f2e4Sxinhui pan 		}
831c030f2e4Sxinhui pan 
8327fcffecfSArnd Bergmann 		ret = psp_ras_enable_features(&adev->psp, info, enable);
833c030f2e4Sxinhui pan 		if (ret) {
834e4348849STao Zhou 			dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
835c030f2e4Sxinhui pan 				enable ? "enable":"disable",
836640ae42eSJohn Clements 				get_ras_block_str(head),
837e4348849STao Zhou 				amdgpu_ras_is_poison_mode_supported(adev), ret);
8385838f74cSCong Liu 			kfree(info);
839bf7aa8beSHawking Zhang 			return ret;
840c030f2e4Sxinhui pan 		}
841bf7aa8beSHawking Zhang 
842bf7aa8beSHawking Zhang 		kfree(info);
843bff77e86SLe Ma 	}
844c030f2e4Sxinhui pan 
845c030f2e4Sxinhui pan 	/* setup the obj */
846c030f2e4Sxinhui pan 	__amdgpu_ras_feature_enable(adev, head, enable);
847bf7aa8beSHawking Zhang 
848bf7aa8beSHawking Zhang 	return 0;
849c030f2e4Sxinhui pan }
850c030f2e4Sxinhui pan 
85177de502bSxinhui pan /* Only used in device probe stage and called only once. */
amdgpu_ras_feature_enable_on_boot(struct amdgpu_device * adev,struct ras_common_if * head,bool enable)85277de502bSxinhui pan int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
85377de502bSxinhui pan 		struct ras_common_if *head, bool enable)
85477de502bSxinhui pan {
85577de502bSxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
85677de502bSxinhui pan 	int ret;
85777de502bSxinhui pan 
85877de502bSxinhui pan 	if (!con)
85977de502bSxinhui pan 		return -EINVAL;
86077de502bSxinhui pan 
86177de502bSxinhui pan 	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
8627af23ebeSxinhui pan 		if (enable) {
8637af23ebeSxinhui pan 			/* There is no harm to issue a ras TA cmd regardless of
8647af23ebeSxinhui pan 			 * the currecnt ras state.
8657af23ebeSxinhui pan 			 * If current state == target state, it will do nothing
8667af23ebeSxinhui pan 			 * But sometimes it requests driver to reset and repost
8677af23ebeSxinhui pan 			 * with error code -EAGAIN.
86877de502bSxinhui pan 			 */
8697af23ebeSxinhui pan 			ret = amdgpu_ras_feature_enable(adev, head, 1);
8707af23ebeSxinhui pan 			/* With old ras TA, we might fail to enable ras.
8717af23ebeSxinhui pan 			 * Log it and just setup the object.
8727af23ebeSxinhui pan 			 * TODO need remove this WA in the future.
8737af23ebeSxinhui pan 			 */
8747af23ebeSxinhui pan 			if (ret == -EINVAL) {
8757af23ebeSxinhui pan 				ret = __amdgpu_ras_feature_enable(adev, head, 1);
8767af23ebeSxinhui pan 				if (!ret)
8776952e99cSGuchun Chen 					dev_info(adev->dev,
8786952e99cSGuchun Chen 						"RAS INFO: %s setup object\n",
879640ae42eSJohn Clements 						get_ras_block_str(head));
8807af23ebeSxinhui pan 			}
8817af23ebeSxinhui pan 		} else {
8827af23ebeSxinhui pan 			/* setup the object then issue a ras TA disable cmd.*/
88377de502bSxinhui pan 			ret = __amdgpu_ras_feature_enable(adev, head, 1);
88477de502bSxinhui pan 			if (ret)
88577de502bSxinhui pan 				return ret;
88677de502bSxinhui pan 
8870110ac11SYan Zhen 			/* gfx block ras disable cmd must send to ras-ta */
888970fd197SStanley.Yang 			if (head->block == AMDGPU_RAS_BLOCK__GFX)
889970fd197SStanley.Yang 				con->features |= BIT(head->block);
890970fd197SStanley.Yang 
89177de502bSxinhui pan 			ret = amdgpu_ras_feature_enable(adev, head, 0);
89219d0dfdaSStanley.Yang 
89319d0dfdaSStanley.Yang 			/* clean gfx block ras features flag */
8948ab0d6f0SLuben Tuikov 			if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
89519d0dfdaSStanley.Yang 				con->features &= ~BIT(head->block);
8967af23ebeSxinhui pan 		}
89777de502bSxinhui pan 	} else
89877de502bSxinhui pan 		ret = amdgpu_ras_feature_enable(adev, head, enable);
89977de502bSxinhui pan 
90077de502bSxinhui pan 	return ret;
90177de502bSxinhui pan }
90277de502bSxinhui pan 
amdgpu_ras_disable_all_features(struct amdgpu_device * adev,bool bypass)903c030f2e4Sxinhui pan static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
904c030f2e4Sxinhui pan 		bool bypass)
905c030f2e4Sxinhui pan {
906c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
907c030f2e4Sxinhui pan 	struct ras_manager *obj, *tmp;
908c030f2e4Sxinhui pan 
909c030f2e4Sxinhui pan 	list_for_each_entry_safe(obj, tmp, &con->head, node) {
910c030f2e4Sxinhui pan 		/* bypass psp.
911c030f2e4Sxinhui pan 		 * aka just release the obj and corresponding flags
912c030f2e4Sxinhui pan 		 */
913c030f2e4Sxinhui pan 		if (bypass) {
914c030f2e4Sxinhui pan 			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
915c030f2e4Sxinhui pan 				break;
916c030f2e4Sxinhui pan 		} else {
917c030f2e4Sxinhui pan 			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
918c030f2e4Sxinhui pan 				break;
919c030f2e4Sxinhui pan 		}
920289d513bSkbuild test robot 	}
921c030f2e4Sxinhui pan 
922c030f2e4Sxinhui pan 	return con->features;
923c030f2e4Sxinhui pan }
924c030f2e4Sxinhui pan 
amdgpu_ras_enable_all_features(struct amdgpu_device * adev,bool bypass)925c030f2e4Sxinhui pan static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
926c030f2e4Sxinhui pan 		bool bypass)
927c030f2e4Sxinhui pan {
928c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
929c030f2e4Sxinhui pan 	int i;
930640ae42eSJohn Clements 	const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;
931c030f2e4Sxinhui pan 
932640ae42eSJohn Clements 	for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
933c030f2e4Sxinhui pan 		struct ras_common_if head = {
934c030f2e4Sxinhui pan 			.block = i,
935191051a1Sxinhui pan 			.type = default_ras_type,
936c030f2e4Sxinhui pan 			.sub_block_index = 0,
937c030f2e4Sxinhui pan 		};
938640ae42eSJohn Clements 
939640ae42eSJohn Clements 		if (i == AMDGPU_RAS_BLOCK__MCA)
940640ae42eSJohn Clements 			continue;
941640ae42eSJohn Clements 
942640ae42eSJohn Clements 		if (bypass) {
943640ae42eSJohn Clements 			/*
944640ae42eSJohn Clements 			 * bypass psp. vbios enable ras for us.
945640ae42eSJohn Clements 			 * so just create the obj
946640ae42eSJohn Clements 			 */
947640ae42eSJohn Clements 			if (__amdgpu_ras_feature_enable(adev, &head, 1))
948640ae42eSJohn Clements 				break;
949640ae42eSJohn Clements 		} else {
950640ae42eSJohn Clements 			if (amdgpu_ras_feature_enable(adev, &head, 1))
951640ae42eSJohn Clements 				break;
952640ae42eSJohn Clements 		}
953640ae42eSJohn Clements 	}
954640ae42eSJohn Clements 
955640ae42eSJohn Clements 	for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
956640ae42eSJohn Clements 		struct ras_common_if head = {
957640ae42eSJohn Clements 			.block = AMDGPU_RAS_BLOCK__MCA,
958640ae42eSJohn Clements 			.type = default_ras_type,
959640ae42eSJohn Clements 			.sub_block_index = i,
960640ae42eSJohn Clements 		};
961640ae42eSJohn Clements 
962c030f2e4Sxinhui pan 		if (bypass) {
963c030f2e4Sxinhui pan 			/*
964c030f2e4Sxinhui pan 			 * bypass psp. vbios enable ras for us.
965c030f2e4Sxinhui pan 			 * so just create the obj
966c030f2e4Sxinhui pan 			 */
967c030f2e4Sxinhui pan 			if (__amdgpu_ras_feature_enable(adev, &head, 1))
968c030f2e4Sxinhui pan 				break;
969c030f2e4Sxinhui pan 		} else {
970c030f2e4Sxinhui pan 			if (amdgpu_ras_feature_enable(adev, &head, 1))
971c030f2e4Sxinhui pan 				break;
972c030f2e4Sxinhui pan 		}
973289d513bSkbuild test robot 	}
974c030f2e4Sxinhui pan 
975c030f2e4Sxinhui pan 	return con->features;
976c030f2e4Sxinhui pan }
977c030f2e4Sxinhui pan /* feature ctl end */
978c030f2e4Sxinhui pan 
amdgpu_ras_block_match_default(struct amdgpu_ras_block_object * block_obj,enum amdgpu_ras_block block)979e3d833f4Syipechai static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
980e3d833f4Syipechai 		enum amdgpu_ras_block block)
981640ae42eSJohn Clements {
9826492e1b0Syipechai 	if (!block_obj)
9836492e1b0Syipechai 		return -EINVAL;
9846492e1b0Syipechai 
985bdb3489cSyipechai 	if (block_obj->ras_comm.block == block)
9866492e1b0Syipechai 		return 0;
9876492e1b0Syipechai 
9886492e1b0Syipechai 	return -EINVAL;
989640ae42eSJohn Clements }
9906492e1b0Syipechai 
amdgpu_ras_get_ras_block(struct amdgpu_device * adev,enum amdgpu_ras_block block,uint32_t sub_block_index)9916492e1b0Syipechai static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
9926492e1b0Syipechai 					enum amdgpu_ras_block block, uint32_t sub_block_index)
9936492e1b0Syipechai {
994d5e8ff5fSyipechai 	struct amdgpu_ras_block_list *node, *tmp;
995d5e8ff5fSyipechai 	struct amdgpu_ras_block_object *obj;
9966492e1b0Syipechai 
9976492e1b0Syipechai 	if (block >= AMDGPU_RAS_BLOCK__LAST)
9986492e1b0Syipechai 		return NULL;
9996492e1b0Syipechai 
1000d5e8ff5fSyipechai 	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
1001d5e8ff5fSyipechai 		if (!node->ras_obj) {
1002d5e8ff5fSyipechai 			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
1003d5e8ff5fSyipechai 			continue;
1004d5e8ff5fSyipechai 		}
1005d5e8ff5fSyipechai 
1006d5e8ff5fSyipechai 		obj = node->ras_obj;
10076492e1b0Syipechai 		if (obj->ras_block_match) {
10086492e1b0Syipechai 			if (obj->ras_block_match(obj, block, sub_block_index) == 0)
10096492e1b0Syipechai 				return obj;
10106492e1b0Syipechai 		} else {
10116492e1b0Syipechai 			if (amdgpu_ras_block_match_default(obj, block) == 0)
10126492e1b0Syipechai 				return obj;
10136492e1b0Syipechai 		}
10146492e1b0Syipechai 	}
10156492e1b0Syipechai 
10166492e1b0Syipechai 	return NULL;
1017640ae42eSJohn Clements }
1018640ae42eSJohn Clements 
amdgpu_ras_get_ecc_info(struct amdgpu_device * adev,struct ras_err_data * err_data)1019fdcb279dSStanley.Yang static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
1020fdcb279dSStanley.Yang {
1021fdcb279dSStanley.Yang 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1022fdcb279dSStanley.Yang 	int ret = 0;
1023fdcb279dSStanley.Yang 
1024fdcb279dSStanley.Yang 	/*
1025fdcb279dSStanley.Yang 	 * choosing right query method according to
1026fdcb279dSStanley.Yang 	 * whether smu support query error information
1027fdcb279dSStanley.Yang 	 */
1028bc143d8bSEvan Quan 	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
1029fdcb279dSStanley.Yang 	if (ret == -EOPNOTSUPP) {
1030efe17d5aSyipechai 		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
1031efe17d5aSyipechai 			adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
1032efe17d5aSyipechai 			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
1033fdcb279dSStanley.Yang 
1034fdcb279dSStanley.Yang 		/* umc query_ras_error_address is also responsible for clearing
1035fdcb279dSStanley.Yang 		 * error status
1036fdcb279dSStanley.Yang 		 */
1037efe17d5aSyipechai 		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
1038efe17d5aSyipechai 		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
1039efe17d5aSyipechai 			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
1040fdcb279dSStanley.Yang 	} else if (!ret) {
1041efe17d5aSyipechai 		if (adev->umc.ras &&
1042efe17d5aSyipechai 			adev->umc.ras->ecc_info_query_ras_error_count)
1043efe17d5aSyipechai 			adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);
1044fdcb279dSStanley.Yang 
1045efe17d5aSyipechai 		if (adev->umc.ras &&
1046efe17d5aSyipechai 			adev->umc.ras->ecc_info_query_ras_error_address)
1047efe17d5aSyipechai 			adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
1048fdcb279dSStanley.Yang 	}
1049fdcb279dSStanley.Yang }
1050fdcb279dSStanley.Yang 
amdgpu_ras_error_print_error_data(struct amdgpu_device * adev,struct ras_manager * ras_mgr,struct ras_err_data * err_data,struct ras_query_context * qctx,const char * blk_name,bool is_ue,bool is_de)10515b1270beSYang Wang static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
1052ec3e0a91SYang Wang 					      struct ras_manager *ras_mgr,
10535b1270beSYang Wang 					      struct ras_err_data *err_data,
10549dc57c2aSYang Wang 					      struct ras_query_context *qctx,
1055ec3e0a91SYang Wang 					      const char *blk_name,
105646e2231cSCandice Li 					      bool is_ue,
105746e2231cSCandice Li 					      bool is_de)
10585b1270beSYang Wang {
10595b1270beSYang Wang 	struct amdgpu_smuio_mcm_config_info *mcm_info;
10605b1270beSYang Wang 	struct ras_err_node *err_node;
10615b1270beSYang Wang 	struct ras_err_info *err_info;
106275ac6a25SYang Wang 	u64 event_id = qctx->evid.event_id;
10635b1270beSYang Wang 
1064ec3e0a91SYang Wang 	if (is_ue) {
10655b1270beSYang Wang 		for_each_ras_error(err_node, err_data) {
10665b1270beSYang Wang 			err_info = &err_node->err_info;
10675b1270beSYang Wang 			mcm_info = &err_info->mcm_info;
1068ec3e0a91SYang Wang 			if (err_info->ue_count) {
10699dc57c2aSYang Wang 				RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
1070ec3e0a91SYang Wang 					      "%lld new uncorrectable hardware errors detected in %s block\n",
10715b1270beSYang Wang 					      mcm_info->socket_id,
10725b1270beSYang Wang 					      mcm_info->die_id,
10735b1270beSYang Wang 					      err_info->ue_count,
10745b1270beSYang Wang 					      blk_name);
1075ec3e0a91SYang Wang 			}
1076ec3e0a91SYang Wang 		}
1077ec3e0a91SYang Wang 
1078ec3e0a91SYang Wang 		for_each_ras_error(err_node, &ras_mgr->err_data) {
1079ec3e0a91SYang Wang 			err_info = &err_node->err_info;
1080ec3e0a91SYang Wang 			mcm_info = &err_info->mcm_info;
10819dc57c2aSYang Wang 			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
1082ec3e0a91SYang Wang 				      "%lld uncorrectable hardware errors detected in total in %s block\n",
1083ec3e0a91SYang Wang 				      mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
1084ec3e0a91SYang Wang 		}
1085ec3e0a91SYang Wang 
1086ec3e0a91SYang Wang 	} else {
108746e2231cSCandice Li 		if (is_de) {
108846e2231cSCandice Li 			for_each_ras_error(err_node, err_data) {
108946e2231cSCandice Li 				err_info = &err_node->err_info;
109046e2231cSCandice Li 				mcm_info = &err_info->mcm_info;
109146e2231cSCandice Li 				if (err_info->de_count) {
10929dc57c2aSYang Wang 					RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
109346e2231cSCandice Li 						      "%lld new deferred hardware errors detected in %s block\n",
109446e2231cSCandice Li 						      mcm_info->socket_id,
109546e2231cSCandice Li 						      mcm_info->die_id,
109646e2231cSCandice Li 						      err_info->de_count,
109746e2231cSCandice Li 						      blk_name);
109846e2231cSCandice Li 				}
109946e2231cSCandice Li 			}
110046e2231cSCandice Li 
110146e2231cSCandice Li 			for_each_ras_error(err_node, &ras_mgr->err_data) {
110246e2231cSCandice Li 				err_info = &err_node->err_info;
110346e2231cSCandice Li 				mcm_info = &err_info->mcm_info;
11049dc57c2aSYang Wang 				RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
110546e2231cSCandice Li 					      "%lld deferred hardware errors detected in total in %s block\n",
110646e2231cSCandice Li 					      mcm_info->socket_id, mcm_info->die_id,
110746e2231cSCandice Li 					      err_info->de_count, blk_name);
110846e2231cSCandice Li 			}
110946e2231cSCandice Li 		} else {
1110ec3e0a91SYang Wang 			for_each_ras_error(err_node, err_data) {
1111ec3e0a91SYang Wang 				err_info = &err_node->err_info;
1112ec3e0a91SYang Wang 				mcm_info = &err_info->mcm_info;
1113ec3e0a91SYang Wang 				if (err_info->ce_count) {
11149dc57c2aSYang Wang 					RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
111590bd0147SCandice Li 						      "%lld new correctable hardware errors detected in %s block\n",
11165b1270beSYang Wang 						      mcm_info->socket_id,
11175b1270beSYang Wang 						      mcm_info->die_id,
111849c260beSYang Wang 						      err_info->ce_count,
11195b1270beSYang Wang 						      blk_name);
11205b1270beSYang Wang 				}
11215b1270beSYang Wang 			}
1122ec3e0a91SYang Wang 
1123ec3e0a91SYang Wang 			for_each_ras_error(err_node, &ras_mgr->err_data) {
1124ec3e0a91SYang Wang 				err_info = &err_node->err_info;
1125ec3e0a91SYang Wang 				mcm_info = &err_info->mcm_info;
11269dc57c2aSYang Wang 				RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
112790bd0147SCandice Li 					      "%lld correctable hardware errors detected in total in %s block\n",
112846e2231cSCandice Li 					      mcm_info->socket_id, mcm_info->die_id,
112946e2231cSCandice Li 					      err_info->ce_count, blk_name);
113046e2231cSCandice Li 			}
1131ec3e0a91SYang Wang 		}
1132ec3e0a91SYang Wang 	}
1133ec3e0a91SYang Wang }
1134ec3e0a91SYang Wang 
err_data_has_source_info(struct ras_err_data * data)1135ec3e0a91SYang Wang static inline bool err_data_has_source_info(struct ras_err_data *data)
1136ec3e0a91SYang Wang {
1137ec3e0a91SYang Wang 	return !list_empty(&data->err_node_list);
11385b1270beSYang Wang }
11395b1270beSYang Wang 
amdgpu_ras_error_generate_report(struct amdgpu_device * adev,struct ras_query_if * query_if,struct ras_err_data * err_data,struct ras_query_context * qctx)11405b1270beSYang Wang static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
11415b1270beSYang Wang 					     struct ras_query_if *query_if,
11429dc57c2aSYang Wang 					     struct ras_err_data *err_data,
11439dc57c2aSYang Wang 					     struct ras_query_context *qctx)
11445b1270beSYang Wang {
11455b1270beSYang Wang 	struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
11465b1270beSYang Wang 	const char *blk_name = get_ras_block_str(&query_if->head);
114775ac6a25SYang Wang 	u64 event_id = qctx->evid.event_id;
11485b1270beSYang Wang 
11495b1270beSYang Wang 	if (err_data->ce_count) {
1150ec3e0a91SYang Wang 		if (err_data_has_source_info(err_data)) {
11519dc57c2aSYang Wang 			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
115246e2231cSCandice Li 							  blk_name, false, false);
11535b1270beSYang Wang 		} else if (!adev->aid_mask &&
11545b1270beSYang Wang 			   adev->smuio.funcs &&
11555b1270beSYang Wang 			   adev->smuio.funcs->get_socket_id &&
11565b1270beSYang Wang 			   adev->smuio.funcs->get_die_id) {
11579dc57c2aSYang Wang 			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
11585b1270beSYang Wang 				      "%ld correctable hardware errors "
115990bd0147SCandice Li 				      "detected in %s block\n",
11605b1270beSYang Wang 				      adev->smuio.funcs->get_socket_id(adev),
11615b1270beSYang Wang 				      adev->smuio.funcs->get_die_id(adev),
11625b1270beSYang Wang 				      ras_mgr->err_data.ce_count,
11635b1270beSYang Wang 				      blk_name);
11645b1270beSYang Wang 		} else {
11659dc57c2aSYang Wang 			RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors "
116690bd0147SCandice Li 				      "detected in %s block\n",
11675b1270beSYang Wang 				      ras_mgr->err_data.ce_count,
11685b1270beSYang Wang 				      blk_name);
11695b1270beSYang Wang 		}
11705b1270beSYang Wang 	}
11715b1270beSYang Wang 
11725b1270beSYang Wang 	if (err_data->ue_count) {
1173ec3e0a91SYang Wang 		if (err_data_has_source_info(err_data)) {
11749dc57c2aSYang Wang 			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
117546e2231cSCandice Li 							  blk_name, true, false);
11765b1270beSYang Wang 		} else if (!adev->aid_mask &&
11775b1270beSYang Wang 			   adev->smuio.funcs &&
11785b1270beSYang Wang 			   adev->smuio.funcs->get_socket_id &&
11795b1270beSYang Wang 			   adev->smuio.funcs->get_die_id) {
11809dc57c2aSYang Wang 			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
11815b1270beSYang Wang 				      "%ld uncorrectable hardware errors "
11825b1270beSYang Wang 				      "detected in %s block\n",
11835b1270beSYang Wang 				      adev->smuio.funcs->get_socket_id(adev),
11845b1270beSYang Wang 				      adev->smuio.funcs->get_die_id(adev),
11855b1270beSYang Wang 				      ras_mgr->err_data.ue_count,
11865b1270beSYang Wang 				      blk_name);
11875b1270beSYang Wang 		} else {
11889dc57c2aSYang Wang 			RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors "
11895b1270beSYang Wang 				      "detected in %s block\n",
11905b1270beSYang Wang 				      ras_mgr->err_data.ue_count,
11915b1270beSYang Wang 				      blk_name);
11925b1270beSYang Wang 		}
11935b1270beSYang Wang 	}
11945b1270beSYang Wang 
119546e2231cSCandice Li 	if (err_data->de_count) {
119646e2231cSCandice Li 		if (err_data_has_source_info(err_data)) {
11979dc57c2aSYang Wang 			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
119846e2231cSCandice Li 							  blk_name, false, true);
119946e2231cSCandice Li 		} else if (!adev->aid_mask &&
120046e2231cSCandice Li 			   adev->smuio.funcs &&
120146e2231cSCandice Li 			   adev->smuio.funcs->get_socket_id &&
120246e2231cSCandice Li 			   adev->smuio.funcs->get_die_id) {
12039dc57c2aSYang Wang 			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
120446e2231cSCandice Li 				      "%ld deferred hardware errors "
120546e2231cSCandice Li 				      "detected in %s block\n",
120646e2231cSCandice Li 				      adev->smuio.funcs->get_socket_id(adev),
120746e2231cSCandice Li 				      adev->smuio.funcs->get_die_id(adev),
120846e2231cSCandice Li 				      ras_mgr->err_data.de_count,
120946e2231cSCandice Li 				      blk_name);
121046e2231cSCandice Li 		} else {
12119dc57c2aSYang Wang 			RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors "
121246e2231cSCandice Li 				      "detected in %s block\n",
121346e2231cSCandice Li 				      ras_mgr->err_data.de_count,
121446e2231cSCandice Li 				      blk_name);
121546e2231cSCandice Li 		}
121646e2231cSCandice Li 	}
12175b1270beSYang Wang }
12185b1270beSYang Wang 
amdgpu_ras_virt_error_generate_report(struct amdgpu_device * adev,struct ras_query_if * query_if,struct ras_err_data * err_data,struct ras_query_context * qctx)121984a2947eSVictor Skvortsov static void amdgpu_ras_virt_error_generate_report(struct amdgpu_device *adev,
122084a2947eSVictor Skvortsov 						  struct ras_query_if *query_if,
122184a2947eSVictor Skvortsov 						  struct ras_err_data *err_data,
122284a2947eSVictor Skvortsov 						  struct ras_query_context *qctx)
122384a2947eSVictor Skvortsov {
122484a2947eSVictor Skvortsov 	unsigned long new_ue, new_ce, new_de;
122584a2947eSVictor Skvortsov 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &query_if->head);
122684a2947eSVictor Skvortsov 	const char *blk_name = get_ras_block_str(&query_if->head);
122784a2947eSVictor Skvortsov 	u64 event_id = qctx->evid.event_id;
122884a2947eSVictor Skvortsov 
122984a2947eSVictor Skvortsov 	new_ce = err_data->ce_count - obj->err_data.ce_count;
123084a2947eSVictor Skvortsov 	new_ue = err_data->ue_count - obj->err_data.ue_count;
123184a2947eSVictor Skvortsov 	new_de = err_data->de_count - obj->err_data.de_count;
123284a2947eSVictor Skvortsov 
123384a2947eSVictor Skvortsov 	if (new_ce) {
123484a2947eSVictor Skvortsov 		RAS_EVENT_LOG(adev, event_id, "%lu correctable hardware errors "
123584a2947eSVictor Skvortsov 			      "detected in %s block\n",
123684a2947eSVictor Skvortsov 			      new_ce,
123784a2947eSVictor Skvortsov 			      blk_name);
123884a2947eSVictor Skvortsov 	}
123984a2947eSVictor Skvortsov 
124084a2947eSVictor Skvortsov 	if (new_ue) {
124184a2947eSVictor Skvortsov 		RAS_EVENT_LOG(adev, event_id, "%lu uncorrectable hardware errors "
124284a2947eSVictor Skvortsov 			      "detected in %s block\n",
124384a2947eSVictor Skvortsov 			      new_ue,
124484a2947eSVictor Skvortsov 			      blk_name);
124584a2947eSVictor Skvortsov 	}
124684a2947eSVictor Skvortsov 
124784a2947eSVictor Skvortsov 	if (new_de) {
124884a2947eSVictor Skvortsov 		RAS_EVENT_LOG(adev, event_id, "%lu deferred hardware errors "
124984a2947eSVictor Skvortsov 			      "detected in %s block\n",
125084a2947eSVictor Skvortsov 			      new_de,
125184a2947eSVictor Skvortsov 			      blk_name);
125284a2947eSVictor Skvortsov 	}
125384a2947eSVictor Skvortsov }
125484a2947eSVictor Skvortsov 
amdgpu_rasmgr_error_data_statistic_update(struct ras_manager * obj,struct ras_err_data * err_data)1255ec3e0a91SYang Wang static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
1256ec3e0a91SYang Wang {
1257ec3e0a91SYang Wang 	struct ras_err_node *err_node;
1258ec3e0a91SYang Wang 	struct ras_err_info *err_info;
1259ec3e0a91SYang Wang 
1260ec3e0a91SYang Wang 	if (err_data_has_source_info(err_data)) {
1261ec3e0a91SYang Wang 		for_each_ras_error(err_node, err_data) {
1262ec3e0a91SYang Wang 			err_info = &err_node->err_info;
126346e2231cSCandice Li 			amdgpu_ras_error_statistic_de_count(&obj->err_data,
1264671af066SYang Wang 					&err_info->mcm_info, err_info->de_count);
12659f91e983SYiPeng Chai 			amdgpu_ras_error_statistic_ce_count(&obj->err_data,
1266671af066SYang Wang 					&err_info->mcm_info, err_info->ce_count);
12679f91e983SYiPeng Chai 			amdgpu_ras_error_statistic_ue_count(&obj->err_data,
1268671af066SYang Wang 					&err_info->mcm_info, err_info->ue_count);
1269ec3e0a91SYang Wang 		}
1270ec3e0a91SYang Wang 	} else {
1271ec3e0a91SYang Wang 		/* for legacy asic path which doesn't has error source info */
1272ec3e0a91SYang Wang 		obj->err_data.ue_count += err_data->ue_count;
1273ec3e0a91SYang Wang 		obj->err_data.ce_count += err_data->ce_count;
127446e2231cSCandice Li 		obj->err_data.de_count += err_data->de_count;
1275ec3e0a91SYang Wang 	}
1276ec3e0a91SYang Wang }
1277ec3e0a91SYang Wang 
amdgpu_ras_mgr_virt_error_data_statistics_update(struct ras_manager * obj,struct ras_err_data * err_data)127884a2947eSVictor Skvortsov static void amdgpu_ras_mgr_virt_error_data_statistics_update(struct ras_manager *obj,
127984a2947eSVictor Skvortsov 							     struct ras_err_data *err_data)
128084a2947eSVictor Skvortsov {
128184a2947eSVictor Skvortsov 	/* Host reports absolute counts */
128284a2947eSVictor Skvortsov 	obj->err_data.ue_count = err_data->ue_count;
128384a2947eSVictor Skvortsov 	obj->err_data.ce_count = err_data->ce_count;
128484a2947eSVictor Skvortsov 	obj->err_data.de_count = err_data->de_count;
128584a2947eSVictor Skvortsov }
128684a2947eSVictor Skvortsov 
get_ras_manager(struct amdgpu_device * adev,enum amdgpu_ras_block blk)128704c4fcd2SYang Wang static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
128804c4fcd2SYang Wang {
128904c4fcd2SYang Wang 	struct ras_common_if head;
129004c4fcd2SYang Wang 
129104c4fcd2SYang Wang 	memset(&head, 0, sizeof(head));
129204c4fcd2SYang Wang 	head.block = blk;
129304c4fcd2SYang Wang 
129404c4fcd2SYang Wang 	return amdgpu_ras_find_obj(adev, &head);
129504c4fcd2SYang Wang }
129604c4fcd2SYang Wang 
amdgpu_ras_bind_aca(struct amdgpu_device * adev,enum amdgpu_ras_block blk,const struct aca_info * aca_info,void * data)129704c4fcd2SYang Wang int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
129804c4fcd2SYang Wang 			const struct aca_info *aca_info, void *data)
129904c4fcd2SYang Wang {
130004c4fcd2SYang Wang 	struct ras_manager *obj;
130104c4fcd2SYang Wang 
13026f3b6913SYiPeng Chai 	/* in resume phase, no need to create aca fs node */
1303e283f4fbSLijo Lazar 	if (adev->in_suspend || amdgpu_reset_in_recovery(adev))
13046f3b6913SYiPeng Chai 		return 0;
13056f3b6913SYiPeng Chai 
130604c4fcd2SYang Wang 	obj = get_ras_manager(adev, blk);
130704c4fcd2SYang Wang 	if (!obj)
130804c4fcd2SYang Wang 		return -EINVAL;
130904c4fcd2SYang Wang 
131004c4fcd2SYang Wang 	return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data);
131104c4fcd2SYang Wang }
131204c4fcd2SYang Wang 
amdgpu_ras_unbind_aca(struct amdgpu_device * adev,enum amdgpu_ras_block blk)131304c4fcd2SYang Wang int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
131404c4fcd2SYang Wang {
131504c4fcd2SYang Wang 	struct ras_manager *obj;
131604c4fcd2SYang Wang 
131704c4fcd2SYang Wang 	obj = get_ras_manager(adev, blk);
131804c4fcd2SYang Wang 	if (!obj)
131904c4fcd2SYang Wang 		return -EINVAL;
132004c4fcd2SYang Wang 
132104c4fcd2SYang Wang 	amdgpu_aca_remove_handle(&obj->aca_handle);
132204c4fcd2SYang Wang 
132304c4fcd2SYang Wang 	return 0;
132404c4fcd2SYang Wang }
132504c4fcd2SYang Wang 
amdgpu_aca_log_ras_error_data(struct amdgpu_device * adev,enum amdgpu_ras_block blk,enum aca_error_type type,struct ras_err_data * err_data,struct ras_query_context * qctx)132604c4fcd2SYang Wang static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
132731fd330bSYang Wang 					 enum aca_error_type type, struct ras_err_data *err_data,
132831fd330bSYang Wang 					 struct ras_query_context *qctx)
132904c4fcd2SYang Wang {
133004c4fcd2SYang Wang 	struct ras_manager *obj;
133104c4fcd2SYang Wang 
133204c4fcd2SYang Wang 	obj = get_ras_manager(adev, blk);
133304c4fcd2SYang Wang 	if (!obj)
133404c4fcd2SYang Wang 		return -EINVAL;
133504c4fcd2SYang Wang 
133631fd330bSYang Wang 	return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx);
133704c4fcd2SYang Wang }
133804c4fcd2SYang Wang 
amdgpu_ras_aca_sysfs_read(struct device * dev,struct device_attribute * attr,struct aca_handle * handle,char * buf,void * data)133937973b69SYang Wang ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
134037973b69SYang Wang 				  struct aca_handle *handle, char *buf, void *data)
134137973b69SYang Wang {
134237973b69SYang Wang 	struct ras_manager *obj = container_of(handle, struct ras_manager, aca_handle);
134337973b69SYang Wang 	struct ras_query_if info = {
134437973b69SYang Wang 		.head = obj->head,
134537973b69SYang Wang 	};
134637973b69SYang Wang 
134778347b65SYiPeng Chai 	if (!amdgpu_ras_get_error_query_ready(obj->adev))
134878347b65SYiPeng Chai 		return sysfs_emit(buf, "Query currently inaccessible\n");
134978347b65SYiPeng Chai 
135037973b69SYang Wang 	if (amdgpu_ras_query_error_status(obj->adev, &info))
135137973b69SYang Wang 		return -EINVAL;
135237973b69SYang Wang 
1353865d3397SYang Wang 	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
13543c603b1fSYang Wang 			  "ce", info.ce_count, "de", info.de_count);
135537973b69SYang Wang }
135637973b69SYang Wang 
amdgpu_ras_query_error_status_helper(struct amdgpu_device * adev,struct ras_query_if * info,struct ras_err_data * err_data,struct ras_query_context * qctx,unsigned int error_query_mode)13578cc0f566SHawking Zhang static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
13588cc0f566SHawking Zhang 						struct ras_query_if *info,
13598cc0f566SHawking Zhang 						struct ras_err_data *err_data,
13609dc57c2aSYang Wang 						struct ras_query_context *qctx,
13618cc0f566SHawking Zhang 						unsigned int error_query_mode)
1362c030f2e4Sxinhui pan {
13638cc0f566SHawking Zhang 	enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
13648b0fb0e9Syipechai 	struct amdgpu_ras_block_object *block_obj = NULL;
136504c4fcd2SYang Wang 	int ret;
1366c030f2e4Sxinhui pan 
1367b8d55a90SSrinivasan Shanmugam 	if (blk == AMDGPU_RAS_BLOCK_COUNT)
1368b8d55a90SSrinivasan Shanmugam 		return -EINVAL;
1369b8d55a90SSrinivasan Shanmugam 
13708cc0f566SHawking Zhang 	if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
1371c030f2e4Sxinhui pan 		return -EINVAL;
1372c030f2e4Sxinhui pan 
137384a2947eSVictor Skvortsov 	if (error_query_mode == AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) {
137484a2947eSVictor Skvortsov 		return amdgpu_virt_req_ras_err_count(adev, blk, err_data);
137584a2947eSVictor Skvortsov 	} else if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
13767389a5b8Syipechai 		if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
13778cc0f566SHawking Zhang 			amdgpu_ras_get_ecc_info(adev, err_data);
13787389a5b8Syipechai 		} else {
13797389a5b8Syipechai 			block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
13808b0fb0e9Syipechai 			if (!block_obj || !block_obj->hw_ops) {
1381afa37315SLuben Tuikov 				dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
13828b0fb0e9Syipechai 					     get_ras_block_str(&info->head));
13838cc0f566SHawking Zhang 				return -EINVAL;
13843e81ee9aSHawking Zhang 			}
1385761d86d3SDennis Li 
13868b0fb0e9Syipechai 			if (block_obj->hw_ops->query_ras_error_count)
138707ee43faSYang Wang 				block_obj->hw_ops->query_ras_error_count(adev, err_data);
1388761d86d3SDennis Li 
13897389a5b8Syipechai 			if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
13907389a5b8Syipechai 			    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
13917389a5b8Syipechai 			    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
13928b0fb0e9Syipechai 				if (block_obj->hw_ops->query_ras_error_status)
13938b0fb0e9Syipechai 					block_obj->hw_ops->query_ras_error_status(adev);
13946c245386Syipechai 			}
1395939e2258SHawking Zhang 		}
13968cc0f566SHawking Zhang 	} else {
139704c4fcd2SYang Wang 		if (amdgpu_aca_is_enabled(adev)) {
139831fd330bSYang Wang 			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx);
139904c4fcd2SYang Wang 			if (ret)
140004c4fcd2SYang Wang 				return ret;
140104c4fcd2SYang Wang 
140231fd330bSYang Wang 			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx);
140304c4fcd2SYang Wang 			if (ret)
140404c4fcd2SYang Wang 				return ret;
1405865d3397SYang Wang 
140631fd330bSYang Wang 			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx);
1407865d3397SYang Wang 			if (ret)
1408865d3397SYang Wang 				return ret;
140904c4fcd2SYang Wang 		} else {
14108cc0f566SHawking Zhang 			/* FIXME: add code to check return value later */
14119dc57c2aSYang Wang 			amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
14129dc57c2aSYang Wang 			amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx);
14138cc0f566SHawking Zhang 		}
141404c4fcd2SYang Wang 	}
14158cc0f566SHawking Zhang 
14168cc0f566SHawking Zhang 	return 0;
14178cc0f566SHawking Zhang }
14188cc0f566SHawking Zhang 
14198cc0f566SHawking Zhang /* query/inject/cure begin */
amdgpu_ras_query_error_status_with_event(struct amdgpu_device * adev,struct ras_query_if * info,enum ras_event_type type)142075ac6a25SYang Wang static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev,
142175ac6a25SYang Wang 						    struct ras_query_if *info,
142275ac6a25SYang Wang 						    enum ras_event_type type)
14238cc0f566SHawking Zhang {
14248cc0f566SHawking Zhang 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
14258cc0f566SHawking Zhang 	struct ras_err_data err_data;
14269dc57c2aSYang Wang 	struct ras_query_context qctx;
14278cc0f566SHawking Zhang 	unsigned int error_query_mode;
14288cc0f566SHawking Zhang 	int ret;
14298cc0f566SHawking Zhang 
14308cc0f566SHawking Zhang 	if (!obj)
14318cc0f566SHawking Zhang 		return -EINVAL;
14328cc0f566SHawking Zhang 
14338cc0f566SHawking Zhang 	ret = amdgpu_ras_error_data_init(&err_data);
14348cc0f566SHawking Zhang 	if (ret)
14358cc0f566SHawking Zhang 		return ret;
14368cc0f566SHawking Zhang 
14378cc0f566SHawking Zhang 	if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
14388cc0f566SHawking Zhang 		return -EINVAL;
14398cc0f566SHawking Zhang 
14409dc57c2aSYang Wang 	memset(&qctx, 0, sizeof(qctx));
144175ac6a25SYang Wang 	qctx.evid.type = type;
144275ac6a25SYang Wang 	qctx.evid.event_id = amdgpu_ras_acquire_event_id(adev, type);
1443f852c979SYiPeng Chai 
1444f852c979SYiPeng Chai 	if (!down_read_trylock(&adev->reset_domain->sem)) {
1445f852c979SYiPeng Chai 		ret = -EIO;
1446f852c979SYiPeng Chai 		goto out_fini_err_data;
1447f852c979SYiPeng Chai 	}
1448f852c979SYiPeng Chai 
14498cc0f566SHawking Zhang 	ret = amdgpu_ras_query_error_status_helper(adev, info,
14508cc0f566SHawking Zhang 						   &err_data,
14519dc57c2aSYang Wang 						   &qctx,
14528cc0f566SHawking Zhang 						   error_query_mode);
1453f852c979SYiPeng Chai 	up_read(&adev->reset_domain->sem);
14548cc0f566SHawking Zhang 	if (ret)
14558cc0f566SHawking Zhang 		goto out_fini_err_data;
145605a58345STao Zhou 
145784a2947eSVictor Skvortsov 	if (error_query_mode != AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) {
1458ec3e0a91SYang Wang 		amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
145984a2947eSVictor Skvortsov 		amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
146084a2947eSVictor Skvortsov 	} else {
146184a2947eSVictor Skvortsov 		/* Host provides absolute error counts. First generate the report
146284a2947eSVictor Skvortsov 		 * using the previous VF internal count against new host count.
146384a2947eSVictor Skvortsov 		 * Then Update VF internal count.
146484a2947eSVictor Skvortsov 		 */
146584a2947eSVictor Skvortsov 		amdgpu_ras_virt_error_generate_report(adev, info, &err_data, &qctx);
146684a2947eSVictor Skvortsov 		amdgpu_ras_mgr_virt_error_data_statistics_update(obj, &err_data);
146784a2947eSVictor Skvortsov 	}
146805a58345STao Zhou 
1469c030f2e4Sxinhui pan 	info->ue_count = obj->err_data.ue_count;
1470c030f2e4Sxinhui pan 	info->ce_count = obj->err_data.ce_count;
147146e2231cSCandice Li 	info->de_count = obj->err_data.de_count;
1472c030f2e4Sxinhui pan 
14735b1270beSYang Wang out_fini_err_data:
14745b1270beSYang Wang 	amdgpu_ras_error_data_fini(&err_data);
14755b1270beSYang Wang 
14765b1270beSYang Wang 	return ret;
1477c030f2e4Sxinhui pan }
1478c030f2e4Sxinhui pan 
amdgpu_ras_query_error_status(struct amdgpu_device * adev,struct ras_query_if * info)147975ac6a25SYang Wang int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
148075ac6a25SYang Wang {
148175ac6a25SYang Wang 	return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID);
148275ac6a25SYang Wang }
148375ac6a25SYang Wang 
amdgpu_ras_reset_error_count(struct amdgpu_device * adev,enum amdgpu_ras_block block)1484472c5fb2STao Zhou int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
1485761d86d3SDennis Li 		enum amdgpu_ras_block block)
1486761d86d3SDennis Li {
14878b0fb0e9Syipechai 	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
148873582be1STao Zhou 	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
148904c4fcd2SYang Wang 	const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
14908b0fb0e9Syipechai 
14918b0fb0e9Syipechai 	if (!block_obj || !block_obj->hw_ops) {
1492afa37315SLuben Tuikov 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1493b6efdb02Syipechai 				ras_block_str(block));
1494472c5fb2STao Zhou 		return -EOPNOTSUPP;
14958b0fb0e9Syipechai 	}
1496761d86d3SDennis Li 
1497d1d4c0b7STao Zhou 	if (!amdgpu_ras_is_supported(adev, block) ||
149804c4fcd2SYang Wang 	    !amdgpu_ras_get_aca_debug_mode(adev))
1499d1d4c0b7STao Zhou 		return -EOPNOTSUPP;
1500d1d4c0b7STao Zhou 
150173582be1STao Zhou 	/* skip ras error reset in gpu reset */
15027e437167STao Zhou 	if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) &&
150304c4fcd2SYang Wang 	    ((smu_funcs && smu_funcs->set_debug_mode) ||
150404c4fcd2SYang Wang 	     (mca_funcs && mca_funcs->mca_set_debug_mode)))
150573582be1STao Zhou 		return -EOPNOTSUPP;
150673582be1STao Zhou 
15078b0fb0e9Syipechai 	if (block_obj->hw_ops->reset_ras_error_count)
15088b0fb0e9Syipechai 		block_obj->hw_ops->reset_ras_error_count(adev);
15097780f503SDennis Li 
1510472c5fb2STao Zhou 	return 0;
1511472c5fb2STao Zhou }
1512472c5fb2STao Zhou 
amdgpu_ras_reset_error_status(struct amdgpu_device * adev,enum amdgpu_ras_block block)1513472c5fb2STao Zhou int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
1514472c5fb2STao Zhou 		enum amdgpu_ras_block block)
1515472c5fb2STao Zhou {
1516472c5fb2STao Zhou 	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
1517472c5fb2STao Zhou 
1518472c5fb2STao Zhou 	if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP)
1519472c5fb2STao Zhou 		return 0;
1520472c5fb2STao Zhou 
15217389a5b8Syipechai 	if ((block == AMDGPU_RAS_BLOCK__GFX) ||
15227389a5b8Syipechai 	    (block == AMDGPU_RAS_BLOCK__MMHUB)) {
15238b0fb0e9Syipechai 		if (block_obj->hw_ops->reset_ras_error_status)
15248b0fb0e9Syipechai 			block_obj->hw_ops->reset_ras_error_status(adev);
1525761d86d3SDennis Li 	}
1526761d86d3SDennis Li 
1527761d86d3SDennis Li 	return 0;
1528761d86d3SDennis Li }
1529761d86d3SDennis Li 
1530c030f2e4Sxinhui pan /* wrapper of psp_ras_trigger_error */
amdgpu_ras_error_inject(struct amdgpu_device * adev,struct ras_inject_if * info)1531c030f2e4Sxinhui pan int amdgpu_ras_error_inject(struct amdgpu_device *adev,
1532c030f2e4Sxinhui pan 		struct ras_inject_if *info)
1533c030f2e4Sxinhui pan {
1534c030f2e4Sxinhui pan 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1535c030f2e4Sxinhui pan 	struct ta_ras_trigger_error_input block_info = {
1536828cfa29Sxinhui pan 		.block_id =  amdgpu_ras_block_to_ta(info->head.block),
1537828cfa29Sxinhui pan 		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
1538c030f2e4Sxinhui pan 		.sub_block_index = info->head.sub_block_index,
1539c030f2e4Sxinhui pan 		.address = info->address,
1540c030f2e4Sxinhui pan 		.value = info->value,
1541c030f2e4Sxinhui pan 	};
15428b0fb0e9Syipechai 	int ret = -EINVAL;
1543ab3b9de6SYang Li 	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
1544ab3b9de6SYang Li 							info->head.block,
1545ab3b9de6SYang Li 							info->head.sub_block_index);
1546c030f2e4Sxinhui pan 
1547248c9635STao Zhou 	/* inject on guest isn't allowed, return success directly */
1548248c9635STao Zhou 	if (amdgpu_sriov_vf(adev))
1549248c9635STao Zhou 		return 0;
1550248c9635STao Zhou 
1551c030f2e4Sxinhui pan 	if (!obj)
1552c030f2e4Sxinhui pan 		return -EINVAL;
1553c030f2e4Sxinhui pan 
155422d4ba53Syipechai 	if (!block_obj || !block_obj->hw_ops)	{
1555afa37315SLuben Tuikov 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1556b6efdb02Syipechai 			     get_ras_block_str(&info->head));
155722d4ba53Syipechai 		return -EINVAL;
155822d4ba53Syipechai 	}
155922d4ba53Syipechai 
1560a6c44d25SJohn Clements 	/* Calculate XGMI relative offset */
1561a80fe1a6STao Zhou 	if (adev->gmc.xgmi.num_physical_nodes > 1 &&
1562a80fe1a6STao Zhou 	    info->head.block != AMDGPU_RAS_BLOCK__GFX) {
156319744f5fSHawking Zhang 		block_info.address =
156419744f5fSHawking Zhang 			amdgpu_xgmi_get_relative_phy_addr(adev,
1565a6c44d25SJohn Clements 							  block_info.address);
1566a6c44d25SJohn Clements 	}
1567a6c44d25SJohn Clements 
156827c5f295STao Zhou 	if (block_obj->hw_ops->ras_error_inject) {
156927c5f295STao Zhou 		if (info->head.block == AMDGPU_RAS_BLOCK__GFX)
15702c22ed0bSTao Zhou 			ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask);
157127c5f295STao Zhou 		else /* Special ras_error_inject is defined (e.g: xgmi) */
15722c22ed0bSTao Zhou 			ret = block_obj->hw_ops->ras_error_inject(adev, &block_info,
15732c22ed0bSTao Zhou 						info->instance_mask);
157427c5f295STao Zhou 	} else {
157527c5f295STao Zhou 		/* default path */
15762c22ed0bSTao Zhou 		ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask);
1577a5dd40caSHawking Zhang 	}
1578a5dd40caSHawking Zhang 
1579011907fdSDennis Li 	if (ret)
1580011907fdSDennis Li 		dev_err(adev->dev, "ras inject %s failed %d\n",
1581640ae42eSJohn Clements 			get_ras_block_str(&info->head), ret);
1582c030f2e4Sxinhui pan 
1583c030f2e4Sxinhui pan 	return ret;
1584c030f2e4Sxinhui pan }
1585c030f2e4Sxinhui pan 
15864d9f771eSLuben Tuikov /**
15874a1c9a44SHawking Zhang  * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
15884a1c9a44SHawking Zhang  * @adev: pointer to AMD GPU device
15894a1c9a44SHawking Zhang  * @ce_count: pointer to an integer to be set to the count of correctible errors.
15904a1c9a44SHawking Zhang  * @ue_count: pointer to an integer to be set to the count of uncorrectible errors.
15914a1c9a44SHawking Zhang  * @query_info: pointer to ras_query_if
15924a1c9a44SHawking Zhang  *
15934a1c9a44SHawking Zhang  * Return 0 for query success or do nothing, otherwise return an error
15944a1c9a44SHawking Zhang  * on failures
15954a1c9a44SHawking Zhang  */
amdgpu_ras_query_error_count_helper(struct amdgpu_device * adev,unsigned long * ce_count,unsigned long * ue_count,struct ras_query_if * query_info)15964a1c9a44SHawking Zhang static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
15974a1c9a44SHawking Zhang 					       unsigned long *ce_count,
15984a1c9a44SHawking Zhang 					       unsigned long *ue_count,
15994a1c9a44SHawking Zhang 					       struct ras_query_if *query_info)
16004a1c9a44SHawking Zhang {
16014a1c9a44SHawking Zhang 	int ret;
16024a1c9a44SHawking Zhang 
16034a1c9a44SHawking Zhang 	if (!query_info)
16044a1c9a44SHawking Zhang 		/* do nothing if query_info is not specified */
16054a1c9a44SHawking Zhang 		return 0;
16064a1c9a44SHawking Zhang 
16074a1c9a44SHawking Zhang 	ret = amdgpu_ras_query_error_status(adev, query_info);
16084a1c9a44SHawking Zhang 	if (ret)
16094a1c9a44SHawking Zhang 		return ret;
16104a1c9a44SHawking Zhang 
16114a1c9a44SHawking Zhang 	*ce_count += query_info->ce_count;
16124a1c9a44SHawking Zhang 	*ue_count += query_info->ue_count;
16134a1c9a44SHawking Zhang 
16144a1c9a44SHawking Zhang 	/* some hardware/IP supports read to clear
16154a1c9a44SHawking Zhang 	 * no need to explictly reset the err status after the query call */
16164e8303cfSLijo Lazar 	if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
16174e8303cfSLijo Lazar 	    amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
16184a1c9a44SHawking Zhang 		if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
16194a1c9a44SHawking Zhang 			dev_warn(adev->dev,
16204a1c9a44SHawking Zhang 				 "Failed to reset error counter and error status\n");
16214a1c9a44SHawking Zhang 	}
16224a1c9a44SHawking Zhang 
16234a1c9a44SHawking Zhang 	return 0;
16244a1c9a44SHawking Zhang }
16254a1c9a44SHawking Zhang 
16264a1c9a44SHawking Zhang /**
16274a1c9a44SHawking Zhang  * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
1628bbe04decSIsabella Basso  * @adev: pointer to AMD GPU device
1629bbe04decSIsabella Basso  * @ce_count: pointer to an integer to be set to the count of correctible errors.
1630bbe04decSIsabella Basso  * @ue_count: pointer to an integer to be set to the count of uncorrectible
16314d9f771eSLuben Tuikov  * errors.
16324a1c9a44SHawking Zhang  * @query_info: pointer to ras_query_if if the query request is only for
16334a1c9a44SHawking Zhang  * specific ip block; if info is NULL, then the qurey request is for
16344a1c9a44SHawking Zhang  * all the ip blocks that support query ras error counters/status
16354d9f771eSLuben Tuikov  *
16364d9f771eSLuben Tuikov  * If set, @ce_count or @ue_count, count and return the corresponding
16374d9f771eSLuben Tuikov  * error counts in those integer pointers. Return 0 if the device
16384d9f771eSLuben Tuikov  * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
16394d9f771eSLuben Tuikov  */
amdgpu_ras_query_error_count(struct amdgpu_device * adev,unsigned long * ce_count,unsigned long * ue_count,struct ras_query_if * query_info)16404d9f771eSLuben Tuikov int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
1641a46751fbSLuben Tuikov 				 unsigned long *ce_count,
16424a1c9a44SHawking Zhang 				 unsigned long *ue_count,
16434a1c9a44SHawking Zhang 				 struct ras_query_if *query_info)
1644c030f2e4Sxinhui pan {
1645c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1646c030f2e4Sxinhui pan 	struct ras_manager *obj;
1647a46751fbSLuben Tuikov 	unsigned long ce, ue;
16484a1c9a44SHawking Zhang 	int ret;
1649c030f2e4Sxinhui pan 
16508ab0d6f0SLuben Tuikov 	if (!adev->ras_enabled || !con)
16514d9f771eSLuben Tuikov 		return -EOPNOTSUPP;
16524d9f771eSLuben Tuikov 
16534d9f771eSLuben Tuikov 	/* Don't count since no reporting.
16544d9f771eSLuben Tuikov 	 */
16554d9f771eSLuben Tuikov 	if (!ce_count && !ue_count)
16564d9f771eSLuben Tuikov 		return 0;
1657c030f2e4Sxinhui pan 
1658a46751fbSLuben Tuikov 	ce = 0;
1659a46751fbSLuben Tuikov 	ue = 0;
16604a1c9a44SHawking Zhang 	if (!query_info) {
16614a1c9a44SHawking Zhang 		/* query all the ip blocks that support ras query interface */
1662c030f2e4Sxinhui pan 		list_for_each_entry(obj, &con->head, node) {
1663c030f2e4Sxinhui pan 			struct ras_query_if info = {
1664c030f2e4Sxinhui pan 				.head = obj->head,
1665c030f2e4Sxinhui pan 			};
1666c030f2e4Sxinhui pan 
16674a1c9a44SHawking Zhang 			ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info);
16684a1c9a44SHawking Zhang 		}
16694a1c9a44SHawking Zhang 	} else {
16704a1c9a44SHawking Zhang 		/* query specific ip block */
16714a1c9a44SHawking Zhang 		ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info);
16722a460963SCandice Li 	}
16732a460963SCandice Li 
16744a1c9a44SHawking Zhang 	if (ret)
16754a1c9a44SHawking Zhang 		return ret;
1676c030f2e4Sxinhui pan 
1677a46751fbSLuben Tuikov 	if (ce_count)
1678a46751fbSLuben Tuikov 		*ce_count = ce;
1679a46751fbSLuben Tuikov 
1680a46751fbSLuben Tuikov 	if (ue_count)
1681a46751fbSLuben Tuikov 		*ue_count = ue;
16824d9f771eSLuben Tuikov 
16834d9f771eSLuben Tuikov 	return 0;
1684c030f2e4Sxinhui pan }
1685c030f2e4Sxinhui pan /* query/inject/cure end */
1686c030f2e4Sxinhui pan 
1687c030f2e4Sxinhui pan 
1688c030f2e4Sxinhui pan /* sysfs begin */
1689c030f2e4Sxinhui pan 
1690466b1793Sxinhui pan static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1691466b1793Sxinhui pan 		struct ras_badpage **bps, unsigned int *count);
1692466b1793Sxinhui pan 
amdgpu_ras_badpage_flags_str(unsigned int flags)1693466b1793Sxinhui pan static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
1694466b1793Sxinhui pan {
1695466b1793Sxinhui pan 	switch (flags) {
169652dd95f2SGuchun Chen 	case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
1697466b1793Sxinhui pan 		return "R";
169852dd95f2SGuchun Chen 	case AMDGPU_RAS_RETIRE_PAGE_PENDING:
1699466b1793Sxinhui pan 		return "P";
170052dd95f2SGuchun Chen 	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
1701466b1793Sxinhui pan 	default:
1702466b1793Sxinhui pan 		return "F";
1703aec576f9STom Rix 	}
1704466b1793Sxinhui pan }
1705466b1793Sxinhui pan 
1706f77c7109SAlex Deucher /**
1707f77c7109SAlex Deucher  * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
1708466b1793Sxinhui pan  *
1709466b1793Sxinhui pan  * It allows user to read the bad pages of vram on the gpu through
1710466b1793Sxinhui pan  * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
1711466b1793Sxinhui pan  *
1712466b1793Sxinhui pan  * It outputs multiple lines, and each line stands for one gpu page.
1713466b1793Sxinhui pan  *
1714466b1793Sxinhui pan  * The format of one line is below,
1715466b1793Sxinhui pan  * gpu pfn : gpu page size : flags
1716466b1793Sxinhui pan  *
1717466b1793Sxinhui pan  * gpu pfn and gpu page size are printed in hex format.
1718466b1793Sxinhui pan  * flags can be one of below character,
1719f77c7109SAlex Deucher  *
1720466b1793Sxinhui pan  * R: reserved, this gpu page is reserved and not able to use.
1721f77c7109SAlex Deucher  *
1722466b1793Sxinhui pan  * P: pending for reserve, this gpu page is marked as bad, will be reserved
1723466b1793Sxinhui pan  * in next window of page_reserve.
1724f77c7109SAlex Deucher  *
1725466b1793Sxinhui pan  * F: unable to reserve. this gpu page can't be reserved due to some reasons.
1726466b1793Sxinhui pan  *
1727f77c7109SAlex Deucher  * Examples:
1728f77c7109SAlex Deucher  *
1729f77c7109SAlex Deucher  * .. code-block:: bash
1730f77c7109SAlex Deucher  *
1731466b1793Sxinhui pan  *	0x00000001 : 0x00001000 : R
1732466b1793Sxinhui pan  *	0x00000002 : 0x00001000 : P
1733f77c7109SAlex Deucher  *
1734466b1793Sxinhui pan  */
1735466b1793Sxinhui pan 
amdgpu_ras_sysfs_badpages_read(struct file * f,struct kobject * kobj,const struct bin_attribute * attr,char * buf,loff_t ppos,size_t count)1736466b1793Sxinhui pan static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
1737466b1793Sxinhui pan 		struct kobject *kobj, const struct bin_attribute *attr,
1738466b1793Sxinhui pan 		char *buf, loff_t ppos, size_t count)
1739466b1793Sxinhui pan {
1740466b1793Sxinhui pan 	struct amdgpu_ras *con =
1741466b1793Sxinhui pan 		container_of(attr, struct amdgpu_ras, badpages_attr);
1742466b1793Sxinhui pan 	struct amdgpu_device *adev = con->adev;
1743466b1793Sxinhui pan 	const unsigned int element_size =
1744466b1793Sxinhui pan 		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
1745d6ee400eSSlava Abramov 	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
1746d6ee400eSSlava Abramov 	unsigned int end = div64_ul(ppos + count - 1, element_size);
1747466b1793Sxinhui pan 	ssize_t s = 0;
1748466b1793Sxinhui pan 	struct ras_badpage *bps = NULL;
1749466b1793Sxinhui pan 	unsigned int bps_count = 0;
1750466b1793Sxinhui pan 
1751466b1793Sxinhui pan 	memset(buf, 0, count);
1752466b1793Sxinhui pan 
1753466b1793Sxinhui pan 	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
1754466b1793Sxinhui pan 		return 0;
1755466b1793Sxinhui pan 
1756466b1793Sxinhui pan 	for (; start < end && start < bps_count; start++)
1757466b1793Sxinhui pan 		s += scnprintf(&buf[s], element_size + 1,
1758466b1793Sxinhui pan 				"0x%08x : 0x%08x : %1s\n",
1759466b1793Sxinhui pan 				bps[start].bp,
1760466b1793Sxinhui pan 				bps[start].size,
1761466b1793Sxinhui pan 				amdgpu_ras_badpage_flags_str(bps[start].flags));
1762466b1793Sxinhui pan 
1763466b1793Sxinhui pan 	kfree(bps);
1764466b1793Sxinhui pan 
1765466b1793Sxinhui pan 	return s;
1766466b1793Sxinhui pan }
1767466b1793Sxinhui pan 
amdgpu_ras_sysfs_features_read(struct device * dev,struct device_attribute * attr,char * buf)1768c030f2e4Sxinhui pan static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
1769c030f2e4Sxinhui pan 		struct device_attribute *attr, char *buf)
1770c030f2e4Sxinhui pan {
1771c030f2e4Sxinhui pan 	struct amdgpu_ras *con =
1772c030f2e4Sxinhui pan 		container_of(attr, struct amdgpu_ras, features_attr);
1773c030f2e4Sxinhui pan 
17742cffcb66Sye xingchen 	return sysfs_emit(buf, "feature mask: 0x%x\n", con->features);
1775c030f2e4Sxinhui pan }
1776c030f2e4Sxinhui pan 
amdgpu_ras_sysfs_version_show(struct device * dev,struct device_attribute * attr,char * buf)1777625e5f38SAsad Kamal static ssize_t amdgpu_ras_sysfs_version_show(struct device *dev,
1778625e5f38SAsad Kamal 		struct device_attribute *attr, char *buf)
1779625e5f38SAsad Kamal {
1780625e5f38SAsad Kamal 	struct amdgpu_ras *con =
1781625e5f38SAsad Kamal 		container_of(attr, struct amdgpu_ras, version_attr);
1782625e5f38SAsad Kamal 	return sysfs_emit(buf, "table version: 0x%x\n", con->eeprom_control.tbl_hdr.version);
1783625e5f38SAsad Kamal }
1784625e5f38SAsad Kamal 
amdgpu_ras_sysfs_schema_show(struct device * dev,struct device_attribute * attr,char * buf)1785625e5f38SAsad Kamal static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev,
1786625e5f38SAsad Kamal 		struct device_attribute *attr, char *buf)
1787625e5f38SAsad Kamal {
1788625e5f38SAsad Kamal 	struct amdgpu_ras *con =
1789625e5f38SAsad Kamal 		container_of(attr, struct amdgpu_ras, schema_attr);
1790625e5f38SAsad Kamal 	return sysfs_emit(buf, "schema: 0x%x\n", con->schema);
1791625e5f38SAsad Kamal }
1792625e5f38SAsad Kamal 
179359f488beSYang Wang static struct {
179459f488beSYang Wang 	enum ras_event_type type;
179559f488beSYang Wang 	const char *name;
179659f488beSYang Wang } dump_event[] = {
179759f488beSYang Wang 	{RAS_EVENT_TYPE_FATAL, "Fatal Error"},
179859f488beSYang Wang 	{RAS_EVENT_TYPE_POISON_CREATION, "Poison Creation"},
179959f488beSYang Wang 	{RAS_EVENT_TYPE_POISON_CONSUMPTION, "Poison Consumption"},
180059f488beSYang Wang };
180159f488beSYang Wang 
amdgpu_ras_sysfs_event_state_show(struct device * dev,struct device_attribute * attr,char * buf)180259f488beSYang Wang static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev,
180359f488beSYang Wang 						 struct device_attribute *attr, char *buf)
180459f488beSYang Wang {
180559f488beSYang Wang 	struct amdgpu_ras *con =
180659f488beSYang Wang 		container_of(attr, struct amdgpu_ras, event_state_attr);
180759f488beSYang Wang 	struct ras_event_manager *event_mgr = con->event_mgr;
180859f488beSYang Wang 	struct ras_event_state *event_state;
180959f488beSYang Wang 	int i, size = 0;
181059f488beSYang Wang 
181159f488beSYang Wang 	if (!event_mgr)
181259f488beSYang Wang 		return -EINVAL;
181359f488beSYang Wang 
181459f488beSYang Wang 	size += sysfs_emit_at(buf, size, "current seqno: %llu\n", atomic64_read(&event_mgr->seqno));
181559f488beSYang Wang 	for (i = 0; i < ARRAY_SIZE(dump_event); i++) {
181659f488beSYang Wang 		event_state = &event_mgr->event_state[dump_event[i].type];
181759f488beSYang Wang 		size += sysfs_emit_at(buf, size, "%s: count:%llu, last_seqno:%llu\n",
181859f488beSYang Wang 				      dump_event[i].name,
181959f488beSYang Wang 				      atomic64_read(&event_state->count),
182059f488beSYang Wang 				      event_state->last_seqno);
182159f488beSYang Wang 	}
182259f488beSYang Wang 
182359f488beSYang Wang 	return (ssize_t)size;
182459f488beSYang Wang }
182559f488beSYang Wang 
amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device * adev)1826f848159bSGuchun Chen static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
1827f848159bSGuchun Chen {
1828f848159bSGuchun Chen 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1829f848159bSGuchun Chen 
18304638e0c2SVitaly Prosyak 	if (adev->dev->kobj.sd)
1831f848159bSGuchun Chen 		sysfs_remove_file_from_group(&adev->dev->kobj,
1832f848159bSGuchun Chen 				&con->badpages_attr.attr,
1833f848159bSGuchun Chen 				RAS_FS_NAME);
1834f848159bSGuchun Chen }
1835f848159bSGuchun Chen 
amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device * adev)1836625e5f38SAsad Kamal static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev)
1837c030f2e4Sxinhui pan {
1838c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1839c030f2e4Sxinhui pan 	struct attribute *attrs[] = {
1840c030f2e4Sxinhui pan 		&con->features_attr.attr,
1841625e5f38SAsad Kamal 		&con->version_attr.attr,
1842625e5f38SAsad Kamal 		&con->schema_attr.attr,
184359f488beSYang Wang 		&con->event_state_attr.attr,
1844c030f2e4Sxinhui pan 		NULL
1845c030f2e4Sxinhui pan 	};
1846c030f2e4Sxinhui pan 	struct attribute_group group = {
1847eb0c3cd4SGuchun Chen 		.name = RAS_FS_NAME,
1848c030f2e4Sxinhui pan 		.attrs = attrs,
1849c030f2e4Sxinhui pan 	};
1850c030f2e4Sxinhui pan 
18514638e0c2SVitaly Prosyak 	if (adev->dev->kobj.sd)
1852c030f2e4Sxinhui pan 		sysfs_remove_group(&adev->dev->kobj, &group);
1853c030f2e4Sxinhui pan 
1854c030f2e4Sxinhui pan 	return 0;
1855c030f2e4Sxinhui pan }
1856c030f2e4Sxinhui pan 
amdgpu_ras_sysfs_create(struct amdgpu_device * adev,struct ras_common_if * head)1857c030f2e4Sxinhui pan int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
18589252d33dSyipechai 		struct ras_common_if *head)
1859c030f2e4Sxinhui pan {
18609252d33dSyipechai 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1861c030f2e4Sxinhui pan 
18629262f411SYang Wang 	if (amdgpu_aca_is_enabled(adev))
18639262f411SYang Wang 		return 0;
18649262f411SYang Wang 
1865c030f2e4Sxinhui pan 	if (!obj || obj->attr_inuse)
1866c030f2e4Sxinhui pan 		return -EINVAL;
1867c030f2e4Sxinhui pan 
186804893397SVictor Skvortsov 	if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block))
186904893397SVictor Skvortsov 		return 0;
187004893397SVictor Skvortsov 
1871c030f2e4Sxinhui pan 	get_obj(obj);
1872c030f2e4Sxinhui pan 
18739252d33dSyipechai 	snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
18749252d33dSyipechai 		"%s_err_count", head->name);
1875c030f2e4Sxinhui pan 
1876c030f2e4Sxinhui pan 	obj->sysfs_attr = (struct device_attribute){
1877c030f2e4Sxinhui pan 		.attr = {
1878c030f2e4Sxinhui pan 			.name = obj->fs_data.sysfs_name,
1879c030f2e4Sxinhui pan 			.mode = S_IRUGO,
1880c030f2e4Sxinhui pan 		},
1881c030f2e4Sxinhui pan 			.show = amdgpu_ras_sysfs_read,
1882c030f2e4Sxinhui pan 	};
1883163def43Sxinhui pan 	sysfs_attr_init(&obj->sysfs_attr.attr);
1884c030f2e4Sxinhui pan 
1885c030f2e4Sxinhui pan 	if (sysfs_add_file_to_group(&adev->dev->kobj,
1886c030f2e4Sxinhui pan 				&obj->sysfs_attr.attr,
1887eb0c3cd4SGuchun Chen 				RAS_FS_NAME)) {
1888c030f2e4Sxinhui pan 		put_obj(obj);
1889c030f2e4Sxinhui pan 		return -EINVAL;
1890c030f2e4Sxinhui pan 	}
1891c030f2e4Sxinhui pan 
1892c030f2e4Sxinhui pan 	obj->attr_inuse = 1;
1893c030f2e4Sxinhui pan 
1894c030f2e4Sxinhui pan 	return 0;
1895c030f2e4Sxinhui pan }
1896c030f2e4Sxinhui pan 
amdgpu_ras_sysfs_remove(struct amdgpu_device * adev,struct ras_common_if * head)1897c030f2e4Sxinhui pan int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1898c030f2e4Sxinhui pan 		struct ras_common_if *head)
1899c030f2e4Sxinhui pan {
1900c030f2e4Sxinhui pan 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1901c030f2e4Sxinhui pan 
19029262f411SYang Wang 	if (amdgpu_aca_is_enabled(adev))
19039262f411SYang Wang 		return 0;
19049262f411SYang Wang 
1905c030f2e4Sxinhui pan 	if (!obj || !obj->attr_inuse)
1906c030f2e4Sxinhui pan 		return -EINVAL;
1907c030f2e4Sxinhui pan 
19084638e0c2SVitaly Prosyak 	if (adev->dev->kobj.sd)
1909c030f2e4Sxinhui pan 		sysfs_remove_file_from_group(&adev->dev->kobj,
1910c030f2e4Sxinhui pan 				&obj->sysfs_attr.attr,
1911eb0c3cd4SGuchun Chen 				RAS_FS_NAME);
1912c030f2e4Sxinhui pan 	obj->attr_inuse = 0;
1913c030f2e4Sxinhui pan 	put_obj(obj);
1914c030f2e4Sxinhui pan 
1915c030f2e4Sxinhui pan 	return 0;
1916c030f2e4Sxinhui pan }
1917c030f2e4Sxinhui pan 
amdgpu_ras_sysfs_remove_all(struct amdgpu_device * adev)1918c030f2e4Sxinhui pan static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1919c030f2e4Sxinhui pan {
1920c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1921c030f2e4Sxinhui pan 	struct ras_manager *obj, *tmp;
1922c030f2e4Sxinhui pan 
1923c030f2e4Sxinhui pan 	list_for_each_entry_safe(obj, tmp, &con->head, node) {
1924c030f2e4Sxinhui pan 		amdgpu_ras_sysfs_remove(adev, &obj->head);
1925c030f2e4Sxinhui pan 	}
1926c030f2e4Sxinhui pan 
1927f848159bSGuchun Chen 	if (amdgpu_bad_page_threshold != 0)
1928f848159bSGuchun Chen 		amdgpu_ras_sysfs_remove_bad_page_node(adev);
1929f848159bSGuchun Chen 
1930625e5f38SAsad Kamal 	amdgpu_ras_sysfs_remove_dev_attr_node(adev);
1931c030f2e4Sxinhui pan 
1932c030f2e4Sxinhui pan 	return 0;
1933c030f2e4Sxinhui pan }
1934c030f2e4Sxinhui pan /* sysfs end */
1935c030f2e4Sxinhui pan 
1936ef177d11SAlex Deucher /**
1937ef177d11SAlex Deucher  * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
1938ef177d11SAlex Deucher  *
1939ef177d11SAlex Deucher  * Normally when there is an uncorrectable error, the driver will reset
1940ef177d11SAlex Deucher  * the GPU to recover.  However, in the event of an unrecoverable error,
1941ef177d11SAlex Deucher  * the driver provides an interface to reboot the system automatically
1942ef177d11SAlex Deucher  * in that event.
1943ef177d11SAlex Deucher  *
1944ef177d11SAlex Deucher  * The following file in debugfs provides that interface:
1945ef177d11SAlex Deucher  * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
1946ef177d11SAlex Deucher  *
1947ef177d11SAlex Deucher  * Usage:
1948ef177d11SAlex Deucher  *
1949ef177d11SAlex Deucher  * .. code-block:: bash
1950ef177d11SAlex Deucher  *
1951ef177d11SAlex Deucher  *	echo true > .../ras/auto_reboot
1952ef177d11SAlex Deucher  *
1953ef177d11SAlex Deucher  */
1954c030f2e4Sxinhui pan /* debugfs begin */
amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device * adev)1955ea1b8c9bSNirmoy Das static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
195636ea1bd2Sxinhui pan {
195736ea1bd2Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1958740f42a2SLuben Tuikov 	struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control;
19594a580877SLuben Tuikov 	struct drm_minor  *minor = adev_to_drm(adev)->primary;
1960ef0d7d20SLuben Tuikov 	struct dentry     *dir;
196136ea1bd2Sxinhui pan 
196288293c03SNirmoy Das 	dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
196388293c03SNirmoy Das 	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
196488293c03SNirmoy Das 			    &amdgpu_ras_debugfs_ctrl_ops);
196588293c03SNirmoy Das 	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
196688293c03SNirmoy Das 			    &amdgpu_ras_debugfs_eeprom_ops);
19677fb64071SLuben Tuikov 	debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
19687fb64071SLuben Tuikov 			   &con->bad_page_cnt_threshold);
1969740f42a2SLuben Tuikov 	debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs);
1970ef0d7d20SLuben Tuikov 	debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
1971ef0d7d20SLuben Tuikov 	debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
1972c65b0805SLuben Tuikov 	debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
1973c65b0805SLuben Tuikov 			    &amdgpu_ras_debugfs_eeprom_size_ops);
1974c65b0805SLuben Tuikov 	con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
1975c65b0805SLuben Tuikov 						       S_IRUGO, dir, adev,
1976c65b0805SLuben Tuikov 						       &amdgpu_ras_debugfs_eeprom_table_ops);
1977c65b0805SLuben Tuikov 	amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
1978c688a06bSGuchun Chen 
1979c688a06bSGuchun Chen 	/*
1980c688a06bSGuchun Chen 	 * After one uncorrectable error happens, usually GPU recovery will
1981c688a06bSGuchun Chen 	 * be scheduled. But due to the known problem in GPU recovery failing
1982c688a06bSGuchun Chen 	 * to bring GPU back, below interface provides one direct way to
1983c688a06bSGuchun Chen 	 * user to reboot system automatically in such case within
1984c688a06bSGuchun Chen 	 * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine
1985c688a06bSGuchun Chen 	 * will never be called.
1986c688a06bSGuchun Chen 	 */
198788293c03SNirmoy Das 	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);
198866459e1dSGuchun Chen 
198966459e1dSGuchun Chen 	/*
199066459e1dSGuchun Chen 	 * User could set this not to clean up hardware's error count register
199166459e1dSGuchun Chen 	 * of RAS IPs during ras recovery.
199266459e1dSGuchun Chen 	 */
199388293c03SNirmoy Das 	debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
199488293c03SNirmoy Das 			    &con->disable_ras_err_cnt_harvest);
199588293c03SNirmoy Das 	return dir;
199636ea1bd2Sxinhui pan }
199736ea1bd2Sxinhui pan 
amdgpu_ras_debugfs_create(struct amdgpu_device * adev,struct ras_fs_if * head,struct dentry * dir)1998cedf7884SArnd Bergmann static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
199988293c03SNirmoy Das 				      struct ras_fs_if *head,
200088293c03SNirmoy Das 				      struct dentry *dir)
2001c030f2e4Sxinhui pan {
2002c030f2e4Sxinhui pan 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
2003c030f2e4Sxinhui pan 
200488293c03SNirmoy Das 	if (!obj || !dir)
2005450f30eaSGreg Kroah-Hartman 		return;
2006c030f2e4Sxinhui pan 
2007c030f2e4Sxinhui pan 	get_obj(obj);
2008c030f2e4Sxinhui pan 
2009c030f2e4Sxinhui pan 	memcpy(obj->fs_data.debugfs_name,
2010c030f2e4Sxinhui pan 			head->debugfs_name,
2011c030f2e4Sxinhui pan 			sizeof(obj->fs_data.debugfs_name));
2012c030f2e4Sxinhui pan 
201388293c03SNirmoy Das 	debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
201488293c03SNirmoy Das 			    obj, &amdgpu_ras_debugfs_ops);
2015c030f2e4Sxinhui pan }
2016c030f2e4Sxinhui pan 
amdgpu_ras_aca_is_supported(struct amdgpu_device * adev)20179817f061SYang Wang static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev)
20189817f061SYang Wang {
20199817f061SYang Wang 	bool ret;
20209817f061SYang Wang 
20219817f061SYang Wang 	switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
20229817f061SYang Wang 	case IP_VERSION(13, 0, 6):
20239a826c4aSHawking Zhang 	case IP_VERSION(13, 0, 12):
20249817f061SYang Wang 	case IP_VERSION(13, 0, 14):
20259817f061SYang Wang 		ret = true;
20269817f061SYang Wang 		break;
20279817f061SYang Wang 	default:
20289817f061SYang Wang 		ret = false;
20299817f061SYang Wang 		break;
20309817f061SYang Wang 	}
20319817f061SYang Wang 
20329817f061SYang Wang 	return ret;
20339817f061SYang Wang }
20349817f061SYang Wang 
amdgpu_ras_debugfs_create_all(struct amdgpu_device * adev)2035f9317014STao Zhou void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
2036f9317014STao Zhou {
2037f9317014STao Zhou 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
203888293c03SNirmoy Das 	struct dentry *dir;
2039c1509f3fSStanley.Yang 	struct ras_manager *obj;
2040f9317014STao Zhou 	struct ras_fs_if fs_info;
2041f9317014STao Zhou 
2042f9317014STao Zhou 	/*
2043f9317014STao Zhou 	 * it won't be called in resume path, no need to check
2044f9317014STao Zhou 	 * suspend and gpu reset status
2045f9317014STao Zhou 	 */
2046cedf7884SArnd Bergmann 	if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
2047f9317014STao Zhou 		return;
2048f9317014STao Zhou 
204988293c03SNirmoy Das 	dir = amdgpu_ras_debugfs_create_ctrl_node(adev);
2050f9317014STao Zhou 
2051c1509f3fSStanley.Yang 	list_for_each_entry(obj, &con->head, node) {
2052f9317014STao Zhou 		if (amdgpu_ras_is_supported(adev, obj->head.block) &&
2053f9317014STao Zhou 			(obj->attr_inuse == 1)) {
2054f9317014STao Zhou 			sprintf(fs_info.debugfs_name, "%s_err_inject",
2055640ae42eSJohn Clements 					get_ras_block_str(&obj->head));
2056f9317014STao Zhou 			fs_info.head = obj->head;
205788293c03SNirmoy Das 			amdgpu_ras_debugfs_create(adev, &fs_info, dir);
2058f9317014STao Zhou 		}
2059f9317014STao Zhou 	}
20604051844cSYang Wang 
20619817f061SYang Wang 	if (amdgpu_ras_aca_is_supported(adev)) {
206204c4fcd2SYang Wang 		if (amdgpu_aca_is_enabled(adev))
206304c4fcd2SYang Wang 			amdgpu_aca_smu_debugfs_init(adev, dir);
206404c4fcd2SYang Wang 		else
20654051844cSYang Wang 			amdgpu_mca_smu_debugfs_init(adev, dir);
2066f9317014STao Zhou 	}
20679817f061SYang Wang }
2068f9317014STao Zhou 
2069c030f2e4Sxinhui pan /* debugfs end */
2070c030f2e4Sxinhui pan 
2071c030f2e4Sxinhui pan /* ras fs */
2072c3d4d45dSGuchun Chen static const BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
2073c3d4d45dSGuchun Chen 		      amdgpu_ras_sysfs_badpages_read, NULL, 0);
2074c3d4d45dSGuchun Chen static DEVICE_ATTR(features, S_IRUGO,
2075c3d4d45dSGuchun Chen 		amdgpu_ras_sysfs_features_read, NULL);
2076625e5f38SAsad Kamal static DEVICE_ATTR(version, 0444,
2077625e5f38SAsad Kamal 		amdgpu_ras_sysfs_version_show, NULL);
2078625e5f38SAsad Kamal static DEVICE_ATTR(schema, 0444,
2079625e5f38SAsad Kamal 		amdgpu_ras_sysfs_schema_show, NULL);
208059f488beSYang Wang static DEVICE_ATTR(event_state, 0444,
208159f488beSYang Wang 		   amdgpu_ras_sysfs_event_state_show, NULL);
amdgpu_ras_fs_init(struct amdgpu_device * adev)2082c030f2e4Sxinhui pan static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
2083c030f2e4Sxinhui pan {
2084c3d4d45dSGuchun Chen 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2085c3d4d45dSGuchun Chen 	struct attribute_group group = {
2086c3d4d45dSGuchun Chen 		.name = RAS_FS_NAME,
2087c3d4d45dSGuchun Chen 	};
2088c3d4d45dSGuchun Chen 	struct attribute *attrs[] = {
2089c3d4d45dSGuchun Chen 		&con->features_attr.attr,
2090625e5f38SAsad Kamal 		&con->version_attr.attr,
2091625e5f38SAsad Kamal 		&con->schema_attr.attr,
209259f488beSYang Wang 		&con->event_state_attr.attr,
2093c3d4d45dSGuchun Chen 		NULL
2094c3d4d45dSGuchun Chen 	};
2095c3d4d45dSGuchun Chen 	const struct bin_attribute *bin_attrs[] = {
2096c3d4d45dSGuchun Chen 		NULL,
2097c3d4d45dSGuchun Chen 		NULL,
2098c3d4d45dSGuchun Chen 	};
2099a069a9ebSAlex Deucher 	int r;
2100c030f2e4Sxinhui pan 
2101625e5f38SAsad Kamal 	group.attrs = attrs;
2102625e5f38SAsad Kamal 
2103c3d4d45dSGuchun Chen 	/* add features entry */
2104c3d4d45dSGuchun Chen 	con->features_attr = dev_attr_features;
2105c3d4d45dSGuchun Chen 	sysfs_attr_init(attrs[0]);
2106c3d4d45dSGuchun Chen 
2107625e5f38SAsad Kamal 	/* add version entry */
2108625e5f38SAsad Kamal 	con->version_attr = dev_attr_version;
2109625e5f38SAsad Kamal 	sysfs_attr_init(attrs[1]);
2110625e5f38SAsad Kamal 
2111625e5f38SAsad Kamal 	/* add schema entry */
2112625e5f38SAsad Kamal 	con->schema_attr = dev_attr_schema;
2113625e5f38SAsad Kamal 	sysfs_attr_init(attrs[2]);
2114625e5f38SAsad Kamal 
211559f488beSYang Wang 	/* add event_state entry */
211659f488beSYang Wang 	con->event_state_attr = dev_attr_event_state;
211759f488beSYang Wang 	sysfs_attr_init(attrs[3]);
211859f488beSYang Wang 
2119c3d4d45dSGuchun Chen 	if (amdgpu_bad_page_threshold != 0) {
2120c3d4d45dSGuchun Chen 		/* add bad_page_features entry */
2121c3d4d45dSGuchun Chen 		con->badpages_attr = bin_attr_gpu_vram_bad_pages;
2122c3d4d45dSGuchun Chen 		sysfs_bin_attr_init(&con->badpages_attr);
2123c3d4d45dSGuchun Chen 		bin_attrs[0] = &con->badpages_attr;
2124c3d4d45dSGuchun Chen 		group.bin_attrs_new = bin_attrs;
2125c3d4d45dSGuchun Chen 	}
2126c3d4d45dSGuchun Chen 
2127c3d4d45dSGuchun Chen 	r = sysfs_create_group(&adev->dev->kobj, &group);
2128a069a9ebSAlex Deucher 	if (r)
2129a069a9ebSAlex Deucher 		dev_err(adev->dev, "Failed to create RAS sysfs group!");
2130a069a9ebSAlex Deucher 
2131f848159bSGuchun Chen 	return 0;
2132c030f2e4Sxinhui pan }
2133c030f2e4Sxinhui pan 
amdgpu_ras_fs_fini(struct amdgpu_device * adev)2134c030f2e4Sxinhui pan static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
2135c030f2e4Sxinhui pan {
2136c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
213788293c03SNirmoy Das 	struct ras_manager *con_obj, *ip_obj, *tmp;
213888293c03SNirmoy Das 
213988293c03SNirmoy Das 	if (IS_ENABLED(CONFIG_DEBUG_FS)) {
214088293c03SNirmoy Das 		list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
214188293c03SNirmoy Das 			ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
214288293c03SNirmoy Das 			if (ip_obj)
214388293c03SNirmoy Das 				put_obj(ip_obj);
214488293c03SNirmoy Das 		}
214588293c03SNirmoy Das 	}
214688293c03SNirmoy Das 
214788293c03SNirmoy Das 	amdgpu_ras_sysfs_remove_all(adev);
2148c030f2e4Sxinhui pan 	return 0;
2149c030f2e4Sxinhui pan }
2150c030f2e4Sxinhui pan /* ras fs end */
2151c030f2e4Sxinhui pan 
2152c030f2e4Sxinhui pan /* ih begin */
2153c030f2e4Sxinhui pan 
2154b3c76814STao Zhou /* For the hardware that cannot enable bif ring for both ras_controller_irq
2155b3c76814STao Zhou  * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status
2156b3c76814STao Zhou  * register to check whether the interrupt is triggered or not, and properly
2157b3c76814STao Zhou  * ack the interrupt if it is there
2158b3c76814STao Zhou  */
amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device * adev)2159b3c76814STao Zhou void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
2160b3c76814STao Zhou {
2161b3c76814STao Zhou 	/* Fatal error events are handled on host side */
2162950d6425SStanley.Yang 	if (amdgpu_sriov_vf(adev))
21638eba7205SCandice Li 		return;
2164b3c76814STao Zhou 	/**
2165e1ee2111SLijo Lazar 	 * If the current interrupt is caused by a non-fatal RAS error, skip
2166e1ee2111SLijo Lazar 	 * check for fatal error. For fatal errors, FED status of all devices
2167e1ee2111SLijo Lazar 	 * in XGMI hive gets set when the first device gets fatal error
2168e1ee2111SLijo Lazar 	 * interrupt. The error gets propagated to other devices as well, so
2169e1ee2111SLijo Lazar 	 * make sure to ack the interrupt regardless of FED status.
2170e1ee2111SLijo Lazar 	 */
2171e1ee2111SLijo Lazar 	if (!amdgpu_ras_get_fed_status(adev) &&
2172e1ee2111SLijo Lazar 	    amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY))
2173e1ee2111SLijo Lazar 		return;
2174e1ee2111SLijo Lazar 
2175b3c76814STao Zhou 	if (adev->nbio.ras &&
2176b3c76814STao Zhou 	    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
2177b3c76814STao Zhou 		adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
2178b3c76814STao Zhou 
2179b3c76814STao Zhou 	if (adev->nbio.ras &&
2180b3c76814STao Zhou 	    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
2181b3c76814STao Zhou 		adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
2182b3c76814STao Zhou }
2183b3c76814STao Zhou 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * obj,struct amdgpu_iv_entry * entry)2184b3c76814STao Zhou static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
218566f87949STao Zhou 				struct amdgpu_iv_entry *entry)
218666f87949STao Zhou {
218766f87949STao Zhou 	bool poison_stat = false;
2188b63ac5d3STao Zhou 	struct amdgpu_device *adev = obj->adev;
218966f87949STao Zhou 	struct amdgpu_ras_block_object *block_obj =
219066f87949STao Zhou 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
219166f87949STao Zhou 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
21925f7697bbSTao Zhou 	enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
219312b435a4SYang Wang 	u64 event_id;
219412b435a4SYang Wang 	int ret;
219512b435a4SYang Wang 
219666f87949STao Zhou 	if (!block_obj || !con)
21975f7697bbSTao Zhou 		return;
2198b63ac5d3STao Zhou 
2199b63ac5d3STao Zhou 	ret = amdgpu_ras_mark_ras_event(adev, type);
220012b435a4SYang Wang 	if (ret)
220112b435a4SYang Wang 		return;
220212b435a4SYang Wang 
220312b435a4SYang Wang 	amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block);
2204e1ee2111SLijo Lazar 	/* both query_poison_status and handle_poison_consumption are optional,
2205b63ac5d3STao Zhou 	 * but at least one of them should be implemented if we need poison
2206b63ac5d3STao Zhou 	 * consumption handler
2207b63ac5d3STao Zhou 	 */
2208b63ac5d3STao Zhou 	if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
2209ac7b25d9SYiPeng Chai 		poison_stat = block_obj->hw_ops->query_poison_status(adev);
2210b63ac5d3STao Zhou 		if (!poison_stat) {
2211b63ac5d3STao Zhou 			/* Not poison consumption interrupt, no need to handle it */
2212b63ac5d3STao Zhou 			dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
2213b63ac5d3STao Zhou 					block_obj->ras_comm.name);
2214b63ac5d3STao Zhou 
2215b63ac5d3STao Zhou 			return;
2216b63ac5d3STao Zhou 		}
2217b63ac5d3STao Zhou 	}
2218b63ac5d3STao Zhou 
2219b63ac5d3STao Zhou 	amdgpu_umc_poison_handler(adev, obj->head.block, 0);
22202fc46e0bSTao Zhou 
222166f87949STao Zhou 	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
2222ac7b25d9SYiPeng Chai 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
222366f87949STao Zhou 
222466f87949STao Zhou 	/* gpu reset is fallback for failed and default cases.
22255f7697bbSTao Zhou 	 * For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
22265f7697bbSTao Zhou 	 */
22275f7697bbSTao Zhou 	if (poison_stat && !amdgpu_ras_is_rma(adev)) {
2228792be2e2STao Zhou 		event_id = amdgpu_ras_acquire_event_id(adev, type);
222912b435a4SYang Wang 		RAS_EVENT_LOG(adev, event_id,
223012b435a4SYang Wang 			      "GPU reset for %s RAS poison consumption is issued!\n",
223112b435a4SYang Wang 			      block_obj->ras_comm.name);
2232b63ac5d3STao Zhou 		amdgpu_ras_reset_gpu(adev);
223366f87949STao Zhou 	}
223466f87949STao Zhou 
22355f7697bbSTao Zhou 	if (!poison_stat)
22365f7697bbSTao Zhou 		amdgpu_gfx_poison_consumption_handler(adev, entry);
22375f7697bbSTao Zhou }
2238b63ac5d3STao Zhou 
amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager * obj,struct amdgpu_iv_entry * entry)223966f87949STao Zhou static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
224050a7d025STao Zhou 				struct amdgpu_iv_entry *entry)
224150a7d025STao Zhou {
2242c030f2e4Sxinhui pan 	struct amdgpu_device *adev = obj->adev;
22435b9de259SYang Wang 	enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
22445b9de259SYang Wang 	u64 event_id;
22455b9de259SYang Wang 	int ret;
22465b9de259SYang Wang 
22475b9de259SYang Wang 	ret = amdgpu_ras_mark_ras_event(adev, type);
22485b9de259SYang Wang 	if (ret)
22495b9de259SYang Wang 		return;
22505b9de259SYang Wang 
22515b9de259SYang Wang 	event_id = amdgpu_ras_acquire_event_id(adev, type);
22525b9de259SYang Wang 	RAS_EVENT_LOG(adev, event_id, "Poison is created\n");
22535b9de259SYang Wang 
2254a734adfbSYiPeng Chai 	if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
2255a734adfbSYiPeng Chai 		struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
2256a734adfbSYiPeng Chai 
2257a734adfbSYiPeng Chai 		atomic_inc(&con->page_retirement_req_cnt);
2258a734adfbSYiPeng Chai 		atomic_inc(&con->poison_creation_count);
22595f08275cSYiPeng Chai 
2260a734adfbSYiPeng Chai 		wake_up(&con->page_retirement_wq);
2261a734adfbSYiPeng Chai 	}
2262a734adfbSYiPeng Chai }
226350a7d025STao Zhou 
amdgpu_ras_interrupt_umc_handler(struct ras_manager * obj,struct amdgpu_iv_entry * entry)226450a7d025STao Zhou static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
226550a7d025STao Zhou 				struct amdgpu_iv_entry *entry)
226650a7d025STao Zhou {
226750a7d025STao Zhou 	struct ras_ih_data *data = &obj->ih_data;
226850a7d025STao Zhou 	struct ras_err_data err_data;
22695b1270beSYang Wang 	int ret;
227050a7d025STao Zhou 
227150a7d025STao Zhou 	if (!data->cb)
227250a7d025STao Zhou 		return;
227350a7d025STao Zhou 
227450a7d025STao Zhou 	ret = amdgpu_ras_error_data_init(&err_data);
22755b1270beSYang Wang 	if (ret)
22765b1270beSYang Wang 		return;
22775b1270beSYang Wang 
22785b1270beSYang Wang 	/* Let IP handle its data, maybe we need get the output
2279c030f2e4Sxinhui pan 	 * from the callback to update the error type/count, etc
228050a7d025STao Zhou 	 */
2281c030f2e4Sxinhui pan 	amdgpu_ras_set_fed(obj->adev, true);
228209a3d820STao Zhou 	ret = data->cb(obj->adev, &err_data, entry);
228350a7d025STao Zhou 	/* ue will trigger an interrupt, and in that case
2284c030f2e4Sxinhui pan 	 * we need do a reset to recovery the whole system.
2285c030f2e4Sxinhui pan 	 * But leave IP do that recovery, here we just dispatch
2286c030f2e4Sxinhui pan 	 * the error.
2287c030f2e4Sxinhui pan 	 */
2288c030f2e4Sxinhui pan 	if (ret == AMDGPU_RAS_SUCCESS) {
2289bd2280daSTao Zhou 		/* these counts could be left as 0 if
229051437623STao Zhou 		 * some blocks do not count error number
229151437623STao Zhou 		 */
2292c030f2e4Sxinhui pan 		obj->err_data.ue_count += err_data.ue_count;
229351437623STao Zhou 		obj->err_data.ce_count += err_data.ce_count;
229451437623STao Zhou 		obj->err_data.de_count += err_data.de_count;
229546e2231cSCandice Li 	}
229651437623STao Zhou 
22975b1270beSYang Wang 	amdgpu_ras_error_data_fini(&err_data);
22985b1270beSYang Wang }
2299c030f2e4Sxinhui pan 
amdgpu_ras_interrupt_handler(struct ras_manager * obj)230050a7d025STao Zhou static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
230150a7d025STao Zhou {
230250a7d025STao Zhou 	struct ras_ih_data *data = &obj->ih_data;
230350a7d025STao Zhou 	struct amdgpu_iv_entry entry;
230450a7d025STao Zhou 
230550a7d025STao Zhou 	while (data->rptr != data->wptr) {
230650a7d025STao Zhou 		rmb();
230750a7d025STao Zhou 		memcpy(&entry, &data->ring[data->rptr],
230850a7d025STao Zhou 				data->element_size);
230950a7d025STao Zhou 
231050a7d025STao Zhou 		wmb();
231150a7d025STao Zhou 		data->rptr = (data->aligned_element_size +
231250a7d025STao Zhou 				data->rptr) % data->ring_size;
231350a7d025STao Zhou 
231450a7d025STao Zhou 		if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
231550a7d025STao Zhou 			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
231650a7d025STao Zhou 				amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
231750a7d025STao Zhou 			else
231866f87949STao Zhou 				amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
231966f87949STao Zhou 		} else {
232050a7d025STao Zhou 			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
232150a7d025STao Zhou 				amdgpu_ras_interrupt_umc_handler(obj, &entry);
232250a7d025STao Zhou 			else
232350a7d025STao Zhou 				dev_warn(obj->adev->dev,
232450a7d025STao Zhou 					"No RAS interrupt handler for non-UMC block with poison disabled.\n");
232550a7d025STao Zhou 		}
2326c030f2e4Sxinhui pan 	}
2327c030f2e4Sxinhui pan }
2328f524dd54STao Zhou 
amdgpu_ras_interrupt_process_handler(struct work_struct * work)2329c030f2e4Sxinhui pan static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
2330c030f2e4Sxinhui pan {
2331c030f2e4Sxinhui pan 	struct ras_ih_data *data =
2332c030f2e4Sxinhui pan 		container_of(work, struct ras_ih_data, ih_work);
2333c030f2e4Sxinhui pan 	struct ras_manager *obj =
2334c030f2e4Sxinhui pan 		container_of(data, struct ras_manager, ih_data);
2335c030f2e4Sxinhui pan 
2336c030f2e4Sxinhui pan 	amdgpu_ras_interrupt_handler(obj);
2337c030f2e4Sxinhui pan }
2338c030f2e4Sxinhui pan 
amdgpu_ras_interrupt_dispatch(struct amdgpu_device * adev,struct ras_dispatch_if * info)2339c030f2e4Sxinhui pan int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
2340c030f2e4Sxinhui pan 		struct ras_dispatch_if *info)
2341c030f2e4Sxinhui pan {
2342c030f2e4Sxinhui pan 	struct ras_manager *obj;
23434c11d30cSMa Jun 	struct ras_ih_data *data;
23444c11d30cSMa Jun 
2345c030f2e4Sxinhui pan 	obj = amdgpu_ras_find_obj(adev, &info->head);
23464c11d30cSMa Jun 	if (!obj)
2347c030f2e4Sxinhui pan 		return -EINVAL;
2348c030f2e4Sxinhui pan 
2349c030f2e4Sxinhui pan 	data = &obj->ih_data;
23504c11d30cSMa Jun 
23514c11d30cSMa Jun 	if (data->inuse == 0)
2352c030f2e4Sxinhui pan 		return 0;
2353c030f2e4Sxinhui pan 
2354c030f2e4Sxinhui pan 	/* Might be overflow... */
2355c030f2e4Sxinhui pan 	memcpy(&data->ring[data->wptr], info->entry,
2356c030f2e4Sxinhui pan 			data->element_size);
2357c030f2e4Sxinhui pan 
2358c030f2e4Sxinhui pan 	wmb();
2359c030f2e4Sxinhui pan 	data->wptr = (data->aligned_element_size +
2360c030f2e4Sxinhui pan 			data->wptr) % data->ring_size;
2361c030f2e4Sxinhui pan 
2362c030f2e4Sxinhui pan 	schedule_work(&data->ih_work);
2363c030f2e4Sxinhui pan 
2364c030f2e4Sxinhui pan 	return 0;
2365c030f2e4Sxinhui pan }
2366c030f2e4Sxinhui pan 
amdgpu_ras_interrupt_remove_handler(struct amdgpu_device * adev,struct ras_common_if * head)2367c030f2e4Sxinhui pan int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
2368c030f2e4Sxinhui pan 		struct ras_common_if *head)
23699252d33dSyipechai {
2370c030f2e4Sxinhui pan 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
23719252d33dSyipechai 	struct ras_ih_data *data;
2372c030f2e4Sxinhui pan 
2373c030f2e4Sxinhui pan 	if (!obj)
2374c030f2e4Sxinhui pan 		return -EINVAL;
2375c030f2e4Sxinhui pan 
2376c030f2e4Sxinhui pan 	data = &obj->ih_data;
2377c030f2e4Sxinhui pan 	if (data->inuse == 0)
2378c030f2e4Sxinhui pan 		return 0;
2379c030f2e4Sxinhui pan 
2380c030f2e4Sxinhui pan 	cancel_work_sync(&data->ih_work);
2381c030f2e4Sxinhui pan 
2382c030f2e4Sxinhui pan 	kfree(data->ring);
2383c030f2e4Sxinhui pan 	memset(data, 0, sizeof(*data));
2384c030f2e4Sxinhui pan 	put_obj(obj);
2385c030f2e4Sxinhui pan 
2386c030f2e4Sxinhui pan 	return 0;
2387c030f2e4Sxinhui pan }
2388c030f2e4Sxinhui pan 
amdgpu_ras_interrupt_add_handler(struct amdgpu_device * adev,struct ras_common_if * head)2389c030f2e4Sxinhui pan int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
2390c030f2e4Sxinhui pan 		struct ras_common_if *head)
23919252d33dSyipechai {
2392c030f2e4Sxinhui pan 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
23939252d33dSyipechai 	struct ras_ih_data *data;
2394c030f2e4Sxinhui pan 	struct amdgpu_ras_block_object *ras_obj;
23959252d33dSyipechai 
2396c030f2e4Sxinhui pan 	if (!obj) {
2397c030f2e4Sxinhui pan 		/* in case we registe the IH before enable ras feature */
2398c030f2e4Sxinhui pan 		obj = amdgpu_ras_create_obj(adev, head);
23999252d33dSyipechai 		if (!obj)
2400c030f2e4Sxinhui pan 			return -EINVAL;
2401c030f2e4Sxinhui pan 	} else
2402c030f2e4Sxinhui pan 		get_obj(obj);
2403c030f2e4Sxinhui pan 
2404c030f2e4Sxinhui pan 	ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
24059252d33dSyipechai 
24069252d33dSyipechai 	data = &obj->ih_data;
2407c030f2e4Sxinhui pan 	/* add the callback.etc */
2408c030f2e4Sxinhui pan 	*data = (struct ras_ih_data) {
2409c030f2e4Sxinhui pan 		.inuse = 0,
2410c030f2e4Sxinhui pan 		.cb = ras_obj->ras_cb,
24119252d33dSyipechai 		.element_size = sizeof(struct amdgpu_iv_entry),
2412c030f2e4Sxinhui pan 		.rptr = 0,
2413c030f2e4Sxinhui pan 		.wptr = 0,
2414c030f2e4Sxinhui pan 	};
2415c030f2e4Sxinhui pan 
2416c030f2e4Sxinhui pan 	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
2417c030f2e4Sxinhui pan 
2418c030f2e4Sxinhui pan 	data->aligned_element_size = ALIGN(data->element_size, 8);
2419c030f2e4Sxinhui pan 	/* the ring can store 64 iv entries. */
2420c030f2e4Sxinhui pan 	data->ring_size = 64 * data->aligned_element_size;
2421c030f2e4Sxinhui pan 	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
2422c030f2e4Sxinhui pan 	if (!data->ring) {
2423c030f2e4Sxinhui pan 		put_obj(obj);
2424c030f2e4Sxinhui pan 		return -ENOMEM;
2425c030f2e4Sxinhui pan 	}
2426c030f2e4Sxinhui pan 
2427c030f2e4Sxinhui pan 	/* IH is ready */
2428c030f2e4Sxinhui pan 	data->inuse = 1;
2429c030f2e4Sxinhui pan 
2430c030f2e4Sxinhui pan 	return 0;
2431c030f2e4Sxinhui pan }
2432c030f2e4Sxinhui pan 
amdgpu_ras_interrupt_remove_all(struct amdgpu_device * adev)2433c030f2e4Sxinhui pan static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
2434c030f2e4Sxinhui pan {
2435c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2436c030f2e4Sxinhui pan 	struct ras_manager *obj, *tmp;
2437c030f2e4Sxinhui pan 
2438c030f2e4Sxinhui pan 	list_for_each_entry_safe(obj, tmp, &con->head, node) {
2439c030f2e4Sxinhui pan 		amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
24409252d33dSyipechai 	}
2441c030f2e4Sxinhui pan 
2442c030f2e4Sxinhui pan 	return 0;
2443c030f2e4Sxinhui pan }
2444c030f2e4Sxinhui pan /* ih end */
2445c030f2e4Sxinhui pan 
2446c030f2e4Sxinhui pan /* traversal all IPs except NBIO to query error counter */
amdgpu_ras_log_on_err_counter(struct amdgpu_device * adev,enum ras_event_type type)2447313c8fd3SGuchun Chen static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev, enum ras_event_type type)
244875ac6a25SYang Wang {
2449313c8fd3SGuchun Chen 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2450313c8fd3SGuchun Chen 	struct ras_manager *obj;
2451313c8fd3SGuchun Chen 
2452313c8fd3SGuchun Chen 	if (!adev->ras_enabled || !con)
24538ab0d6f0SLuben Tuikov 		return;
2454313c8fd3SGuchun Chen 
2455313c8fd3SGuchun Chen 	list_for_each_entry(obj, &con->head, node) {
2456313c8fd3SGuchun Chen 		struct ras_query_if info = {
2457313c8fd3SGuchun Chen 			.head = obj->head,
2458313c8fd3SGuchun Chen 		};
2459313c8fd3SGuchun Chen 
2460313c8fd3SGuchun Chen 		/*
2461313c8fd3SGuchun Chen 		 * PCIE_BIF IP has one different isr by ras controller
2462313c8fd3SGuchun Chen 		 * interrupt, the specific ras counter query will be
2463313c8fd3SGuchun Chen 		 * done in that isr. So skip such block from common
2464313c8fd3SGuchun Chen 		 * sync flood interrupt isr calling.
2465313c8fd3SGuchun Chen 		 */
2466313c8fd3SGuchun Chen 		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
2467313c8fd3SGuchun Chen 			continue;
2468313c8fd3SGuchun Chen 
2469313c8fd3SGuchun Chen 		/*
2470cf63b702SStanley.Yang 		 * this is a workaround for aldebaran, skip send msg to
2471cf63b702SStanley.Yang 		 * smu to get ecc_info table due to smu handle get ecc
2472cf63b702SStanley.Yang 		 * info table failed temporarily.
2473cf63b702SStanley.Yang 		 * should be removed until smu fix handle ecc_info table.
2474cf63b702SStanley.Yang 		 */
2475cf63b702SStanley.Yang 		if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
2476cf63b702SStanley.Yang 		    (amdgpu_ip_version(adev, MP1_HWIP, 0) ==
24774e8303cfSLijo Lazar 		     IP_VERSION(13, 0, 2)))
24784e8303cfSLijo Lazar 			continue;
2479cf63b702SStanley.Yang 
2480cf63b702SStanley.Yang 		amdgpu_ras_query_error_status_with_event(adev, &info, type);
248175ac6a25SYang Wang 
24822a460963SCandice Li 		if (amdgpu_ip_version(adev, MP0_HWIP, 0) !=
24834e8303cfSLijo Lazar 			    IP_VERSION(11, 0, 2) &&
24844e8303cfSLijo Lazar 		    amdgpu_ip_version(adev, MP0_HWIP, 0) !=
24854e8303cfSLijo Lazar 			    IP_VERSION(11, 0, 4) &&
24864e8303cfSLijo Lazar 		    amdgpu_ip_version(adev, MP0_HWIP, 0) !=
24874e8303cfSLijo Lazar 			    IP_VERSION(13, 0, 0)) {
24884e8303cfSLijo Lazar 			if (amdgpu_ras_reset_error_status(adev, info.head.block))
24892a460963SCandice Li 				dev_warn(adev->dev, "Failed to reset error counter and error status");
24902a460963SCandice Li 		}
24912a460963SCandice Li 	}
2492313c8fd3SGuchun Chen }
2493313c8fd3SGuchun Chen 
2494313c8fd3SGuchun Chen /* Parse RdRspStatus and WrRspStatus */
amdgpu_ras_error_status_query(struct amdgpu_device * adev,struct ras_query_if * info)24953f975d0fSStanley.Yang static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
2496cd92df93SLee Jones 					  struct ras_query_if *info)
24973f975d0fSStanley.Yang {
24983f975d0fSStanley.Yang 	struct amdgpu_ras_block_object *block_obj;
24998eb53bb2Syipechai 	/*
25003f975d0fSStanley.Yang 	 * Only two block need to query read/write
25013f975d0fSStanley.Yang 	 * RspStatus at current state
25023f975d0fSStanley.Yang 	 */
25033f975d0fSStanley.Yang 	if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
25045e67bba3Syipechai 		(info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
25055e67bba3Syipechai 		return;
25065e67bba3Syipechai 
25075e67bba3Syipechai 	block_obj = amdgpu_ras_get_ras_block(adev,
2508b6efdb02Syipechai 					info->head.block,
2509b6efdb02Syipechai 					info->head.sub_block_index);
2510b6efdb02Syipechai 
2511b6efdb02Syipechai 	if (!block_obj || !block_obj->hw_ops) {
25128b0fb0e9Syipechai 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
2513afa37315SLuben Tuikov 			     get_ras_block_str(&info->head));
2514b6efdb02Syipechai 		return;
25158b0fb0e9Syipechai 	}
25163f975d0fSStanley.Yang 
25178b0fb0e9Syipechai 	if (block_obj->hw_ops->query_ras_error_status)
25188b0fb0e9Syipechai 		block_obj->hw_ops->query_ras_error_status(adev);
25198b0fb0e9Syipechai 
25205e67bba3Syipechai }
25213f975d0fSStanley.Yang 
amdgpu_ras_query_err_status(struct amdgpu_device * adev)25223f975d0fSStanley.Yang static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
25233f975d0fSStanley.Yang {
25243f975d0fSStanley.Yang 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
25253f975d0fSStanley.Yang 	struct ras_manager *obj;
25263f975d0fSStanley.Yang 
25273f975d0fSStanley.Yang 	if (!adev->ras_enabled || !con)
25288ab0d6f0SLuben Tuikov 		return;
25293f975d0fSStanley.Yang 
25303f975d0fSStanley.Yang 	list_for_each_entry(obj, &con->head, node) {
25313f975d0fSStanley.Yang 		struct ras_query_if info = {
25323f975d0fSStanley.Yang 			.head = obj->head,
25333f975d0fSStanley.Yang 		};
25343f975d0fSStanley.Yang 
25353f975d0fSStanley.Yang 		amdgpu_ras_error_status_query(adev, &info);
25363f975d0fSStanley.Yang 	}
25373f975d0fSStanley.Yang }
25383f975d0fSStanley.Yang 
25393f975d0fSStanley.Yang /* recovery begin */
2540c030f2e4Sxinhui pan 
2541466b1793Sxinhui pan /* return 0 on success.
2542466b1793Sxinhui pan  * caller need free bps.
2543466b1793Sxinhui pan  */
amdgpu_ras_badpages_read(struct amdgpu_device * adev,struct ras_badpage ** bps,unsigned int * count)2544466b1793Sxinhui pan static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
2545466b1793Sxinhui pan 		struct ras_badpage **bps, unsigned int *count)
2546466b1793Sxinhui pan {
2547466b1793Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2548466b1793Sxinhui pan 	struct ras_err_handler_data *data;
2549466b1793Sxinhui pan 	int i = 0;
2550466b1793Sxinhui pan 	int ret = 0, status;
2551732f2a30SDennis Li 
2552466b1793Sxinhui pan 	if (!con || !con->eh_data || !bps || !count)
2553466b1793Sxinhui pan 		return -EINVAL;
2554466b1793Sxinhui pan 
2555466b1793Sxinhui pan 	mutex_lock(&con->recovery_lock);
2556466b1793Sxinhui pan 	data = con->eh_data;
2557466b1793Sxinhui pan 	if (!data || data->count == 0) {
2558466b1793Sxinhui pan 		*bps = NULL;
2559466b1793Sxinhui pan 		ret = -EINVAL;
256046cf2fecSGuchun Chen 		goto out;
2561466b1793Sxinhui pan 	}
2562466b1793Sxinhui pan 
2563466b1793Sxinhui pan 	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
2564466b1793Sxinhui pan 	if (!*bps) {
2565466b1793Sxinhui pan 		ret = -ENOMEM;
2566466b1793Sxinhui pan 		goto out;
2567466b1793Sxinhui pan 	}
2568466b1793Sxinhui pan 
2569466b1793Sxinhui pan 	for (; i < data->count; i++) {
2570466b1793Sxinhui pan 		(*bps)[i] = (struct ras_badpage){
2571466b1793Sxinhui pan 			.bp = data->bps[i].retired_page,
25729dc23a63STao Zhou 			.size = AMDGPU_GPU_PAGE_SIZE,
2573466b1793Sxinhui pan 			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
257452dd95f2SGuchun Chen 		};
2575466b1793Sxinhui pan 		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
2576ec6aae97SNirmoy Das 				data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
2577bcc09348SYiPeng Chai 		if (status == -EBUSY)
2578732f2a30SDennis Li 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
257952dd95f2SGuchun Chen 		else if (status == -ENOENT)
2580732f2a30SDennis Li 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
258152dd95f2SGuchun Chen 	}
2582466b1793Sxinhui pan 
2583466b1793Sxinhui pan 	*count = data->count;
2584466b1793Sxinhui pan out:
2585466b1793Sxinhui pan 	mutex_unlock(&con->recovery_lock);
2586466b1793Sxinhui pan 	return ret;
2587466b1793Sxinhui pan }
2588466b1793Sxinhui pan 
amdgpu_ras_set_fed_all(struct amdgpu_device * adev,struct amdgpu_hive_info * hive,bool status)2589466b1793Sxinhui pan static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
2590b41f742dSLijo Lazar 				   struct amdgpu_hive_info *hive, bool status)
2591b41f742dSLijo Lazar {
2592b41f742dSLijo Lazar 	struct amdgpu_device *tmp_adev;
2593b41f742dSLijo Lazar 
2594b41f742dSLijo Lazar 	if (hive) {
2595b41f742dSLijo Lazar 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
2596b41f742dSLijo Lazar 			amdgpu_ras_set_fed(tmp_adev, status);
2597b41f742dSLijo Lazar 	} else {
2598b41f742dSLijo Lazar 		amdgpu_ras_set_fed(adev, status);
2599b41f742dSLijo Lazar 	}
2600b41f742dSLijo Lazar }
2601b41f742dSLijo Lazar 
amdgpu_ras_in_recovery(struct amdgpu_device * adev)2602b41f742dSLijo Lazar bool amdgpu_ras_in_recovery(struct amdgpu_device *adev)
26037e437167STao Zhou {
26047e437167STao Zhou 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
26057e437167STao Zhou 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
26067e437167STao Zhou 	int hive_ras_recovery = 0;
26077e437167STao Zhou 
26087e437167STao Zhou 	if (hive) {
26097e437167STao Zhou 		hive_ras_recovery = atomic_read(&hive->ras_recovery);
26107e437167STao Zhou 		amdgpu_put_xgmi_hive(hive);
26117e437167STao Zhou 	}
26127e437167STao Zhou 
26137e437167STao Zhou 	if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
26147e437167STao Zhou 		return true;
26157e437167STao Zhou 
26167e437167STao Zhou 	return false;
26177e437167STao Zhou }
26187e437167STao Zhou 
amdgpu_ras_get_fatal_error_event(struct amdgpu_device * adev)26197e437167STao Zhou static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device *adev)
262075ac6a25SYang Wang {
262175ac6a25SYang Wang 	if (amdgpu_ras_intr_triggered())
262275ac6a25SYang Wang 		return RAS_EVENT_TYPE_FATAL;
262375ac6a25SYang Wang 	else
262475ac6a25SYang Wang 		return RAS_EVENT_TYPE_POISON_CONSUMPTION;
262512b435a4SYang Wang }
262675ac6a25SYang Wang 
amdgpu_ras_do_recovery(struct work_struct * work)262775ac6a25SYang Wang static void amdgpu_ras_do_recovery(struct work_struct *work)
2628c030f2e4Sxinhui pan {
2629c030f2e4Sxinhui pan 	struct amdgpu_ras *ras =
2630c030f2e4Sxinhui pan 		container_of(work, struct amdgpu_ras, recovery_work);
2631c030f2e4Sxinhui pan 	struct amdgpu_device *remote_adev = NULL;
2632b3dbd6d3SJohn Clements 	struct amdgpu_device *adev = ras->adev;
2633b3dbd6d3SJohn Clements 	struct list_head device_list, *device_list_handle =  NULL;
2634b3dbd6d3SJohn Clements 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2635d95e8e97SDennis Li 	enum ras_event_type type;
263675ac6a25SYang Wang 
2637d95e8e97SDennis Li 	if (hive) {
2638b41f742dSLijo Lazar 		atomic_set(&hive->ras_recovery, 1);
263953dd920cSAsad Kamal 
2640b41f742dSLijo Lazar 		/* If any device which is part of the hive received RAS fatal
2641b41f742dSLijo Lazar 		 * error interrupt, set fatal error status on all. This
2642b41f742dSLijo Lazar 		 * condition will need a recovery, and flag will be cleared
2643b41f742dSLijo Lazar 		 * as part of recovery.
2644b41f742dSLijo Lazar 		 */
2645b41f742dSLijo Lazar 		list_for_each_entry(remote_adev, &hive->device_list,
2646b41f742dSLijo Lazar 				    gmc.xgmi.head)
2647b41f742dSLijo Lazar 			if (amdgpu_ras_get_fed_status(remote_adev)) {
2648b41f742dSLijo Lazar 				amdgpu_ras_set_fed_all(adev, hive, true);
2649b41f742dSLijo Lazar 				break;
2650b41f742dSLijo Lazar 			}
2651b41f742dSLijo Lazar 	}
2652b41f742dSLijo Lazar 	if (!ras->disable_ras_err_cnt_harvest) {
265353dd920cSAsad Kamal 
265453dd920cSAsad Kamal 		/* Build list of devices to query RAS related errors */
2655b3dbd6d3SJohn Clements 		if  (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
2656f75e94d8SGuchun Chen 			device_list_handle = &hive->device_list;
2657b3dbd6d3SJohn Clements 		} else {
2658f75e94d8SGuchun Chen 			INIT_LIST_HEAD(&device_list);
265912c17b9dSGuchun Chen 			list_add_tail(&adev->gmc.xgmi.head, &device_list);
2660b3dbd6d3SJohn Clements 			device_list_handle = &device_list;
2661b3dbd6d3SJohn Clements 		}
2662b3dbd6d3SJohn Clements 
2663b3dbd6d3SJohn Clements 		type = amdgpu_ras_get_fatal_error_event(adev);
266475ac6a25SYang Wang 		list_for_each_entry(remote_adev,
2665f75e94d8SGuchun Chen 				device_list_handle, gmc.xgmi.head) {
26663f975d0fSStanley.Yang 			amdgpu_ras_query_err_status(remote_adev);
26673f975d0fSStanley.Yang 			amdgpu_ras_log_on_err_counter(remote_adev, type);
266875ac6a25SYang Wang 		}
26693f975d0fSStanley.Yang 
2670d95e8e97SDennis Li 	}
2671b3dbd6d3SJohn Clements 
2672313c8fd3SGuchun Chen 	if (amdgpu_device_should_recover_gpu(ras->adev)) {
2673f1549c09SLikun Gao 		struct amdgpu_reset_context reset_context;
2674f1549c09SLikun Gao 		memset(&reset_context, 0, sizeof(reset_context));
2675f1549c09SLikun Gao 
2676f1549c09SLikun Gao 		reset_context.method = AMD_RESET_METHOD_NONE;
2677f1549c09SLikun Gao 		reset_context.reset_req_dev = adev;
2678f1549c09SLikun Gao 		reset_context.src = AMDGPU_RESET_SRC_RAS;
2679bac640ddSEric Huang 		set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
268081db4eabSLijo Lazar 
26811a11a65dSYiPeng Chai 		/* Perform full reset in fatal error mode */
26821a11a65dSYiPeng Chai 		if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
26831a11a65dSYiPeng Chai 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
26841a11a65dSYiPeng Chai 		else {
26856c47a79bSYiPeng Chai 			clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2686f1549c09SLikun Gao 
2687f1549c09SLikun Gao 			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
26886c47a79bSYiPeng Chai 				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
26896c47a79bSYiPeng Chai 				reset_context.method = AMD_RESET_METHOD_MODE2;
26906c47a79bSYiPeng Chai 			}
26916c47a79bSYiPeng Chai 
26922c7cd280SYiPeng Chai 			/* Fatal error occurs in poison mode, mode1 reset is used to
26932c7cd280SYiPeng Chai 			 * recover gpu.
26942c7cd280SYiPeng Chai 			 */
26952c7cd280SYiPeng Chai 			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
26962c7cd280SYiPeng Chai 				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
26972c7cd280SYiPeng Chai 				set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
26982c7cd280SYiPeng Chai 
26991b98a5f8SYiPeng Chai 				psp_fatal_error_recovery_quirk(&adev->psp);
27001b98a5f8SYiPeng Chai 			}
27012c7cd280SYiPeng Chai 		}
27026c47a79bSYiPeng Chai 
27036c47a79bSYiPeng Chai 		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
2704f1549c09SLikun Gao 	}
2705f1549c09SLikun Gao 	atomic_set(&ras->in_recovery, 0);
2706c030f2e4Sxinhui pan 	if (hive) {
270753dd920cSAsad Kamal 		atomic_set(&hive->ras_recovery, 0);
270853dd920cSAsad Kamal 		amdgpu_put_xgmi_hive(hive);
270953dd920cSAsad Kamal 	}
271053dd920cSAsad Kamal }
2711c030f2e4Sxinhui pan 
2712c030f2e4Sxinhui pan /* alloc/realloc bps array */
amdgpu_ras_realloc_eh_data_space(struct amdgpu_device * adev,struct ras_err_handler_data * data,int pages)2713c030f2e4Sxinhui pan static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
2714c030f2e4Sxinhui pan 		struct ras_err_handler_data *data, int pages)
2715c030f2e4Sxinhui pan {
2716c030f2e4Sxinhui pan 	unsigned int old_space = data->count + data->space_left;
2717c030f2e4Sxinhui pan 	unsigned int new_space = old_space + pages;
2718c030f2e4Sxinhui pan 	unsigned int align_space = ALIGN(new_space, 512);
27199dc23a63STao Zhou 	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
27209dc23a63STao Zhou 
2721c030f2e4Sxinhui pan 	if (!bps) {
2722676deb38SDennis Li 		return -ENOMEM;
2723c030f2e4Sxinhui pan 	}
27249dc23a63STao Zhou 
2725c030f2e4Sxinhui pan 	if (data->bps) {
2726c030f2e4Sxinhui pan 		memcpy(bps, data->bps,
27279dc23a63STao Zhou 				data->count * sizeof(*data->bps));
2728c030f2e4Sxinhui pan 		kfree(data->bps);
2729c030f2e4Sxinhui pan 	}
2730c030f2e4Sxinhui pan 
2731c030f2e4Sxinhui pan 	data->bps = bps;
27329dc23a63STao Zhou 	data->space_left += align_space - old_space;
2733c030f2e4Sxinhui pan 	return 0;
2734c030f2e4Sxinhui pan }
2735c030f2e4Sxinhui pan 
amdgpu_ras_mca2pa_by_idx(struct amdgpu_device * adev,struct eeprom_table_record * bps,struct ras_err_data * err_data)2736c030f2e4Sxinhui pan static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev,
2737a8d133e6STao Zhou 			struct eeprom_table_record *bps,
27380eecff79STao Zhou 			struct ras_err_data *err_data)
27390eecff79STao Zhou {
27400eecff79STao Zhou 	struct ta_ras_query_address_input addr_in;
27410eecff79STao Zhou 	uint32_t socket = 0;
27420eecff79STao Zhou 	int ret = 0;
27430eecff79STao Zhou 
27440eecff79STao Zhou 	if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
27450eecff79STao Zhou 		socket = adev->smuio.funcs->get_socket_id(adev);
27460eecff79STao Zhou 
27470eecff79STao Zhou 	/* reinit err_data */
27480eecff79STao Zhou 	err_data->err_addr_cnt = 0;
27490eecff79STao Zhou 	err_data->err_addr_len = adev->umc.retire_unit;
27500eecff79STao Zhou 
27510eecff79STao Zhou 	memset(&addr_in, 0, sizeof(addr_in));
27520eecff79STao Zhou 	addr_in.ma.err_addr = bps->address;
27530eecff79STao Zhou 	addr_in.ma.socket_id = socket;
27540eecff79STao Zhou 	addr_in.ma.ch_inst = bps->mem_channel;
27550eecff79STao Zhou 	/* tell RAS TA the node instance is not used */
27560eecff79STao Zhou 	addr_in.ma.node_inst = TA_RAS_INV_NODE;
27570eecff79STao Zhou 
27580eecff79STao Zhou 	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
27590eecff79STao Zhou 		ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
27600eecff79STao Zhou 				&addr_in, NULL, false);
27610eecff79STao Zhou 
27620eecff79STao Zhou 	return ret;
27630eecff79STao Zhou }
27640eecff79STao Zhou 
amdgpu_ras_mca2pa(struct amdgpu_device * adev,struct eeprom_table_record * bps,struct ras_err_data * err_data)27650eecff79STao Zhou static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
2766a8d133e6STao Zhou 			struct eeprom_table_record *bps,
2767a8d133e6STao Zhou 			struct ras_err_data *err_data)
2768a8d133e6STao Zhou {
2769a8d133e6STao Zhou 	struct ta_ras_query_address_input addr_in;
2770a8d133e6STao Zhou 	uint32_t die_id, socket = 0;
2771a8d133e6STao Zhou 
2772a8d133e6STao Zhou 	if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
2773a8d133e6STao Zhou 		socket = adev->smuio.funcs->get_socket_id(adev);
2774a8d133e6STao Zhou 
2775a8d133e6STao Zhou 	/* although die id is gotten from PA in nps1 mode, the id is
2776a8d133e6STao Zhou 	 * fitable for any nps mode
2777a8d133e6STao Zhou 	 */
2778a8d133e6STao Zhou 	if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa)
2779a8d133e6STao Zhou 		die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address,
2780a8d133e6STao Zhou 					bps->retired_page << AMDGPU_GPU_PAGE_SHIFT);
2781a8d133e6STao Zhou 	else
2782a8d133e6STao Zhou 		return -EINVAL;
2783a8d133e6STao Zhou 
2784a8d133e6STao Zhou 	/* reinit err_data */
2785a8d133e6STao Zhou 	err_data->err_addr_cnt = 0;
2786a8d133e6STao Zhou 	err_data->err_addr_len = adev->umc.retire_unit;
2787a8d133e6STao Zhou 
2788a8d133e6STao Zhou 	memset(&addr_in, 0, sizeof(addr_in));
2789a8d133e6STao Zhou 	addr_in.ma.err_addr = bps->address;
2790a8d133e6STao Zhou 	addr_in.ma.ch_inst = bps->mem_channel;
2791a8d133e6STao Zhou 	addr_in.ma.umc_inst = bps->mcumc_id;
2792a8d133e6STao Zhou 	addr_in.ma.node_inst = die_id;
2793a8d133e6STao Zhou 	addr_in.ma.socket_id = socket;
2794a8d133e6STao Zhou 
2795a8d133e6STao Zhou 	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
2796a8d133e6STao Zhou 		return adev->umc.ras->convert_ras_err_addr(adev, err_data,
2797a8d133e6STao Zhou 					&addr_in, NULL, false);
2798a8d133e6STao Zhou 	else
2799a8d133e6STao Zhou 		return  -EINVAL;
2800a8d133e6STao Zhou }
2801a8d133e6STao Zhou 
__amdgpu_ras_restore_bad_pages(struct amdgpu_device * adev,struct eeprom_table_record * bps,int count)2802a8d133e6STao Zhou static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
28030153d276Sganglxie 					struct eeprom_table_record *bps, int count)
28040153d276Sganglxie {
28050153d276Sganglxie 	int j;
28060153d276Sganglxie 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
28070153d276Sganglxie 	struct ras_err_handler_data *data = con->eh_data;
28080153d276Sganglxie 
28090153d276Sganglxie 	for (j = 0; j < count; j++) {
28100153d276Sganglxie 		if (amdgpu_ras_check_bad_page_unlock(con,
28110153d276Sganglxie 			bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
28120153d276Sganglxie 			continue;
28130153d276Sganglxie 
28140153d276Sganglxie 		if (!data->space_left &&
28150153d276Sganglxie 		    amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
28160153d276Sganglxie 			return -ENOMEM;
28170153d276Sganglxie 		}
28180153d276Sganglxie 
28190153d276Sganglxie 		amdgpu_ras_reserve_page(adev, bps[j].retired_page);
28200153d276Sganglxie 
28210153d276Sganglxie 		memcpy(&data->bps[data->count], &(bps[j]),
28220153d276Sganglxie 				sizeof(struct eeprom_table_record));
28230153d276Sganglxie 		data->count++;
28240153d276Sganglxie 		data->space_left--;
28250153d276Sganglxie 	}
28260153d276Sganglxie 
28270153d276Sganglxie 	return 0;
28280153d276Sganglxie }
28290153d276Sganglxie 
__amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device * adev,struct eeprom_table_record * bps,struct ras_err_data * err_data,enum amdgpu_memory_partition nps)28300153d276Sganglxie static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
28310153d276Sganglxie 				struct eeprom_table_record *bps, struct ras_err_data *err_data,
28320153d276Sganglxie 				enum amdgpu_memory_partition nps)
28330153d276Sganglxie {
28340153d276Sganglxie 	int i = 0;
28350153d276Sganglxie 	enum amdgpu_memory_partition save_nps;
28360153d276Sganglxie 
28370153d276Sganglxie 	save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
28380153d276Sganglxie 
28390153d276Sganglxie 	/*old asics just have pa in eeprom*/
2840a4b6e990Sganglxie 	if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
2841a4b6e990Sganglxie 		memcpy(err_data->err_addr, bps,
2842a4b6e990Sganglxie 			sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
2843a4b6e990Sganglxie 		goto out;
2844a4b6e990Sganglxie 	}
2845a4b6e990Sganglxie 
2846a4b6e990Sganglxie 	for (i = 0; i < adev->umc.retire_unit; i++)
28470153d276Sganglxie 		bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
28480153d276Sganglxie 
28490153d276Sganglxie 	if (save_nps) {
28500153d276Sganglxie 		if (save_nps == nps) {
28510153d276Sganglxie 			if (amdgpu_umc_pages_in_a_row(adev, err_data,
28520153d276Sganglxie 					bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
28530153d276Sganglxie 				return -EINVAL;
28540153d276Sganglxie 		} else {
28550153d276Sganglxie 			if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
28560153d276Sganglxie 				return -EINVAL;
28570153d276Sganglxie 		}
28580153d276Sganglxie 	} else {
28590153d276Sganglxie 		if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
28600153d276Sganglxie 			if (nps == AMDGPU_NPS1_PARTITION_MODE)
28610153d276Sganglxie 				memcpy(err_data->err_addr, bps,
28620153d276Sganglxie 					sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
28630153d276Sganglxie 			else
28640153d276Sganglxie 				return -EOPNOTSUPP;
28650153d276Sganglxie 		}
28660153d276Sganglxie 	}
28670153d276Sganglxie 
28680153d276Sganglxie out:
2869a4b6e990Sganglxie 	return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit);
28700153d276Sganglxie }
28710153d276Sganglxie 
__amdgpu_ras_convert_rec_from_rom(struct amdgpu_device * adev,struct eeprom_table_record * bps,struct ras_err_data * err_data,enum amdgpu_memory_partition nps)28720153d276Sganglxie static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
28730153d276Sganglxie 				struct eeprom_table_record *bps, struct ras_err_data *err_data,
28740153d276Sganglxie 				enum amdgpu_memory_partition nps)
28750153d276Sganglxie {
28760153d276Sganglxie 	enum amdgpu_memory_partition save_nps;
28770153d276Sganglxie 
28780153d276Sganglxie 	save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
28790153d276Sganglxie 	bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
28800153d276Sganglxie 
28810153d276Sganglxie 	if (save_nps == nps) {
28820153d276Sganglxie 		if (amdgpu_umc_pages_in_a_row(adev, err_data,
28830153d276Sganglxie 				bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
28840153d276Sganglxie 			return -EINVAL;
28850153d276Sganglxie 	} else {
28860153d276Sganglxie 		if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
28870153d276Sganglxie 			return -EINVAL;
28880153d276Sganglxie 	}
28890153d276Sganglxie 	return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
28900153d276Sganglxie 									adev->umc.retire_unit);
28910153d276Sganglxie }
28920153d276Sganglxie 
28930153d276Sganglxie /* it deal with vram only. */
amdgpu_ras_add_bad_pages(struct amdgpu_device * adev,struct eeprom_table_record * bps,int pages,bool from_rom)2894c030f2e4Sxinhui pan int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
2895c030f2e4Sxinhui pan 		struct eeprom_table_record *bps, int pages, bool from_rom)
2896a8d133e6STao Zhou {
2897c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2898c030f2e4Sxinhui pan 	struct ras_err_data err_data;
28990eecff79STao Zhou 	struct amdgpu_ras_eeprom_control *control =
2900d08fb663STao Zhou 			&adev->psp.ras_context.ras->eeprom_control;
2901d08fb663STao Zhou 	enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
290207dd49e1STao Zhou 	int ret = 0;
2903c030f2e4Sxinhui pan 	uint32_t i;
29040153d276Sganglxie 
2905c030f2e4Sxinhui pan 	if (!con || !con->eh_data || !bps || pages <= 0)
290673aa8e1aSxinhui pan 		return 0;
2907c030f2e4Sxinhui pan 
2908c030f2e4Sxinhui pan 	if (from_rom) {
2909a8d133e6STao Zhou 		err_data.err_addr =
29100eecff79STao Zhou 			kcalloc(adev->umc.retire_unit,
29110eecff79STao Zhou 				sizeof(struct eeprom_table_record), GFP_KERNEL);
29120eecff79STao Zhou 		if (!err_data.err_addr) {
29130eecff79STao Zhou 			dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
29140eecff79STao Zhou 			return -ENOMEM;
29150153d276Sganglxie 		}
29160eecff79STao Zhou 
29170eecff79STao Zhou 		if (adev->gmc.gmc_funcs->query_mem_partition_mode)
291807dd49e1STao Zhou 			nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
291907dd49e1STao Zhou 	}
29200eecff79STao Zhou 
29210eecff79STao Zhou 	mutex_lock(&con->recovery_lock);
2922a8d133e6STao Zhou 
2923a8d133e6STao Zhou 	if (from_rom) {
29240153d276Sganglxie 		for (i = 0; i < pages; i++) {
2925676deb38SDennis Li 			if (control->ras_num_recs - i >= adev->umc.retire_unit) {
29260153d276Sganglxie 				if ((bps[i].address == bps[i + 1].address) &&
29270153d276Sganglxie 				    (bps[i].mem_channel == bps[i + 1].mem_channel)) {
29280153d276Sganglxie 					//deal with retire_unit records a time
29290153d276Sganglxie 					ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
29300153d276Sganglxie 									&bps[i], &err_data, nps);
29310153d276Sganglxie 					if (ret)
29320153d276Sganglxie 						goto free;
29330eecff79STao Zhou 					i += (adev->umc.retire_unit - 1);
2934a8d133e6STao Zhou 				} else {
29350eecff79STao Zhou 					break;
29360153d276Sganglxie 				}
29370153d276Sganglxie 			} else {
29380153d276Sganglxie 				break;
29390153d276Sganglxie 			}
29400eecff79STao Zhou 		}
2941a8d133e6STao Zhou 		for (; i < pages; i++) {
29420153d276Sganglxie 			ret = __amdgpu_ras_convert_rec_from_rom(adev,
29430153d276Sganglxie 				&bps[i], &err_data, nps);
29440153d276Sganglxie 			if (ret)
29450153d276Sganglxie 				goto free;
29460eecff79STao Zhou 		}
2947c030f2e4Sxinhui pan 	} else {
29480153d276Sganglxie 		ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
29490153d276Sganglxie 	}
29500eecff79STao Zhou 
29510eecff79STao Zhou free:
29520eecff79STao Zhou 	if (from_rom)
2953a8d133e6STao Zhou 		kfree(err_data.err_addr);
29540eecff79STao Zhou 	mutex_unlock(&con->recovery_lock);
2955c030f2e4Sxinhui pan 
2956c030f2e4Sxinhui pan 	return ret;
2957c030f2e4Sxinhui pan }
2958c030f2e4Sxinhui pan 
2959c030f2e4Sxinhui pan /*
296078ad00c9STao Zhou  * write error record array to eeprom, the function should be
296178ad00c9STao Zhou  * protected by recovery_lock
296278ad00c9STao Zhou  * new_cnt: new added UE count, excluding reserved bad pages, can be NULL
29634d33e0f1STao Zhou  */
amdgpu_ras_save_bad_pages(struct amdgpu_device * adev,unsigned long * new_cnt)296478ad00c9STao Zhou int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
29654d33e0f1STao Zhou 		unsigned long *new_cnt)
29664d33e0f1STao Zhou {
296778ad00c9STao Zhou 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
296878ad00c9STao Zhou 	struct ras_err_handler_data *data;
296978ad00c9STao Zhou 	struct amdgpu_ras_eeprom_control *control;
29708a3e801fSGuchun Chen 	int save_count, unit_num, bad_page_num, i;
2971c3d4acf0STao Zhou 
297278ad00c9STao Zhou 	if (!con || !con->eh_data) {
29734d33e0f1STao Zhou 		if (new_cnt)
29744d33e0f1STao Zhou 			*new_cnt = 0;
29754d33e0f1STao Zhou 
29764d33e0f1STao Zhou 		return 0;
297778ad00c9STao Zhou 	}
29784d33e0f1STao Zhou 
297978ad00c9STao Zhou 	mutex_lock(&con->recovery_lock);
2980d9a69fe5SCandice Li 	control = &con->eeprom_control;
29818a3e801fSGuchun Chen 	data = con->eh_data;
298278ad00c9STao Zhou 	bad_page_num = control->ras_num_bad_pages;
2983ae756cd8STao Zhou 	save_count = data->count - bad_page_num;
2984c3d4acf0STao Zhou 	mutex_unlock(&con->recovery_lock);
2985d9a69fe5SCandice Li 
29864d33e0f1STao Zhou 	unit_num = save_count / adev->umc.retire_unit;
2987c3d4acf0STao Zhou 	if (new_cnt)
29884d33e0f1STao Zhou 		*new_cnt = unit_num;
2989c3d4acf0STao Zhou 
29904d33e0f1STao Zhou 	/* only new entries are saved */
299178ad00c9STao Zhou 	if (save_count > 0) {
2992b1628425SGuchun Chen 		/*old asics only save pa to eeprom like before*/
2993a4b6e990Sganglxie 		if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
2994a4b6e990Sganglxie 			if (amdgpu_ras_eeprom_append(control,
2995a4b6e990Sganglxie 					&data->bps[bad_page_num], save_count)) {
2996a4b6e990Sganglxie 				dev_err(adev->dev, "Failed to save EEPROM table data!");
2997a4b6e990Sganglxie 				return -EIO;
2998a4b6e990Sganglxie 			}
2999a4b6e990Sganglxie 		} else {
3000a4b6e990Sganglxie 			for (i = 0; i < unit_num; i++) {
3001c3d4acf0STao Zhou 				if (amdgpu_ras_eeprom_append(control,
3002c3d4acf0STao Zhou 						&data->bps[bad_page_num +
3003a4b6e990Sganglxie 						i * adev->umc.retire_unit], 1)) {
3004a4b6e990Sganglxie 					dev_err(adev->dev, "Failed to save EEPROM table data!");
3005c3d4acf0STao Zhou 					return -EIO;
3006c3d4acf0STao Zhou 				}
3007c3d4acf0STao Zhou 			}
3008c3d4acf0STao Zhou 		}
3009a4b6e990Sganglxie 
3010a4b6e990Sganglxie 		dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
3011b1628425SGuchun Chen 	}
3012b1628425SGuchun Chen 
3013b1628425SGuchun Chen 	return 0;
301478ad00c9STao Zhou }
301578ad00c9STao Zhou 
301678ad00c9STao Zhou /*
301778ad00c9STao Zhou  * read error record array in eeprom and reserve enough space for
301878ad00c9STao Zhou  * storing new bad pages
301978ad00c9STao Zhou  */
amdgpu_ras_load_bad_pages(struct amdgpu_device * adev)302078ad00c9STao Zhou static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
302178ad00c9STao Zhou {
302278ad00c9STao Zhou 	struct amdgpu_ras_eeprom_control *control =
302378ad00c9STao Zhou 		&adev->psp.ras_context.ras->eeprom_control;
30246457205cSCandice Li 	struct eeprom_table_record *bps;
3025e4e6a589SLuben Tuikov 	int ret, i = 0;
3026a8f921a1Sganglxie 
302778ad00c9STao Zhou 	/* no bad page record, skip eeprom access */
302878ad00c9STao Zhou 	if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
30290686627bSLuben Tuikov 		return 0;
3030e4e6a589SLuben Tuikov 
303178ad00c9STao Zhou 	bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
30320686627bSLuben Tuikov 	if (!bps)
303378ad00c9STao Zhou 		return -ENOMEM;
303478ad00c9STao Zhou 
303578ad00c9STao Zhou 	ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
30360686627bSLuben Tuikov 	if (ret) {
3037772df3dfSTao Zhou 		dev_err(adev->dev, "Failed to load EEPROM table records!");
30386952e99cSGuchun Chen 	} else {
3039772df3dfSTao Zhou 		if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
3040a8f921a1Sganglxie 			for (i = 0; i < control->ras_num_recs; i++) {
3041a8f921a1Sganglxie 				if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
3042a8f921a1Sganglxie 					if ((bps[i].address == bps[i + 1].address) &&
3043a8f921a1Sganglxie 						(bps[i].mem_channel == bps[i + 1].mem_channel)) {
3044a8f921a1Sganglxie 						control->ras_num_pa_recs += adev->umc.retire_unit;
3045a8f921a1Sganglxie 						i += (adev->umc.retire_unit - 1);
3046a8f921a1Sganglxie 					} else {
3047a8f921a1Sganglxie 						control->ras_num_mca_recs +=
3048a8f921a1Sganglxie 									(control->ras_num_recs - i);
3049a8f921a1Sganglxie 						break;
3050a8f921a1Sganglxie 					}
3051a8f921a1Sganglxie 				} else {
3052a8f921a1Sganglxie 					control->ras_num_mca_recs += (control->ras_num_recs - i);
3053a8f921a1Sganglxie 					break;
3054a8f921a1Sganglxie 				}
3055a8f921a1Sganglxie 			}
3056a8f921a1Sganglxie 		}
3057772df3dfSTao Zhou 
3058772df3dfSTao Zhou 		ret = amdgpu_ras_eeprom_check(control);
30591f06e7f3STao Zhou 		if (ret)
30601f06e7f3STao Zhou 			goto out;
30611f06e7f3STao Zhou 
30621f06e7f3STao Zhou 		/* HW not usable */
30631f06e7f3STao Zhou 		if (amdgpu_ras_is_rma(adev)) {
30641f06e7f3STao Zhou 			ret = -EHWPOISON;
30651f06e7f3STao Zhou 			goto out;
30661f06e7f3STao Zhou 		}
30671f06e7f3STao Zhou 
30681f06e7f3STao Zhou 		ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
3069a8d133e6STao Zhou 	}
3070772df3dfSTao Zhou 
307178ad00c9STao Zhou out:
30721f06e7f3STao Zhou 	kfree(bps);
307378ad00c9STao Zhou 	return ret;
307478ad00c9STao Zhou }
307578ad00c9STao Zhou 
amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras * con,uint64_t addr)307678ad00c9STao Zhou static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
3077676deb38SDennis Li 				uint64_t addr)
3078676deb38SDennis Li {
3079676deb38SDennis Li 	struct ras_err_handler_data *data = con->eh_data;
3080676deb38SDennis Li 	int i;
3081676deb38SDennis Li 
3082676deb38SDennis Li 	addr >>= AMDGPU_GPU_PAGE_SHIFT;
3083676deb38SDennis Li 	for (i = 0; i < data->count; i++)
3084676deb38SDennis Li 		if (addr == data->bps[i].retired_page)
3085676deb38SDennis Li 			return true;
3086676deb38SDennis Li 
3087676deb38SDennis Li 	return false;
3088676deb38SDennis Li }
3089676deb38SDennis Li 
3090676deb38SDennis Li /*
30916e4be987STao Zhou  * check if an address belongs to bad page
30926e4be987STao Zhou  *
30936e4be987STao Zhou  * Note: this check is only for umc block
30946e4be987STao Zhou  */
amdgpu_ras_check_bad_page(struct amdgpu_device * adev,uint64_t addr)30956e4be987STao Zhou static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
30966e4be987STao Zhou 				uint64_t addr)
30976e4be987STao Zhou {
30986e4be987STao Zhou 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
30996e4be987STao Zhou 	bool ret = false;
31006e4be987STao Zhou 
31016e4be987STao Zhou 	if (!con || !con->eh_data)
31026e4be987STao Zhou 		return ret;
31036e4be987STao Zhou 
31046e4be987STao Zhou 	mutex_lock(&con->recovery_lock);
31056e4be987STao Zhou 	ret = amdgpu_ras_check_bad_page_unlock(con, addr);
3106676deb38SDennis Li 	mutex_unlock(&con->recovery_lock);
31076e4be987STao Zhou 	return ret;
31086e4be987STao Zhou }
31096e4be987STao Zhou 
amdgpu_ras_validate_threshold(struct amdgpu_device * adev,uint32_t max_count)31106e4be987STao Zhou static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
3111e5c04edfSChristian König 					  uint32_t max_count)
3112e4e6a589SLuben Tuikov {
3113c84d4670SGuchun Chen 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3114e5c04edfSChristian König 
3115c84d4670SGuchun Chen 	/*
3116c84d4670SGuchun Chen 	 * amdgpu_bad_page_threshold is used to config
311716b85a09SHawking Zhang 	 * the threshold for the number of bad pages.
311816b85a09SHawking Zhang 	 * -1:  Threshold is set to default value
311916b85a09SHawking Zhang 	 *      Driver will issue a warning message when threshold is reached
312016b85a09SHawking Zhang 	 *      and continue runtime services.
312116b85a09SHawking Zhang 	 * 0:   Disable bad page retirement
312216b85a09SHawking Zhang 	 *      Driver will not retire bad pages
312316b85a09SHawking Zhang 	 *      which is intended for debugging purpose.
312416b85a09SHawking Zhang 	 * -2:  Threshold is determined by a formula
312516b85a09SHawking Zhang 	 *      that assumes 1 bad page per 100M of local memory.
312616b85a09SHawking Zhang 	 *      Driver will continue runtime services when threhold is reached.
312716b85a09SHawking Zhang 	 * 0 < threshold < max number of bad page records in EEPROM,
312816b85a09SHawking Zhang 	 *      A user-defined threshold is set
312916b85a09SHawking Zhang 	 *      Driver will halt runtime services when this custom threshold is reached.
313016b85a09SHawking Zhang 	 */
3131c84d4670SGuchun Chen 	if (amdgpu_bad_page_threshold == -2) {
313216b85a09SHawking Zhang 		u64 val = adev->gmc.mc_vram_size;
3133e4e6a589SLuben Tuikov 
3134c84d4670SGuchun Chen 		do_div(val, RAS_BAD_PAGE_COVER);
3135e4e6a589SLuben Tuikov 		con->bad_page_cnt_threshold = min(lower_32_bits(val),
3136e5c04edfSChristian König 						  max_count);
3137e4e6a589SLuben Tuikov 	} else if (amdgpu_bad_page_threshold == -1) {
313816b85a09SHawking Zhang 		con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4;
313916b85a09SHawking Zhang 	} else {
3140e5c04edfSChristian König 		con->bad_page_cnt_threshold = min_t(int, max_count,
3141e4e6a589SLuben Tuikov 						    amdgpu_bad_page_threshold);
3142e4e6a589SLuben Tuikov 	}
3143c84d4670SGuchun Chen }
3144c84d4670SGuchun Chen 
amdgpu_ras_put_poison_req(struct amdgpu_device * adev,enum amdgpu_ras_block block,uint16_t pasid,pasid_notify pasid_fn,void * data,uint32_t reset)3145c84d4670SGuchun Chen int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
314698b5bc87SYiPeng Chai 		enum amdgpu_ras_block block, uint16_t pasid,
314798b5bc87SYiPeng Chai 		pasid_notify pasid_fn, void *data, uint32_t reset)
314898b5bc87SYiPeng Chai {
314998b5bc87SYiPeng Chai 	int ret = 0;
315098b5bc87SYiPeng Chai 	struct ras_poison_msg poison_msg;
315198b5bc87SYiPeng Chai 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
315298b5bc87SYiPeng Chai 
315398b5bc87SYiPeng Chai 	memset(&poison_msg, 0, sizeof(poison_msg));
315498b5bc87SYiPeng Chai 	poison_msg.block = block;
315598b5bc87SYiPeng Chai 	poison_msg.pasid = pasid;
315698b5bc87SYiPeng Chai 	poison_msg.reset = reset;
315798b5bc87SYiPeng Chai 	poison_msg.pasid_fn = pasid_fn;
315898b5bc87SYiPeng Chai 	poison_msg.data = data;
315998b5bc87SYiPeng Chai 
316098b5bc87SYiPeng Chai 	ret = kfifo_put(&con->poison_fifo, poison_msg);
316198b5bc87SYiPeng Chai 	if (!ret) {
316298b5bc87SYiPeng Chai 		dev_err(adev->dev, "Poison message fifo is full!\n");
316398b5bc87SYiPeng Chai 		return -ENOSPC;
316498b5bc87SYiPeng Chai 	}
316598b5bc87SYiPeng Chai 
316698b5bc87SYiPeng Chai 	return 0;
316798b5bc87SYiPeng Chai }
316898b5bc87SYiPeng Chai 
amdgpu_ras_get_poison_req(struct amdgpu_device * adev,struct ras_poison_msg * poison_msg)316998b5bc87SYiPeng Chai static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
317098b5bc87SYiPeng Chai 		struct ras_poison_msg *poison_msg)
317198b5bc87SYiPeng Chai {
317298b5bc87SYiPeng Chai 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
317398b5bc87SYiPeng Chai 
317498b5bc87SYiPeng Chai 	return kfifo_get(&con->poison_fifo, poison_msg);
317598b5bc87SYiPeng Chai }
317698b5bc87SYiPeng Chai 
amdgpu_ras_ecc_log_init(struct ras_ecc_log_info * ecc_log)317798b5bc87SYiPeng Chai static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
3178f493dd64SYiPeng Chai {
3179f493dd64SYiPeng Chai 	mutex_init(&ecc_log->lock);
3180f493dd64SYiPeng Chai 
3181f493dd64SYiPeng Chai 	INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
3182f493dd64SYiPeng Chai 	ecc_log->de_queried_count = 0;
318378146c1dSYiPeng Chai 	ecc_log->prev_de_queried_count = 0;
318478146c1dSYiPeng Chai }
3185f493dd64SYiPeng Chai 
amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info * ecc_log)3186f493dd64SYiPeng Chai static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
3187f493dd64SYiPeng Chai {
3188f493dd64SYiPeng Chai 	struct radix_tree_iter iter;
3189f493dd64SYiPeng Chai 	void __rcu **slot;
3190f493dd64SYiPeng Chai 	struct ras_ecc_err *ecc_err;
3191f493dd64SYiPeng Chai 
3192f493dd64SYiPeng Chai 	mutex_lock(&ecc_log->lock);
3193f493dd64SYiPeng Chai 	radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) {
3194f493dd64SYiPeng Chai 		ecc_err = radix_tree_deref_slot(slot);
3195f493dd64SYiPeng Chai 		kfree(ecc_err->err_pages.pfn);
3196f493dd64SYiPeng Chai 		kfree(ecc_err);
3197f493dd64SYiPeng Chai 		radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot);
3198f493dd64SYiPeng Chai 	}
3199f493dd64SYiPeng Chai 	mutex_unlock(&ecc_log->lock);
3200f493dd64SYiPeng Chai 
3201f493dd64SYiPeng Chai 	mutex_destroy(&ecc_log->lock);
3202f493dd64SYiPeng Chai 	ecc_log->de_queried_count = 0;
320378146c1dSYiPeng Chai 	ecc_log->prev_de_queried_count = 0;
320478146c1dSYiPeng Chai }
3205f493dd64SYiPeng Chai 
amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras * con,uint32_t delayed_ms)3206a734adfbSYiPeng Chai static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
3207c0470691SYiPeng Chai 				uint32_t delayed_ms)
3208c0470691SYiPeng Chai {
3209c0470691SYiPeng Chai 	int ret;
3210c0470691SYiPeng Chai 
3211c0470691SYiPeng Chai 	mutex_lock(&con->umc_ecc_log.lock);
3212c0470691SYiPeng Chai 	ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
3213c0470691SYiPeng Chai 			UMC_ECC_NEW_DETECTED_TAG);
3214c0470691SYiPeng Chai 	mutex_unlock(&con->umc_ecc_log.lock);
3215c0470691SYiPeng Chai 
3216c0470691SYiPeng Chai 	if (ret)
3217c0470691SYiPeng Chai 		schedule_delayed_work(&con->page_retirement_dwork,
3218c0470691SYiPeng Chai 			msecs_to_jiffies(delayed_ms));
3219c0470691SYiPeng Chai 
3220c0470691SYiPeng Chai 	return ret ? true : false;
3221c0470691SYiPeng Chai }
3222c0470691SYiPeng Chai 
amdgpu_ras_do_page_retirement(struct work_struct * work)3223c0470691SYiPeng Chai static void amdgpu_ras_do_page_retirement(struct work_struct *work)
32242cf8e50eSYiPeng Chai {
32252cf8e50eSYiPeng Chai 	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
32262cf8e50eSYiPeng Chai 					      page_retirement_dwork.work);
32272cf8e50eSYiPeng Chai 	struct amdgpu_device *adev = con->adev;
32282cf8e50eSYiPeng Chai 	struct ras_err_data err_data;
32292cf8e50eSYiPeng Chai 	unsigned long err_cnt;
32305f7697bbSTao Zhou 
32312cf8e50eSYiPeng Chai 	/* If gpu reset is ongoing, delay retiring the bad pages */
3232e23300dfSYiPeng Chai 	if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
3233e23300dfSYiPeng Chai 		amdgpu_ras_schedule_retirement_dwork(con,
3234e23300dfSYiPeng Chai 				AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3);
3235e23300dfSYiPeng Chai 		return;
32362cf8e50eSYiPeng Chai 	}
3237e23300dfSYiPeng Chai 
32382cf8e50eSYiPeng Chai 	amdgpu_ras_error_data_init(&err_data);
32392cf8e50eSYiPeng Chai 
32402cf8e50eSYiPeng Chai 	amdgpu_umc_handle_bad_pages(adev, &err_data);
32412cf8e50eSYiPeng Chai 	err_cnt = err_data.err_addr_cnt;
32425f7697bbSTao Zhou 
32432cf8e50eSYiPeng Chai 	amdgpu_ras_error_data_fini(&err_data);
32442cf8e50eSYiPeng Chai 
32452cf8e50eSYiPeng Chai 	if (err_cnt && amdgpu_ras_is_rma(adev))
3246792be2e2STao Zhou 		amdgpu_ras_reset_gpu(adev);
32475f7697bbSTao Zhou 
32485f7697bbSTao Zhou 	amdgpu_ras_schedule_retirement_dwork(con,
3249c0470691SYiPeng Chai 			AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
3250c0470691SYiPeng Chai }
32512cf8e50eSYiPeng Chai 
amdgpu_ras_poison_creation_handler(struct amdgpu_device * adev,uint32_t poison_creation_count)32522cf8e50eSYiPeng Chai static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
325378146c1dSYiPeng Chai 				uint32_t poison_creation_count)
325478146c1dSYiPeng Chai {
3255a734adfbSYiPeng Chai 	int ret = 0;
3256a734adfbSYiPeng Chai 	struct ras_ecc_log_info *ecc_log;
3257a734adfbSYiPeng Chai 	struct ras_query_if info;
3258a734adfbSYiPeng Chai 	uint32_t timeout = 0;
325978146c1dSYiPeng Chai 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
3260a734adfbSYiPeng Chai 	uint64_t de_queried_count;
326178146c1dSYiPeng Chai 	uint32_t new_detect_count, total_detect_count;
326278146c1dSYiPeng Chai 	uint32_t need_query_count = poison_creation_count;
326378146c1dSYiPeng Chai 	bool query_data_timeout = false;
326478146c1dSYiPeng Chai 	enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
32655b9de259SYang Wang 
3266a734adfbSYiPeng Chai 	memset(&info, 0, sizeof(info));
3267a734adfbSYiPeng Chai 	info.head.block = AMDGPU_RAS_BLOCK__UMC;
32683ca73073SYiPeng Chai 
3269a734adfbSYiPeng Chai 	ecc_log = &ras->umc_ecc_log;
3270a734adfbSYiPeng Chai 	total_detect_count = 0;
327178146c1dSYiPeng Chai 	do {
3272a734adfbSYiPeng Chai 		ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
32735b9de259SYang Wang 		if (ret)
327478146c1dSYiPeng Chai 			return ret;
327578146c1dSYiPeng Chai 
327678146c1dSYiPeng Chai 		de_queried_count = ecc_log->de_queried_count;
327778146c1dSYiPeng Chai 		if (de_queried_count > ecc_log->prev_de_queried_count) {
327878146c1dSYiPeng Chai 			new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
327978146c1dSYiPeng Chai 			ecc_log->prev_de_queried_count = de_queried_count;
328078146c1dSYiPeng Chai 			timeout = 0;
328178146c1dSYiPeng Chai 		} else {
328278146c1dSYiPeng Chai 			new_detect_count = 0;
328378146c1dSYiPeng Chai 		}
3284a734adfbSYiPeng Chai 
3285a734adfbSYiPeng Chai 		if (new_detect_count) {
328678146c1dSYiPeng Chai 			total_detect_count += new_detect_count;
328778146c1dSYiPeng Chai 		} else {
328878146c1dSYiPeng Chai 			if (!timeout && need_query_count)
328978146c1dSYiPeng Chai 				timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
329078146c1dSYiPeng Chai 
329178146c1dSYiPeng Chai 			if (timeout) {
329278146c1dSYiPeng Chai 				if (!--timeout) {
329378146c1dSYiPeng Chai 					query_data_timeout = true;
329478146c1dSYiPeng Chai 					break;
329578146c1dSYiPeng Chai 				}
329678146c1dSYiPeng Chai 				msleep(1);
3297a734adfbSYiPeng Chai 			}
3298a734adfbSYiPeng Chai 		}
329978146c1dSYiPeng Chai 	} while (total_detect_count < need_query_count);
330078146c1dSYiPeng Chai 
3301a734adfbSYiPeng Chai 	if (query_data_timeout) {
330278146c1dSYiPeng Chai 		dev_warn(adev->dev, "Can't find deferred error! count: %u\n",
330378146c1dSYiPeng Chai 			(need_query_count - total_detect_count));
330478146c1dSYiPeng Chai 		return -ENOENT;
330578146c1dSYiPeng Chai 	}
3306a734adfbSYiPeng Chai 
3307a734adfbSYiPeng Chai 	if (total_detect_count)
330878146c1dSYiPeng Chai 		schedule_delayed_work(&ras->page_retirement_dwork, 0);
33093ca73073SYiPeng Chai 
331078146c1dSYiPeng Chai 	return 0;
331178146c1dSYiPeng Chai }
3312a734adfbSYiPeng Chai 
amdgpu_ras_clear_poison_fifo(struct amdgpu_device * adev)3313a734adfbSYiPeng Chai static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev)
3314f852c979SYiPeng Chai {
3315f852c979SYiPeng Chai 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3316f852c979SYiPeng Chai 	struct ras_poison_msg msg;
3317f852c979SYiPeng Chai 	int ret;
3318f852c979SYiPeng Chai 
3319f852c979SYiPeng Chai 	do {
3320f852c979SYiPeng Chai 		ret = kfifo_get(&con->poison_fifo, &msg);
3321f852c979SYiPeng Chai 	} while (ret);
3322f852c979SYiPeng Chai }
3323f852c979SYiPeng Chai 
amdgpu_ras_poison_consumption_handler(struct amdgpu_device * adev,uint32_t msg_count,uint32_t * gpu_reset)3324f852c979SYiPeng Chai static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
3325370fbff4SYiPeng Chai 			uint32_t msg_count, uint32_t *gpu_reset)
3326e278849cSYiPeng Chai {
3327370fbff4SYiPeng Chai 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3328370fbff4SYiPeng Chai 	uint32_t reset_flags = 0, reset = 0;
3329e278849cSYiPeng Chai 	struct ras_poison_msg msg;
3330e278849cSYiPeng Chai 	int ret, i;
3331e278849cSYiPeng Chai 
3332370fbff4SYiPeng Chai 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
3333370fbff4SYiPeng Chai 
3334370fbff4SYiPeng Chai 	for (i = 0; i < msg_count; i++) {
3335e278849cSYiPeng Chai 		ret = amdgpu_ras_get_poison_req(adev, &msg);
3336e278849cSYiPeng Chai 		if (!ret)
3337e278849cSYiPeng Chai 			continue;
3338e278849cSYiPeng Chai 
3339e278849cSYiPeng Chai 		if (msg.pasid_fn)
3340e278849cSYiPeng Chai 			msg.pasid_fn(adev, msg.pasid, msg.data);
3341e278849cSYiPeng Chai 
3342e278849cSYiPeng Chai 		reset_flags |= msg.reset;
3343e278849cSYiPeng Chai 	}
3344e278849cSYiPeng Chai 
3345370fbff4SYiPeng Chai 	/* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
33465f7697bbSTao Zhou 	if (reset_flags && !amdgpu_ras_is_rma(adev)) {
3347792be2e2STao Zhou 		if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
3348e278849cSYiPeng Chai 			reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
3349e278849cSYiPeng Chai 		else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET)
3350e278849cSYiPeng Chai 			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
3351e278849cSYiPeng Chai 		else
3352e278849cSYiPeng Chai 			reset = reset_flags;
3353e278849cSYiPeng Chai 
3354e278849cSYiPeng Chai 		flush_delayed_work(&con->page_retirement_dwork);
3355370fbff4SYiPeng Chai 
3356370fbff4SYiPeng Chai 		con->gpu_reset_flags |= reset;
3357370fbff4SYiPeng Chai 		amdgpu_ras_reset_gpu(adev);
3358370fbff4SYiPeng Chai 
3359e278849cSYiPeng Chai 		*gpu_reset = reset;
3360e278849cSYiPeng Chai 
3361f852c979SYiPeng Chai 		/* Wait for gpu recovery to complete */
3362f852c979SYiPeng Chai 		flush_work(&con->recovery_work);
3363f852c979SYiPeng Chai 	}
3364370fbff4SYiPeng Chai 
3365370fbff4SYiPeng Chai 	return 0;
3366370fbff4SYiPeng Chai }
3367370fbff4SYiPeng Chai 
amdgpu_ras_page_retirement_thread(void * param)3368370fbff4SYiPeng Chai static int amdgpu_ras_page_retirement_thread(void *param)
33693fdcd0a3SYiPeng Chai {
33703fdcd0a3SYiPeng Chai 	struct amdgpu_device *adev = (struct amdgpu_device *)param;
33713fdcd0a3SYiPeng Chai 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
33723fdcd0a3SYiPeng Chai 	uint32_t poison_creation_count, msg_count;
3373e278849cSYiPeng Chai 	uint32_t gpu_reset;
3374e278849cSYiPeng Chai 	int ret;
33755f08275cSYiPeng Chai 
33763fdcd0a3SYiPeng Chai 	while (!kthread_should_stop()) {
33773fdcd0a3SYiPeng Chai 
33783fdcd0a3SYiPeng Chai 		wait_event_interruptible(con->page_retirement_wq,
33793fdcd0a3SYiPeng Chai 				kthread_should_stop() ||
3380c84a7e21SMukul Joshi 				atomic_read(&con->page_retirement_req_cnt));
33813fdcd0a3SYiPeng Chai 
33823fdcd0a3SYiPeng Chai 		if (kthread_should_stop())
3383c84a7e21SMukul Joshi 			break;
3384c84a7e21SMukul Joshi 
3385c84a7e21SMukul Joshi 		gpu_reset = 0;
3386e278849cSYiPeng Chai 
33875f08275cSYiPeng Chai 		do {
33885f08275cSYiPeng Chai 			poison_creation_count = atomic_read(&con->poison_creation_count);
33895f08275cSYiPeng Chai 			ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count);
33905f08275cSYiPeng Chai 			if (ret == -EIO)
33915f08275cSYiPeng Chai 				break;
33925f08275cSYiPeng Chai 
33935f08275cSYiPeng Chai 			if (poison_creation_count) {
33945f08275cSYiPeng Chai 				atomic_sub(poison_creation_count, &con->poison_creation_count);
33955f08275cSYiPeng Chai 				atomic_sub(poison_creation_count, &con->page_retirement_req_cnt);
33965f08275cSYiPeng Chai 			}
33975f08275cSYiPeng Chai 		} while (atomic_read(&con->poison_creation_count));
33985f08275cSYiPeng Chai 
33996c23f3d1SYiPeng Chai 		if (ret != -EIO) {
3400e278849cSYiPeng Chai 			msg_count = kfifo_len(&con->poison_fifo);
3401e278849cSYiPeng Chai 			if (msg_count) {
3402e278849cSYiPeng Chai 				ret = amdgpu_ras_poison_consumption_handler(adev,
3403e278849cSYiPeng Chai 						msg_count, &gpu_reset);
3404e278849cSYiPeng Chai 				if ((ret != -EIO) &&
3405e278849cSYiPeng Chai 				    (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET))
3406e278849cSYiPeng Chai 					atomic_sub(msg_count, &con->page_retirement_req_cnt);
3407e278849cSYiPeng Chai 			}
3408e278849cSYiPeng Chai 		}
3409e278849cSYiPeng Chai 
3410f852c979SYiPeng Chai 		if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
3411f852c979SYiPeng Chai 			/* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */
3412f852c979SYiPeng Chai 			/* Clear poison creation request */
3413f852c979SYiPeng Chai 			atomic_set(&con->poison_creation_count, 0);
3414f852c979SYiPeng Chai 
3415f852c979SYiPeng Chai 			/* Clear poison fifo */
3416f852c979SYiPeng Chai 			amdgpu_ras_clear_poison_fifo(adev);
3417f852c979SYiPeng Chai 
3418f852c979SYiPeng Chai 			/* Clear all poison requests */
3419f852c979SYiPeng Chai 			atomic_set(&con->page_retirement_req_cnt, 0);
3420f852c979SYiPeng Chai 
3421f852c979SYiPeng Chai 			if (ret == -EIO) {
3422f852c979SYiPeng Chai 				/* Wait for mode-1 reset to complete */
3423f852c979SYiPeng Chai 				down_read(&adev->reset_domain->sem);
3424f852c979SYiPeng Chai 				up_read(&adev->reset_domain->sem);
3425f852c979SYiPeng Chai 			}
3426f852c979SYiPeng Chai 
3427f852c979SYiPeng Chai 			/* Wake up work to save bad pages to eeprom */
3428f852c979SYiPeng Chai 			schedule_delayed_work(&con->page_retirement_dwork, 0);
3429f852c979SYiPeng Chai 		} else if (gpu_reset) {
3430f852c979SYiPeng Chai 			/* gpu just completed mode-2 reset or other reset */
3431f852c979SYiPeng Chai 			/* Clear poison consumption messages cached in fifo */
3432f852c979SYiPeng Chai 			msg_count = kfifo_len(&con->poison_fifo);
3433f852c979SYiPeng Chai 			if (msg_count) {
3434f852c979SYiPeng Chai 				amdgpu_ras_clear_poison_fifo(adev);
3435f852c979SYiPeng Chai 				atomic_sub(msg_count, &con->page_retirement_req_cnt);
3436f852c979SYiPeng Chai 			}
3437f852c979SYiPeng Chai 
3438f852c979SYiPeng Chai 			/* Wake up work to save bad pages to eeprom */
3439f852c979SYiPeng Chai 			schedule_delayed_work(&con->page_retirement_dwork, 0);
3440f852c979SYiPeng Chai 		}
3441f852c979SYiPeng Chai 	}
34423fdcd0a3SYiPeng Chai 
34433fdcd0a3SYiPeng Chai 	return 0;
34443fdcd0a3SYiPeng Chai }
34453fdcd0a3SYiPeng Chai 
amdgpu_ras_init_badpage_info(struct amdgpu_device * adev)34463fdcd0a3SYiPeng Chai int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
3447b17f8732SLijo Lazar {
3448b17f8732SLijo Lazar 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3449b17f8732SLijo Lazar 	struct amdgpu_ras_eeprom_control *control;
3450772df3dfSTao Zhou 	int ret;
3451b17f8732SLijo Lazar 
3452b17f8732SLijo Lazar 	if (!con || amdgpu_sriov_vf(adev))
3453b17f8732SLijo Lazar 		return 0;
3454b17f8732SLijo Lazar 
3455b17f8732SLijo Lazar 	control = &con->eeprom_control;
3456772df3dfSTao Zhou 	ret = amdgpu_ras_eeprom_init(control);
3457772df3dfSTao Zhou 	if (ret)
3458b17f8732SLijo Lazar 		return ret;
3459b17f8732SLijo Lazar 
3460b17f8732SLijo Lazar 	if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
3461772df3dfSTao Zhou 		control->ras_num_pa_recs = control->ras_num_recs;
3462a8f921a1Sganglxie 
3463772df3dfSTao Zhou 	if (control->ras_num_recs) {
3464772df3dfSTao Zhou 		ret = amdgpu_ras_load_bad_pages(adev);
3465b17f8732SLijo Lazar 		if (ret)
3466b17f8732SLijo Lazar 			return ret;
3467b17f8732SLijo Lazar 
3468b17f8732SLijo Lazar 		amdgpu_dpm_send_hbm_bad_pages_num(
3469b17f8732SLijo Lazar 			adev, control->ras_num_bad_pages);
3470ae756cd8STao Zhou 
3471b17f8732SLijo Lazar 		if (con->update_channel_flag == true) {
3472b17f8732SLijo Lazar 			amdgpu_dpm_send_hbm_bad_channel_flag(
3473b17f8732SLijo Lazar 				adev, control->bad_channel_bitmap);
3474772df3dfSTao Zhou 			con->update_channel_flag = false;
3475b17f8732SLijo Lazar 		}
3476b17f8732SLijo Lazar 
347705d50ea3STao Zhou 		/* The format action is only applied to new ASICs */
347805d50ea3STao Zhou 		if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
347905d50ea3STao Zhou 		    control->tbl_hdr.version < RAS_TABLE_VER_V3)
348005d50ea3STao Zhou 			if (!amdgpu_ras_eeprom_reset_table(control))
348105d50ea3STao Zhou 				if (amdgpu_ras_save_bad_pages(adev, NULL))
348205d50ea3STao Zhou 					dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
348305d50ea3STao Zhou 	}
3484b17f8732SLijo Lazar 
3485b17f8732SLijo Lazar 	return ret;
3486b17f8732SLijo Lazar }
3487b17f8732SLijo Lazar 
amdgpu_ras_recovery_init(struct amdgpu_device * adev,bool init_bp_info)3488b17f8732SLijo Lazar int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
3489b17f8732SLijo Lazar {
3490c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3491c030f2e4Sxinhui pan 	struct ras_err_handler_data **data;
34924d1337d2SAndrey Grodzovsky 	u32  max_eeprom_records_count = 0;
3493e4e6a589SLuben Tuikov 	int ret;
349478ad00c9STao Zhou 
3495c030f2e4Sxinhui pan 	if (!con || amdgpu_sriov_vf(adev))
3496e0e146d5SStanley.Yang 		return 0;
34974d1337d2SAndrey Grodzovsky 
34984d1337d2SAndrey Grodzovsky 	/* Allow access to RAS EEPROM via debugfs, when the ASIC
34991d9d2ca8SLuben Tuikov 	 * supports RAS and debugfs is enabled, but when
35001d9d2ca8SLuben Tuikov 	 * adev->ras_enabled is unset, i.e. when "ras_enable"
35011d9d2ca8SLuben Tuikov 	 * module parameter is set to 0.
35021d9d2ca8SLuben Tuikov 	 */
35031d9d2ca8SLuben Tuikov 	con->adev = adev;
35041d9d2ca8SLuben Tuikov 
35051d9d2ca8SLuben Tuikov 	if (!adev->ras_enabled)
35061d9d2ca8SLuben Tuikov 		return 0;
35071d9d2ca8SLuben Tuikov 
35081d9d2ca8SLuben Tuikov 	data = &con->eh_data;
35091d9d2ca8SLuben Tuikov 	*data = kzalloc(sizeof(**data), GFP_KERNEL);
3510091411beSSrinivasan Shanmugam 	if (!*data) {
35111a6fc071STao Zhou 		ret = -ENOMEM;
35121a6fc071STao Zhou 		goto out;
35131a6fc071STao Zhou 	}
35141a6fc071STao Zhou 
3515c030f2e4Sxinhui pan 	mutex_init(&con->recovery_lock);
3516c030f2e4Sxinhui pan 	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
3517c030f2e4Sxinhui pan 	atomic_set(&con->in_recovery, 0);
3518c030f2e4Sxinhui pan 	con->eeprom_control.bad_channel_bitmap = 0;
351969691c82SStanley.Yang 
3520c030f2e4Sxinhui pan 	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
35217f599fedSStanley.Yang 	amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
3522e4e6a589SLuben Tuikov 
3523c84d4670SGuchun Chen 	if (init_bp_info) {
3524b17f8732SLijo Lazar 		ret = amdgpu_ras_init_badpage_info(adev);
3525b17f8732SLijo Lazar 		if (ret)
352678ad00c9STao Zhou 			goto free;
35271a6fc071STao Zhou 	}
352878ad00c9STao Zhou 
3529c030f2e4Sxinhui pan 	mutex_init(&con->page_rsv_lock);
3530af730e08SYiPeng Chai 	INIT_KFIFO(con->poison_fifo);
353198b5bc87SYiPeng Chai 	mutex_init(&con->page_retirement_lock);
35323fdcd0a3SYiPeng Chai 	init_waitqueue_head(&con->page_retirement_wq);
35333fdcd0a3SYiPeng Chai 	atomic_set(&con->page_retirement_req_cnt, 0);
35343fdcd0a3SYiPeng Chai 	atomic_set(&con->poison_creation_count, 0);
35355f08275cSYiPeng Chai 	con->page_retirement_thread =
35363fdcd0a3SYiPeng Chai 		kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement");
35373fdcd0a3SYiPeng Chai 	if (IS_ERR(con->page_retirement_thread)) {
35383fdcd0a3SYiPeng Chai 		con->page_retirement_thread = NULL;
35393fdcd0a3SYiPeng Chai 		dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
35403fdcd0a3SYiPeng Chai 	}
35413fdcd0a3SYiPeng Chai 
35423fdcd0a3SYiPeng Chai 	INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
35432cf8e50eSYiPeng Chai 	amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
3544f493dd64SYiPeng Chai #ifdef CONFIG_X86_MCE_AMD
354512b2cab7SMukul Joshi 	if ((adev->asic_type == CHIP_ALDEBARAN) &&
354612b2cab7SMukul Joshi 	    (adev->gmc.xgmi.connected_to_cpu))
354712b2cab7SMukul Joshi 		amdgpu_register_bad_pages_mca_notifier(adev);
354891a1a52dSMukul Joshi #endif
354912b2cab7SMukul Joshi 	return 0;
3550c030f2e4Sxinhui pan 
35511a6fc071STao Zhou free:
35521a6fc071STao Zhou 	kfree((*data)->bps);
35531a6fc071STao Zhou 	kfree(*data);
35541a6fc071STao Zhou 	con->eh_data = NULL;
35551995b3a3SFelix Kuehling out:
35561a6fc071STao Zhou 	dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);
3557cf696091SLuben Tuikov 
35581a6fc071STao Zhou 	/*
3559b82e65a9SGuchun Chen 	 * Except error threshold exceeding case, other failure cases in this
3560b82e65a9SGuchun Chen 	 * function would not fail amdgpu driver init.
3561b82e65a9SGuchun Chen 	 */
3562b82e65a9SGuchun Chen 	if (!amdgpu_ras_is_rma(adev))
3563792be2e2STao Zhou 		ret = 0;
3564b82e65a9SGuchun Chen 	else
3565b82e65a9SGuchun Chen 		ret = -EINVAL;
3566b82e65a9SGuchun Chen 
3567b82e65a9SGuchun Chen 	return ret;
35681a6fc071STao Zhou }
3569c030f2e4Sxinhui pan 
amdgpu_ras_recovery_fini(struct amdgpu_device * adev)3570c030f2e4Sxinhui pan static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
3571c030f2e4Sxinhui pan {
3572c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3573c030f2e4Sxinhui pan 	struct ras_err_handler_data *data = con->eh_data;
3574c030f2e4Sxinhui pan 	int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES;
3575c0470691SYiPeng Chai 	bool ret;
3576c0470691SYiPeng Chai 
3577c030f2e4Sxinhui pan 	/* recovery_init failed to init it, fini is useless */
35781a6fc071STao Zhou 	if (!data)
35791a6fc071STao Zhou 		return 0;
35801a6fc071STao Zhou 
35811a6fc071STao Zhou 	/* Save all cached bad pages to eeprom */
3582c0470691SYiPeng Chai 	do {
3583c0470691SYiPeng Chai 		flush_delayed_work(&con->page_retirement_dwork);
3584c0470691SYiPeng Chai 		ret = amdgpu_ras_schedule_retirement_dwork(con, 0);
3585c0470691SYiPeng Chai 	} while (ret && max_flush_timeout--);
3586c0470691SYiPeng Chai 
3587c0470691SYiPeng Chai 	if (con->page_retirement_thread)
35883fdcd0a3SYiPeng Chai 		kthread_stop(con->page_retirement_thread);
35893fdcd0a3SYiPeng Chai 
35903fdcd0a3SYiPeng Chai 	atomic_set(&con->page_retirement_req_cnt, 0);
35913fdcd0a3SYiPeng Chai 	atomic_set(&con->poison_creation_count, 0);
35925f08275cSYiPeng Chai 
35933fdcd0a3SYiPeng Chai 	mutex_destroy(&con->page_rsv_lock);
3594af730e08SYiPeng Chai 
3595af730e08SYiPeng Chai 	cancel_work_sync(&con->recovery_work);
3596c030f2e4Sxinhui pan 
3597c030f2e4Sxinhui pan 	cancel_delayed_work_sync(&con->page_retirement_dwork);
35982cf8e50eSYiPeng Chai 
35992cf8e50eSYiPeng Chai 	amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
3600f493dd64SYiPeng Chai 
3601f493dd64SYiPeng Chai 	mutex_lock(&con->recovery_lock);
3602c030f2e4Sxinhui pan 	con->eh_data = NULL;
3603c030f2e4Sxinhui pan 	kfree(data->bps);
3604c030f2e4Sxinhui pan 	kfree(data);
3605c030f2e4Sxinhui pan 	mutex_unlock(&con->recovery_lock);
3606c030f2e4Sxinhui pan 
3607c030f2e4Sxinhui pan 	return 0;
3608c030f2e4Sxinhui pan }
3609c030f2e4Sxinhui pan /* recovery end */
3610c030f2e4Sxinhui pan 
amdgpu_ras_asic_supported(struct amdgpu_device * adev)3611c030f2e4Sxinhui pan static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
3612084e2640SLuben Tuikov {
36135436ab94SStanley.Yang 	if (amdgpu_sriov_vf(adev)) {
361482835055SYiPeng Chai 		switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
36154e8303cfSLijo Lazar 		case IP_VERSION(13, 0, 2):
361682835055SYiPeng Chai 		case IP_VERSION(13, 0, 6):
361780578f16SYiPeng Chai 		case IP_VERSION(13, 0, 12):
36189a826c4aSHawking Zhang 		case IP_VERSION(13, 0, 14):
36191dbd59f3SHawking Zhang 			return true;
362082835055SYiPeng Chai 		default:
362182835055SYiPeng Chai 			return false;
362282835055SYiPeng Chai 		}
362382835055SYiPeng Chai 	}
362482835055SYiPeng Chai 
362582835055SYiPeng Chai 	if (adev->asic_type == CHIP_IP_DISCOVERY) {
3626073285efSYiPeng Chai 		switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
36274e8303cfSLijo Lazar 		case IP_VERSION(13, 0, 0):
3628073285efSYiPeng Chai 		case IP_VERSION(13, 0, 6):
3629cb906ce3SStanley.Yang 		case IP_VERSION(13, 0, 10):
3630073285efSYiPeng Chai 		case IP_VERSION(13, 0, 12):
36319a826c4aSHawking Zhang 		case IP_VERSION(13, 0, 14):
36321dbd59f3SHawking Zhang 		case IP_VERSION(14, 0, 3):
3633d1ebe307SCandice Li 			return true;
3634073285efSYiPeng Chai 		default:
3635073285efSYiPeng Chai 			return false;
3636073285efSYiPeng Chai 		}
3637073285efSYiPeng Chai 	}
3638073285efSYiPeng Chai 
3639073285efSYiPeng Chai 	return adev->asic_type == CHIP_VEGA10 ||
3640084e2640SLuben Tuikov 		adev->asic_type == CHIP_VEGA20 ||
3641084e2640SLuben Tuikov 		adev->asic_type == CHIP_ARCTURUS ||
3642084e2640SLuben Tuikov 		adev->asic_type == CHIP_ALDEBARAN ||
364375f06251SHawking Zhang 		adev->asic_type == CHIP_SIENNA_CICHLID;
3644084e2640SLuben Tuikov }
36455436ab94SStanley.Yang 
36465436ab94SStanley.Yang /*
36475caf466aSxinhui pan  * this is workaround for vega20 workstation sku,
3648f50160cfSStanley.Yang  * force enable gfx ras, ignore vbios gfx ras flag
3649f50160cfSStanley.Yang  * due to GC EDC can not write
3650f50160cfSStanley.Yang  */
amdgpu_ras_get_quirks(struct amdgpu_device * adev)3651f50160cfSStanley.Yang static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
3652e509965eSLuben Tuikov {
3653f50160cfSStanley.Yang 	struct atom_context *ctx = adev->mode_info.atom_context;
3654f50160cfSStanley.Yang 
3655f50160cfSStanley.Yang 	if (!ctx)
3656f50160cfSStanley.Yang 		return;
3657f50160cfSStanley.Yang 
3658f50160cfSStanley.Yang 	if (strnstr(ctx->vbios_pn, "D16406",
3659adf64e21SMario Limonciello 		    sizeof(ctx->vbios_pn)) ||
3660adf64e21SMario Limonciello 		strnstr(ctx->vbios_pn, "D36002",
3661adf64e21SMario Limonciello 			sizeof(ctx->vbios_pn)))
3662adf64e21SMario Limonciello 		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
36638ab0d6f0SLuben Tuikov }
3664f50160cfSStanley.Yang 
3665f50160cfSStanley.Yang /* Query ras capablity via atomfirmware interface */
amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device * adev)36664e2965bdSHawking Zhang static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev)
36674e2965bdSHawking Zhang {
36684e2965bdSHawking Zhang 	/* mem_ecc cap */
36694e2965bdSHawking Zhang 	if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
36704e2965bdSHawking Zhang 		dev_info(adev->dev, "MEM ECC is active.\n");
36714e2965bdSHawking Zhang 		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
36724e2965bdSHawking Zhang 					 1 << AMDGPU_RAS_BLOCK__DF);
36734e2965bdSHawking Zhang 	} else {
36744e2965bdSHawking Zhang 		dev_info(adev->dev, "MEM ECC is not presented.\n");
36754e2965bdSHawking Zhang 	}
36764e2965bdSHawking Zhang 
36774e2965bdSHawking Zhang 	/* sram_ecc cap */
36784e2965bdSHawking Zhang 	if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
36794e2965bdSHawking Zhang 		dev_info(adev->dev, "SRAM ECC is active.\n");
36804e2965bdSHawking Zhang 		if (!amdgpu_sriov_vf(adev))
36814e2965bdSHawking Zhang 			adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
36824e2965bdSHawking Zhang 						  1 << AMDGPU_RAS_BLOCK__DF);
36834e2965bdSHawking Zhang 		else
36844e2965bdSHawking Zhang 			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
36854e2965bdSHawking Zhang 						 1 << AMDGPU_RAS_BLOCK__SDMA |
36864e2965bdSHawking Zhang 						 1 << AMDGPU_RAS_BLOCK__GFX);
36874e2965bdSHawking Zhang 
36884e2965bdSHawking Zhang 		/*
36894e2965bdSHawking Zhang 		 * VCN/JPEG RAS can be supported on both bare metal and
36904e2965bdSHawking Zhang 		 * SRIOV environment
36914e2965bdSHawking Zhang 		 */
36924e2965bdSHawking Zhang 		if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) ||
36934e2965bdSHawking Zhang 		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) ||
36944e2965bdSHawking Zhang 		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
36954e2965bdSHawking Zhang 			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
36964e2965bdSHawking Zhang 						 1 << AMDGPU_RAS_BLOCK__JPEG);
36974e2965bdSHawking Zhang 		else
36984e2965bdSHawking Zhang 			adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
36994e2965bdSHawking Zhang 						  1 << AMDGPU_RAS_BLOCK__JPEG);
37004e2965bdSHawking Zhang 
37014e2965bdSHawking Zhang 		/*
37024e2965bdSHawking Zhang 		 * XGMI RAS is not supported if xgmi num physical nodes
37034e2965bdSHawking Zhang 		 * is zero
37044e2965bdSHawking Zhang 		 */
37054e2965bdSHawking Zhang 		if (!adev->gmc.xgmi.num_physical_nodes)
37064e2965bdSHawking Zhang 			adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL);
37074e2965bdSHawking Zhang 	} else {
37084e2965bdSHawking Zhang 		dev_info(adev->dev, "SRAM ECC is not presented.\n");
37094e2965bdSHawking Zhang 	}
37104e2965bdSHawking Zhang }
37114e2965bdSHawking Zhang 
37124e2965bdSHawking Zhang /* Query poison mode from umc/df IP callbacks */
amdgpu_ras_query_poison_mode(struct amdgpu_device * adev)37134e2965bdSHawking Zhang static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
37144e2965bdSHawking Zhang {
37154e2965bdSHawking Zhang 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
37164e2965bdSHawking Zhang 	bool df_poison, umc_poison;
37174e2965bdSHawking Zhang 
37184e2965bdSHawking Zhang 	/* poison setting is useless on SRIOV guest */
37194e2965bdSHawking Zhang 	if (amdgpu_sriov_vf(adev) || !con)
37204e2965bdSHawking Zhang 		return;
37214e2965bdSHawking Zhang 
37224e2965bdSHawking Zhang 	/* Init poison supported flag, the default value is false */
37234e2965bdSHawking Zhang 	if (adev->gmc.xgmi.connected_to_cpu ||
37244e2965bdSHawking Zhang 	    adev->gmc.is_app_apu) {
37254e2965bdSHawking Zhang 		/* enabled by default when GPU is connected to CPU */
37264e2965bdSHawking Zhang 		con->poison_supported = true;
37274e2965bdSHawking Zhang 	} else if (adev->df.funcs &&
37284e2965bdSHawking Zhang 	    adev->df.funcs->query_ras_poison_mode &&
37294e2965bdSHawking Zhang 	    adev->umc.ras &&
37304e2965bdSHawking Zhang 	    adev->umc.ras->query_ras_poison_mode) {
37314e2965bdSHawking Zhang 		df_poison =
37324e2965bdSHawking Zhang 			adev->df.funcs->query_ras_poison_mode(adev);
37334e2965bdSHawking Zhang 		umc_poison =
37344e2965bdSHawking Zhang 			adev->umc.ras->query_ras_poison_mode(adev);
37354e2965bdSHawking Zhang 
37364e2965bdSHawking Zhang 		/* Only poison is set in both DF and UMC, we can support it */
37374e2965bdSHawking Zhang 		if (df_poison && umc_poison)
37384e2965bdSHawking Zhang 			con->poison_supported = true;
37394e2965bdSHawking Zhang 		else if (df_poison != umc_poison)
37404e2965bdSHawking Zhang 			dev_warn(adev->dev,
37414e2965bdSHawking Zhang 				"Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
37424e2965bdSHawking Zhang 				df_poison, umc_poison);
37434e2965bdSHawking Zhang 	}
37444e2965bdSHawking Zhang }
37454e2965bdSHawking Zhang 
37464e2965bdSHawking Zhang /*
3747f50160cfSStanley.Yang  * check hardware's ras ability which will be saved in hw_supported.
37485caf466aSxinhui pan  * if hardware does not support ras, we can skip some ras initializtion and
37495caf466aSxinhui pan  * forbid some ras operations from IP.
37505caf466aSxinhui pan  * if software itself, say boot parameter, limit the ras ability. We still
37515caf466aSxinhui pan  * need allow IP do some limited operations, like disable. In such case,
37525caf466aSxinhui pan  * we have to initialize ras as normal. but need check if operation is
37535caf466aSxinhui pan  * allowed or not in each function.
37545caf466aSxinhui pan  */
amdgpu_ras_check_supported(struct amdgpu_device * adev)37555caf466aSxinhui pan static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
3756e509965eSLuben Tuikov {
3757c030f2e4Sxinhui pan 	adev->ras_hw_enabled = adev->ras_enabled = 0;
37588ab0d6f0SLuben Tuikov 
3759c030f2e4Sxinhui pan 	if (!amdgpu_ras_asic_supported(adev))
376038298ce6SStanley.Yang 		return;
37615caf466aSxinhui pan 
3762b404ae82Sxinhui pan 	if (amdgpu_sriov_vf(adev)) {
3763907fec2dSVictor Skvortsov 		if (amdgpu_virt_get_ras_capability(adev))
3764907fec2dSVictor Skvortsov 			goto init_ras_enabled_flag;
3765907fec2dSVictor Skvortsov 	}
3766907fec2dSVictor Skvortsov 
3767907fec2dSVictor Skvortsov 	/* query ras capability from psp */
37684e2965bdSHawking Zhang 	if (amdgpu_psp_get_ras_capability(&adev->psp))
37694e2965bdSHawking Zhang 		goto init_ras_enabled_flag;
37704e2965bdSHawking Zhang 
37714e2965bdSHawking Zhang 	/* query ras capablity from bios */
37724e2965bdSHawking Zhang 	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
377338298ce6SStanley.Yang 		amdgpu_ras_query_ras_capablity_from_vbios(adev);
37744e2965bdSHawking Zhang 	} else {
377575f06251SHawking Zhang 		/* driver only manages a few IP blocks RAS feature
377675f06251SHawking Zhang 		 * when GPU is connected cpu through XGMI */
377775f06251SHawking Zhang 		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
37788ab0d6f0SLuben Tuikov 					   1 << AMDGPU_RAS_BLOCK__SDMA |
377975f06251SHawking Zhang 					   1 << AMDGPU_RAS_BLOCK__MMHUB);
378075f06251SHawking Zhang 	}
378175f06251SHawking Zhang 
378288474ccaSGuchun Chen 	/* apply asic specific settings (vega20 only for now) */
37834e2965bdSHawking Zhang 	amdgpu_ras_get_quirks(adev);
3784e509965eSLuben Tuikov 
3785b404ae82Sxinhui pan 	/* query poison mode from umc/df ip callback */
37864e2965bdSHawking Zhang 	amdgpu_ras_query_poison_mode(adev);
37874e2965bdSHawking Zhang 
37884e2965bdSHawking Zhang init_ras_enabled_flag:
37894e2965bdSHawking Zhang 	/* hw_supported needs to be aligned with RAS block mask. */
3790b404ae82Sxinhui pan 	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
37918ab0d6f0SLuben Tuikov 
3792c030f2e4Sxinhui pan 	adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
37938ab0d6f0SLuben Tuikov 		adev->ras_hw_enabled & amdgpu_ras_mask;
37948ab0d6f0SLuben Tuikov 
379504c4fcd2SYang Wang 	/* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */
379613c13bddSXiang Liu 	adev->aca.is_enabled =
379759af05d6SCandice Li 		(amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
379813c13bddSXiang Liu 		 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
379913c13bddSXiang Liu 		 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14));
380013c13bddSXiang Liu 
3801c389a060STao Zhou 	/* bad page feature is not applicable to specific app platform */
3802c389a060STao Zhou 	if (adev->gmc.is_app_apu &&
3803c389a060STao Zhou 	    amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0))
3804c389a060STao Zhou 		amdgpu_bad_page_threshold = 0;
3805c389a060STao Zhou }
3806c030f2e4Sxinhui pan 
amdgpu_ras_counte_dw(struct work_struct * work)3807c030f2e4Sxinhui pan static void amdgpu_ras_counte_dw(struct work_struct *work)
380805adfd80SLuben Tuikov {
380905adfd80SLuben Tuikov 	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
381005adfd80SLuben Tuikov 					      ras_counte_delay_work.work);
381105adfd80SLuben Tuikov 	struct amdgpu_device *adev = con->adev;
381205adfd80SLuben Tuikov 	struct drm_device *dev = adev_to_drm(adev);
3813a3fbb0d8SGuchun Chen 	unsigned long ce_count, ue_count;
381405adfd80SLuben Tuikov 	int res;
381505adfd80SLuben Tuikov 
381605adfd80SLuben Tuikov 	res = pm_runtime_get_sync(dev->dev);
381705adfd80SLuben Tuikov 	if (res < 0)
381805adfd80SLuben Tuikov 		goto Out;
381905adfd80SLuben Tuikov 
382005adfd80SLuben Tuikov 	/* Cache new values.
382105adfd80SLuben Tuikov 	 */
382205adfd80SLuben Tuikov 	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) {
38234a1c9a44SHawking Zhang 		atomic_set(&con->ras_ce_count, ce_count);
382405adfd80SLuben Tuikov 		atomic_set(&con->ras_ue_count, ue_count);
382505adfd80SLuben Tuikov 	}
38264d9f771eSLuben Tuikov 
382705adfd80SLuben Tuikov 	pm_runtime_mark_last_busy(dev->dev);
382805adfd80SLuben Tuikov Out:
382905adfd80SLuben Tuikov 	pm_runtime_put_autosuspend(dev->dev);
383005adfd80SLuben Tuikov }
383105adfd80SLuben Tuikov 
amdgpu_get_ras_schema(struct amdgpu_device * adev)383205adfd80SLuben Tuikov static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
3833625e5f38SAsad Kamal {
3834625e5f38SAsad Kamal 	return  amdgpu_ras_is_poison_mode_supported(adev) ? AMDGPU_RAS_ERROR__POISON : 0 |
3835625e5f38SAsad Kamal 			AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE |
3836625e5f38SAsad Kamal 			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE |
3837625e5f38SAsad Kamal 			AMDGPU_RAS_ERROR__PARITY;
3838625e5f38SAsad Kamal }
3839625e5f38SAsad Kamal 
ras_event_mgr_init(struct ras_event_manager * mgr)3840625e5f38SAsad Kamal static void ras_event_mgr_init(struct ras_event_manager *mgr)
38419dc57c2aSYang Wang {
38429dc57c2aSYang Wang 	struct ras_event_state *event_state;
384359f488beSYang Wang 	int i;
38449dc57c2aSYang Wang 
38459dc57c2aSYang Wang 	memset(mgr, 0, sizeof(*mgr));
384675ac6a25SYang Wang 	atomic64_set(&mgr->seqno, 0);
384775ac6a25SYang Wang 
384875ac6a25SYang Wang 	for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
384959f488beSYang Wang 		event_state = &mgr->event_state[i];
385059f488beSYang Wang 		event_state->last_seqno = RAS_EVENT_INVALID_ID;
385159f488beSYang Wang 		atomic64_set(&event_state->count, 0);
385259f488beSYang Wang 	}
385359f488beSYang Wang }
38549dc57c2aSYang Wang 
amdgpu_ras_event_mgr_init(struct amdgpu_device * adev)38559dc57c2aSYang Wang static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
38569dc57c2aSYang Wang {
38579dc57c2aSYang Wang 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
38589dc57c2aSYang Wang 	struct amdgpu_hive_info *hive;
38599dc57c2aSYang Wang 
38609dc57c2aSYang Wang 	if (!ras)
38619dc57c2aSYang Wang 		return;
38629dc57c2aSYang Wang 
38639dc57c2aSYang Wang 	hive = amdgpu_get_xgmi_hive(adev);
38649dc57c2aSYang Wang 	ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;
38659dc57c2aSYang Wang 
38669dc57c2aSYang Wang 	/* init event manager with node 0 on xgmi system */
38679dc57c2aSYang Wang 	if (!amdgpu_reset_in_recovery(adev)) {
3868e283f4fbSLijo Lazar 		if (!hive || adev->gmc.xgmi.node_id == 0)
38699dc57c2aSYang Wang 			ras_event_mgr_init(ras->event_mgr);
38709dc57c2aSYang Wang 	}
38719dc57c2aSYang Wang 
38729dc57c2aSYang Wang 	if (hive)
38739dc57c2aSYang Wang 		amdgpu_put_xgmi_hive(hive);
38749dc57c2aSYang Wang }
38759dc57c2aSYang Wang 
amdgpu_ras_init_reserved_vram_size(struct amdgpu_device * adev)38769dc57c2aSYang Wang static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
3877473af28dSHawking Zhang {
3878473af28dSHawking Zhang 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3879473af28dSHawking Zhang 
3880473af28dSHawking Zhang 	if (!con || (adev->flags & AMD_IS_APU))
3881473af28dSHawking Zhang 		return;
3882473af28dSHawking Zhang 
3883473af28dSHawking Zhang 	switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
3884473af28dSHawking Zhang 	case IP_VERSION(13, 0, 2):
3885473af28dSHawking Zhang 	case IP_VERSION(13, 0, 6):
3886473af28dSHawking Zhang 	case IP_VERSION(13, 0, 12):
38879a826c4aSHawking Zhang 		con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
388816b85a09SHawking Zhang 		break;
388916b85a09SHawking Zhang 	case IP_VERSION(13, 0, 14):
3890473af28dSHawking Zhang 		con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1);
389116b85a09SHawking Zhang 		break;
3892473af28dSHawking Zhang 	default:
3893473af28dSHawking Zhang 		break;
3894473af28dSHawking Zhang 	}
3895473af28dSHawking Zhang }
3896473af28dSHawking Zhang 
amdgpu_ras_init(struct amdgpu_device * adev)3897473af28dSHawking Zhang int amdgpu_ras_init(struct amdgpu_device *adev)
3898c030f2e4Sxinhui pan {
3899c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3900c030f2e4Sxinhui pan 	int r;
3901c030f2e4Sxinhui pan 
3902c030f2e4Sxinhui pan 	if (con)
3903c030f2e4Sxinhui pan 		return 0;
3904c030f2e4Sxinhui pan 
3905c030f2e4Sxinhui pan 	con = kzalloc(sizeof(*con) +
3906091411beSSrinivasan Shanmugam 			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
3907640ae42eSJohn Clements 			sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
3908640ae42eSJohn Clements 			GFP_KERNEL);
3909091411beSSrinivasan Shanmugam 	if (!con)
3910c030f2e4Sxinhui pan 		return -ENOMEM;
3911c030f2e4Sxinhui pan 
3912c030f2e4Sxinhui pan 	con->adev = adev;
391305adfd80SLuben Tuikov 	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
391405adfd80SLuben Tuikov 	atomic_set(&con->ras_ce_count, 0);
391505adfd80SLuben Tuikov 	atomic_set(&con->ras_ue_count, 0);
391605adfd80SLuben Tuikov 
391705adfd80SLuben Tuikov 	con->objs = (struct ras_manager *)(con + 1);
3918c030f2e4Sxinhui pan 
3919c030f2e4Sxinhui pan 	amdgpu_ras_set_context(adev, con);
3920c030f2e4Sxinhui pan 
3921c030f2e4Sxinhui pan 	amdgpu_ras_check_supported(adev);
3922e509965eSLuben Tuikov 
3923e509965eSLuben Tuikov 	if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
39247ddd9770SOak Zeng 		/* set gfx block ras context feature for VEGA20 Gaming
3925970fd197SStanley.Yang 		 * send ras disable cmd to ras ta during ras late init.
3926970fd197SStanley.Yang 		 */
3927970fd197SStanley.Yang 		if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
39288ab0d6f0SLuben Tuikov 			con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
3929970fd197SStanley.Yang 
3930970fd197SStanley.Yang 			return 0;
3931970fd197SStanley.Yang 		}
3932970fd197SStanley.Yang 
3933970fd197SStanley.Yang 		r = 0;
39345e91160aSGuchun Chen 		goto release_con;
39355436ab94SStanley.Yang 	}
3936fb2a3607SHawking Zhang 
3937fb2a3607SHawking Zhang 	con->update_channel_flag = false;
393869691c82SStanley.Yang 	con->features = 0;
3939c030f2e4Sxinhui pan 	con->schema = 0;
3940625e5f38SAsad Kamal 	INIT_LIST_HEAD(&con->head);
3941c030f2e4Sxinhui pan 	/* Might need get this flag from vbios. */
3942108c6a63Sxinhui pan 	con->flags = RAS_DEFAULT_FLAGS;
3943108c6a63Sxinhui pan 
3944c030f2e4Sxinhui pan 	/* initialize nbio ras function ahead of any other
39456e36f231SHawking Zhang 	 * ras functions so hardware fatal error interrupt
39466e36f231SHawking Zhang 	 * can be enabled as early as possible */
39476e36f231SHawking Zhang 	switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {
39484e8303cfSLijo Lazar 	case IP_VERSION(7, 4, 0):
3949fdc94d3aSHawking Zhang 	case IP_VERSION(7, 4, 1):
3950fdc94d3aSHawking Zhang 	case IP_VERSION(7, 4, 4):
3951fdc94d3aSHawking Zhang 		if (!adev->gmc.xgmi.connected_to_cpu)
3952fdc94d3aSHawking Zhang 			adev->nbio.ras = &nbio_v7_4_ras;
39532e54fe5dSyipechai 		break;
39546e36f231SHawking Zhang 	case IP_VERSION(4, 3, 0):
39559af357bcSHawking Zhang 		if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
39569af357bcSHawking Zhang 			/* unlike other generation of nbio ras,
39579af357bcSHawking Zhang 			 * nbio v4_3 only support fatal error interrupt
39589af357bcSHawking Zhang 			 * to inform software that DF is freezed due to
39599af357bcSHawking Zhang 			 * system fatal error event. driver should not
39609af357bcSHawking Zhang 			 * enable nbio ras in such case. Instead,
39619af357bcSHawking Zhang 			 * check DF RAS */
39629af357bcSHawking Zhang 			adev->nbio.ras = &nbio_v4_3_ras;
39639af357bcSHawking Zhang 		break;
39649af357bcSHawking Zhang 	case IP_VERSION(6, 3, 1):
3965ecd1191eSCandice Li 		if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
3966ecd1191eSCandice Li 			/* unlike other generation of nbio ras,
3967ecd1191eSCandice Li 			 * nbif v6_3_1 only support fatal error interrupt
3968ecd1191eSCandice Li 			 * to inform software that DF is freezed due to
3969ecd1191eSCandice Li 			 * system fatal error event. driver should not
3970ecd1191eSCandice Li 			 * enable nbio ras in such case. Instead,
3971ecd1191eSCandice Li 			 * check DF RAS
3972ecd1191eSCandice Li 			 */
3973ecd1191eSCandice Li 			adev->nbio.ras = &nbif_v6_3_1_ras;
3974ecd1191eSCandice Li 		break;
3975ecd1191eSCandice Li 	case IP_VERSION(7, 9, 0):
39767692e1eeSTao Zhou 	case IP_VERSION(7, 9, 1):
39779a826c4aSHawking Zhang 		if (!adev->gmc.is_app_apu)
39787692e1eeSTao Zhou 			adev->nbio.ras = &nbio_v7_9_ras;
39797692e1eeSTao Zhou 		break;
39807692e1eeSTao Zhou 	default:
39816e36f231SHawking Zhang 		/* nbio ras is not available */
39826e36f231SHawking Zhang 		break;
39836e36f231SHawking Zhang 	}
39846e36f231SHawking Zhang 
39856e36f231SHawking Zhang 	/* nbio ras block needs to be enabled ahead of other ras blocks
3986fdc94d3aSHawking Zhang 	 * to handle fatal error */
3987fdc94d3aSHawking Zhang 	r = amdgpu_nbio_ras_sw_init(adev);
3988fdc94d3aSHawking Zhang 	if (r)
3989fdc94d3aSHawking Zhang 		return r;
3990fdc94d3aSHawking Zhang 
3991fdc94d3aSHawking Zhang 	if (adev->nbio.ras &&
39922e54fe5dSyipechai 	    adev->nbio.ras->init_ras_controller_interrupt) {
39932e54fe5dSyipechai 		r = adev->nbio.ras->init_ras_controller_interrupt(adev);
39942e54fe5dSyipechai 		if (r)
39954e644fffSHawking Zhang 			goto release_con;
39965436ab94SStanley.Yang 	}
39974e644fffSHawking Zhang 
39984e644fffSHawking Zhang 	if (adev->nbio.ras &&
39992e54fe5dSyipechai 	    adev->nbio.ras->init_ras_err_event_athub_interrupt) {
40002e54fe5dSyipechai 		r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
40012e54fe5dSyipechai 		if (r)
40024e644fffSHawking Zhang 			goto release_con;
40035436ab94SStanley.Yang 	}
40044e644fffSHawking Zhang 
40054e644fffSHawking Zhang 	/* Packed socket_id to ras feature mask bits[31:29] */
400673cb81dcSHawking Zhang 	if (adev->smuio.funcs &&
400773cb81dcSHawking Zhang 	    adev->smuio.funcs->get_socket_id)
400873cb81dcSHawking Zhang 		con->features |= ((adev->smuio.funcs->get_socket_id(adev)) <<
4009ee9c3031SStanley.Yang 					AMDGPU_RAS_FEATURES_SOCKETID_SHIFT);
4010ee9c3031SStanley.Yang 
401173cb81dcSHawking Zhang 	/* Get RAS schema for particular SOC */
4012625e5f38SAsad Kamal 	con->schema = amdgpu_get_ras_schema(adev);
4013625e5f38SAsad Kamal 
4014625e5f38SAsad Kamal 	amdgpu_ras_init_reserved_vram_size(adev);
4015473af28dSHawking Zhang 
4016473af28dSHawking Zhang 	if (amdgpu_ras_fs_init(adev)) {
40175e91160aSGuchun Chen 		r = -EINVAL;
40185e91160aSGuchun Chen 		goto release_con;
40195436ab94SStanley.Yang 	}
40205e91160aSGuchun Chen 
4021c030f2e4Sxinhui pan 	if (amdgpu_ras_aca_is_supported(adev)) {
40229817f061SYang Wang 		if (amdgpu_aca_is_enabled(adev))
40239817f061SYang Wang 			r = amdgpu_aca_init(adev);
40249817f061SYang Wang 		else
40259817f061SYang Wang 			r = amdgpu_mca_init(adev);
40269817f061SYang Wang 		if (r)
40279817f061SYang Wang 			goto release_con;
40289817f061SYang Wang 	}
40299817f061SYang Wang 
40309817f061SYang Wang 	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
40316952e99cSGuchun Chen 		 "hardware ability[%x] ras_mask[%x]\n",
40325d0f903fSxinhui pan 		 adev->ras_hw_enabled, adev->ras_enabled);
40338ab0d6f0SLuben Tuikov 
4034e509965eSLuben Tuikov 	return 0;
4035c030f2e4Sxinhui pan release_con:
40365436ab94SStanley.Yang 	amdgpu_ras_set_context(adev, NULL);
4037c030f2e4Sxinhui pan 	kfree(con);
4038c030f2e4Sxinhui pan 
4039c030f2e4Sxinhui pan 	return r;
40405e91160aSGuchun Chen }
4041c030f2e4Sxinhui pan 
amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device * adev)4042c030f2e4Sxinhui pan int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
40438f6368a9SJohn Clements {
4044134d16d5SJohn Clements 	if (adev->gmc.xgmi.connected_to_cpu ||
40458107e499SHawking Zhang 	    adev->gmc.is_app_apu)
40468107e499SHawking Zhang 		return 1;
4047134d16d5SJohn Clements 	return 0;
4048134d16d5SJohn Clements }
4049134d16d5SJohn Clements 
amdgpu_persistent_edc_harvesting(struct amdgpu_device * adev,struct ras_common_if * ras_block)4050134d16d5SJohn Clements static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
4051134d16d5SJohn Clements 					struct ras_common_if *ras_block)
4052134d16d5SJohn Clements {
4053134d16d5SJohn Clements 	struct ras_query_if info = {
4054134d16d5SJohn Clements 		.head = *ras_block,
4055134d16d5SJohn Clements 	};
4056134d16d5SJohn Clements 
4057134d16d5SJohn Clements 	if (!amdgpu_persistent_edc_harvesting_supported(adev))
4058134d16d5SJohn Clements 		return 0;
4059134d16d5SJohn Clements 
4060134d16d5SJohn Clements 	if (amdgpu_ras_query_error_status(adev, &info) != 0)
4061134d16d5SJohn Clements 		DRM_WARN("RAS init harvest failure");
4062134d16d5SJohn Clements 
4063134d16d5SJohn Clements 	if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
4064134d16d5SJohn Clements 		DRM_WARN("RAS init harvest reset failure");
4065134d16d5SJohn Clements 
4066134d16d5SJohn Clements 	return 0;
4067134d16d5SJohn Clements }
4068134d16d5SJohn Clements 
amdgpu_ras_is_poison_mode_supported(struct amdgpu_device * adev)4069134d16d5SJohn Clements bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
4070e4348849STao Zhou {
4071e4348849STao Zhou        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
4072e4348849STao Zhou 
4073e4348849STao Zhou        if (!con)
4074e4348849STao Zhou                return false;
4075e4348849STao Zhou 
4076e4348849STao Zhou        return con->poison_supported;
4077e4348849STao Zhou }
4078e4348849STao Zhou 
4079e4348849STao Zhou /* helper function to handle common stuff in ip late init phase */
amdgpu_ras_block_late_init(struct amdgpu_device * adev,struct ras_common_if * ras_block)4080b293e891SHawking Zhang int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
4081563285c8Syipechai 			 struct ras_common_if *ras_block)
4082563285c8Syipechai {
4083b293e891SHawking Zhang 	struct amdgpu_ras_block_object *ras_obj = NULL;
408429c9b6cdSyipechai 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
408505adfd80SLuben Tuikov 	struct ras_query_if *query_info;
40864a1c9a44SHawking Zhang 	unsigned long ue_count, ce_count;
408705adfd80SLuben Tuikov 	int r;
4088b293e891SHawking Zhang 
4089b293e891SHawking Zhang 	/* disable RAS feature per IP block if it is not supported */
4090b293e891SHawking Zhang 	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
4091b293e891SHawking Zhang 		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
4092b293e891SHawking Zhang 		return 0;
4093b293e891SHawking Zhang 	}
4094b293e891SHawking Zhang 
4095b293e891SHawking Zhang 	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
4096b293e891SHawking Zhang 	if (r) {
4097b293e891SHawking Zhang 		if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) {
4098e283f4fbSLijo Lazar 			/* in resume phase, if fail to enable ras,
4099b293e891SHawking Zhang 			 * clean up all ras fs nodes, and disable ras */
4100b293e891SHawking Zhang 			goto cleanup;
4101b293e891SHawking Zhang 		} else
4102b293e891SHawking Zhang 			return r;
4103b293e891SHawking Zhang 	}
4104b293e891SHawking Zhang 
4105b293e891SHawking Zhang 	/* check for errors on warm reset edc persisant supported ASIC */
4106134d16d5SJohn Clements 	amdgpu_persistent_edc_harvesting(adev, ras_block);
4107134d16d5SJohn Clements 
4108134d16d5SJohn Clements 	/* in resume phase, no need to create ras fs node */
4109b293e891SHawking Zhang 	if (adev->in_suspend || amdgpu_reset_in_recovery(adev))
4110e283f4fbSLijo Lazar 		return 0;
4111b293e891SHawking Zhang 
4112b293e891SHawking Zhang 	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
4113563285c8Syipechai 	if (ras_obj->ras_cb || (ras_obj->hw_ops &&
411436780606STao Zhou 	    (ras_obj->hw_ops->query_poison_status ||
411536780606STao Zhou 	    ras_obj->hw_ops->handle_poison_consumption))) {
411636780606STao Zhou 		r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
41179252d33dSyipechai 		if (r)
4118b293e891SHawking Zhang 			goto cleanup;
4119779596ceSTom Rix 	}
4120b293e891SHawking Zhang 
4121b293e891SHawking Zhang 	if (ras_obj->hw_ops &&
4122f957138cSHawking Zhang 	    (ras_obj->hw_ops->query_ras_error_count ||
4123f957138cSHawking Zhang 	     ras_obj->hw_ops->query_ras_error_status)) {
4124f957138cSHawking Zhang 		r = amdgpu_ras_sysfs_create(adev, ras_block);
41259252d33dSyipechai 		if (r)
4126b293e891SHawking Zhang 			goto interrupt;
4127779596ceSTom Rix 
4128b293e891SHawking Zhang 		/* Those are the cached values at init.
412905adfd80SLuben Tuikov 		 */
413005adfd80SLuben Tuikov 		query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
4131f957138cSHawking Zhang 		if (!query_info)
41324a1c9a44SHawking Zhang 			return -ENOMEM;
41334a1c9a44SHawking Zhang 		memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
41344a1c9a44SHawking Zhang 
41354a1c9a44SHawking Zhang 		if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
41364a1c9a44SHawking Zhang 			atomic_set(&con->ras_ce_count, ce_count);
413705adfd80SLuben Tuikov 			atomic_set(&con->ras_ue_count, ue_count);
413805adfd80SLuben Tuikov 		}
41394d9f771eSLuben Tuikov 
414005adfd80SLuben Tuikov 		kfree(query_info);
41414a1c9a44SHawking Zhang 	}
4142f957138cSHawking Zhang 
4143f957138cSHawking Zhang 	return 0;
4144b293e891SHawking Zhang 
4145779596ceSTom Rix interrupt:
4146779596ceSTom Rix 	if (ras_obj->ras_cb)
4147563285c8Syipechai 		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
41489252d33dSyipechai cleanup:
4149779596ceSTom Rix 	amdgpu_ras_feature_enable(adev, ras_block, 0);
4150b293e891SHawking Zhang 	return r;
4151b293e891SHawking Zhang }
4152b293e891SHawking Zhang 
amdgpu_ras_block_late_init_default(struct amdgpu_device * adev,struct ras_common_if * ras_block)4153b293e891SHawking Zhang static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
4154d41ff22aSMaíra Canal 			 struct ras_common_if *ras_block)
4155418abce2Syipechai {
4156418abce2Syipechai 	return amdgpu_ras_block_late_init(adev, ras_block);
4157418abce2Syipechai }
4158418abce2Syipechai 
4159418abce2Syipechai /* helper function to remove ras fs node and interrupt handler */
amdgpu_ras_block_late_fini(struct amdgpu_device * adev,struct ras_common_if * ras_block)4160b293e891SHawking Zhang void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
4161bdb3489cSyipechai 			  struct ras_common_if *ras_block)
4162bdb3489cSyipechai {
4163bdb3489cSyipechai 	struct amdgpu_ras_block_object *ras_obj;
4164563285c8Syipechai 	if (!ras_block)
4165bdb3489cSyipechai 		return;
4166bdb3489cSyipechai 
4167bdb3489cSyipechai 	amdgpu_ras_sysfs_remove(adev, ras_block);
4168563285c8Syipechai 
4169bdb3489cSyipechai 	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
4170563285c8Syipechai 	if (ras_obj->ras_cb)
4171563285c8Syipechai 		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
4172563285c8Syipechai }
4173bdb3489cSyipechai 
amdgpu_ras_block_late_fini_default(struct amdgpu_device * adev,struct ras_common_if * ras_block)4174bdb3489cSyipechai static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
417580e0c2cbSyipechai 			  struct ras_common_if *ras_block)
417680e0c2cbSyipechai {
417780e0c2cbSyipechai 	return amdgpu_ras_block_late_fini(adev, ras_block);
417880e0c2cbSyipechai }
417980e0c2cbSyipechai 
418080e0c2cbSyipechai /* do some init work after IP late init as dependence.
4181a564808eSxinhui pan  * and it runs in resume/gpu reset/booting up cases.
4182511fdbc3Sxinhui pan  */
amdgpu_ras_resume(struct amdgpu_device * adev)4183a564808eSxinhui pan void amdgpu_ras_resume(struct amdgpu_device *adev)
4184511fdbc3Sxinhui pan {
4185108c6a63Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
4186108c6a63Sxinhui pan 	struct ras_manager *obj, *tmp;
4187108c6a63Sxinhui pan 
4188108c6a63Sxinhui pan 	if (!adev->ras_enabled || !con) {
41898ab0d6f0SLuben Tuikov 		/* clean ras context for VEGA20 Gaming after send ras disable cmd */
4190970fd197SStanley.Yang 		amdgpu_release_ras_context(adev);
4191970fd197SStanley.Yang 
4192970fd197SStanley.Yang 		return;
4193108c6a63Sxinhui pan 	}
4194970fd197SStanley.Yang 
4195108c6a63Sxinhui pan 	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
4196108c6a63Sxinhui pan 		/* Set up all other IPs which are not implemented. There is a
4197191051a1Sxinhui pan 		 * tricky thing that IP's actual ras error type should be
4198191051a1Sxinhui pan 		 * MULTI_UNCORRECTABLE, but as driver does not handle it, so
4199191051a1Sxinhui pan 		 * ERROR_NONE make sense anyway.
4200191051a1Sxinhui pan 		 */
4201191051a1Sxinhui pan 		amdgpu_ras_enable_all_features(adev, 1);
4202191051a1Sxinhui pan 
4203191051a1Sxinhui pan 		/* We enable ras on all hw_supported block, but as boot
4204191051a1Sxinhui pan 		 * parameter might disable some of them and one or more IP has
4205191051a1Sxinhui pan 		 * not implemented yet. So we disable them on behalf.
4206191051a1Sxinhui pan 		 */
4207191051a1Sxinhui pan 		list_for_each_entry_safe(obj, tmp, &con->head, node) {
4208108c6a63Sxinhui pan 			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
4209108c6a63Sxinhui pan 				amdgpu_ras_feature_enable(adev, &obj->head, 0);
4210108c6a63Sxinhui pan 				/* there should be no any reference. */
4211108c6a63Sxinhui pan 				WARN_ON(alive_obj(obj));
4212108c6a63Sxinhui pan 			}
4213108c6a63Sxinhui pan 		}
4214191051a1Sxinhui pan 	}
4215108c6a63Sxinhui pan }
4216108c6a63Sxinhui pan 
amdgpu_ras_suspend(struct amdgpu_device * adev)4217108c6a63Sxinhui pan void amdgpu_ras_suspend(struct amdgpu_device *adev)
4218511fdbc3Sxinhui pan {
4219511fdbc3Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
4220511fdbc3Sxinhui pan 
4221511fdbc3Sxinhui pan 	if (!adev->ras_enabled || !con)
42228ab0d6f0SLuben Tuikov 		return;
4223511fdbc3Sxinhui pan 
4224511fdbc3Sxinhui pan 	amdgpu_ras_disable_all_features(adev, 0);
4225511fdbc3Sxinhui pan 	/* Make sure all ras objects are disabled. */
4226511fdbc3Sxinhui pan 	if (AMDGPU_RAS_GET_FEATURES(con->features))
4227ee9c3031SStanley.Yang 		amdgpu_ras_disable_all_features(adev, 1);
4228511fdbc3Sxinhui pan }
4229511fdbc3Sxinhui pan 
amdgpu_ras_late_init(struct amdgpu_device * adev)4230511fdbc3Sxinhui pan int amdgpu_ras_late_init(struct amdgpu_device *adev)
4231867e24caSyipechai {
4232867e24caSyipechai 	struct amdgpu_ras_block_list *node, *tmp;
4233867e24caSyipechai 	struct amdgpu_ras_block_object *obj;
4234867e24caSyipechai 	int r;
4235867e24caSyipechai 
4236867e24caSyipechai 	amdgpu_ras_event_mgr_init(adev);
42379dc57c2aSYang Wang 
42389dc57c2aSYang Wang 	if (amdgpu_ras_aca_is_supported(adev)) {
42399817f061SYang Wang 		if (amdgpu_reset_in_recovery(adev)) {
4240e283f4fbSLijo Lazar 			if (amdgpu_aca_is_enabled(adev))
42419817f061SYang Wang 				r = amdgpu_aca_reset(adev);
42429817f061SYang Wang 			else
42439817f061SYang Wang 				r = amdgpu_mca_reset(adev);
42449817f061SYang Wang 			if (r)
4245c0c48f0dSYang Wang 				return r;
4246c0c48f0dSYang Wang 		}
4247062a7ce6SYang Wang 
4248c0c48f0dSYang Wang 		if (!amdgpu_sriov_vf(adev)) {
42499817f061SYang Wang 			if (amdgpu_aca_is_enabled(adev))
42509817f061SYang Wang 				amdgpu_ras_set_aca_debug_mode(adev, false);
4251c0c48f0dSYang Wang 			else
425276ad30f5SYang Wang 				amdgpu_ras_set_mca_debug_mode(adev, false);
42536697dbf0SHawking Zhang 		}
4254c0c48f0dSYang Wang 	}
42559817f061SYang Wang 
4256201761b5SLijo Lazar 	/* Guest side doesn't need init ras feature */
4257329cec8fSYang Wang 	if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev))
425884a2947eSVictor Skvortsov 		return 0;
4259329cec8fSYang Wang 
4260329cec8fSYang Wang 	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
4261867e24caSyipechai 		obj = node->ras_obj;
42622866a454SYang Wang 		if (!obj) {
42632866a454SYang Wang 			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
4264867e24caSyipechai 			continue;
4265867e24caSyipechai 		}
4266867e24caSyipechai 
4267418abce2Syipechai 		if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
42682866a454SYang Wang 			continue;
42692866a454SYang Wang 
42702866a454SYang Wang 		if (obj->ras_late_init) {
4271867e24caSyipechai 			r = obj->ras_late_init(adev, &obj->ras_comm);
4272867e24caSyipechai 			if (r) {
4273867e24caSyipechai 				dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
4274867e24caSyipechai 					obj->ras_comm.name, r);
4275867e24caSyipechai 				return r;
4276867e24caSyipechai 			}
4277867e24caSyipechai 		} else
4278418abce2Syipechai 			amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
4279418abce2Syipechai 	}
4280867e24caSyipechai 
4281867e24caSyipechai 	return 0;
4282867e24caSyipechai }
4283867e24caSyipechai 
4284867e24caSyipechai /* do some fini work before IP fini as dependence */
amdgpu_ras_pre_fini(struct amdgpu_device * adev)4285c030f2e4Sxinhui pan int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
4286c030f2e4Sxinhui pan {
4287c030f2e4Sxinhui pan 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
4288c030f2e4Sxinhui pan 
4289c030f2e4Sxinhui pan 	if (!adev->ras_enabled || !con)
42908ab0d6f0SLuben Tuikov 		return 0;
4291c030f2e4Sxinhui pan 
4292c030f2e4Sxinhui pan 
429372c8c97bSAndrey Grodzovsky 	/* Need disable ras on all IPs here before ip [hw/sw]fini */
4294c030f2e4Sxinhui pan 	if (AMDGPU_RAS_GET_FEATURES(con->features))
4295ee9c3031SStanley.Yang 		amdgpu_ras_disable_all_features(adev, 0);
4296c030f2e4Sxinhui pan 	amdgpu_ras_recovery_fini(adev);
4297c030f2e4Sxinhui pan 	return 0;
4298c030f2e4Sxinhui pan }
4299c030f2e4Sxinhui pan 
amdgpu_ras_fini(struct amdgpu_device * adev)4300c030f2e4Sxinhui pan int amdgpu_ras_fini(struct amdgpu_device *adev)
4301c030f2e4Sxinhui pan {
4302c030f2e4Sxinhui pan 	struct amdgpu_ras_block_list *ras_node, *tmp;
4303d5e8ff5fSyipechai 	struct amdgpu_ras_block_object *obj = NULL;
43041f211a82Syipechai 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
4305c030f2e4Sxinhui pan 
4306c030f2e4Sxinhui pan 	if (!adev->ras_enabled || !con)
43078ab0d6f0SLuben Tuikov 		return 0;
4308c030f2e4Sxinhui pan 
4309c030f2e4Sxinhui pan 	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
43101f211a82Syipechai 		if (ras_node->ras_obj) {
43111f211a82Syipechai 			obj = ras_node->ras_obj;
43121f211a82Syipechai 			if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
43131f211a82Syipechai 			    obj->ras_fini)
43141f211a82Syipechai 				obj->ras_fini(adev, &obj->ras_comm);
43151f211a82Syipechai 			else
431680e0c2cbSyipechai 				amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
431780e0c2cbSyipechai 		}
43181f211a82Syipechai 
43191f211a82Syipechai 		/* Clear ras blocks from ras_list and free ras block list node */
43201f211a82Syipechai 		list_del(&ras_node->node);
43211f211a82Syipechai 		kfree(ras_node);
43221f211a82Syipechai 	}
43231f211a82Syipechai 
43241f211a82Syipechai 	amdgpu_ras_fs_fini(adev);
4325c030f2e4Sxinhui pan 	amdgpu_ras_interrupt_remove_all(adev);
4326c030f2e4Sxinhui pan 
4327c030f2e4Sxinhui pan 	if (amdgpu_ras_aca_is_supported(adev)) {
43289817f061SYang Wang 		if (amdgpu_aca_is_enabled(adev))
4329c0c48f0dSYang Wang 			amdgpu_aca_fini(adev);
4330c0c48f0dSYang Wang 		else
433176ad30f5SYang Wang 			amdgpu_mca_fini(adev);
433276ad30f5SYang Wang 	}
43339817f061SYang Wang 
4334c0c48f0dSYang Wang 	WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared");
4335ee9c3031SStanley.Yang 
4336c030f2e4Sxinhui pan 	if (AMDGPU_RAS_GET_FEATURES(con->features))
4337ee9c3031SStanley.Yang 		amdgpu_ras_disable_all_features(adev, 0);
4338edfdde90STao Zhou 
4339c030f2e4Sxinhui pan 	cancel_delayed_work_sync(&con->ras_counte_delay_work);
434005adfd80SLuben Tuikov 
434105adfd80SLuben Tuikov 	amdgpu_ras_set_context(adev, NULL);
4342c030f2e4Sxinhui pan 	kfree(con);
4343c030f2e4Sxinhui pan 
4344c030f2e4Sxinhui pan 	return 0;
4345c030f2e4Sxinhui pan }
4346c030f2e4Sxinhui pan 
amdgpu_ras_get_fed_status(struct amdgpu_device * adev)43477c6e68c7SAndrey Grodzovsky bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)
43481b6ef74bSLijo Lazar {
43491b6ef74bSLijo Lazar 	struct amdgpu_ras *ras;
43501b6ef74bSLijo Lazar 
43511b6ef74bSLijo Lazar 	ras = amdgpu_ras_get_context(adev);
43521b6ef74bSLijo Lazar 	if (!ras)
43531b6ef74bSLijo Lazar 		return false;
43541b6ef74bSLijo Lazar 
43551b6ef74bSLijo Lazar 	return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
4356e1ee2111SLijo Lazar }
43571b6ef74bSLijo Lazar 
amdgpu_ras_set_fed(struct amdgpu_device * adev,bool status)43581b6ef74bSLijo Lazar void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
43591b6ef74bSLijo Lazar {
43601b6ef74bSLijo Lazar 	struct amdgpu_ras *ras;
43611b6ef74bSLijo Lazar 
43621b6ef74bSLijo Lazar 	ras = amdgpu_ras_get_context(adev);
43631b6ef74bSLijo Lazar 	if (ras) {
4364e1ee2111SLijo Lazar 		if (status)
4365e1ee2111SLijo Lazar 			set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
4366e1ee2111SLijo Lazar 		else
4367e1ee2111SLijo Lazar 			clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
4368e1ee2111SLijo Lazar 	}
4369e1ee2111SLijo Lazar }
4370e1ee2111SLijo Lazar 
amdgpu_ras_clear_err_state(struct amdgpu_device * adev)4371e1ee2111SLijo Lazar void amdgpu_ras_clear_err_state(struct amdgpu_device *adev)
4372e1ee2111SLijo Lazar {
4373e1ee2111SLijo Lazar 	struct amdgpu_ras *ras;
4374e1ee2111SLijo Lazar 
4375e1ee2111SLijo Lazar 	ras = amdgpu_ras_get_context(adev);
4376e1ee2111SLijo Lazar 	if (ras)
43771b6ef74bSLijo Lazar 		ras->ras_err_state = 0;
4378e1ee2111SLijo Lazar }
4379e1ee2111SLijo Lazar 
amdgpu_ras_set_err_poison(struct amdgpu_device * adev,enum amdgpu_ras_block block)4380e1ee2111SLijo Lazar void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
4381e1ee2111SLijo Lazar 			       enum amdgpu_ras_block block)
4382e1ee2111SLijo Lazar {
4383e1ee2111SLijo Lazar 	struct amdgpu_ras *ras;
4384e1ee2111SLijo Lazar 
4385e1ee2111SLijo Lazar 	ras = amdgpu_ras_get_context(adev);
4386e1ee2111SLijo Lazar 	if (ras)
4387e1ee2111SLijo Lazar 		set_bit(block, &ras->ras_err_state);
4388e1ee2111SLijo Lazar }
4389e1ee2111SLijo Lazar 
amdgpu_ras_is_err_state(struct amdgpu_device * adev,int block)4390e1ee2111SLijo Lazar bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block)
4391e1ee2111SLijo Lazar {
4392e1ee2111SLijo Lazar 	struct amdgpu_ras *ras;
4393e1ee2111SLijo Lazar 
4394e1ee2111SLijo Lazar 	ras = amdgpu_ras_get_context(adev);
4395e1ee2111SLijo Lazar 	if (ras) {
4396e1ee2111SLijo Lazar 		if (block == AMDGPU_RAS_BLOCK__ANY)
4397e1ee2111SLijo Lazar 			return (ras->ras_err_state != 0);
4398e1ee2111SLijo Lazar 		else
4399e1ee2111SLijo Lazar 			return test_bit(block, &ras->ras_err_state) ||
4400e1ee2111SLijo Lazar 			       test_bit(AMDGPU_RAS_BLOCK__LAST,
4401e1ee2111SLijo Lazar 					&ras->ras_err_state);
4402e1ee2111SLijo Lazar 	}
4403e1ee2111SLijo Lazar 
4404e1ee2111SLijo Lazar 	return false;
4405e1ee2111SLijo Lazar }
44061b6ef74bSLijo Lazar 
__get_ras_event_mgr(struct amdgpu_device * adev)44071b6ef74bSLijo Lazar static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev)
440875ac6a25SYang Wang {
44099dc57c2aSYang Wang 	struct amdgpu_ras *ras;
441075ac6a25SYang Wang 
441175ac6a25SYang Wang 	ras = amdgpu_ras_get_context(adev);
441275ac6a25SYang Wang 	if (!ras)
441375ac6a25SYang Wang 		return NULL;
441475ac6a25SYang Wang 
441575ac6a25SYang Wang 	return ras->event_mgr;
441675ac6a25SYang Wang }
441775ac6a25SYang Wang 
amdgpu_ras_mark_ras_event_caller(struct amdgpu_device * adev,enum ras_event_type type,const void * caller)441875ac6a25SYang Wang int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
441975ac6a25SYang Wang 				     const void *caller)
442075ac6a25SYang Wang {
442175ac6a25SYang Wang 	struct ras_event_manager *event_mgr;
442275ac6a25SYang Wang 	struct ras_event_state *event_state;
442359f488beSYang Wang 	int ret = 0;
442475ac6a25SYang Wang 
442575ac6a25SYang Wang 	if (type >= RAS_EVENT_TYPE_COUNT) {
442675ac6a25SYang Wang 		ret = -EINVAL;
442775ac6a25SYang Wang 		goto out;
442875ac6a25SYang Wang 	}
442975ac6a25SYang Wang 
443075ac6a25SYang Wang 	event_mgr = __get_ras_event_mgr(adev);
443175ac6a25SYang Wang 	if (!event_mgr) {
443275ac6a25SYang Wang 		ret = -EINVAL;
443375ac6a25SYang Wang 		goto out;
443475ac6a25SYang Wang 	}
443575ac6a25SYang Wang 
443675ac6a25SYang Wang 	event_state = &event_mgr->event_state[type];
443759f488beSYang Wang 	event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno);
443859f488beSYang Wang 	atomic64_inc(&event_state->count);
443959f488beSYang Wang 
444075ac6a25SYang Wang out:
444175ac6a25SYang Wang 	if (ret && caller)
444275ac6a25SYang Wang 		dev_warn(adev->dev, "failed mark ras event (%d) in %ps, ret:%d\n",
444375ac6a25SYang Wang 			 (int)type, caller, ret);
444475ac6a25SYang Wang 
444575ac6a25SYang Wang 	return ret;
444675ac6a25SYang Wang }
44479dc57c2aSYang Wang 
amdgpu_ras_acquire_event_id(struct amdgpu_device * adev,enum ras_event_type type)44489dc57c2aSYang Wang u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type)
44499dc57c2aSYang Wang {
44509dc57c2aSYang Wang 	struct ras_event_manager *event_mgr;
445175ac6a25SYang Wang 	u64 id;
44529dc57c2aSYang Wang 
44539dc57c2aSYang Wang 	if (type >= RAS_EVENT_TYPE_COUNT)
445475ac6a25SYang Wang 		return RAS_EVENT_INVALID_ID;
445575ac6a25SYang Wang 
445675ac6a25SYang Wang 	switch (type) {
44579dc57c2aSYang Wang 	case RAS_EVENT_TYPE_FATAL:
445875ac6a25SYang Wang 	case RAS_EVENT_TYPE_POISON_CREATION:
44595b9de259SYang Wang 	case RAS_EVENT_TYPE_POISON_CONSUMPTION:
446012b435a4SYang Wang 		event_mgr = __get_ras_event_mgr(adev);
446175ac6a25SYang Wang 		if (!event_mgr)
446275ac6a25SYang Wang 			return RAS_EVENT_INVALID_ID;
446375ac6a25SYang Wang 
446475ac6a25SYang Wang 		id = event_mgr->event_state[type].last_seqno;
446559f488beSYang Wang 		break;
44669dc57c2aSYang Wang 	case RAS_EVENT_TYPE_INVALID:
44679dc57c2aSYang Wang 	default:
44689dc57c2aSYang Wang 		id = RAS_EVENT_INVALID_ID;
446975ac6a25SYang Wang 		break;
44709dc57c2aSYang Wang 	}
44719dc57c2aSYang Wang 
44729dc57c2aSYang Wang 	return id;
44739dc57c2aSYang Wang }
44749dc57c2aSYang Wang 
amdgpu_ras_global_ras_isr(struct amdgpu_device * adev)44759dc57c2aSYang Wang void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
44767c6e68c7SAndrey Grodzovsky {
44777c6e68c7SAndrey Grodzovsky 	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
44787c6e68c7SAndrey Grodzovsky 		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
44792c7cd280SYiPeng Chai 		enum ras_event_type type = RAS_EVENT_TYPE_FATAL;
448075ac6a25SYang Wang 		u64 event_id;
448175ac6a25SYang Wang 
448275ac6a25SYang Wang 		if (amdgpu_ras_mark_ras_event(adev, type))
448375ac6a25SYang Wang 			return;
448475ac6a25SYang Wang 
448575ac6a25SYang Wang 		event_id = amdgpu_ras_acquire_event_id(adev, type);
448675ac6a25SYang Wang 
44872c7cd280SYiPeng Chai 		RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
44889dc57c2aSYang Wang 			      "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
44896952e99cSGuchun Chen 
4490d5ea093eSAndrey Grodzovsky 		amdgpu_ras_set_fed(adev, true);
4491b41f742dSLijo Lazar 		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
44922c7cd280SYiPeng Chai 		amdgpu_ras_reset_gpu(adev);
449361934624SGuchun Chen 	}
44947c6e68c7SAndrey Grodzovsky }
44957c6e68c7SAndrey Grodzovsky 
amdgpu_ras_need_emergency_restart(struct amdgpu_device * adev)4496bb5c7235SWenhui Sheng bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
4497bb5c7235SWenhui Sheng {
4498bb5c7235SWenhui Sheng 	if (adev->asic_type == CHIP_VEGA20 &&
4499bb5c7235SWenhui Sheng 	    adev->pm.fw_version <= 0x283400) {
4500bb5c7235SWenhui Sheng 		return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
4501bb5c7235SWenhui Sheng 				amdgpu_ras_intr_triggered();
4502bb5c7235SWenhui Sheng 	}
4503bb5c7235SWenhui Sheng 
4504bb5c7235SWenhui Sheng 	return false;
4505bb5c7235SWenhui Sheng }
4506bb5c7235SWenhui Sheng 
amdgpu_release_ras_context(struct amdgpu_device * adev)4507970fd197SStanley.Yang void amdgpu_release_ras_context(struct amdgpu_device *adev)
4508970fd197SStanley.Yang {
4509970fd197SStanley.Yang 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
4510970fd197SStanley.Yang 
4511970fd197SStanley.Yang 	if (!con)
4512970fd197SStanley.Yang 		return;
4513970fd197SStanley.Yang 
4514970fd197SStanley.Yang 	if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
45158ab0d6f0SLuben Tuikov 		con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
4516970fd197SStanley.Yang 		amdgpu_ras_set_context(adev, NULL);
4517970fd197SStanley.Yang 		kfree(con);
4518970fd197SStanley.Yang 	}
4519970fd197SStanley.Yang }
4520970fd197SStanley.Yang 
452112b2cab7SMukul Joshi #ifdef CONFIG_X86_MCE_AMD
find_adev(uint32_t node_id)452212b2cab7SMukul Joshi static struct amdgpu_device *find_adev(uint32_t node_id)
452312b2cab7SMukul Joshi {
452412b2cab7SMukul Joshi 	int i;
452512b2cab7SMukul Joshi 	struct amdgpu_device *adev = NULL;
452612b2cab7SMukul Joshi 
452712b2cab7SMukul Joshi 	for (i = 0; i < mce_adev_list.num_gpu; i++) {
452891a1a52dSMukul Joshi 		adev = mce_adev_list.devs[i];
452991a1a52dSMukul Joshi 
453012b2cab7SMukul Joshi 		if (adev && adev->gmc.xgmi.connected_to_cpu &&
453191a1a52dSMukul Joshi 		    adev->gmc.xgmi.physical_node_id == node_id)
453212b2cab7SMukul Joshi 			break;
453312b2cab7SMukul Joshi 		adev = NULL;
453412b2cab7SMukul Joshi 	}
453512b2cab7SMukul Joshi 
453612b2cab7SMukul Joshi 	return adev;
453712b2cab7SMukul Joshi }
453812b2cab7SMukul Joshi 
453912b2cab7SMukul Joshi #define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
454012b2cab7SMukul Joshi #define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
454112b2cab7SMukul Joshi #define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
454212b2cab7SMukul Joshi #define GPU_ID_OFFSET		8
454312b2cab7SMukul Joshi 
amdgpu_bad_page_notifier(struct notifier_block * nb,unsigned long val,void * data)454412b2cab7SMukul Joshi static int amdgpu_bad_page_notifier(struct notifier_block *nb,
454512b2cab7SMukul Joshi 				    unsigned long val, void *data)
454612b2cab7SMukul Joshi {
454712b2cab7SMukul Joshi 	struct mce *m = (struct mce *)data;
454812b2cab7SMukul Joshi 	struct amdgpu_device *adev = NULL;
454912b2cab7SMukul Joshi 	uint32_t gpu_id = 0;
455012b2cab7SMukul Joshi 	uint32_t umc_inst = 0, ch_inst = 0;
4551cd4c99f1STao Zhou 
455212b2cab7SMukul Joshi 	/*
455312b2cab7SMukul Joshi 	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
455412b2cab7SMukul Joshi 	 * and error occurred in DramECC (Extended error code = 0) then only
455512b2cab7SMukul Joshi 	 * process the error, else bail out.
455612b2cab7SMukul Joshi 	 */
455712b2cab7SMukul Joshi 	if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
455891f75eb4SYazen Ghannam 		    (XEC(m->status, 0x3f) == 0x0)))
455912b2cab7SMukul Joshi 		return NOTIFY_DONE;
456012b2cab7SMukul Joshi 
456112b2cab7SMukul Joshi 	/*
456212b2cab7SMukul Joshi 	 * If it is correctable error, return.
456312b2cab7SMukul Joshi 	 */
456412b2cab7SMukul Joshi 	if (mce_is_correctable(m))
456512b2cab7SMukul Joshi 		return NOTIFY_OK;
456612b2cab7SMukul Joshi 
456712b2cab7SMukul Joshi 	/*
456812b2cab7SMukul Joshi 	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
456912b2cab7SMukul Joshi 	 */
457012b2cab7SMukul Joshi 	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
457112b2cab7SMukul Joshi 
457212b2cab7SMukul Joshi 	adev = find_adev(gpu_id);
457312b2cab7SMukul Joshi 	if (!adev) {
457412b2cab7SMukul Joshi 		DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
457512b2cab7SMukul Joshi 								gpu_id);
457612b2cab7SMukul Joshi 		return NOTIFY_DONE;
457712b2cab7SMukul Joshi 	}
457812b2cab7SMukul Joshi 
457912b2cab7SMukul Joshi 	/*
458012b2cab7SMukul Joshi 	 * If it is uncorrectable error, then find out UMC instance and
458112b2cab7SMukul Joshi 	 * channel index.
458212b2cab7SMukul Joshi 	 */
458312b2cab7SMukul Joshi 	umc_inst = GET_UMC_INST(m->ipid);
458412b2cab7SMukul Joshi 	ch_inst = GET_CHAN_INDEX(m->ipid);
458512b2cab7SMukul Joshi 
458612b2cab7SMukul Joshi 	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
458712b2cab7SMukul Joshi 			     umc_inst, ch_inst);
458812b2cab7SMukul Joshi 
458912b2cab7SMukul Joshi 	if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst))
459024b82292STao Zhou 		return NOTIFY_OK;
459112b2cab7SMukul Joshi 	else
459224b82292STao Zhou 		return NOTIFY_DONE;
459324b82292STao Zhou }
459412b2cab7SMukul Joshi 
459512b2cab7SMukul Joshi static struct notifier_block amdgpu_bad_page_nb = {
459612b2cab7SMukul Joshi 	.notifier_call  = amdgpu_bad_page_notifier,
459712b2cab7SMukul Joshi 	.priority       = MCE_PRIO_UC,
459812b2cab7SMukul Joshi };
459912b2cab7SMukul Joshi 
amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device * adev)460012b2cab7SMukul Joshi static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
460191a1a52dSMukul Joshi {
460212b2cab7SMukul Joshi 	/*
460312b2cab7SMukul Joshi 	 * Add the adev to the mce_adev_list.
460491a1a52dSMukul Joshi 	 * During mode2 reset, amdgpu device is temporarily
460591a1a52dSMukul Joshi 	 * removed from the mgpu_info list which can cause
460691a1a52dSMukul Joshi 	 * page retirement to fail.
460791a1a52dSMukul Joshi 	 * Use this list instead of mgpu_info to find the amdgpu
460891a1a52dSMukul Joshi 	 * device on which the UMC error was reported.
460991a1a52dSMukul Joshi 	 */
461091a1a52dSMukul Joshi 	mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
461191a1a52dSMukul Joshi 
461291a1a52dSMukul Joshi 	/*
461391a1a52dSMukul Joshi 	 * Register the x86 notifier only once
461412b2cab7SMukul Joshi 	 * with MCE subsystem.
461512b2cab7SMukul Joshi 	 */
461612b2cab7SMukul Joshi 	if (notifier_registered == false) {
461712b2cab7SMukul Joshi 		mce_register_decode_chain(&amdgpu_bad_page_nb);
461812b2cab7SMukul Joshi 		notifier_registered = true;
461912b2cab7SMukul Joshi 	}
462012b2cab7SMukul Joshi }
462112b2cab7SMukul Joshi #endif
462212b2cab7SMukul Joshi 
amdgpu_ras_get_context(struct amdgpu_device * adev)46237cab2124Syipechai struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
46247cab2124Syipechai {
46257cab2124Syipechai 	if (!adev)
46267cab2124Syipechai 		return NULL;
46277cab2124Syipechai 
46287cab2124Syipechai 	return adev->psp.ras_context.ras;
46297cab2124Syipechai }
46307cab2124Syipechai 
amdgpu_ras_set_context(struct amdgpu_device * adev,struct amdgpu_ras * ras_con)46317cab2124Syipechai int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
46327cab2124Syipechai {
46337cab2124Syipechai 	if (!adev)
46347cab2124Syipechai 		return -EINVAL;
463569f91d32SYang Li 
46367cab2124Syipechai 	adev->psp.ras_context.ras = ras_con;
46377cab2124Syipechai 	return 0;
46387cab2124Syipechai }
46397cab2124Syipechai 
46407cab2124Syipechai /* check if ras is supported on block, say, sdma, gfx */
amdgpu_ras_is_supported(struct amdgpu_device * adev,unsigned int block)46417cab2124Syipechai int amdgpu_ras_is_supported(struct amdgpu_device *adev,
46427cab2124Syipechai 		unsigned int block)
46437cab2124Syipechai {
46447cab2124Syipechai 	int ret = 0;
46458f453c51SYiPeng Chai 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
46467cab2124Syipechai 
46477cab2124Syipechai 	if (block >= AMDGPU_RAS_BLOCK_COUNT)
46487cab2124Syipechai 		return 0;
46497cab2124Syipechai 
46508f453c51SYiPeng Chai 	ret = ras && (adev->ras_enabled & (1 << block));
46518f453c51SYiPeng Chai 
46528f453c51SYiPeng Chai 	/* For the special asic with mem ecc enabled but sram ecc
46538f453c51SYiPeng Chai 	 * not enabled, even if the ras block is not supported on
46548f453c51SYiPeng Chai 	 * .ras_enabled, if the asic supports poison mode and the
46558f453c51SYiPeng Chai 	 * ras block has ras configuration, it can be considered
46568f453c51SYiPeng Chai 	 * that the ras block supports ras function.
46578f453c51SYiPeng Chai 	 */
46588f453c51SYiPeng Chai 	if (!ret &&
46598f453c51SYiPeng Chai 	    (block == AMDGPU_RAS_BLOCK__GFX ||
4660bc0f8080SCandice Li 	     block == AMDGPU_RAS_BLOCK__SDMA ||
4661bc0f8080SCandice Li 	     block == AMDGPU_RAS_BLOCK__VCN ||
4662bc0f8080SCandice Li 	     block == AMDGPU_RAS_BLOCK__JPEG) &&
4663bc0f8080SCandice Li 		(amdgpu_ras_mask & (1 << block)) &&
46647ec11c2fSStanley.Yang 	    amdgpu_ras_is_poison_mode_supported(adev) &&
46658f453c51SYiPeng Chai 	    amdgpu_ras_get_ras_block(adev, block, 0))
46668f453c51SYiPeng Chai 		ret = 1;
46678f453c51SYiPeng Chai 
46688f453c51SYiPeng Chai 	return ret;
46698f453c51SYiPeng Chai }
46707cab2124Syipechai 
amdgpu_ras_reset_gpu(struct amdgpu_device * adev)46717cab2124Syipechai int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
46727cab2124Syipechai {
46737cab2124Syipechai 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
46747cab2124Syipechai 
46757cab2124Syipechai 	/* mode1 is the only selection for RMA status */
46765f7697bbSTao Zhou 	if (amdgpu_ras_is_rma(adev)) {
4677792be2e2STao Zhou 		ras->gpu_reset_flags = 0;
46785f7697bbSTao Zhou 		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
46795f7697bbSTao Zhou 	}
46805f7697bbSTao Zhou 
46815f7697bbSTao Zhou 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) {
46829e0feb79SYiPeng Chai 		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
46839e0feb79SYiPeng Chai 		int hive_ras_recovery = 0;
46849e0feb79SYiPeng Chai 
46859e0feb79SYiPeng Chai 		if (hive) {
46869e0feb79SYiPeng Chai 			hive_ras_recovery = atomic_read(&hive->ras_recovery);
46879e0feb79SYiPeng Chai 			amdgpu_put_xgmi_hive(hive);
46889e0feb79SYiPeng Chai 		}
46899e0feb79SYiPeng Chai 		/* In the case of multiple GPUs, after a GPU has started
46909e0feb79SYiPeng Chai 		 * resetting all GPUs on hive, other GPUs do not need to
46919e0feb79SYiPeng Chai 		 * trigger GPU reset again.
46929e0feb79SYiPeng Chai 		 */
46939e0feb79SYiPeng Chai 		if (!hive_ras_recovery)
46949e0feb79SYiPeng Chai 			amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
469525a2b22eSAndrey Grodzovsky 		else
46969e0feb79SYiPeng Chai 			atomic_set(&ras->in_recovery, 0);
46979e0feb79SYiPeng Chai 	} else {
46989e0feb79SYiPeng Chai 		flush_work(&ras->recovery_work);
46999e0feb79SYiPeng Chai 		amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
47009e0feb79SYiPeng Chai 	}
47019e0feb79SYiPeng Chai 
47029e0feb79SYiPeng Chai 	return 0;
47037cab2124Syipechai }
47047cab2124Syipechai 
amdgpu_ras_set_mca_debug_mode(struct amdgpu_device * adev,bool enable)47057cab2124Syipechai int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
4706201761b5SLijo Lazar {
47078096df76STao Zhou 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
47088096df76STao Zhou 	int ret = 0;
4709201761b5SLijo Lazar 
47108096df76STao Zhou 	if (con) {
4711201761b5SLijo Lazar 		ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
4712201761b5SLijo Lazar 		if (!ret)
4713201761b5SLijo Lazar 			con->is_aca_debug_mode = enable;
471404c4fcd2SYang Wang 	}
47158096df76STao Zhou 
47168096df76STao Zhou 	return ret;
4717201761b5SLijo Lazar }
4718201761b5SLijo Lazar 
amdgpu_ras_set_aca_debug_mode(struct amdgpu_device * adev,bool enable)4719201761b5SLijo Lazar int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable)
472033dcda51SYang Wang {
472133dcda51SYang Wang 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
472233dcda51SYang Wang 	int ret = 0;
472333dcda51SYang Wang 
472433dcda51SYang Wang 	if (con) {
472533dcda51SYang Wang 		if (amdgpu_aca_is_enabled(adev))
472604c4fcd2SYang Wang 			ret = amdgpu_aca_smu_set_debug_mode(adev, enable);
472733dcda51SYang Wang 		else
472804c4fcd2SYang Wang 			ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
472904c4fcd2SYang Wang 		if (!ret)
473033dcda51SYang Wang 			con->is_aca_debug_mode = enable;
473104c4fcd2SYang Wang 	}
473233dcda51SYang Wang 
473333dcda51SYang Wang 	return ret;
473433dcda51SYang Wang }
473533dcda51SYang Wang 
amdgpu_ras_get_aca_debug_mode(struct amdgpu_device * adev)473633dcda51SYang Wang bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev)
473704c4fcd2SYang Wang {
47388096df76STao Zhou 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
47398096df76STao Zhou 	const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
474004c4fcd2SYang Wang 	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
47418096df76STao Zhou 
47428096df76STao Zhou 	if (!con)
47438096df76STao Zhou 		return false;
47448096df76STao Zhou 
47458096df76STao Zhou 	if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) ||
474604c4fcd2SYang Wang 	    (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode))
474704c4fcd2SYang Wang 		return con->is_aca_debug_mode;
474804c4fcd2SYang Wang 	else
47498096df76STao Zhou 		return true;
47508096df76STao Zhou }
47518096df76STao Zhou 
amdgpu_ras_get_error_query_mode(struct amdgpu_device * adev,unsigned int * error_query_mode)47527cab2124Syipechai bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
47538cc0f566SHawking Zhang 				     unsigned int *error_query_mode)
47548cc0f566SHawking Zhang {
47558cc0f566SHawking Zhang 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
47568cc0f566SHawking Zhang 	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
47578cc0f566SHawking Zhang 	const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
475804c4fcd2SYang Wang 
47598cc0f566SHawking Zhang 	if (!con) {
47608cc0f566SHawking Zhang 		*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
47618cc0f566SHawking Zhang 		return false;
47628cc0f566SHawking Zhang 	}
47638cc0f566SHawking Zhang 
47648cc0f566SHawking Zhang 	if (amdgpu_sriov_vf(adev)) {
476584a2947eSVictor Skvortsov 		*error_query_mode = AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY;
476684a2947eSVictor Skvortsov 	} else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) {
476784a2947eSVictor Skvortsov 		*error_query_mode =
47688cc0f566SHawking Zhang 			(con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
476904c4fcd2SYang Wang 	} else {
477084a2947eSVictor Skvortsov 		*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
47718cc0f566SHawking Zhang 	}
477284a2947eSVictor Skvortsov 
47738cc0f566SHawking Zhang 	return true;
47748cc0f566SHawking Zhang }
47758cc0f566SHawking Zhang 
47768cc0f566SHawking Zhang /* Register each ip ras block into amdgpu ras */
amdgpu_ras_register_ras_block(struct amdgpu_device * adev,struct amdgpu_ras_block_object * ras_block_obj)47776492e1b0Syipechai int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
47786492e1b0Syipechai 		struct amdgpu_ras_block_object *ras_block_obj)
47796492e1b0Syipechai {
47806492e1b0Syipechai 	struct amdgpu_ras_block_list *ras_node;
4781d5e8ff5fSyipechai 	if (!adev || !ras_block_obj)
47826492e1b0Syipechai 		return -EINVAL;
47836492e1b0Syipechai 
47846492e1b0Syipechai 	ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
4785d5e8ff5fSyipechai 	if (!ras_node)
4786d5e8ff5fSyipechai 		return -ENOMEM;
4787d5e8ff5fSyipechai 
4788d5e8ff5fSyipechai 	INIT_LIST_HEAD(&ras_node->node);
4789d5e8ff5fSyipechai 	ras_node->ras_obj = ras_block_obj;
4790d5e8ff5fSyipechai 	list_add_tail(&ras_node->node, &adev->ras_list);
4791d5e8ff5fSyipechai 
47926492e1b0Syipechai 	return 0;
47936492e1b0Syipechai }
47946492e1b0Syipechai 
amdgpu_ras_get_error_type_name(uint32_t err_type,char * err_type_name)4795322a7e00SHawking Zhang void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name)
4796322a7e00SHawking Zhang {
4797322a7e00SHawking Zhang 	if (!err_type_name)
4798322a7e00SHawking Zhang 		return;
4799322a7e00SHawking Zhang 
4800322a7e00SHawking Zhang 	switch (err_type) {
4801322a7e00SHawking Zhang 	case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE:
4802322a7e00SHawking Zhang 		sprintf(err_type_name, "correctable");
4803322a7e00SHawking Zhang 		break;
4804322a7e00SHawking Zhang 	case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE:
4805322a7e00SHawking Zhang 		sprintf(err_type_name, "uncorrectable");
4806322a7e00SHawking Zhang 		break;
4807322a7e00SHawking Zhang 	default:
4808322a7e00SHawking Zhang 		sprintf(err_type_name, "unknown");
4809322a7e00SHawking Zhang 		break;
4810322a7e00SHawking Zhang 	}
4811322a7e00SHawking Zhang }
4812322a7e00SHawking Zhang 
amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device * adev,const struct amdgpu_ras_err_status_reg_entry * reg_entry,uint32_t instance,uint32_t * memory_id)4813322a7e00SHawking Zhang bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev,
4814322a7e00SHawking Zhang 					 const struct amdgpu_ras_err_status_reg_entry *reg_entry,
4815322a7e00SHawking Zhang 					 uint32_t instance,
4816322a7e00SHawking Zhang 					 uint32_t *memory_id)
4817322a7e00SHawking Zhang {
4818322a7e00SHawking Zhang 	uint32_t err_status_lo_data, err_status_lo_offset;
4819322a7e00SHawking Zhang 
4820322a7e00SHawking Zhang 	if (!reg_entry)
4821322a7e00SHawking Zhang 		return false;
4822322a7e00SHawking Zhang 
4823322a7e00SHawking Zhang 	err_status_lo_offset =
4824322a7e00SHawking Zhang 		AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
4825322a7e00SHawking Zhang 					    reg_entry->seg_lo, reg_entry->reg_lo);
4826322a7e00SHawking Zhang 	err_status_lo_data = RREG32(err_status_lo_offset);
4827322a7e00SHawking Zhang 
4828322a7e00SHawking Zhang 	if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) &&
4829322a7e00SHawking Zhang 	    !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG))
4830322a7e00SHawking Zhang 		return false;
4831322a7e00SHawking Zhang 
4832322a7e00SHawking Zhang 	*memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID);
4833322a7e00SHawking Zhang 
4834322a7e00SHawking Zhang 	return true;
4835322a7e00SHawking Zhang }
4836322a7e00SHawking Zhang 
amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device * adev,const struct amdgpu_ras_err_status_reg_entry * reg_entry,uint32_t instance,unsigned long * err_cnt)4837322a7e00SHawking Zhang bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
4838322a7e00SHawking Zhang 				       const struct amdgpu_ras_err_status_reg_entry *reg_entry,
4839322a7e00SHawking Zhang 				       uint32_t instance,
4840322a7e00SHawking Zhang 				       unsigned long *err_cnt)
4841322a7e00SHawking Zhang {
4842322a7e00SHawking Zhang 	uint32_t err_status_hi_data, err_status_hi_offset;
4843322a7e00SHawking Zhang 
4844322a7e00SHawking Zhang 	if (!reg_entry)
4845322a7e00SHawking Zhang 		return false;
4846322a7e00SHawking Zhang 
4847322a7e00SHawking Zhang 	err_status_hi_offset =
4848322a7e00SHawking Zhang 		AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
4849322a7e00SHawking Zhang 					    reg_entry->seg_hi, reg_entry->reg_hi);
4850322a7e00SHawking Zhang 	err_status_hi_data = RREG32(err_status_hi_offset);
4851322a7e00SHawking Zhang 
4852322a7e00SHawking Zhang 	if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) &&
4853322a7e00SHawking Zhang 	    !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG))
4854322a7e00SHawking Zhang 		/* keep the check here in case we need to refer to the result later */
48559b337b7dSHawking Zhang 		dev_dbg(adev->dev, "Invalid err_info field\n");
48569b337b7dSHawking Zhang 
4857322a7e00SHawking Zhang 	/* read err count */
4858322a7e00SHawking Zhang 	*err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT);
4859322a7e00SHawking Zhang 
4860322a7e00SHawking Zhang 	return true;
4861322a7e00SHawking Zhang }
4862322a7e00SHawking Zhang 
amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device * adev,const struct amdgpu_ras_err_status_reg_entry * reg_list,uint32_t reg_list_size,const struct amdgpu_ras_memory_id_entry * mem_list,uint32_t mem_list_size,uint32_t instance,uint32_t err_type,unsigned long * err_count)4863322a7e00SHawking Zhang void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
4864322a7e00SHawking Zhang 					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
4865322a7e00SHawking Zhang 					   uint32_t reg_list_size,
4866322a7e00SHawking Zhang 					   const struct amdgpu_ras_memory_id_entry *mem_list,
4867322a7e00SHawking Zhang 					   uint32_t mem_list_size,
4868322a7e00SHawking Zhang 					   uint32_t instance,
4869322a7e00SHawking Zhang 					   uint32_t err_type,
4870322a7e00SHawking Zhang 					   unsigned long *err_count)
4871322a7e00SHawking Zhang {
4872322a7e00SHawking Zhang 	uint32_t memory_id;
4873322a7e00SHawking Zhang 	unsigned long err_cnt;
4874322a7e00SHawking Zhang 	char err_type_name[16];
4875322a7e00SHawking Zhang 	uint32_t i, j;
4876322a7e00SHawking Zhang 
4877322a7e00SHawking Zhang 	for (i = 0; i < reg_list_size; i++) {
4878322a7e00SHawking Zhang 		/* query memory_id from err_status_lo */
48799b337b7dSHawking Zhang 		if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
48809b337b7dSHawking Zhang 							 instance, &memory_id))
48819b337b7dSHawking Zhang 			continue;
48829b337b7dSHawking Zhang 
48839b337b7dSHawking Zhang 		/* query err_cnt from err_status_hi */
4884322a7e00SHawking Zhang 		if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i],
4885322a7e00SHawking Zhang 						       instance, &err_cnt) ||
4886322a7e00SHawking Zhang 		    !err_cnt)
4887322a7e00SHawking Zhang 			continue;
4888322a7e00SHawking Zhang 
4889322a7e00SHawking Zhang 		*err_count += err_cnt;
4890322a7e00SHawking Zhang 
4891322a7e00SHawking Zhang 		/* log the errors */
4892322a7e00SHawking Zhang 		amdgpu_ras_get_error_type_name(err_type, err_type_name);
4893322a7e00SHawking Zhang 		if (!mem_list) {
4894322a7e00SHawking Zhang 			/* memory_list is not supported */
4895322a7e00SHawking Zhang 			dev_info(adev->dev,
4896322a7e00SHawking Zhang 				 "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n",
4897322a7e00SHawking Zhang 				 err_cnt, err_type_name,
4898322a7e00SHawking Zhang 				 reg_list[i].block_name,
4899322a7e00SHawking Zhang 				 instance, memory_id);
4900322a7e00SHawking Zhang 		} else {
4901322a7e00SHawking Zhang 			for (j = 0; j < mem_list_size; j++) {
4902322a7e00SHawking Zhang 				if (memory_id == mem_list[j].memory_id) {
4903322a7e00SHawking Zhang 					dev_info(adev->dev,
4904322a7e00SHawking Zhang 						 "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n",
4905322a7e00SHawking Zhang 						 err_cnt, err_type_name,
4906322a7e00SHawking Zhang 						 reg_list[i].block_name,
4907322a7e00SHawking Zhang 						 instance, mem_list[j].name);
4908322a7e00SHawking Zhang 					break;
4909322a7e00SHawking Zhang 				}
4910322a7e00SHawking Zhang 			}
4911322a7e00SHawking Zhang 		}
4912322a7e00SHawking Zhang 	}
4913322a7e00SHawking Zhang }
4914322a7e00SHawking Zhang 
amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device * adev,const struct amdgpu_ras_err_status_reg_entry * reg_list,uint32_t reg_list_size,uint32_t instance)4915e53a3250SHawking Zhang void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
4916e53a3250SHawking Zhang 					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
4917e53a3250SHawking Zhang 					   uint32_t reg_list_size,
4918e53a3250SHawking Zhang 					   uint32_t instance)
4919e53a3250SHawking Zhang {
4920e53a3250SHawking Zhang 	uint32_t err_status_lo_offset, err_status_hi_offset;
4921e53a3250SHawking Zhang 	uint32_t i;
4922e53a3250SHawking Zhang 
4923e53a3250SHawking Zhang 	for (i = 0; i < reg_list_size; i++) {
4924e53a3250SHawking Zhang 		err_status_lo_offset =
4925e53a3250SHawking Zhang 			AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance,
4926e53a3250SHawking Zhang 						    reg_list[i].seg_lo, reg_list[i].reg_lo);
4927e53a3250SHawking Zhang 		err_status_hi_offset =
4928e53a3250SHawking Zhang 			AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance,
4929e53a3250SHawking Zhang 						    reg_list[i].seg_hi, reg_list[i].reg_hi);
4930e53a3250SHawking Zhang 		WREG32(err_status_lo_offset, 0);
4931e53a3250SHawking Zhang 		WREG32(err_status_hi_offset, 0);
4932e53a3250SHawking Zhang 	}
4933e53a3250SHawking Zhang }
4934e53a3250SHawking Zhang 
amdgpu_ras_error_data_init(struct ras_err_data * err_data)49355b1270beSYang Wang int amdgpu_ras_error_data_init(struct ras_err_data *err_data)
49365b1270beSYang Wang {
49375b1270beSYang Wang 	memset(err_data, 0, sizeof(*err_data));
49385b1270beSYang Wang 
49395b1270beSYang Wang 	INIT_LIST_HEAD(&err_data->err_node_list);
49405b1270beSYang Wang 
49415b1270beSYang Wang 	return 0;
49425b1270beSYang Wang }
49435b1270beSYang Wang 
amdgpu_ras_error_node_release(struct ras_err_node * err_node)49445b1270beSYang Wang static void amdgpu_ras_error_node_release(struct ras_err_node *err_node)
49455b1270beSYang Wang {
49465b1270beSYang Wang 	if (!err_node)
49475b1270beSYang Wang 		return;
49485b1270beSYang Wang 
49495b1270beSYang Wang 	list_del(&err_node->node);
49505b1270beSYang Wang 	kvfree(err_node);
49515b1270beSYang Wang }
49525b1270beSYang Wang 
amdgpu_ras_error_data_fini(struct ras_err_data * err_data)49535b1270beSYang Wang void amdgpu_ras_error_data_fini(struct ras_err_data *err_data)
49545b1270beSYang Wang {
49555b1270beSYang Wang 	struct ras_err_node *err_node, *tmp;
49565b1270beSYang Wang 
49575b1270beSYang Wang 	list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node)
49588a656611SStanley.Yang 		amdgpu_ras_error_node_release(err_node);
49595b1270beSYang Wang }
49605b1270beSYang Wang 
amdgpu_ras_error_find_node_by_id(struct ras_err_data * err_data,struct amdgpu_smuio_mcm_config_info * mcm_info)49615b1270beSYang Wang static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data,
49625b1270beSYang Wang 							     struct amdgpu_smuio_mcm_config_info *mcm_info)
49635b1270beSYang Wang {
49645b1270beSYang Wang 	struct ras_err_node *err_node;
49655b1270beSYang Wang 	struct amdgpu_smuio_mcm_config_info *ref_id;
49665b1270beSYang Wang 
49675b1270beSYang Wang 	if (!err_data || !mcm_info)
49685b1270beSYang Wang 		return NULL;
49695b1270beSYang Wang 
49705b1270beSYang Wang 	for_each_ras_error(err_node, err_data) {
49715b1270beSYang Wang 		ref_id = &err_node->err_info.mcm_info;
49725b1270beSYang Wang 
49735b1270beSYang Wang 		if (mcm_info->socket_id == ref_id->socket_id &&
497453d4d779SYang Wang 		    mcm_info->die_id == ref_id->die_id)
497553d4d779SYang Wang 			return err_node;
49765b1270beSYang Wang 	}
49775b1270beSYang Wang 
49785b1270beSYang Wang 	return NULL;
49795b1270beSYang Wang }
49805b1270beSYang Wang 
amdgpu_ras_error_node_new(void)49815b1270beSYang Wang static struct ras_err_node *amdgpu_ras_error_node_new(void)
49825b1270beSYang Wang {
49835b1270beSYang Wang 	struct ras_err_node *err_node;
49845b1270beSYang Wang 
49855b1270beSYang Wang 	err_node = kvzalloc(sizeof(*err_node), GFP_KERNEL);
49865b1270beSYang Wang 	if (!err_node)
49875b1270beSYang Wang 		return NULL;
49885b1270beSYang Wang 
49895b1270beSYang Wang 	INIT_LIST_HEAD(&err_node->node);
49905b1270beSYang Wang 
49915b1270beSYang Wang 	return err_node;
49925b1270beSYang Wang }
49935b1270beSYang Wang 
ras_err_info_cmp(void * priv,const struct list_head * a,const struct list_head * b)49945b1270beSYang Wang static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct list_head *b)
4995dbf3850dSYang Wang {
4996dbf3850dSYang Wang 	struct ras_err_node *nodea = container_of(a, struct ras_err_node, node);
4997dbf3850dSYang Wang 	struct ras_err_node *nodeb = container_of(b, struct ras_err_node, node);
4998dbf3850dSYang Wang 	struct amdgpu_smuio_mcm_config_info *infoa = &nodea->err_info.mcm_info;
4999dbf3850dSYang Wang 	struct amdgpu_smuio_mcm_config_info *infob = &nodeb->err_info.mcm_info;
5000dbf3850dSYang Wang 
5001dbf3850dSYang Wang 	if (unlikely(infoa->socket_id != infob->socket_id))
5002dbf3850dSYang Wang 		return infoa->socket_id - infob->socket_id;
5003dbf3850dSYang Wang 	else
5004dbf3850dSYang Wang 		return infoa->die_id - infob->die_id;
5005dbf3850dSYang Wang 
5006dbf3850dSYang Wang 	return 0;
5007dbf3850dSYang Wang }
5008dbf3850dSYang Wang 
amdgpu_ras_error_get_info(struct ras_err_data * err_data,struct amdgpu_smuio_mcm_config_info * mcm_info)5009dbf3850dSYang Wang static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
50105b1270beSYang Wang 				struct amdgpu_smuio_mcm_config_info *mcm_info)
50110795b5d2SYiPeng Chai {
50125b1270beSYang Wang 	struct ras_err_node *err_node;
50135b1270beSYang Wang 
50145b1270beSYang Wang 	err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info);
50155b1270beSYang Wang 	if (err_node)
50165b1270beSYang Wang 		return &err_node->err_info;
50175b1270beSYang Wang 
50185b1270beSYang Wang 	err_node = amdgpu_ras_error_node_new();
50195b1270beSYang Wang 	if (!err_node)
50205b1270beSYang Wang 		return NULL;
50215b1270beSYang Wang 
50225b1270beSYang Wang 	memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
50230795b5d2SYiPeng Chai 
50249f91e983SYiPeng Chai 	err_data->err_list_count++;
50255b1270beSYang Wang 	list_add_tail(&err_node->node, &err_data->err_node_list);
50265b1270beSYang Wang 	list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp);
5027dbf3850dSYang Wang 
50285b1270beSYang Wang 	return &err_node->err_info;
50295b1270beSYang Wang }
50305b1270beSYang Wang 
amdgpu_ras_error_statistic_ue_count(struct ras_err_data * err_data,struct amdgpu_smuio_mcm_config_info * mcm_info,u64 count)50315b1270beSYang Wang int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
50325b1270beSYang Wang 					struct amdgpu_smuio_mcm_config_info *mcm_info,
50339f91e983SYiPeng Chai 					u64 count)
5034671af066SYang Wang {
50355b1270beSYang Wang 	struct ras_err_info *err_info;
50365b1270beSYang Wang 
50375b1270beSYang Wang 	if (!err_data || !mcm_info)
50385b1270beSYang Wang 		return -EINVAL;
50395b1270beSYang Wang 
50405b1270beSYang Wang 	if (!count)
50415b1270beSYang Wang 		return 0;
50425b1270beSYang Wang 
50435b1270beSYang Wang 	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
50440795b5d2SYiPeng Chai 	if (!err_info)
50455b1270beSYang Wang 		return -EINVAL;
50465b1270beSYang Wang 
50475b1270beSYang Wang 	err_info->ue_count += count;
50485b1270beSYang Wang 	err_data->ue_count += count;
50495b1270beSYang Wang 
50505b1270beSYang Wang 	return 0;
50515b1270beSYang Wang }
50525b1270beSYang Wang 
amdgpu_ras_error_statistic_ce_count(struct ras_err_data * err_data,struct amdgpu_smuio_mcm_config_info * mcm_info,u64 count)50535b1270beSYang Wang int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
50545b1270beSYang Wang 					struct amdgpu_smuio_mcm_config_info *mcm_info,
50559f91e983SYiPeng Chai 					u64 count)
5056671af066SYang Wang {
50575b1270beSYang Wang 	struct ras_err_info *err_info;
50585b1270beSYang Wang 
50595b1270beSYang Wang 	if (!err_data || !mcm_info)
50605b1270beSYang Wang 		return -EINVAL;
50615b1270beSYang Wang 
50625b1270beSYang Wang 	if (!count)
50635b1270beSYang Wang 		return 0;
50645b1270beSYang Wang 
50655b1270beSYang Wang 	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
50660795b5d2SYiPeng Chai 	if (!err_info)
50675b1270beSYang Wang 		return -EINVAL;
50685b1270beSYang Wang 
50695b1270beSYang Wang 	err_info->ce_count += count;
50705b1270beSYang Wang 	err_data->ce_count += count;
50715b1270beSYang Wang 
50725b1270beSYang Wang 	return 0;
50735b1270beSYang Wang }
50745b1270beSYang Wang 
amdgpu_ras_error_statistic_de_count(struct ras_err_data * err_data,struct amdgpu_smuio_mcm_config_info * mcm_info,u64 count)5075cce4febbSHawking Zhang int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
507646e2231cSCandice Li 					struct amdgpu_smuio_mcm_config_info *mcm_info,
507746e2231cSCandice Li 					u64 count)
5078671af066SYang Wang {
507946e2231cSCandice Li 	struct ras_err_info *err_info;
508046e2231cSCandice Li 
508146e2231cSCandice Li 	if (!err_data || !mcm_info)
508246e2231cSCandice Li 		return -EINVAL;
508346e2231cSCandice Li 
508446e2231cSCandice Li 	if (!count)
508546e2231cSCandice Li 		return 0;
508646e2231cSCandice Li 
508746e2231cSCandice Li 	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
50880795b5d2SYiPeng Chai 	if (!err_info)
508946e2231cSCandice Li 		return -EINVAL;
509046e2231cSCandice Li 
509146e2231cSCandice Li 	err_info->de_count += count;
509246e2231cSCandice Li 	err_data->de_count += count;
509346e2231cSCandice Li 
509446e2231cSCandice Li 	return 0;
509546e2231cSCandice Li }
509646e2231cSCandice Li 
509746e2231cSCandice Li #define mmMP0_SMN_C2PMSG_92	0x1609C
5098cce4febbSHawking Zhang #define mmMP0_SMN_C2PMSG_126	0x160BE
amdgpu_ras_boot_time_error_reporting(struct amdgpu_device * adev,u32 instance)5099cce4febbSHawking Zhang static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
5100cce4febbSHawking Zhang 						 u32 instance)
5101a474161eSHawking Zhang {
5102cce4febbSHawking Zhang 	u32 socket_id, aid_id, hbm_id;
5103cce4febbSHawking Zhang 	u32 fw_status;
5104a474161eSHawking Zhang 	u32 boot_error;
5105a474161eSHawking Zhang 	u64 reg_addr;
5106cce4febbSHawking Zhang 
5107cce4febbSHawking Zhang 	/* The pattern for smn addressing in other SOC could be different from
5108cce4febbSHawking Zhang 	 * the one for aqua_vanjaram. We should revisit the code if the pattern
5109cce4febbSHawking Zhang 	 * is changed. In such case, replace the aqua_vanjaram implementation
5110cce4febbSHawking Zhang 	 * with more common helper */
5111cce4febbSHawking Zhang 	reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
5112cce4febbSHawking Zhang 		   aqua_vanjaram_encode_ext_smn_addressing(instance);
5113cce4febbSHawking Zhang 	fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
5114a474161eSHawking Zhang 
5115cce4febbSHawking Zhang 	reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
5116a474161eSHawking Zhang 		   aqua_vanjaram_encode_ext_smn_addressing(instance);
5117a474161eSHawking Zhang 	boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
5118a474161eSHawking Zhang 
5119a474161eSHawking Zhang 	socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
5120a474161eSHawking Zhang 	aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
5121a474161eSHawking Zhang 	hbm_id = ((1 == AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error)) ? 0 : 1);
5122d3dbccacSHawking Zhang 
5123cce4febbSHawking Zhang 	if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
5124cce4febbSHawking Zhang 		dev_info(adev->dev,
5125a474161eSHawking Zhang 			 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n",
5126a474161eSHawking Zhang 			 socket_id, aid_id, hbm_id, fw_status);
5127a474161eSHawking Zhang 
5128cce4febbSHawking Zhang 	if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
5129cce4febbSHawking Zhang 		dev_info(adev->dev,
5130a474161eSHawking Zhang 			 "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n",
5131a474161eSHawking Zhang 			 socket_id, aid_id, fw_status);
5132a474161eSHawking Zhang 
5133cce4febbSHawking Zhang 	if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
5134cce4febbSHawking Zhang 		dev_info(adev->dev,
5135a474161eSHawking Zhang 			 "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n",
5136a474161eSHawking Zhang 			 socket_id, aid_id, fw_status);
5137a474161eSHawking Zhang 
5138cce4febbSHawking Zhang 	if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
5139cce4febbSHawking Zhang 		dev_info(adev->dev,
5140a474161eSHawking Zhang 			 "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n",
5141a474161eSHawking Zhang 			 socket_id, aid_id, fw_status);
5142a474161eSHawking Zhang 
5143cce4febbSHawking Zhang 	if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
5144cce4febbSHawking Zhang 		dev_info(adev->dev,
5145a474161eSHawking Zhang 			 "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n",
5146a474161eSHawking Zhang 			 socket_id, aid_id, fw_status);
5147a474161eSHawking Zhang 
5148cce4febbSHawking Zhang 	if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
5149cce4febbSHawking Zhang 		dev_info(adev->dev,
5150a474161eSHawking Zhang 			 "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n",
5151a474161eSHawking Zhang 			 socket_id, aid_id, fw_status);
5152a474161eSHawking Zhang 
5153cce4febbSHawking Zhang 	if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
5154cce4febbSHawking Zhang 		dev_info(adev->dev,
5155a474161eSHawking Zhang 			 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n",
5156a474161eSHawking Zhang 			 socket_id, aid_id, hbm_id, fw_status);
5157a474161eSHawking Zhang 
5158cce4febbSHawking Zhang 	if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
5159cce4febbSHawking Zhang 		dev_info(adev->dev,
5160a474161eSHawking Zhang 			 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n",
5161a474161eSHawking Zhang 			 socket_id, aid_id, hbm_id, fw_status);
5162a474161eSHawking Zhang 
5163dfe9d047SHawking Zhang 	if (AMDGPU_RAS_GPU_ERR_DATA_ABORT(boot_error))
5164dfe9d047SHawking Zhang 		dev_info(adev->dev,
5165dfe9d047SHawking Zhang 			 "socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n",
5166dfe9d047SHawking Zhang 			 socket_id, aid_id, fw_status);
5167dfe9d047SHawking Zhang 
5168dfe9d047SHawking Zhang 	if (AMDGPU_RAS_GPU_ERR_GENERIC(boot_error))
5169d4bd7a50SXiang Liu 		dev_info(adev->dev,
5170dfe9d047SHawking Zhang 			 "socket: %d, aid: %d, fw_status: 0x%x, Boot Controller Generic Error\n",
5171d4bd7a50SXiang Liu 			 socket_id, aid_id, fw_status);
5172dfe9d047SHawking Zhang }
5173cce4febbSHawking Zhang 
amdgpu_ras_boot_error_detected(struct amdgpu_device * adev,u32 instance)5174cce4febbSHawking Zhang static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
5175a474161eSHawking Zhang 					   u32 instance)
5176a474161eSHawking Zhang {
5177cce4febbSHawking Zhang 	u64 reg_addr;
5178a474161eSHawking Zhang 	u32 reg_data;
5179cce4febbSHawking Zhang 	int retry_loop;
5180cce4febbSHawking Zhang 
5181cce4febbSHawking Zhang 	reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
51821731ba9bSHawking Zhang 		   aqua_vanjaram_encode_ext_smn_addressing(instance);
51831731ba9bSHawking Zhang 
51841731ba9bSHawking Zhang 	for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
51851731ba9bSHawking Zhang 		reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
51861731ba9bSHawking Zhang 		if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS)
5187a474161eSHawking Zhang 			return false;
5188a474161eSHawking Zhang 		else
5189a474161eSHawking Zhang 			msleep(1);
51901731ba9bSHawking Zhang 	}
51911731ba9bSHawking Zhang 
51921731ba9bSHawking Zhang 	return true;
5193a474161eSHawking Zhang }
5194cce4febbSHawking Zhang 
amdgpu_ras_query_boot_status(struct amdgpu_device * adev,u32 num_instances)5195cce4febbSHawking Zhang void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
5196cce4febbSHawking Zhang {
5197cce4febbSHawking Zhang 	u32 i;
5198cce4febbSHawking Zhang 
5199cce4febbSHawking Zhang 	for (i = 0; i < num_instances; i++) {
5200cce4febbSHawking Zhang 		if (amdgpu_ras_boot_error_detected(adev, i))
5201a474161eSHawking Zhang 			amdgpu_ras_boot_time_error_reporting(adev, i);
5202a474161eSHawking Zhang 	}
5203cce4febbSHawking Zhang }
5204cce4febbSHawking Zhang 
amdgpu_ras_reserve_page(struct amdgpu_device * adev,uint64_t pfn)5205af730e08SYiPeng Chai int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
5206af730e08SYiPeng Chai {
5207af730e08SYiPeng Chai 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5208af730e08SYiPeng Chai 	struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
5209af730e08SYiPeng Chai 	uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
5210af730e08SYiPeng Chai 	int ret = 0;
5211af730e08SYiPeng Chai 
5212af730e08SYiPeng Chai 	mutex_lock(&con->page_rsv_lock);
5213af730e08SYiPeng Chai 	ret = amdgpu_vram_mgr_query_page_status(mgr, start);
5214af730e08SYiPeng Chai 	if (ret == -ENOENT)
5215af730e08SYiPeng Chai 		ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE);
5216af730e08SYiPeng Chai 	mutex_unlock(&con->page_rsv_lock);
5217af730e08SYiPeng Chai 
5218af730e08SYiPeng Chai 	return ret;
5219af730e08SYiPeng Chai }
5220af730e08SYiPeng Chai 
amdgpu_ras_event_log_print(struct amdgpu_device * adev,u64 event_id,const char * fmt,...)5221b712d7c2SYang Wang void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
5222b712d7c2SYang Wang 				const char *fmt, ...)
5223b712d7c2SYang Wang {
5224b712d7c2SYang Wang 	struct va_format vaf;
5225b712d7c2SYang Wang 	va_list args;
5226b712d7c2SYang Wang 
5227b712d7c2SYang Wang 	va_start(args, fmt);
5228b712d7c2SYang Wang 	vaf.fmt = fmt;
5229b712d7c2SYang Wang 	vaf.va = &args;
5230b712d7c2SYang Wang 
5231b712d7c2SYang Wang 	if (RAS_EVENT_ID_IS_VALID(event_id))
523275ac6a25SYang Wang 		dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf);
5233b712d7c2SYang Wang 	else
5234b712d7c2SYang Wang 		dev_printk(KERN_INFO, adev->dev, "%pV", &vaf);
5235b712d7c2SYang Wang 
5236b712d7c2SYang Wang 	va_end(args);
5237b712d7c2SYang Wang }
5238b712d7c2SYang Wang 
amdgpu_ras_is_rma(struct amdgpu_device * adev)5239792be2e2STao Zhou bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
5240792be2e2STao Zhou {
5241792be2e2STao Zhou 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5242792be2e2STao Zhou 
5243792be2e2STao Zhou 	if (!con)
5244792be2e2STao Zhou 		return false;
5245792be2e2STao Zhou 
5246792be2e2STao Zhou 	return con->is_rma;
5247792be2e2STao Zhou }
5248792be2e2STao Zhou