/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/sort.h>
#include "amdgpu.h"
#include "umc_v6_7.h"

#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  /* ms */

#define MAX_UMC_HASH_STRING_SIZE  256

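/*
 * Translate a UMC channel address into retireable physical addresses
 * through the IP-version-specific conversion routine; IP versions
 * without one only get a warning.
 */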
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
				    struct ras_err_data *err_data, uint64_t err_addr,
				    uint32_t ch_inst, uint32_t umc_inst)
{
	switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
	case IP_VERSION(6, 7, 0):
		umc_v6_7_convert_error_address(adev,
				err_data, err_addr, ch_inst, umc_inst);
		break;
	default:
		dev_warn(adev->dev,
			 "UMC address to Physical address translation is not supported\n");
		return AMDGPU_RAS_FAIL;
	}

	return AMDGPU_RAS_SUCCESS;
}

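/*
 * MCA-notifier path for page retirement: translate the reported UMC
 * error address and, if the bad page threshold is enabled, record and
 * persist the resulting bad pages.
 */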
int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
{
	struct ras_err_data err_data;
	int ret;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	err_data.err_addr =
		kcalloc(adev->umc.max_ras_err_cnt_per_query,
			sizeof(struct eeprom_table_record), GFP_KERNEL);
	if (!err_data.err_addr) {
		dev_warn(adev->dev,
			"Failed to alloc memory for umc error record in MCA notifier!\n");
		ret = AMDGPU_RAS_FAIL;
		goto out_fini_err_data;
	}

	err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query;

	/*
	 * Translate UMC channel address to Physical address
	 */
	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
					ch_inst, umc_inst);
	if (ret)
		goto out_free_err_addr;

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
						err_data.err_addr_cnt, false);
		amdgpu_ras_save_bad_pages(adev, NULL);
	}

out_free_err_addr:
	kfree(err_data.err_addr);

out_fini_err_data:
	amdgpu_ras_error_data_fini(&err_data);

	return ret;
}

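/*
 * Query UMC error counts and error addresses (via SMU ECC info when the
 * firmware supports it, or by reading the UMC registers directly),
 * retire any bad pages found and persist them to the EEPROM table.
 * Page retirement state is protected by con->page_retirement_lock.
 */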
void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
			void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	unsigned int error_query_mode;
	int ret = 0;
	unsigned long err_count;

	amdgpu_ras_get_error_query_mode(adev, &error_query_mode);

	mutex_lock(&con->page_retirement_lock);
	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
	if (ret == -EOPNOTSUPP &&
	    error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);

		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
		    adev->umc.max_ras_err_cnt_per_query) {
			err_data->err_addr =
				kcalloc(adev->umc.max_ras_err_cnt_per_query,
					sizeof(struct eeprom_table_record), GFP_KERNEL);

			/* still call query_ras_error_address to clear error status
			 * even if a NOMEM error is encountered
			 */
			if (!err_data->err_addr)
				dev_warn(adev->dev,
					"Failed to alloc memory for umc error address record!\n");
			else
				err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;

			/* umc query_ras_error_address is also responsible for clearing
			 * error status
			 */
			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
		}
	} else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
	    (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_count)
			adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);

		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_address &&
		    adev->umc.max_ras_err_cnt_per_query) {
			err_data->err_addr =
				kcalloc(adev->umc.max_ras_err_cnt_per_query,
					sizeof(struct eeprom_table_record), GFP_KERNEL);

			/* still call query_ras_error_address to clear error status
			 * even if a NOMEM error is encountered
			 */
			if (!err_data->err_addr)
				dev_warn(adev->dev,
					"Failed to alloc memory for umc error address record!\n");
			else
				err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;

			/* umc query_ras_error_address is also responsible for clearing
			 * error status
			 */
			adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
		}
	}

	/* only uncorrectable and deferred errors need page retirement */
	if (err_data->ue_count || err_data->de_count) {
		err_count = err_data->ue_count + err_data->de_count;
		if ((amdgpu_bad_page_threshold != 0) &&
			err_data->err_addr_cnt) {
			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
						err_data->err_addr_cnt, false);
			amdgpu_ras_save_bad_pages(adev, &err_count);

			amdgpu_dpm_send_hbm_bad_pages_num(adev,
					con->eeprom_control.ras_num_bad_pages);

			if (con->update_channel_flag) {
				amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
				con->update_channel_flag = false;
			}
		}
	}

	kfree(err_data->err_addr);
	err_data->err_addr = NULL;

	mutex_unlock(&con->page_retirement_lock);
}

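/*
 * Query UMC errors and retire the bad pages found, then request the GPU
 * reset mode passed in by the caller when uncorrectable or deferred
 * errors were seen (or the device has become retirement-exhausted).
 */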
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry,
		uint32_t reset)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
	amdgpu_umc_handle_bad_pages(adev, ras_error_status);

	if ((err_data->ue_count || err_data->de_count) &&
	    (reset || amdgpu_ras_is_rma(adev))) {
		con->gpu_reset_flags |= reset;
		amdgpu_ras_reset_gpu(adev);
	}

	return AMDGPU_RAS_SUCCESS;
}

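/*
 * Handle a poison consumption event for the given RAS block.  On
 * CPU-connected and APU configurations only a GPU reset is requested,
 * since the MCA notifier owns page retirement there.  On bare metal,
 * pre-UMC-12 parts retire pages synchronously, while newer parts queue
 * a request for the page retirement thread.  Under SRIOV the event is
 * forwarded to the host via the virt ras_poison_handler interface.
 */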
int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
			enum amdgpu_ras_block block, uint16_t pasid,
			pasid_notify pasid_fn, void *data, uint32_t reset)
{
	int ret = AMDGPU_RAS_SUCCESS;

	if (adev->gmc.xgmi.connected_to_cpu ||
		adev->gmc.is_app_apu) {
		if (reset) {
			/* MCA poison handler is only responsible for GPU reset,
			 * let MCA notifier do page retirement.
			 */
			kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
			amdgpu_ras_reset_gpu(adev);
		}
		return ret;
	}

	if (!amdgpu_sriov_vf(adev)) {
		if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
			struct ras_err_data err_data;
			struct ras_common_if head = {
				.block = AMDGPU_RAS_BLOCK__UMC,
			};
			struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);

			ret = amdgpu_ras_error_data_init(&err_data);
			if (ret)
				return ret;

			ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);

			if (ret == AMDGPU_RAS_SUCCESS && obj) {
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
				obj->err_data.de_count += err_data.de_count;
			}

			amdgpu_ras_error_data_fini(&err_data);
		} else {
			struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
			int ret;

			ret = amdgpu_ras_put_poison_req(adev,
				block, pasid, pasid_fn, data, reset);
			if (!ret) {
				atomic_inc(&con->page_retirement_req_cnt);
				wake_up(&con->page_retirement_wq);
			}
		}
	} else {
		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
			adev->virt.ops->ras_poison_handler(adev, block);
		else
			dev_warn(adev->dev,
				"No ras_poison_handler interface in SRIOV!\n");
	}

	return ret;
}

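/*
 * Convenience wrapper for poison consumption events that carry no
 * PASID notification context.
 */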
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
			enum amdgpu_ras_block block, uint32_t reset)
{
	return amdgpu_umc_pasid_poison_handler(adev,
				block, 0, NULL, NULL, reset);
}

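/*
 * Default RAS data callback for the UMC block: retire bad pages and
 * request a mode-1 reset for uncorrectable errors.
 */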
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
				AMDGPU_RAS_GPU_RESET_MODE1_RESET);
}

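/*
 * Register the UMC RAS block with the RAS framework and fill in the
 * common block identity plus default late-init and data callbacks.
 */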
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
{
	int err;
	struct amdgpu_umc_ras *ras;

	if (!adev->umc.ras)
		return 0;

	ras = adev->umc.ras;

	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
	if (err) {
		dev_err(adev->dev, "Failed to register umc ras block!\n");
		return err;
	}

	strcpy(ras->ras_block.ras_comm.name, "umc");
	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
	adev->umc.ras_if = &ras->ras_block.ras_comm;

	if (!ras->ras_block.ras_late_init)
		ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;

	if (!ras->ras_block.ras_cb)
		ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;

	return 0;
}

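/*
 * Common late init for the UMC RAS block: run the generic block late
 * init, enable the ECC interrupt where RAS is supported (skipped under
 * SRIOV) and let the IP version initialize its error counters.
 */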
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int r;

	r = amdgpu_ras_block_late_init(adev, ras_block);
	if (r)
		return r;

	if (amdgpu_sriov_vf(adev))
		return r;

	if (amdgpu_ras_is_supported(adev, ras_block->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	}

	/* ras init of specific umc version */
	if (adev->umc.ras &&
	    adev->umc.ras->err_cnt_init)
		adev->umc.ras->err_cnt_init(adev);

	return 0;

late_fini:
	amdgpu_ras_block_late_fini(adev, ras_block);
	return r;
}

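/*
 * ECC interrupt handler: forward the IV entry to the RAS interrupt
 * dispatcher for the UMC block.
 */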
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
		struct amdgpu_irq_src *source,
		struct amdgpu_iv_entry *entry)
{
	struct ras_common_if *ras_if = adev->umc.ras_if;
	struct ras_dispatch_if ih_data = {
		.entry = entry,
	};

	if (!ras_if)
		return 0;

	ih_data.head = *ras_if;

	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
	return 0;
}

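/*
 * Append one retired-page record to err_data->err_addr.  Returns
 * -EINVAL if the record array is missing or already full, 0 on
 * success.
 *
 * Illustrative call from an IP-version-specific address-conversion
 * routine (a sketch; "retired_page" here stands for whatever physical
 * page the caller computed for err_addr):
 *
 *	amdgpu_umc_fill_error_record(err_data, err_addr,
 *			retired_page, channel_index, umc_inst);
 */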
int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
		uint64_t err_addr,
		uint64_t retired_page,
		uint32_t channel_index,
		uint32_t umc_inst)
{
	struct eeprom_table_record *err_rec;

	if (!err_data ||
	    !err_data->err_addr ||
	    (err_data->err_addr_cnt >= err_data->err_addr_len))
		return -EINVAL;

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];

	err_rec->address = err_addr;
	/* page frame address is saved */
	err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
	err_rec->ts = (uint64_t)ktime_get_real_seconds();
	err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
	err_rec->cu = 0;
	err_rec->mem_channel = channel_index;
	err_rec->mcumc_id = umc_inst;

	err_data->err_addr_cnt++;

	return 0;
}

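/*
 * Invoke func on every channel of every active UMC instance across all
 * AIDs, decoding node/umc indices from the active mask.
 */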
static int amdgpu_umc_loop_all_aid(struct amdgpu_device *adev, umc_func func,
				   void *data)
{
	uint32_t umc_node_inst;
	uint32_t node_inst;
	uint32_t umc_inst;
	uint32_t ch_inst;
	int ret;

	/*
	 * This loop is done based on the following -
	 * umc.active_mask = mask of active umc instances across all nodes
	 * umc.umc_inst_num = maximum number of umc instances per node
	 * umc.node_inst_num = maximum number of node instances
	 * Channel instances are not assumed to be harvested.
	 */
	dev_dbg(adev->dev, "active umcs :%lx umc_inst per node: %d",
		adev->umc.active_mask, adev->umc.umc_inst_num);
	for_each_set_bit(umc_node_inst, &(adev->umc.active_mask),
			 adev->umc.node_inst_num * adev->umc.umc_inst_num) {
		node_inst = umc_node_inst / adev->umc.umc_inst_num;
		umc_inst = umc_node_inst % adev->umc.umc_inst_num;
		LOOP_UMC_CH_INST(ch_inst) {
			dev_dbg(adev->dev,
				"node_inst :%d umc_inst: %d ch_inst: %d",
				node_inst, umc_inst, ch_inst);
			ret = func(adev, node_inst, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev,
					"Node %d umc %d ch %d func returns %d\n",
					node_inst, umc_inst, ch_inst, ret);
				return ret;
			}
		}
	}

	return 0;
}

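/*
 * Walk every UMC channel instance and call func for each, dispatching
 * to the AID-aware walker on multi-AID parts and to the plain
 * node/instance/channel loops otherwise.
 */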
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
			umc_func func, void *data)
{
	uint32_t node_inst       = 0;
	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	int ret = 0;

	if (adev->aid_mask)
		return amdgpu_umc_loop_all_aid(adev, func, data);

	if (adev->umc.node_inst_num) {
		LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
			ret = func(adev, node_inst, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev, "Node %d umc %d ch %d func returns %d\n",
					node_inst, umc_inst, ch_inst, ret);
				return ret;
			}
		}
	} else {
		LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
			ret = func(adev, 0, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev, "Umc %d ch %d func returns %d\n",
					umc_inst, ch_inst, ret);
				return ret;
			}
		}
	}

	return 0;
}

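/*
 * Forward a hardware ECC status update to the IP-version-specific
 * handler, if one is registered.
 */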
int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
				uint64_t status, uint64_t ipid, uint64_t addr)
{
	if (adev->umc.ras && adev->umc.ras->update_ecc_status)
		return adev->umc.ras->update_ecc_status(adev,
					status, ipid, addr);
	return 0;
}

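/*
 * Insert an ECC error record into the per-device radix tree, keyed by
 * the physical page frame, and tag it as newly detected.
 */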
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
		struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_ecc_log_info *ecc_log;
	int ret;

	ecc_log = &con->umc_ecc_log;

	mutex_lock(&ecc_log->lock);
	ret = radix_tree_insert(ecc_tree, ecc_err->pa_pfn, ecc_err);
	if (!ret)
		radix_tree_tag_set(ecc_tree,
			ecc_err->pa_pfn, UMC_ECC_NEW_DETECTED_TAG);
	mutex_unlock(&ecc_log->lock);

	return ret;
}

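/*
 * Expand one physical address into the full set of pages that must be
 * retired together (one retire unit), via the IP-version-specific
 * address conversion routine.
 */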
int amdgpu_umc_pages_in_a_row(struct amdgpu_device *adev,
			struct ras_err_data *err_data, uint64_t pa_addr)
{
	struct ta_ras_query_address_output addr_out;

	/* reinit err_data */
	err_data->err_addr_cnt = 0;
	err_data->err_addr_len = adev->umc.retire_unit;

	addr_out.pa.pa = pa_addr;
	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
		return adev->umc.ras->convert_ras_err_addr(adev, err_data, NULL,
				&addr_out, false);

	return -EINVAL;
}

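/*
 * Fill pfns (up to len entries) with the page frame numbers of all
 * pages retired together with pa_addr.  Returns the number of entries
 * filled in, 0 if the temporary record array could not be allocated,
 * or a negative error code if the address conversion failed.
 */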
int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
			uint64_t pa_addr, uint64_t *pfns, int len)
{
	int i, ret;
	struct ras_err_data err_data;

	err_data.err_addr = kcalloc(adev->umc.retire_unit,
				sizeof(struct eeprom_table_record), GFP_KERNEL);
	if (!err_data.err_addr) {
		dev_warn(adev->dev, "Failed to alloc memory in bad page lookup!\n");
		return 0;
	}

	ret = amdgpu_umc_pages_in_a_row(adev, &err_data, pa_addr);
	if (ret)
		goto out;

	/* stop at the caller's buffer size, but still report how many
	 * entries were filled in
	 */
	for (i = 0; i < adev->umc.retire_unit; i++) {
		if (i >= len)
			break;

		pfns[i] = err_data.err_addr[i].retired_page;
	}
	ret = i;

out:
	kfree(err_data.err_addr);
	return ret;
}

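/*
 * Translate an MCA-reported error address (plus channel/umc/node/socket
 * coordinates) into physical addresses through the IP-version-specific
 * conversion routine, optionally dumping the translated addresses.
 */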
int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
			uint64_t err_addr, uint32_t ch, uint32_t umc,
			uint32_t node, uint32_t socket,
			struct ta_ras_query_address_output *addr_out, bool dump_addr)
{
	struct ta_ras_query_address_input addr_in;

	memset(&addr_in, 0, sizeof(addr_in));
	addr_in.ma.err_addr = err_addr;
	addr_in.ma.ch_inst = ch;
	addr_in.ma.umc_inst = umc;
	addr_in.ma.node_inst = node;
	addr_in.ma.socket_id = socket;

	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
		return adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in,
				addr_out, dump_addr);

	return 0;
}