/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/sort.h>
#include "amdgpu.h"
#include "umc_v6_7.h"

#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 /* ms */

#define MAX_UMC_HASH_STRING_SIZE 256

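/*
 * amdgpu_umc_convert_error_address - translate a UMC channel address
 * reported with an error into a retired-page record in @err_data.
 *
 * Only UMC v6.7 has a translation path here; all other UMC IP versions
 * warn and fail with AMDGPU_RAS_FAIL.
 */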
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
			struct ras_err_data *err_data, uint64_t err_addr,
			uint32_t ch_inst, uint32_t umc_inst)
{
	switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
	case IP_VERSION(6, 7, 0):
		umc_v6_7_convert_error_address(adev,
				err_data, err_addr, ch_inst, umc_inst);
		break;
	default:
		dev_warn(adev->dev,
			 "UMC address to Physical address translation is not supported\n");
		return AMDGPU_RAS_FAIL;
	}

	return AMDGPU_RAS_SUCCESS;
}

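/*
 * amdgpu_umc_page_retirement_mca - retire the page behind a single UMC
 * error address reported through the MCA notifier.
 *
 * Allocates a temporary record array sized for the worst-case query,
 * translates the UMC channel address to a physical address and, when bad
 * page retirement is enabled (amdgpu_bad_page_threshold != 0), records
 * and saves the resulting bad pages.
 */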
int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
{
	struct ras_err_data err_data;
	int ret;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	err_data.err_addr =
		kcalloc(adev->umc.max_ras_err_cnt_per_query,
			sizeof(struct eeprom_table_record), GFP_KERNEL);
	if (!err_data.err_addr) {
		dev_warn(adev->dev,
			 "Failed to alloc memory for umc error record in MCA notifier!\n");
		ret = AMDGPU_RAS_FAIL;
		goto out_fini_err_data;
	}

	err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query;

	/* translate the UMC channel address to a physical address */
	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
					       ch_inst, umc_inst);
	if (ret)
		goto out_free_err_addr;

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
					 err_data.err_addr_cnt, false);
		amdgpu_ras_save_bad_pages(adev, NULL);
	}

out_free_err_addr:
	kfree(err_data.err_addr);

out_fini_err_data:
	amdgpu_ras_error_data_fini(&err_data);

	return ret;
}

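/*
 * amdgpu_umc_handle_bad_pages - query UMC error counts and addresses and
 * retire the affected pages.
 *
 * Depending on the error query mode and on whether the SMU exposes ECC
 * info through amdgpu_dpm_get_ecc_info(), errors are read either directly
 * from UMC registers (hw_ops callbacks) or from the firmware ECC info
 * table (ecc_info callbacks). For uncorrectable or deferred errors the
 * bad pages are recorded and saved, and the bad page count and bad
 * channel bitmap are forwarded to the SMU.
 */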
void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
			void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	unsigned int error_query_mode;
	int ret = 0;
	unsigned long err_count;

	amdgpu_ras_get_error_query_mode(adev, &error_query_mode);

	mutex_lock(&con->page_retirement_lock);
	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
	if (ret == -EOPNOTSUPP &&
	    error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);

		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
		    adev->umc.max_ras_err_cnt_per_query) {
			err_data->err_addr =
				kcalloc(adev->umc.max_ras_err_cnt_per_query,
					sizeof(struct eeprom_table_record), GFP_KERNEL);

			/* still call query_ras_error_address to clear error status
			 * even if a NOMEM error is encountered
			 */
			if (!err_data->err_addr)
				dev_warn(adev->dev,
					 "Failed to alloc memory for umc error address record!\n");
			else
				err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;

			/* umc query_ras_error_address is also responsible for clearing
			 * error status
			 */
			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
		}
	} else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
	    (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_count)
			adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);

		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_address &&
		    adev->umc.max_ras_err_cnt_per_query) {
			err_data->err_addr =
				kcalloc(adev->umc.max_ras_err_cnt_per_query,
					sizeof(struct eeprom_table_record), GFP_KERNEL);

			/* still call query_ras_error_address to clear error status
			 * even if a NOMEM error is encountered
			 */
			if (!err_data->err_addr)
				dev_warn(adev->dev,
					 "Failed to alloc memory for umc error address record!\n");
			else
				err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;

			/* umc query_ras_error_address is also responsible for clearing
			 * error status
			 */
			adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
		}
	}

	/* only uncorrectable and deferred errors need gpu reset */
	if (err_data->ue_count || err_data->de_count) {
		err_count = err_data->ue_count + err_data->de_count;
		if ((amdgpu_bad_page_threshold != 0) &&
		    err_data->err_addr_cnt) {
			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
						 err_data->err_addr_cnt, false);
			amdgpu_ras_save_bad_pages(adev, &err_count);

			amdgpu_dpm_send_hbm_bad_pages_num(adev,
					con->eeprom_control.ras_num_bad_pages);

			if (con->update_channel_flag) {
				amdgpu_dpm_send_hbm_bad_channel_flag(adev,
						con->eeprom_control.bad_channel_bitmap);
				con->update_channel_flag = false;
			}
		}
	}

	kfree(err_data->err_addr);
	err_data->err_addr = NULL;

	mutex_unlock(&con->page_retirement_lock);
}

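/*
 * amdgpu_umc_do_page_retirement - run page retirement and trigger a GPU
 * reset if needed.
 *
 * Flags SRAM ECC to KFD, retires bad pages, and schedules the requested
 * reset when uncorrectable or deferred errors are present and a reset is
 * requested or the device has reached the RMA state.
 */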
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry,
		uint32_t reset)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
	amdgpu_umc_handle_bad_pages(adev, ras_error_status);

	if ((err_data->ue_count || err_data->de_count) &&
	    (reset || amdgpu_ras_is_rma(adev))) {
		con->gpu_reset_flags |= reset;
		amdgpu_ras_reset_gpu(adev);
	}

	return AMDGPU_RAS_SUCCESS;
}

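/*
 * amdgpu_umc_pasid_poison_handler - handle consumed poison for a given
 * RAS block, passing @pasid and @pasid_fn along for the deferred
 * handling path.
 *
 * For devices whose memory is shared with the CPU, only a GPU reset is
 * issued and page retirement is left to the MCA notifier. On bare metal,
 * UMC IP versions before v12 do page retirement synchronously, while
 * newer versions queue the request to the page retirement thread (note
 * that a failure to queue is not propagated to the caller). Under SRIOV
 * the poison event is forwarded to the host via ras_poison_handler.
 */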
int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
			enum amdgpu_ras_block block, uint16_t pasid,
			pasid_notify pasid_fn, void *data, uint32_t reset)
{
	int ret = AMDGPU_RAS_SUCCESS;

	if (adev->gmc.xgmi.connected_to_cpu ||
	    adev->gmc.is_app_apu) {
		if (reset) {
			/* MCA poison handler is only responsible for GPU reset,
			 * let MCA notifier do page retirement.
			 */
			kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
			amdgpu_ras_reset_gpu(adev);
		}
		return ret;
	}

	if (!amdgpu_sriov_vf(adev)) {
		if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
			struct ras_err_data err_data;
			struct ras_common_if head = {
				.block = AMDGPU_RAS_BLOCK__UMC,
			};
			struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);

			ret = amdgpu_ras_error_data_init(&err_data);
			if (ret)
				return ret;

			ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);

			if (ret == AMDGPU_RAS_SUCCESS && obj) {
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
				obj->err_data.de_count += err_data.de_count;
			}

			amdgpu_ras_error_data_fini(&err_data);
		} else {
			struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
			int ret;

			ret = amdgpu_ras_put_poison_req(adev,
					block, pasid, pasid_fn, data, reset);
			if (!ret) {
				atomic_inc(&con->page_retirement_req_cnt);
				wake_up(&con->page_retirement_wq);
			}
		}
	} else {
		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
			adev->virt.ops->ras_poison_handler(adev, block);
		else
			dev_warn(adev->dev,
				 "No ras_poison_handler interface in SRIOV!\n");
	}

	return ret;
}

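/* Poison handler variant used when no process notification is needed. */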
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
			enum amdgpu_ras_block block, uint32_t reset)
{
	return amdgpu_umc_pasid_poison_handler(adev,
			block, 0, NULL, NULL, reset);
}

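/* Default UMC RAS callback: do page retirement with a mode-1 reset request. */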
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
				AMDGPU_RAS_GPU_RESET_MODE1_RESET);
}

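/*
 * amdgpu_umc_ras_sw_init - register the UMC RAS block and fill in its
 * common info, installing the default late-init and RAS callbacks when
 * the IP specific code has not provided its own.
 */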
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
{
	int err;
	struct amdgpu_umc_ras *ras;

	if (!adev->umc.ras)
		return 0;

	ras = adev->umc.ras;

	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
	if (err) {
		dev_err(adev->dev, "Failed to register umc ras block!\n");
		return err;
	}

	strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc");
	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
	adev->umc.ras_if = &ras->ras_block.ras_comm;

	if (!ras->ras_block.ras_late_init)
		ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;

	if (!ras->ras_block.ras_cb)
		ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;

	return 0;
}

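/*
 * amdgpu_umc_ras_late_init - common UMC RAS late init: run the common
 * block init, enable the ECC interrupt when RAS is supported, and run
 * the IP specific error counter init. Under SRIOV only the common block
 * init is done.
 */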
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int r;

	r = amdgpu_ras_block_late_init(adev, ras_block);
	if (r)
		return r;

	if (amdgpu_sriov_vf(adev))
		return r;

	if (amdgpu_ras_is_supported(adev, ras_block->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	}

	/* ras init of specific umc version */
	if (adev->umc.ras &&
	    adev->umc.ras->err_cnt_init)
		adev->umc.ras->err_cnt_init(adev);

	return 0;

late_fini:
	amdgpu_ras_block_late_fini(adev, ras_block);
	return r;
}

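/* ECC interrupt handler: forward the IV entry to the RAS interrupt dispatcher. */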
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
		struct amdgpu_irq_src *source,
		struct amdgpu_iv_entry *entry)
{
	struct ras_common_if *ras_if = adev->umc.ras_if;
	struct ras_dispatch_if ih_data = {
		.entry = entry,
	};

	if (!ras_if)
		return 0;

	ih_data.head = *ras_if;

	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
	return 0;
}

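/*
 * amdgpu_umc_fill_error_record - append one eeprom_table_record for a
 * retired page to err_data->err_addr.
 *
 * Returns -EINVAL if the record array is missing or already full.
 * Typically called once per retired page from the per-IP address
 * conversion code (e.g. umc_v6_7_convert_error_address()).
 */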
int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
		uint64_t err_addr,
		uint64_t retired_page,
		uint32_t channel_index,
		uint32_t umc_inst)
{
	struct eeprom_table_record *err_rec;

	if (!err_data ||
	    !err_data->err_addr ||
	    (err_data->err_addr_cnt >= err_data->err_addr_len))
		return -EINVAL;

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];

	err_rec->address = err_addr;
	/* page frame address is saved */
	err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
	err_rec->ts = (uint64_t)ktime_get_real_seconds();
	err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
	err_rec->cu = 0;
	err_rec->mem_channel = channel_index;
	err_rec->mcumc_id = umc_inst;

	err_data->err_addr_cnt++;

	return 0;
}

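/*
 * amdgpu_umc_loop_all_aid - invoke @func on every channel of every
 * active UMC instance across all AIDs, as described by umc.active_mask.
 * Iteration stops at the first callback error.
 */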
static int amdgpu_umc_loop_all_aid(struct amdgpu_device *adev, umc_func func,
				   void *data)
{
	uint32_t umc_node_inst;
	uint32_t node_inst;
	uint32_t umc_inst;
	uint32_t ch_inst;
	int ret;

	/*
	 * This loop is done based on the following -
	 * umc.active mask = mask of active umc instances across all nodes
	 * umc.umc_inst_num = maximum number of umc instances per node
	 * umc.node_inst_num = maximum number of node instances
	 * Channel instances are not assumed to be harvested.
	 */
	dev_dbg(adev->dev, "active umcs :%lx umc_inst per node: %d",
		adev->umc.active_mask, adev->umc.umc_inst_num);
	for_each_set_bit(umc_node_inst, &(adev->umc.active_mask),
			 adev->umc.node_inst_num * adev->umc.umc_inst_num) {
		node_inst = umc_node_inst / adev->umc.umc_inst_num;
		umc_inst = umc_node_inst % adev->umc.umc_inst_num;
		LOOP_UMC_CH_INST(ch_inst) {
			dev_dbg(adev->dev,
				"node_inst :%d umc_inst: %d ch_inst: %d",
				node_inst, umc_inst, ch_inst);
			ret = func(adev, node_inst, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev,
					"Node %d umc %d ch %d func returns %d\n",
					node_inst, umc_inst, ch_inst, ret);
				return ret;
			}
		}
	}

	return 0;
}

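/*
 * amdgpu_umc_loop_channels - invoke @func on every UMC channel of the
 * device, picking the AID-aware walk when an AID mask is present and
 * the node/instance loops otherwise.
 */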
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
			umc_func func, void *data)
{
	uint32_t node_inst = 0;
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	int ret = 0;

	if (adev->aid_mask)
		return amdgpu_umc_loop_all_aid(adev, func, data);

	if (adev->umc.node_inst_num) {
		LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
			ret = func(adev, node_inst, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev, "Node %d umc %d ch %d func returns %d\n",
					node_inst, umc_inst, ch_inst, ret);
				return ret;
			}
		}
	} else {
		LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
			ret = func(adev, 0, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev, "Umc %d ch %d func returns %d\n",
					umc_inst, ch_inst, ret);
				return ret;
			}
		}
	}

	return 0;
}

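/* Forward an MCA status/ipid/address triple to the IP specific ECC status update. */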
int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
			uint64_t status, uint64_t ipid, uint64_t addr)
{
	if (adev->umc.ras->update_ecc_status)
		return adev->umc.ras->update_ecc_status(adev,
					status, ipid, addr);
	return 0;
}

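/*
 * amdgpu_umc_logs_ecc_err - insert @ecc_err into the ECC error radix
 * tree, keyed by the physical page frame number, and tag it with
 * UMC_ECC_NEW_DETECTED_TAG so it can be found as a newly detected error.
 */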
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
		struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_ecc_log_info *ecc_log;
	int ret;

	ecc_log = &con->umc_ecc_log;

	mutex_lock(&ecc_log->lock);
	ret = radix_tree_insert(ecc_tree, ecc_err->pa_pfn, ecc_err);
	if (!ret)
		radix_tree_tag_set(ecc_tree,
			ecc_err->pa_pfn, UMC_ECC_NEW_DETECTED_TAG);
	mutex_unlock(&ecc_log->lock);

	return ret;
}

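/*
 * amdgpu_umc_pages_in_a_row - fill @err_data with records for the pages
 * sharing a memory row with @pa_addr, using the IP specific address
 * conversion callback. The record array is sized for umc.retire_unit
 * entries, since a whole row is retired at a time.
 */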
int amdgpu_umc_pages_in_a_row(struct amdgpu_device *adev,
			struct ras_err_data *err_data, uint64_t pa_addr)
{
	struct ta_ras_query_address_output addr_out;

	/* reinit err_data */
	err_data->err_addr_cnt = 0;
	err_data->err_addr_len = adev->umc.retire_unit;

	addr_out.pa.pa = pa_addr;
	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
		return adev->umc.ras->convert_ras_err_addr(adev, err_data, NULL,
				&addr_out, false);
	else
		return -EINVAL;
}

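/*
 * amdgpu_umc_lookup_bad_pages_in_a_row - collect the page frame numbers
 * of the pages in the same row as @pa_addr into @pfns, up to @len
 * entries. Returns the number of pfns written, or 0 on failure.
 */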
int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
			uint64_t pa_addr, uint64_t *pfns, int len)
{
	int i, ret;
	struct ras_err_data err_data;

	err_data.err_addr = kcalloc(adev->umc.retire_unit,
			sizeof(struct eeprom_table_record), GFP_KERNEL);
	if (!err_data.err_addr) {
		dev_warn(adev->dev, "Failed to alloc memory in bad page lookup!\n");
		return 0;
	}

	ret = amdgpu_umc_pages_in_a_row(adev, &err_data, pa_addr);
	if (ret)
		goto out;

	for (i = 0; i < adev->umc.retire_unit; i++) {
		if (i >= len)
			goto out;

		pfns[i] = err_data.err_addr[i].retired_page;
	}
	ret = i;

out:
	kfree(err_data.err_addr);
	return ret;
}

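/*
 * amdgpu_umc_mca_to_addr - convert an MCA error address plus its
 * channel/umc/node/socket coordinates into a physical address through
 * the IP specific conversion callback, optionally dumping the retired
 * addresses. Returns 0 when no conversion callback is installed.
 */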
int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
			uint64_t err_addr, uint32_t ch, uint32_t umc,
			uint32_t node, uint32_t socket,
			struct ta_ras_query_address_output *addr_out, bool dump_addr)
{
	struct ta_ras_query_address_input addr_in;

	memset(&addr_in, 0, sizeof(addr_in));
	addr_in.ma.err_addr = err_addr;
	addr_in.ma.ch_inst = ch;
	addr_in.ma.umc_inst = umc;
	addr_in.ma.node_inst = node;
	addr_in.ma.socket_id = socket;

	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
		return adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in,
				addr_out, dump_addr);

	return 0;
}