13907c492SJohn Clements /* 23907c492SJohn Clements * Copyright (C) 2021 Advanced Micro Devices, Inc. 33907c492SJohn Clements * 43907c492SJohn Clements * Permission is hereby granted, free of charge, to any person obtaining a 53907c492SJohn Clements * copy of this software and associated documentation files (the "Software"), 63907c492SJohn Clements * to deal in the Software without restriction, including without limitation 73907c492SJohn Clements * the rights to use, copy, modify, merge, publish, distribute, sublicense, 83907c492SJohn Clements * and/or sell copies of the Software, and to permit persons to whom the 93907c492SJohn Clements * Software is furnished to do so, subject to the following conditions: 103907c492SJohn Clements * 113907c492SJohn Clements * The above copyright notice and this permission notice shall be included 123907c492SJohn Clements * in all copies or substantial portions of the Software. 133907c492SJohn Clements * 143907c492SJohn Clements * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 153907c492SJohn Clements * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 163907c492SJohn Clements * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 173907c492SJohn Clements * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 183907c492SJohn Clements * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 193907c492SJohn Clements * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 203907c492SJohn Clements */ 213907c492SJohn Clements #ifndef __AMDGPU_MCA_H__ 223907c492SJohn Clements #define __AMDGPU_MCA_H__ 233907c492SJohn Clements 247ff607e2SYang Wang #include "amdgpu_ras.h" 257ff607e2SYang Wang 267ff607e2SYang Wang #define MCA_MAX_REGS_COUNT (16) 277ff607e2SYang Wang 2807c1db70SYang Wang #define MCA_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l) 2907c1db70SYang Wang #define MCA_REG__STATUS__VAL(x) MCA_REG_FIELD(x, 63, 63) 3007c1db70SYang Wang #define MCA_REG__STATUS__OVERFLOW(x) MCA_REG_FIELD(x, 62, 62) 3107c1db70SYang Wang #define MCA_REG__STATUS__UC(x) MCA_REG_FIELD(x, 61, 61) 3207c1db70SYang Wang #define MCA_REG__STATUS__EN(x) MCA_REG_FIELD(x, 60, 60) 3307c1db70SYang Wang #define MCA_REG__STATUS__MISCV(x) MCA_REG_FIELD(x, 59, 59) 3407c1db70SYang Wang #define MCA_REG__STATUS__ADDRV(x) MCA_REG_FIELD(x, 58, 58) 3507c1db70SYang Wang #define MCA_REG__STATUS__PCC(x) MCA_REG_FIELD(x, 57, 57) 3607c1db70SYang Wang #define MCA_REG__STATUS__ERRCOREIDVAL(x) MCA_REG_FIELD(x, 56, 56) 3707c1db70SYang Wang #define MCA_REG__STATUS__TCC(x) MCA_REG_FIELD(x, 55, 55) 3807c1db70SYang Wang #define MCA_REG__STATUS__SYNDV(x) MCA_REG_FIELD(x, 53, 53) 3907c1db70SYang Wang #define MCA_REG__STATUS__CECC(x) MCA_REG_FIELD(x, 46, 46) 4007c1db70SYang Wang #define MCA_REG__STATUS__UECC(x) MCA_REG_FIELD(x, 45, 45) 4107c1db70SYang Wang #define MCA_REG__STATUS__DEFERRED(x) MCA_REG_FIELD(x, 44, 44) 4207c1db70SYang Wang #define MCA_REG__STATUS__POISON(x) MCA_REG_FIELD(x, 43, 43) 4307c1db70SYang Wang #define MCA_REG__STATUS__SCRUB(x) MCA_REG_FIELD(x, 40, 40) 4407c1db70SYang Wang #define MCA_REG__STATUS__ERRCOREID(x) MCA_REG_FIELD(x, 37, 32) 4507c1db70SYang Wang #define MCA_REG__STATUS__ADDRLSB(x) MCA_REG_FIELD(x, 29, 24) 4607c1db70SYang Wang #define MCA_REG__STATUS__ERRORCODEEXT(x) MCA_REG_FIELD(x, 21, 16) 4707c1db70SYang Wang #define MCA_REG__STATUS__ERRORCODE(x) MCA_REG_FIELD(x, 15, 0) 4807c1db70SYang Wang 49058eb519SHawking Zhang #define MCA_REG__MISC0__ERRCNT(x) MCA_REG_FIELD(x, 43, 32) 50058eb519SHawking Zhang 5137c57631SYang Wang #define MCA_REG__SYND__ERRORINFORMATION(x) MCA_REG_FIELD(x, 17, 0) 5237c57631SYang Wang 537ff607e2SYang Wang enum amdgpu_mca_ip { 547ff607e2SYang Wang AMDGPU_MCA_IP_UNKNOW = -1, 557ff607e2SYang Wang AMDGPU_MCA_IP_PSP = 0, 567ff607e2SYang Wang AMDGPU_MCA_IP_SDMA, 577ff607e2SYang Wang AMDGPU_MCA_IP_GC, 587ff607e2SYang Wang AMDGPU_MCA_IP_SMU, 597ff607e2SYang Wang AMDGPU_MCA_IP_MP5, 607ff607e2SYang Wang AMDGPU_MCA_IP_UMC, 6176d2da18SYang Wang AMDGPU_MCA_IP_PCS_XGMI, 627ff607e2SYang Wang AMDGPU_MCA_IP_COUNT, 637ff607e2SYang Wang }; 647ff607e2SYang Wang 657ff607e2SYang Wang enum amdgpu_mca_error_type { 667ff607e2SYang Wang AMDGPU_MCA_ERROR_TYPE_UE = 0, 677ff607e2SYang Wang AMDGPU_MCA_ERROR_TYPE_CE, 68afb617f3SYiPeng Chai AMDGPU_MCA_ERROR_TYPE_DE, 697ff607e2SYang Wang }; 707ff607e2SYang Wang 71b0e2062dSyipechai struct amdgpu_mca_ras_block { 72b0e2062dSyipechai struct amdgpu_ras_block_object ras_block; 733907c492SJohn Clements }; 743907c492SJohn Clements 753907c492SJohn Clements struct amdgpu_mca_ras { 763907c492SJohn Clements struct ras_common_if *ras_if; 77b0e2062dSyipechai struct amdgpu_mca_ras_block *ras; 783907c492SJohn Clements }; 793907c492SJohn Clements 8076ad30f5SYang Wang struct mca_bank_set { 8176ad30f5SYang Wang int nr_entries; 8276ad30f5SYang Wang struct list_head list; 8376ad30f5SYang Wang }; 8476ad30f5SYang Wang 8576ad30f5SYang Wang struct mca_bank_cache { 8676ad30f5SYang Wang struct mca_bank_set mca_set; 87*8c9ee180SYang Wang struct mutex lock; 8876ad30f5SYang Wang }; 8976ad30f5SYang Wang 903907c492SJohn Clements struct amdgpu_mca { 913907c492SJohn Clements struct amdgpu_mca_ras mp0; 923907c492SJohn Clements struct amdgpu_mca_ras mp1; 933907c492SJohn Clements struct amdgpu_mca_ras mpio; 947ff607e2SYang Wang const struct amdgpu_mca_smu_funcs *mca_funcs; 9576ad30f5SYang Wang struct mca_bank_cache mca_caches[AMDGPU_MCA_ERROR_TYPE_DE]; 965eccab32SYang Wang atomic_t ue_update_flag; 977ff607e2SYang Wang }; 987ff607e2SYang Wang 9907c1db70SYang Wang enum mca_reg_idx { 10007c1db70SYang Wang MCA_REG_IDX_STATUS = 1, 10107c1db70SYang Wang MCA_REG_IDX_ADDR = 2, 10207c1db70SYang Wang MCA_REG_IDX_MISC0 = 3, 10307c1db70SYang Wang MCA_REG_IDX_IPID = 5, 10407c1db70SYang Wang MCA_REG_IDX_SYND = 6, 10507c1db70SYang Wang MCA_REG_IDX_COUNT = 16, 10607c1db70SYang Wang }; 10707c1db70SYang Wang 1087ff607e2SYang Wang struct mca_bank_info { 1097ff607e2SYang Wang int socket_id; 1107ff607e2SYang Wang int aid; 1117ff607e2SYang Wang int hwid; 1127ff607e2SYang Wang int mcatype; 1137ff607e2SYang Wang }; 1147ff607e2SYang Wang 1157ff607e2SYang Wang struct mca_bank_entry { 1167ff607e2SYang Wang int idx; 1177ff607e2SYang Wang enum amdgpu_mca_error_type type; 1187ff607e2SYang Wang enum amdgpu_mca_ip ip; 1197ff607e2SYang Wang struct mca_bank_info info; 1207ff607e2SYang Wang uint64_t regs[MCA_MAX_REGS_COUNT]; 1217ff607e2SYang Wang }; 1227ff607e2SYang Wang 12307c1db70SYang Wang struct mca_bank_node { 12407c1db70SYang Wang struct mca_bank_entry entry; 12507c1db70SYang Wang struct list_head node; 12607c1db70SYang Wang }; 12707c1db70SYang Wang 1287ff607e2SYang Wang struct amdgpu_mca_smu_funcs { 1297ff607e2SYang Wang int max_ue_count; 1307ff607e2SYang Wang int max_ce_count; 1317ff607e2SYang Wang int (*mca_set_debug_mode)(struct amdgpu_device *adev, bool enable); 13207c1db70SYang Wang int (*mca_parse_mca_error_count)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, 13307c1db70SYang Wang struct mca_bank_entry *entry, uint32_t *count); 1347ff607e2SYang Wang int (*mca_get_valid_mca_count)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, 1357ff607e2SYang Wang uint32_t *count); 1367ff607e2SYang Wang int (*mca_get_mca_entry)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, 1377ff607e2SYang Wang int idx, struct mca_bank_entry *entry); 1383907c492SJohn Clements }; 1393907c492SJohn Clements 1403907c492SJohn Clements void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, 1413907c492SJohn Clements uint64_t mc_status_addr, 1423907c492SJohn Clements unsigned long *error_count); 1433907c492SJohn Clements 1443907c492SJohn Clements void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev, 1453907c492SJohn Clements uint64_t mc_status_addr, 1463907c492SJohn Clements unsigned long *error_count); 1473907c492SJohn Clements 1483907c492SJohn Clements void amdgpu_mca_reset_error_count(struct amdgpu_device *adev, 1493907c492SJohn Clements uint64_t mc_status_addr); 1503907c492SJohn Clements 1513907c492SJohn Clements void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev, 1523907c492SJohn Clements uint64_t mc_status_addr, 1533907c492SJohn Clements void *ras_error_status); 1547f544c54SHawking Zhang int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev); 1557f544c54SHawking Zhang int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev); 1567f544c54SHawking Zhang int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev); 1577ff607e2SYang Wang 1587ff607e2SYang Wang void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs); 15976ad30f5SYang Wang int amdgpu_mca_init(struct amdgpu_device *adev); 16076ad30f5SYang Wang void amdgpu_mca_fini(struct amdgpu_device *adev); 16176ad30f5SYang Wang int amdgpu_mca_reset(struct amdgpu_device *adev); 1627ff607e2SYang Wang int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable); 16307c1db70SYang Wang int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, 16407c1db70SYang Wang enum amdgpu_mca_error_type type, uint32_t *total); 1654051844cSYang Wang void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root); 1669dc57c2aSYang Wang int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, 1679dc57c2aSYang Wang struct ras_err_data *err_data, struct ras_query_context *qctx); 16807c1db70SYang Wang 1693907c492SJohn Clements #endif 170