1 /* 2 * Copyright 2021 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 */ 23 24 #include <linux/devcoredump.h> 25 #include <generated/utsrelease.h> 26 27 #include "amdgpu_reset.h" 28 #include "aldebaran.h" 29 #include "sienna_cichlid.h" 30 #include "smu_v13_0_10.h" 31 32 const char *hw_ip_names[MAX_HWIP] = { 33 [GC_HWIP] = "GC", 34 [HDP_HWIP] = "HDP", 35 [SDMA0_HWIP] = "SDMA0", 36 [SDMA1_HWIP] = "SDMA1", 37 [SDMA2_HWIP] = "SDMA2", 38 [SDMA3_HWIP] = "SDMA3", 39 [SDMA4_HWIP] = "SDMA4", 40 [SDMA5_HWIP] = "SDMA5", 41 [SDMA6_HWIP] = "SDMA6", 42 [SDMA7_HWIP] = "SDMA7", 43 [LSDMA_HWIP] = "LSDMA", 44 [MMHUB_HWIP] = "MMHUB", 45 [ATHUB_HWIP] = "ATHUB", 46 [NBIO_HWIP] = "NBIO", 47 [MP0_HWIP] = "MP0", 48 [MP1_HWIP] = "MP1", 49 [UVD_HWIP] = "UVD/JPEG/VCN", 50 [VCN1_HWIP] = "VCN1", 51 [VCE_HWIP] = "VCE", 52 [VPE_HWIP] = "VPE", 53 [DF_HWIP] = "DF", 54 [DCE_HWIP] = "DCE", 55 [OSSSYS_HWIP] = "OSSSYS", 56 [SMUIO_HWIP] = "SMUIO", 57 [PWR_HWIP] = "PWR", 58 [NBIF_HWIP] = "NBIF", 59 [THM_HWIP] = "THM", 60 [CLK_HWIP] = "CLK", 61 [UMC_HWIP] = "UMC", 62 [RSMU_HWIP] = "RSMU", 63 [XGMI_HWIP] = "XGMI", 64 [DCI_HWIP] = "DCI", 65 [PCIE_HWIP] = "PCIE", 66 }; 67 68 int amdgpu_reset_init(struct amdgpu_device *adev) 69 { 70 int ret = 0; 71 72 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 73 case IP_VERSION(13, 0, 2): 74 case IP_VERSION(13, 0, 6): 75 ret = aldebaran_reset_init(adev); 76 break; 77 case IP_VERSION(11, 0, 7): 78 ret = sienna_cichlid_reset_init(adev); 79 break; 80 case IP_VERSION(13, 0, 10): 81 ret = smu_v13_0_10_reset_init(adev); 82 break; 83 default: 84 break; 85 } 86 87 return ret; 88 } 89 90 int amdgpu_reset_fini(struct amdgpu_device *adev) 91 { 92 int ret = 0; 93 94 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 95 case IP_VERSION(13, 0, 2): 96 case IP_VERSION(13, 0, 6): 97 ret = aldebaran_reset_fini(adev); 98 break; 99 case IP_VERSION(11, 0, 7): 100 ret = sienna_cichlid_reset_fini(adev); 101 break; 102 case IP_VERSION(13, 0, 10): 103 ret = smu_v13_0_10_reset_fini(adev); 104 break; 105 default: 106 break; 107 } 108 109 return ret; 110 } 111 112 int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev, 113 struct amdgpu_reset_context *reset_context) 114 { 115 struct amdgpu_reset_handler *reset_handler = NULL; 116 117 if (adev->reset_cntl && adev->reset_cntl->get_reset_handler) 118 reset_handler = adev->reset_cntl->get_reset_handler( 119 adev->reset_cntl, reset_context); 120 if (!reset_handler) 121 return -EOPNOTSUPP; 122 123 return reset_handler->prepare_hwcontext(adev->reset_cntl, 124 reset_context); 125 } 126 127 int amdgpu_reset_perform_reset(struct amdgpu_device *adev, 128 struct amdgpu_reset_context *reset_context) 129 { 130 int ret; 131 struct amdgpu_reset_handler *reset_handler = NULL; 132 133 if (adev->reset_cntl) 134 reset_handler = adev->reset_cntl->get_reset_handler( 135 adev->reset_cntl, reset_context); 136 if (!reset_handler) 137 return -EOPNOTSUPP; 138 139 ret = reset_handler->perform_reset(adev->reset_cntl, reset_context); 140 if (ret) 141 return ret; 142 143 return reset_handler->restore_hwcontext(adev->reset_cntl, 144 reset_context); 145 } 146 147 148 void amdgpu_reset_destroy_reset_domain(struct kref *ref) 149 { 150 struct amdgpu_reset_domain *reset_domain = container_of(ref, 151 struct amdgpu_reset_domain, 152 refcount); 153 if (reset_domain->wq) 154 destroy_workqueue(reset_domain->wq); 155 156 kvfree(reset_domain); 157 } 158 159 struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type, 160 char *wq_name) 161 { 162 struct amdgpu_reset_domain *reset_domain; 163 164 reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain), GFP_KERNEL); 165 if (!reset_domain) { 166 DRM_ERROR("Failed to allocate amdgpu_reset_domain!"); 167 return NULL; 168 } 169 170 reset_domain->type = type; 171 kref_init(&reset_domain->refcount); 172 173 reset_domain->wq = create_singlethread_workqueue(wq_name); 174 if (!reset_domain->wq) { 175 DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!"); 176 amdgpu_reset_put_reset_domain(reset_domain); 177 return NULL; 178 179 } 180 181 atomic_set(&reset_domain->in_gpu_reset, 0); 182 atomic_set(&reset_domain->reset_res, 0); 183 init_rwsem(&reset_domain->sem); 184 185 return reset_domain; 186 } 187 188 void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain) 189 { 190 atomic_set(&reset_domain->in_gpu_reset, 1); 191 down_write(&reset_domain->sem); 192 } 193 194 195 void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain) 196 { 197 atomic_set(&reset_domain->in_gpu_reset, 0); 198 up_write(&reset_domain->sem); 199 } 200 201 #ifndef CONFIG_DEV_COREDUMP 202 void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, 203 struct amdgpu_reset_context *reset_context) 204 { 205 } 206 #else 207 static ssize_t 208 amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, 209 void *data, size_t datalen) 210 { 211 struct drm_printer p; 212 struct amdgpu_coredump_info *coredump = data; 213 struct drm_print_iterator iter; 214 int i; 215 216 iter.data = buffer; 217 iter.offset = 0; 218 iter.start = offset; 219 iter.remain = count; 220 221 p = drm_coredump_printer(&iter); 222 223 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 224 drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n"); 225 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 226 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 227 drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, 228 coredump->reset_time.tv_nsec); 229 230 if (coredump->reset_task_info.pid) 231 drm_printf(&p, "process_name: %s PID: %d\n", 232 coredump->reset_task_info.process_name, 233 coredump->reset_task_info.pid); 234 235 /* GPU IP's information of the SOC */ 236 if (coredump->adev) { 237 drm_printf(&p, "\nIP Information\n"); 238 drm_printf(&p, "SOC Family: %d\n", coredump->adev->family); 239 drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id); 240 drm_printf(&p, "SOC External Revision id: %d\n", 241 coredump->adev->external_rev_id); 242 243 for (int i = 1; i < MAX_HWIP; i++) { 244 for (int j = 0; j < HWIP_MAX_INSTANCE; j++) { 245 int ver = coredump->adev->ip_versions[i][j]; 246 247 if (ver) 248 drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n", 249 hw_ip_names[i], i, j, 250 IP_VERSION_MAJ(ver), 251 IP_VERSION_MIN(ver), 252 IP_VERSION_REV(ver), 253 IP_VERSION_VARIANT(ver), 254 IP_VERSION_SUBREV(ver)); 255 } 256 } 257 } 258 259 if (coredump->ring) { 260 drm_printf(&p, "\nRing timed out details\n"); 261 drm_printf(&p, "IP Type: %d Ring Name: %s\n", 262 coredump->ring->funcs->type, 263 coredump->ring->name); 264 } 265 266 if (coredump->adev) { 267 struct amdgpu_vm_fault_info *fault_info = 268 &coredump->adev->vm_manager.fault_info; 269 270 drm_printf(&p, "\n[%s] Page fault observed\n", 271 fault_info->vmhub ? "mmhub" : "gfxhub"); 272 drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", 273 fault_info->addr); 274 drm_printf(&p, "Protection fault status register: 0x%x\n\n", 275 fault_info->status); 276 } 277 278 drm_printf(&p, "Ring buffer information\n"); 279 for (int i = 0; i < coredump->adev->num_rings; i++) { 280 int j = 0; 281 struct amdgpu_ring *ring = coredump->adev->rings[i]; 282 283 drm_printf(&p, "ring name: %s\n", ring->name); 284 drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n", 285 amdgpu_ring_get_rptr(ring), 286 amdgpu_ring_get_wptr(ring), 287 ring->buf_mask); 288 drm_printf(&p, "Ring size in dwords: %d\n", 289 ring->ring_size / 4); 290 drm_printf(&p, "Ring contents\n"); 291 drm_printf(&p, "Offset \t Value\n"); 292 293 while (j < ring->ring_size) { 294 drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j/4]); 295 j += 4; 296 } 297 } 298 299 if (coredump->reset_vram_lost) 300 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 301 if (coredump->adev->reset_info.num_regs) { 302 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 303 304 for (i = 0; i < coredump->adev->reset_info.num_regs; i++) 305 drm_printf(&p, "0x%08x: 0x%08x\n", 306 coredump->adev->reset_info.reset_dump_reg_list[i], 307 coredump->adev->reset_info.reset_dump_reg_value[i]); 308 } 309 310 return count - iter.remain; 311 } 312 313 static void amdgpu_devcoredump_free(void *data) 314 { 315 kfree(data); 316 } 317 318 void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, 319 struct amdgpu_reset_context *reset_context) 320 { 321 struct amdgpu_coredump_info *coredump; 322 struct drm_device *dev = adev_to_drm(adev); 323 struct amdgpu_job *job = reset_context->job; 324 struct drm_sched_job *s_job; 325 326 coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); 327 328 if (!coredump) { 329 DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__); 330 return; 331 } 332 333 coredump->reset_vram_lost = vram_lost; 334 335 if (reset_context->job && reset_context->job->vm) { 336 struct amdgpu_task_info *ti; 337 struct amdgpu_vm *vm = reset_context->job->vm; 338 339 ti = amdgpu_vm_get_task_info_vm(vm); 340 if (ti) { 341 coredump->reset_task_info = *ti; 342 amdgpu_vm_put_task_info(ti); 343 } 344 } 345 346 if (job) { 347 s_job = &job->base; 348 coredump->ring = to_amdgpu_ring(s_job->sched); 349 } 350 351 coredump->adev = adev; 352 353 ktime_get_ts64(&coredump->reset_time); 354 355 dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, 356 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 357 } 358 #endif 359