1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 #include "amdgpu_virt.h" 77 #include "amdgpu_dev_coredump.h" 78 79 #include <linux/suspend.h> 80 #include <drm/task_barrier.h> 81 #include <linux/pm_runtime.h> 82 83 #include <drm/drm_drv.h> 84 85 #if IS_ENABLED(CONFIG_X86) 86 #include <asm/intel-family.h> 87 #endif 88 89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 96 97 #define 
AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
/* Fallback PCIE index/data register pair, used when NBIO callbacks are not yet installed */
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

static const struct drm_driver amdgpu_kms_driver;

/* Human-readable ASIC names; presumably indexed by the ASIC-type enum — TODO confirm ordering */
const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs)
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		amdgpu_device_get_pcie_replay_count, NULL);

/*
 * Binary sysfs read handler for reg_state: @ppos selects which register
 * class (XGMI/WAFL/PCIE/USR/USR_1) is dumped into @buf.
 */
static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

/* Create the reg_state binary sysfs file; no-op on ASICs without reg state support */
int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

/* Remove the reg_state file; mirrors the support check in init so removal matches creation */
void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"		- PCIE CEM card
 * - "oam"		- Open Compute Accelerator Module
 * - "unknown"	- Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	/* default to CEM when the SMUIO callback is not available */
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	/* hide the board_info attribute on APUs */
	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	/* Smart Shift requires both BOCO and ACPI power-shift control */
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	/* bail out if the device is already unplugged */
	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	/* MM_INDEX/MM_DATA only works on dword-aligned offsets and sizes */
	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		/* only reprogram MM_INDEX_HI when the upper bits change */
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	/* only the CPU-visible portion of VRAM can go through the aperture */
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* using MM to access rest vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		/* under SR-IOV runtime, go through KIQ unless a reset holds the domain */
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		/* offset beyond the MMIO BAR: use the indirect PCIE path */
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset:bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	/* no indirect fallback for byte access — out-of-range offset is fatal */
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		/* prefer the RLCG path when the VF supports it for this register */
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset:bytes offset from MMIO start
 * @value: the value want to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		/* under SR-IOV runtime, go through KIQ unless a reset holds the domain */
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		/* prefer the RLCG path when the VF supports it for this register */
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	/* read back so the index write lands before touching the data port */
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	/* fall back to fixed offsets when NBIO callbacks are not yet installed */
	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	/* the high index register is only needed for addresses above 4GB */
	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data =
adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* the high index register is only needed for addresses above 4GB */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	/* read back so the index write lands before touching the data port */
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* the high index register is only needed for addresses above 4GB */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem
*)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* the high index register is only needed for addresses above 4GB */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * A call means a register callback table was left unpopulated, which is
 * a driver bug, so log and BUG().
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/* 64 bit register offset variant of amdgpu_invalid_rreg() */
static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/* 64 bit register offset variant of amdgpu_invalid_wreg() */
static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/* 64 bit register offset variant of amdgpu_invalid_rreg64() */
static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/* 64 bit register offset variant of amdgpu_invalid_wreg64() */
static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
1190 */ 1191 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1192 uint32_t block, uint32_t reg) 1193 { 1194 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1195 reg, block); 1196 BUG(); 1197 return 0; 1198 } 1199 1200 /** 1201 * amdgpu_block_invalid_wreg - dummy reg write function 1202 * 1203 * @adev: amdgpu_device pointer 1204 * @block: offset of instance 1205 * @reg: offset of register 1206 * @v: value to write to the register 1207 * 1208 * Dummy register read function. Used for register blocks 1209 * that certain asics don't have (all asics). 1210 */ 1211 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1212 uint32_t block, 1213 uint32_t reg, uint32_t v) 1214 { 1215 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1216 reg, block, v); 1217 BUG(); 1218 } 1219 1220 /** 1221 * amdgpu_device_asic_init - Wrapper for atom asic_init 1222 * 1223 * @adev: amdgpu_device pointer 1224 * 1225 * Does any asic specific work and then calls atom asic init. 1226 */ 1227 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1228 { 1229 int ret; 1230 1231 amdgpu_asic_pre_asic_init(adev); 1232 1233 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1234 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1235 amdgpu_psp_wait_for_bootloader(adev); 1236 ret = amdgpu_atomfirmware_asic_init(adev, true); 1237 return ret; 1238 } else { 1239 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1240 } 1241 1242 return 0; 1243 } 1244 1245 /** 1246 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1247 * 1248 * @adev: amdgpu_device pointer 1249 * 1250 * Allocates a scratch page of VRAM for use by various things in the 1251 * driver. 
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	/* GTT is allowed as a fallback domain if VRAM placement fails */
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array, laid out as triples of
 *             (register offset, and_mask, or_mask)
 * @array_size: size of the register array (must be a multiple of 3)
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	/* silently ignore malformed tables */
	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			/* full-mask entries overwrite the register outright */
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			/* AI+ parts only set bits covered by the and_mask */
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers,etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	/* idempotent: a second call with an existing BO is a no-op */
	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index (dw offset, as returned by amdgpu_device_wb_get())
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3; /* convert dw offset back to slot index */
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the size we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	/* resizing above 4GB needs 64 bit physical addressing */
	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space,please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both.
	 */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	/* re-enable memory decoding */
	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/* Returns false for APU parts that have no dedicated video BIOS to read */
static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	/* host posts for us under SR-IOV */
	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			/* SMC version lives at dword 69 of the firmware image */
			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	/* a sane memsize means the asic is already posted */
	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	/* module parameter overrides the heuristic below */
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	/* module parameter: -1 = auto, 0 = off, 1 = on */
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		/* -1 means "use the driver default" */
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

/*
 * Validate the amdgpu_smu_memory_pool_size module parameter against the
 * host memory size and record the result in adev->pm.smu_prv_buffer_size
 * (0 disables the pool).
 */
static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	/* pool sizes of 1/2 GB need >= 3GB of RAM, 4/8 GB need >= 7GB */
	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	/* parameter is in units of 256MB (1 << 28) */
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("No enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/*
 * Derive per-APU identity flags (adev->apu_flags) from the asic type and
 * PCI device id.  Always returns 0.
 */
static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	/* out-of-range values are clamped or reset to defaults with a
	 * warning rather than failing the load
	 */
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	/* PX platforms handle the power state via runtime PM instead */
	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check of the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		/* the hook is optional */
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		/* the hook is optional */
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the request hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			/* only the first matching valid block is waited on */
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if it the IP is idle, false if not.
 * Also returns true if no valid block of @block_type is present.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
2113 */ 2114 struct amdgpu_ip_block * 2115 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2116 enum amd_ip_block_type type) 2117 { 2118 int i; 2119 2120 for (i = 0; i < adev->num_ip_blocks; i++) 2121 if (adev->ip_blocks[i].version->type == type) 2122 return &adev->ip_blocks[i]; 2123 2124 return NULL; 2125 } 2126 2127 /** 2128 * amdgpu_device_ip_block_version_cmp 2129 * 2130 * @adev: amdgpu_device pointer 2131 * @type: enum amd_ip_block_type 2132 * @major: major version 2133 * @minor: minor version 2134 * 2135 * return 0 if equal or greater 2136 * return 1 if smaller or the ip_block doesn't exist 2137 */ 2138 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2139 enum amd_ip_block_type type, 2140 u32 major, u32 minor) 2141 { 2142 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2143 2144 if (ip_block && ((ip_block->version->major > major) || 2145 ((ip_block->version->major == major) && 2146 (ip_block->version->minor >= minor)))) 2147 return 0; 2148 2149 return 1; 2150 } 2151 2152 /** 2153 * amdgpu_device_ip_block_add 2154 * 2155 * @adev: amdgpu_device pointer 2156 * @ip_block_version: pointer to the IP to add 2157 * 2158 * Adds the IP block driver information to the collection of IPs 2159 * on the asic. 
2160 */ 2161 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2162 const struct amdgpu_ip_block_version *ip_block_version) 2163 { 2164 if (!ip_block_version) 2165 return -EINVAL; 2166 2167 switch (ip_block_version->type) { 2168 case AMD_IP_BLOCK_TYPE_VCN: 2169 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2170 return 0; 2171 break; 2172 case AMD_IP_BLOCK_TYPE_JPEG: 2173 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2174 return 0; 2175 break; 2176 default: 2177 break; 2178 } 2179 2180 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2181 ip_block_version->funcs->name); 2182 2183 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2184 2185 return 0; 2186 } 2187 2188 /** 2189 * amdgpu_device_enable_virtual_display - enable virtual display feature 2190 * 2191 * @adev: amdgpu_device pointer 2192 * 2193 * Enabled the virtual display feature if the user has enabled it via 2194 * the module parameter virtual_display. This feature provides a virtual 2195 * display hardware on headless boards or in virtualized environments. 2196 * This function parses and validates the configuration string specified by 2197 * the user and configues the virtual display configuration (number of 2198 * virtual connectors, crtcs, etc.) specified. 
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		/* option format: "<pci address>[,<num crtc>];<pci address>...;" or "all" */
		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				/* pciaddname_tmp now points at the optional crtc count */
				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				/* clamp to [1, 6]; fall back to 1 crtc on parse error */
				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_set_sriov_virtual_display - force virtual display for SR-IOV VFs
 *
 * @adev: amdgpu_device pointer
 *
 * On SR-IOV virtual functions that did not already request a virtual
 * display via the module parameter, enable a single-crtc virtual display.
 */
void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	/* nothing to do when an IP discovery table is already available */
	if (adev->mman.discovery_bin)
		return 0;

	/* only a fixed set of legacy asics ships a gpu_info firmware */
	switch (adev->asic_type) {
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
	if (err) {
		dev_err(adev->dev,
			"Failed to get gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		/* v1.1 adds num_sc_per_sh and num_packer_per_sc */
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
	struct pci_dev *parent;
	int i, r;
	bool total;

	amdgpu_device_enable_virtual_display(adev);

	/* under SR-IOV, exclusive access to the GPU is needed for init */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	/* determine the asic family and register its IP blocks */
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
		adev->family = AMDGPU_FAMILY_SI;
		r = si_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_KV;
		else
			adev->family = AMDGPU_FAMILY_CI;

		r = cik_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_CZ;
		else
			adev->family = AMDGPU_FAMILY_VI;

		r = vi_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	default:
		/* soc15 and newer asics use the IP discovery table */
		r = amdgpu_discovery_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	}

	/* dGPUs with ATPX power control are flagged as PX (power express) */
	if (amdgpu_has_atpx() &&
	    (amdgpu_is_atpx_hybrid() ||
	     amdgpu_has_atpx_dgpu_power_cntl()) &&
	    ((adev->flags & AMD_IS_APU) == 0) &&
	    !dev_is_removable(&adev->pdev->dev))
		adev->flags |= AMD_IS_PX;

	if (!(adev->flags & AMD_IS_APU)) {
		parent = pcie_find_root_port(adev->pdev);
		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
	}


	/* mask out powerplay features that are unsupported in this config */
	adev->pm.pp_feature = amdgpu_pp_feature_mask;
	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
	if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
		adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;

	total = true;
	for (i = 0; i < adev->num_ip_blocks; i++) {
		/* amdgpu_ip_block_mask is a debug module param to disable blocks */
		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
			DRM_WARN("disabled ip block: %d <%s>\n",
				 i, adev->ip_blocks[i].version->funcs->name);
			adev->ip_blocks[i].status.valid = false;
		} else {
			if (adev->ip_blocks[i].version->funcs->early_init) {
				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
				if (r == -ENOENT) {
					/* -ENOENT means the block opted out; not fatal */
					adev->ip_blocks[i].status.valid = false;
				} else if (r) {
					DRM_ERROR("early_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					total = false;
				} else {
					adev->ip_blocks[i].status.valid = true;
				}
			} else {
				adev->ip_blocks[i].status.valid = true;
			}
		}
		/* get the vbios after the asic_funcs are set up */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			r = amdgpu_device_parse_gpu_info_fw(adev);
			if (r)
				return r;

			/* Read BIOS */
			if (amdgpu_device_read_bios(adev)) {
				if (!amdgpu_get_bios(adev))
					return -EINVAL;

				r = amdgpu_atombios_init(adev);
				if (r) {
					dev_err(adev->dev, "amdgpu_atombios_init failed\n");
					amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
					return r;
				}
			}

			/*get pf2vf msg info at it's earliest time*/
			if (amdgpu_sriov_vf(adev))
				amdgpu_virt_init_data_exchange(adev);

		}
	}
	if (!total)
		return -ENODEV;

	amdgpu_amdkfd_device_probe(adev);
	adev->cg_flags &= amdgpu_cg_mask;
	adev->pg_flags &= amdgpu_pg_mask;

	return 0;
}

/**
 * amdgpu_device_ip_hw_init_phase1 - first hw_init pass
 *
 * @adev: amdgpu_device pointer
 *
 * Brings up only the blocks everything else depends on: COMMON, IH
 * and, on SR-IOV, PSP.  All other blocks are handled by phase 2.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
			if (r) {
				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_hw_init_phase2 - second hw_init pass
 *
 * @adev: amdgpu_device pointer
 *
 * Runs hw_init for every remaining IP block that has completed sw_init
 * but is not yet brought up in hardware (phase 1 blocks are skipped via
 * their status.hw flag).
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
		if (r) {
			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/**
 * amdgpu_device_fw_loading - bring up PSP and load SMU firmware
 *
 * @adev: amdgpu_device pointer
 *
 * On Vega10 and newer, brings up the PSP IP block (resume on the
 * reset/suspend paths, hw_init otherwise) so microcode can be loaded,
 * then loads SMU firmware where the driver is responsible for it.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
	int r = 0;
	int i;
	uint32_t smu_version;

	if (adev->asic_type >= CHIP_VEGA10) {
		for (i = 0; i < adev->num_ip_blocks; i++) {
			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
				continue;

			if (!adev->ip_blocks[i].status.sw)
				continue;

			/* no need to do the fw loading again if already done*/
			if (adev->ip_blocks[i].status.hw == true)
				break;

			if (amdgpu_in_reset(adev) || adev->in_suspend) {
				r = adev->ip_blocks[i].version->funcs->resume(adev);
				if (r) {
					DRM_ERROR("resume of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				}
			} else {
				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
				if (r) {
					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				}
			}

			adev->ip_blocks[i].status.hw = true;
			break;
		}
	}

	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);

	return r;
}

/**
 * amdgpu_device_init_schedulers - set up the DRM GPU schedulers
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes a drm_sched instance for every ring that needs one,
 * using the per-ring-type timeout, and creates the UVD/VCE scheduling
 * entities for the ring.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
{
	long timeout;
	int r, i;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		/* No need to setup the GPU scheduler for rings that don't need it */
		if (!ring || ring->no_scheduler)
			continue;

		switch (ring->funcs->type) {
		case AMDGPU_RING_TYPE_GFX:
			timeout = adev->gfx_timeout;
			break;
		case AMDGPU_RING_TYPE_COMPUTE:
			timeout = adev->compute_timeout;
			break;
		case AMDGPU_RING_TYPE_SDMA:
			timeout = adev->sdma_timeout;
			break;
		default:
			timeout = adev->video_timeout;
			break;
		}

		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
				   DRM_SCHED_PRIORITY_COUNT,
				   ring->num_hw_submission, 0,
				   timeout, adev->reset_domain->wq,
				   ring->sched_score, ring->name,
				   adev->dev);
		if (r) {
			DRM_ERROR("Failed to create scheduler on ring %s.\n",
				  ring->name);
			return r;
		}
		r = amdgpu_uvd_entity_init(adev, ring);
		if (r) {
			DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
				  ring->name);
			return r;
		}
		r = amdgpu_vce_entity_init(adev, ring);
		if (r) {
			DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
				  ring->name);
			return r;
		}
	}

	amdgpu_xcp_update_partition_sched_list(adev);

	return 0;
}


/**
 * amdgpu_device_ip_init - run init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
 * are run. sw_init initializes the software state associated with each IP
 * and hw_init initializes the hardware associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
		if (r) {
			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			goto init_failed;
		}
		adev->ip_blocks[i].status.sw = true;

		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* need to do common hw init early so everything is set up for gmc */
			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;
		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			/* need to do gmc hw init early so we can allocate gpu mem */
			/* Try to reserve bad pages early */
			if (amdgpu_sriov_vf(adev))
				amdgpu_virt_exchange_data(adev);

			r = amdgpu_device_mem_scratch_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (adev->gfx.mcbp) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM |
							       AMDGPU_GEM_DOMAIN_GTT,
							       AMDGPU_CSA_SIZE);
				if (r) {
					DRM_ERROR("allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}

			r = amdgpu_seq64_init(adev);
			if (r) {
				DRM_ERROR("allocate seq64 failed %d\n", r);
				goto init_failed;
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * retired pages will be loaded from eeprom and reserved here,
	 * it should be called after amdgpu_device_ip_hw_init_phase2 since
	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
	 * for I2C communication which only true at this point.
	 *
	 * amdgpu_ras_recovery_init may fail, but the upper only cares the
	 * failure from bad gpu situation and stop amdgpu init process
	 * accordingly. For other failed cases, it will still release all
	 * the resource and print error message, rather than returning one
	 * negative value to upper level.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired page from abusing
	 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	/*
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if whole hive need to be reset during init */
	if (!adev->gmc.xgmi.pending_reset) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM is lost or not.
 * returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run.
 * Late initialization pass enabling clockgating for hardware IPs.
 * Fini or suspend, pass disabling clockgating for hardware IPs.
 * Returns 0 on success, negative error code on failure.
 */

int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
			       enum amd_clockgating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		/* gate in block order, ungate in reverse block order */
		i = state == AMD_CG_STATE_GATE ?
			j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
										     state);
			if (r) {
				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}

	return 0;
}

/**
 * amdgpu_device_set_pg_state - set powergating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: powergating state (gate or ungate)
 *
 * Powergating counterpart of amdgpu_device_set_cg_state(): walks the IP
 * blocks and runs their set_powergating_state callbacks.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
			       enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		/* gate in block order, ungate in reverse block order */
		i = state == AMD_PG_STATE_GATE ?
			j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
										     state);
			if (r) {
				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}
	return 0;
}

/**
 * amdgpu_device_enable_mgpu_fan_boost - enable fan boost on multi-dGPU systems
 *
 * Walks the global mgpu_info list and enables the DPM fan boost feature
 * on every dGPU instance that does not have it enabled yet.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass
 * for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of them have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, it's unknown how many devices are in the hive in advance.
		 * As this is counted one by one during devices initializations.
		 *
		 * So, we wait for all XGMI interlinked devices initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
		 */
		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
			for (i = 0; i < mgpu_info.num_gpu; i++) {
				gpu_instance = &(mgpu_info.gpu_ins[i]);
				if (gpu_instance->adev->flags & AMD_IS_APU)
					continue;

				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
							   AMDGPU_XGMI_PSTATE_MIN);
				if (r) {
					DRM_ERROR("pstate setting failed (%d).\n", r);
					break;
				}
			}
		}

		mutex_unlock(&mgpu_info.mutex);
	}

	return 0;
}

/**
 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
 *
 * @adev: amdgpu_device pointer
 *
 * For ASICs that need to disable the SMC first (GC 9.0.0 and older),
 * run the SMC block's hw_fini before the generic hw_fini loop.
 */
static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
{
	int i, r;

	/* only asics with GC 9.0.0 or older need this ordering workaround */
	if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
		return;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
			/* XXX handle errors */
			if (r) {
				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
			}
			adev->ip_blocks[i].status.hw = false;
			break;
		}
	}
}

/**
 * amdgpu_device_ip_fini_early - run the early fini pass for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Runs each IP block's early_fini callback, ungates power/clock gating,
 * suspends KFD and tears down the hardware state of every IP block in
 * reverse order (SMC first where required).  On SR-IOV, releases
 * exclusive GPU access at the end.  Returns 0.
 */
static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].version->funcs->early_fini)
			continue;

		r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
		if (r) {
			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}
	}

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	amdgpu_amdkfd_suspend(adev, false);

	/* Workaround for ASICs that need to disable SMC first */
	amdgpu_device_smu_fini_early(adev);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.hw)
			continue;

		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
		/* XXX handle errors */
		if (r) {
			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}

		adev->ip_blocks[i].status.hw = false;
	}

	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_virt_release_full_gpu(adev, false))
			DRM_ERROR("failed to release exclusive mode on fini\n");
	}

	return 0;
}

/**
 * amdgpu_device_ip_fini - run fini for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main teardown pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
 * are run. hw_fini tears down the hardware associated with each IP
 * and sw_fini tears down any software state associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
{
	int i, r;

	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
		amdgpu_virt_release_ras_err_handler_data(adev);

	/* leave the XGMI hive before tearing down per-device state */
	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_remove_device(adev);

	amdgpu_amdkfd_device_fini_sw(adev);

	/* sw_fini in reverse order of initialization */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.sw)
			continue;

		/* free GART/GMC-backed resources just before the GMC block goes away */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			amdgpu_ucode_free_bo(adev);
			amdgpu_free_static_csa(&adev->virt.csa_obj);
			amdgpu_device_wb_fini(adev);
			amdgpu_device_mem_scratch_fini(adev);
			amdgpu_ib_pool_fini(adev);
			amdgpu_seq64_fini(adev);
		}

		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
		/* XXX handle errors */
		if (r) {
			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}
		adev->ip_blocks[i].status.sw = false;
		adev->ip_blocks[i].status.valid = false;
	}

	/* optional late_fini for blocks that completed late_init */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_fini)
			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
		adev->ip_blocks[i].status.late_initialized = false;
	}

	amdgpu_ras_fini(adev);

	return 0;
}

/**
 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
 *
 * @work: work_struct.
 */
static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, delayed_init_work.work);
	int r;

	/* exercise every ring with a test IB; failures are only logged */
	r = amdgpu_ib_ring_tests(adev);
	if (r)
		DRM_ERROR("ib ring test failed (%d).\n", r);
}

/*
 * amdgpu_device_delay_enable_gfx_off - delayed-work handler that actually
 * enables GFXOFF once the request count has dropped to zero.
 */
static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);

	/* by the time this runs, gfxoff must be off and unreferenced */
	WARN_ON_ONCE(adev->gfx.gfx_off_state);
	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);

	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
		adev->gfx.gfx_off_state = true;
}

/**
 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Phase 1 handles only the display (DCE) blocks.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	/*
	 * Per PMFW team's suggestion, driver needs to handle gfxoff
	 * and df cstate features disablement for gpu reset (e.g. Mode1Reset)
	 * scenario. Add the missing df cstate disablement here.
	 */
	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
		dev_warn(adev->dev, "Failed to disallow df cstate");

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;

		/* displays are handled separately; skip everything else */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
			continue;

		/* XXX handle errors */
		r = adev->ip_blocks[i].version->funcs->suspend(adev);
		if (r) {
			DRM_ERROR("suspend of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}

		adev->ip_blocks[i].status.hw = false;
	}

	return 0;
}

/**
 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Phase 2 handles all non-display blocks.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
{
	int i, r;

	/* tell the SMU we are entering D3 for S0ix */
	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* displays are handled in phase1 */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
			continue;
		/* PSP lost connection when err_event_athub occurs */
		if (amdgpu_ras_intr_triggered() &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
			adev->ip_blocks[i].status.hw = false;
			continue;
		}

		/* skip unnecessary suspend if we do not initialize them yet */
		if (adev->gmc.xgmi.pending_reset &&
		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
			adev->ip_blocks[i].status.hw = false;
			continue;
		}

		/* skip suspend of gfx/mes and psp for S0ix
		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
		 * like at runtime. PSP is also part of the always on hardware
		 * so no need to suspend it.
		 */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
			continue;

		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
		if (adev->in_s0ix &&
		    (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
		     IP_VERSION(5, 0, 0)) &&
		    (adev->ip_blocks[i].version->type ==
		     AMD_IP_BLOCK_TYPE_SDMA))
			continue;

		/* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
		 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
		 * from this location and RLC Autoload automatically also gets loaded
		 * from here based on PMFW -> PSP message during re-init sequence.
		 * Therefore, the psp suspend & resume should be skipped to avoid destroy
		 * the TMR and reload FWs again for IMU enabled APU ASICs.
		 */
		if (amdgpu_in_reset(adev) &&
		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;

		/* XXX handle errors */
		r = adev->ip_blocks[i].version->funcs->suspend(adev);
		if (r) {
			DRM_ERROR("suspend of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}
		adev->ip_blocks[i].status.hw = false;
		/* handle putting the SMC in the appropriate state */
		if (!amdgpu_sriov_vf(adev)) {
			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
				if (r) {
					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
						  adev->mp1_state, r);
					return r;
				}
			}
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_suspend - run suspend for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
	int r;

	/* under SR-IOV, stop data exchange and take exclusive GPU access */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_fini_data_exchange(adev);
		amdgpu_virt_request_full_gpu(adev, false);
	}

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (r)
		return r;
	r = amdgpu_device_ip_suspend_phase2(adev);

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return r;
}

/*
 * amdgpu_device_ip_reinit_early_sriov - re-init the early IP blocks for
 * SR-IOV recovery (COMMON, GMC, PSP, IH), in that fixed order.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
{
	int i, r;

	static enum amd_ip_block_type ip_order[] = {
		AMD_IP_BLOCK_TYPE_COMMON,
		AMD_IP_BLOCK_TYPE_GMC,
		AMD_IP_BLOCK_TYPE_PSP,
		AMD_IP_BLOCK_TYPE_IH,
	};

	for (i = 0; i < adev->num_ip_blocks; i++) {
		int j;
		struct amdgpu_ip_block *block;

		block = &adev->ip_blocks[i];
		/* mark everything down first; blocks not in ip_order stay down */
		block->status.hw = false;

		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {

			if (block->version->type != ip_order[j] ||
				!block->status.valid)
				continue;

			r = block->version->funcs->hw_init(adev);
			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}

/*
 * amdgpu_device_ip_reinit_late_sriov - re-init the remaining IP blocks for
 * SR-IOV recovery in a fixed order (SMC first, then display/engines).
 * SMC is resumed rather than hw_init'ed; everything else is hw_init'ed.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
{
	int i, r;

	static enum amd_ip_block_type ip_order[] = {
		AMD_IP_BLOCK_TYPE_SMC,
		AMD_IP_BLOCK_TYPE_DCE,
		AMD_IP_BLOCK_TYPE_GFX,
		AMD_IP_BLOCK_TYPE_SDMA,
		AMD_IP_BLOCK_TYPE_MES,
		AMD_IP_BLOCK_TYPE_UVD,
		AMD_IP_BLOCK_TYPE_VCE,
		AMD_IP_BLOCK_TYPE_VCN,
		AMD_IP_BLOCK_TYPE_JPEG
	};

	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
		int j;
		struct amdgpu_ip_block *block;

		for (j = 0; j < adev->num_ip_blocks; j++) {
			block = &adev->ip_blocks[j];

			if (block->version->type != ip_order[i] ||
				!block->status.valid ||
				block->status.hw)
				continue;

			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
				r = block->version->funcs->resume(adev);
			else
				r = block->version->funcs->hw_init(adev);

			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * COMMON, GMC, and IH. resume puts the hardware into a functional state
 * after a suspend and updates the software state as necessary. This
 * function is also used for restoring the GPU after a GPU reset.
 * Returns 0 on success, negative error code on failure.
3543 */ 3544 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3545 { 3546 int i, r; 3547 3548 for (i = 0; i < adev->num_ip_blocks; i++) { 3549 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3550 continue; 3551 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3552 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3553 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3554 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3555 3556 r = adev->ip_blocks[i].version->funcs->resume(adev); 3557 if (r) { 3558 DRM_ERROR("resume of IP block <%s> failed %d\n", 3559 adev->ip_blocks[i].version->funcs->name, r); 3560 return r; 3561 } 3562 adev->ip_blocks[i].status.hw = true; 3563 } 3564 } 3565 3566 return 0; 3567 } 3568 3569 /** 3570 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3571 * 3572 * @adev: amdgpu_device pointer 3573 * 3574 * First resume function for hardware IPs. The list of all the hardware 3575 * IPs that make up the asic is walked and the resume callbacks are run for 3576 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3577 * functional state after a suspend and updates the software state as 3578 * necessary. This function is also used for restoring the GPU after a GPU 3579 * reset. 3580 * Returns 0 on success, negative error code on failure. 
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		/* COMMON/GMC/IH/PSP were already handled in phase 1 */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = adev->ip_blocks[i].version->funcs->resume(adev);
		if (r) {
			DRM_ERROR("resume of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into two resume functions because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be take between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	/* firmware must be loaded between phase 1 and phase 2 */
	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		/* the query differs between atomfirmware and legacy atombios */
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructre) support for an asic.
 * returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
#if defined(CONFIG_DRM_AMD_DC_SI)
		/* opt-in only: DC must be explicitly requested on SI */
		return amdgpu_dc > 0;
#else
		return false;
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
		return amdgpu_dc > 0;
	default:
		return amdgpu_dc != 0;
#else
	default:
		if (amdgpu_dc > 0)
			DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
		return false;
#endif
	}
}

/**
 * amdgpu_device_has_dc_support - check if dc is supported
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true for supported, false for not supported
 */
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
{
	/* no DC with virtual display or when the display IP is harvested */
	if (adev->enable_virtual_display ||
	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
		return false;

	return amdgpu_device_asic_has_dc_support(adev->asic_type);
}

/*
 * amdgpu_device_xgmi_reset_func - per-device work item that resets one
 * member of an XGMI hive in lockstep with the rest of the hive.
 */
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
	struct amdgpu_device *adev =
		container_of(__work, struct amdgpu_device, xgmi_reset_work);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

	/* It's a bug to not have a hive within this function */
	if (WARN_ON(!hive))
		return;

	/*
	 * Use task barrier to synchronize all xgmi reset works across the
	 * hive. task_barrier_enter and task_barrier_exit will block
	 * until all the threads running the xgmi reset works reach
	 * those points. task_barrier_full will do both blocks.
	 */
	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

		task_barrier_enter(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));

		if (adev->asic_reset_res)
			goto fail;

		task_barrier_exit(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));

		if (adev->asic_reset_res)
			goto fail;

		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
	} else {

		task_barrier_full(&hive->tb);
		adev->asic_reset_res = amdgpu_asic_reset(adev);
	}

fail:
	if (adev->asic_reset_res)
		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
			 adev->asic_reset_res, adev_to_drm(adev)->unique);
	amdgpu_put_xgmi_hive(hive);
}

/*
 * amdgpu_device_get_job_timeout_settings - parse the lockup_timeout module
 * parameter ("gfx[,compute[,sdma[,video]]]", msecs; 0 keeps the default,
 * negative disables the timeout) into the per-engine job timeouts.
 * Returns 0 on success, or the kstrtol() error for malformed input.
 */
static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
{
	char *input = amdgpu_lockup_timeout;
	char *timeout_setting = NULL;
	int index = 0;
	long timeout;
	int ret = 0;

	/*
	 * By default the timeout for non-compute jobs is 10000 ms.
	 * Compute jobs default to 60000 ms on bare metal and in SR-IOV
	 * one-VF mode; other SR-IOV configurations use 10000 ms.
	 */
	adev->gfx_timeout = msecs_to_jiffies(10000);
	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
	if (amdgpu_sriov_vf(adev))
		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
	else
		adev->compute_timeout =  msecs_to_jiffies(60000);

	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
		while ((timeout_setting = strsep(&input, ",")) &&
		       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
			ret = kstrtol(timeout_setting, 0, &timeout);
			if (ret)
				return ret;

			if (timeout == 0) {
				/* 0 means keep the default for this slot */
				index++;
				continue;
			} else if (timeout < 0) {
				timeout = MAX_SCHEDULE_TIMEOUT;
				dev_warn(adev->dev, "lockup timeout disabled");
				add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
			} else {
				timeout = msecs_to_jiffies(timeout);
			}

			switch (index++) {
			case 0:
				adev->gfx_timeout = timeout;
				break;
			case 1:
				adev->compute_timeout = timeout;
				break;
			case 2:
				adev->sdma_timeout = timeout;
				break;
			case 3:
				adev->video_timeout = timeout;
				break;
			default:
				break;
			}
		}
		/*
		 * There is only one value specified and
		 * it should apply to all non-compute jobs.
		 */
		if (index == 1) {
			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
				adev->compute_timeout = adev->gfx_timeout;
		}
	}

	return ret;
}

/**
 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
 *
 * @adev: amdgpu_device pointer
 *
 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
 */
static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
		adev->ram_is_direct_mapped = true;
}

/* sysfs attributes installed on the device during init */
static const struct attribute *amdgpu_dev_attributes[] = {
	&dev_attr_pcie_replay_count.attr,
	NULL
};

/*
 * amdgpu_device_set_mcbp - resolve the mid-command-buffer-preemption state
 * from the module parameter; SR-IOV always forces MCBP on.
 */
static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
{
	if (amdgpu_mcbp == 1)
		adev->gfx.mcbp = true;
	else if (amdgpu_mcbp == 0)
		adev->gfx.mcbp = false;

	if (amdgpu_sriov_vf(adev))
		adev->gfx.mcbp = true;

	if (adev->gfx.mcbp)
		DRM_INFO("MCBP is enabled\n");
}

/**
 * amdgpu_device_init - initialize the driver
 *
 * @adev: amdgpu_device pointer
 * @flags: driver flags
 *
 * Initializes the driver info and hw (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver startup.
3893 */ 3894 int amdgpu_device_init(struct amdgpu_device *adev, 3895 uint32_t flags) 3896 { 3897 struct drm_device *ddev = adev_to_drm(adev); 3898 struct pci_dev *pdev = adev->pdev; 3899 int r, i; 3900 bool px = false; 3901 u32 max_MBps; 3902 int tmp; 3903 3904 adev->shutdown = false; 3905 adev->flags = flags; 3906 3907 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3908 adev->asic_type = amdgpu_force_asic_type; 3909 else 3910 adev->asic_type = flags & AMD_ASIC_MASK; 3911 3912 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3913 if (amdgpu_emu_mode == 1) 3914 adev->usec_timeout *= 10; 3915 adev->gmc.gart_size = 512 * 1024 * 1024; 3916 adev->accel_working = false; 3917 adev->num_rings = 0; 3918 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3919 adev->mman.buffer_funcs = NULL; 3920 adev->mman.buffer_funcs_ring = NULL; 3921 adev->vm_manager.vm_pte_funcs = NULL; 3922 adev->vm_manager.vm_pte_num_scheds = 0; 3923 adev->gmc.gmc_funcs = NULL; 3924 adev->harvest_ip_mask = 0x0; 3925 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3926 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3927 3928 adev->smc_rreg = &amdgpu_invalid_rreg; 3929 adev->smc_wreg = &amdgpu_invalid_wreg; 3930 adev->pcie_rreg = &amdgpu_invalid_rreg; 3931 adev->pcie_wreg = &amdgpu_invalid_wreg; 3932 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3933 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3934 adev->pciep_rreg = &amdgpu_invalid_rreg; 3935 adev->pciep_wreg = &amdgpu_invalid_wreg; 3936 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3937 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3938 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3939 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3940 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3941 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3942 adev->didt_rreg = &amdgpu_invalid_rreg; 3943 adev->didt_wreg = &amdgpu_invalid_wreg; 3944 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3945 
adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3946 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3947 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3948 3949 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3950 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3951 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3952 3953 /* mutex initialization are all done here so we 3954 * can recall function without having locking issues 3955 */ 3956 mutex_init(&adev->firmware.mutex); 3957 mutex_init(&adev->pm.mutex); 3958 mutex_init(&adev->gfx.gpu_clock_mutex); 3959 mutex_init(&adev->srbm_mutex); 3960 mutex_init(&adev->gfx.pipe_reserve_mutex); 3961 mutex_init(&adev->gfx.gfx_off_mutex); 3962 mutex_init(&adev->gfx.partition_mutex); 3963 mutex_init(&adev->grbm_idx_mutex); 3964 mutex_init(&adev->mn_lock); 3965 mutex_init(&adev->virt.vf_errors.lock); 3966 hash_init(adev->mn_hash); 3967 mutex_init(&adev->psp.mutex); 3968 mutex_init(&adev->notifier_lock); 3969 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3970 mutex_init(&adev->benchmark_mutex); 3971 3972 amdgpu_device_init_apu_flags(adev); 3973 3974 r = amdgpu_device_check_arguments(adev); 3975 if (r) 3976 return r; 3977 3978 spin_lock_init(&adev->mmio_idx_lock); 3979 spin_lock_init(&adev->smc_idx_lock); 3980 spin_lock_init(&adev->pcie_idx_lock); 3981 spin_lock_init(&adev->uvd_ctx_idx_lock); 3982 spin_lock_init(&adev->didt_idx_lock); 3983 spin_lock_init(&adev->gc_cac_idx_lock); 3984 spin_lock_init(&adev->se_cac_idx_lock); 3985 spin_lock_init(&adev->audio_endpt_idx_lock); 3986 spin_lock_init(&adev->mm_stats.lock); 3987 3988 INIT_LIST_HEAD(&adev->shadow_list); 3989 mutex_init(&adev->shadow_list_lock); 3990 3991 INIT_LIST_HEAD(&adev->reset_list); 3992 3993 INIT_LIST_HEAD(&adev->ras_list); 3994 3995 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 3996 3997 INIT_DELAYED_WORK(&adev->delayed_init_work, 3998 amdgpu_device_delayed_init_work_handler); 3999 
INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4000 amdgpu_device_delay_enable_gfx_off); 4001 4002 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4003 4004 adev->gfx.gfx_off_req_count = 1; 4005 adev->gfx.gfx_off_residency = 0; 4006 adev->gfx.gfx_off_entrycount = 0; 4007 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4008 4009 atomic_set(&adev->throttling_logging_enabled, 1); 4010 /* 4011 * If throttling continues, logging will be performed every minute 4012 * to avoid log flooding. "-1" is subtracted since the thermal 4013 * throttling interrupt comes every second. Thus, the total logging 4014 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4015 * for throttling interrupt) = 60 seconds. 4016 */ 4017 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4018 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4019 4020 /* Registers mapping */ 4021 /* TODO: block userspace mapping of io register */ 4022 if (adev->asic_type >= CHIP_BONAIRE) { 4023 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4024 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4025 } else { 4026 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4027 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4028 } 4029 4030 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4031 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4032 4033 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4034 if (!adev->rmmio) 4035 return -ENOMEM; 4036 4037 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4038 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4039 4040 /* 4041 * Reset domain needs to be present early, before XGMI hive discovered 4042 * (if any) and intitialized to use reset sem and in_gpu reset flag 4043 * early on during init and before calling to RREG32. 
4044 */ 4045 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4046 if (!adev->reset_domain) 4047 return -ENOMEM; 4048 4049 /* detect hw virtualization here */ 4050 amdgpu_detect_virtualization(adev); 4051 4052 amdgpu_device_get_pcie_info(adev); 4053 4054 r = amdgpu_device_get_job_timeout_settings(adev); 4055 if (r) { 4056 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4057 return r; 4058 } 4059 4060 amdgpu_device_set_mcbp(adev); 4061 4062 /* early init functions */ 4063 r = amdgpu_device_ip_early_init(adev); 4064 if (r) 4065 return r; 4066 4067 /* Get rid of things like offb */ 4068 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4069 if (r) 4070 return r; 4071 4072 /* Enable TMZ based on IP_VERSION */ 4073 amdgpu_gmc_tmz_set(adev); 4074 4075 amdgpu_gmc_noretry_set(adev); 4076 /* Need to get xgmi info early to decide the reset behavior*/ 4077 if (adev->gmc.xgmi.supported) { 4078 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4079 if (r) 4080 return r; 4081 } 4082 4083 /* enable PCIE atomic ops */ 4084 if (amdgpu_sriov_vf(adev)) { 4085 if (adev->virt.fw_reserve.p_pf2vf) 4086 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4087 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4088 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4089 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4090 * internal path natively support atomics, set have_atomics_support to true. 
4091 */ 4092 } else if ((adev->flags & AMD_IS_APU) && 4093 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4094 IP_VERSION(9, 0, 0))) { 4095 adev->have_atomics_support = true; 4096 } else { 4097 adev->have_atomics_support = 4098 !pci_enable_atomic_ops_to_root(adev->pdev, 4099 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4100 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4101 } 4102 4103 if (!adev->have_atomics_support) 4104 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4105 4106 /* doorbell bar mapping and doorbell index init*/ 4107 amdgpu_doorbell_init(adev); 4108 4109 if (amdgpu_emu_mode == 1) { 4110 /* post the asic on emulation mode */ 4111 emu_soc_asic_init(adev); 4112 goto fence_driver_init; 4113 } 4114 4115 amdgpu_reset_init(adev); 4116 4117 /* detect if we are with an SRIOV vbios */ 4118 if (adev->bios) 4119 amdgpu_device_detect_sriov_bios(adev); 4120 4121 /* check if we need to reset the asic 4122 * E.g., driver was not cleanly unloaded previously, etc. 4123 */ 4124 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4125 if (adev->gmc.xgmi.num_physical_nodes) { 4126 dev_info(adev->dev, "Pending hive reset.\n"); 4127 adev->gmc.xgmi.pending_reset = true; 4128 /* Only need to init necessary block for SMU to handle the reset */ 4129 for (i = 0; i < adev->num_ip_blocks; i++) { 4130 if (!adev->ip_blocks[i].status.valid) 4131 continue; 4132 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4133 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4134 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4135 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4136 DRM_DEBUG("IP %s disabled for hw_init.\n", 4137 adev->ip_blocks[i].version->funcs->name); 4138 adev->ip_blocks[i].status.hw = true; 4139 } 4140 } 4141 } else { 4142 tmp = amdgpu_reset_method; 4143 /* It should do a default reset when loading or reloading the driver, 4144 * regardless of the module parameter reset_method. 
4145 */ 4146 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4147 r = amdgpu_asic_reset(adev); 4148 amdgpu_reset_method = tmp; 4149 if (r) { 4150 dev_err(adev->dev, "asic reset on init failed\n"); 4151 goto failed; 4152 } 4153 } 4154 } 4155 4156 /* Post card if necessary */ 4157 if (amdgpu_device_need_post(adev)) { 4158 if (!adev->bios) { 4159 dev_err(adev->dev, "no vBIOS found\n"); 4160 r = -EINVAL; 4161 goto failed; 4162 } 4163 DRM_INFO("GPU posting now...\n"); 4164 r = amdgpu_device_asic_init(adev); 4165 if (r) { 4166 dev_err(adev->dev, "gpu post error!\n"); 4167 goto failed; 4168 } 4169 } 4170 4171 if (adev->bios) { 4172 if (adev->is_atom_fw) { 4173 /* Initialize clocks */ 4174 r = amdgpu_atomfirmware_get_clock_info(adev); 4175 if (r) { 4176 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4177 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4178 goto failed; 4179 } 4180 } else { 4181 /* Initialize clocks */ 4182 r = amdgpu_atombios_get_clock_info(adev); 4183 if (r) { 4184 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4185 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4186 goto failed; 4187 } 4188 /* init i2c buses */ 4189 if (!amdgpu_device_has_dc_support(adev)) 4190 amdgpu_atombios_i2c_init(adev); 4191 } 4192 } 4193 4194 fence_driver_init: 4195 /* Fence driver */ 4196 r = amdgpu_fence_driver_sw_init(adev); 4197 if (r) { 4198 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4199 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4200 goto failed; 4201 } 4202 4203 /* init the mode config */ 4204 drm_mode_config_init(adev_to_drm(adev)); 4205 4206 r = amdgpu_device_ip_init(adev); 4207 if (r) { 4208 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4209 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4210 goto release_ras_con; 4211 } 4212 4213 amdgpu_fence_driver_hw_init(adev); 4214 4215 dev_info(adev->dev, 4216 "SE %d, SH per SE 
%d, CU per SH %d, active_cu_number %d\n", 4217 adev->gfx.config.max_shader_engines, 4218 adev->gfx.config.max_sh_per_se, 4219 adev->gfx.config.max_cu_per_sh, 4220 adev->gfx.cu_info.number); 4221 4222 adev->accel_working = true; 4223 4224 amdgpu_vm_check_compute_bug(adev); 4225 4226 /* Initialize the buffer migration limit. */ 4227 if (amdgpu_moverate >= 0) 4228 max_MBps = amdgpu_moverate; 4229 else 4230 max_MBps = 8; /* Allow 8 MB/s. */ 4231 /* Get a log2 for easy divisions. */ 4232 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4233 4234 /* 4235 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4236 * Otherwise the mgpu fan boost feature will be skipped due to the 4237 * gpu instance is counted less. 4238 */ 4239 amdgpu_register_gpu_instance(adev); 4240 4241 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4242 * explicit gating rather than handling it automatically. 4243 */ 4244 if (!adev->gmc.xgmi.pending_reset) { 4245 r = amdgpu_device_ip_late_init(adev); 4246 if (r) { 4247 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4248 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4249 goto release_ras_con; 4250 } 4251 /* must succeed. */ 4252 amdgpu_ras_resume(adev); 4253 queue_delayed_work(system_wq, &adev->delayed_init_work, 4254 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4255 } 4256 4257 if (amdgpu_sriov_vf(adev)) { 4258 amdgpu_virt_release_full_gpu(adev, true); 4259 flush_delayed_work(&adev->delayed_init_work); 4260 } 4261 4262 /* 4263 * Place those sysfs registering after `late_init`. As some of those 4264 * operations performed in `late_init` might affect the sysfs 4265 * interfaces creating. 
4266 */ 4267 r = amdgpu_atombios_sysfs_init(adev); 4268 if (r) 4269 drm_err(&adev->ddev, 4270 "registering atombios sysfs failed (%d).\n", r); 4271 4272 r = amdgpu_pm_sysfs_init(adev); 4273 if (r) 4274 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4275 4276 r = amdgpu_ucode_sysfs_init(adev); 4277 if (r) { 4278 adev->ucode_sysfs_en = false; 4279 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4280 } else 4281 adev->ucode_sysfs_en = true; 4282 4283 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4284 if (r) 4285 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4286 4287 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4288 if (r) 4289 dev_err(adev->dev, 4290 "Could not create amdgpu board attributes\n"); 4291 4292 amdgpu_fru_sysfs_init(adev); 4293 amdgpu_reg_state_sysfs_init(adev); 4294 4295 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4296 r = amdgpu_pmu_init(adev); 4297 if (r) 4298 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4299 4300 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4301 if (amdgpu_device_cache_pci_state(adev->pdev)) 4302 pci_restore_state(pdev); 4303 4304 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4305 /* this will fail for cards that aren't VGA class devices, just 4306 * ignore it 4307 */ 4308 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4309 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4310 4311 px = amdgpu_device_supports_px(ddev); 4312 4313 if (px || (!dev_is_removable(&adev->pdev->dev) && 4314 apple_gmux_detect(NULL, NULL))) 4315 vga_switcheroo_register_client(adev->pdev, 4316 &amdgpu_switcheroo_ops, px); 4317 4318 if (px) 4319 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4320 4321 if (adev->gmc.xgmi.pending_reset) 4322 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4323 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4324 4325 amdgpu_device_check_iommu_direct_map(adev); 4326 4327 
	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if (amdgpu_sriov_vf(adev) &&
	    !amdgpu_sriov_runtime(adev) &&
	    amdgpu_virt_mmio_blocked(adev) &&
	    !amdgpu_virt_wait_reset(adev)) {
		dev_err(adev->dev, "VF exclusive mode timeout\n");
		/* Don't send request since VF is inactive. */
		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
		adev->virt.ops = NULL;
		r = -EAGAIN;
	}
	amdgpu_release_ras_context(adev);

failed:
	amdgpu_vf_error_trans_all(adev);

	return r;
}

/* Tear down every CPU-visible MMIO mapping of the device (doorbells,
 * register BAR, VRAM aperture) and release the associated MTRR/memtype
 * bookkeeping. Used when the device goes away (hot-unplug).
 */
static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
{

	/* Clear all CPU mappings pointing to this device */
	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);

	/* Unmap all mapped bars - Doorbell, registers and VRAM */
	amdgpu_doorbell_fini(adev);

	iounmap(adev->rmmio);
	adev->rmmio = NULL;
	if (adev->mman.aper_base_kaddr)
		iounmap(adev->mman.aper_base_kaddr);
	adev->mman.aper_base_kaddr = NULL;

	/* Memory manager related */
	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
		arch_phys_wc_del(adev->gmc.vram_mtrr);
		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
	}
}

/**
 * amdgpu_device_fini_hw - tear down the driver
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the driver info (all asics).
 * Called at driver shutdown.
 */
void amdgpu_device_fini_hw(struct amdgpu_device *adev)
{
	dev_info(adev->dev, "amdgpu: finishing device.\n");
	flush_delayed_work(&adev->delayed_init_work);
	adev->shutdown = true;

	/* make sure IB test finished before entering exclusive mode
	 * to avoid preemption on IB test
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_request_full_gpu(adev, false);
		amdgpu_virt_fini_data_exchange(adev);
	}

	/* disable all interrupts */
	amdgpu_irq_disable_all(adev);
	if (adev->mode_info.mode_config_initialized) {
		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
			drm_helper_force_disable_all(adev_to_drm(adev));
		else
			drm_atomic_helper_shutdown(adev_to_drm(adev));
	}
	amdgpu_fence_driver_hw_fini(adev);

	if (adev->mman.initialized)
		drain_workqueue(adev->mman.bdev.wq);

	/* remove all sysfs interfaces before the underlying state is freed */
	if (adev->pm.sysfs_initialized)
		amdgpu_pm_sysfs_fini(adev);
	if (adev->ucode_sysfs_en)
		amdgpu_ucode_sysfs_fini(adev);
	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
	amdgpu_fru_sysfs_fini(adev);

	amdgpu_reg_state_sysfs_fini(adev);

	/* disable ras feature must before hw fini */
	amdgpu_ras_pre_fini(adev);

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	amdgpu_device_ip_fini_early(adev);

	amdgpu_irq_fini_hw(adev);

	if (adev->mman.initialized)
		ttm_device_clear_dma_mappings(&adev->mman.bdev);

	amdgpu_gart_dummy_page_fini(adev);

	/* only unmap MMIO here if the device is actually gone (hot-unplug) */
	if (drm_dev_is_unplugged(adev_to_drm(adev)))
		amdgpu_device_unmap_mmio(adev);

}

/* Software-side teardown counterpart of amdgpu_device_fini_hw(): frees
 * driver data structures once the hardware has been quiesced.
 */
void amdgpu_device_fini_sw(struct amdgpu_device *adev)
{
	int idx;
	bool px;

	amdgpu_fence_driver_sw_fini(adev);
	amdgpu_device_ip_fini(adev);
	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
	adev->accel_working = false;
	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));

	amdgpu_reset_fini(adev);

	/* free i2c buses */
	if (!amdgpu_device_has_dc_support(adev))
		amdgpu_i2c_fini(adev);

	if (amdgpu_emu_mode != 1)
		amdgpu_atombios_fini(adev);

	kfree(adev->bios);
	adev->bios = NULL;

	kfree(adev->fru_info);
	adev->fru_info = NULL;

	px = amdgpu_device_supports_px(adev_to_drm(adev));

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
	    apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_unregister_client(adev->pdev);

	if (px)
		vga_switcheroo_fini_domain_pm_ops(adev->dev);

	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_unregister(adev->pdev);

	/* only touch MMIO mappings if the DRM device has not been unplugged
	 * (on unplug amdgpu_device_unmap_mmio() already did this)
	 */
	if (drm_dev_enter(adev_to_drm(adev), &idx)) {

		iounmap(adev->rmmio);
		adev->rmmio = NULL;
		amdgpu_doorbell_fini(adev);
		drm_dev_exit(idx);
	}

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		amdgpu_pmu_fini(adev);
	if (adev->mman.discovery_bin)
		amdgpu_discovery_fini(adev);

	amdgpu_reset_put_reset_domain(adev->reset_domain);
	adev->reset_domain = NULL;

	/* PCI config space cached by amdgpu_device_cache_pci_state() */
	kfree(adev->pci_state);

}

/**
 * amdgpu_device_evict_resources - evict device resources
 * @adev: amdgpu device object
 *
 * Evicts all ttm device resources(vram BOs, gart table) from the lru list
 * of the vram memory type. Mainly used for evicting device resources
 * at suspend time.
 *
 */
static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
{
	int ret;

	/* No need to evict vram on APUs for suspend to ram or s2idle */
	if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
		return 0;

	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
	if (ret)
		DRM_WARN("evicting device resources failed\n");
	return ret;
}

/*
 * Suspend & resume.
 */
/**
 * amdgpu_device_prepare - prepare for device suspend
 *
 * @dev: drm dev pointer
 *
 * Prepare to put the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int amdgpu_device_prepare(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i, r;

	amdgpu_choose_low_power_state(adev);

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	/* Evict the majority of BOs before starting suspend sequence */
	r = amdgpu_device_evict_resources(adev);
	if (r)
		goto unprepare;

	flush_delayed_work(&adev->gfx.gfx_off_delay_work);

	/* give each IP block a chance to veto/prepare the suspend */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
			continue;
		r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
		if (r)
			goto unprepare;
	}

	return 0;

unprepare:
	/* on failure, clear the S0ix/S3 state chosen above */
	adev->in_s0ix = adev->in_s3 = false;

	return r;
}

/**
 * amdgpu_device_suspend - initiate device suspend
 *
 * @dev: drm dev pointer
 * @fbcon : notify the fbdev of suspend
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	adev->in_suspend = true;

	/* SR-IOV: stop host communication and take exclusive GPU access
	 * before touching hardware state
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_fini_data_exchange(adev);
		r = amdgpu_virt_request_full_gpu(adev, false);
		if (r)
			return r;
	}

	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
		DRM_WARN("smart shift update failed\n");

	if (fbcon)
		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);

	cancel_delayed_work_sync(&adev->delayed_init_work);

	amdgpu_ras_suspend(adev);

	amdgpu_device_ip_suspend_phase1(adev);

	if (!adev->in_s0ix)
		amdgpu_amdkfd_suspend(adev, adev->in_runpm);

	/* evict remaining VRAM BOs while the rings are still functional */
	r = amdgpu_device_evict_resources(adev);
	if (r)
		return r;

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	amdgpu_fence_driver_hw_fini(adev);

	amdgpu_device_ip_suspend_phase2(adev);

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	/* NOTE(review): notifies DPM/SMU of RLC state for suspend — confirm
	 * exact semantics in amdgpu_dpm_notify_rlc_state()
	 */
	r = amdgpu_dpm_notify_rlc_state(adev, false);
	if (r)
		return r;

	return 0;
}

/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @fbcon : notify the fbdev of resume
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);

	/* post card */
	if (amdgpu_device_need_post(adev)) {
		r = amdgpu_device_asic_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu asic init failed\n");
	}

	r = amdgpu_device_ip_resume(adev);

	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
		goto exit;
	}
	amdgpu_fence_driver_hw_init(adev);

	if (!adev->in_s0ix) {
		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
		if (r)
			goto exit;
	}

	r = amdgpu_device_ip_late_init(adev);
	if (r)
		goto exit;

	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));
exit:
	/* SR-IOV: restart host communication and drop exclusive access,
	 * on both success and error paths
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_init_data_exchange(adev);
		amdgpu_virt_release_full_gpu(adev, true);
	}

	if (r)
		return r;

	/* Make sure IB tests flushed */
	flush_delayed_work(&adev->delayed_init_work);

	if (fbcon)
		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);

	amdgpu_ras_resume(adev);

	if (adev->mode_info.num_crtc) {
		/*
		 * Most of the connector probing functions try to acquire runtime pm
		 * refs to ensure that the GPU is powered on when connector polling is
		 * performed. Since we're calling this from a runtime PM callback,
		 * trying to acquire rpm refs will cause us to deadlock.
		 *
		 * Since we're guaranteed to be holding the rpm lock, it's safe to
		 * temporarily disable the rpm helpers so this doesn't deadlock us.
		 */
#ifdef CONFIG_PM
		dev->dev->power.disable_depth++;
#endif
		if (!adev->dc_enabled)
			drm_helper_hpd_irq_event(dev);
		else
			drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
		dev->dev->power.disable_depth--;
#endif
	}
	adev->in_suspend = false;

	if (adev->enable_mes)
		amdgpu_mes_self_test(adev);

	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
		DRM_WARN("smart shift update failed\n");

	return 0;
}

/**
 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and
 * the check_soft_reset callbacks are run. check_soft_reset determines
 * if the asic is still hung or not.
 * Returns true if any of the IPs are still in a hung state, false if not.
 */
static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
{
	int i;
	bool asic_hang = false;

	/* VFs always report hung so they take the full recovery path */
	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_asic_need_full_reset(adev))
		return true;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* refresh per-block hang state via its check_soft_reset hook */
		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
			adev->ip_blocks[i].status.hang =
				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
		if (adev->ip_blocks[i].status.hang) {
			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
			asic_hang = true;
		}
	}
	return asic_hang;
}

/**
 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary for a soft reset to succeed.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
 *
 * @adev: amdgpu_device pointer
 *
 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
 * reset is necessary to recover.
 * Returns true if a full asic reset is required, false if not.
 */
static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
{
	int i;

	if (amdgpu_asic_need_full_reset(adev))
		return true;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* these block types cannot be soft reset; a hang in any of
		 * them forces a full asic reset
		 */
		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
			if (adev->ip_blocks[i].status.hang) {
				dev_info(adev->dev, "Some block need full reset!\n");
				return true;
			}
		}
	}
	return false;
}

/**
 * amdgpu_device_ip_soft_reset - do a soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * soft_reset callbacks are run if the block is hung. soft_reset handles any
 * IP specific hardware or software state changes that are necessary to soft
 * reset the IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->soft_reset) {
			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary after the IP has been soft reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->post_soft_reset)
			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
		/* NOTE(review): this check sits outside the callback guard
		 * (unlike the siblings above); behavior is equivalent because
		 * r is only ever set just before this test, but braces as in
		 * amdgpu_device_ip_soft_reset() would be clearer.
		 */
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_recover_vram - Recover some VRAM contents
 *
 * @adev: amdgpu_device pointer
 *
 * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
 * restore things like GPUVM page tables after a GPU reset where
 * the contents of VRAM might be lost.
 *
 * Returns:
 * 0 on success, negative error code on failure.
 */
static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
	struct dma_fence *fence = NULL, *next = NULL;
	struct amdgpu_bo *shadow;
	struct amdgpu_bo_vm *vmbo;
	long r = 1, tmo;

	/* runtime SR-IOV gets a longer timeout for the restore copies */
	if (amdgpu_sriov_runtime(adev))
		tmo = msecs_to_jiffies(8000);
	else
		tmo = msecs_to_jiffies(100);

	dev_info(adev->dev, "recover vram bo from shadow start\n");
	mutex_lock(&adev->shadow_list_lock);
	list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
		/* If vm is compute context or adev is APU, shadow will be NULL */
		if (!vmbo->shadow)
			continue;
		shadow = vmbo->shadow;

		/* No need to recover an evicted BO */
		if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
		    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
		    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
			continue;

		r = amdgpu_bo_restore_shadow(shadow, &next);
		if (r)
			break;

		/* pipeline the copies: wait on the previous restore fence
		 * while the next one is already queued
		 */
		if (fence) {
			tmo = dma_fence_wait_timeout(fence, false, tmo);
			dma_fence_put(fence);
			fence = next;
			if (tmo == 0) {
				r = -ETIMEDOUT;
				break;
			} else if (tmo < 0) {
				r = tmo;
				break;
			}
		} else {
			fence = next;
		}
	}
	mutex_unlock(&adev->shadow_list_lock);

	if (fence)
		tmo = dma_fence_wait_timeout(fence, false, tmo);
	dma_fence_put(fence);

	if (r < 0 || tmo <= 0) {
		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
		return -EIO;
	}

	dev_info(adev->dev, "recover vram bo from shadow done\n");
	return 0;
}


/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu_device pointer
 * @from_hypervisor: request from hypervisor
 *
 * do VF FLR and reinitialize Asic
 * return 0 means succeeded otherwise failed
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     bool from_hypervisor)
{
	int r;
	struct amdgpu_hive_info *hive = NULL;
	int retry_limit = 0;

retry:
	amdgpu_amdkfd_pre_reset(adev);

	amdgpu_device_stop_pending_resets(adev);

	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
		r = amdgpu_virt_reset_gpu(adev);
	if (r)
		return r;
	amdgpu_irq_gpu_reset_resume_helper(adev);

	/* some sw clean up VF needs to do before recover */
	amdgpu_virt_post_reset(adev);

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		goto error;

	amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		goto error;

	hive = amdgpu_get_xgmi_hive(adev);
	/* Update PSP FW topology after reset */
	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
		r = amdgpu_xgmi_update_topology(hive, adev);

	if (hive)
		amdgpu_put_xgmi_hive(hive);

	if (!r) {
		r = amdgpu_ib_ring_tests(adev);

		amdgpu_amdkfd_post_reset(adev);
	}

error:
	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
		amdgpu_inc_vram_lost(adev);
		r = amdgpu_device_recover_vram(adev);
	}
	amdgpu_virt_release_full_gpu(adev, true);

	/* AMDGPU_RETRY_SRIOV_RESET() picks the error codes worth retrying;
	 * the whole FLR sequence is re-run up to AMDGPU_MAX_RETRY_LIMIT times
	 */
	if (AMDGPU_RETRY_SRIOV_RESET(r)) {
		if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
			retry_limit++;
			goto retry;
		} else
			DRM_ERROR("GPU reset retry is beyond the retry limit\n");
	}

	return r;
}

/**
 * amdgpu_device_has_job_running - check if there is any job in mirror list
 *
 * @adev: amdgpu_device pointer
 *
 * check if there is any job in mirror list
 */
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
{
	int i;
	struct drm_sched_job *job;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		/* peek at the scheduler's pending list under its lock */
		spin_lock(&ring->sched.job_list_lock);
		job = list_first_entry_or_null(&ring->sched.pending_list,
					       struct drm_sched_job, list);
		spin_unlock(&ring->sched.job_list_lock);
		if (job)
			return true;
	}
	return false;
}

/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu_device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{

	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	/* Skip soft reset check in fatal error mode */
	if (!amdgpu_ras_is_poison_mode_supported(adev))
		return true;

	if (amdgpu_sriov_vf(adev))
		return true;

	/* -1 (auto): recovery stays off for these legacy ASICs */
	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
		case CHIP_VERDE:
		case CHIP_TAHITI:
		case CHIP_PITCAIRN:
		case CHIP_OLAND:
		case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
		case CHIP_KAVERI:
		case CHIP_KABINI:
		case CHIP_MULLINS:
#endif
		case CHIP_CARRIZO:
		case CHIP_STONEY:
		case CHIP_CYAN_SKILLFISH:
			goto disabled;
		default:
			break;
		}
	}

	return true;

disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
}

int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
{
	u32 i;
	int ret = 0;

	amdgpu_atombios_scratch_regs_engine_hung(adev, true);

	dev_info(adev->dev, "GPU mode1 reset\n");

	/* disable BM */
	pci_clear_master(adev->pdev);

	/* save config space; it is restored after the reset below */
	amdgpu_device_cache_pci_state(adev->pdev);

	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
		dev_info(adev->dev, "GPU smu mode1 reset\n");
		ret = amdgpu_dpm_mode1_reset(adev);
	} else {
		dev_info(adev->dev, "GPU psp mode1 reset\n");
		ret = psp_gpu_reset(adev);
	}

	if (ret)
		goto mode1_reset_failed;

	amdgpu_device_load_pci_state(adev->pdev);
	ret = amdgpu_psp_wait_for_bootloader(adev);
	if (ret)
		goto mode1_reset_failed;

	/* wait for asic to come out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		u32 memsize = adev->nbio.funcs->get_memsize(adev);

		/* memsize reads back all-ones while still in reset */
		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}

	if (i >= adev->usec_timeout) {
		ret = -ETIMEDOUT;
		goto mode1_reset_failed;
	}

	amdgpu_atombios_scratch_regs_engine_hung(adev, false);

	return 0;

mode1_reset_failed:
	dev_err(adev->dev, "GPU mode1 reset failed\n");
	return ret;
}

int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
				 struct amdgpu_reset_context *reset_context)
{
	int i, r = 0;
	struct amdgpu_job *job = NULL;
	bool need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	if (reset_context->reset_req_dev == adev)
		job = reset_context->job;

	if (amdgpu_sriov_vf(adev)) {
		/* stop the data exchange thread */
		amdgpu_virt_fini_data_exchange(adev);
	}

	amdgpu_fence_driver_isr_toggle(adev, true);

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		/* Clear job fence from fence drv to avoid force_completion
		 * leave NULL and vm flush fence in fence drv
		 */
		amdgpu_fence_driver_clear_job_fences(ring);

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	amdgpu_fence_driver_isr_toggle(adev, false);

	if (job && job->vm)
		drm_sched_increase_karma(&job->base);

	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		/* try a per-IP soft reset first; fall back to full reset if
		 * anything is still hung afterwards
		 */
		if (!need_full_reset && amdgpu_gpu_recovery &&
		    amdgpu_device_ip_check_soft_reset(adev)) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);
		if (need_full_reset)
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
		else
			clear_bit(AMDGPU_NEED_FULL_RESET,
				  &reset_context->flags);
	}

	return r;
}

/* Snapshot the user-configured debug registers before the asic reset;
 * caller must hold the reset domain semaphore (asserted below).
 */
static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
{
	int i;

	lockdep_assert_held(&adev->reset_domain->sem);

	for (i = 0; i < adev->reset_info.num_regs; i++) {
		adev->reset_info.reset_dump_reg_value[i] =
			RREG32(adev->reset_info.reset_dump_reg_list[i]);

		trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
					     adev->reset_info.reset_dump_reg_value[i]);
	}

	return 0;
}

int amdgpu_do_asic_reset(struct list_head *device_list_handle,
			 struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset, skip_hw_reset, vram_lost = false;
	int r = 0;

	/* Try reset handler method first */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_reset_reg_dumps(tmp_adev);

	reset_context->reset_device_list = device_list_handle;
	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Reset handler not implemented, use the default method */
	need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				tmp_adev->gmc.xgmi.pending_reset = false;
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
					 r, adev_to_drm(tmp_adev)->unique);
				goto out;
			}
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
		}

		amdgpu_ras_intr_cleared();
	}

	/* bring every device in the list back up after the hw reset */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (need_full_reset) {
			/* post card */
			amdgpu_ras_set_fed(tmp_adev, false);
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

				amdgpu_coredump(tmp_adev, vram_lost, reset_context);

				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_xcp_restore_partition_mode(
					tmp_adev->xcp_mgr);
				if (r)
					goto out;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked as reset was already
				 * complete successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);

				/*
				 * The GPU enters bad state once faulty pages
				 * by ECC has reached the threshold, and ras
				 * recovery is scheduled next. So add one check
				 * here to break recovery if it indeed exceeds
				 * bad page threshold, and remind user to
				 * retire this GPU or setting one bigger
				 * bad_page_threshold value to fix this once
				 * probing driver again.
				 */
				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
					/* must succeed. */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(
						reset_context->hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	if (need_full_reset)
		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	else
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	return r;
}

/* Record in mp1_state which reset method is about to run so the SMU
 * side can behave accordingly.
 */
static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{

	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}
}

static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

/* Re-enable runtime PM on the HDA function (PCI function 1 on the same
 * bus/slot as the GPU), if present.
 */
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device
*adev) 5475 { 5476 enum amd_reset_method reset_method; 5477 struct pci_dev *p = NULL; 5478 u64 expires; 5479 5480 /* 5481 * For now, only BACO and mode1 reset are confirmed 5482 * to suffer the audio issue without proper suspended. 5483 */ 5484 reset_method = amdgpu_asic_reset_method(adev); 5485 if ((reset_method != AMD_RESET_METHOD_BACO) && 5486 (reset_method != AMD_RESET_METHOD_MODE1)) 5487 return -EINVAL; 5488 5489 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5490 adev->pdev->bus->number, 1); 5491 if (!p) 5492 return -ENODEV; 5493 5494 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5495 if (!expires) 5496 /* 5497 * If we cannot get the audio device autosuspend delay, 5498 * a fixed 4S interval will be used. Considering 3S is 5499 * the audio controller default autosuspend delay setting. 5500 * 4S used here is guaranteed to cover that. 5501 */ 5502 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5503 5504 while (!pm_runtime_status_suspended(&(p->dev))) { 5505 if (!pm_runtime_suspend(&(p->dev))) 5506 break; 5507 5508 if (expires < ktime_get_mono_fast_ns()) { 5509 dev_warn(adev->dev, "failed to suspend display audio\n"); 5510 pci_dev_put(p); 5511 /* TODO: abort the succeeding gpu reset? 
*/ 5512 return -ETIMEDOUT; 5513 } 5514 } 5515 5516 pm_runtime_disable(&(p->dev)); 5517 5518 pci_dev_put(p); 5519 return 0; 5520 } 5521 5522 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5523 { 5524 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5525 5526 #if defined(CONFIG_DEBUG_FS) 5527 if (!amdgpu_sriov_vf(adev)) 5528 cancel_work(&adev->reset_work); 5529 #endif 5530 5531 if (adev->kfd.dev) 5532 cancel_work(&adev->kfd.reset_work); 5533 5534 if (amdgpu_sriov_vf(adev)) 5535 cancel_work(&adev->virt.flr_work); 5536 5537 if (con && adev->ras_enabled) 5538 cancel_work(&con->recovery_work); 5539 5540 } 5541 5542 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5543 { 5544 struct amdgpu_device *tmp_adev; 5545 int ret = 0; 5546 u32 status; 5547 5548 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5549 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5550 if (PCI_POSSIBLE_ERROR(status)) { 5551 dev_err(tmp_adev->dev, "device lost from bus!"); 5552 ret = -ENODEV; 5553 } 5554 } 5555 5556 return ret; 5557 } 5558 5559 /** 5560 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5561 * 5562 * @adev: amdgpu_device pointer 5563 * @job: which job trigger hang 5564 * @reset_context: amdgpu reset context pointer 5565 * 5566 * Attempt to reset the GPU if it has hung (all asics). 5567 * Attempt to do soft-reset or full-reset and reinitialize Asic 5568 * Returns 0 for success or an error on failure. 
5569 */ 5570 5571 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5572 struct amdgpu_job *job, 5573 struct amdgpu_reset_context *reset_context) 5574 { 5575 struct list_head device_list, *device_list_handle = NULL; 5576 bool job_signaled = false; 5577 struct amdgpu_hive_info *hive = NULL; 5578 struct amdgpu_device *tmp_adev = NULL; 5579 int i, r = 0; 5580 bool need_emergency_restart = false; 5581 bool audio_suspended = false; 5582 5583 /* 5584 * Special case: RAS triggered and full reset isn't supported 5585 */ 5586 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5587 5588 /* 5589 * Flush RAM to disk so that after reboot 5590 * the user can read log and see why the system rebooted. 5591 */ 5592 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5593 amdgpu_ras_get_context(adev)->reboot) { 5594 DRM_WARN("Emergency reboot."); 5595 5596 ksys_sync_helper(); 5597 emergency_restart(); 5598 } 5599 5600 dev_info(adev->dev, "GPU %s begin!\n", 5601 need_emergency_restart ? "jobs stop":"reset"); 5602 5603 if (!amdgpu_sriov_vf(adev)) 5604 hive = amdgpu_get_xgmi_hive(adev); 5605 if (hive) 5606 mutex_lock(&hive->hive_lock); 5607 5608 reset_context->job = job; 5609 reset_context->hive = hive; 5610 /* 5611 * Build list of devices to reset. 5612 * In case we are in XGMI hive mode, resort the device list 5613 * to put adev in the 1st position. 
5614 */ 5615 INIT_LIST_HEAD(&device_list); 5616 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5617 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5618 list_add_tail(&tmp_adev->reset_list, &device_list); 5619 if (adev->shutdown) 5620 tmp_adev->shutdown = true; 5621 } 5622 if (!list_is_first(&adev->reset_list, &device_list)) 5623 list_rotate_to_front(&adev->reset_list, &device_list); 5624 device_list_handle = &device_list; 5625 } else { 5626 list_add_tail(&adev->reset_list, &device_list); 5627 device_list_handle = &device_list; 5628 } 5629 5630 if (!amdgpu_sriov_vf(adev)) { 5631 r = amdgpu_device_health_check(device_list_handle); 5632 if (r) 5633 goto end_reset; 5634 } 5635 5636 /* We need to lock reset domain only once both for XGMI and single device */ 5637 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5638 reset_list); 5639 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5640 5641 /* block all schedulers and reset given job's ring */ 5642 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5643 5644 amdgpu_device_set_mp1_state(tmp_adev); 5645 5646 /* 5647 * Try to put the audio codec into suspend state 5648 * before gpu reset started. 5649 * 5650 * Due to the power domain of the graphics device 5651 * is shared with AZ power domain. Without this, 5652 * we may change the audio hardware from behind 5653 * the audio driver's back. That will trigger 5654 * some audio codec errors. 
5655 */ 5656 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5657 audio_suspended = true; 5658 5659 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5660 5661 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5662 5663 if (!amdgpu_sriov_vf(tmp_adev)) 5664 amdgpu_amdkfd_pre_reset(tmp_adev); 5665 5666 /* 5667 * Mark these ASICs to be reseted as untracked first 5668 * And add them back after reset completed 5669 */ 5670 amdgpu_unregister_gpu_instance(tmp_adev); 5671 5672 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5673 5674 /* disable ras on ALL IPs */ 5675 if (!need_emergency_restart && 5676 amdgpu_device_ip_need_full_reset(tmp_adev)) 5677 amdgpu_ras_suspend(tmp_adev); 5678 5679 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5680 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5681 5682 if (!amdgpu_ring_sched_ready(ring)) 5683 continue; 5684 5685 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5686 5687 if (need_emergency_restart) 5688 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5689 } 5690 atomic_inc(&tmp_adev->gpu_reset_counter); 5691 } 5692 5693 if (need_emergency_restart) 5694 goto skip_sched_resume; 5695 5696 /* 5697 * Must check guilty signal here since after this point all old 5698 * HW fences are force signaled. 5699 * 5700 * job->base holds a reference to parent fence 5701 */ 5702 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5703 job_signaled = true; 5704 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5705 goto skip_hw_reset; 5706 } 5707 5708 retry: /* Rest of adevs pre asic reset from XGMI hive. 
*/ 5709 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5710 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5711 /*TODO Should we stop ?*/ 5712 if (r) { 5713 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5714 r, adev_to_drm(tmp_adev)->unique); 5715 tmp_adev->asic_reset_res = r; 5716 } 5717 5718 if (!amdgpu_sriov_vf(tmp_adev)) 5719 /* 5720 * Drop all pending non scheduler resets. Scheduler resets 5721 * were already dropped during drm_sched_stop 5722 */ 5723 amdgpu_device_stop_pending_resets(tmp_adev); 5724 } 5725 5726 /* Actual ASIC resets if needed.*/ 5727 /* Host driver will handle XGMI hive reset for SRIOV */ 5728 if (amdgpu_sriov_vf(adev)) { 5729 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5730 if (r) 5731 adev->asic_reset_res = r; 5732 5733 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5734 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5735 IP_VERSION(9, 4, 2) || 5736 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5737 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5738 amdgpu_ras_resume(adev); 5739 } else { 5740 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5741 if (r && r == -EAGAIN) 5742 goto retry; 5743 } 5744 5745 skip_hw_reset: 5746 5747 /* Post ASIC reset for all devs .*/ 5748 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5749 5750 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5751 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5752 5753 if (!amdgpu_ring_sched_ready(ring)) 5754 continue; 5755 5756 drm_sched_start(&ring->sched, true); 5757 } 5758 5759 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5760 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5761 5762 if (tmp_adev->asic_reset_res) 5763 r = tmp_adev->asic_reset_res; 5764 5765 tmp_adev->asic_reset_res = 0; 5766 5767 if (r) { 5768 /* bad news, how to tell it to userspace ? 
*/ 5769 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5770 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5771 } else { 5772 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5773 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5774 DRM_WARN("smart shift update failed\n"); 5775 } 5776 } 5777 5778 skip_sched_resume: 5779 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5780 /* unlock kfd: SRIOV would do it separately */ 5781 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5782 amdgpu_amdkfd_post_reset(tmp_adev); 5783 5784 /* kfd_post_reset will do nothing if kfd device is not initialized, 5785 * need to bring up kfd here if it's not be initialized before 5786 */ 5787 if (!adev->kfd.init_complete) 5788 amdgpu_amdkfd_device_init(adev); 5789 5790 if (audio_suspended) 5791 amdgpu_device_resume_display_audio(tmp_adev); 5792 5793 amdgpu_device_unset_mp1_state(tmp_adev); 5794 5795 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5796 } 5797 5798 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5799 reset_list); 5800 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5801 5802 end_reset: 5803 if (hive) { 5804 mutex_unlock(&hive->hive_lock); 5805 amdgpu_put_xgmi_hive(hive); 5806 } 5807 5808 if (r) 5809 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5810 5811 atomic_set(&adev->reset_domain->reset_res, r); 5812 return r; 5813 } 5814 5815 /** 5816 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 5817 * 5818 * @adev: amdgpu_device pointer 5819 * @speed: pointer to the speed of the link 5820 * @width: pointer to the width of the link 5821 * 5822 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5823 * first physical partner to an AMD dGPU. 5824 * This will exclude any virtual switches and links. 
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	/* Walk up the bridge hierarchy until we leave the dGPU package. */
	while ((parent = pci_upstream_bridge(parent))) {
		/* skip upstream/downstream switches internal to dGPU*/
		if (parent->vendor == PCI_VENDOR_ID_ATI)
			continue;
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		break;
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	/* Module parameters override anything probed below. */
	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	/* Both masks already set (e.g. by module parameters): nothing to do. */
	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			/* Each top speed implies support for all lower gens. */
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			/* Each link width implies support for all narrower widths. */
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	/* Bits the peer's DMA engine cannot address; default to a 32-bit
	 * mask when the peer has no dma_mask set.
	 */
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	/* Large-BAR check: the whole VRAM must be CPU-visible and the
	 * aperture must fall within the peer's DMA-addressable range.
	 */
	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}

/* Enter BACO (Bus Active, Chip Off); doorbell interrupts are disabled first
 * when RAS is enabled. Returns -ENOTSUPP if the device has no BACO support.
 */
int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

/* Leave BACO and re-enable (and, for passthrough, clear) doorbell
 * interrupts. Returns 0 on success or the dpm exit error.
 */
int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	/* Remembered so amdgpu_pci_resume() only acts for io_frozen. */
	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;
	struct amdgpu_hive_info *hive;
	int hive_ras_recovery = 0;
	struct amdgpu_ras *ras;

	/* PCI error slot reset should be skipped During RAS recovery */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		hive_ras_recovery = atomic_read(&hive->ras_recovery);
		amdgpu_put_xgmi_hive(hive);
	}
	ras = amdgpu_ras_get_context(adev);
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
		 ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
		return PCI_ERS_RESULT_RECOVERED;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		/* all-ones read means the ASIC is still in reset */
		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	/* the slot itself was already reset; skip the HW reset step */
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		/* on failure, release the reset-domain lock taken in
		 * amdgpu_pci_error_detected(); on success that happens in
		 * amdgpu_pci_resume().
		 */
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that its
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;


	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	/* restart the schedulers stopped in amdgpu_pci_error_detected() */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

/* Save the device's PCI config space and keep a driver-owned copy in
 * adev->pci_state (replacing any previous copy). Returns true on success.
 */
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/* Restore the PCI config space cached by amdgpu_device_cache_pci_state().
 * Returns false when no cached state exists or loading it fails.
 */
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/* Flush the HDP cache, preferring a ring packet when the ring supports it.
 * Skipped on APUs (bare metal) and CPU-coherent XGMI configs where HDP is
 * not in the path.
 */
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

/* Invalidate the HDP cache; same APU/XGMI skip conditions as
 * amdgpu_device_flush_hdp().
 */
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

/* Non-zero while a GPU reset is in progress on this device's reset domain. */
int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It will help to maintain error context when error occurred.
 * Compare to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as following:
 *
 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
 *    clears all CPU mappings to device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

/* Read a PCIe-port register through the NBIO index/data pair; the dummy
 * RREG32(address) read orders the index write before the data read.
 */
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

/* Write a PCIe-port register through the NBIO index/data pair; the dummy
 * reads flush/order each write.
 */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 *
Try to switch to a new gang. 6419 * Returns: NULL if we switched to the new gang or a reference to the current 6420 * gang leader. 6421 */ 6422 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6423 struct dma_fence *gang) 6424 { 6425 struct dma_fence *old = NULL; 6426 6427 do { 6428 dma_fence_put(old); 6429 rcu_read_lock(); 6430 old = dma_fence_get_rcu_safe(&adev->gang_submit); 6431 rcu_read_unlock(); 6432 6433 if (old == gang) 6434 break; 6435 6436 if (!dma_fence_is_signaled(old)) 6437 return old; 6438 6439 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6440 old, gang) != old); 6441 6442 dma_fence_put(old); 6443 return NULL; 6444 } 6445 6446 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6447 { 6448 switch (adev->asic_type) { 6449 #ifdef CONFIG_DRM_AMDGPU_SI 6450 case CHIP_HAINAN: 6451 #endif 6452 case CHIP_TOPAZ: 6453 /* chips with no display hardware */ 6454 return false; 6455 #ifdef CONFIG_DRM_AMDGPU_SI 6456 case CHIP_TAHITI: 6457 case CHIP_PITCAIRN: 6458 case CHIP_VERDE: 6459 case CHIP_OLAND: 6460 #endif 6461 #ifdef CONFIG_DRM_AMDGPU_CIK 6462 case CHIP_BONAIRE: 6463 case CHIP_HAWAII: 6464 case CHIP_KAVERI: 6465 case CHIP_KABINI: 6466 case CHIP_MULLINS: 6467 #endif 6468 case CHIP_TONGA: 6469 case CHIP_FIJI: 6470 case CHIP_POLARIS10: 6471 case CHIP_POLARIS11: 6472 case CHIP_POLARIS12: 6473 case CHIP_VEGAM: 6474 case CHIP_CARRIZO: 6475 case CHIP_STONEY: 6476 /* chips with display hardware */ 6477 return true; 6478 default: 6479 /* IP discovery */ 6480 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6481 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6482 return false; 6483 return true; 6484 } 6485 } 6486 6487 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6488 uint32_t inst, uint32_t reg_addr, char reg_name[], 6489 uint32_t expected_value, uint32_t mask) 6490 { 6491 uint32_t ret = 0; 6492 uint32_t old_ = 0; 6493 uint32_t tmp_ = RREG32(reg_addr); 6494 uint32_t loop = 
adev->usec_timeout; 6495 6496 while ((tmp_ & (mask)) != (expected_value)) { 6497 if (old_ != tmp_) { 6498 loop = adev->usec_timeout; 6499 old_ = tmp_; 6500 } else 6501 udelay(1); 6502 tmp_ = RREG32(reg_addr); 6503 loop--; 6504 if (!loop) { 6505 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 6506 inst, reg_name, (uint32_t)expected_value, 6507 (uint32_t)(tmp_ & (mask))); 6508 ret = -ETIMEDOUT; 6509 break; 6510 } 6511 } 6512 return ret; 6513 } 6514