/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */
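/*
 * Example (illustrative): the attribute is exposed under the PCI device's
 * sysfs directory, so it can typically be read with something like
 *
 *   cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count
 *
 * The exact bus/device/function path depends on the system.
 */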
static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */
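/*
 * Example (illustrative): reading the board_info attribute on an OAM board
 * would return a line such as
 *
 *   type : oam
 */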
static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}
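/*
 * Note (informational, to the best of our understanding): the runtime power
 * management acronyms used below are PX/ATPX (hybrid graphics platform power
 * control), BOCO (Bus Off, Chip Off), BACO (Bus Active, Chip Off) and
 * BAMACO/MACO (a deeper BACO variant selected when MACO_SUPPORT is reported).
 */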
/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}
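/*
 * Note (informational): in the register helpers below @reg is a dword offset,
 * so the byte offset into the MMIO BAR is reg * 4. Offsets that fall beyond
 * adev->rmmio_size are routed through the indirect PCIe index/data accessors
 * instead of a direct readl()/writel().
 */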
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
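/*
 * Note (informational): the indirect accessors above and below follow an
 * index/data pair protocol: the register address is written to the PCIE
 * index register, the index register is read back to flush the posted
 * write, and then the value is read from or written to the PCIE data
 * register, all while holding pcie_idx_lock.
 */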
/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
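/*
 * Example (illustrative, hypothetical register offsets and values): the
 * register array is consumed as {offset, and_mask, or_mask} triplets, e.g.
 *
 *   static const u32 golden_settings_example[] = {
 *           0x1234, 0x0000000f, 0x00000002,
 *           0x5678, 0xffffffff, 0x00000000,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *                                           ARRAY_SIZE(golden_settings_example));
 */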
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
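/*
 * Usage sketch (illustrative): callers pair amdgpu_device_wb_get() with
 * amdgpu_device_wb_free() and address the slot as a dword offset, e.g.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u32 status = adev->wb.wb[wb];   // CPU-side view of the slot
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */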
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do vPost, otherwise the gpu hangs,
		 * while smc fw versions above 22.15 don't have this flaw, so we force
		 * vpost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICs as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}
/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
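/*
 * Worked example (informational): with a 4KB page there is a 12-bit page
 * offset, so the minimum block size of 9 means each page table covers
 * 2^9 * 4KB = 2MB of virtual address space; the remaining address bits are
 * resolved by the page directory levels.
 */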
1933 */ 1934 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1935 { 1936 /* no need to check the default value */ 1937 if (amdgpu_vm_size == -1) 1938 return; 1939 1940 if (amdgpu_vm_size < 1) { 1941 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1942 amdgpu_vm_size); 1943 amdgpu_vm_size = -1; 1944 } 1945 } 1946 1947 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1948 { 1949 struct sysinfo si; 1950 bool is_os_64 = (sizeof(void *) == 8); 1951 uint64_t total_memory; 1952 uint64_t dram_size_seven_GB = 0x1B8000000; 1953 uint64_t dram_size_three_GB = 0xB8000000; 1954 1955 if (amdgpu_smu_memory_pool_size == 0) 1956 return; 1957 1958 if (!is_os_64) { 1959 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1960 goto def_value; 1961 } 1962 si_meminfo(&si); 1963 total_memory = (uint64_t)si.totalram * si.mem_unit; 1964 1965 if ((amdgpu_smu_memory_pool_size == 1) || 1966 (amdgpu_smu_memory_pool_size == 2)) { 1967 if (total_memory < dram_size_three_GB) 1968 goto def_value1; 1969 } else if ((amdgpu_smu_memory_pool_size == 4) || 1970 (amdgpu_smu_memory_pool_size == 8)) { 1971 if (total_memory < dram_size_seven_GB) 1972 goto def_value1; 1973 } else { 1974 DRM_WARN("Smu memory pool size not supported\n"); 1975 goto def_value; 1976 } 1977 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1978 1979 return; 1980 1981 def_value1: 1982 DRM_WARN("No enough system memory\n"); 1983 def_value: 1984 adev->pm.smu_prv_buffer_size = 0; 1985 } 1986 1987 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1988 { 1989 if (!(adev->flags & AMD_IS_APU) || 1990 adev->asic_type < CHIP_RAVEN) 1991 return 0; 1992 1993 switch (adev->asic_type) { 1994 case CHIP_RAVEN: 1995 if (adev->pdev->device == 0x15dd) 1996 adev->apu_flags |= AMD_APU_IS_RAVEN; 1997 if (adev->pdev->device == 0x15d8) 1998 adev->apu_flags |= AMD_APU_IS_PICASSO; 1999 break; 2000 case CHIP_RENOIR: 2001 if ((adev->pdev->device == 0x1636) || 2002 (adev->pdev->device == 0x164c)) 2003 adev->apu_flags |= AMD_APU_IS_RENOIR; 2004 else 2005 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2006 break; 2007 case CHIP_VANGOGH: 2008 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2009 break; 2010 case CHIP_YELLOW_CARP: 2011 break; 2012 case CHIP_CYAN_SKILLFISH: 2013 if ((adev->pdev->device == 0x13FE) || 2014 (adev->pdev->device == 0x143F)) 2015 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2016 break; 2017 default: 2018 break; 2019 } 2020 2021 return 0; 2022 } 2023 2024 /** 2025 * amdgpu_device_check_arguments - validate module params 2026 * 2027 * @adev: amdgpu_device pointer 2028 * 2029 * Validates certain module parameters and updates 2030 * the associated values used by the driver (all asics). 
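 *
 * Illustrative examples of the adjustments performed below (the parameter
 * values are examples only, not recommendations):
 *   amdgpu.sched_jobs=2  - below the minimum, raised to 4
 *   amdgpu.sched_jobs=6  - not a power of two, rounded up to 8
 *   amdgpu.gart_size=16  - smaller than 32M, reverted to the default (-1)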
2031 */ 2032 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2033 { 2034 int i; 2035 2036 if (amdgpu_sched_jobs < 4) { 2037 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2038 amdgpu_sched_jobs); 2039 amdgpu_sched_jobs = 4; 2040 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2041 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2042 amdgpu_sched_jobs); 2043 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2044 } 2045 2046 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2047 /* gart size must be greater or equal to 32M */ 2048 dev_warn(adev->dev, "gart size (%d) too small\n", 2049 amdgpu_gart_size); 2050 amdgpu_gart_size = -1; 2051 } 2052 2053 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2054 /* gtt size must be greater or equal to 32M */ 2055 dev_warn(adev->dev, "gtt size (%d) too small\n", 2056 amdgpu_gtt_size); 2057 amdgpu_gtt_size = -1; 2058 } 2059 2060 /* valid range is between 4 and 9 inclusive */ 2061 if (amdgpu_vm_fragment_size != -1 && 2062 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2063 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2064 amdgpu_vm_fragment_size = -1; 2065 } 2066 2067 if (amdgpu_sched_hw_submission < 2) { 2068 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2069 amdgpu_sched_hw_submission); 2070 amdgpu_sched_hw_submission = 2; 2071 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2072 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2073 amdgpu_sched_hw_submission); 2074 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2075 } 2076 2077 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2078 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2079 amdgpu_reset_method = -1; 2080 } 2081 2082 amdgpu_device_check_smu_prv_buffer_size(adev); 2083 2084 amdgpu_device_check_vm_size(adev); 2085 2086 amdgpu_device_check_block_size(adev); 2087 2088 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2089 2090 for (i = 0; i < MAX_XCP; i++) 2091 adev->enforce_isolation[i] = !!enforce_isolation; 2092 2093 return 0; 2094 } 2095 2096 /** 2097 * amdgpu_switcheroo_set_state - set switcheroo state 2098 * 2099 * @pdev: pci dev pointer 2100 * @state: vga_switcheroo state 2101 * 2102 * Callback for the switcheroo driver. Suspends or resumes 2103 * the asics before or after it is powered up using ACPI methods. 
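 *
 * This callback is wired into &amdgpu_switcheroo_ops below and registered
 * with vga_switcheroo during device init, roughly as follows (sketch only,
 * the actual call site lives elsewhere in this file):
 *   vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);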
2104 */ 2105 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2106 enum vga_switcheroo_state state) 2107 { 2108 struct drm_device *dev = pci_get_drvdata(pdev); 2109 int r; 2110 2111 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2112 return; 2113 2114 if (state == VGA_SWITCHEROO_ON) { 2115 pr_info("switched on\n"); 2116 /* don't suspend or resume card normally */ 2117 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2118 2119 pci_set_power_state(pdev, PCI_D0); 2120 amdgpu_device_load_pci_state(pdev); 2121 r = pci_enable_device(pdev); 2122 if (r) 2123 DRM_WARN("pci_enable_device failed (%d)\n", r); 2124 amdgpu_device_resume(dev, true); 2125 2126 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2127 } else { 2128 pr_info("switched off\n"); 2129 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2130 amdgpu_device_prepare(dev); 2131 amdgpu_device_suspend(dev, true); 2132 amdgpu_device_cache_pci_state(pdev); 2133 /* Shut down the device */ 2134 pci_disable_device(pdev); 2135 pci_set_power_state(pdev, PCI_D3cold); 2136 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2137 } 2138 } 2139 2140 /** 2141 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2142 * 2143 * @pdev: pci dev pointer 2144 * 2145 * Callback for the switcheroo driver. Checks whether the switcheroo 2146 * state can be changed. 2147 * Returns true if the state can be changed, false if not. 2148 */ 2149 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2150 { 2151 struct drm_device *dev = pci_get_drvdata(pdev); 2152 2153 /* 2154 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2155 * locking inversion with the driver load path. And the access here is 2156 * completely racy anyway. So don't bother with locking for now. 2157 */ 2158 return atomic_read(&dev->open_count) == 0; 2159 } 2160 2161 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2162 .set_gpu_state = amdgpu_switcheroo_set_state, 2163 .reprobe = NULL, 2164 .can_switch = amdgpu_switcheroo_can_switch, 2165 }; 2166 2167 /** 2168 * amdgpu_device_ip_set_clockgating_state - set the CG state 2169 * 2170 * @dev: amdgpu_device pointer 2171 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2172 * @state: clockgating state (gate or ungate) 2173 * 2174 * Sets the requested clockgating state for all instances of 2175 * the hardware IP specified. 2176 * Returns the error code from the last instance. 2177 */ 2178 int amdgpu_device_ip_set_clockgating_state(void *dev, 2179 enum amd_ip_block_type block_type, 2180 enum amd_clockgating_state state) 2181 { 2182 struct amdgpu_device *adev = dev; 2183 int i, r = 0; 2184 2185 for (i = 0; i < adev->num_ip_blocks; i++) { 2186 if (!adev->ip_blocks[i].status.valid) 2187 continue; 2188 if (adev->ip_blocks[i].version->type != block_type) 2189 continue; 2190 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2191 continue; 2192 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2193 &adev->ip_blocks[i], state); 2194 if (r) 2195 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2196 adev->ip_blocks[i].version->funcs->name, r); 2197 } 2198 return r; 2199 } 2200 2201 /** 2202 * amdgpu_device_ip_set_powergating_state - set the PG state 2203 * 2204 * @dev: amdgpu_device pointer 2205 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2206 * @state: powergating state (gate or ungate) 2207 * 2208 * Sets the requested powergating state for all instances of 2209 * the hardware IP specified.
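 *
 * Minimal usage sketch (illustrative only), gating all VCN instances:
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);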
2210 * Returns the error code from the last instance. 2211 */ 2212 int amdgpu_device_ip_set_powergating_state(void *dev, 2213 enum amd_ip_block_type block_type, 2214 enum amd_powergating_state state) 2215 { 2216 struct amdgpu_device *adev = dev; 2217 int i, r = 0; 2218 2219 for (i = 0; i < adev->num_ip_blocks; i++) { 2220 if (!adev->ip_blocks[i].status.valid) 2221 continue; 2222 if (adev->ip_blocks[i].version->type != block_type) 2223 continue; 2224 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2225 continue; 2226 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2227 &adev->ip_blocks[i], state); 2228 if (r) 2229 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2230 adev->ip_blocks[i].version->funcs->name, r); 2231 } 2232 return r; 2233 } 2234 2235 /** 2236 * amdgpu_device_ip_get_clockgating_state - get the CG state 2237 * 2238 * @adev: amdgpu_device pointer 2239 * @flags: clockgating feature flags 2240 * 2241 * Walks the list of IPs on the device and updates the clockgating 2242 * flags for each IP. 2243 * Updates @flags with the feature flags for each hardware IP where 2244 * clockgating is enabled. 2245 */ 2246 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2247 u64 *flags) 2248 { 2249 int i; 2250 2251 for (i = 0; i < adev->num_ip_blocks; i++) { 2252 if (!adev->ip_blocks[i].status.valid) 2253 continue; 2254 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2255 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2256 } 2257 } 2258 2259 /** 2260 * amdgpu_device_ip_wait_for_idle - wait for idle 2261 * 2262 * @adev: amdgpu_device pointer 2263 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2264 * 2265 * Waits for the requested hardware IP to be idle. 2266 * Returns 0 for success or a negative error code on failure. 2267 */ 2268 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2269 enum amd_ip_block_type block_type) 2270 { 2271 int i, r; 2272 2273 for (i = 0; i < adev->num_ip_blocks; i++) { 2274 if (!adev->ip_blocks[i].status.valid) 2275 continue; 2276 if (adev->ip_blocks[i].version->type == block_type) { 2277 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2278 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2279 &adev->ip_blocks[i]); 2280 if (r) 2281 return r; 2282 } 2283 break; 2284 } 2285 } 2286 return 0; 2287 2288 } 2289 2290 /** 2291 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2292 * 2293 * @adev: amdgpu_device pointer 2294 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2295 * 2296 * Checks whether the hardware IP is enabled or not. 2297 * Returns true if the IP is enabled, false if not. 2298 */ 2299 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2300 enum amd_ip_block_type block_type) 2301 { 2302 int i; 2303 2304 for (i = 0; i < adev->num_ip_blocks; i++) { 2305 if (adev->ip_blocks[i].version->type == block_type) 2306 return adev->ip_blocks[i].status.valid; 2307 } 2308 return false; 2309 2310 } 2311 2312 /** 2313 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2314 * 2315 * @adev: amdgpu_device pointer 2316 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2317 * 2318 * Returns a pointer to the hardware IP block structure 2319 * if it exists for the asic, otherwise NULL.
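 *
 * Typical lookup pattern (sketch, mirroring how this file fetches the GFX
 * block during early init):
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->status.valid)
 *           amdgpu_amdkfd_device_probe(adev);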
2320 */ 2321 struct amdgpu_ip_block * 2322 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2323 enum amd_ip_block_type type) 2324 { 2325 int i; 2326 2327 for (i = 0; i < adev->num_ip_blocks; i++) 2328 if (adev->ip_blocks[i].version->type == type) 2329 return &adev->ip_blocks[i]; 2330 2331 return NULL; 2332 } 2333 2334 /** 2335 * amdgpu_device_ip_block_version_cmp 2336 * 2337 * @adev: amdgpu_device pointer 2338 * @type: enum amd_ip_block_type 2339 * @major: major version 2340 * @minor: minor version 2341 * 2342 * return 0 if equal or greater 2343 * return 1 if smaller or the ip_block doesn't exist 2344 */ 2345 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2346 enum amd_ip_block_type type, 2347 u32 major, u32 minor) 2348 { 2349 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2350 2351 if (ip_block && ((ip_block->version->major > major) || 2352 ((ip_block->version->major == major) && 2353 (ip_block->version->minor >= minor)))) 2354 return 0; 2355 2356 return 1; 2357 } 2358 2359 /** 2360 * amdgpu_device_ip_block_add 2361 * 2362 * @adev: amdgpu_device pointer 2363 * @ip_block_version: pointer to the IP to add 2364 * 2365 * Adds the IP block driver information to the collection of IPs 2366 * on the asic. 2367 */ 2368 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2369 const struct amdgpu_ip_block_version *ip_block_version) 2370 { 2371 if (!ip_block_version) 2372 return -EINVAL; 2373 2374 switch (ip_block_version->type) { 2375 case AMD_IP_BLOCK_TYPE_VCN: 2376 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2377 return 0; 2378 break; 2379 case AMD_IP_BLOCK_TYPE_JPEG: 2380 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2381 return 0; 2382 break; 2383 default: 2384 break; 2385 } 2386 2387 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2388 adev->num_ip_blocks, ip_block_version->funcs->name); 2389 2390 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2391 2392 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2393 2394 return 0; 2395 } 2396 2397 /** 2398 * amdgpu_device_enable_virtual_display - enable virtual display feature 2399 * 2400 * @adev: amdgpu_device pointer 2401 * 2402 * Enabled the virtual display feature if the user has enabled it via 2403 * the module parameter virtual_display. This feature provides a virtual 2404 * display hardware on headless boards or in virtualized environments. 2405 * This function parses and validates the configuration string specified by 2406 * the user and configures the virtual display configuration (number of 2407 * virtual connectors, crtcs, etc.) specified. 
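 *
 * The string is a semicolon-separated list of "<pci address>,<crtc count>"
 * entries (the count is optional and clamped to 1-6), with "all" matching
 * every device. The addresses below are examples only:
 *   amdgpu.virtual_display=0000:03:00.0,2;0000:04:00.0,1
 *   amdgpu.virtual_display=all,1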
2408 */ 2409 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2410 { 2411 adev->enable_virtual_display = false; 2412 2413 if (amdgpu_virtual_display) { 2414 const char *pci_address_name = pci_name(adev->pdev); 2415 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2416 2417 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2418 pciaddstr_tmp = pciaddstr; 2419 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2420 pciaddname = strsep(&pciaddname_tmp, ","); 2421 if (!strcmp("all", pciaddname) 2422 || !strcmp(pci_address_name, pciaddname)) { 2423 long num_crtc; 2424 int res = -1; 2425 2426 adev->enable_virtual_display = true; 2427 2428 if (pciaddname_tmp) 2429 res = kstrtol(pciaddname_tmp, 10, 2430 &num_crtc); 2431 2432 if (!res) { 2433 if (num_crtc < 1) 2434 num_crtc = 1; 2435 if (num_crtc > 6) 2436 num_crtc = 6; 2437 adev->mode_info.num_crtc = num_crtc; 2438 } else { 2439 adev->mode_info.num_crtc = 1; 2440 } 2441 break; 2442 } 2443 } 2444 2445 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2446 amdgpu_virtual_display, pci_address_name, 2447 adev->enable_virtual_display, adev->mode_info.num_crtc); 2448 2449 kfree(pciaddstr); 2450 } 2451 } 2452 2453 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2454 { 2455 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2456 adev->mode_info.num_crtc = 1; 2457 adev->enable_virtual_display = true; 2458 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2459 adev->enable_virtual_display, adev->mode_info.num_crtc); 2460 } 2461 } 2462 2463 /** 2464 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2465 * 2466 * @adev: amdgpu_device pointer 2467 * 2468 * Parses the asic configuration parameters specified in the gpu info 2469 * firmware and makes them available to the driver for use in configuring 2470 * the asic. 2471 * Returns 0 on success, -EINVAL on failure. 2472 */ 2473 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2474 { 2475 const char *chip_name; 2476 int err; 2477 const struct gpu_info_firmware_header_v1_0 *hdr; 2478 2479 adev->firmware.gpu_info_fw = NULL; 2480 2481 if (adev->mman.discovery_bin) 2482 return 0; 2483 2484 switch (adev->asic_type) { 2485 default: 2486 return 0; 2487 case CHIP_VEGA10: 2488 chip_name = "vega10"; 2489 break; 2490 case CHIP_VEGA12: 2491 chip_name = "vega12"; 2492 break; 2493 case CHIP_RAVEN: 2494 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2495 chip_name = "raven2"; 2496 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2497 chip_name = "picasso"; 2498 else 2499 chip_name = "raven"; 2500 break; 2501 case CHIP_ARCTURUS: 2502 chip_name = "arcturus"; 2503 break; 2504 case CHIP_NAVI12: 2505 chip_name = "navi12"; 2506 break; 2507 } 2508 2509 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2510 AMDGPU_UCODE_OPTIONAL, 2511 "amdgpu/%s_gpu_info.bin", chip_name); 2512 if (err) { 2513 dev_err(adev->dev, 2514 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2515 chip_name); 2516 goto out; 2517 } 2518 2519 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2520 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2521 2522 switch (hdr->version_major) { 2523 case 1: 2524 { 2525 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2526 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2527 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2528 2529 /* 2530 * Should be dropped when DAL no longer needs it. 
2531 */ 2532 if (adev->asic_type == CHIP_NAVI12) 2533 goto parse_soc_bounding_box; 2534 2535 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2536 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2537 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2538 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2539 adev->gfx.config.max_texture_channel_caches = 2540 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2541 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2542 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2543 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2544 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2545 adev->gfx.config.double_offchip_lds_buf = 2546 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2547 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2548 adev->gfx.cu_info.max_waves_per_simd = 2549 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2550 adev->gfx.cu_info.max_scratch_slots_per_cu = 2551 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2552 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2553 if (hdr->version_minor >= 1) { 2554 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2555 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2556 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2557 adev->gfx.config.num_sc_per_sh = 2558 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2559 adev->gfx.config.num_packer_per_sc = 2560 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2561 } 2562 2563 parse_soc_bounding_box: 2564 /* 2565 * soc bounding box info is not integrated into the discovery table, 2566 * so we always need to parse it from the gpu info firmware when needed. 2567 */ 2568 if (hdr->version_minor == 2) { 2569 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2570 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2571 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2572 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2573 } 2574 break; 2575 } 2576 default: 2577 dev_err(adev->dev, 2578 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2579 err = -EINVAL; 2580 goto out; 2581 } 2582 out: 2583 return err; 2584 } 2585 2586 /** 2587 * amdgpu_device_ip_early_init - run early init for hardware IPs 2588 * 2589 * @adev: amdgpu_device pointer 2590 * 2591 * Early initialization pass for hardware IPs. The hardware IPs that make 2592 * up each asic are discovered and each IP's early_init callback is run. This 2593 * is the first stage in initializing the asic. 2594 * Returns 0 on success, negative error code on failure.
2595 */ 2596 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2597 { 2598 struct amdgpu_ip_block *ip_block; 2599 struct pci_dev *parent; 2600 bool total, skip_bios; 2601 uint32_t bios_flags; 2602 int i, r; 2603 2604 amdgpu_device_enable_virtual_display(adev); 2605 2606 if (amdgpu_sriov_vf(adev)) { 2607 r = amdgpu_virt_request_full_gpu(adev, true); 2608 if (r) 2609 return r; 2610 } 2611 2612 switch (adev->asic_type) { 2613 #ifdef CONFIG_DRM_AMDGPU_SI 2614 case CHIP_VERDE: 2615 case CHIP_TAHITI: 2616 case CHIP_PITCAIRN: 2617 case CHIP_OLAND: 2618 case CHIP_HAINAN: 2619 adev->family = AMDGPU_FAMILY_SI; 2620 r = si_set_ip_blocks(adev); 2621 if (r) 2622 return r; 2623 break; 2624 #endif 2625 #ifdef CONFIG_DRM_AMDGPU_CIK 2626 case CHIP_BONAIRE: 2627 case CHIP_HAWAII: 2628 case CHIP_KAVERI: 2629 case CHIP_KABINI: 2630 case CHIP_MULLINS: 2631 if (adev->flags & AMD_IS_APU) 2632 adev->family = AMDGPU_FAMILY_KV; 2633 else 2634 adev->family = AMDGPU_FAMILY_CI; 2635 2636 r = cik_set_ip_blocks(adev); 2637 if (r) 2638 return r; 2639 break; 2640 #endif 2641 case CHIP_TOPAZ: 2642 case CHIP_TONGA: 2643 case CHIP_FIJI: 2644 case CHIP_POLARIS10: 2645 case CHIP_POLARIS11: 2646 case CHIP_POLARIS12: 2647 case CHIP_VEGAM: 2648 case CHIP_CARRIZO: 2649 case CHIP_STONEY: 2650 if (adev->flags & AMD_IS_APU) 2651 adev->family = AMDGPU_FAMILY_CZ; 2652 else 2653 adev->family = AMDGPU_FAMILY_VI; 2654 2655 r = vi_set_ip_blocks(adev); 2656 if (r) 2657 return r; 2658 break; 2659 default: 2660 r = amdgpu_discovery_set_ip_blocks(adev); 2661 if (r) 2662 return r; 2663 break; 2664 } 2665 2666 if (amdgpu_has_atpx() && 2667 (amdgpu_is_atpx_hybrid() || 2668 amdgpu_has_atpx_dgpu_power_cntl()) && 2669 ((adev->flags & AMD_IS_APU) == 0) && 2670 !dev_is_removable(&adev->pdev->dev)) 2671 adev->flags |= AMD_IS_PX; 2672 2673 if (!(adev->flags & AMD_IS_APU)) { 2674 parent = pcie_find_root_port(adev->pdev); 2675 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2676 } 2677 2678 2679 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2680 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2681 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2682 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2683 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2684 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2685 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2686 2687 total = true; 2688 for (i = 0; i < adev->num_ip_blocks; i++) { 2689 ip_block = &adev->ip_blocks[i]; 2690 2691 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2692 DRM_WARN("disabled ip block: %d <%s>\n", 2693 i, adev->ip_blocks[i].version->funcs->name); 2694 adev->ip_blocks[i].status.valid = false; 2695 } else if (ip_block->version->funcs->early_init) { 2696 r = ip_block->version->funcs->early_init(ip_block); 2697 if (r == -ENOENT) { 2698 adev->ip_blocks[i].status.valid = false; 2699 } else if (r) { 2700 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2701 adev->ip_blocks[i].version->funcs->name, r); 2702 total = false; 2703 } else { 2704 adev->ip_blocks[i].status.valid = true; 2705 } 2706 } else { 2707 adev->ip_blocks[i].status.valid = true; 2708 } 2709 /* get the vbios after the asic_funcs are set up */ 2710 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2711 r = amdgpu_device_parse_gpu_info_fw(adev); 2712 if (r) 2713 return r; 2714 2715 bios_flags = amdgpu_device_get_vbios_flags(adev); 2716 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2717 /* Read BIOS */ 2718 if (!skip_bios) { 2719 bool optional = 2720 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2721 if (!amdgpu_get_bios(adev) && !optional) 2722 return -EINVAL; 2723 2724 if (optional && !adev->bios) 2725 dev_info( 2726 adev->dev, 2727 "VBIOS image optional, proceeding without VBIOS image"); 2728 2729 if (adev->bios) { 2730 r = amdgpu_atombios_init(adev); 2731 if (r) { 2732 dev_err(adev->dev, 2733 "amdgpu_atombios_init failed\n"); 2734 amdgpu_vf_error_put( 2735 adev, 2736 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2737 0, 0); 2738 return r; 2739 } 2740 } 2741 } 2742 2743 /*get pf2vf msg info at it's earliest time*/ 2744 if (amdgpu_sriov_vf(adev)) 2745 amdgpu_virt_init_data_exchange(adev); 2746 2747 } 2748 } 2749 if (!total) 2750 return -ENODEV; 2751 2752 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2753 if (ip_block->status.valid != false) 2754 amdgpu_amdkfd_device_probe(adev); 2755 2756 adev->cg_flags &= amdgpu_cg_mask; 2757 adev->pg_flags &= amdgpu_pg_mask; 2758 2759 return 0; 2760 } 2761 2762 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2763 { 2764 int i, r; 2765 2766 for (i = 0; i < adev->num_ip_blocks; i++) { 2767 if (!adev->ip_blocks[i].status.sw) 2768 continue; 2769 if (adev->ip_blocks[i].status.hw) 2770 continue; 2771 if (!amdgpu_ip_member_of_hwini( 2772 adev, adev->ip_blocks[i].version->type)) 2773 continue; 2774 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2775 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2776 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2777 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2778 if (r) { 2779 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2780 adev->ip_blocks[i].version->funcs->name, r); 2781 return r; 2782 } 2783 adev->ip_blocks[i].status.hw = true; 2784 } 2785 } 2786 2787 return 0; 2788 } 2789 2790 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device 
*adev) 2791 { 2792 int i, r; 2793 2794 for (i = 0; i < adev->num_ip_blocks; i++) { 2795 if (!adev->ip_blocks[i].status.sw) 2796 continue; 2797 if (adev->ip_blocks[i].status.hw) 2798 continue; 2799 if (!amdgpu_ip_member_of_hwini( 2800 adev, adev->ip_blocks[i].version->type)) 2801 continue; 2802 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2803 if (r) { 2804 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2805 adev->ip_blocks[i].version->funcs->name, r); 2806 return r; 2807 } 2808 adev->ip_blocks[i].status.hw = true; 2809 } 2810 2811 return 0; 2812 } 2813 2814 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2815 { 2816 int r = 0; 2817 int i; 2818 uint32_t smu_version; 2819 2820 if (adev->asic_type >= CHIP_VEGA10) { 2821 for (i = 0; i < adev->num_ip_blocks; i++) { 2822 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2823 continue; 2824 2825 if (!amdgpu_ip_member_of_hwini(adev, 2826 AMD_IP_BLOCK_TYPE_PSP)) 2827 break; 2828 2829 if (!adev->ip_blocks[i].status.sw) 2830 continue; 2831 2832 /* no need to do the fw loading again if already done*/ 2833 if (adev->ip_blocks[i].status.hw == true) 2834 break; 2835 2836 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2837 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2838 if (r) 2839 return r; 2840 } else { 2841 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2842 if (r) { 2843 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2844 adev->ip_blocks[i].version->funcs->name, r); 2845 return r; 2846 } 2847 adev->ip_blocks[i].status.hw = true; 2848 } 2849 break; 2850 } 2851 } 2852 2853 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2854 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2855 2856 return r; 2857 } 2858 2859 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2860 { 2861 long timeout; 2862 int r, i; 2863 2864 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2865 struct amdgpu_ring *ring = adev->rings[i]; 2866 2867 /* No need to setup the GPU scheduler for rings that don't need it */ 2868 if (!ring || ring->no_scheduler) 2869 continue; 2870 2871 switch (ring->funcs->type) { 2872 case AMDGPU_RING_TYPE_GFX: 2873 timeout = adev->gfx_timeout; 2874 break; 2875 case AMDGPU_RING_TYPE_COMPUTE: 2876 timeout = adev->compute_timeout; 2877 break; 2878 case AMDGPU_RING_TYPE_SDMA: 2879 timeout = adev->sdma_timeout; 2880 break; 2881 default: 2882 timeout = adev->video_timeout; 2883 break; 2884 } 2885 2886 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2887 DRM_SCHED_PRIORITY_COUNT, 2888 ring->num_hw_submission, 0, 2889 timeout, adev->reset_domain->wq, 2890 ring->sched_score, ring->name, 2891 adev->dev); 2892 if (r) { 2893 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2894 ring->name); 2895 return r; 2896 } 2897 r = amdgpu_uvd_entity_init(adev, ring); 2898 if (r) { 2899 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2900 ring->name); 2901 return r; 2902 } 2903 r = amdgpu_vce_entity_init(adev, ring); 2904 if (r) { 2905 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2906 ring->name); 2907 return r; 2908 } 2909 } 2910 2911 amdgpu_xcp_update_partition_sched_list(adev); 2912 2913 return 0; 2914 } 2915 2916 2917 /** 2918 * amdgpu_device_ip_init - run init for hardware IPs 2919 * 2920 * @adev: amdgpu_device pointer 2921 * 2922 * Main initialization pass for hardware IPs. The list of all the hardware 2923 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2924 * are run. 
sw_init initializes the software state associated with each IP 2925 * and hw_init initializes the hardware associated with each IP. 2926 * Returns 0 on success, negative error code on failure. 2927 */ 2928 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2929 { 2930 bool init_badpage; 2931 int i, r; 2932 2933 r = amdgpu_ras_init(adev); 2934 if (r) 2935 return r; 2936 2937 for (i = 0; i < adev->num_ip_blocks; i++) { 2938 if (!adev->ip_blocks[i].status.valid) 2939 continue; 2940 if (adev->ip_blocks[i].version->funcs->sw_init) { 2941 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2942 if (r) { 2943 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2944 adev->ip_blocks[i].version->funcs->name, r); 2945 goto init_failed; 2946 } 2947 } 2948 adev->ip_blocks[i].status.sw = true; 2949 2950 if (!amdgpu_ip_member_of_hwini( 2951 adev, adev->ip_blocks[i].version->type)) 2952 continue; 2953 2954 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2955 /* need to do common hw init early so everything is set up for gmc */ 2956 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2957 if (r) { 2958 DRM_ERROR("hw_init %d failed %d\n", i, r); 2959 goto init_failed; 2960 } 2961 adev->ip_blocks[i].status.hw = true; 2962 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2963 /* need to do gmc hw init early so we can allocate gpu mem */ 2964 /* Try to reserve bad pages early */ 2965 if (amdgpu_sriov_vf(adev)) 2966 amdgpu_virt_exchange_data(adev); 2967 2968 r = amdgpu_device_mem_scratch_init(adev); 2969 if (r) { 2970 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2971 goto init_failed; 2972 } 2973 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2974 if (r) { 2975 DRM_ERROR("hw_init %d failed %d\n", i, r); 2976 goto init_failed; 2977 } 2978 r = amdgpu_device_wb_init(adev); 2979 if (r) { 2980 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2981 goto init_failed; 2982 } 2983 adev->ip_blocks[i].status.hw = true; 2984 2985 /* right after GMC hw init, we create CSA */ 2986 if (adev->gfx.mcbp) { 2987 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2988 AMDGPU_GEM_DOMAIN_VRAM | 2989 AMDGPU_GEM_DOMAIN_GTT, 2990 AMDGPU_CSA_SIZE); 2991 if (r) { 2992 DRM_ERROR("allocate CSA failed %d\n", r); 2993 goto init_failed; 2994 } 2995 } 2996 2997 r = amdgpu_seq64_init(adev); 2998 if (r) { 2999 DRM_ERROR("allocate seq64 failed %d\n", r); 3000 goto init_failed; 3001 } 3002 } 3003 } 3004 3005 if (amdgpu_sriov_vf(adev)) 3006 amdgpu_virt_init_data_exchange(adev); 3007 3008 r = amdgpu_ib_pool_init(adev); 3009 if (r) { 3010 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3011 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3012 goto init_failed; 3013 } 3014 3015 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3016 if (r) 3017 goto init_failed; 3018 3019 r = amdgpu_device_ip_hw_init_phase1(adev); 3020 if (r) 3021 goto init_failed; 3022 3023 r = amdgpu_device_fw_loading(adev); 3024 if (r) 3025 goto init_failed; 3026 3027 r = amdgpu_device_ip_hw_init_phase2(adev); 3028 if (r) 3029 goto init_failed; 3030 3031 /* 3032 * retired pages will be loaded from eeprom and reserved here, 3033 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3034 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3035 * for I2C communication which only true at this point. 
3036 * 3037 * amdgpu_ras_recovery_init may fail, but the upper layers only care about 3038 * failures caused by a bad gpu state and stop the amdgpu init process 3039 * accordingly. For other failures, it still releases all 3040 * the resources and prints an error message rather than returning a 3041 * negative value to the upper level. 3042 * 3043 * Note: theoretically, this should be called before all vram allocations 3044 * to keep retired pages from being handed out to other allocations 3045 */ 3046 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3047 r = amdgpu_ras_recovery_init(adev, init_badpage); 3048 if (r) 3049 goto init_failed; 3050 3051 /* 3052 * In case of XGMI, grab an extra reference on the reset domain for this device 3053 */ 3054 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3055 if (amdgpu_xgmi_add_device(adev) == 0) { 3056 if (!amdgpu_sriov_vf(adev)) { 3057 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3058 3059 if (WARN_ON(!hive)) { 3060 r = -ENOENT; 3061 goto init_failed; 3062 } 3063 3064 if (!hive->reset_domain || 3065 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3066 r = -ENOENT; 3067 amdgpu_put_xgmi_hive(hive); 3068 goto init_failed; 3069 } 3070 3071 /* Drop the early temporary reset domain we created for device */ 3072 amdgpu_reset_put_reset_domain(adev->reset_domain); 3073 adev->reset_domain = hive->reset_domain; 3074 amdgpu_put_xgmi_hive(hive); 3075 } 3076 } 3077 } 3078 3079 r = amdgpu_device_init_schedulers(adev); 3080 if (r) 3081 goto init_failed; 3082 3083 if (adev->mman.buffer_funcs_ring->sched.ready) 3084 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3085 3086 /* Don't init kfd if whole hive need to be reset during init */ 3087 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3088 kgd2kfd_init_zone_device(adev); 3089 amdgpu_amdkfd_device_init(adev); 3090 } 3091 3092 amdgpu_fru_get_product_info(adev); 3093 3094 r = amdgpu_cper_init(adev); 3095 3096 init_failed: 3097 3098 return r; 3099 } 3100 3101 /** 3102 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3103 * 3104 * @adev: amdgpu_device pointer 3105 * 3106 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3107 * this function before a GPU reset. If the value is retained after a 3108 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3109 */ 3110 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3111 { 3112 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3113 } 3114 3115 /** 3116 * amdgpu_device_check_vram_lost - check if vram is valid 3117 * 3118 * @adev: amdgpu_device pointer 3119 * 3120 * Checks the reset magic value written to the gart pointer in VRAM. 3121 * The driver calls this after a GPU reset to see if the contents of 3122 * VRAM are lost or not. 3123 * Returns true if vram is lost, false if not. 3124 */ 3125 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3126 { 3127 if (memcmp(adev->gart.ptr, adev->reset_magic, 3128 AMDGPU_RESET_MAGIC_NUM)) 3129 return true; 3130 3131 if (!amdgpu_in_reset(adev)) 3132 return false; 3133 3134 /* 3135 * For all ASICs with baco/mode1 reset, the VRAM is 3136 * always assumed to be lost.
3137 */ 3138 switch (amdgpu_asic_reset_method(adev)) { 3139 case AMD_RESET_METHOD_BACO: 3140 case AMD_RESET_METHOD_MODE1: 3141 return true; 3142 default: 3143 return false; 3144 } 3145 } 3146 3147 /** 3148 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3149 * 3150 * @adev: amdgpu_device pointer 3151 * @state: clockgating state (gate or ungate) 3152 * 3153 * The list of all the hardware IPs that make up the asic is walked and the 3154 * set_clockgating_state callbacks are run. 3155 * Late initialization pass enabling clockgating for hardware IPs. 3156 * Fini or suspend, pass disabling clockgating for hardware IPs. 3157 * Returns 0 on success, negative error code on failure. 3158 */ 3159 3160 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3161 enum amd_clockgating_state state) 3162 { 3163 int i, j, r; 3164 3165 if (amdgpu_emu_mode == 1) 3166 return 0; 3167 3168 for (j = 0; j < adev->num_ip_blocks; j++) { 3169 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3170 if (!adev->ip_blocks[i].status.late_initialized) 3171 continue; 3172 /* skip CG for GFX, SDMA on S0ix */ 3173 if (adev->in_s0ix && 3174 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3175 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3176 continue; 3177 /* skip CG for VCE/UVD, it's handled specially */ 3178 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3179 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3180 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3181 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3182 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3183 /* enable clockgating to save power */ 3184 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3185 state); 3186 if (r) { 3187 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3188 adev->ip_blocks[i].version->funcs->name, r); 3189 return r; 3190 } 3191 } 3192 } 3193 3194 return 0; 3195 } 3196 3197 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3198 enum amd_powergating_state state) 3199 { 3200 int i, j, r; 3201 3202 if (amdgpu_emu_mode == 1) 3203 return 0; 3204 3205 for (j = 0; j < adev->num_ip_blocks; j++) { 3206 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3207 if (!adev->ip_blocks[i].status.late_initialized) 3208 continue; 3209 /* skip PG for GFX, SDMA on S0ix */ 3210 if (adev->in_s0ix && 3211 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3212 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3213 continue; 3214 /* skip PG for VCE/UVD, it's handled specially */ 3215 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3216 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3217 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3218 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3219 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3220 /* enable powergating to save power */ 3221 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3222 state); 3223 if (r) { 3224 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3225 adev->ip_blocks[i].version->funcs->name, r); 3226 return r; 3227 } 3228 } 3229 } 3230 return 0; 3231 } 3232 3233 static int amdgpu_device_enable_mgpu_fan_boost(void) 3234 { 3235 struct amdgpu_gpu_instance *gpu_ins; 3236 struct amdgpu_device *adev; 3237 int i, ret = 0; 3238 3239 mutex_lock(&mgpu_info.mutex); 3240 3241 /* 3242 * MGPU fan boost feature should be enabled 3243 * only when there are two or more dGPUs in 3244 * the system 3245 */ 3246 if (mgpu_info.num_dgpu < 2) 3247 goto out; 3248 3249 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3250 gpu_ins = &(mgpu_info.gpu_ins[i]); 3251 adev = gpu_ins->adev; 3252 if (!(adev->flags & AMD_IS_APU) && 3253 !gpu_ins->mgpu_fan_enabled) { 3254 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3255 if (ret) 3256 break; 3257 3258 gpu_ins->mgpu_fan_enabled = 1; 3259 } 3260 } 3261 3262 out: 3263 mutex_unlock(&mgpu_info.mutex); 3264 3265 return ret; 3266 } 3267 3268 /** 3269 * amdgpu_device_ip_late_init - run late init for hardware IPs 3270 * 3271 * @adev: amdgpu_device pointer 3272 * 3273 * Late initialization pass for hardware IPs. The list of all the hardware 3274 * IPs that make up the asic is walked and the late_init callbacks are run. 3275 * late_init covers any special initialization that an IP requires 3276 * after all of them have been initialized or something that needs to happen 3277 * late in the init process. 3278 * Returns 0 on success, negative error code on failure.
3279 */ 3280 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3281 { 3282 struct amdgpu_gpu_instance *gpu_instance; 3283 int i = 0, r; 3284 3285 for (i = 0; i < adev->num_ip_blocks; i++) { 3286 if (!adev->ip_blocks[i].status.hw) 3287 continue; 3288 if (adev->ip_blocks[i].version->funcs->late_init) { 3289 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3290 if (r) { 3291 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3292 adev->ip_blocks[i].version->funcs->name, r); 3293 return r; 3294 } 3295 } 3296 adev->ip_blocks[i].status.late_initialized = true; 3297 } 3298 3299 r = amdgpu_ras_late_init(adev); 3300 if (r) { 3301 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3302 return r; 3303 } 3304 3305 if (!amdgpu_reset_in_recovery(adev)) 3306 amdgpu_ras_set_error_query_ready(adev, true); 3307 3308 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3309 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3310 3311 amdgpu_device_fill_reset_magic(adev); 3312 3313 r = amdgpu_device_enable_mgpu_fan_boost(); 3314 if (r) 3315 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3316 3317 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3318 if (amdgpu_passthrough(adev) && 3319 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3320 adev->asic_type == CHIP_ALDEBARAN)) 3321 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3322 3323 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3324 mutex_lock(&mgpu_info.mutex); 3325 3326 /* 3327 * Reset the device p-state to low, as it was booted with high. 3328 * 3329 * This should be performed only after all devices from the same 3330 * hive have been initialized. 3331 * 3332 * However, the number of devices in the hive is not known in advance, 3333 * as they are counted one by one during device initialization. 3334 * 3335 * So we wait until all XGMI interlinked devices are initialized. 3336 * This may add some delay as those devices may come from 3337 * different hives, but that should be OK.
3338 */ 3339 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3340 for (i = 0; i < mgpu_info.num_gpu; i++) { 3341 gpu_instance = &(mgpu_info.gpu_ins[i]); 3342 if (gpu_instance->adev->flags & AMD_IS_APU) 3343 continue; 3344 3345 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3346 AMDGPU_XGMI_PSTATE_MIN); 3347 if (r) { 3348 DRM_ERROR("pstate setting failed (%d).\n", r); 3349 break; 3350 } 3351 } 3352 } 3353 3354 mutex_unlock(&mgpu_info.mutex); 3355 } 3356 3357 return 0; 3358 } 3359 3360 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3361 { 3362 int r; 3363 3364 if (!ip_block->version->funcs->hw_fini) { 3365 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3366 ip_block->version->funcs->name); 3367 } else { 3368 r = ip_block->version->funcs->hw_fini(ip_block); 3369 /* XXX handle errors */ 3370 if (r) { 3371 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3372 ip_block->version->funcs->name, r); 3373 } 3374 } 3375 3376 ip_block->status.hw = false; 3377 } 3378 3379 /** 3380 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3381 * 3382 * @adev: amdgpu_device pointer 3383 * 3384 * For ASICs need to disable SMC first 3385 */ 3386 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3387 { 3388 int i; 3389 3390 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3391 return; 3392 3393 for (i = 0; i < adev->num_ip_blocks; i++) { 3394 if (!adev->ip_blocks[i].status.hw) 3395 continue; 3396 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3397 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3398 break; 3399 } 3400 } 3401 } 3402 3403 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3404 { 3405 int i, r; 3406 3407 for (i = 0; i < adev->num_ip_blocks; i++) { 3408 if (!adev->ip_blocks[i].version->funcs->early_fini) 3409 continue; 3410 3411 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3412 if (r) { 3413 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3414 adev->ip_blocks[i].version->funcs->name, r); 3415 } 3416 } 3417 3418 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3419 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3420 3421 amdgpu_amdkfd_suspend(adev, false); 3422 3423 /* Workaround for ASICs need to disable SMC first */ 3424 amdgpu_device_smu_fini_early(adev); 3425 3426 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3427 if (!adev->ip_blocks[i].status.hw) 3428 continue; 3429 3430 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3431 } 3432 3433 if (amdgpu_sriov_vf(adev)) { 3434 if (amdgpu_virt_release_full_gpu(adev, false)) 3435 DRM_ERROR("failed to release exclusive mode on fini\n"); 3436 } 3437 3438 return 0; 3439 } 3440 3441 /** 3442 * amdgpu_device_ip_fini - run fini for hardware IPs 3443 * 3444 * @adev: amdgpu_device pointer 3445 * 3446 * Main teardown pass for hardware IPs. The list of all the hardware 3447 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3448 * are run. hw_fini tears down the hardware associated with each IP 3449 * and sw_fini tears down any software state associated with each IP. 3450 * Returns 0 on success, negative error code on failure. 
3451 */ 3452 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3453 { 3454 int i, r; 3455 3456 amdgpu_cper_fini(adev); 3457 3458 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3459 amdgpu_virt_release_ras_err_handler_data(adev); 3460 3461 if (adev->gmc.xgmi.num_physical_nodes > 1) 3462 amdgpu_xgmi_remove_device(adev); 3463 3464 amdgpu_amdkfd_device_fini_sw(adev); 3465 3466 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3467 if (!adev->ip_blocks[i].status.sw) 3468 continue; 3469 3470 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3471 amdgpu_ucode_free_bo(adev); 3472 amdgpu_free_static_csa(&adev->virt.csa_obj); 3473 amdgpu_device_wb_fini(adev); 3474 amdgpu_device_mem_scratch_fini(adev); 3475 amdgpu_ib_pool_fini(adev); 3476 amdgpu_seq64_fini(adev); 3477 } 3478 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3479 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3480 /* XXX handle errors */ 3481 if (r) { 3482 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3483 adev->ip_blocks[i].version->funcs->name, r); 3484 } 3485 } 3486 adev->ip_blocks[i].status.sw = false; 3487 adev->ip_blocks[i].status.valid = false; 3488 } 3489 3490 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3491 if (!adev->ip_blocks[i].status.late_initialized) 3492 continue; 3493 if (adev->ip_blocks[i].version->funcs->late_fini) 3494 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3495 adev->ip_blocks[i].status.late_initialized = false; 3496 } 3497 3498 amdgpu_ras_fini(adev); 3499 3500 return 0; 3501 } 3502 3503 /** 3504 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3505 * 3506 * @work: work_struct. 3507 */ 3508 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3509 { 3510 struct amdgpu_device *adev = 3511 container_of(work, struct amdgpu_device, delayed_init_work.work); 3512 int r; 3513 3514 r = amdgpu_ib_ring_tests(adev); 3515 if (r) 3516 DRM_ERROR("ib ring test failed (%d).\n", r); 3517 } 3518 3519 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3520 { 3521 struct amdgpu_device *adev = 3522 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3523 3524 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3525 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3526 3527 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3528 adev->gfx.gfx_off_state = true; 3529 } 3530 3531 /** 3532 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3533 * 3534 * @adev: amdgpu_device pointer 3535 * 3536 * Main suspend function for hardware IPs. The list of all the hardware 3537 * IPs that make up the asic is walked, clockgating is disabled and the 3538 * suspend callbacks are run. suspend puts the hardware and software state 3539 * in each IP into a state suitable for suspend. 3540 * Returns 0 on success, negative error code on failure. 3541 */ 3542 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3543 { 3544 int i, r; 3545 3546 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3547 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3548 3549 /* 3550 * Per PMFW team's suggestion, driver needs to handle gfxoff 3551 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3552 * scenario. Add the missing df cstate disablement here. 
3553 */ 3554 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3555 dev_warn(adev->dev, "Failed to disallow df cstate"); 3556 3557 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3558 if (!adev->ip_blocks[i].status.valid) 3559 continue; 3560 3561 /* displays are handled separately */ 3562 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3563 continue; 3564 3565 /* XXX handle errors */ 3566 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3567 if (r) 3568 return r; 3569 } 3570 3571 return 0; 3572 } 3573 3574 /** 3575 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3576 * 3577 * @adev: amdgpu_device pointer 3578 * 3579 * Main suspend function for hardware IPs. The list of all the hardware 3580 * IPs that make up the asic is walked, clockgating is disabled and the 3581 * suspend callbacks are run. suspend puts the hardware and software state 3582 * in each IP into a state suitable for suspend. 3583 * Returns 0 on success, negative error code on failure. 3584 */ 3585 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3586 { 3587 int i, r; 3588 3589 if (adev->in_s0ix) 3590 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3591 3592 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3593 if (!adev->ip_blocks[i].status.valid) 3594 continue; 3595 /* displays are handled in phase1 */ 3596 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3597 continue; 3598 /* PSP lost connection when err_event_athub occurs */ 3599 if (amdgpu_ras_intr_triggered() && 3600 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3601 adev->ip_blocks[i].status.hw = false; 3602 continue; 3603 } 3604 3605 /* skip unnecessary suspend if we do not initialize them yet */ 3606 if (!amdgpu_ip_member_of_hwini( 3607 adev, adev->ip_blocks[i].version->type)) 3608 continue; 3609 3610 /* skip suspend of gfx/mes and psp for S0ix 3611 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3612 * like at runtime. PSP is also part of the always on hardware 3613 * so no need to suspend it. 3614 */ 3615 if (adev->in_s0ix && 3616 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3617 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3618 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3619 continue; 3620 3621 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3622 if (adev->in_s0ix && 3623 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3624 IP_VERSION(5, 0, 0)) && 3625 (adev->ip_blocks[i].version->type == 3626 AMD_IP_BLOCK_TYPE_SDMA)) 3627 continue; 3628 3629 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3630 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3631 * from this location and RLC Autoload automatically also gets loaded 3632 * from here based on PMFW -> PSP message during re-init sequence. 3633 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3634 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3635 */ 3636 if (amdgpu_in_reset(adev) && 3637 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3638 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3639 continue; 3640 3641 /* XXX handle errors */ 3642 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3643 adev->ip_blocks[i].status.hw = false; 3644 3645 /* handle putting the SMC in the appropriate state */ 3646 if (!amdgpu_sriov_vf(adev)) { 3647 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3648 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3649 if (r) { 3650 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3651 adev->mp1_state, r); 3652 return r; 3653 } 3654 } 3655 } 3656 } 3657 3658 return 0; 3659 } 3660 3661 /** 3662 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3663 * 3664 * @adev: amdgpu_device pointer 3665 * 3666 * Main suspend function for hardware IPs. The list of all the hardware 3667 * IPs that make up the asic is walked, clockgating is disabled and the 3668 * suspend callbacks are run. suspend puts the hardware and software state 3669 * in each IP into a state suitable for suspend. 3670 * Returns 0 on success, negative error code on failure. 3671 */ 3672 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3673 { 3674 int r; 3675 3676 if (amdgpu_sriov_vf(adev)) { 3677 amdgpu_virt_fini_data_exchange(adev); 3678 amdgpu_virt_request_full_gpu(adev, false); 3679 } 3680 3681 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3682 3683 r = amdgpu_device_ip_suspend_phase1(adev); 3684 if (r) 3685 return r; 3686 r = amdgpu_device_ip_suspend_phase2(adev); 3687 3688 if (amdgpu_sriov_vf(adev)) 3689 amdgpu_virt_release_full_gpu(adev, false); 3690 3691 return r; 3692 } 3693 3694 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3695 { 3696 int i, r; 3697 3698 static enum amd_ip_block_type ip_order[] = { 3699 AMD_IP_BLOCK_TYPE_COMMON, 3700 AMD_IP_BLOCK_TYPE_GMC, 3701 AMD_IP_BLOCK_TYPE_PSP, 3702 AMD_IP_BLOCK_TYPE_IH, 3703 }; 3704 3705 for (i = 0; i < adev->num_ip_blocks; i++) { 3706 int j; 3707 struct amdgpu_ip_block *block; 3708 3709 block = &adev->ip_blocks[i]; 3710 block->status.hw = false; 3711 3712 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3713 3714 if (block->version->type != ip_order[j] || 3715 !block->status.valid) 3716 continue; 3717 3718 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3719 if (r) { 3720 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3721 block->version->funcs->name); 3722 return r; 3723 } 3724 block->status.hw = true; 3725 } 3726 } 3727 3728 return 0; 3729 } 3730 3731 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3732 { 3733 struct amdgpu_ip_block *block; 3734 int i, r = 0; 3735 3736 static enum amd_ip_block_type ip_order[] = { 3737 AMD_IP_BLOCK_TYPE_SMC, 3738 AMD_IP_BLOCK_TYPE_DCE, 3739 AMD_IP_BLOCK_TYPE_GFX, 3740 AMD_IP_BLOCK_TYPE_SDMA, 3741 AMD_IP_BLOCK_TYPE_MES, 3742 AMD_IP_BLOCK_TYPE_UVD, 3743 AMD_IP_BLOCK_TYPE_VCE, 3744 AMD_IP_BLOCK_TYPE_VCN, 3745 AMD_IP_BLOCK_TYPE_JPEG 3746 }; 3747 3748 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3749 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3750 3751 if (!block) 3752 continue; 3753 3754 if (block->status.valid && !block->status.hw) { 3755 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3756 r = amdgpu_ip_block_resume(block); 3757 } else { 3758 r = block->version->funcs->hw_init(block); 3759 } 3760 3761 if (r) { 3762 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3763 block->version->funcs->name); 3764 break; 3765 } 3766 
block->status.hw = true; 3767 } 3768 } 3769 3770 return r; 3771 } 3772 3773 /** 3774 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3775 * 3776 * @adev: amdgpu_device pointer 3777 * 3778 * First resume function for hardware IPs. The list of all the hardware 3779 * IPs that make up the asic is walked and the resume callbacks are run for 3780 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3781 * after a suspend and updates the software state as necessary. This 3782 * function is also used for restoring the GPU after a GPU reset. 3783 * Returns 0 on success, negative error code on failure. 3784 */ 3785 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3786 { 3787 int i, r; 3788 3789 for (i = 0; i < adev->num_ip_blocks; i++) { 3790 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3791 continue; 3792 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3793 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3794 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3795 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3796 3797 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3798 if (r) 3799 return r; 3800 } 3801 } 3802 3803 return 0; 3804 } 3805 3806 /** 3807 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3808 * 3809 * @adev: amdgpu_device pointer 3810 * 3811 * Second resume function for hardware IPs. The list of all the hardware 3812 * IPs that make up the asic is walked and the resume callbacks are run for 3813 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3814 * functional state after a suspend and updates the software state as 3815 * necessary. This function is also used for restoring the GPU after a GPU 3816 * reset. 3817 * Returns 0 on success, negative error code on failure. 3818 */ 3819 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3820 { 3821 int i, r; 3822 3823 for (i = 0; i < adev->num_ip_blocks; i++) { 3824 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3825 continue; 3826 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3827 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3828 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3829 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3830 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3831 continue; 3832 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3833 if (r) 3834 return r; 3835 } 3836 3837 return 0; 3838 } 3839 3840 /** 3841 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3842 * 3843 * @adev: amdgpu_device pointer 3844 * 3845 * Third resume function for hardware IPs. The list of all the hardware 3846 * IPs that make up the asic is walked and the resume callbacks are run for 3847 * all DCE. resume puts the hardware into a functional state after a suspend 3848 * and updates the software state as necessary. This function is also used 3849 * for restoring the GPU after a GPU reset. 3850 * 3851 * Returns 0 on success, negative error code on failure. 
3852 */ 3853 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3854 { 3855 int i, r; 3856 3857 for (i = 0; i < adev->num_ip_blocks; i++) { 3858 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3859 continue; 3860 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3861 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3862 if (r) 3863 return r; 3864 } 3865 } 3866 3867 return 0; 3868 } 3869 3870 /** 3871 * amdgpu_device_ip_resume - run resume for hardware IPs 3872 * 3873 * @adev: amdgpu_device pointer 3874 * 3875 * Main resume function for hardware IPs. The hardware IPs 3876 * are split into multiple resume functions because they are 3877 * also used in recovering from a GPU reset and some additional 3878 * steps need to be taken between them. In this case (S3/S4) they are 3879 * run sequentially. 3880 * Returns 0 on success, negative error code on failure. 3881 */ 3882 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3883 { 3884 int r; 3885 3886 r = amdgpu_device_ip_resume_phase1(adev); 3887 if (r) 3888 return r; 3889 3890 r = amdgpu_device_fw_loading(adev); 3891 if (r) 3892 return r; 3893 3894 r = amdgpu_device_ip_resume_phase2(adev); 3895 3896 if (adev->mman.buffer_funcs_ring->sched.ready) 3897 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3898 3899 if (r) 3900 return r; 3901 3902 amdgpu_fence_driver_hw_init(adev); 3903 3904 r = amdgpu_device_ip_resume_phase3(adev); 3905 3906 return r; 3907 } 3908 3909 /** 3910 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3911 * 3912 * @adev: amdgpu_device pointer 3913 * 3914 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3915 */ 3916 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3917 { 3918 if (amdgpu_sriov_vf(adev)) { 3919 if (adev->is_atom_fw) { 3920 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3921 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3922 } else { 3923 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3924 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3925 } 3926 3927 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3928 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3929 } 3930 } 3931 3932 /** 3933 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3934 * 3935 * @asic_type: AMD asic type 3936 * 3937 * Check if there is DC (new modesetting infrastructure) support for an asic. 3938 * Returns true if DC has support, false if not. 3939 */ 3940 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3941 { 3942 switch (asic_type) { 3943 #ifdef CONFIG_DRM_AMDGPU_SI 3944 case CHIP_HAINAN: 3945 #endif 3946 case CHIP_TOPAZ: 3947 /* chips with no display hardware */ 3948 return false; 3949 #if defined(CONFIG_DRM_AMD_DC) 3950 case CHIP_TAHITI: 3951 case CHIP_PITCAIRN: 3952 case CHIP_VERDE: 3953 case CHIP_OLAND: 3954 /* 3955 * We have systems in the wild with these ASICs that require 3956 * LVDS and VGA support which is not supported with DC. 3957 * 3958 * Fallback to the non-DC driver here by default so as not to 3959 * cause regressions. 3960 */ 3961 #if defined(CONFIG_DRM_AMD_DC_SI) 3962 return amdgpu_dc > 0; 3963 #else 3964 return false; 3965 #endif 3966 case CHIP_BONAIRE: 3967 case CHIP_KAVERI: 3968 case CHIP_KABINI: 3969 case CHIP_MULLINS: 3970 /* 3971 * We have systems in the wild with these ASICs that require 3972 * VGA support which is not supported with DC.
3973 * 3974 * Fallback to the non-DC driver here by default so as not to 3975 * cause regressions. 3976 */ 3977 return amdgpu_dc > 0; 3978 default: 3979 return amdgpu_dc != 0; 3980 #else 3981 default: 3982 if (amdgpu_dc > 0) 3983 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3984 return false; 3985 #endif 3986 } 3987 } 3988 3989 /** 3990 * amdgpu_device_has_dc_support - check if dc is supported 3991 * 3992 * @adev: amdgpu_device pointer 3993 * 3994 * Returns true for supported, false for not supported 3995 */ 3996 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3997 { 3998 if (adev->enable_virtual_display || 3999 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4000 return false; 4001 4002 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4003 } 4004 4005 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4006 { 4007 struct amdgpu_device *adev = 4008 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4009 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4010 4011 /* It's a bug to not have a hive within this function */ 4012 if (WARN_ON(!hive)) 4013 return; 4014 4015 /* 4016 * Use task barrier to synchronize all xgmi reset works across the 4017 * hive. task_barrier_enter and task_barrier_exit will block 4018 * until all the threads running the xgmi reset works reach 4019 * those points. task_barrier_full will do both blocks. 4020 */ 4021 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4022 4023 task_barrier_enter(&hive->tb); 4024 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4025 4026 if (adev->asic_reset_res) 4027 goto fail; 4028 4029 task_barrier_exit(&hive->tb); 4030 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4031 4032 if (adev->asic_reset_res) 4033 goto fail; 4034 4035 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4036 } else { 4037 4038 task_barrier_full(&hive->tb); 4039 adev->asic_reset_res = amdgpu_asic_reset(adev); 4040 } 4041 4042 fail: 4043 if (adev->asic_reset_res) 4044 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4045 adev->asic_reset_res, adev_to_drm(adev)->unique); 4046 amdgpu_put_xgmi_hive(hive); 4047 } 4048 4049 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4050 { 4051 char *input = amdgpu_lockup_timeout; 4052 char *timeout_setting = NULL; 4053 int index = 0; 4054 long timeout; 4055 int ret = 0; 4056 4057 /* 4058 * By default timeout for non compute jobs is 10000 4059 * and 60000 for compute jobs. 4060 * In SR-IOV or passthrough mode, timeout for compute 4061 * jobs are 60000 by default. 4062 */ 4063 adev->gfx_timeout = msecs_to_jiffies(10000); 4064 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4065 if (amdgpu_sriov_vf(adev)) 4066 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4067 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4068 else 4069 adev->compute_timeout = msecs_to_jiffies(60000); 4070 4071 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4072 while ((timeout_setting = strsep(&input, ",")) && 4073 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4074 ret = kstrtol(timeout_setting, 0, &timeout); 4075 if (ret) 4076 return ret; 4077 4078 if (timeout == 0) { 4079 index++; 4080 continue; 4081 } else if (timeout < 0) { 4082 timeout = MAX_SCHEDULE_TIMEOUT; 4083 dev_warn(adev->dev, "lockup timeout disabled"); 4084 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4085 } else { 4086 timeout = msecs_to_jiffies(timeout); 4087 } 4088 4089 switch (index++) { 4090 case 0: 4091 adev->gfx_timeout = timeout; 4092 break; 4093 case 1: 4094 adev->compute_timeout = timeout; 4095 break; 4096 case 2: 4097 adev->sdma_timeout = timeout; 4098 break; 4099 case 3: 4100 adev->video_timeout = timeout; 4101 break; 4102 default: 4103 break; 4104 } 4105 } 4106 /* 4107 * There is only one value specified and 4108 * it should apply to all non-compute jobs. 4109 */ 4110 if (index == 1) { 4111 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4112 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4113 adev->compute_timeout = adev->gfx_timeout; 4114 } 4115 } 4116 4117 return ret; 4118 } 4119 4120 /** 4121 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4122 * 4123 * @adev: amdgpu_device pointer 4124 * 4125 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4126 */ 4127 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4128 { 4129 struct iommu_domain *domain; 4130 4131 domain = iommu_get_domain_for_dev(adev->dev); 4132 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4133 adev->ram_is_direct_mapped = true; 4134 } 4135 4136 #if defined(CONFIG_HSA_AMD_P2P) 4137 /** 4138 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4139 * 4140 * @adev: amdgpu_device pointer 4141 * 4142 * return if IOMMU remapping bar address 4143 */ 4144 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4145 { 4146 struct iommu_domain *domain; 4147 4148 domain = iommu_get_domain_for_dev(adev->dev); 4149 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4150 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4151 return true; 4152 4153 return false; 4154 } 4155 #endif 4156 4157 static const struct attribute *amdgpu_dev_attributes[] = { 4158 &dev_attr_pcie_replay_count.attr, 4159 NULL 4160 }; 4161 4162 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4163 { 4164 if (amdgpu_mcbp == 1) 4165 adev->gfx.mcbp = true; 4166 else if (amdgpu_mcbp == 0) 4167 adev->gfx.mcbp = false; 4168 4169 if (amdgpu_sriov_vf(adev)) 4170 adev->gfx.mcbp = true; 4171 4172 if (adev->gfx.mcbp) 4173 DRM_INFO("MCBP is enabled\n"); 4174 } 4175 4176 /** 4177 * amdgpu_device_init - initialize the driver 4178 * 4179 * @adev: amdgpu_device pointer 4180 * @flags: driver flags 4181 * 4182 * Initializes the driver info and hw (all asics). 4183 * Returns 0 for success or an error on failure. 4184 * Called at driver startup. 
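 *
 * Roughly, this walks through lock/bookkeeping setup, MMIO mapping, early IP
 * init, an optional reset-on-init or vBIOS post, clock table setup, fence
 * driver and IP block init, and finally late init plus sysfs/interface
 * registration; failures unwind through the release_ras_con/failed labels.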
4185 */ 4186 int amdgpu_device_init(struct amdgpu_device *adev, 4187 uint32_t flags) 4188 { 4189 struct drm_device *ddev = adev_to_drm(adev); 4190 struct pci_dev *pdev = adev->pdev; 4191 int r, i; 4192 bool px = false; 4193 u32 max_MBps; 4194 int tmp; 4195 4196 adev->shutdown = false; 4197 adev->flags = flags; 4198 4199 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4200 adev->asic_type = amdgpu_force_asic_type; 4201 else 4202 adev->asic_type = flags & AMD_ASIC_MASK; 4203 4204 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4205 if (amdgpu_emu_mode == 1) 4206 adev->usec_timeout *= 10; 4207 adev->gmc.gart_size = 512 * 1024 * 1024; 4208 adev->accel_working = false; 4209 adev->num_rings = 0; 4210 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4211 adev->mman.buffer_funcs = NULL; 4212 adev->mman.buffer_funcs_ring = NULL; 4213 adev->vm_manager.vm_pte_funcs = NULL; 4214 adev->vm_manager.vm_pte_num_scheds = 0; 4215 adev->gmc.gmc_funcs = NULL; 4216 adev->harvest_ip_mask = 0x0; 4217 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4218 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4219 4220 adev->smc_rreg = &amdgpu_invalid_rreg; 4221 adev->smc_wreg = &amdgpu_invalid_wreg; 4222 adev->pcie_rreg = &amdgpu_invalid_rreg; 4223 adev->pcie_wreg = &amdgpu_invalid_wreg; 4224 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4225 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4226 adev->pciep_rreg = &amdgpu_invalid_rreg; 4227 adev->pciep_wreg = &amdgpu_invalid_wreg; 4228 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4229 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4230 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4231 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4232 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4233 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4234 adev->didt_rreg = &amdgpu_invalid_rreg; 4235 adev->didt_wreg = &amdgpu_invalid_wreg; 4236 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4237 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4238 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4239 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4240 4241 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4242 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4243 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4244 4245 /* mutex initialization are all done here so we 4246 * can recall function without having locking issues 4247 */ 4248 mutex_init(&adev->firmware.mutex); 4249 mutex_init(&adev->pm.mutex); 4250 mutex_init(&adev->gfx.gpu_clock_mutex); 4251 mutex_init(&adev->srbm_mutex); 4252 mutex_init(&adev->gfx.pipe_reserve_mutex); 4253 mutex_init(&adev->gfx.gfx_off_mutex); 4254 mutex_init(&adev->gfx.partition_mutex); 4255 mutex_init(&adev->grbm_idx_mutex); 4256 mutex_init(&adev->mn_lock); 4257 mutex_init(&adev->virt.vf_errors.lock); 4258 mutex_init(&adev->virt.rlcg_reg_lock); 4259 hash_init(adev->mn_hash); 4260 mutex_init(&adev->psp.mutex); 4261 mutex_init(&adev->notifier_lock); 4262 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4263 mutex_init(&adev->benchmark_mutex); 4264 mutex_init(&adev->gfx.reset_sem_mutex); 4265 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4266 mutex_init(&adev->enforce_isolation_mutex); 4267 mutex_init(&adev->gfx.kfd_sch_mutex); 4268 4269 amdgpu_device_init_apu_flags(adev); 4270 4271 r = amdgpu_device_check_arguments(adev); 4272 if (r) 4273 return r; 4274 4275 
spin_lock_init(&adev->mmio_idx_lock); 4276 spin_lock_init(&adev->smc_idx_lock); 4277 spin_lock_init(&adev->pcie_idx_lock); 4278 spin_lock_init(&adev->uvd_ctx_idx_lock); 4279 spin_lock_init(&adev->didt_idx_lock); 4280 spin_lock_init(&adev->gc_cac_idx_lock); 4281 spin_lock_init(&adev->se_cac_idx_lock); 4282 spin_lock_init(&adev->audio_endpt_idx_lock); 4283 spin_lock_init(&adev->mm_stats.lock); 4284 spin_lock_init(&adev->wb.lock); 4285 4286 INIT_LIST_HEAD(&adev->reset_list); 4287 4288 INIT_LIST_HEAD(&adev->ras_list); 4289 4290 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4291 4292 INIT_DELAYED_WORK(&adev->delayed_init_work, 4293 amdgpu_device_delayed_init_work_handler); 4294 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4295 amdgpu_device_delay_enable_gfx_off); 4296 /* 4297 * Initialize the enforce_isolation work structures for each XCP 4298 * partition. This work handler is responsible for enforcing shader 4299 * isolation on AMD GPUs. It counts the number of emitted fences for 4300 * each GFX and compute ring. If there are any fences, it schedules 4301 * the `enforce_isolation_work` to be run after a delay. If there are 4302 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4303 * runqueue. 4304 */ 4305 for (i = 0; i < MAX_XCP; i++) { 4306 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4307 amdgpu_gfx_enforce_isolation_handler); 4308 adev->gfx.enforce_isolation[i].adev = adev; 4309 adev->gfx.enforce_isolation[i].xcp_id = i; 4310 } 4311 4312 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4313 4314 adev->gfx.gfx_off_req_count = 1; 4315 adev->gfx.gfx_off_residency = 0; 4316 adev->gfx.gfx_off_entrycount = 0; 4317 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4318 4319 atomic_set(&adev->throttling_logging_enabled, 1); 4320 /* 4321 * If throttling continues, logging will be performed every minute 4322 * to avoid log flooding. "-1" is subtracted since the thermal 4323 * throttling interrupt comes every second. Thus, the total logging 4324 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4325 * for throttling interrupt) = 60 seconds. 4326 */ 4327 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4328 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1); 4329 4330 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4331 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE); 4332 4333 /* Registers mapping */ 4334 /* TODO: block userspace mapping of io register */ 4335 if (adev->asic_type >= CHIP_BONAIRE) { 4336 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4337 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4338 } else { 4339 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4340 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4341 } 4342 4343 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4344 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4345 4346 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4347 if (!adev->rmmio) 4348 return -ENOMEM; 4349 4350 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4351 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4352 4353 /* 4354 * Reset domain needs to be present early, before XGMI hive discovered 4355 * (if any) and initialized to use reset sem and in_gpu reset flag 4356 * early on during init and before calling to RREG32. 
4357 */ 4358 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4359 if (!adev->reset_domain) 4360 return -ENOMEM; 4361 4362 /* detect hw virtualization here */ 4363 amdgpu_detect_virtualization(adev); 4364 4365 amdgpu_device_get_pcie_info(adev); 4366 4367 r = amdgpu_device_get_job_timeout_settings(adev); 4368 if (r) { 4369 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4370 return r; 4371 } 4372 4373 amdgpu_device_set_mcbp(adev); 4374 4375 /* 4376 * By default, use default mode where all blocks are expected to be 4377 * initialized. At present a 'swinit' of blocks is required to be 4378 * completed before the need for a different level is detected. 4379 */ 4380 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4381 /* early init functions */ 4382 r = amdgpu_device_ip_early_init(adev); 4383 if (r) 4384 return r; 4385 4386 /* Get rid of things like offb */ 4387 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4388 if (r) 4389 return r; 4390 4391 /* Enable TMZ based on IP_VERSION */ 4392 amdgpu_gmc_tmz_set(adev); 4393 4394 if (amdgpu_sriov_vf(adev) && 4395 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4396 /* VF MMIO access (except mailbox range) from CPU 4397 * will be blocked during sriov runtime 4398 */ 4399 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4400 4401 amdgpu_gmc_noretry_set(adev); 4402 /* Need to get xgmi info early to decide the reset behavior*/ 4403 if (adev->gmc.xgmi.supported) { 4404 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4405 if (r) 4406 return r; 4407 } 4408 4409 /* enable PCIE atomic ops */ 4410 if (amdgpu_sriov_vf(adev)) { 4411 if (adev->virt.fw_reserve.p_pf2vf) 4412 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4413 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4414 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4415 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4416 * internal path natively support atomics, set have_atomics_support to true. 4417 */ 4418 } else if ((adev->flags & AMD_IS_APU) && 4419 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4420 IP_VERSION(9, 0, 0))) { 4421 adev->have_atomics_support = true; 4422 } else { 4423 adev->have_atomics_support = 4424 !pci_enable_atomic_ops_to_root(adev->pdev, 4425 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4426 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4427 } 4428 4429 if (!adev->have_atomics_support) 4430 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4431 4432 /* doorbell bar mapping and doorbell index init*/ 4433 amdgpu_doorbell_init(adev); 4434 4435 if (amdgpu_emu_mode == 1) { 4436 /* post the asic on emulation mode */ 4437 emu_soc_asic_init(adev); 4438 goto fence_driver_init; 4439 } 4440 4441 amdgpu_reset_init(adev); 4442 4443 /* detect if we are with an SRIOV vbios */ 4444 if (adev->bios) 4445 amdgpu_device_detect_sriov_bios(adev); 4446 4447 /* check if we need to reset the asic 4448 * E.g., driver was not cleanly unloaded previously, etc. 
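 * Three cases are handled below: XGMI hives defer to a hive-wide reset and
 * only a minimal set of IP blocks is brought up first; MP1 13.0.10 parts
 * without display hardware use a PSP reset; everything else performs the
 * default ASIC reset regardless of the reset_method module parameter.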
4449 */ 4450 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4451 if (adev->gmc.xgmi.num_physical_nodes) { 4452 dev_info(adev->dev, "Pending hive reset.\n"); 4453 amdgpu_set_init_level(adev, 4454 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4455 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4456 !amdgpu_device_has_display_hardware(adev)) { 4457 r = psp_gpu_reset(adev); 4458 } else { 4459 tmp = amdgpu_reset_method; 4460 /* It should do a default reset when loading or reloading the driver, 4461 * regardless of the module parameter reset_method. 4462 */ 4463 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4464 r = amdgpu_asic_reset(adev); 4465 amdgpu_reset_method = tmp; 4466 } 4467 4468 if (r) { 4469 dev_err(adev->dev, "asic reset on init failed\n"); 4470 goto failed; 4471 } 4472 } 4473 4474 /* Post card if necessary */ 4475 if (amdgpu_device_need_post(adev)) { 4476 if (!adev->bios) { 4477 dev_err(adev->dev, "no vBIOS found\n"); 4478 r = -EINVAL; 4479 goto failed; 4480 } 4481 DRM_INFO("GPU posting now...\n"); 4482 r = amdgpu_device_asic_init(adev); 4483 if (r) { 4484 dev_err(adev->dev, "gpu post error!\n"); 4485 goto failed; 4486 } 4487 } 4488 4489 if (adev->bios) { 4490 if (adev->is_atom_fw) { 4491 /* Initialize clocks */ 4492 r = amdgpu_atomfirmware_get_clock_info(adev); 4493 if (r) { 4494 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4495 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4496 goto failed; 4497 } 4498 } else { 4499 /* Initialize clocks */ 4500 r = amdgpu_atombios_get_clock_info(adev); 4501 if (r) { 4502 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4503 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4504 goto failed; 4505 } 4506 /* init i2c buses */ 4507 amdgpu_i2c_init(adev); 4508 } 4509 } 4510 4511 fence_driver_init: 4512 /* Fence driver */ 4513 r = amdgpu_fence_driver_sw_init(adev); 4514 if (r) { 4515 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4516 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4517 goto failed; 4518 } 4519 4520 /* init the mode config */ 4521 drm_mode_config_init(adev_to_drm(adev)); 4522 4523 r = amdgpu_device_ip_init(adev); 4524 if (r) { 4525 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4526 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4527 goto release_ras_con; 4528 } 4529 4530 amdgpu_fence_driver_hw_init(adev); 4531 4532 dev_info(adev->dev, 4533 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4534 adev->gfx.config.max_shader_engines, 4535 adev->gfx.config.max_sh_per_se, 4536 adev->gfx.config.max_cu_per_sh, 4537 adev->gfx.cu_info.number); 4538 4539 adev->accel_working = true; 4540 4541 amdgpu_vm_check_compute_bug(adev); 4542 4543 /* Initialize the buffer migration limit. */ 4544 if (amdgpu_moverate >= 0) 4545 max_MBps = amdgpu_moverate; 4546 else 4547 max_MBps = 8; /* Allow 8 MB/s. */ 4548 /* Get a log2 for easy divisions. */ 4549 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4550 4551 /* 4552 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4553 * Otherwise the mgpu fan boost feature will be skipped due to the 4554 * gpu instance is counted less. 4555 */ 4556 amdgpu_register_gpu_instance(adev); 4557 4558 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4559 * explicit gating rather than handling it automatically. 
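 *
 * For the minimal XGMI init level, late init (and the RAS resume and delayed
 * init work that follow it) is skipped here and is expected to be completed
 * as part of the reset-on-init handling of the hive further below.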
4560 */ 4561 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4562 r = amdgpu_device_ip_late_init(adev); 4563 if (r) { 4564 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4565 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4566 goto release_ras_con; 4567 } 4568 /* must succeed. */ 4569 amdgpu_ras_resume(adev); 4570 queue_delayed_work(system_wq, &adev->delayed_init_work, 4571 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4572 } 4573 4574 if (amdgpu_sriov_vf(adev)) { 4575 amdgpu_virt_release_full_gpu(adev, true); 4576 flush_delayed_work(&adev->delayed_init_work); 4577 } 4578 4579 /* 4580 * Register these sysfs interfaces after `late_init`, as some of the 4581 * operations performed in `late_init` might affect how the sysfs 4582 * interfaces are created. 4583 */ 4584 r = amdgpu_atombios_sysfs_init(adev); 4585 if (r) 4586 drm_err(&adev->ddev, 4587 "registering atombios sysfs failed (%d).\n", r); 4588 4589 r = amdgpu_pm_sysfs_init(adev); 4590 if (r) 4591 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4592 4593 r = amdgpu_ucode_sysfs_init(adev); 4594 if (r) { 4595 adev->ucode_sysfs_en = false; 4596 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4597 } else 4598 adev->ucode_sysfs_en = true; 4599 4600 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4601 if (r) 4602 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4603 4604 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4605 if (r) 4606 dev_err(adev->dev, 4607 "Could not create amdgpu board attributes\n"); 4608 4609 amdgpu_fru_sysfs_init(adev); 4610 amdgpu_reg_state_sysfs_init(adev); 4611 amdgpu_xcp_cfg_sysfs_init(adev); 4612 4613 if (IS_ENABLED(CONFIG_PERF_EVENTS)) { 4614 r = amdgpu_pmu_init(adev); 4615 if (r) 4616 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4617 } 4618 /* Keep the cached PCI config space at hand for restore after a sudden PCI error */ 4619 if (amdgpu_device_cache_pci_state(adev->pdev)) 4620 pci_restore_state(pdev); 4621 4622 /* if we have more than one VGA card, then disable the amdgpu VGA resources */ 4623 /* this will fail for cards that aren't VGA class devices, just 4624 * ignore it 4625 */ 4626 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4627 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4628 4629 px = amdgpu_device_supports_px(ddev); 4630 4631 if (px || (!dev_is_removable(&adev->pdev->dev) && 4632 apple_gmux_detect(NULL, NULL))) 4633 vga_switcheroo_register_client(adev->pdev, 4634 &amdgpu_switcheroo_ops, px); 4635 4636 if (px) 4637 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4638 4639 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4640 amdgpu_xgmi_reset_on_init(adev); 4641 4642 amdgpu_device_check_iommu_direct_map(adev); 4643 4644 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4645 r = register_pm_notifier(&adev->pm_nb); 4646 if (r) 4647 goto failed; 4648 4649 return 0; 4650 4651 release_ras_con: 4652 if (amdgpu_sriov_vf(adev)) 4653 amdgpu_virt_release_full_gpu(adev, true); 4654 4655 /* failed in exclusive mode due to timeout */ 4656 if (amdgpu_sriov_vf(adev) && 4657 !amdgpu_sriov_runtime(adev) && 4658 amdgpu_virt_mmio_blocked(adev) && 4659 !amdgpu_virt_wait_reset(adev)) { 4660 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4661 /* Don't send request since VF is inactive.
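		 * Instead, drop the runtime capability and the virt ops so no
		 * further requests are sent to the host, and return -EAGAIN to
		 * signal that initialization may be retried.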
*/ 4662 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4663 adev->virt.ops = NULL; 4664 r = -EAGAIN; 4665 } 4666 amdgpu_release_ras_context(adev); 4667 4668 failed: 4669 amdgpu_vf_error_trans_all(adev); 4670 4671 return r; 4672 } 4673 4674 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4675 { 4676 4677 /* Clear all CPU mappings pointing to this device */ 4678 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4679 4680 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4681 amdgpu_doorbell_fini(adev); 4682 4683 iounmap(adev->rmmio); 4684 adev->rmmio = NULL; 4685 if (adev->mman.aper_base_kaddr) 4686 iounmap(adev->mman.aper_base_kaddr); 4687 adev->mman.aper_base_kaddr = NULL; 4688 4689 /* Memory manager related */ 4690 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4691 arch_phys_wc_del(adev->gmc.vram_mtrr); 4692 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4693 } 4694 } 4695 4696 /** 4697 * amdgpu_device_fini_hw - tear down the driver 4698 * 4699 * @adev: amdgpu_device pointer 4700 * 4701 * Tear down the driver info (all asics). 4702 * Called at driver shutdown. 4703 */ 4704 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4705 { 4706 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4707 flush_delayed_work(&adev->delayed_init_work); 4708 4709 if (adev->mman.initialized) 4710 drain_workqueue(adev->mman.bdev.wq); 4711 adev->shutdown = true; 4712 4713 unregister_pm_notifier(&adev->pm_nb); 4714 4715 /* make sure IB test finished before entering exclusive mode 4716 * to avoid preemption on IB test 4717 */ 4718 if (amdgpu_sriov_vf(adev)) { 4719 amdgpu_virt_request_full_gpu(adev, false); 4720 amdgpu_virt_fini_data_exchange(adev); 4721 } 4722 4723 /* disable all interrupts */ 4724 amdgpu_irq_disable_all(adev); 4725 if (adev->mode_info.mode_config_initialized) { 4726 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4727 drm_helper_force_disable_all(adev_to_drm(adev)); 4728 else 4729 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4730 } 4731 amdgpu_fence_driver_hw_fini(adev); 4732 4733 if (adev->pm.sysfs_initialized) 4734 amdgpu_pm_sysfs_fini(adev); 4735 if (adev->ucode_sysfs_en) 4736 amdgpu_ucode_sysfs_fini(adev); 4737 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4738 amdgpu_fru_sysfs_fini(adev); 4739 4740 amdgpu_reg_state_sysfs_fini(adev); 4741 amdgpu_xcp_cfg_sysfs_fini(adev); 4742 4743 /* disable ras feature must before hw fini */ 4744 amdgpu_ras_pre_fini(adev); 4745 4746 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4747 4748 amdgpu_device_ip_fini_early(adev); 4749 4750 amdgpu_irq_fini_hw(adev); 4751 4752 if (adev->mman.initialized) 4753 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4754 4755 amdgpu_gart_dummy_page_fini(adev); 4756 4757 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4758 amdgpu_device_unmap_mmio(adev); 4759 4760 } 4761 4762 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4763 { 4764 int idx; 4765 bool px; 4766 4767 amdgpu_device_ip_fini(adev); 4768 amdgpu_fence_driver_sw_fini(adev); 4769 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4770 adev->accel_working = false; 4771 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4772 4773 amdgpu_reset_fini(adev); 4774 4775 /* free i2c buses */ 4776 amdgpu_i2c_fini(adev); 4777 4778 if (adev->bios) { 4779 if (amdgpu_emu_mode != 1) 4780 amdgpu_atombios_fini(adev); 4781 amdgpu_bios_release(adev); 4782 } 4783 4784 kfree(adev->fru_info); 4785 adev->fru_info = NULL; 4786 4787 px = 
amdgpu_device_supports_px(adev_to_drm(adev)); 4788 4789 if (px || (!dev_is_removable(&adev->pdev->dev) && 4790 apple_gmux_detect(NULL, NULL))) 4791 vga_switcheroo_unregister_client(adev->pdev); 4792 4793 if (px) 4794 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4795 4796 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4797 vga_client_unregister(adev->pdev); 4798 4799 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4800 4801 iounmap(adev->rmmio); 4802 adev->rmmio = NULL; 4803 amdgpu_doorbell_fini(adev); 4804 drm_dev_exit(idx); 4805 } 4806 4807 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4808 amdgpu_pmu_fini(adev); 4809 if (adev->mman.discovery_bin) 4810 amdgpu_discovery_fini(adev); 4811 4812 amdgpu_reset_put_reset_domain(adev->reset_domain); 4813 adev->reset_domain = NULL; 4814 4815 kfree(adev->pci_state); 4816 4817 } 4818 4819 /** 4820 * amdgpu_device_evict_resources - evict device resources 4821 * @adev: amdgpu device object 4822 * 4823 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4824 * of the vram memory type. Mainly used for evicting device resources 4825 * at suspend time. 4826 * 4827 */ 4828 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4829 { 4830 int ret; 4831 4832 /* No need to evict vram on APUs unless going to S4 */ 4833 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4834 return 0; 4835 4836 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4837 if (ret) 4838 DRM_WARN("evicting device resources failed\n"); 4839 return ret; 4840 } 4841 4842 /* 4843 * Suspend & resume. 4844 */ 4845 /** 4846 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4847 * @nb: notifier block 4848 * @mode: suspend mode 4849 * @data: data 4850 * 4851 * This function is called when the system is about to suspend or hibernate. 4852 * It is used to evict resources from the device before the system goes to 4853 * sleep while there is still access to swap. 4854 */ 4855 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4856 void *data) 4857 { 4858 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4859 int r; 4860 4861 switch (mode) { 4862 case PM_HIBERNATION_PREPARE: 4863 adev->in_s4 = true; 4864 fallthrough; 4865 case PM_SUSPEND_PREPARE: 4866 r = amdgpu_device_evict_resources(adev); 4867 /* 4868 * This is considered non-fatal at this time because 4869 * amdgpu_device_prepare() will also fatally evict resources. 4870 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4871 */ 4872 if (r) 4873 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4874 break; 4875 } 4876 4877 return NOTIFY_DONE; 4878 } 4879 4880 /** 4881 * amdgpu_device_prepare - prepare for device suspend 4882 * 4883 * @dev: drm dev pointer 4884 * 4885 * Prepare to put the hw in the suspend state (all asics). 4886 * Returns 0 for success or an error on failure. 4887 * Called at driver suspend. 
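 *
 * This evicts the bulk of VRAM buffer objects while swap is still available
 * and then gives each IP block with a prepare_suspend callback a chance to
 * get ready; on failure the in_s0ix/in_s3/in_s4 flags are cleared again and
 * the error is returned.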
4888 */ 4889 int amdgpu_device_prepare(struct drm_device *dev) 4890 { 4891 struct amdgpu_device *adev = drm_to_adev(dev); 4892 int i, r; 4893 4894 amdgpu_choose_low_power_state(adev); 4895 4896 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4897 return 0; 4898 4899 /* Evict the majority of BOs before starting suspend sequence */ 4900 r = amdgpu_device_evict_resources(adev); 4901 if (r) 4902 goto unprepare; 4903 4904 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4905 4906 for (i = 0; i < adev->num_ip_blocks; i++) { 4907 if (!adev->ip_blocks[i].status.valid) 4908 continue; 4909 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4910 continue; 4911 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4912 if (r) 4913 goto unprepare; 4914 } 4915 4916 return 0; 4917 4918 unprepare: 4919 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 4920 4921 return r; 4922 } 4923 4924 /** 4925 * amdgpu_device_suspend - initiate device suspend 4926 * 4927 * @dev: drm dev pointer 4928 * @notify_clients: notify in-kernel DRM clients 4929 * 4930 * Puts the hw in the suspend state (all asics). 4931 * Returns 0 for success or an error on failure. 4932 * Called at driver suspend. 4933 */ 4934 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4935 { 4936 struct amdgpu_device *adev = drm_to_adev(dev); 4937 int r = 0; 4938 4939 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4940 return 0; 4941 4942 adev->in_suspend = true; 4943 4944 if (amdgpu_sriov_vf(adev)) { 4945 amdgpu_virt_fini_data_exchange(adev); 4946 r = amdgpu_virt_request_full_gpu(adev, false); 4947 if (r) 4948 return r; 4949 } 4950 4951 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4952 DRM_WARN("smart shift update failed\n"); 4953 4954 if (notify_clients) 4955 drm_client_dev_suspend(adev_to_drm(adev), false); 4956 4957 cancel_delayed_work_sync(&adev->delayed_init_work); 4958 4959 amdgpu_ras_suspend(adev); 4960 4961 amdgpu_device_ip_suspend_phase1(adev); 4962 4963 if (!adev->in_s0ix) 4964 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4965 4966 r = amdgpu_device_evict_resources(adev); 4967 if (r) 4968 return r; 4969 4970 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4971 4972 amdgpu_fence_driver_hw_fini(adev); 4973 4974 amdgpu_device_ip_suspend_phase2(adev); 4975 4976 if (amdgpu_sriov_vf(adev)) 4977 amdgpu_virt_release_full_gpu(adev, false); 4978 4979 r = amdgpu_dpm_notify_rlc_state(adev, false); 4980 if (r) 4981 return r; 4982 4983 return 0; 4984 } 4985 4986 /** 4987 * amdgpu_device_resume - initiate device resume 4988 * 4989 * @dev: drm dev pointer 4990 * @notify_clients: notify in-kernel DRM clients 4991 * 4992 * Bring the hw back to operating state (all asics). 4993 * Returns 0 for success or an error on failure. 4994 * Called at driver resume. 
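 *
 * This re-posts the card if needed, resumes the IP blocks and KFD, runs late
 * init and queues the delayed init work; in-kernel DRM clients are only
 * notified once that work (and the IB tests it performs) has been flushed.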
4995 */ 4996 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 4997 { 4998 struct amdgpu_device *adev = drm_to_adev(dev); 4999 int r = 0; 5000 5001 if (amdgpu_sriov_vf(adev)) { 5002 r = amdgpu_virt_request_full_gpu(adev, true); 5003 if (r) 5004 return r; 5005 } 5006 5007 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5008 return 0; 5009 5010 if (adev->in_s0ix) 5011 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5012 5013 /* post card */ 5014 if (amdgpu_device_need_post(adev)) { 5015 r = amdgpu_device_asic_init(adev); 5016 if (r) 5017 dev_err(adev->dev, "amdgpu asic init failed\n"); 5018 } 5019 5020 r = amdgpu_device_ip_resume(adev); 5021 5022 if (r) { 5023 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5024 goto exit; 5025 } 5026 5027 if (!adev->in_s0ix) { 5028 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5029 if (r) 5030 goto exit; 5031 } 5032 5033 r = amdgpu_device_ip_late_init(adev); 5034 if (r) 5035 goto exit; 5036 5037 queue_delayed_work(system_wq, &adev->delayed_init_work, 5038 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5039 exit: 5040 if (amdgpu_sriov_vf(adev)) { 5041 amdgpu_virt_init_data_exchange(adev); 5042 amdgpu_virt_release_full_gpu(adev, true); 5043 } 5044 5045 if (r) 5046 return r; 5047 5048 /* Make sure IB tests flushed */ 5049 flush_delayed_work(&adev->delayed_init_work); 5050 5051 if (notify_clients) 5052 drm_client_dev_resume(adev_to_drm(adev), false); 5053 5054 amdgpu_ras_resume(adev); 5055 5056 if (adev->mode_info.num_crtc) { 5057 /* 5058 * Most of the connector probing functions try to acquire runtime pm 5059 * refs to ensure that the GPU is powered on when connector polling is 5060 * performed. Since we're calling this from a runtime PM callback, 5061 * trying to acquire rpm refs will cause us to deadlock. 5062 * 5063 * Since we're guaranteed to be holding the rpm lock, it's safe to 5064 * temporarily disable the rpm helpers so this doesn't deadlock us. 5065 */ 5066 #ifdef CONFIG_PM 5067 dev->dev->power.disable_depth++; 5068 #endif 5069 if (!adev->dc_enabled) 5070 drm_helper_hpd_irq_event(dev); 5071 else 5072 drm_kms_helper_hotplug_event(dev); 5073 #ifdef CONFIG_PM 5074 dev->dev->power.disable_depth--; 5075 #endif 5076 } 5077 adev->in_suspend = false; 5078 5079 if (adev->enable_mes) 5080 amdgpu_mes_self_test(adev); 5081 5082 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5083 DRM_WARN("smart shift update failed\n"); 5084 5085 return 0; 5086 } 5087 5088 /** 5089 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5090 * 5091 * @adev: amdgpu_device pointer 5092 * 5093 * The list of all the hardware IPs that make up the asic is walked and 5094 * the check_soft_reset callbacks are run. check_soft_reset determines 5095 * if the asic is still hung or not. 5096 * Returns true if any of the IPs are still in a hung state, false if not. 
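 *
 * Under SR-IOV, or when the ASIC reports that it needs a full reset, this
 * returns true immediately; otherwise each block's hang status is cached in
 * status.hang for the pre/soft/post soft-reset helpers that follow.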
5097 */ 5098 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5099 { 5100 int i; 5101 bool asic_hang = false; 5102 5103 if (amdgpu_sriov_vf(adev)) 5104 return true; 5105 5106 if (amdgpu_asic_need_full_reset(adev)) 5107 return true; 5108 5109 for (i = 0; i < adev->num_ip_blocks; i++) { 5110 if (!adev->ip_blocks[i].status.valid) 5111 continue; 5112 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5113 adev->ip_blocks[i].status.hang = 5114 adev->ip_blocks[i].version->funcs->check_soft_reset( 5115 &adev->ip_blocks[i]); 5116 if (adev->ip_blocks[i].status.hang) { 5117 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5118 asic_hang = true; 5119 } 5120 } 5121 return asic_hang; 5122 } 5123 5124 /** 5125 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5126 * 5127 * @adev: amdgpu_device pointer 5128 * 5129 * The list of all the hardware IPs that make up the asic is walked and the 5130 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5131 * handles any IP specific hardware or software state changes that are 5132 * necessary for a soft reset to succeed. 5133 * Returns 0 on success, negative error code on failure. 5134 */ 5135 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5136 { 5137 int i, r = 0; 5138 5139 for (i = 0; i < adev->num_ip_blocks; i++) { 5140 if (!adev->ip_blocks[i].status.valid) 5141 continue; 5142 if (adev->ip_blocks[i].status.hang && 5143 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5144 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5145 if (r) 5146 return r; 5147 } 5148 } 5149 5150 return 0; 5151 } 5152 5153 /** 5154 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5155 * 5156 * @adev: amdgpu_device pointer 5157 * 5158 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5159 * reset is necessary to recover. 5160 * Returns true if a full asic reset is required, false if not. 5161 */ 5162 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5163 { 5164 int i; 5165 5166 if (amdgpu_asic_need_full_reset(adev)) 5167 return true; 5168 5169 for (i = 0; i < adev->num_ip_blocks; i++) { 5170 if (!adev->ip_blocks[i].status.valid) 5171 continue; 5172 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5173 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5174 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5175 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5176 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5177 if (adev->ip_blocks[i].status.hang) { 5178 dev_info(adev->dev, "Some block need full reset!\n"); 5179 return true; 5180 } 5181 } 5182 } 5183 return false; 5184 } 5185 5186 /** 5187 * amdgpu_device_ip_soft_reset - do a soft reset 5188 * 5189 * @adev: amdgpu_device pointer 5190 * 5191 * The list of all the hardware IPs that make up the asic is walked and the 5192 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5193 * IP specific hardware or software state changes that are necessary to soft 5194 * reset the IP. 5195 * Returns 0 on success, negative error code on failure. 
5196 */ 5197 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5198 { 5199 int i, r = 0; 5200 5201 for (i = 0; i < adev->num_ip_blocks; i++) { 5202 if (!adev->ip_blocks[i].status.valid) 5203 continue; 5204 if (adev->ip_blocks[i].status.hang && 5205 adev->ip_blocks[i].version->funcs->soft_reset) { 5206 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5207 if (r) 5208 return r; 5209 } 5210 } 5211 5212 return 0; 5213 } 5214 5215 /** 5216 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5217 * 5218 * @adev: amdgpu_device pointer 5219 * 5220 * The list of all the hardware IPs that make up the asic is walked and the 5221 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5222 * handles any IP specific hardware or software state changes that are 5223 * necessary after the IP has been soft reset. 5224 * Returns 0 on success, negative error code on failure. 5225 */ 5226 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5227 { 5228 int i, r = 0; 5229 5230 for (i = 0; i < adev->num_ip_blocks; i++) { 5231 if (!adev->ip_blocks[i].status.valid) 5232 continue; 5233 if (adev->ip_blocks[i].status.hang && 5234 adev->ip_blocks[i].version->funcs->post_soft_reset) 5235 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5236 if (r) 5237 return r; 5238 } 5239 5240 return 0; 5241 } 5242 5243 /** 5244 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5245 * 5246 * @adev: amdgpu_device pointer 5247 * @reset_context: amdgpu reset context pointer 5248 * 5249 * do VF FLR and reinitialize Asic 5250 * return 0 means succeeded otherwise failed 5251 */ 5252 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5253 struct amdgpu_reset_context *reset_context) 5254 { 5255 int r; 5256 struct amdgpu_hive_info *hive = NULL; 5257 5258 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5259 if (!amdgpu_ras_get_fed_status(adev)) 5260 amdgpu_virt_ready_to_reset(adev); 5261 amdgpu_virt_wait_reset(adev); 5262 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5263 r = amdgpu_virt_request_full_gpu(adev, true); 5264 } else { 5265 r = amdgpu_virt_reset_gpu(adev); 5266 } 5267 if (r) 5268 return r; 5269 5270 amdgpu_ras_clear_err_state(adev); 5271 amdgpu_irq_gpu_reset_resume_helper(adev); 5272 5273 /* some sw clean up VF needs to do before recover */ 5274 amdgpu_virt_post_reset(adev); 5275 5276 /* Resume IP prior to SMC */ 5277 r = amdgpu_device_ip_reinit_early_sriov(adev); 5278 if (r) 5279 return r; 5280 5281 amdgpu_virt_init_data_exchange(adev); 5282 5283 r = amdgpu_device_fw_loading(adev); 5284 if (r) 5285 return r; 5286 5287 /* now we are okay to resume SMC/CP/SDMA */ 5288 r = amdgpu_device_ip_reinit_late_sriov(adev); 5289 if (r) 5290 return r; 5291 5292 hive = amdgpu_get_xgmi_hive(adev); 5293 /* Update PSP FW topology after reset */ 5294 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5295 r = amdgpu_xgmi_update_topology(hive, adev); 5296 if (hive) 5297 amdgpu_put_xgmi_hive(hive); 5298 if (r) 5299 return r; 5300 5301 r = amdgpu_ib_ring_tests(adev); 5302 if (r) 5303 return r; 5304 5305 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5306 amdgpu_inc_vram_lost(adev); 5307 5308 /* need to be called during full access so we can't do it later like 5309 * bare-metal does. 
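	 * "Full access" here is the window in which the host still grants this
	 * VF exclusive access to the GPU; it is given back right below via
	 * amdgpu_virt_release_full_gpu().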
5310 */ 5311 amdgpu_amdkfd_post_reset(adev); 5312 amdgpu_virt_release_full_gpu(adev, true); 5313 5314 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5315 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5316 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5317 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5318 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5319 amdgpu_ras_resume(adev); 5320 5321 amdgpu_virt_ras_telemetry_post_reset(adev); 5322 5323 return 0; 5324 } 5325 5326 /** 5327 * amdgpu_device_has_job_running - check if there is any unfinished job 5328 * 5329 * @adev: amdgpu_device pointer 5330 * 5331 * check if there is any job running on the device when guest driver receives 5332 * FLR notification from host driver. If there are still jobs running, then 5333 * the guest driver will not respond the FLR reset. Instead, let the job hit 5334 * the timeout and guest driver then issue the reset request. 5335 */ 5336 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5337 { 5338 int i; 5339 5340 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5341 struct amdgpu_ring *ring = adev->rings[i]; 5342 5343 if (!amdgpu_ring_sched_ready(ring)) 5344 continue; 5345 5346 if (amdgpu_fence_count_emitted(ring)) 5347 return true; 5348 } 5349 return false; 5350 } 5351 5352 /** 5353 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5354 * 5355 * @adev: amdgpu_device pointer 5356 * 5357 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5358 * a hung GPU. 5359 */ 5360 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5361 { 5362 5363 if (amdgpu_gpu_recovery == 0) 5364 goto disabled; 5365 5366 /* Skip soft reset check in fatal error mode */ 5367 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5368 return true; 5369 5370 if (amdgpu_sriov_vf(adev)) 5371 return true; 5372 5373 if (amdgpu_gpu_recovery == -1) { 5374 switch (adev->asic_type) { 5375 #ifdef CONFIG_DRM_AMDGPU_SI 5376 case CHIP_VERDE: 5377 case CHIP_TAHITI: 5378 case CHIP_PITCAIRN: 5379 case CHIP_OLAND: 5380 case CHIP_HAINAN: 5381 #endif 5382 #ifdef CONFIG_DRM_AMDGPU_CIK 5383 case CHIP_KAVERI: 5384 case CHIP_KABINI: 5385 case CHIP_MULLINS: 5386 #endif 5387 case CHIP_CARRIZO: 5388 case CHIP_STONEY: 5389 case CHIP_CYAN_SKILLFISH: 5390 goto disabled; 5391 default: 5392 break; 5393 } 5394 } 5395 5396 return true; 5397 5398 disabled: 5399 dev_info(adev->dev, "GPU recovery disabled.\n"); 5400 return false; 5401 } 5402 5403 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5404 { 5405 u32 i; 5406 int ret = 0; 5407 5408 if (adev->bios) 5409 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5410 5411 dev_info(adev->dev, "GPU mode1 reset\n"); 5412 5413 /* Cache the state before bus master disable. The saved config space 5414 * values are used in other cases like restore after mode-2 reset. 
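	 * The copy cached here is also what amdgpu_device_load_pci_state()
	 * restores once the reset below has completed.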
5415 */ 5416 amdgpu_device_cache_pci_state(adev->pdev); 5417 5418 /* disable BM */ 5419 pci_clear_master(adev->pdev); 5420 5421 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5422 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5423 ret = amdgpu_dpm_mode1_reset(adev); 5424 } else { 5425 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5426 ret = psp_gpu_reset(adev); 5427 } 5428 5429 if (ret) 5430 goto mode1_reset_failed; 5431 5432 amdgpu_device_load_pci_state(adev->pdev); 5433 ret = amdgpu_psp_wait_for_bootloader(adev); 5434 if (ret) 5435 goto mode1_reset_failed; 5436 5437 /* wait for asic to come out of reset */ 5438 for (i = 0; i < adev->usec_timeout; i++) { 5439 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5440 5441 if (memsize != 0xffffffff) 5442 break; 5443 udelay(1); 5444 } 5445 5446 if (i >= adev->usec_timeout) { 5447 ret = -ETIMEDOUT; 5448 goto mode1_reset_failed; 5449 } 5450 5451 if (adev->bios) 5452 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5453 5454 return 0; 5455 5456 mode1_reset_failed: 5457 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5458 return ret; 5459 } 5460 5461 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5462 struct amdgpu_reset_context *reset_context) 5463 { 5464 int i, r = 0; 5465 struct amdgpu_job *job = NULL; 5466 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5467 bool need_full_reset = 5468 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5469 5470 if (reset_context->reset_req_dev == adev) 5471 job = reset_context->job; 5472 5473 if (amdgpu_sriov_vf(adev)) 5474 amdgpu_virt_pre_reset(adev); 5475 5476 amdgpu_fence_driver_isr_toggle(adev, true); 5477 5478 /* block all schedulers and reset given job's ring */ 5479 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5480 struct amdgpu_ring *ring = adev->rings[i]; 5481 5482 if (!amdgpu_ring_sched_ready(ring)) 5483 continue; 5484 5485 /* Clear job fence from fence drv to avoid force_completion 5486 * leave NULL and vm flush fence in fence drv 5487 */ 5488 amdgpu_fence_driver_clear_job_fences(ring); 5489 5490 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5491 amdgpu_fence_driver_force_completion(ring); 5492 } 5493 5494 amdgpu_fence_driver_isr_toggle(adev, false); 5495 5496 if (job && job->vm) 5497 drm_sched_increase_karma(&job->base); 5498 5499 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5500 /* If reset handler not implemented, continue; otherwise return */ 5501 if (r == -EOPNOTSUPP) 5502 r = 0; 5503 else 5504 return r; 5505 5506 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5507 if (!amdgpu_sriov_vf(adev)) { 5508 5509 if (!need_full_reset) 5510 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5511 5512 if (!need_full_reset && amdgpu_gpu_recovery && 5513 amdgpu_device_ip_check_soft_reset(adev)) { 5514 amdgpu_device_ip_pre_soft_reset(adev); 5515 r = amdgpu_device_ip_soft_reset(adev); 5516 amdgpu_device_ip_post_soft_reset(adev); 5517 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5518 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5519 need_full_reset = true; 5520 } 5521 } 5522 5523 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5524 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5525 /* Trigger ip dump before we reset the asic */ 5526 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5527 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5528 tmp_adev->ip_blocks[i].version->funcs 5529 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5530 
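			/*
			 * The register state captured above is taken before the ASIC
			 * is reset so that it is available to the dev coredump emitted
			 * later in the recovery path (see amdgpu_coredump() in
			 * amdgpu_device_reinit_after_reset()).
			 */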
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5531 } 5532 5533 if (need_full_reset) 5534 r = amdgpu_device_ip_suspend(adev); 5535 if (need_full_reset) 5536 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5537 else 5538 clear_bit(AMDGPU_NEED_FULL_RESET, 5539 &reset_context->flags); 5540 } 5541 5542 return r; 5543 } 5544 5545 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5546 { 5547 struct list_head *device_list_handle; 5548 bool full_reset, vram_lost = false; 5549 struct amdgpu_device *tmp_adev; 5550 int r, init_level; 5551 5552 device_list_handle = reset_context->reset_device_list; 5553 5554 if (!device_list_handle) 5555 return -EINVAL; 5556 5557 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5558 5559 /* 5560 * If this is a reset on init, use the default init level; otherwise 5561 * keep the level at reset-recovery level. 5562 */ 5563 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5564 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5565 else 5566 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5567 5568 r = 0; 5569 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5570 amdgpu_set_init_level(tmp_adev, init_level); 5571 if (full_reset) { 5572 /* post card */ 5573 amdgpu_ras_clear_err_state(tmp_adev); 5574 r = amdgpu_device_asic_init(tmp_adev); 5575 if (r) { 5576 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5577 } else { 5578 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5579 5580 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5581 if (r) 5582 goto out; 5583 5584 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5585 5586 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5587 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5588 5589 if (vram_lost) { 5590 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5591 amdgpu_inc_vram_lost(tmp_adev); 5592 } 5593 5594 r = amdgpu_device_fw_loading(tmp_adev); 5595 if (r) 5596 return r; 5597 5598 r = amdgpu_xcp_restore_partition_mode( 5599 tmp_adev->xcp_mgr); 5600 if (r) 5601 goto out; 5602 5603 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5604 if (r) 5605 goto out; 5606 5607 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5608 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5609 5610 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5611 if (r) 5612 goto out; 5613 5614 if (vram_lost) 5615 amdgpu_device_fill_reset_magic(tmp_adev); 5616 5617 /* 5618 * Add this ASIC as tracked, as the reset has 5619 * already completed successfully. 5620 */ 5621 amdgpu_register_gpu_instance(tmp_adev); 5622 5623 if (!reset_context->hive && 5624 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5625 amdgpu_xgmi_add_device(tmp_adev); 5626 5627 r = amdgpu_device_ip_late_init(tmp_adev); 5628 if (r) 5629 goto out; 5630 5631 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5632 5633 /* 5634 * The GPU enters a bad state once the number of 5635 * faulty pages retired due to ECC errors reaches 5636 * the threshold, and RAS recovery is scheduled 5637 * next. So add a check here to break recovery if 5638 * the bad page threshold has indeed been exceeded, 5639 * and remind the user to retire this GPU or to set 5640 * a bigger bad_page_threshold when probing the 5641 * driver again. 5642 */ 5643 if (!amdgpu_ras_is_rma(tmp_adev)) { 5644 /* must succeed.
*/ 5645 amdgpu_ras_resume(tmp_adev); 5646 } else { 5647 r = -EINVAL; 5648 goto out; 5649 } 5650 5651 /* Update PSP FW topology after reset */ 5652 if (reset_context->hive && 5653 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5654 r = amdgpu_xgmi_update_topology( 5655 reset_context->hive, tmp_adev); 5656 } 5657 } 5658 5659 out: 5660 if (!r) { 5661 /* IP init is complete now, set level as default */ 5662 amdgpu_set_init_level(tmp_adev, 5663 AMDGPU_INIT_LEVEL_DEFAULT); 5664 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5665 r = amdgpu_ib_ring_tests(tmp_adev); 5666 if (r) { 5667 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5668 r = -EAGAIN; 5669 goto end; 5670 } 5671 } 5672 5673 if (r) 5674 tmp_adev->asic_reset_res = r; 5675 } 5676 5677 end: 5678 return r; 5679 } 5680 5681 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5682 struct amdgpu_reset_context *reset_context) 5683 { 5684 struct amdgpu_device *tmp_adev = NULL; 5685 bool need_full_reset, skip_hw_reset; 5686 int r = 0; 5687 5688 /* Try reset handler method first */ 5689 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5690 reset_list); 5691 5692 reset_context->reset_device_list = device_list_handle; 5693 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5694 /* If reset handler not implemented, continue; otherwise return */ 5695 if (r == -EOPNOTSUPP) 5696 r = 0; 5697 else 5698 return r; 5699 5700 /* Reset handler not implemented, use the default method */ 5701 need_full_reset = 5702 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5703 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5704 5705 /* 5706 * ASIC reset has to be done on all XGMI hive nodes ASAP 5707 * to allow proper links negotiation in FW (within 1 sec) 5708 */ 5709 if (!skip_hw_reset && need_full_reset) { 5710 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5711 /* For XGMI run all resets in parallel to speed up the process */ 5712 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5713 if (!queue_work(system_unbound_wq, 5714 &tmp_adev->xgmi_reset_work)) 5715 r = -EALREADY; 5716 } else 5717 r = amdgpu_asic_reset(tmp_adev); 5718 5719 if (r) { 5720 dev_err(tmp_adev->dev, 5721 "ASIC reset failed with error, %d for drm dev, %s", 5722 r, adev_to_drm(tmp_adev)->unique); 5723 goto out; 5724 } 5725 } 5726 5727 /* For XGMI wait for all resets to complete before proceed */ 5728 if (!r) { 5729 list_for_each_entry(tmp_adev, device_list_handle, 5730 reset_list) { 5731 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5732 flush_work(&tmp_adev->xgmi_reset_work); 5733 r = tmp_adev->asic_reset_res; 5734 if (r) 5735 break; 5736 } 5737 } 5738 } 5739 } 5740 5741 if (!r && amdgpu_ras_intr_triggered()) { 5742 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5743 amdgpu_ras_reset_error_count(tmp_adev, 5744 AMDGPU_RAS_BLOCK__MMHUB); 5745 } 5746 5747 amdgpu_ras_intr_cleared(); 5748 } 5749 5750 r = amdgpu_device_reinit_after_reset(reset_context); 5751 if (r == -EAGAIN) 5752 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5753 else 5754 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5755 5756 out: 5757 return r; 5758 } 5759 5760 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5761 { 5762 5763 switch (amdgpu_asic_reset_method(adev)) { 5764 case AMD_RESET_METHOD_MODE1: 5765 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5766 break; 5767 case AMD_RESET_METHOD_MODE2: 5768 adev->mp1_state = PP_MP1_STATE_RESET; 5769 break; 5770 default: 5771 adev->mp1_state = 
			PP_MP1_STATE_NONE;
		break;
	}
}

static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer from the audio issue if not properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * use a fixed 4s interval. The audio controller's default
		 * autosuspend delay is 3s, so 4s is guaranteed to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	pci_dev_put(p);
	return 0;
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

#if defined(CONFIG_DEBUG_FS)
	if (!amdgpu_sriov_vf(adev))
		cancel_work(&adev->reset_work);
#endif

	if (adev->kfd.dev)
		cancel_work(&adev->kfd.reset_work);

	if (amdgpu_sriov_vf(adev))
		cancel_work(&adev->virt.flr_work);

	if (con && adev->ras_enabled)
		cancel_work(&con->recovery_work);
}

static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
	struct amdgpu_device *tmp_adev;
	int ret = 0;
	u32 status;

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
		if (PCI_POSSIBLE_ERROR(status)) {
			dev_err(tmp_adev->dev, "device lost from bus!");
			ret = -ENODEV;
		}
	}

	return ret;
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: which job triggered the hang
 * @reset_context: amdgpu reset context pointer
 *
 * Attempt to reset the GPU if it has hung (all ASICs).
 * Attempt a soft reset or a full reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
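 *
 * Illustrative sketch only (assumed caller pattern, not lifted from this
 * file): a recovery path typically fills an amdgpu_reset_context and then
 * calls this function, e.g.:
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);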
5891 */ 5892 5893 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5894 struct amdgpu_job *job, 5895 struct amdgpu_reset_context *reset_context) 5896 { 5897 struct list_head device_list, *device_list_handle = NULL; 5898 bool job_signaled = false; 5899 struct amdgpu_hive_info *hive = NULL; 5900 struct amdgpu_device *tmp_adev = NULL; 5901 int i, r = 0; 5902 bool need_emergency_restart = false; 5903 bool audio_suspended = false; 5904 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5905 5906 /* 5907 * If it reaches here because of hang/timeout and a RAS error is 5908 * detected at the same time, let RAS recovery take care of it. 5909 */ 5910 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 5911 !amdgpu_sriov_vf(adev) && 5912 reset_context->src != AMDGPU_RESET_SRC_RAS) { 5913 dev_dbg(adev->dev, 5914 "Gpu recovery from source: %d yielding to RAS error recovery handling", 5915 reset_context->src); 5916 return 0; 5917 } 5918 /* 5919 * Special case: RAS triggered and full reset isn't supported 5920 */ 5921 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5922 5923 /* 5924 * Flush RAM to disk so that after reboot 5925 * the user can read log and see why the system rebooted. 5926 */ 5927 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5928 amdgpu_ras_get_context(adev)->reboot) { 5929 DRM_WARN("Emergency reboot."); 5930 5931 ksys_sync_helper(); 5932 emergency_restart(); 5933 } 5934 5935 dev_info(adev->dev, "GPU %s begin!\n", 5936 need_emergency_restart ? "jobs stop":"reset"); 5937 5938 if (!amdgpu_sriov_vf(adev)) 5939 hive = amdgpu_get_xgmi_hive(adev); 5940 if (hive) 5941 mutex_lock(&hive->hive_lock); 5942 5943 reset_context->job = job; 5944 reset_context->hive = hive; 5945 /* 5946 * Build list of devices to reset. 5947 * In case we are in XGMI hive mode, resort the device list 5948 * to put adev in the 1st position. 5949 */ 5950 INIT_LIST_HEAD(&device_list); 5951 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5952 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5953 list_add_tail(&tmp_adev->reset_list, &device_list); 5954 if (adev->shutdown) 5955 tmp_adev->shutdown = true; 5956 } 5957 if (!list_is_first(&adev->reset_list, &device_list)) 5958 list_rotate_to_front(&adev->reset_list, &device_list); 5959 device_list_handle = &device_list; 5960 } else { 5961 list_add_tail(&adev->reset_list, &device_list); 5962 device_list_handle = &device_list; 5963 } 5964 5965 if (!amdgpu_sriov_vf(adev)) { 5966 r = amdgpu_device_health_check(device_list_handle); 5967 if (r) 5968 goto end_reset; 5969 } 5970 5971 /* We need to lock reset domain only once both for XGMI and single device */ 5972 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5973 reset_list); 5974 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5975 5976 /* block all schedulers and reset given job's ring */ 5977 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5978 5979 amdgpu_device_set_mp1_state(tmp_adev); 5980 5981 /* 5982 * Try to put the audio codec into suspend state 5983 * before gpu reset started. 5984 * 5985 * Due to the power domain of the graphics device 5986 * is shared with AZ power domain. Without this, 5987 * we may change the audio hardware from behind 5988 * the audio driver's back. That will trigger 5989 * some audio codec errors. 
5990 */ 5991 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5992 audio_suspended = true; 5993 5994 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5995 5996 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5997 5998 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5999 6000 /* 6001 * Mark these ASICs to be reset as untracked first 6002 * And add them back after reset completed 6003 */ 6004 amdgpu_unregister_gpu_instance(tmp_adev); 6005 6006 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6007 6008 /* disable ras on ALL IPs */ 6009 if (!need_emergency_restart && 6010 amdgpu_device_ip_need_full_reset(tmp_adev)) 6011 amdgpu_ras_suspend(tmp_adev); 6012 6013 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6014 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6015 6016 if (!amdgpu_ring_sched_ready(ring)) 6017 continue; 6018 6019 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6020 6021 if (need_emergency_restart) 6022 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6023 } 6024 atomic_inc(&tmp_adev->gpu_reset_counter); 6025 } 6026 6027 if (need_emergency_restart) 6028 goto skip_sched_resume; 6029 6030 /* 6031 * Must check guilty signal here since after this point all old 6032 * HW fences are force signaled. 6033 * 6034 * job->base holds a reference to parent fence 6035 */ 6036 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6037 job_signaled = true; 6038 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6039 goto skip_hw_reset; 6040 } 6041 6042 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6043 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6044 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6045 /*TODO Should we stop ?*/ 6046 if (r) { 6047 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6048 r, adev_to_drm(tmp_adev)->unique); 6049 tmp_adev->asic_reset_res = r; 6050 } 6051 } 6052 6053 /* Actual ASIC resets if needed.*/ 6054 /* Host driver will handle XGMI hive reset for SRIOV */ 6055 if (amdgpu_sriov_vf(adev)) { 6056 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6057 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6058 amdgpu_ras_set_fed(adev, true); 6059 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6060 } 6061 6062 r = amdgpu_device_reset_sriov(adev, reset_context); 6063 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6064 amdgpu_virt_release_full_gpu(adev, true); 6065 goto retry; 6066 } 6067 if (r) 6068 adev->asic_reset_res = r; 6069 } else { 6070 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 6071 if (r && r == -EAGAIN) 6072 goto retry; 6073 } 6074 6075 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6076 /* 6077 * Drop any pending non scheduler resets queued before reset is done. 6078 * Any reset scheduled after this point would be valid. Scheduler resets 6079 * were already dropped during drm_sched_stop and no new ones can come 6080 * in before drm_sched_start. 
6081 */ 6082 amdgpu_device_stop_pending_resets(tmp_adev); 6083 } 6084 6085 skip_hw_reset: 6086 6087 /* Post ASIC reset for all devs .*/ 6088 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6089 6090 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6091 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6092 6093 if (!amdgpu_ring_sched_ready(ring)) 6094 continue; 6095 6096 drm_sched_start(&ring->sched, 0); 6097 } 6098 6099 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6100 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6101 6102 if (tmp_adev->asic_reset_res) 6103 r = tmp_adev->asic_reset_res; 6104 6105 tmp_adev->asic_reset_res = 0; 6106 6107 if (r) { 6108 /* bad news, how to tell it to userspace ? 6109 * for ras error, we should report GPU bad status instead of 6110 * reset failure 6111 */ 6112 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6113 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6114 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6115 atomic_read(&tmp_adev->gpu_reset_counter)); 6116 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6117 } else { 6118 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6119 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6120 DRM_WARN("smart shift update failed\n"); 6121 } 6122 } 6123 6124 skip_sched_resume: 6125 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6126 /* unlock kfd: SRIOV would do it separately */ 6127 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6128 amdgpu_amdkfd_post_reset(tmp_adev); 6129 6130 /* kfd_post_reset will do nothing if kfd device is not initialized, 6131 * need to bring up kfd here if it's not be initialized before 6132 */ 6133 if (!adev->kfd.init_complete) 6134 amdgpu_amdkfd_device_init(adev); 6135 6136 if (audio_suspended) 6137 amdgpu_device_resume_display_audio(tmp_adev); 6138 6139 amdgpu_device_unset_mp1_state(tmp_adev); 6140 6141 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6142 } 6143 6144 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6145 reset_list); 6146 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6147 6148 end_reset: 6149 if (hive) { 6150 mutex_unlock(&hive->hive_lock); 6151 amdgpu_put_xgmi_hive(hive); 6152 } 6153 6154 if (r) 6155 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6156 6157 atomic_set(&adev->reset_domain->reset_res, r); 6158 return r; 6159 } 6160 6161 /** 6162 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6163 * 6164 * @adev: amdgpu_device pointer 6165 * @speed: pointer to the speed of the link 6166 * @width: pointer to the width of the link 6167 * 6168 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6169 * first physical partner to an AMD dGPU. 6170 * This will exclude any virtual switches and links. 
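 *
 * Note on the logic below: when PCIe dynamic switching is supported, the
 * capabilities of the first non-AMD upstream bridge are reported;
 * otherwise the currently available link speed and width (via
 * pcie_bandwidth_available()) are used instead.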
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU, which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
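 *
 * The detected masks can be overridden from the amdgpu module parameters
 * (illustrative invocation, mask values elided on purpose):
 *
 *	modprobe amdgpu pcie_gen_cap=<CAIL gen mask> pcie_lane_cap=<CAIL width mask>
 *
 * in which case the values are used verbatim as adev->pm.pcie_gen_mask and
 * adev->pm.pcie_mlw_mask; see the checks at the top of this function.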
6245 */ 6246 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6247 { 6248 enum pci_bus_speed speed_cap, platform_speed_cap; 6249 enum pcie_link_width platform_link_width, link_width; 6250 6251 if (amdgpu_pcie_gen_cap) 6252 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6253 6254 if (amdgpu_pcie_lane_cap) 6255 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6256 6257 /* covers APUs as well */ 6258 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6259 if (adev->pm.pcie_gen_mask == 0) 6260 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6261 if (adev->pm.pcie_mlw_mask == 0) 6262 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6263 return; 6264 } 6265 6266 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6267 return; 6268 6269 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6270 &platform_link_width); 6271 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6272 6273 if (adev->pm.pcie_gen_mask == 0) { 6274 /* asic caps */ 6275 if (speed_cap == PCI_SPEED_UNKNOWN) { 6276 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6277 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6278 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6279 } else { 6280 if (speed_cap == PCIE_SPEED_32_0GT) 6281 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6282 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6283 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6284 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6285 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6286 else if (speed_cap == PCIE_SPEED_16_0GT) 6287 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6288 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6289 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6290 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6291 else if (speed_cap == PCIE_SPEED_8_0GT) 6292 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6293 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6294 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6295 else if (speed_cap == PCIE_SPEED_5_0GT) 6296 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6297 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6298 else 6299 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6300 } 6301 /* platform caps */ 6302 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6303 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6304 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6305 } else { 6306 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6307 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6308 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6309 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6310 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6311 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6312 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6313 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6314 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6315 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6316 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6317 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6318 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6319 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6320 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6321 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6322 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6323 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6324 else 6325 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6326 6327 } 6328 } 6329 if (adev->pm.pcie_mlw_mask == 0) { 6330 /* asic caps */ 6331 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6332 
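			/* ASIC width capability could not be read; fall back to the default mask */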
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6333 } else { 6334 switch (link_width) { 6335 case PCIE_LNK_X32: 6336 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6337 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6338 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6339 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6340 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6341 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6342 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6343 break; 6344 case PCIE_LNK_X16: 6345 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6346 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6347 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6348 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6349 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6350 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6351 break; 6352 case PCIE_LNK_X12: 6353 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6354 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6355 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6356 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6357 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6358 break; 6359 case PCIE_LNK_X8: 6360 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6361 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6362 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6363 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6364 break; 6365 case PCIE_LNK_X4: 6366 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6367 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6368 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6369 break; 6370 case PCIE_LNK_X2: 6371 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6372 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6373 break; 6374 case PCIE_LNK_X1: 6375 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6376 break; 6377 default: 6378 break; 6379 } 6380 } 6381 /* platform caps */ 6382 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6383 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6384 } else { 6385 switch (platform_link_width) { 6386 case PCIE_LNK_X32: 6387 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6388 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6389 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6390 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6391 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6392 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6393 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6394 break; 6395 case PCIE_LNK_X16: 6396 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6397 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6398 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6399 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6400 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6401 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6402 break; 6403 case PCIE_LNK_X12: 6404 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6405 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6406 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6407 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6408 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6409 break; 6410 case PCIE_LNK_X8: 6411 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6412 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6413 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6414 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6415 break; 6416 case PCIE_LNK_X4: 6417 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6418 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6419 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6420 break; 6421 case PCIE_LNK_X2: 6422 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6423 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6424 break; 6425 case PCIE_LNK_X1: 6426 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6427 break; 6428 
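			/* unknown or unsupported link width: leave the platform width mask unset */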
default: 6429 break; 6430 } 6431 } 6432 } 6433 } 6434 6435 /** 6436 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6437 * 6438 * @adev: amdgpu_device pointer 6439 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6440 * 6441 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6442 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6443 * @peer_adev. 6444 */ 6445 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6446 struct amdgpu_device *peer_adev) 6447 { 6448 #ifdef CONFIG_HSA_AMD_P2P 6449 bool p2p_access = 6450 !adev->gmc.xgmi.connected_to_cpu && 6451 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6452 if (!p2p_access) 6453 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6454 pci_name(peer_adev->pdev)); 6455 6456 bool is_large_bar = adev->gmc.visible_vram_size && 6457 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6458 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6459 6460 if (!p2p_addressable) { 6461 uint64_t address_mask = peer_adev->dev->dma_mask ? 6462 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6463 resource_size_t aper_limit = 6464 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6465 6466 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6467 aper_limit & address_mask); 6468 } 6469 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6470 #else 6471 return false; 6472 #endif 6473 } 6474 6475 int amdgpu_device_baco_enter(struct drm_device *dev) 6476 { 6477 struct amdgpu_device *adev = drm_to_adev(dev); 6478 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6479 6480 if (!amdgpu_device_supports_baco(dev)) 6481 return -ENOTSUPP; 6482 6483 if (ras && adev->ras_enabled && 6484 adev->nbio.funcs->enable_doorbell_interrupt) 6485 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6486 6487 return amdgpu_dpm_baco_enter(adev); 6488 } 6489 6490 int amdgpu_device_baco_exit(struct drm_device *dev) 6491 { 6492 struct amdgpu_device *adev = drm_to_adev(dev); 6493 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6494 int ret = 0; 6495 6496 if (!amdgpu_device_supports_baco(dev)) 6497 return -ENOTSUPP; 6498 6499 ret = amdgpu_dpm_baco_exit(adev); 6500 if (ret) 6501 return ret; 6502 6503 if (ras && adev->ras_enabled && 6504 adev->nbio.funcs->enable_doorbell_interrupt) 6505 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6506 6507 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6508 adev->nbio.funcs->clear_doorbell_interrupt) 6509 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6510 6511 return 0; 6512 } 6513 6514 /** 6515 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6516 * @pdev: PCI device struct 6517 * @state: PCI channel state 6518 * 6519 * Description: Called when a PCI error is detected. 6520 * 6521 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
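 *
 * This callback, together with amdgpu_pci_mmio_enabled(),
 * amdgpu_pci_slot_reset() and amdgpu_pci_resume() below, is the kind of
 * callback that gets wired into a struct pci_error_handlers registered
 * with the PCI core; an illustrative sketch (not the driver's actual
 * table) would look like:
 *
 *	static const struct pci_error_handlers example_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};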
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	/* PCI error slot reset should be skipped during RAS recovery */
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
	    amdgpu_ras_in_recovery(adev))
		return PCI_ERS_RESULT_RECOVERED;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
6674 */ 6675 void amdgpu_pci_resume(struct pci_dev *pdev) 6676 { 6677 struct drm_device *dev = pci_get_drvdata(pdev); 6678 struct amdgpu_device *adev = drm_to_adev(dev); 6679 int i; 6680 6681 6682 DRM_INFO("PCI error: resume callback!!\n"); 6683 6684 /* Only continue execution for the case of pci_channel_io_frozen */ 6685 if (adev->pci_channel_state != pci_channel_io_frozen) 6686 return; 6687 6688 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6689 struct amdgpu_ring *ring = adev->rings[i]; 6690 6691 if (!amdgpu_ring_sched_ready(ring)) 6692 continue; 6693 6694 drm_sched_start(&ring->sched, 0); 6695 } 6696 6697 amdgpu_device_unset_mp1_state(adev); 6698 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6699 } 6700 6701 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6702 { 6703 struct drm_device *dev = pci_get_drvdata(pdev); 6704 struct amdgpu_device *adev = drm_to_adev(dev); 6705 int r; 6706 6707 if (amdgpu_sriov_vf(adev)) 6708 return false; 6709 6710 r = pci_save_state(pdev); 6711 if (!r) { 6712 kfree(adev->pci_state); 6713 6714 adev->pci_state = pci_store_saved_state(pdev); 6715 6716 if (!adev->pci_state) { 6717 DRM_ERROR("Failed to store PCI saved state"); 6718 return false; 6719 } 6720 } else { 6721 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6722 return false; 6723 } 6724 6725 return true; 6726 } 6727 6728 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6729 { 6730 struct drm_device *dev = pci_get_drvdata(pdev); 6731 struct amdgpu_device *adev = drm_to_adev(dev); 6732 int r; 6733 6734 if (!adev->pci_state) 6735 return false; 6736 6737 r = pci_load_saved_state(pdev, adev->pci_state); 6738 6739 if (!r) { 6740 pci_restore_state(pdev); 6741 } else { 6742 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6743 return false; 6744 } 6745 6746 return true; 6747 } 6748 6749 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6750 struct amdgpu_ring *ring) 6751 { 6752 #ifdef CONFIG_X86_64 6753 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6754 return; 6755 #endif 6756 if (adev->gmc.xgmi.connected_to_cpu) 6757 return; 6758 6759 if (ring && ring->funcs->emit_hdp_flush) 6760 amdgpu_ring_emit_hdp_flush(ring); 6761 else 6762 amdgpu_asic_flush_hdp(adev, ring); 6763 } 6764 6765 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6766 struct amdgpu_ring *ring) 6767 { 6768 #ifdef CONFIG_X86_64 6769 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6770 return; 6771 #endif 6772 if (adev->gmc.xgmi.connected_to_cpu) 6773 return; 6774 6775 amdgpu_asic_invalidate_hdp(adev, ring); 6776 } 6777 6778 int amdgpu_in_reset(struct amdgpu_device *adev) 6779 { 6780 return atomic_read(&adev->reset_domain->in_gpu_reset); 6781 } 6782 6783 /** 6784 * amdgpu_device_halt() - bring hardware to some kind of halt state 6785 * 6786 * @adev: amdgpu_device pointer 6787 * 6788 * Bring hardware to some kind of halt state so that no one can touch it 6789 * any more. It will help to maintain error context when error occurred. 6790 * Compare to a simple hang, the system will keep stable at least for SSH 6791 * access. Then it should be trivial to inspect the hardware state and 6792 * see what's going on. Implemented as following: 6793 * 6794 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6795 * clears all CPU mappings to device, disallows remappings through page faults 6796 * 2. amdgpu_irq_disable_all() disables all interrupts 6797 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6798 * 4. 
set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
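 *
 * Illustrative usage sketch (assumed caller pattern, not copied from this
 * file): retry until the previous, still running gang has signaled:
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, gang))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}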
6880 */ 6881 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6882 struct dma_fence *gang) 6883 { 6884 struct dma_fence *old = NULL; 6885 6886 do { 6887 dma_fence_put(old); 6888 old = amdgpu_device_get_gang(adev); 6889 if (old == gang) 6890 break; 6891 6892 if (!dma_fence_is_signaled(old)) 6893 return old; 6894 6895 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6896 old, gang) != old); 6897 6898 dma_fence_put(old); 6899 return NULL; 6900 } 6901 6902 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6903 { 6904 switch (adev->asic_type) { 6905 #ifdef CONFIG_DRM_AMDGPU_SI 6906 case CHIP_HAINAN: 6907 #endif 6908 case CHIP_TOPAZ: 6909 /* chips with no display hardware */ 6910 return false; 6911 #ifdef CONFIG_DRM_AMDGPU_SI 6912 case CHIP_TAHITI: 6913 case CHIP_PITCAIRN: 6914 case CHIP_VERDE: 6915 case CHIP_OLAND: 6916 #endif 6917 #ifdef CONFIG_DRM_AMDGPU_CIK 6918 case CHIP_BONAIRE: 6919 case CHIP_HAWAII: 6920 case CHIP_KAVERI: 6921 case CHIP_KABINI: 6922 case CHIP_MULLINS: 6923 #endif 6924 case CHIP_TONGA: 6925 case CHIP_FIJI: 6926 case CHIP_POLARIS10: 6927 case CHIP_POLARIS11: 6928 case CHIP_POLARIS12: 6929 case CHIP_VEGAM: 6930 case CHIP_CARRIZO: 6931 case CHIP_STONEY: 6932 /* chips with display hardware */ 6933 return true; 6934 default: 6935 /* IP discovery */ 6936 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6937 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6938 return false; 6939 return true; 6940 } 6941 } 6942 6943 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6944 uint32_t inst, uint32_t reg_addr, char reg_name[], 6945 uint32_t expected_value, uint32_t mask) 6946 { 6947 uint32_t ret = 0; 6948 uint32_t old_ = 0; 6949 uint32_t tmp_ = RREG32(reg_addr); 6950 uint32_t loop = adev->usec_timeout; 6951 6952 while ((tmp_ & (mask)) != (expected_value)) { 6953 if (old_ != tmp_) { 6954 loop = adev->usec_timeout; 6955 old_ = tmp_; 6956 } else 6957 udelay(1); 6958 tmp_ = RREG32(reg_addr); 6959 loop--; 6960 if (!loop) { 6961 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 6962 inst, reg_name, (uint32_t)expected_value, 6963 (uint32_t)(tmp_ & (mask))); 6964 ret = -ETIMEDOUT; 6965 break; 6966 } 6967 } 6968 return ret; 6969 } 6970 6971 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 6972 { 6973 ssize_t size = 0; 6974 6975 if (!ring || !ring->adev) 6976 return size; 6977 6978 if (amdgpu_device_should_recover_gpu(ring->adev)) 6979 size |= AMDGPU_RESET_TYPE_FULL; 6980 6981 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 6982 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 6983 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 6984 6985 return size; 6986 } 6987 6988 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 6989 { 6990 ssize_t size = 0; 6991 6992 if (supported_reset == 0) { 6993 size += sysfs_emit_at(buf, size, "unsupported"); 6994 size += sysfs_emit_at(buf, size, "\n"); 6995 return size; 6996 6997 } 6998 6999 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7000 size += sysfs_emit_at(buf, size, "soft "); 7001 7002 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7003 size += sysfs_emit_at(buf, size, "queue "); 7004 7005 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7006 size += sysfs_emit_at(buf, size, "pipe "); 7007 7008 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7009 size += sysfs_emit_at(buf, size, "full "); 7010 7011 size += sysfs_emit_at(buf, size, "\n"); 7012 return size; 7013 } 7014
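
/*
 * Example (illustrative sketch only, not part of the driver): a sysfs
 * "show" callback for an IP block's supported reset types would typically
 * combine the helpers above along these lines; the drvdata and the gfx
 * ring used here are assumptions for the sake of the example:
 *
 *	static ssize_t example_reset_mask_show(struct device *dev,
 *					       struct device_attribute *attr,
 *					       char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *		uint32_t mask;
 *
 *		mask = (uint32_t)amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
 *
 *		return amdgpu_show_reset_mask(buf, mask);
 *	}
 */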