/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
        "TAHITI",
        "PITCAIRN",
        "VERDE",
        "OLAND",
        "HAINAN",
        "BONAIRE",
        "KAVERI",
        "KABINI",
        "HAWAII",
        "MULLINS",
        "TOPAZ",
        "TONGA",
        "FIJI",
        "CARRIZO",
        "STONEY",
        "POLARIS10",
        "POLARIS11",
        "POLARIS12",
        "VEGAM",
        "VEGA10",
        "VEGA12",
        "VEGA20",
        "RAVEN",
        "ARCTURUS",
        "RENOIR",
        "ALDEBARAN",
        "NAVI10",
        "CYAN_SKILLFISH",
        "NAVI14",
        "NAVI12",
        "SIENNA_CICHLID",
        "NAVY_FLOUNDER",
        "VANGOGH",
        "DIMGREY_CAVEFISH",
        "BEIGE_GOBY",
        "YELLOW_CARP",
        "IP DISCOVERY",
        "LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
        .level = AMDGPU_INIT_LEVEL_DEFAULT,
        .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
        .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
        .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
        .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
        .hwini_ip_block_mask =
                BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
                BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
                BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
                                             enum amd_ip_block_type block)
{
        return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
                           enum amdgpu_init_lvl_id lvl)
{
        switch (lvl) {
        case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
                adev->init_lvl = &amdgpu_init_minimal_xgmi;
                break;
        case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
                adev->init_lvl = &amdgpu_init_recovery;
                break;
        case AMDGPU_INIT_LEVEL_DEFAULT:
                fallthrough;
        default:
                adev->init_lvl = &amdgpu_init_default;
                break;
        }
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
                                     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */
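/*
 * Example of reading the attribute from user space (illustrative only;
 * the PCI bus address is an assumption and varies per system):
 *
 *   $ cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count
 *   0
 */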
static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);
        uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

        return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
                   amdgpu_device_get_pcie_replay_count, NULL);

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
                                          struct bin_attribute *attr, char *buf,
                                          loff_t ppos, size_t count)
{
        struct device *dev = kobj_to_dev(kobj);
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);
        ssize_t bytes_read;

        switch (ppos) {
        case AMDGPU_SYS_REG_STATE_XGMI:
                bytes_read = amdgpu_asic_get_reg_state(
                        adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
                break;
        case AMDGPU_SYS_REG_STATE_WAFL:
                bytes_read = amdgpu_asic_get_reg_state(
                        adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
                break;
        case AMDGPU_SYS_REG_STATE_PCIE:
                bytes_read = amdgpu_asic_get_reg_state(
                        adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
                break;
        case AMDGPU_SYS_REG_STATE_USR:
                bytes_read = amdgpu_asic_get_reg_state(
                        adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
                break;
        case AMDGPU_SYS_REG_STATE_USR_1:
                bytes_read = amdgpu_asic_get_reg_state(
                        adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
                break;
        default:
                return -EINVAL;
        }

        return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
         AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
        int ret;

        if (!amdgpu_asic_get_reg_state_supported(adev))
                return 0;

        ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

        return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
        if (!amdgpu_asic_get_reg_state_supported(adev))
                return;
        sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
        int r;

        if (ip_block->version->funcs->suspend) {
                r = ip_block->version->funcs->suspend(ip_block);
                if (r) {
                        dev_err(ip_block->adev->dev,
                                "suspend of IP block <%s> failed %d\n",
                                ip_block->version->funcs->name, r);
                        return r;
                }
        }

        ip_block->status.hw = false;
        return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
        int r;

        if (ip_block->version->funcs->resume) {
                r = ip_block->version->funcs->resume(ip_block);
                if (r) {
                        dev_err(ip_block->adev->dev,
                                "resume of IP block <%s> failed %d\n",
                                ip_block->version->funcs->name, r);
                        return r;
                }
        }

        ip_block->status.hw = true;
        return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for reporting board related
 * information. It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 */
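/*
 * Example of reading the attribute from user space (illustrative only;
 * the PCI bus address is an assumption and varies per system):
 *
 *   $ cat /sys/bus/pci/devices/0000:03:00.0/board_info
 *   type : oam
 */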
static ssize_t amdgpu_device_get_board_info(struct device *dev,
                                            struct device_attribute *attr,
                                            char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);
        enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
        const char *pkg;

        if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
                pkg_type = adev->smuio.funcs->get_pkg_type(adev);

        switch (pkg_type) {
        case AMDGPU_PKG_TYPE_CEM:
                pkg = "cem";
                break;
        case AMDGPU_PKG_TYPE_OAM:
                pkg = "oam";
                break;
        default:
                pkg = "unknown";
                break;
        }

        return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
        &dev_attr_board_info.attr,
        NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
                                             struct attribute *attr, int n)
{
        struct device *dev = kobj_to_dev(kobj);
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        if (adev->flags & AMD_IS_APU)
                return 0;

        return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
        .attrs = amdgpu_board_attrs,
        .is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
                return true;
        return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
                return false;

        if (adev->has_pr3 ||
            ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
                return true;
        return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        return amdgpu_asic_supports_baco(adev);
}
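/*
 * Illustrative sketch of checking the return encoding (the in-tree usage
 * is amdgpu_device_detect_runtime_pm_mode() below):
 *
 *   int support = amdgpu_device_supports_baco(dev);
 *
 *   if (support & MACO_SUPPORT)        // 3: BACO + MACO (BAMACO)
 *           ...;
 *   else if (support == BACO_SUPPORT)  // 1: BACO only
 *           ...;
 */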
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
        struct drm_device *dev;
        int bamaco_support;

        dev = adev_to_drm(adev);

        adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
        bamaco_support = amdgpu_device_supports_baco(dev);

        switch (amdgpu_runtime_pm) {
        case 2:
                if (bamaco_support & MACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
                        dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
                } else if (bamaco_support == BACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                        dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
                }
                break;
        case 1:
                if (bamaco_support & BACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                        dev_info(adev->dev, "Forcing BACO for runtime pm\n");
                }
                break;
        case -1:
        case -2:
                if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
                        adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
                        dev_info(adev->dev, "Using ATPX for runtime pm\n");
                } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
                        dev_info(adev->dev, "Using BOCO for runtime pm\n");
                } else {
                        if (!bamaco_support)
                                goto no_runtime_pm;

                        switch (adev->asic_type) {
                        case CHIP_VEGA20:
                        case CHIP_ARCTURUS:
                                /* BACO is not supported on vega20 and arcturus */
                                break;
                        case CHIP_VEGA10:
                                /* enable BACO as runpm mode if noretry=0 */
                                if (!adev->gmc.noretry)
                                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                                break;
                        default:
                                /* enable BACO as runpm mode on CI+ */
                                adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                                break;
                        }

                        if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
                                if (bamaco_support & MACO_SUPPORT) {
                                        adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
                                        dev_info(adev->dev, "Using BAMACO for runtime pm\n");
                                } else {
                                        dev_info(adev->dev, "Using BACO for runtime pm\n");
                                }
                        }
                }
                break;
        case 0:
                dev_info(adev->dev, "runtime pm is manually disabled\n");
                break;
        default:
                break;
        }

no_runtime_pm:
        if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
                dev_info(adev->dev, "Runtime PM not available\n");
}
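/*
 * The switch above is driven by the amdgpu.runpm module parameter (the
 * amdgpu_runtime_pm variable). A hedged example of the resulting behavior,
 * assuming the semantics encoded above: booting with "amdgpu.runpm=2"
 * forces BAMACO when MACO is supported and falls back to BACO otherwise,
 * while "amdgpu.runpm=0" disables runtime power management entirely.
 */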
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
        return (amdgpu_device_supports_boco(dev) &&
                amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
                             void *buf, size_t size, bool write)
{
        unsigned long flags;
        uint32_t hi = ~0, tmp = 0;
        uint32_t *data = buf;
        uint64_t last;
        int idx;

        if (!drm_dev_enter(adev_to_drm(adev), &idx))
                return;

        BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

        spin_lock_irqsave(&adev->mmio_idx_lock, flags);
        for (last = pos + size; pos < last; pos += 4) {
                tmp = pos >> 31;

                WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
                if (tmp != hi) {
                        WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
                        hi = tmp;
                }
                if (write)
                        WREG32_NO_KIQ(mmMM_DATA, *data++);
                else
                        *data++ = RREG32_NO_KIQ(mmMM_DATA);
        }

        spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
        drm_dev_exit(idx);
}
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
                                 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
        void __iomem *addr;
        size_t count = 0;
        uint64_t last;

        if (!adev->mman.aper_base_kaddr)
                return 0;

        last = min(pos + size, adev->gmc.visible_vram_size);
        if (last > pos) {
                addr = adev->mman.aper_base_kaddr + pos;
                count = last - pos;

                if (write) {
                        memcpy_toio(addr, buf, count);
                        /* Make sure HDP write cache flush happens without any reordering
                         * after the system memory contents are sent over PCIe device
                         */
                        mb();
                        amdgpu_device_flush_hdp(adev, NULL);
                } else {
                        amdgpu_device_invalidate_hdp(adev, NULL);
                        /* Make sure HDP read cache is invalidated before issuing a read
                         * to the PCIe device
                         */
                        mb();
                        memcpy_fromio(buf, addr, count);
                }

        }

        return count;
#else
        return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
                               void *buf, size_t size, bool write)
{
        size_t count;

        /* try using the VRAM aperture to access VRAM first */
        count = amdgpu_device_aper_access(adev, pos, buf, size, write);
        size -= count;
        if (size) {
                /* use MM_INDEX/MM_DATA to access the rest of VRAM */
                pos += count;
                buf += count;
                amdgpu_device_mm_access(adev, pos, buf, size, write);
        }
}
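/*
 * Illustrative caller sketch (not from this file): read the first dword of
 * VRAM into a local buffer. The CPU-visible aperture is tried first and
 * MM_INDEX/MM_DATA covers whatever remains beyond it:
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, 0, &val, sizeof(val), false);
 */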
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
        if (adev->no_hw_access)
                return true;

#ifdef CONFIG_LOCKDEP
        /*
         * This is a bit complicated to understand, so worth a comment. What we assert
         * here is that the GPU reset is not running on another thread in parallel.
         *
         * For this we trylock the read side of the reset semaphore, if that succeeds
         * we know that the reset is not running in parallel.
         *
         * If the trylock fails we assert that we are either already holding the read
         * side of the lock or are the reset thread itself and hold the write side of
         * the lock.
         */
        if (in_task()) {
                if (down_read_trylock(&adev->reset_domain->sem))
                        up_read(&adev->reset_domain->sem);
                else
                        lockdep_assert_held(&adev->reset_domain->sem);
        }
#endif
        return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
                            uint32_t reg, uint32_t acc_flags)
{
        uint32_t ret;

        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if ((reg * 4) < adev->rmmio_size) {
                if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                    amdgpu_sriov_runtime(adev) &&
                    down_read_trylock(&adev->reset_domain->sem)) {
                        ret = amdgpu_kiq_rreg(adev, reg, 0);
                        up_read(&adev->reset_domain->sem);
                } else {
                        ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                ret = adev->pcie_rreg(adev, reg * 4);
        }

        trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

        return ret;
}
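/*
 * Most callers do not invoke amdgpu_device_rreg()/amdgpu_device_wreg()
 * directly but go through the RREG32()/WREG32() style macros from amdgpu.h,
 * which pass the implicit adev and default access flags, e.g.:
 *
 *   tmp = RREG32(mmMM_INDEX);   // expands to amdgpu_device_rreg()
 *   WREG32(mmMM_INDEX, tmp);    // expands to amdgpu_device_wreg()
 */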
/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if (offset < adev->rmmio_size)
                return (readb(adev->rmmio + offset));
        BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
                                uint32_t reg, uint32_t acc_flags,
                                uint32_t xcc_id)
{
        uint32_t ret, rlcg_flag;

        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if ((reg * 4) < adev->rmmio_size) {
                if (amdgpu_sriov_vf(adev) &&
                    !amdgpu_sriov_runtime(adev) &&
                    adev->gfx.rlc.rlcg_reg_access_supported &&
                    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
                                                         GC_HWIP, false,
                                                         &rlcg_flag)) {
                        ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
                } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                           amdgpu_sriov_runtime(adev) &&
                           down_read_trylock(&adev->reset_domain->sem)) {
                        ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
                        up_read(&adev->reset_domain->sem);
                } else {
                        ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                ret = adev->pcie_rreg(adev, reg * 4);
        }

        return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (offset < adev->rmmio_size)
                writeb(value, adev->rmmio + offset);
        else
                BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
                        uint32_t reg, uint32_t v,
                        uint32_t acc_flags)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if ((reg * 4) < adev->rmmio_size) {
                if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                    amdgpu_sriov_runtime(adev) &&
                    down_read_trylock(&adev->reset_domain->sem)) {
                        amdgpu_kiq_wreg(adev, reg, v, 0);
                        up_read(&adev->reset_domain->sem);
                } else {
                        writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                adev->pcie_wreg(adev, reg * 4, v);
        }

        trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
                             uint32_t reg, uint32_t v,
                             uint32_t xcc_id)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (amdgpu_sriov_fullaccess(adev) &&
            adev->gfx.rlc.funcs &&
            adev->gfx.rlc.funcs->is_rlcg_access_range) {
                if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
                        return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
        } else if ((reg * 4) >= adev->rmmio_size) {
                adev->pcie_wreg(adev, reg * 4, v);
        } else {
                writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
        }
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
                            uint32_t reg, uint32_t v,
                            uint32_t acc_flags, uint32_t xcc_id)
{
        uint32_t rlcg_flag;

        if (amdgpu_device_skip_hw_access(adev))
                return;

        if ((reg * 4) < adev->rmmio_size) {
                if (amdgpu_sriov_vf(adev) &&
                    !amdgpu_sriov_runtime(adev) &&
                    adev->gfx.rlc.rlcg_reg_access_supported &&
                    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
                                                         GC_HWIP, true,
                                                         &rlcg_flag)) {
                        amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
                } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                           amdgpu_sriov_runtime(adev) &&
                           down_read_trylock(&adev->reset_domain->sem)) {
                        amdgpu_kiq_wreg(adev, reg, v, xcc_id);
                        up_read(&adev->reset_domain->sem);
                } else {
                        writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                adev->pcie_wreg(adev, reg * 4, v);
        }
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
                                u32 reg_addr)
{
        unsigned long flags, pcie_index, pcie_data;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;
        u32 r;

        pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
        pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        r = readl(pcie_data_offset);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

        return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
                                    u64 reg_addr)
{
        unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
        u32 r;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_index_hi_offset;
        void __iomem *pcie_data_offset;

        if (unlikely(!adev->nbio.funcs)) {
                pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
                pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
        } else {
                pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
                pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
        }

        if (reg_addr >> 32) {
                if (unlikely(!adev->nbio.funcs))
                        pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
                else
                        pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
        } else {
                pcie_index_hi = 0;
        }

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
        if (pcie_index_hi != 0)
                pcie_index_hi_offset = (void __iomem *)adev->rmmio +
                                pcie_index_hi * 4;

        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        if (pcie_index_hi != 0) {
                writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
                readl(pcie_index_hi_offset);
        }
        r = readl(pcie_data_offset);

        /* clear the high bits */
        if (pcie_index_hi != 0) {
                writel(0, pcie_index_hi_offset);
                readl(pcie_index_hi_offset);
        }

        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

        return r;
}
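/*
 * The indirect accessors above implement the classic index/data pair
 * protocol: write the target register address to the PCIE index register,
 * read the index register back to flush the posted write, then read or
 * write the PCIE data register. A minimal sketch of the read side:
 *
 *   writel(reg_addr, pcie_index_offset);   // select the register
 *   readl(pcie_index_offset);              // flush the posted write
 *   r = readl(pcie_data_offset);           // fetch its value
 */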
/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
                                  u32 reg_addr)
{
        unsigned long flags, pcie_index, pcie_data;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;
        u64 r;

        pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
        pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        /* read low 32 bits */
        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        r = readl(pcie_data_offset);
        /* read high 32 bits */
        writel(reg_addr + 4, pcie_index_offset);
        readl(pcie_index_offset);
        r |= ((u64)readl(pcie_data_offset) << 32);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

        return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
                                      u64 reg_addr)
{
        unsigned long flags, pcie_index, pcie_data;
        unsigned long pcie_index_hi = 0;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_index_hi_offset;
        void __iomem *pcie_data_offset;
        u64 r;

        pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
        pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
        if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
                pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
        if (pcie_index_hi != 0)
                pcie_index_hi_offset = (void __iomem *)adev->rmmio +
                                pcie_index_hi * 4;

        /* read low 32 bits */
        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        if (pcie_index_hi != 0) {
                writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
                readl(pcie_index_hi_offset);
        }
        r = readl(pcie_data_offset);
        /* read high 32 bits */
        writel(reg_addr + 4, pcie_index_offset);
        readl(pcie_index_offset);
        if (pcie_index_hi != 0) {
                writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
                readl(pcie_index_hi_offset);
        }
        r |= ((u64)readl(pcie_data_offset) << 32);

        /* clear the high bits */
        if (pcie_index_hi != 0) {
                writel(0, pcie_index_hi_offset);
                readl(pcie_index_hi_offset);
        }

        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

        return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
                                 u32 reg_addr, u32 reg_data)
{
        unsigned long flags, pcie_index, pcie_data;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
        pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        writel(reg_data, pcie_data_offset);
        readl(pcie_data_offset);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
                                     u64 reg_addr, u32 reg_data)
{
        unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_index_hi_offset;
        void __iomem *pcie_data_offset;

        pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
        pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
        if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
                pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
        else
                pcie_index_hi = 0;

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
        if (pcie_index_hi != 0)
                pcie_index_hi_offset = (void __iomem *)adev->rmmio +
                                pcie_index_hi * 4;

        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        if (pcie_index_hi != 0) {
                writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
                readl(pcie_index_hi_offset);
        }
        writel(reg_data, pcie_data_offset);
        readl(pcie_data_offset);

        /* clear the high bits */
        if (pcie_index_hi != 0) {
                writel(0, pcie_index_hi_offset);
                readl(pcie_index_hi_offset);
        }

        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
                                   u32 reg_addr, u64 reg_data)
{
        unsigned long flags, pcie_index, pcie_data;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
        pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        /* write low 32 bits */
        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
        readl(pcie_data_offset);
        /* write high 32 bits */
        writel(reg_addr + 4, pcie_index_offset);
        readl(pcie_index_offset);
        writel((u32)(reg_data >> 32), pcie_data_offset);
        readl(pcie_data_offset);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
                                       u64 reg_addr, u64 reg_data)
{
        unsigned long flags, pcie_index, pcie_data;
        unsigned long pcie_index_hi = 0;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_index_hi_offset;
        void __iomem *pcie_data_offset;

        pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
        pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
        if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
                pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
        if (pcie_index_hi != 0)
                pcie_index_hi_offset = (void __iomem *)adev->rmmio +
                                pcie_index_hi * 4;

        /* write low 32 bits */
        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        if (pcie_index_hi != 0) {
                writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
                readl(pcie_index_hi_offset);
        }
        writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
        readl(pcie_data_offset);
        /* write high 32 bits */
        writel(reg_addr + 4, pcie_index_offset);
        readl(pcie_index_offset);
        if (pcie_index_hi != 0) {
                writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
                readl(pcie_index_hi_offset);
        }
        writel((u32)(reg_data >> 32), pcie_data_offset);
        readl(pcie_data_offset);

        /* clear the high bits */
        if (pcie_index_hi != 0) {
                writel(0, pcie_index_hi_offset);
                readl(pcie_index_hi_offset);
        }

        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
        return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
        BUG();
        return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
        BUG();
        return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
        DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
                  reg, v);
        BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
        DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
                  reg, v);
        BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
        BUG();
        return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
        DRM_ERROR("Invalid callback to read 64 bit register 0x%llX\n", reg);
        BUG();
        return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
        DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
                  reg, v);
        BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
        DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
                  reg, v);
        BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
                                          uint32_t block, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
                  reg, block);
        BUG();
        return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
                                      uint32_t block,
                                      uint32_t reg, uint32_t v)
{
        DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
                  reg, block, v);
        BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
        if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
                return AMDGPU_VBIOS_SKIP;

        if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
                return AMDGPU_VBIOS_OPTIONAL;

        return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
        uint32_t flags;
        bool optional;
        int ret;

        amdgpu_asic_pre_asic_init(adev);
        flags = amdgpu_device_get_vbios_flags(adev);
        optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

        if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
            amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
            amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
            amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
                amdgpu_psp_wait_for_bootloader(adev);
                if (optional && !adev->bios)
                        return 0;

                ret = amdgpu_atomfirmware_asic_init(adev, true);
                return ret;
        } else {
                if (optional && !adev->bios)
                        return 0;

                return amdgpu_atom_asic_init(adev->mode_info.atom_context);
        }

        return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
        return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
                                       AMDGPU_GEM_DOMAIN_VRAM |
                                       AMDGPU_GEM_DOMAIN_GTT,
                                       &adev->mem_scratch.robj,
                                       &adev->mem_scratch.gpu_addr,
                                       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
        amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
                                             const u32 *registers,
                                             const u32 array_size)
{
        u32 tmp, reg, and_mask, or_mask;
        int i;

        if (array_size % 3)
                return;

        for (i = 0; i < array_size; i += 3) {
                reg = registers[i + 0];
                and_mask = registers[i + 1];
                or_mask = registers[i + 2];

                if (and_mask == 0xffffffff) {
                        tmp = or_mask;
                } else {
                        tmp = RREG32(reg);
                        tmp &= ~and_mask;
                        if (adev->family >= AMDGPU_FAMILY_AI)
                                tmp |= (or_mask & and_mask);
                        else
                                tmp |= or_mask;
                }
                WREG32(reg, tmp);
        }
}
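/*
 * Illustrative sketch with hypothetical values: each array entry is a
 * { register, AND mask, OR mask } triplet, so a golden-register table like
 *
 *   static const u32 golden_settings[] = {
 *           mmSOME_REG, 0xffffffff, 0x00000100,   // mmSOME_REG is made up
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings,
 *                                           ARRAY_SIZE(golden_settings));
 *
 * writes 0x100 straight to the register, since an all-ones AND mask takes
 * the "tmp = or_mask" shortcut above.
 */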
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
        pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
        return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
        if (adev->wb.wb_obj) {
                amdgpu_bo_free_kernel(&adev->wb.wb_obj,
                                      &adev->wb.gpu_addr,
                                      (void **)&adev->wb.wb);
                adev->wb.wb_obj = NULL;
        }
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
        int r;

        if (adev->wb.wb_obj == NULL) {
                /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
                r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
                                            PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
                                            &adev->wb.wb_obj, &adev->wb.gpu_addr,
                                            (void **)&adev->wb.wb);
                if (r) {
                        dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
                        return r;
                }

                adev->wb.num_wb = AMDGPU_MAX_WB;
                memset(&adev->wb.used, 0, sizeof(adev->wb.used));

                /* clear wb memory */
                memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
        }

        return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
        unsigned long flags, offset;

        spin_lock_irqsave(&adev->wb.lock, flags);
        offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
        if (offset < adev->wb.num_wb) {
                __set_bit(offset, adev->wb.used);
                spin_unlock_irqrestore(&adev->wb.lock, flags);
                *wb = offset << 3; /* convert to dw offset */
                return 0;
        } else {
                spin_unlock_irqrestore(&adev->wb.lock, flags);
                return -EINVAL;
        }
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
        unsigned long flags;

        wb >>= 3;
        spin_lock_irqsave(&adev->wb.lock, flags);
        if (wb < adev->wb.num_wb)
                __clear_bit(wb, adev->wb.used);
        spin_unlock_irqrestore(&adev->wb.lock, flags);
}
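/*
 * Typical usage sketch (not from this file): a caller allocates a
 * writeback slot, derives the CPU and GPU addresses from the returned
 * dword offset, and frees the slot again on teardown:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *           volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *           ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */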
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
        int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
        struct pci_bus *root;
        struct resource *res;
        unsigned int i;
        u16 cmd;
        int r;

        if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
                return 0;

        /* Bypass for VF */
        if (amdgpu_sriov_vf(adev))
                return 0;

        /* resizing on Dell G5 SE platforms causes problems with runtime pm */
        if ((amdgpu_runtime_pm != 0) &&
            adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
            adev->pdev->device == 0x731f &&
            adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
                return 0;

        /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
        if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
                DRM_WARN("System can't access extended configuration space, please check!!\n");

        /* skip if the bios has already enabled large BAR */
        if (adev->gmc.real_vram_size &&
            (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
                return 0;

        /* Check if the root BUS has 64bit memory resources */
        root = adev->pdev->bus;
        while (root->parent)
                root = root->parent;

        pci_bus_for_each_resource(root, res, i) {
                if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
                    res->start > 0x100000000ull)
                        break;
        }

        /* Trying to resize is pointless without a root hub window above 4GB */
        if (!res)
                return 0;

        /* Limit the BAR size to what is available */
        rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
                        rbar_size);

        /* Disable memory decoding while we change the BAR addresses and size */
        pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
        pci_write_config_word(adev->pdev, PCI_COMMAND,
                              cmd & ~PCI_COMMAND_MEMORY);

        /* Free the VRAM and doorbell BAR, we most likely need to move both. */
        amdgpu_doorbell_fini(adev);
        if (adev->asic_type >= CHIP_BONAIRE)
                pci_release_resource(adev->pdev, 2);

        pci_release_resource(adev->pdev, 0);

        r = pci_resize_resource(adev->pdev, 0, rbar_size);
        if (r == -ENOSPC)
                DRM_INFO("Not enough PCI address space for a large BAR.");
        else if (r && r != -ENOTSUPP)
                DRM_ERROR("Problem resizing BAR0 (%d).", r);

        pci_assign_unassigned_bus_resources(adev->pdev->bus);

        /* When the doorbell or fb BAR isn't available we have no chance of
         * using the device.
         */
        r = amdgpu_doorbell_init(adev);
        if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
                return -ENODEV;

        pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

        return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
        uint32_t reg, flags;

        if (amdgpu_sriov_vf(adev))
                return false;

        flags = amdgpu_device_get_vbios_flags(adev);
        if (flags & AMDGPU_VBIOS_SKIP)
                return false;
        if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
                return false;

        if (amdgpu_passthrough(adev)) {
                /* for FIJI: in the whole-GPU pass-through virtualization case,
                 * after VM reboot some old smc fw still needs the driver to do
                 * vPost, otherwise the gpu hangs. SMC fw versions 22.15 and
                 * above don't have this flaw, so we force vPost for smc fw
                 * versions below 22.15.
                 */
                if (adev->asic_type == CHIP_FIJI) {
                        int err;
                        uint32_t fw_ver;

                        err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
                        /* force vPost if error occurred */
                        if (err)
                                return true;

                        fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
                        release_firmware(adev->pm.fw);
                        if (fw_ver < 0x00160e00)
                                return true;
                }
        }

        /* Don't post if we need to reset whole hive on init */
        if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
                return false;

        if (adev->has_hw_reset) {
                adev->has_hw_reset = false;
                return true;
        }

        /* bios scratch used on CIK+ */
        if (adev->asic_type >= CHIP_BONAIRE)
                return amdgpu_atombios_scratch_need_asic_init(adev);

        /* check MEM_SIZE for older asics */
        reg = amdgpu_asic_get_config_memsize(adev);

        if ((reg != 0) && (reg != 0xffffffff))
                return false;

        return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
        switch (amdgpu_seamless) {
        case -1:
                break;
        case 1:
                return true;
        case 0:
                return false;
        default:
                DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
                          amdgpu_seamless);
                return false;
        }

        if (!(adev->flags & AMD_IS_APU))
                return false;

        if (adev->mman.keep_stolen_vga_memory)
                return false;

        return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
        struct cpuinfo_x86 *c = &cpu_data(0);

        /* eGPUs change speeds based on USB4 fabric conditions */
        if (dev_is_removable(adev->dev))
                return true;

        if (c->x86_vendor == X86_VENDOR_INTEL)
                return false;
#endif
        return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
        switch (amdgpu_aspm) {
        case -1:
                break;
        case 0:
                return false;
        case 1:
                return true;
        default:
                return false;
        }
        if (adev->flags & AMD_IS_APU)
                return false;
        if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
                return false;
        return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
                bool state)
{
        struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

        amdgpu_asic_set_vga_state(adev, state);
        if (state)
                return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
                       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
        else
                return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory: a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits go to the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
        /* defines number of bits in page table versus page directory,
         * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
         * page table and the remaining bits are in the page directory
         */
        if (amdgpu_vm_block_size == -1)
                return;

        if (amdgpu_vm_block_size < 9) {
                dev_warn(adev->dev, "VM page table size (%d) too small\n",
                         amdgpu_vm_block_size);
                amdgpu_vm_block_size = -1;
        }
}
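/*
 * Worked example of the layout described above: with 4KB pages there are
 * 12 offset bits, so amdgpu_vm_block_size = 9 gives 2^9 = 512 PTEs per
 * page table, i.e. each page table maps 512 * 4KB = 2MB; all remaining
 * virtual address bits select page directory entries.
 */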
/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("SMU memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}
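
/*
 * Worked example (editor's note): amdgpu_smu_memory_pool_size is in units
 * of 256MB (the value is shifted left by 28 bits to get bytes), so:
 *
 *	amdgpu.smu_memory_pool_size=1  ->  256MB pool, needs ~3GB system RAM
 *	amdgpu.smu_memory_pool_size=2  ->  512MB pool, needs ~3GB system RAM
 *	amdgpu.smu_memory_pool_size=4  ->    1GB pool, needs ~7GB system RAM
 *	amdgpu.smu_memory_pool_size=8  ->    2GB pool, needs ~7GB system RAM
 *
 * Any other non-zero value disables the feature with a warning.
 */
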
static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	int i;

	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	for (i = 0; i < MAX_XCP; i++)
		adev->enforce_isolation[i] = !!enforce_isolation;

	return 0;
}
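
/*
 * Example (editor's note): the checks above clamp rather than fail, e.g.
 * booting with amdgpu.sched_jobs=6 warns and rounds up to 8 (the next
 * power of two), and amdgpu.vm_fragment_size=12 warns and reverts to the
 * default of -1. Invalid values never abort device initialization.
 */
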
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Checks if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};
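
/*
 * Editor's sketch, not driver code: these ops only take effect once they
 * are registered with the vga_switcheroo subsystem, along the lines of:
 *
 *	vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);
 *
 * where px indicates whether the driver handles its own power control.
 * The actual registration happens later during device init; this only
 * sketches the shape of the API.
 */
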
/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			&adev->ip_blocks[i], state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			&adev->ip_blocks[i], state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
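
/*
 * Example (editor's note): callers gate all instances of one IP type in a
 * single call, e.g. gating clocks on every GFX instance:
 *
 *	r = amdgpu_device_ip_set_clockgating_state(adev,
 *						   AMD_IP_BLOCK_TYPE_GFX,
 *						   AMD_CG_STATE_GATE);
 *
 * Note that only the error of the last matching instance is returned;
 * earlier failures are reported through DRM_ERROR only.
 */
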
/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state(
				&adev->ip_blocks[i], flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
				r = adev->ip_blocks[i].version->funcs->wait_for_idle(
					&adev->ip_blocks[i]);
				if (r)
					return r;
			}
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_valid - is the hardware IP enabled
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Checks whether the hardware IP is enabled.
 * Returns true if the IP is enabled, false if not.
 */
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
			       enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].status.valid;
	}
	return false;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Return 0 if the requested IP block's version is equal or greater,
 * 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			 ((ip_block->version->major == major) &&
			  (ip_block->version->minor >= minor))))
		return 0;

	return 1;
}
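
/*
 * Example (editor's note): together these helpers let code branch on the
 * presence and version of an IP block:
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_VCE,
 *					       3, 0) == 0)
 *		vce3_or_newer_is_present();
 *
 * i.e. the comparison returns 0 for "equal or newer" and 1 when the block
 * is older or absent (vce3_or_newer_is_present() above is a placeholder,
 * not a real function).
 */
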
/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	dev_info(adev->dev, "detected ip block number %d <%s>\n",
		 adev->num_ip_blocks, ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks].adev = adev;

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}
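
/*
 * Example (editor's note): the string parsed above is a semicolon-separated
 * list of "pci_address,num_crtc" entries, e.g.
 *
 *	amdgpu.virtual_display=0000:01:00.0,2
 *
 * enables two virtual CRTCs on that device only, while
 * amdgpu.virtual_display=all enables the feature (with one CRTC by
 * default) on every device; num_crtc is clamped to the range 1..6.
 */
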
/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin)
		return 0;

	switch (adev->asic_type) {
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	}

	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
				   AMDGPU_UCODE_OPTIONAL,
				   "amdgpu/%s_gpu_info.bin", chip_name);
	if (err) {
		dev_err(adev->dev,
			"Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
			chip_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * The SOC bounding box info is not integrated in the discovery
		 * table, so we always need to parse it from the gpu info
		 * firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
	struct amdgpu_ip_block *ip_block;
	struct pci_dev *parent;
	bool total, skip_bios;
	uint32_t bios_flags;
	int i, r;

	amdgpu_device_enable_virtual_display(adev);

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
		adev->family = AMDGPU_FAMILY_SI;
		r = si_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_KV;
		else
			adev->family = AMDGPU_FAMILY_CI;

		r = cik_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		if (adev->flags & AMD_IS_APU)
			adev->family = AMDGPU_FAMILY_CZ;
		else
			adev->family = AMDGPU_FAMILY_VI;

		r = vi_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	default:
		r = amdgpu_discovery_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	}

	if (amdgpu_has_atpx() &&
	    (amdgpu_is_atpx_hybrid() ||
	     amdgpu_has_atpx_dgpu_power_cntl()) &&
	    ((adev->flags & AMD_IS_APU) == 0) &&
	    !dev_is_removable(&adev->pdev->dev))
		adev->flags |= AMD_IS_PX;

	if (!(adev->flags & AMD_IS_APU)) {
		parent = pcie_find_root_port(adev->pdev);
		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
	}

	adev->pm.pp_feature = amdgpu_pp_feature_mask;
	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
	if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
		adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;

	total = true;
	for (i = 0; i < adev->num_ip_blocks; i++) {
		ip_block = &adev->ip_blocks[i];

		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
			DRM_WARN("disabled ip block: %d <%s>\n",
				 i, adev->ip_blocks[i].version->funcs->name);
			adev->ip_blocks[i].status.valid = false;
		} else if (ip_block->version->funcs->early_init) {
			r = ip_block->version->funcs->early_init(ip_block);
			if (r == -ENOENT) {
				adev->ip_blocks[i].status.valid = false;
			} else if (r) {
				DRM_ERROR("early_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				total = false;
			} else {
				adev->ip_blocks[i].status.valid = true;
			}
		} else {
			adev->ip_blocks[i].status.valid = true;
		}
		/* get the vbios after the asic_funcs are set up */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			r = amdgpu_device_parse_gpu_info_fw(adev);
			if (r)
				return r;

			bios_flags = amdgpu_device_get_vbios_flags(adev);
			skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP);
			/* Read BIOS */
			if (!skip_bios) {
				bool optional =
					!!(bios_flags & AMDGPU_VBIOS_OPTIONAL);
				if (!amdgpu_get_bios(adev) && !optional)
					return -EINVAL;

				if (optional && !adev->bios)
					dev_info(
						adev->dev,
						"VBIOS image optional, proceeding without VBIOS image\n");

				if (adev->bios) {
					r = amdgpu_atombios_init(adev);
					if (r) {
						dev_err(adev->dev,
							"amdgpu_atombios_init failed\n");
						amdgpu_vf_error_put(
							adev,
							AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL,
							0, 0);
						return r;
					}
				}
			}

			/* get pf2vf msg info at its earliest time */
			if (amdgpu_sriov_vf(adev))
				amdgpu_virt_init_data_exchange(adev);

		}
	}
	if (!total)
		return -ENODEV;

	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
	if (ip_block->status.valid)
		amdgpu_amdkfd_device_probe(adev);

	adev->cg_flags &= amdgpu_cg_mask;
	adev->pg_flags &= amdgpu_pg_mask;

	return 0;
}

static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}
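
/*
 * Editor's note on ordering: phase 1 above only brings up the blocks
 * everything else depends on (COMMON and IH, plus PSP when running as an
 * SR-IOV VF). The remaining blocks are brought up by
 * amdgpu_device_ip_hw_init_phase2() below, with firmware loading in
 * between, as driven by amdgpu_device_ip_init():
 *
 *	r = amdgpu_device_ip_hw_init_phase1(adev);
 *	r = amdgpu_device_fw_loading(adev);
 *	r = amdgpu_device_ip_hw_init_phase2(adev);
 */
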
static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;
		r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
		if (r) {
			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
	int r = 0;
	int i;
	uint32_t smu_version;

	if (adev->asic_type >= CHIP_VEGA10) {
		for (i = 0; i < adev->num_ip_blocks; i++) {
			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
				continue;

			if (!amdgpu_ip_member_of_hwini(adev,
						       AMD_IP_BLOCK_TYPE_PSP))
				break;

			if (!adev->ip_blocks[i].status.sw)
				continue;

			/* no need to do the fw loading again if already done */
			if (adev->ip_blocks[i].status.hw)
				break;

			if (amdgpu_in_reset(adev) || adev->in_suspend) {
				r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
				if (r)
					return r;
			} else {
				r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
				if (r) {
					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				}
				adev->ip_blocks[i].status.hw = true;
			}
			break;
		}
	}

	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);

	return r;
}

static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
{
	long timeout;
	int r, i;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		/* No need to setup the GPU scheduler for rings that don't need it */
		if (!ring || ring->no_scheduler)
			continue;

		switch (ring->funcs->type) {
		case AMDGPU_RING_TYPE_GFX:
			timeout = adev->gfx_timeout;
			break;
		case AMDGPU_RING_TYPE_COMPUTE:
			timeout = adev->compute_timeout;
			break;
		case AMDGPU_RING_TYPE_SDMA:
			timeout = adev->sdma_timeout;
			break;
		default:
			timeout = adev->video_timeout;
			break;
		}

		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
				   DRM_SCHED_PRIORITY_COUNT,
				   ring->num_hw_submission, 0,
				   timeout, adev->reset_domain->wq,
				   ring->sched_score, ring->name,
				   adev->dev);
		if (r) {
			DRM_ERROR("Failed to create scheduler on ring %s.\n",
				  ring->name);
			return r;
		}
		r = amdgpu_uvd_entity_init(adev, ring);
		if (r) {
			DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
				  ring->name);
			return r;
		}
		r = amdgpu_vce_entity_init(adev, ring);
		if (r) {
			DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
				  ring->name);
			return r;
		}
	}

	amdgpu_xcp_update_partition_sched_list(adev);

	return 0;
}

/**
 * amdgpu_device_ip_init - run init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
 * are run. sw_init initializes the software state associated with each IP
 * and hw_init initializes the hardware associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	bool init_badpage;
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->sw_init) {
			r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
			if (r) {
				DRM_ERROR("sw_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				goto init_failed;
			}
		}
		adev->ip_blocks[i].status.sw = true;

		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;

		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* need to do common hw init early so everything is set up for gmc */
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;
		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			/* need to do gmc hw init early so we can allocate gpu mem */
			/* Try to reserve bad pages early */
			if (amdgpu_sriov_vf(adev))
				amdgpu_virt_exchange_data(adev);

			r = amdgpu_device_mem_scratch_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (adev->gfx.mcbp) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM |
							       AMDGPU_GEM_DOMAIN_GTT,
							       AMDGPU_CSA_SIZE);
				if (r) {
					DRM_ERROR("allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}

			r = amdgpu_seq64_init(adev);
			if (r) {
				DRM_ERROR("allocate seq64 failed %d\n", r);
				goto init_failed;
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * Retired pages will be loaded from eeprom and reserved here. This
	 * should be called after amdgpu_device_ip_hw_init_phase2 since for
	 * some ASICs the RAS EEPROM code relies on the SMU being fully
	 * functional for I2C communication, which is only true at this point.
	 *
	 * amdgpu_ras_recovery_init may fail, but the caller only cares about
	 * failures caused by a bad GPU state, and stops the amdgpu init
	 * process accordingly. For other failures it still releases all the
	 * resources and prints an error message, rather than returning a
	 * negative value to the upper level.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired pages from being abused.
	 */
	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
	r = amdgpu_ras_recovery_init(adev, init_badpage);
	if (r)
		goto init_failed;

	/*
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if whole hive need to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

	r = amdgpu_cper_init(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run: the late init pass enables
 * clockgating for hardware IPs, while the fini or suspend pass disables it.
 * Returns 0 on success, negative error code on failure.
 */

int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
			       enum amd_clockgating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
										     state);
			if (r) {
				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}

	return 0;
}
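
/*
 * Worked example (editor's note): the index arithmetic above applies
 * gating in forward order and ungating in reverse. With num_ip_blocks == 3:
 *
 *	AMD_CG_STATE_GATE:   j = 0,1,2  ->  i = 0,1,2
 *	AMD_CG_STATE_UNGATE: j = 0,1,2  ->  i = 2,1,0
 *
 * so clockgating is torn down in the opposite order to which it was applied.
 */
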
int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
			       enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
										     state);
			if (r) {
				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	if (!amdgpu_reset_in_recovery(adev))
		amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, we don't know in advance how many devices the hive
		 * contains, as they are counted one by one during device
		 * initialization.
		 *
		 * So, we wait for all XGMI interlinked devices to be initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
		 */
		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
			for (i = 0; i < mgpu_info.num_gpu; i++) {
				gpu_instance = &(mgpu_info.gpu_ins[i]);
				if (gpu_instance->adev->flags & AMD_IS_APU)
					continue;

				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
							   AMDGPU_XGMI_PSTATE_MIN);
				if (r) {
					DRM_ERROR("pstate setting failed (%d).\n", r);
					break;
				}
			}
		}

		mutex_unlock(&mgpu_info.mutex);
	}

	return 0;
}

static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (!ip_block->version->funcs->hw_fini) {
		DRM_ERROR("hw_fini of IP block <%s> not defined\n",
			  ip_block->version->funcs->name);
	} else {
		r = ip_block->version->funcs->hw_fini(ip_block);
		/* XXX handle errors */
		if (r) {
			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
				  ip_block->version->funcs->name, r);
		}
	}

	ip_block->status.hw = false;
}

/**
 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
 *
 * @adev: amdgpu_device pointer
 *
 * For ASICs that need the SMC to be disabled first
 */
static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
{
	int i;

	if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
		return;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
			amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
			break;
		}
	}
}

static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].version->funcs->early_fini)
			continue;

		r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
		if (r) {
			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}
	}

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	amdgpu_amdkfd_suspend(adev, false);

	/* Workaround for ASICs that need the SMC disabled first */
	amdgpu_device_smu_fini_early(adev);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.hw)
			continue;

		amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
	}

	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_virt_release_full_gpu(adev, false))
			DRM_ERROR("failed to release exclusive mode on fini\n");
	}

	return 0;
}

/**
 * amdgpu_device_ip_fini - run fini for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main teardown pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
 * are run. hw_fini tears down the hardware associated with each IP
 * and sw_fini tears down any software state associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_cper_fini(adev);

	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
		amdgpu_virt_release_ras_err_handler_data(adev);

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_remove_device(adev);

	amdgpu_amdkfd_device_fini_sw(adev);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.sw)
			continue;

		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			amdgpu_ucode_free_bo(adev);
			amdgpu_free_static_csa(&adev->virt.csa_obj);
			amdgpu_device_wb_fini(adev);
			amdgpu_device_mem_scratch_fini(adev);
			amdgpu_ib_pool_fini(adev);
			amdgpu_seq64_fini(adev);
		}
		if (adev->ip_blocks[i].version->funcs->sw_fini) {
			r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
			/* XXX handle errors */
			if (r) {
				DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
			}
		}
		adev->ip_blocks[i].status.sw = false;
		adev->ip_blocks[i].status.valid = false;
	}

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_fini)
			adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
		adev->ip_blocks[i].status.late_initialized = false;
	}

	amdgpu_ras_fini(adev);

	return 0;
}

/**
 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
 *
 * @work: work_struct.
 */
static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, delayed_init_work.work);
	int r;

	r = amdgpu_ib_ring_tests(adev);
	if (r)
		DRM_ERROR("ib ring test failed (%d).\n", r);
}

static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);

	WARN_ON_ONCE(adev->gfx.gfx_off_state);
	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);

	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
		adev->gfx.gfx_off_state = true;
}
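
/*
 * Editor's note: this delayed work is the back half of the GFXOFF
 * refcounting; when the last blocker drops gfx_off_req_count to zero
 * (handled in amdgpu_gfx_off_ctrl(), outside this excerpt), the work is
 * scheduled so GFXOFF is re-entered only after a grace period rather than
 * immediately. The WARN_ON_ONCEs above assert both conditions still hold
 * by the time the work actually runs.
 */
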
/**
 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	/*
	 * Per the PMFW team's suggestion, the driver needs to handle disabling
	 * the gfxoff and DF cstate features for GPU reset (e.g. Mode1 reset)
	 * scenarios. Add the missing DF cstate disablement here.
	 */
	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
		dev_warn(adev->dev, "Failed to disallow df cstate");

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;

		/* displays are handled separately */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
			continue;

		/* XXX handle errors */
		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
{
	int i, r;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* displays are handled in phase1 */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
			continue;
		/* PSP lost connection when err_event_athub occurs */
		if (amdgpu_ras_intr_triggered() &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
			adev->ip_blocks[i].status.hw = false;
			continue;
		}

		/* skip unnecessary suspend if we have not initialized them yet */
		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;

		/* skip suspend of gfx/mes and psp for S0ix
		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
		 * like at runtime. PSP is also part of the always on hardware
		 * so no need to suspend it.
		 */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
			continue;

		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
		if (adev->in_s0ix &&
		    (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
		     IP_VERSION(5, 0, 0)) &&
		    (adev->ip_blocks[i].version->type ==
		     AMD_IP_BLOCK_TYPE_SDMA))
			continue;

		/* During cold boot, swPSP provides the IMU and RLC FW binaries
		 * to the TOS. These live in the TMR and are expected to be
		 * reused by PSP-TOS, which reloads them from that location;
		 * RLC autoload is likewise triggered from there by the
		 * PMFW -> PSP message during the re-init sequence. Therefore,
		 * PSP suspend & resume should be skipped on IMU-enabled APU
		 * ASICs to avoid destroying the TMR and reloading the
		 * firmware again.
		 */
		if (amdgpu_in_reset(adev) &&
		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;

		/* XXX handle errors */
		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		adev->ip_blocks[i].status.hw = false;

		/* handle putting the SMC in the appropriate state */
		if (!amdgpu_sriov_vf(adev)) {
			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
				if (r) {
					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
						  adev->mp1_state, r);
					return r;
				}
			}
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_suspend - run suspend for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
	int r;

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_fini_data_exchange(adev);
		amdgpu_virt_request_full_gpu(adev, false);
	}

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (r)
		return r;
	r = amdgpu_device_ip_suspend_phase2(adev);

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return r;
}

static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
{
	int i, r;

	static enum amd_ip_block_type ip_order[] = {
		AMD_IP_BLOCK_TYPE_COMMON,
		AMD_IP_BLOCK_TYPE_GMC,
		AMD_IP_BLOCK_TYPE_PSP,
		AMD_IP_BLOCK_TYPE_IH,
	};

	for (i = 0; i < adev->num_ip_blocks; i++) {
		int j;
		struct amdgpu_ip_block *block;

		block = &adev->ip_blocks[i];
		block->status.hw = false;

		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {

			if (block->version->type != ip_order[j] ||
			    !block->status.valid)
				continue;

			r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev, "RE-INIT-early: %s failed\n",
					block->version->funcs->name);
				return r;
			}
			block->status.hw = true;
		}
	}

	return 0;
}

static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
{
	struct amdgpu_ip_block *block;
	int i, r = 0;

	static enum amd_ip_block_type ip_order[] = {
		AMD_IP_BLOCK_TYPE_SMC,
		AMD_IP_BLOCK_TYPE_DCE,
		AMD_IP_BLOCK_TYPE_GFX,
		AMD_IP_BLOCK_TYPE_SDMA,
		AMD_IP_BLOCK_TYPE_MES,
		AMD_IP_BLOCK_TYPE_UVD,
		AMD_IP_BLOCK_TYPE_VCE,
		AMD_IP_BLOCK_TYPE_VCN,
		AMD_IP_BLOCK_TYPE_JPEG
	};

	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
		block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);

		if (!block)
			continue;

		if (block->status.valid && !block->status.hw) {
			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
				r = amdgpu_ip_block_resume(block);
			} else {
				r = block->version->funcs->hw_init(block);
			}

			if (r) {
				dev_err(adev->dev, "RE-INIT-late: %s failed\n",
					block->version->funcs->name);
				break;
			}
			block->status.hw = true;
		}
	}

	return r;
}

/**
 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * COMMON, GMC, and IH. resume puts the hardware into a functional state
 * after a suspend and updates the software state as necessary. This
 * function is also used for restoring the GPU after a GPU reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {

			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Second resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
 * functional state after a suspend and updates the software state as
 * necessary. This function is also used for restoring the GPU after a GPU
 * reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Third resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all DCE. resume puts the hardware into a functional state after a suspend
 * and updates the software state as necessary. This function is also used
 * for restoring the GPU after a GPU reset.
 *
 * Returns 0 on success, negative error code on failure.
 */
 */
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into multiple resume functions because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	if (r)
		return r;

	amdgpu_fence_driver_hw_init(adev);

	r = amdgpu_device_ip_resume_phase3(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fall back to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
#if defined(CONFIG_DRM_AMD_DC_SI)
		return amdgpu_dc > 0;
#else
		return false;
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * VGA support which is not supported with DC.
		 *
		 * Fall back to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
		return amdgpu_dc > 0;
	default:
		return amdgpu_dc != 0;
#else
	default:
		if (amdgpu_dc > 0)
			DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
		return false;
#endif
	}
}

/**
 * amdgpu_device_has_dc_support - check if dc is supported
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true for supported, false for not supported
 */
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
{
	if (adev->enable_virtual_display ||
	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
		return false;

	return amdgpu_device_asic_has_dc_support(adev->asic_type);
}

static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
	struct amdgpu_device *adev =
		container_of(__work, struct amdgpu_device, xgmi_reset_work);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

	/* It's a bug to not have a hive within this function */
	if (WARN_ON(!hive))
		return;

	/*
	 * Use task barrier to synchronize all xgmi reset works across the
	 * hive. task_barrier_enter and task_barrier_exit will block
	 * until all the threads running the xgmi reset works reach
	 * those points. task_barrier_full will do both blocks.
	 */
	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

		task_barrier_enter(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));

		if (adev->asic_reset_res)
			goto fail;

		task_barrier_exit(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));

		if (adev->asic_reset_res)
			goto fail;

		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
	} else {

		task_barrier_full(&hive->tb);
		adev->asic_reset_res = amdgpu_asic_reset(adev);
	}

fail:
	if (adev->asic_reset_res)
		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
			 adev->asic_reset_res, adev_to_drm(adev)->unique);
	amdgpu_put_xgmi_hive(hive);
}

static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
{
	char *input = amdgpu_lockup_timeout;
	char *timeout_setting = NULL;
	int index = 0;
	long timeout;
	int ret = 0;

	/*
	 * By default the timeout for non-compute jobs is 10000
	 * and 60000 for compute jobs.
	 * In SR-IOV mode the timeout for compute jobs is 60000 in
	 * one-VF mode and 10000 otherwise; in passthrough mode it
	 * is 60000 by default.
	 */
	adev->gfx_timeout = msecs_to_jiffies(10000);
	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
	if (amdgpu_sriov_vf(adev))
		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
	else
		adev->compute_timeout = msecs_to_jiffies(60000);

	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
		while ((timeout_setting = strsep(&input, ",")) &&
		       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
			ret = kstrtol(timeout_setting, 0, &timeout);
			if (ret)
				return ret;

			if (timeout == 0) {
				index++;
				continue;
			} else if (timeout < 0) {
				timeout = MAX_SCHEDULE_TIMEOUT;
				dev_warn(adev->dev, "lockup timeout disabled");
				add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
			} else {
				timeout = msecs_to_jiffies(timeout);
			}

			switch (index++) {
			case 0:
				adev->gfx_timeout = timeout;
				break;
			case 1:
				adev->compute_timeout = timeout;
				break;
			case 2:
				adev->sdma_timeout = timeout;
				break;
			case 3:
				adev->video_timeout = timeout;
				break;
			default:
				break;
			}
		}
		/*
		 * There is only one value specified and
		 * it should apply to all non-compute jobs.
		 */
		if (index == 1) {
			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
				adev->compute_timeout = adev->gfx_timeout;
		}
	}

	return ret;
}

/**
 * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in
 * passthrough mode.
 */
static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
		adev->ram_is_direct_mapped = true;
}

#if defined(CONFIG_HSA_AMD_P2P)
/**
 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true if the IOMMU is remapping DMA (BAR) addresses, false otherwise.
 */
static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
		       domain->type == IOMMU_DOMAIN_DMA_FQ))
		return true;

	return false;
}
#endif

static const struct attribute *amdgpu_dev_attributes[] = {
	&dev_attr_pcie_replay_count.attr,
	NULL
};

static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
{
	if (amdgpu_mcbp == 1)
		adev->gfx.mcbp = true;
	else if (amdgpu_mcbp == 0)
		adev->gfx.mcbp = false;

	if (amdgpu_sriov_vf(adev))
		adev->gfx.mcbp = true;

	if (adev->gfx.mcbp)
		DRM_INFO("MCBP is enabled\n");
}

/**
 * amdgpu_device_init - initialize the driver
 *
 * @adev: amdgpu_device pointer
 * @flags: driver flags
 *
 * Initializes the driver info and hw (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver startup.
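 *
 * Note: on SR-IOV, a VF exclusive mode timeout turns the failure into
 * -EAGAIN, leaving it to the caller to retry initialization.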
 */
int amdgpu_device_init(struct amdgpu_device *adev,
		       uint32_t flags)
{
	struct drm_device *ddev = adev_to_drm(adev);
	struct pci_dev *pdev = adev->pdev;
	int r, i;
	bool px = false;
	u32 max_MBps;
	int tmp;

	adev->shutdown = false;
	adev->flags = flags;

	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
		adev->asic_type = amdgpu_force_asic_type;
	else
		adev->asic_type = flags & AMD_ASIC_MASK;

	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
	if (amdgpu_emu_mode == 1)
		adev->usec_timeout *= 10;
	adev->gmc.gart_size = 512 * 1024 * 1024;
	adev->accel_working = false;
	adev->num_rings = 0;
	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
	adev->mman.buffer_funcs = NULL;
	adev->mman.buffer_funcs_ring = NULL;
	adev->vm_manager.vm_pte_funcs = NULL;
	adev->vm_manager.vm_pte_num_scheds = 0;
	adev->gmc.gmc_funcs = NULL;
	adev->harvest_ip_mask = 0x0;
	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

	adev->smc_rreg = &amdgpu_invalid_rreg;
	adev->smc_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg = &amdgpu_invalid_rreg;
	adev->pcie_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
	adev->pciep_rreg = &amdgpu_invalid_rreg;
	adev->pciep_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
	adev->didt_rreg = &amdgpu_invalid_rreg;
	adev->didt_wreg = &amdgpu_invalid_wreg;
	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;

	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);

	/* mutex initialization is all done here so we
	 * can recall these functions without having locking issues
	 */
	mutex_init(&adev->firmware.mutex);
	mutex_init(&adev->pm.mutex);
	mutex_init(&adev->gfx.gpu_clock_mutex);
	mutex_init(&adev->srbm_mutex);
	mutex_init(&adev->gfx.pipe_reserve_mutex);
	mutex_init(&adev->gfx.gfx_off_mutex);
	mutex_init(&adev->gfx.partition_mutex);
	mutex_init(&adev->grbm_idx_mutex);
	mutex_init(&adev->mn_lock);
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	mutex_init(&adev->psp.mutex);
	mutex_init(&adev->notifier_lock);
	mutex_init(&adev->pm.stable_pstate_ctx_lock);
	mutex_init(&adev->benchmark_mutex);
	mutex_init(&adev->gfx.reset_sem_mutex);
	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
	mutex_init(&adev->enforce_isolation_mutex);
	mutex_init(&adev->gfx.kfd_sch_mutex);

	amdgpu_device_init_apu_flags(adev);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->virt.rlcg_reg_lock);
	spin_lock_init(&adev->wb.lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
	ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Register mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before the XGMI hive is
	 * discovered (if any) and initialized, since it provides the reset
	 * semaphore and the in-GPU-reset flag used early on during init and
	 * before calling RREG32.
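	 * For XGMI-capable boards, this single-device domain is expected to
	 * be replaced by the hive's common reset domain once the hive has
	 * been discovered.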
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_detect_virtualization(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use the default init level where all blocks are expected
	 * to be initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/* Get rid of things like offb */
	r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
	if (r)
		return r;

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except the mailbox range) from the CPU
		 * will be blocked during SR-IOV runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIe atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs with gfx9 onwards don't rely on PCIe atomics; rather, an
	 * internal path natively supports atomics, so set
	 * have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
						       PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
						       PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIe atomic ops are not supported\n");

	/* doorbell bar mapping and doorbell index init */
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we have an SR-IOV vBIOS */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic,
	 * e.g. the driver was not cleanly unloaded previously, etc.
	 */
	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
		if (adev->gmc.xgmi.num_physical_nodes) {
			dev_info(adev->dev, "Pending hive reset.\n");
			amdgpu_set_init_level(adev,
					      AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
		} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
			   !amdgpu_device_has_display_hardware(adev)) {
			r = psp_gpu_reset(adev);
		} else {
			tmp = amdgpu_reset_method;
			/* It should do a default reset when loading or reloading the driver,
			 * regardless of the module parameter reset_method.
			 */
			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
			r = amdgpu_asic_reset(adev);
			amdgpu_reset_method = tmp;
		}

		if (r) {
			dev_err(adev->dev, "asic reset on init failed\n");
			goto failed;
		}
	}

	/* Post card if necessary */
	if (amdgpu_device_need_post(adev)) {
		if (!adev->bios) {
			dev_err(adev->dev, "no vBIOS found\n");
			r = -EINVAL;
			goto failed;
		}
		DRM_INFO("GPU posting now...\n");
		r = amdgpu_device_asic_init(adev);
		if (r) {
			dev_err(adev->dev, "gpu post error!\n");
			goto failed;
		}
	}

	if (adev->bios) {
		if (adev->is_atom_fw) {
			/* Initialize clocks */
			r = amdgpu_atomfirmware_get_clock_info(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
				goto failed;
			}
		} else {
			/* Initialize clocks */
			r = amdgpu_atombios_get_clock_info(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
				goto failed;
			}
			/* init i2c buses */
			amdgpu_i2c_init(adev);
		}
	}

fence_driver_init:
	/* Fence driver */
	r = amdgpu_fence_driver_sw_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
		goto failed;
	}

	/* init the mode config */
	drm_mode_config_init(adev_to_drm(adev));

	r = amdgpu_device_ip_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
		goto release_ras_con;
	}

	amdgpu_fence_driver_hw_init(adev);

	dev_info(adev->dev,
		 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
		 adev->gfx.config.max_shader_engines,
		 adev->gfx.config.max_sh_per_se,
		 adev->gfx.config.max_cu_per_sh,
		 adev->gfx.cu_info.number);

	adev->accel_working = true;

	amdgpu_vm_check_compute_bug(adev);

	/* Initialize the buffer migration limit. */
	if (amdgpu_moverate >= 0)
		max_MBps = amdgpu_moverate;
	else
		max_MBps = 8; /* Allow 8 MB/s. */
	/* Get a log2 for easy divisions. */
	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

	/*
	 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped because the
	 * gpu instance count would be too low.
	 */
	amdgpu_register_gpu_instance(adev);

	/* enable clockgating, etc. after ib tests, etc. since some blocks require
	 * explicit gating rather than handling it automatically.
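	 * The delayed init work queued below is also what runs those deferred
	 * IB tests; resume and recovery paths flush it before relying on the
	 * rings.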
	 */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		r = amdgpu_device_ip_late_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
			goto release_ras_con;
		}
		/* must succeed. */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	/*
	 * Place the sysfs registration after late_init, as some of the
	 * operations performed in late_init might affect the creation
	 * of the sysfs interfaces.
	 */
	r = amdgpu_atombios_sysfs_init(adev);
	if (r)
		drm_err(&adev->ddev,
			"registering atombios sysfs failed (%d).\n", r);

	r = amdgpu_pm_sysfs_init(adev);
	if (r)
		DRM_ERROR("registering pm sysfs failed (%d).\n", r);

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
	if (r)
		dev_err(adev->dev,
			"Could not create amdgpu board attributes\n");

	amdgpu_fru_sysfs_init(adev);
	amdgpu_reg_state_sysfs_init(adev);
	amdgpu_xcp_cfg_sysfs_init(adev);

	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
		r = amdgpu_pmu_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}

	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(ddev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		amdgpu_xgmi_reset_on_init(adev);

	amdgpu_device_check_iommu_direct_map(adev);

	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
	r = register_pm_notifier(&adev->pm_nb);
	if (r)
		goto failed;

	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if (amdgpu_sriov_vf(adev) &&
	    !amdgpu_sriov_runtime(adev) &&
	    amdgpu_virt_mmio_blocked(adev) &&
	    !amdgpu_virt_wait_reset(adev)) {
		dev_err(adev->dev, "VF exclusive mode timeout\n");
		/* Don't send a request since the VF is inactive.
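		 * Clearing virt.caps and virt.ops below also ensures that no
		 * further requests to the host will be attempted.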
		 */
		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
		adev->virt.ops = NULL;
		r = -EAGAIN;
	}
	amdgpu_release_ras_context(adev);

failed:
	amdgpu_vf_error_trans_all(adev);

	return r;
}

static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
{
	/* Clear all CPU mappings pointing to this device */
	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);

	/* Unmap all mapped bars - Doorbell, registers and VRAM */
	amdgpu_doorbell_fini(adev);

	iounmap(adev->rmmio);
	adev->rmmio = NULL;
	if (adev->mman.aper_base_kaddr)
		iounmap(adev->mman.aper_base_kaddr);
	adev->mman.aper_base_kaddr = NULL;

	/* Memory manager related */
	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
		arch_phys_wc_del(adev->gmc.vram_mtrr);
		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
	}
}

/**
 * amdgpu_device_fini_hw - tear down the driver
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the driver info (all asics).
 * Called at driver shutdown.
 */
void amdgpu_device_fini_hw(struct amdgpu_device *adev)
{
	dev_info(adev->dev, "amdgpu: finishing device.\n");
	flush_delayed_work(&adev->delayed_init_work);

	if (adev->mman.initialized)
		drain_workqueue(adev->mman.bdev.wq);
	adev->shutdown = true;

	unregister_pm_notifier(&adev->pm_nb);

	/* make sure the IB tests have finished before entering exclusive mode,
	 * to avoid preemption of the IB tests
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_request_full_gpu(adev, false);
		amdgpu_virt_fini_data_exchange(adev);
	}

	/* disable all interrupts */
	amdgpu_irq_disable_all(adev);
	if (adev->mode_info.mode_config_initialized) {
		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
			drm_helper_force_disable_all(adev_to_drm(adev));
		else
			drm_atomic_helper_shutdown(adev_to_drm(adev));
	}
	amdgpu_fence_driver_hw_fini(adev);

	if (adev->pm.sysfs_initialized)
		amdgpu_pm_sysfs_fini(adev);
	if (adev->ucode_sysfs_en)
		amdgpu_ucode_sysfs_fini(adev);
	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
	amdgpu_fru_sysfs_fini(adev);

	amdgpu_reg_state_sysfs_fini(adev);
	amdgpu_xcp_cfg_sysfs_fini(adev);

	/* disabling the ras feature must be done before hw fini */
	amdgpu_ras_pre_fini(adev);

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	amdgpu_device_ip_fini_early(adev);

	amdgpu_irq_fini_hw(adev);

	if (adev->mman.initialized)
		ttm_device_clear_dma_mappings(&adev->mman.bdev);

	amdgpu_gart_dummy_page_fini(adev);

	if (drm_dev_is_unplugged(adev_to_drm(adev)))
		amdgpu_device_unmap_mmio(adev);
}

void amdgpu_device_fini_sw(struct amdgpu_device *adev)
{
	int idx;
	bool px;

	amdgpu_device_ip_fini(adev);
	amdgpu_fence_driver_sw_fini(adev);
	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
	adev->accel_working = false;
	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));

	amdgpu_reset_fini(adev);

	/* free i2c buses */
	amdgpu_i2c_fini(adev);

	if (adev->bios) {
		if (amdgpu_emu_mode != 1)
			amdgpu_atombios_fini(adev);
		amdgpu_bios_release(adev);
	}

	kfree(adev->fru_info);
	adev->fru_info = NULL;

	px = amdgpu_device_supports_px(adev_to_drm(adev));

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_unregister_client(adev->pdev);

	if (px)
		vga_switcheroo_fini_domain_pm_ops(adev->dev);

	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_unregister(adev->pdev);

	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
		iounmap(adev->rmmio);
		adev->rmmio = NULL;
		amdgpu_doorbell_fini(adev);
		drm_dev_exit(idx);
	}

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		amdgpu_pmu_fini(adev);
	if (adev->mman.discovery_bin)
		amdgpu_discovery_fini(adev);

	amdgpu_reset_put_reset_domain(adev->reset_domain);
	adev->reset_domain = NULL;

	kfree(adev->pci_state);
}

/**
 * amdgpu_device_evict_resources - evict device resources
 * @adev: amdgpu device object
 *
 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
 * of the vram memory type. Mainly used for evicting device resources
 * at suspend time.
 *
 */
static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
{
	int ret;

	/* No need to evict vram on APUs unless going to S4 */
	if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
		return 0;

	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
	if (ret)
		DRM_WARN("evicting device resources failed\n");
	return ret;
}

/*
 * Suspend & resume.
 */
/**
 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
 * @nb: notifier block
 * @mode: suspend mode
 * @data: data
 *
 * This function is called when the system is about to suspend or hibernate.
 * It is used to evict resources from the device before the system goes to
 * sleep while there is still access to swap.
 */
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data)
{
	struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
	int r;

	switch (mode) {
	case PM_HIBERNATION_PREPARE:
		adev->in_s4 = true;
		fallthrough;
	case PM_SUSPEND_PREPARE:
		r = amdgpu_device_evict_resources(adev);
		/*
		 * This is considered non-fatal at this time because
		 * amdgpu_device_prepare() will also evict resources and treat
		 * a failure there as fatal.
		 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
		 */
		if (r)
			drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
		break;
	}

	return NOTIFY_DONE;
}

/**
 * amdgpu_device_prepare - prepare for device suspend
 *
 * @dev: drm dev pointer
 *
 * Prepare to put the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
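 * A negative return value here aborts the suspend transition.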
 */
int amdgpu_device_prepare(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i, r;

	amdgpu_choose_low_power_state(adev);

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	/* Evict the majority of BOs before starting the suspend sequence */
	r = amdgpu_device_evict_resources(adev);
	if (r)
		goto unprepare;

	flush_delayed_work(&adev->gfx.gfx_off_delay_work);

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
			continue;
		r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
		if (r)
			goto unprepare;
	}

	return 0;

unprepare:
	adev->in_s0ix = adev->in_s3 = adev->in_s4 = false;

	return r;
}

/**
 * amdgpu_device_suspend - initiate device suspend
 *
 * @dev: drm dev pointer
 * @notify_clients: notify in-kernel DRM clients
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	adev->in_suspend = true;

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_fini_data_exchange(adev);
		r = amdgpu_virt_request_full_gpu(adev, false);
		if (r)
			return r;
	}

	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
		DRM_WARN("smart shift update failed\n");

	if (notify_clients)
		drm_client_dev_suspend(adev_to_drm(adev), false);

	cancel_delayed_work_sync(&adev->delayed_init_work);

	amdgpu_ras_suspend(adev);

	amdgpu_device_ip_suspend_phase1(adev);

	if (!adev->in_s0ix)
		amdgpu_amdkfd_suspend(adev, adev->in_runpm);

	r = amdgpu_device_evict_resources(adev);
	if (r)
		return r;

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	amdgpu_fence_driver_hw_fini(adev);

	amdgpu_device_ip_suspend_phase2(adev);

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	r = amdgpu_dpm_notify_rlc_state(adev, false);
	if (r)
		return r;

	return 0;
}

/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @notify_clients: notify in-kernel DRM clients
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);

	/* post card */
	if (amdgpu_device_need_post(adev)) {
		r = amdgpu_device_asic_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu asic init failed\n");
	}

	r = amdgpu_device_ip_resume(adev);

	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
		goto exit;
	}

	if (!adev->in_s0ix) {
		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
		if (r)
			goto exit;
	}

	r = amdgpu_device_ip_late_init(adev);
	if (r)
		goto exit;

	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));
exit:
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_init_data_exchange(adev);
		amdgpu_virt_release_full_gpu(adev, true);
	}

	if (r)
		return r;

	/* Make sure IB tests flushed */
	flush_delayed_work(&adev->delayed_init_work);

	if (notify_clients)
		drm_client_dev_resume(adev_to_drm(adev), false);

	amdgpu_ras_resume(adev);

	if (adev->mode_info.num_crtc) {
		/*
		 * Most of the connector probing functions try to acquire runtime pm
		 * refs to ensure that the GPU is powered on when connector polling is
		 * performed. Since we're calling this from a runtime PM callback,
		 * trying to acquire rpm refs will cause us to deadlock.
		 *
		 * Since we're guaranteed to be holding the rpm lock, it's safe to
		 * temporarily disable the rpm helpers so this doesn't deadlock us.
		 */
#ifdef CONFIG_PM
		dev->dev->power.disable_depth++;
#endif
		if (!adev->dc_enabled)
			drm_helper_hpd_irq_event(dev);
		else
			drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
		dev->dev->power.disable_depth--;
#endif
	}
	adev->in_suspend = false;

	if (adev->enable_mes)
		amdgpu_mes_self_test(adev);

	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
		DRM_WARN("smart shift update failed\n");

	return 0;
}

/**
 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and
 * the check_soft_reset callbacks are run. check_soft_reset determines
 * if the asic is still hung or not.
 * Returns true if any of the IPs are still in a hung state, false if not.
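 * SR-IOV VFs and ASICs that require a full reset always report true here.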
 */
static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
{
	int i;
	bool asic_hang = false;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_asic_need_full_reset(adev))
		return true;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
			adev->ip_blocks[i].status.hang =
				adev->ip_blocks[i].version->funcs->check_soft_reset(
					&adev->ip_blocks[i]);
		if (adev->ip_blocks[i].status.hang) {
			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
			asic_hang = true;
		}
	}
	return asic_hang;
}

/**
 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary for a soft reset to succeed.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
 *
 * @adev: amdgpu_device pointer
 *
 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
 * reset is necessary to recover.
 * Returns true if a full asic reset is required, false if not.
 */
static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
{
	int i;

	if (amdgpu_asic_need_full_reset(adev))
		return true;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
			if (adev->ip_blocks[i].status.hang) {
				dev_info(adev->dev, "Some blocks need a full reset!\n");
				return true;
			}
		}
	}
	return false;
}

/**
 * amdgpu_device_ip_soft_reset - do a soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * soft_reset callbacks are run if the block is hung. soft_reset handles any
 * IP specific hardware or software state changes that are necessary to soft
 * reset the IP.
 * Returns 0 on success, negative error code on failure.
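 * Only blocks flagged as hung by a preceding check_soft_reset are reset.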
 */
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->soft_reset) {
			r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary after the IP has been soft reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->post_soft_reset)
			r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_reset_sriov - reset the ASIC for an SR-IOV vf
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Do a VF FLR and reinitialize the ASIC.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     struct amdgpu_reset_context *reset_context)
{
	int r;
	struct amdgpu_hive_info *hive = NULL;

	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
		if (!amdgpu_ras_get_fed_status(adev))
			amdgpu_virt_ready_to_reset(adev);
		amdgpu_virt_wait_reset(adev);
		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		r = amdgpu_virt_request_full_gpu(adev, true);
	} else {
		r = amdgpu_virt_reset_gpu(adev);
	}
	if (r)
		return r;

	amdgpu_ras_clear_err_state(adev);
	amdgpu_irq_gpu_reset_resume_helper(adev);

	/* some SW cleanup the VF needs to do before recovery */
	amdgpu_virt_post_reset(adev);

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		return r;

	amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		return r;

	hive = amdgpu_get_xgmi_hive(adev);
	/* Update PSP FW topology after reset */
	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
		r = amdgpu_xgmi_update_topology(hive, adev);
	if (hive)
		amdgpu_put_xgmi_hive(hive);
	if (r)
		return r;

	r = amdgpu_ib_ring_tests(adev);
	if (r)
		return r;

	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
		amdgpu_inc_vram_lost(adev);

	/* need to be called during full access so we can't do it later like
	 * bare-metal does.
	 */
	amdgpu_amdkfd_post_reset(adev);
	amdgpu_virt_release_full_gpu(adev, true);

	/* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so RAS needs to be resumed during reset */
	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
		amdgpu_ras_resume(adev);

	amdgpu_virt_ras_telemetry_post_reset(adev);

	return 0;
}

/**
 * amdgpu_device_has_job_running - check if there is any unfinished job
 *
 * @adev: amdgpu_device pointer
 *
 * Check if there is any job running on the device when the guest driver
 * receives the FLR notification from the host driver. If there are still
 * jobs running, the guest driver will not respond to the FLR reset.
 * Instead, let the jobs hit the timeout; the guest driver will then issue
 * the reset request.
 */
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
{
	int i;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		if (amdgpu_fence_count_emitted(ring))
			return true;
	}
	return false;
}

/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu_device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	/* Skip soft reset check in fatal error mode */
	if (!amdgpu_ras_is_poison_mode_supported(adev))
		return true;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
		case CHIP_VERDE:
		case CHIP_TAHITI:
		case CHIP_PITCAIRN:
		case CHIP_OLAND:
		case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
		case CHIP_KAVERI:
		case CHIP_KABINI:
		case CHIP_MULLINS:
#endif
		case CHIP_CARRIZO:
		case CHIP_STONEY:
		case CHIP_CYAN_SKILLFISH:
			goto disabled;
		default:
			break;
		}
	}

	return true;

disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
}

int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
{
	u32 i;
	int ret = 0;

	if (adev->bios)
		amdgpu_atombios_scratch_regs_engine_hung(adev, true);

	dev_info(adev->dev, "GPU mode1 reset\n");

	/* Cache the state before bus master disable. The saved config space
	 * values are used in other cases like restore after mode-2 reset.
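	 * The cached state is restored with amdgpu_device_load_pci_state()
	 * right after the reset below completes.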
	 */
	amdgpu_device_cache_pci_state(adev->pdev);

	/* disable BM */
	pci_clear_master(adev->pdev);

	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
		dev_info(adev->dev, "GPU smu mode1 reset\n");
		ret = amdgpu_dpm_mode1_reset(adev);
	} else {
		dev_info(adev->dev, "GPU psp mode1 reset\n");
		ret = psp_gpu_reset(adev);
	}

	if (ret)
		goto mode1_reset_failed;

	amdgpu_device_load_pci_state(adev->pdev);
	ret = amdgpu_psp_wait_for_bootloader(adev);
	if (ret)
		goto mode1_reset_failed;

	/* wait for asic to come out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		u32 memsize = adev->nbio.funcs->get_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}

	if (i >= adev->usec_timeout) {
		ret = -ETIMEDOUT;
		goto mode1_reset_failed;
	}

	if (adev->bios)
		amdgpu_atombios_scratch_regs_engine_hung(adev, false);

	return 0;

mode1_reset_failed:
	dev_err(adev->dev, "GPU mode1 reset failed\n");
	return ret;
}

int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
				 struct amdgpu_reset_context *reset_context)
{
	int i, r = 0;
	struct amdgpu_job *job = NULL;
	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
	bool need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	if (reset_context->reset_req_dev == adev)
		job = reset_context->job;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_pre_reset(adev);

	amdgpu_fence_driver_isr_toggle(adev, true);

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		/* Clear the job fences from the fence drv to avoid
		 * force_completion leaving NULL and vm flush fences in
		 * the fence drv.
		 */
		amdgpu_fence_driver_clear_job_fences(ring);

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	amdgpu_fence_driver_isr_toggle(adev, false);

	if (job && job->vm)
		drm_sched_increase_karma(&job->base);

	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
	/* If the reset handler is not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset && amdgpu_gpu_recovery &&
		    amdgpu_device_ip_check_soft_reset(adev)) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fall back to full reset!\n");
				need_full_reset = true;
			}
		}

		if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
			dev_info(tmp_adev->dev, "Dumping IP State\n");
			/* Trigger ip dump before we reset the asic */
			for (i = 0; i < tmp_adev->num_ip_blocks; i++)
				if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
					tmp_adev->ip_blocks[i].version->funcs
						->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
			dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);
		if (need_full_reset)
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
		else
			clear_bit(AMDGPU_NEED_FULL_RESET,
				  &reset_context->flags);
	}

	return r;
}

int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
{
	struct list_head *device_list_handle;
	bool full_reset, vram_lost = false;
	struct amdgpu_device *tmp_adev;
	int r, init_level;

	device_list_handle = reset_context->reset_device_list;

	if (!device_list_handle)
		return -EINVAL;

	full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/*
	 * If it's reset on init, it's the default init level; otherwise keep
	 * the level as the recovery level.
	 */
	if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
		init_level = AMDGPU_INIT_LEVEL_DEFAULT;
	else
		init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;

	r = 0;
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		amdgpu_set_init_level(tmp_adev, init_level);
		if (full_reset) {
			/* post card */
			amdgpu_ras_clear_err_state(tmp_adev);
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);

				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_xcp_restore_partition_mode(
					tmp_adev->xcp_mgr);
				if (r)
					goto out;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);

				r = amdgpu_device_ip_resume_phase3(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked, as the reset has
				 * already completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				drm_client_dev_resume(adev_to_drm(tmp_adev), false);

				/*
				 * The GPU enters a bad state once the number
				 * of faulty pages detected by ECC reaches the
				 * threshold, and RAS recovery is scheduled
				 * next. So add a check here to break recovery
				 * if it has indeed exceeded the bad page
				 * threshold, and remind the user to retire
				 * this GPU or set a bigger bad_page_threshold
				 * value so this can be fixed when the driver
				 * is probed again.
				 */
				if (!amdgpu_ras_is_rma(tmp_adev)) {
					/* must succeed.
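					 * (the RMA branch below fails the
					 * recovery instead)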
*/ 5653 amdgpu_ras_resume(tmp_adev); 5654 } else { 5655 r = -EINVAL; 5656 goto out; 5657 } 5658 5659 /* Update PSP FW topology after reset */ 5660 if (reset_context->hive && 5661 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5662 r = amdgpu_xgmi_update_topology( 5663 reset_context->hive, tmp_adev); 5664 } 5665 } 5666 5667 out: 5668 if (!r) { 5669 /* IP init is complete now, set level as default */ 5670 amdgpu_set_init_level(tmp_adev, 5671 AMDGPU_INIT_LEVEL_DEFAULT); 5672 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5673 r = amdgpu_ib_ring_tests(tmp_adev); 5674 if (r) { 5675 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5676 r = -EAGAIN; 5677 goto end; 5678 } 5679 } 5680 5681 if (r) 5682 tmp_adev->asic_reset_res = r; 5683 } 5684 5685 end: 5686 return r; 5687 } 5688 5689 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5690 struct amdgpu_reset_context *reset_context) 5691 { 5692 struct amdgpu_device *tmp_adev = NULL; 5693 bool need_full_reset, skip_hw_reset; 5694 int r = 0; 5695 5696 /* Try reset handler method first */ 5697 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5698 reset_list); 5699 5700 reset_context->reset_device_list = device_list_handle; 5701 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5702 /* If reset handler not implemented, continue; otherwise return */ 5703 if (r == -EOPNOTSUPP) 5704 r = 0; 5705 else 5706 return r; 5707 5708 /* Reset handler not implemented, use the default method */ 5709 need_full_reset = 5710 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5711 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5712 5713 /* 5714 * ASIC reset has to be done on all XGMI hive nodes ASAP 5715 * to allow proper links negotiation in FW (within 1 sec) 5716 */ 5717 if (!skip_hw_reset && need_full_reset) { 5718 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5719 /* For XGMI run all resets in parallel to speed up the process */ 5720 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5721 if (!queue_work(system_unbound_wq, 5722 &tmp_adev->xgmi_reset_work)) 5723 r = -EALREADY; 5724 } else 5725 r = amdgpu_asic_reset(tmp_adev); 5726 5727 if (r) { 5728 dev_err(tmp_adev->dev, 5729 "ASIC reset failed with error, %d for drm dev, %s", 5730 r, adev_to_drm(tmp_adev)->unique); 5731 goto out; 5732 } 5733 } 5734 5735 /* For XGMI wait for all resets to complete before proceed */ 5736 if (!r) { 5737 list_for_each_entry(tmp_adev, device_list_handle, 5738 reset_list) { 5739 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5740 flush_work(&tmp_adev->xgmi_reset_work); 5741 r = tmp_adev->asic_reset_res; 5742 if (r) 5743 break; 5744 } 5745 } 5746 } 5747 } 5748 5749 if (!r && amdgpu_ras_intr_triggered()) { 5750 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5751 amdgpu_ras_reset_error_count(tmp_adev, 5752 AMDGPU_RAS_BLOCK__MMHUB); 5753 } 5754 5755 amdgpu_ras_intr_cleared(); 5756 } 5757 5758 r = amdgpu_device_reinit_after_reset(reset_context); 5759 if (r == -EAGAIN) 5760 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5761 else 5762 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5763 5764 out: 5765 return r; 5766 } 5767 5768 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5769 { 5770 5771 switch (amdgpu_asic_reset_method(adev)) { 5772 case AMD_RESET_METHOD_MODE1: 5773 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5774 break; 5775 case AMD_RESET_METHOD_MODE2: 5776 adev->mp1_state = PP_MP1_STATE_RESET; 5777 break; 5778 default: 5779 adev->mp1_state = 

static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}
}

static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer from the audio issue if the audio device
	 * is not properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4s interval is used. Since 3s is the audio
		 * controller's default autosuspend delay, the 4s used
		 * here is guaranteed to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	pci_dev_put(p);
	return 0;
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

#if defined(CONFIG_DEBUG_FS)
	if (!amdgpu_sriov_vf(adev))
		cancel_work(&adev->reset_work);
#endif

	if (adev->kfd.dev)
		cancel_work(&adev->kfd.reset_work);

	if (amdgpu_sriov_vf(adev))
		cancel_work(&adev->virt.flr_work);

	if (con && adev->ras_enabled)
		cancel_work(&con->recovery_work);
}

static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
	struct amdgpu_device *tmp_adev;
	int ret = 0;
	u32 status;

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
		if (PCI_POSSIBLE_ERROR(status)) {
			dev_err(tmp_adev->dev, "device lost from bus!");
			ret = -ENODEV;
		}
	}

	return ret;
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: which job triggered the hang
 * @reset_context: amdgpu reset context pointer
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;
	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;

	/*
	 * If we reach here because of a hang/timeout and a RAS error was
	 * detected at the same time, let RAS recovery take care of it.
	 */
	if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
	    !amdgpu_sriov_vf(adev) &&
	    reset_context->src != AMDGPU_RESET_SRC_RAS) {
		dev_dbg(adev->dev,
			"GPU recovery from source: %d yielding to RAS error recovery handling",
			reset_context->src);
		return 0;
	}
	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
	    amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		 need_emergency_restart ? "jobs stop" : "reset");

	if (!amdgpu_sriov_vf(adev))
		hive = amdgpu_get_xgmi_hive(adev);
	if (hive)
		mutex_lock(&hive->hive_lock);

	reset_context->job = job;
	reset_context->hive = hive;
	/*
	 * Build the list of devices to reset.
	 * In case we are in XGMI hive mode, re-sort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, &device_list);
			if (adev->shutdown)
				tmp_adev->shutdown = true;
		}
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	if (!amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_health_check(device_list_handle);
		if (r)
			goto end_reset;
	}

	/* We need to lock the reset domain only once, both for XGMI and single device */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into a suspended state
		 * before the GPU reset starts.
		 *
		 * The power domain of the graphics device is shared
		 * with the AZ power domain. Without this, we may
		 * change the audio hardware from behind the audio
		 * driver's back and trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_client_dev_suspend(adev_to_drm(tmp_adev), false);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check the guilty signal here since after this point all old
	 * HW fences are force-signaled.
	 *
	 * job->base holds a reference to the parent fence
	 */
	if (job && dma_fence_is_signaled(&job->hw_fence)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/* TODO: should we stop here? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* The host driver will handle the XGMI hive reset for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
			dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
			amdgpu_ras_set_fed(adev, true);
			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		}

		r = amdgpu_device_reset_sriov(adev, reset_context);
		if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
			amdgpu_virt_release_full_gpu(adev, true);
			goto retry;
		}
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;
	}

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/*
		 * Drop any pending non-scheduler resets queued before the
		 * reset is done. Any reset scheduled after this point is
		 * valid. Scheduler resets were already dropped during
		 * drm_sched_stop, and no new ones can come in before
		 * drm_sched_start.
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_start(&ring->sched, 0);
		}

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/*
			 * Bad news: how do we tell this to user space?
			 * For a RAS error, we should report a bad GPU status
			 * instead of a reset failure.
			 */
			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
				dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
					 atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/*
		 * kfd_post_reset will do nothing if the kfd device is not
		 * initialized, so we need to bring up kfd here if it was
		 * not initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}
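
/*
 * Example (illustrative sketch, not driver code): a hang handler such as a
 * job timeout callback typically drives the recovery above roughly like
 * this; the exact fields set on the context vary by caller.
 *
 *	struct amdgpu_reset_context reset_context;
 *	int r;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	reset_context.src = AMDGPU_RESET_SRC_JOB;
 *	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 */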

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width, link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);
	amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		/* asic caps */
		if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
		} else {
			switch (link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
		/* platform caps */
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
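
/*
 * Illustrative note (hypothetical numbers, assuming no module-parameter
 * overrides): for a platform capped at Gen4 x8, the function above would
 * leave masks equivalent to
 *
 *	adev->pm.pcie_gen_mask == (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 *				   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 *				   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
 *				   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4) | <asic caps>;
 *	adev->pm.pcie_mlw_mask == (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
 *				   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
 *				   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
 *				   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1) | <asic caps>;
 *
 * i.e. every speed/width up to the platform cap is marked supported, which
 * the power-management code can then pick from.
 */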

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
	if (!p2p_access)
		dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
			 pci_name(peer_adev->pdev));

	bool is_large_bar = adev->gmc.visible_vram_size &&
			    adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
	bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);

	if (!p2p_addressable) {
		uint64_t address_mask = peer_adev->dev->dma_mask ?
			~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
		resource_size_t aper_limit =
			adev->gmc.aper_base + adev->gmc.aper_size - 1;

		p2p_addressable = !(adev->gmc.aper_base & address_mask ||
				    aper_limit & address_mask);
	}
	return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
#else
	return false;
#endif
}
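
/*
 * Worked example (hypothetical numbers, for exposition only): assume the
 * peer advertises a 40-bit DMA mask and @adev's aperture is 256 MiB at
 * 0x8000000000 (512 GiB).
 *
 *	address_mask = ~((1ULL << 40) - 1);	// bits above bit 39
 *	aper_base    = 0x8000000000ULL;		// bit 39 set, fits in 40 bits
 *	aper_limit   = aper_base + SZ_256M - 1;
 *
 * Neither aper_base nor aper_limit has a bit set above bit 39, so both
 * (aper_base & address_mask) and (aper_limit & address_mask) are zero and
 * p2p_addressable evaluates to true; a BAR placed at or above 1 TiB would
 * fail the same check for a 40-bit peer.
 */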

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
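
/*
 * Example (illustrative sketch, not driver code): the runtime-PM path is the
 * typical consumer of the BACO helpers above, bracketing the low-power state
 * roughly like this:
 *
 *	// runtime suspend
 *	r = amdgpu_device_baco_enter(drm_dev);
 *	if (r)
 *		return r;	// stay in D0 on failure
 *	...
 *	// runtime resume
 *	r = amdgpu_device_baco_exit(drm_dev);
 */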

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external
		 * access to the GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/*
	 * This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	/* PCI error slot reset should be skipped during RAS recovery */
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	     amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
	    amdgpu_ras_in_recovery(adev))
		return PCI_ERS_RESULT_RECOVERED;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched, 0);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (amdgpu_sriov_vf(adev))
		return false;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}
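
/*
 * Example (illustrative sketch, not driver code): the two helpers above are
 * used as a save/restore pair around events that clobber PCI config space,
 * e.g. caching after bring-up and reloading in the slot-reset handler above:
 *
 *	// after bringing the device up
 *	amdgpu_device_cache_pci_state(adev->pdev);
 *	...
 *	// after the slot was reset and config space is stale
 *	if (amdgpu_device_load_pci_state(adev->pdev))
 *		;	// config space restored from the cached copy
 */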

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will remain stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device, and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
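
/*
 * Example (illustrative sketch, not driver code): the port accessors above
 * implement the usual index/data pattern, so a read-modify-write of a PCIe
 * port register looks like this (PORT_REG and SOME_ENABLE_BIT are
 * hypothetical names):
 *
 *	u32 v;
 *
 *	v = amdgpu_device_pcie_port_rreg(adev, PORT_REG);
 *	v |= SOME_ENABLE_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, PORT_REG, v);
 *
 * The spinlock inside the helpers keeps the index and data accesses paired,
 * so callers need no extra locking for a single access.
 */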

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}
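
/*
 * Example (illustrative sketch, not driver code): a sysfs "show" callback for
 * a per-IP supported-reset attribute would typically combine the two helpers
 * above; the attribute name and the mask field used here are assumptions.
 *
 *	static ssize_t supported_reset_show(struct device *dev,
 *					    struct device_attribute *attr,
 *					    char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *		// e.g. a mask previously seeded with
 *		// amdgpu_get_soft_full_reset_mask(ring) plus per-queue bits
 *		return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *	}
 */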