/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
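 *
 * For example, the count can be read from the device's sysfs directory (the
 * card index below is system specific and only illustrative):
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count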
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for reporting board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
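 *
 * The value is effectively a bit mask; an illustrative sketch of how a caller
 * (such as amdgpu_device_detect_runtime_pm_mode() below) can interpret it:
 *
 *   int support = amdgpu_device_supports_baco(dev);
 *
 *   if (support & MACO_SUPPORT)
 *           ... BAMACO (BACO + MACO) can be used ...
 *   else if (support & BACO_SUPPORT)
 *           ... plain BACO only ...
 *   else
 *           ... neither is available ...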
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
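 *
 * The transfer can be shorter than @size when it runs past the CPU visible
 * part of VRAM; callers are expected to cover the remainder through the
 * MM_INDEX/MM_DATA path, roughly as amdgpu_device_vram_access() below does
 * (sketch only):
 *
 *   count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *   if (count < size)
 *           amdgpu_device_mm_access(adev, pos + count,
 *                                   (char *)buf + count,
 *                                   size - count, write);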
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
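 *
 * Most callers do not use this helper directly but go through the register
 * access macros; an illustrative sketch (mmSOME_REG is a placeholder, not a
 * real register name):
 *
 *   tmp = RREG32(mmSOME_REG);
 *   tmp = RREG32_NO_KIQ(mmSOME_REG);
 *
 * where the _NO_KIQ variant bypasses the KIQ path under SR-IOV.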
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to write to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so we force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB, so we have 12 bits of offset, a minimum of
 * 9 bits in the page table and the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
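 *
 * For example (illustrative value only), a 16 GB GPU virtual address space
 * can be requested via the vm_size module parameter at load time:
 *
 *   modprobe amdgpu vm_size=16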
1940 */ 1941 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1942 { 1943 /* no need to check the default value */ 1944 if (amdgpu_vm_size == -1) 1945 return; 1946 1947 if (amdgpu_vm_size < 1) { 1948 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1949 amdgpu_vm_size); 1950 amdgpu_vm_size = -1; 1951 } 1952 } 1953 1954 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1955 { 1956 struct sysinfo si; 1957 bool is_os_64 = (sizeof(void *) == 8); 1958 uint64_t total_memory; 1959 uint64_t dram_size_seven_GB = 0x1B8000000; 1960 uint64_t dram_size_three_GB = 0xB8000000; 1961 1962 if (amdgpu_smu_memory_pool_size == 0) 1963 return; 1964 1965 if (!is_os_64) { 1966 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1967 goto def_value; 1968 } 1969 si_meminfo(&si); 1970 total_memory = (uint64_t)si.totalram * si.mem_unit; 1971 1972 if ((amdgpu_smu_memory_pool_size == 1) || 1973 (amdgpu_smu_memory_pool_size == 2)) { 1974 if (total_memory < dram_size_three_GB) 1975 goto def_value1; 1976 } else if ((amdgpu_smu_memory_pool_size == 4) || 1977 (amdgpu_smu_memory_pool_size == 8)) { 1978 if (total_memory < dram_size_seven_GB) 1979 goto def_value1; 1980 } else { 1981 DRM_WARN("Smu memory pool size not supported\n"); 1982 goto def_value; 1983 } 1984 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1985 1986 return; 1987 1988 def_value1: 1989 DRM_WARN("No enough system memory\n"); 1990 def_value: 1991 adev->pm.smu_prv_buffer_size = 0; 1992 } 1993 1994 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1995 { 1996 if (!(adev->flags & AMD_IS_APU) || 1997 adev->asic_type < CHIP_RAVEN) 1998 return 0; 1999 2000 switch (adev->asic_type) { 2001 case CHIP_RAVEN: 2002 if (adev->pdev->device == 0x15dd) 2003 adev->apu_flags |= AMD_APU_IS_RAVEN; 2004 if (adev->pdev->device == 0x15d8) 2005 adev->apu_flags |= AMD_APU_IS_PICASSO; 2006 break; 2007 case CHIP_RENOIR: 2008 if ((adev->pdev->device == 0x1636) || 2009 (adev->pdev->device == 0x164c)) 2010 adev->apu_flags |= AMD_APU_IS_RENOIR; 2011 else 2012 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2013 break; 2014 case CHIP_VANGOGH: 2015 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2016 break; 2017 case CHIP_YELLOW_CARP: 2018 break; 2019 case CHIP_CYAN_SKILLFISH: 2020 if ((adev->pdev->device == 0x13FE) || 2021 (adev->pdev->device == 0x143F)) 2022 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2023 break; 2024 default: 2025 break; 2026 } 2027 2028 return 0; 2029 } 2030 2031 /** 2032 * amdgpu_device_check_arguments - validate module params 2033 * 2034 * @adev: amdgpu_device pointer 2035 * 2036 * Validates certain module parameters and updates 2037 * the associated values used by the driver (all asics). 
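 * For example, amdgpu.sched_jobs=3 is raised to the minimum of 4, and a
 * non-power-of-two value such as 24 is rounded up to 32 by the checks below.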
2038 */ 2039 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2040 { 2041 int i; 2042 2043 if (amdgpu_sched_jobs < 4) { 2044 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2045 amdgpu_sched_jobs); 2046 amdgpu_sched_jobs = 4; 2047 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2048 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2049 amdgpu_sched_jobs); 2050 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2051 } 2052 2053 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2054 /* gart size must be greater or equal to 32M */ 2055 dev_warn(adev->dev, "gart size (%d) too small\n", 2056 amdgpu_gart_size); 2057 amdgpu_gart_size = -1; 2058 } 2059 2060 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2061 /* gtt size must be greater or equal to 32M */ 2062 dev_warn(adev->dev, "gtt size (%d) too small\n", 2063 amdgpu_gtt_size); 2064 amdgpu_gtt_size = -1; 2065 } 2066 2067 /* valid range is between 4 and 9 inclusive */ 2068 if (amdgpu_vm_fragment_size != -1 && 2069 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2070 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2071 amdgpu_vm_fragment_size = -1; 2072 } 2073 2074 if (amdgpu_sched_hw_submission < 2) { 2075 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2076 amdgpu_sched_hw_submission); 2077 amdgpu_sched_hw_submission = 2; 2078 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2079 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2080 amdgpu_sched_hw_submission); 2081 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2082 } 2083 2084 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2085 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2086 amdgpu_reset_method = -1; 2087 } 2088 2089 amdgpu_device_check_smu_prv_buffer_size(adev); 2090 2091 amdgpu_device_check_vm_size(adev); 2092 2093 amdgpu_device_check_block_size(adev); 2094 2095 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2096 2097 for (i = 0; i < MAX_XCP; i++) 2098 adev->enforce_isolation[i] = !!enforce_isolation; 2099 2100 return 0; 2101 } 2102 2103 /** 2104 * amdgpu_switcheroo_set_state - set switcheroo state 2105 * 2106 * @pdev: pci dev pointer 2107 * @state: vga_switcheroo state 2108 * 2109 * Callback for the switcheroo driver. Suspends or resumes 2110 * the asics before or after it is powered up using ACPI methods. 
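 * On PX (hybrid graphics) platforms the OFF request is ignored here, since
 * powering the dGPU down is expected to be handled via runtime PM instead.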
2111 */ 2112 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2113 enum vga_switcheroo_state state) 2114 { 2115 struct drm_device *dev = pci_get_drvdata(pdev); 2116 int r; 2117 2118 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2119 return; 2120 2121 if (state == VGA_SWITCHEROO_ON) { 2122 pr_info("switched on\n"); 2123 /* don't suspend or resume card normally */ 2124 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2125 2126 pci_set_power_state(pdev, PCI_D0); 2127 amdgpu_device_load_pci_state(pdev); 2128 r = pci_enable_device(pdev); 2129 if (r) 2130 DRM_WARN("pci_enable_device failed (%d)\n", r); 2131 amdgpu_device_resume(dev, true); 2132 2133 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2134 } else { 2135 pr_info("switched off\n"); 2136 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2137 amdgpu_device_prepare(dev); 2138 amdgpu_device_suspend(dev, true); 2139 amdgpu_device_cache_pci_state(pdev); 2140 /* Shut down the device */ 2141 pci_disable_device(pdev); 2142 pci_set_power_state(pdev, PCI_D3cold); 2143 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2144 } 2145 } 2146 2147 /** 2148 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2149 * 2150 * @pdev: pci dev pointer 2151 * 2152 * Callback for the switcheroo driver. Check of the switcheroo 2153 * state can be changed. 2154 * Returns true if the state can be changed, false if not. 2155 */ 2156 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2157 { 2158 struct drm_device *dev = pci_get_drvdata(pdev); 2159 2160 /* 2161 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2162 * locking inversion with the driver load path. And the access here is 2163 * completely racy anyway. So don't bother with locking for now. 2164 */ 2165 return atomic_read(&dev->open_count) == 0; 2166 } 2167 2168 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2169 .set_gpu_state = amdgpu_switcheroo_set_state, 2170 .reprobe = NULL, 2171 .can_switch = amdgpu_switcheroo_can_switch, 2172 }; 2173 2174 /** 2175 * amdgpu_device_ip_set_clockgating_state - set the CG state 2176 * 2177 * @dev: amdgpu_device pointer 2178 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2179 * @state: clockgating state (gate or ungate) 2180 * 2181 * Sets the requested clockgating state for all instances of 2182 * the hardware IP specified. 2183 * Returns the error code from the last instance. 2184 */ 2185 int amdgpu_device_ip_set_clockgating_state(void *dev, 2186 enum amd_ip_block_type block_type, 2187 enum amd_clockgating_state state) 2188 { 2189 struct amdgpu_device *adev = dev; 2190 int i, r = 0; 2191 2192 for (i = 0; i < adev->num_ip_blocks; i++) { 2193 if (!adev->ip_blocks[i].status.valid) 2194 continue; 2195 if (adev->ip_blocks[i].version->type != block_type) 2196 continue; 2197 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2198 continue; 2199 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2200 &adev->ip_blocks[i], state); 2201 if (r) 2202 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2203 adev->ip_blocks[i].version->funcs->name, r); 2204 } 2205 return r; 2206 } 2207 2208 /** 2209 * amdgpu_device_ip_set_powergating_state - set the PG state 2210 * 2211 * @dev: amdgpu_device pointer 2212 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2213 * @state: powergating state (gate or ungate) 2214 * 2215 * Sets the requested powergating state for all instances of 2216 * the hardware IP specified. 
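 *
 * An illustrative (hypothetical) call site, gating an engine when it idles:
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);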
2217 * Returns the error code from the last instance. 2218 */
2219 int amdgpu_device_ip_set_powergating_state(void *dev,
2220 enum amd_ip_block_type block_type,
2221 enum amd_powergating_state state)
2222 { 2223 struct amdgpu_device *adev = dev;
2224 int i, r = 0;
2225 2226 for (i = 0; i < adev->num_ip_blocks; i++) {
2227 if (!adev->ip_blocks[i].status.valid)
2228 continue;
2229 if (adev->ip_blocks[i].version->type != block_type)
2230 continue;
2231 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2232 continue;
2233 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2234 &adev->ip_blocks[i], state);
2235 if (r)
2236 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2237 adev->ip_blocks[i].version->funcs->name, r);
2238 }
2239 return r;
2240 }
2241 2242 /**
2243 * amdgpu_device_ip_get_clockgating_state - get the CG state
2244 *
2245 * @adev: amdgpu_device pointer
2246 * @flags: clockgating feature flags
2247 *
2248 * Walks the list of IPs on the device and updates the clockgating
2249 * flags for each IP.
2250 * Updates @flags with the feature flags for each hardware IP where
2251 * clockgating is enabled.
2252 */
2253 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2254 u64 *flags)
2255 { 2256 int i;
2257 2258 for (i = 0; i < adev->num_ip_blocks; i++) {
2259 if (!adev->ip_blocks[i].status.valid)
2260 continue;
2261 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2262 adev->ip_blocks[i].version->funcs->get_clockgating_state(
2263 &adev->ip_blocks[i], flags);
2264 }
2265 }
2266 2267 /**
2268 * amdgpu_device_ip_wait_for_idle - wait for idle
2269 *
2270 * @adev: amdgpu_device pointer
2271 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2272 *
2273 * Waits for the requested hardware IP to be idle.
2274 * Returns 0 for success or a negative error code on failure.
2275 */
2276 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2277 enum amd_ip_block_type block_type)
2278 { 2279 int i, r;
2280 2281 for (i = 0; i < adev->num_ip_blocks; i++) {
2282 if (!adev->ip_blocks[i].status.valid)
2283 continue;
2284 if (adev->ip_blocks[i].version->type == block_type) {
2285 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2286 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2287 &adev->ip_blocks[i]);
2288 if (r)
2289 return r;
2290 }
2291 break;
2292 }
2293 }
2294 return 0;
2295 2296 }
2297 2298 /**
2299 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2300 *
2301 * @adev: amdgpu_device pointer
2302 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2303 *
2304 * Check if the hardware IP is enabled or not.
2305 * Returns true if the IP is enabled, false if not.
2306 */
2307 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2308 enum amd_ip_block_type block_type)
2309 { 2310 int i;
2311 2312 for (i = 0; i < adev->num_ip_blocks; i++) {
2313 if (adev->ip_blocks[i].version->type == block_type)
2314 return adev->ip_blocks[i].status.valid;
2315 }
2316 return false;
2317 2318 }
2319 2320 /**
2321 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2322 *
2323 * @adev: amdgpu_device pointer
2324 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2325 *
2326 * Returns a pointer to the hardware IP block structure
2327 * if it exists for the asic, otherwise NULL.
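 * A minimal usage sketch, mirroring the GFX lookup done during early init:
 *   ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->status.valid)
 *           amdgpu_amdkfd_device_probe(adev);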
2328 */ 2329 struct amdgpu_ip_block * 2330 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2331 enum amd_ip_block_type type) 2332 { 2333 int i; 2334 2335 for (i = 0; i < adev->num_ip_blocks; i++) 2336 if (adev->ip_blocks[i].version->type == type) 2337 return &adev->ip_blocks[i]; 2338 2339 return NULL; 2340 } 2341 2342 /** 2343 * amdgpu_device_ip_block_version_cmp 2344 * 2345 * @adev: amdgpu_device pointer 2346 * @type: enum amd_ip_block_type 2347 * @major: major version 2348 * @minor: minor version 2349 * 2350 * return 0 if equal or greater 2351 * return 1 if smaller or the ip_block doesn't exist 2352 */ 2353 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2354 enum amd_ip_block_type type, 2355 u32 major, u32 minor) 2356 { 2357 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2358 2359 if (ip_block && ((ip_block->version->major > major) || 2360 ((ip_block->version->major == major) && 2361 (ip_block->version->minor >= minor)))) 2362 return 0; 2363 2364 return 1; 2365 } 2366 2367 /** 2368 * amdgpu_device_ip_block_add 2369 * 2370 * @adev: amdgpu_device pointer 2371 * @ip_block_version: pointer to the IP to add 2372 * 2373 * Adds the IP block driver information to the collection of IPs 2374 * on the asic. 2375 */ 2376 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2377 const struct amdgpu_ip_block_version *ip_block_version) 2378 { 2379 if (!ip_block_version) 2380 return -EINVAL; 2381 2382 switch (ip_block_version->type) { 2383 case AMD_IP_BLOCK_TYPE_VCN: 2384 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2385 return 0; 2386 break; 2387 case AMD_IP_BLOCK_TYPE_JPEG: 2388 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2389 return 0; 2390 break; 2391 default: 2392 break; 2393 } 2394 2395 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2396 adev->num_ip_blocks, ip_block_version->funcs->name); 2397 2398 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2399 2400 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2401 2402 return 0; 2403 } 2404 2405 /** 2406 * amdgpu_device_enable_virtual_display - enable virtual display feature 2407 * 2408 * @adev: amdgpu_device pointer 2409 * 2410 * Enabled the virtual display feature if the user has enabled it via 2411 * the module parameter virtual_display. This feature provides a virtual 2412 * display hardware on headless boards or in virtualized environments. 2413 * This function parses and validates the configuration string specified by 2414 * the user and configures the virtual display configuration (number of 2415 * virtual connectors, crtcs, etc.) specified. 
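 * The string is expected to be a semicolon-separated list of
 * <pci address>,<crtc count> entries, e.g.
 * amdgpu.virtual_display=0000:03:00.0,2 (or the keyword "all" to match every
 * device); the crtc count is clamped to the 1-6 range below.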
2416 */ 2417 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2418 { 2419 adev->enable_virtual_display = false; 2420 2421 if (amdgpu_virtual_display) { 2422 const char *pci_address_name = pci_name(adev->pdev); 2423 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2424 2425 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2426 pciaddstr_tmp = pciaddstr; 2427 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2428 pciaddname = strsep(&pciaddname_tmp, ","); 2429 if (!strcmp("all", pciaddname) 2430 || !strcmp(pci_address_name, pciaddname)) { 2431 long num_crtc; 2432 int res = -1; 2433 2434 adev->enable_virtual_display = true; 2435 2436 if (pciaddname_tmp) 2437 res = kstrtol(pciaddname_tmp, 10, 2438 &num_crtc); 2439 2440 if (!res) { 2441 if (num_crtc < 1) 2442 num_crtc = 1; 2443 if (num_crtc > 6) 2444 num_crtc = 6; 2445 adev->mode_info.num_crtc = num_crtc; 2446 } else { 2447 adev->mode_info.num_crtc = 1; 2448 } 2449 break; 2450 } 2451 } 2452 2453 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2454 amdgpu_virtual_display, pci_address_name, 2455 adev->enable_virtual_display, adev->mode_info.num_crtc); 2456 2457 kfree(pciaddstr); 2458 } 2459 } 2460 2461 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2462 { 2463 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2464 adev->mode_info.num_crtc = 1; 2465 adev->enable_virtual_display = true; 2466 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2467 adev->enable_virtual_display, adev->mode_info.num_crtc); 2468 } 2469 } 2470 2471 /** 2472 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2473 * 2474 * @adev: amdgpu_device pointer 2475 * 2476 * Parses the asic configuration parameters specified in the gpu info 2477 * firmware and makes them available to the driver for use in configuring 2478 * the asic. 2479 * Returns 0 on success, -EINVAL on failure. 2480 */ 2481 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2482 { 2483 const char *chip_name; 2484 int err; 2485 const struct gpu_info_firmware_header_v1_0 *hdr; 2486 2487 adev->firmware.gpu_info_fw = NULL; 2488 2489 if (adev->mman.discovery_bin) 2490 return 0; 2491 2492 switch (adev->asic_type) { 2493 default: 2494 return 0; 2495 case CHIP_VEGA10: 2496 chip_name = "vega10"; 2497 break; 2498 case CHIP_VEGA12: 2499 chip_name = "vega12"; 2500 break; 2501 case CHIP_RAVEN: 2502 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2503 chip_name = "raven2"; 2504 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2505 chip_name = "picasso"; 2506 else 2507 chip_name = "raven"; 2508 break; 2509 case CHIP_ARCTURUS: 2510 chip_name = "arcturus"; 2511 break; 2512 case CHIP_NAVI12: 2513 chip_name = "navi12"; 2514 break; 2515 } 2516 2517 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2518 AMDGPU_UCODE_OPTIONAL, 2519 "amdgpu/%s_gpu_info.bin", chip_name); 2520 if (err) { 2521 dev_err(adev->dev, 2522 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2523 chip_name); 2524 goto out; 2525 } 2526 2527 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2528 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2529 2530 switch (hdr->version_major) { 2531 case 1: 2532 { 2533 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2534 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2535 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2536 2537 /* 2538 * Should be dropped when DAL no longer needs it. 
2539 */ 2540 if (adev->asic_type == CHIP_NAVI12) 2541 goto parse_soc_bounding_box; 2542 2543 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2544 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2545 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2546 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2547 adev->gfx.config.max_texture_channel_caches = 2548 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2549 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2550 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2551 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2552 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2553 adev->gfx.config.double_offchip_lds_buf = 2554 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2555 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2556 adev->gfx.cu_info.max_waves_per_simd = 2557 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2558 adev->gfx.cu_info.max_scratch_slots_per_cu = 2559 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2560 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2561 if (hdr->version_minor >= 1) { 2562 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2563 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2564 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2565 adev->gfx.config.num_sc_per_sh = 2566 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2567 adev->gfx.config.num_packer_per_sc = 2568 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2569 } 2570 2571 parse_soc_bounding_box: 2572 /* 2573 * soc bounding box info is not integrated in disocovery table, 2574 * we always need to parse it from gpu info firmware if needed. 2575 */ 2576 if (hdr->version_minor == 2) { 2577 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2578 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2579 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2580 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2581 } 2582 break; 2583 } 2584 default: 2585 dev_err(adev->dev, 2586 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2587 err = -EINVAL; 2588 goto out; 2589 } 2590 out: 2591 return err; 2592 } 2593 2594 /** 2595 * amdgpu_device_ip_early_init - run early init for hardware IPs 2596 * 2597 * @adev: amdgpu_device pointer 2598 * 2599 * Early initialization pass for hardware IPs. The hardware IPs that make 2600 * up each asic are discovered each IP's early_init callback is run. This 2601 * is the first stage in initializing the asic. 2602 * Returns 0 on success, negative error code on failure. 
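 * A block whose early_init callback returns -ENOENT is treated as simply not
 * present on this ASIC and is marked invalid; any other error is fatal and
 * aborts device initialization.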
2603 */ 2604 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2605 { 2606 struct amdgpu_ip_block *ip_block; 2607 struct pci_dev *parent; 2608 bool total, skip_bios; 2609 uint32_t bios_flags; 2610 int i, r; 2611 2612 amdgpu_device_enable_virtual_display(adev); 2613 2614 if (amdgpu_sriov_vf(adev)) { 2615 r = amdgpu_virt_request_full_gpu(adev, true); 2616 if (r) 2617 return r; 2618 } 2619 2620 switch (adev->asic_type) { 2621 #ifdef CONFIG_DRM_AMDGPU_SI 2622 case CHIP_VERDE: 2623 case CHIP_TAHITI: 2624 case CHIP_PITCAIRN: 2625 case CHIP_OLAND: 2626 case CHIP_HAINAN: 2627 adev->family = AMDGPU_FAMILY_SI; 2628 r = si_set_ip_blocks(adev); 2629 if (r) 2630 return r; 2631 break; 2632 #endif 2633 #ifdef CONFIG_DRM_AMDGPU_CIK 2634 case CHIP_BONAIRE: 2635 case CHIP_HAWAII: 2636 case CHIP_KAVERI: 2637 case CHIP_KABINI: 2638 case CHIP_MULLINS: 2639 if (adev->flags & AMD_IS_APU) 2640 adev->family = AMDGPU_FAMILY_KV; 2641 else 2642 adev->family = AMDGPU_FAMILY_CI; 2643 2644 r = cik_set_ip_blocks(adev); 2645 if (r) 2646 return r; 2647 break; 2648 #endif 2649 case CHIP_TOPAZ: 2650 case CHIP_TONGA: 2651 case CHIP_FIJI: 2652 case CHIP_POLARIS10: 2653 case CHIP_POLARIS11: 2654 case CHIP_POLARIS12: 2655 case CHIP_VEGAM: 2656 case CHIP_CARRIZO: 2657 case CHIP_STONEY: 2658 if (adev->flags & AMD_IS_APU) 2659 adev->family = AMDGPU_FAMILY_CZ; 2660 else 2661 adev->family = AMDGPU_FAMILY_VI; 2662 2663 r = vi_set_ip_blocks(adev); 2664 if (r) 2665 return r; 2666 break; 2667 default: 2668 r = amdgpu_discovery_set_ip_blocks(adev); 2669 if (r) 2670 return r; 2671 break; 2672 } 2673 2674 if (amdgpu_has_atpx() && 2675 (amdgpu_is_atpx_hybrid() || 2676 amdgpu_has_atpx_dgpu_power_cntl()) && 2677 ((adev->flags & AMD_IS_APU) == 0) && 2678 !dev_is_removable(&adev->pdev->dev)) 2679 adev->flags |= AMD_IS_PX; 2680 2681 if (!(adev->flags & AMD_IS_APU)) { 2682 parent = pcie_find_root_port(adev->pdev); 2683 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2684 } 2685 2686 2687 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2688 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2689 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2690 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2691 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2692 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2693 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2694 2695 total = true; 2696 for (i = 0; i < adev->num_ip_blocks; i++) { 2697 ip_block = &adev->ip_blocks[i]; 2698 2699 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2700 DRM_WARN("disabled ip block: %d <%s>\n", 2701 i, adev->ip_blocks[i].version->funcs->name); 2702 adev->ip_blocks[i].status.valid = false; 2703 } else if (ip_block->version->funcs->early_init) { 2704 r = ip_block->version->funcs->early_init(ip_block); 2705 if (r == -ENOENT) { 2706 adev->ip_blocks[i].status.valid = false; 2707 } else if (r) { 2708 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2709 adev->ip_blocks[i].version->funcs->name, r); 2710 total = false; 2711 } else { 2712 adev->ip_blocks[i].status.valid = true; 2713 } 2714 } else { 2715 adev->ip_blocks[i].status.valid = true; 2716 } 2717 /* get the vbios after the asic_funcs are set up */ 2718 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2719 r = amdgpu_device_parse_gpu_info_fw(adev); 2720 if (r) 2721 return r; 2722 2723 bios_flags = amdgpu_device_get_vbios_flags(adev); 2724 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2725 /* Read BIOS */ 2726 if (!skip_bios) { 2727 bool optional = 2728 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2729 if (!amdgpu_get_bios(adev) && !optional) 2730 return -EINVAL; 2731 2732 if (optional && !adev->bios) 2733 dev_info( 2734 adev->dev, 2735 "VBIOS image optional, proceeding without VBIOS image"); 2736 2737 if (adev->bios) { 2738 r = amdgpu_atombios_init(adev); 2739 if (r) { 2740 dev_err(adev->dev, 2741 "amdgpu_atombios_init failed\n"); 2742 amdgpu_vf_error_put( 2743 adev, 2744 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2745 0, 0); 2746 return r; 2747 } 2748 } 2749 } 2750 2751 /*get pf2vf msg info at it's earliest time*/ 2752 if (amdgpu_sriov_vf(adev)) 2753 amdgpu_virt_init_data_exchange(adev); 2754 2755 } 2756 } 2757 if (!total) 2758 return -ENODEV; 2759 2760 if (adev->gmc.xgmi.supported) 2761 amdgpu_xgmi_early_init(adev); 2762 2763 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2764 if (ip_block->status.valid != false) 2765 amdgpu_amdkfd_device_probe(adev); 2766 2767 adev->cg_flags &= amdgpu_cg_mask; 2768 adev->pg_flags &= amdgpu_pg_mask; 2769 2770 return 0; 2771 } 2772 2773 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2774 { 2775 int i, r; 2776 2777 for (i = 0; i < adev->num_ip_blocks; i++) { 2778 if (!adev->ip_blocks[i].status.sw) 2779 continue; 2780 if (adev->ip_blocks[i].status.hw) 2781 continue; 2782 if (!amdgpu_ip_member_of_hwini( 2783 adev, adev->ip_blocks[i].version->type)) 2784 continue; 2785 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2786 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2787 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2788 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2789 if (r) { 2790 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2791 adev->ip_blocks[i].version->funcs->name, r); 2792 return r; 2793 } 2794 adev->ip_blocks[i].status.hw = true; 2795 } 2796 } 2797 2798 return 0; 2799 } 
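/*
 * Phase 1 above only brings up the blocks that must be alive before firmware
 * can be loaded (COMMON, IH and, under SR-IOV, PSP).  Phase 2 below
 * initializes every remaining block once amdgpu_device_fw_loading() has run;
 * see amdgpu_device_ip_init() for the ordering.
 */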
2800 2801 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2802 { 2803 int i, r; 2804 2805 for (i = 0; i < adev->num_ip_blocks; i++) { 2806 if (!adev->ip_blocks[i].status.sw) 2807 continue; 2808 if (adev->ip_blocks[i].status.hw) 2809 continue; 2810 if (!amdgpu_ip_member_of_hwini( 2811 adev, adev->ip_blocks[i].version->type)) 2812 continue; 2813 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2814 if (r) { 2815 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2816 adev->ip_blocks[i].version->funcs->name, r); 2817 return r; 2818 } 2819 adev->ip_blocks[i].status.hw = true; 2820 } 2821 2822 return 0; 2823 } 2824 2825 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2826 { 2827 int r = 0; 2828 int i; 2829 uint32_t smu_version; 2830 2831 if (adev->asic_type >= CHIP_VEGA10) { 2832 for (i = 0; i < adev->num_ip_blocks; i++) { 2833 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2834 continue; 2835 2836 if (!amdgpu_ip_member_of_hwini(adev, 2837 AMD_IP_BLOCK_TYPE_PSP)) 2838 break; 2839 2840 if (!adev->ip_blocks[i].status.sw) 2841 continue; 2842 2843 /* no need to do the fw loading again if already done*/ 2844 if (adev->ip_blocks[i].status.hw == true) 2845 break; 2846 2847 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2848 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2849 if (r) 2850 return r; 2851 } else { 2852 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2853 if (r) { 2854 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2855 adev->ip_blocks[i].version->funcs->name, r); 2856 return r; 2857 } 2858 adev->ip_blocks[i].status.hw = true; 2859 } 2860 break; 2861 } 2862 } 2863 2864 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2865 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2866 2867 return r; 2868 } 2869 2870 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2871 { 2872 struct drm_sched_init_args args = { 2873 .ops = &amdgpu_sched_ops, 2874 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2875 .timeout_wq = adev->reset_domain->wq, 2876 .dev = adev->dev, 2877 }; 2878 long timeout; 2879 int r, i; 2880 2881 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2882 struct amdgpu_ring *ring = adev->rings[i]; 2883 2884 /* No need to setup the GPU scheduler for rings that don't need it */ 2885 if (!ring || ring->no_scheduler) 2886 continue; 2887 2888 switch (ring->funcs->type) { 2889 case AMDGPU_RING_TYPE_GFX: 2890 timeout = adev->gfx_timeout; 2891 break; 2892 case AMDGPU_RING_TYPE_COMPUTE: 2893 timeout = adev->compute_timeout; 2894 break; 2895 case AMDGPU_RING_TYPE_SDMA: 2896 timeout = adev->sdma_timeout; 2897 break; 2898 default: 2899 timeout = adev->video_timeout; 2900 break; 2901 } 2902 2903 args.timeout = timeout; 2904 args.credit_limit = ring->num_hw_submission; 2905 args.score = ring->sched_score; 2906 args.name = ring->name; 2907 2908 r = drm_sched_init(&ring->sched, &args); 2909 if (r) { 2910 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2911 ring->name); 2912 return r; 2913 } 2914 r = amdgpu_uvd_entity_init(adev, ring); 2915 if (r) { 2916 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2917 ring->name); 2918 return r; 2919 } 2920 r = amdgpu_vce_entity_init(adev, ring); 2921 if (r) { 2922 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2923 ring->name); 2924 return r; 2925 } 2926 } 2927 2928 amdgpu_xcp_update_partition_sched_list(adev); 2929 2930 return 0; 2931 } 2932 2933 2934 /** 2935 * amdgpu_device_ip_init - run init for hardware 
IPs 2936 * 2937 * @adev: amdgpu_device pointer 2938 * 2939 * Main initialization pass for hardware IPs. The list of all the hardware 2940 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2941 * are run. sw_init initializes the software state associated with each IP 2942 * and hw_init initializes the hardware associated with each IP. 2943 * Returns 0 on success, negative error code on failure. 2944 */ 2945 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2946 { 2947 bool init_badpage; 2948 int i, r; 2949 2950 r = amdgpu_ras_init(adev); 2951 if (r) 2952 return r; 2953 2954 for (i = 0; i < adev->num_ip_blocks; i++) { 2955 if (!adev->ip_blocks[i].status.valid) 2956 continue; 2957 if (adev->ip_blocks[i].version->funcs->sw_init) { 2958 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2959 if (r) { 2960 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2961 adev->ip_blocks[i].version->funcs->name, r); 2962 goto init_failed; 2963 } 2964 } 2965 adev->ip_blocks[i].status.sw = true; 2966 2967 if (!amdgpu_ip_member_of_hwini( 2968 adev, adev->ip_blocks[i].version->type)) 2969 continue; 2970 2971 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2972 /* need to do common hw init early so everything is set up for gmc */ 2973 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2974 if (r) { 2975 DRM_ERROR("hw_init %d failed %d\n", i, r); 2976 goto init_failed; 2977 } 2978 adev->ip_blocks[i].status.hw = true; 2979 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2980 /* need to do gmc hw init early so we can allocate gpu mem */ 2981 /* Try to reserve bad pages early */ 2982 if (amdgpu_sriov_vf(adev)) 2983 amdgpu_virt_exchange_data(adev); 2984 2985 r = amdgpu_device_mem_scratch_init(adev); 2986 if (r) { 2987 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2988 goto init_failed; 2989 } 2990 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2991 if (r) { 2992 DRM_ERROR("hw_init %d failed %d\n", i, r); 2993 goto init_failed; 2994 } 2995 r = amdgpu_device_wb_init(adev); 2996 if (r) { 2997 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2998 goto init_failed; 2999 } 3000 adev->ip_blocks[i].status.hw = true; 3001 3002 /* right after GMC hw init, we create CSA */ 3003 if (adev->gfx.mcbp) { 3004 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3005 AMDGPU_GEM_DOMAIN_VRAM | 3006 AMDGPU_GEM_DOMAIN_GTT, 3007 AMDGPU_CSA_SIZE); 3008 if (r) { 3009 DRM_ERROR("allocate CSA failed %d\n", r); 3010 goto init_failed; 3011 } 3012 } 3013 3014 r = amdgpu_seq64_init(adev); 3015 if (r) { 3016 DRM_ERROR("allocate seq64 failed %d\n", r); 3017 goto init_failed; 3018 } 3019 } 3020 } 3021 3022 if (amdgpu_sriov_vf(adev)) 3023 amdgpu_virt_init_data_exchange(adev); 3024 3025 r = amdgpu_ib_pool_init(adev); 3026 if (r) { 3027 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3028 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3029 goto init_failed; 3030 } 3031 3032 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3033 if (r) 3034 goto init_failed; 3035 3036 r = amdgpu_device_ip_hw_init_phase1(adev); 3037 if (r) 3038 goto init_failed; 3039 3040 r = amdgpu_device_fw_loading(adev); 3041 if (r) 3042 goto init_failed; 3043 3044 r = amdgpu_device_ip_hw_init_phase2(adev); 3045 if (r) 3046 goto init_failed; 3047 3048 /* 3049 * retired pages will be loaded from eeprom and reserved here, 3050 * it should be called after 
amdgpu_device_ip_hw_init_phase2 since 3051 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3052 * for I2C communication which only true at this point. 3053 * 3054 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3055 * failure from bad gpu situation and stop amdgpu init process 3056 * accordingly. For other failed cases, it will still release all 3057 * the resource and print error message, rather than returning one 3058 * negative value to upper level. 3059 * 3060 * Note: theoretically, this should be called before all vram allocations 3061 * to protect retired page from abusing 3062 */ 3063 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3064 r = amdgpu_ras_recovery_init(adev, init_badpage); 3065 if (r) 3066 goto init_failed; 3067 3068 /** 3069 * In case of XGMI grab extra reference for reset domain for this device 3070 */ 3071 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3072 if (amdgpu_xgmi_add_device(adev) == 0) { 3073 if (!amdgpu_sriov_vf(adev)) { 3074 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3075 3076 if (WARN_ON(!hive)) { 3077 r = -ENOENT; 3078 goto init_failed; 3079 } 3080 3081 if (!hive->reset_domain || 3082 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3083 r = -ENOENT; 3084 amdgpu_put_xgmi_hive(hive); 3085 goto init_failed; 3086 } 3087 3088 /* Drop the early temporary reset domain we created for device */ 3089 amdgpu_reset_put_reset_domain(adev->reset_domain); 3090 adev->reset_domain = hive->reset_domain; 3091 amdgpu_put_xgmi_hive(hive); 3092 } 3093 } 3094 } 3095 3096 r = amdgpu_device_init_schedulers(adev); 3097 if (r) 3098 goto init_failed; 3099 3100 if (adev->mman.buffer_funcs_ring->sched.ready) 3101 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3102 3103 /* Don't init kfd if whole hive need to be reset during init */ 3104 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3105 kgd2kfd_init_zone_device(adev); 3106 amdgpu_amdkfd_device_init(adev); 3107 } 3108 3109 amdgpu_fru_get_product_info(adev); 3110 3111 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3112 r = amdgpu_cper_init(adev); 3113 3114 init_failed: 3115 3116 return r; 3117 } 3118 3119 /** 3120 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3121 * 3122 * @adev: amdgpu_device pointer 3123 * 3124 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3125 * this function before a GPU reset. If the value is retained after a 3126 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3127 */ 3128 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3129 { 3130 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3131 } 3132 3133 /** 3134 * amdgpu_device_check_vram_lost - check if vram is valid 3135 * 3136 * @adev: amdgpu_device pointer 3137 * 3138 * Checks the reset magic value written to the gart pointer in VRAM. 3139 * The driver calls this after a GPU reset to see if the contents of 3140 * VRAM is lost or now. 3141 * returns true if vram is lost, false if not. 3142 */ 3143 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3144 { 3145 if (memcmp(adev->gart.ptr, adev->reset_magic, 3146 AMDGPU_RESET_MAGIC_NUM)) 3147 return true; 3148 3149 if (!amdgpu_in_reset(adev)) 3150 return false; 3151 3152 /* 3153 * For all ASICs with baco/mode1 reset, the VRAM is 3154 * always assumed to be lost. 
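 * Even if the reset magic written by amdgpu_device_fill_reset_magic()
 * happens to survive one of those resets, we still conservatively report
 * the VRAM contents as lost.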
3155 */ 3156 switch (amdgpu_asic_reset_method(adev)) { 3157 case AMD_RESET_METHOD_BACO: 3158 case AMD_RESET_METHOD_MODE1: 3159 return true; 3160 default: 3161 return false; 3162 } 3163 } 3164 3165 /** 3166 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3167 * 3168 * @adev: amdgpu_device pointer 3169 * @state: clockgating state (gate or ungate) 3170 * 3171 * The list of all the hardware IPs that make up the asic is walked and the 3172 * set_clockgating_state callbacks are run. 3173 * Late initialization pass enabling clockgating for hardware IPs. 3174 * Fini or suspend, pass disabling clockgating for hardware IPs. 3175 * Returns 0 on success, negative error code on failure. 3176 */ 3177 3178 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3179 enum amd_clockgating_state state) 3180 { 3181 int i, j, r; 3182 3183 if (amdgpu_emu_mode == 1) 3184 return 0; 3185 3186 for (j = 0; j < adev->num_ip_blocks; j++) { 3187 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3188 if (!adev->ip_blocks[i].status.late_initialized) 3189 continue; 3190 /* skip CG for GFX, SDMA on S0ix */ 3191 if (adev->in_s0ix && 3192 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3193 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3194 continue; 3195 /* skip CG for VCE/UVD, it's handled specially */ 3196 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3197 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3198 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3199 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3200 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3201 /* enable clockgating to save power */ 3202 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3203 state); 3204 if (r) { 3205 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3206 adev->ip_blocks[i].version->funcs->name, r); 3207 return r; 3208 } 3209 } 3210 } 3211 3212 return 0; 3213 } 3214 3215 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3216 enum amd_powergating_state state) 3217 { 3218 int i, j, r; 3219 3220 if (amdgpu_emu_mode == 1) 3221 return 0; 3222 3223 for (j = 0; j < adev->num_ip_blocks; j++) { 3224 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3225 if (!adev->ip_blocks[i].status.late_initialized) 3226 continue; 3227 /* skip PG for GFX, SDMA on S0ix */ 3228 if (adev->in_s0ix && 3229 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3230 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3231 continue; 3232 /* skip CG for VCE/UVD, it's handled specially */ 3233 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3234 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3235 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3236 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3237 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3238 /* enable powergating to save power */ 3239 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3240 state); 3241 if (r) { 3242 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3243 adev->ip_blocks[i].version->funcs->name, r); 3244 return r; 3245 } 3246 } 3247 } 3248 return 0; 3249 } 3250 3251 static int amdgpu_device_enable_mgpu_fan_boost(void) 3252 { 3253 struct amdgpu_gpu_instance *gpu_ins; 3254 struct amdgpu_device *adev; 3255 int i, ret = 0; 3256 3257 mutex_lock(&mgpu_info.mutex); 3258 3259 /* 3260 * MGPU fan boost feature should be enabled 3261 * only when there are two or more dGPUs in 3262 * the system 3263 */ 3264 if (mgpu_info.num_dgpu < 2) 3265 goto out; 3266 3267 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3268 gpu_ins = &(mgpu_info.gpu_ins[i]); 3269 adev = gpu_ins->adev; 3270 if (!(adev->flags & AMD_IS_APU) && 3271 !gpu_ins->mgpu_fan_enabled) { 3272 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3273 if (ret) 3274 break; 3275 3276 gpu_ins->mgpu_fan_enabled = 1; 3277 } 3278 } 3279 3280 out: 3281 mutex_unlock(&mgpu_info.mutex); 3282 3283 return ret; 3284 } 3285 3286 /** 3287 * amdgpu_device_ip_late_init - run late init for hardware IPs 3288 * 3289 * @adev: amdgpu_device pointer 3290 * 3291 * Late initialization pass for hardware IPs. The list of all the hardware 3292 * IPs that make up the asic is walked and the late_init callbacks are run. 3293 * late_init covers any special initialization that an IP requires 3294 * after all of the have been initialized or something that needs to happen 3295 * late in the init process. 3296 * Returns 0 on success, negative error code on failure. 
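 * Clock and power gating are a typical example: they are enabled from here
 * through amdgpu_device_set_cg_state()/amdgpu_device_set_pg_state() only
 * once every block has completed hardware init.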
3297 */ 3298 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3299 { 3300 struct amdgpu_gpu_instance *gpu_instance; 3301 int i = 0, r; 3302 3303 for (i = 0; i < adev->num_ip_blocks; i++) { 3304 if (!adev->ip_blocks[i].status.hw) 3305 continue; 3306 if (adev->ip_blocks[i].version->funcs->late_init) { 3307 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3308 if (r) { 3309 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3310 adev->ip_blocks[i].version->funcs->name, r); 3311 return r; 3312 } 3313 } 3314 adev->ip_blocks[i].status.late_initialized = true; 3315 } 3316 3317 r = amdgpu_ras_late_init(adev); 3318 if (r) { 3319 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3320 return r; 3321 } 3322 3323 if (!amdgpu_reset_in_recovery(adev)) 3324 amdgpu_ras_set_error_query_ready(adev, true); 3325 3326 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3327 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3328 3329 amdgpu_device_fill_reset_magic(adev); 3330 3331 r = amdgpu_device_enable_mgpu_fan_boost(); 3332 if (r) 3333 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3334 3335 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3336 if (amdgpu_passthrough(adev) && 3337 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3338 adev->asic_type == CHIP_ALDEBARAN)) 3339 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3340 3341 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3342 mutex_lock(&mgpu_info.mutex); 3343 3344 /* 3345 * Reset device p-state to low as this was booted with high. 3346 * 3347 * This should be performed only after all devices from the same 3348 * hive get initialized. 3349 * 3350 * However, it's unknown how many device in the hive in advance. 3351 * As this is counted one by one during devices initializations. 3352 * 3353 * So, we wait for all XGMI interlinked devices initialized. 3354 * This may bring some delays as those devices may come from 3355 * different hives. But that should be OK. 
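 * mgpu_info.num_dgpu reaching the hive's node count is used below as the
 * signal that the last device in the hive has come up.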
3356 */ 3357 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3358 for (i = 0; i < mgpu_info.num_gpu; i++) { 3359 gpu_instance = &(mgpu_info.gpu_ins[i]); 3360 if (gpu_instance->adev->flags & AMD_IS_APU) 3361 continue; 3362 3363 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3364 AMDGPU_XGMI_PSTATE_MIN); 3365 if (r) { 3366 DRM_ERROR("pstate setting failed (%d).\n", r); 3367 break; 3368 } 3369 } 3370 } 3371 3372 mutex_unlock(&mgpu_info.mutex); 3373 } 3374 3375 return 0; 3376 } 3377 3378 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3379 { 3380 int r; 3381 3382 if (!ip_block->version->funcs->hw_fini) { 3383 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3384 ip_block->version->funcs->name); 3385 } else { 3386 r = ip_block->version->funcs->hw_fini(ip_block); 3387 /* XXX handle errors */ 3388 if (r) { 3389 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3390 ip_block->version->funcs->name, r); 3391 } 3392 } 3393 3394 ip_block->status.hw = false; 3395 } 3396 3397 /** 3398 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3399 * 3400 * @adev: amdgpu_device pointer 3401 * 3402 * For ASICs need to disable SMC first 3403 */ 3404 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3405 { 3406 int i; 3407 3408 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3409 return; 3410 3411 for (i = 0; i < adev->num_ip_blocks; i++) { 3412 if (!adev->ip_blocks[i].status.hw) 3413 continue; 3414 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3415 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3416 break; 3417 } 3418 } 3419 } 3420 3421 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3422 { 3423 int i, r; 3424 3425 for (i = 0; i < adev->num_ip_blocks; i++) { 3426 if (!adev->ip_blocks[i].version->funcs->early_fini) 3427 continue; 3428 3429 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3430 if (r) { 3431 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3432 adev->ip_blocks[i].version->funcs->name, r); 3433 } 3434 } 3435 3436 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3437 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3438 3439 amdgpu_amdkfd_suspend(adev, false); 3440 3441 /* Workaround for ASICs need to disable SMC first */ 3442 amdgpu_device_smu_fini_early(adev); 3443 3444 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3445 if (!adev->ip_blocks[i].status.hw) 3446 continue; 3447 3448 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3449 } 3450 3451 if (amdgpu_sriov_vf(adev)) { 3452 if (amdgpu_virt_release_full_gpu(adev, false)) 3453 DRM_ERROR("failed to release exclusive mode on fini\n"); 3454 } 3455 3456 return 0; 3457 } 3458 3459 /** 3460 * amdgpu_device_ip_fini - run fini for hardware IPs 3461 * 3462 * @adev: amdgpu_device pointer 3463 * 3464 * Main teardown pass for hardware IPs. The list of all the hardware 3465 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3466 * are run. hw_fini tears down the hardware associated with each IP 3467 * and sw_fini tears down any software state associated with each IP. 3468 * Returns 0 on success, negative error code on failure. 
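 * Both loops below walk the IP list in reverse so blocks are torn down in
 * the opposite order of their initialization.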
3469 */ 3470 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3471 { 3472 int i, r; 3473 3474 amdgpu_cper_fini(adev); 3475 3476 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3477 amdgpu_virt_release_ras_err_handler_data(adev); 3478 3479 if (adev->gmc.xgmi.num_physical_nodes > 1) 3480 amdgpu_xgmi_remove_device(adev); 3481 3482 amdgpu_amdkfd_device_fini_sw(adev); 3483 3484 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3485 if (!adev->ip_blocks[i].status.sw) 3486 continue; 3487 3488 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3489 amdgpu_ucode_free_bo(adev); 3490 amdgpu_free_static_csa(&adev->virt.csa_obj); 3491 amdgpu_device_wb_fini(adev); 3492 amdgpu_device_mem_scratch_fini(adev); 3493 amdgpu_ib_pool_fini(adev); 3494 amdgpu_seq64_fini(adev); 3495 } 3496 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3497 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3498 /* XXX handle errors */ 3499 if (r) { 3500 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3501 adev->ip_blocks[i].version->funcs->name, r); 3502 } 3503 } 3504 adev->ip_blocks[i].status.sw = false; 3505 adev->ip_blocks[i].status.valid = false; 3506 } 3507 3508 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3509 if (!adev->ip_blocks[i].status.late_initialized) 3510 continue; 3511 if (adev->ip_blocks[i].version->funcs->late_fini) 3512 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3513 adev->ip_blocks[i].status.late_initialized = false; 3514 } 3515 3516 amdgpu_ras_fini(adev); 3517 3518 return 0; 3519 } 3520 3521 /** 3522 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3523 * 3524 * @work: work_struct. 3525 */ 3526 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3527 { 3528 struct amdgpu_device *adev = 3529 container_of(work, struct amdgpu_device, delayed_init_work.work); 3530 int r; 3531 3532 r = amdgpu_ib_ring_tests(adev); 3533 if (r) 3534 DRM_ERROR("ib ring test failed (%d).\n", r); 3535 } 3536 3537 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3538 { 3539 struct amdgpu_device *adev = 3540 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3541 3542 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3543 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3544 3545 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3546 adev->gfx.gfx_off_state = true; 3547 } 3548 3549 /** 3550 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3551 * 3552 * @adev: amdgpu_device pointer 3553 * 3554 * Main suspend function for hardware IPs. The list of all the hardware 3555 * IPs that make up the asic is walked, clockgating is disabled and the 3556 * suspend callbacks are run. suspend puts the hardware and software state 3557 * in each IP into a state suitable for suspend. 3558 * Returns 0 on success, negative error code on failure. 3559 */ 3560 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3561 { 3562 int i, r; 3563 3564 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3565 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3566 3567 /* 3568 * Per PMFW team's suggestion, driver needs to handle gfxoff 3569 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3570 * scenario. Add the missing df cstate disablement here. 
3571 */ 3572 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3573 dev_warn(adev->dev, "Failed to disallow df cstate"); 3574 3575 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3576 if (!adev->ip_blocks[i].status.valid) 3577 continue; 3578 3579 /* displays are handled separately */ 3580 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3581 continue; 3582 3583 /* XXX handle errors */ 3584 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3585 if (r) 3586 return r; 3587 } 3588 3589 return 0; 3590 } 3591 3592 /** 3593 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3594 * 3595 * @adev: amdgpu_device pointer 3596 * 3597 * Main suspend function for hardware IPs. The list of all the hardware 3598 * IPs that make up the asic is walked, clockgating is disabled and the 3599 * suspend callbacks are run. suspend puts the hardware and software state 3600 * in each IP into a state suitable for suspend. 3601 * Returns 0 on success, negative error code on failure. 3602 */ 3603 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3604 { 3605 int i, r; 3606 3607 if (adev->in_s0ix) 3608 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3609 3610 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3611 if (!adev->ip_blocks[i].status.valid) 3612 continue; 3613 /* displays are handled in phase1 */ 3614 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3615 continue; 3616 /* PSP lost connection when err_event_athub occurs */ 3617 if (amdgpu_ras_intr_triggered() && 3618 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3619 adev->ip_blocks[i].status.hw = false; 3620 continue; 3621 } 3622 3623 /* skip unnecessary suspend if we do not initialize them yet */ 3624 if (!amdgpu_ip_member_of_hwini( 3625 adev, adev->ip_blocks[i].version->type)) 3626 continue; 3627 3628 /* skip suspend of gfx/mes and psp for S0ix 3629 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3630 * like at runtime. PSP is also part of the always on hardware 3631 * so no need to suspend it. 3632 */ 3633 if (adev->in_s0ix && 3634 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3635 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3636 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3637 continue; 3638 3639 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3640 if (adev->in_s0ix && 3641 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3642 IP_VERSION(5, 0, 0)) && 3643 (adev->ip_blocks[i].version->type == 3644 AMD_IP_BLOCK_TYPE_SDMA)) 3645 continue; 3646 3647 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3648 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3649 * from this location and RLC Autoload automatically also gets loaded 3650 * from here based on PMFW -> PSP message during re-init sequence. 3651 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3652 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3653 */ 3654 if (amdgpu_in_reset(adev) && 3655 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3656 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3657 continue; 3658 3659 /* XXX handle errors */ 3660 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3661 adev->ip_blocks[i].status.hw = false; 3662 3663 /* handle putting the SMC in the appropriate state */ 3664 if (!amdgpu_sriov_vf(adev)) { 3665 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3666 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3667 if (r) { 3668 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3669 adev->mp1_state, r); 3670 return r; 3671 } 3672 } 3673 } 3674 } 3675 3676 return 0; 3677 } 3678 3679 /** 3680 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3681 * 3682 * @adev: amdgpu_device pointer 3683 * 3684 * Main suspend function for hardware IPs. The list of all the hardware 3685 * IPs that make up the asic is walked, clockgating is disabled and the 3686 * suspend callbacks are run. suspend puts the hardware and software state 3687 * in each IP into a state suitable for suspend. 3688 * Returns 0 on success, negative error code on failure. 3689 */ 3690 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3691 { 3692 int r; 3693 3694 if (amdgpu_sriov_vf(adev)) { 3695 amdgpu_virt_fini_data_exchange(adev); 3696 amdgpu_virt_request_full_gpu(adev, false); 3697 } 3698 3699 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3700 3701 r = amdgpu_device_ip_suspend_phase1(adev); 3702 if (r) 3703 return r; 3704 r = amdgpu_device_ip_suspend_phase2(adev); 3705 3706 if (amdgpu_sriov_vf(adev)) 3707 amdgpu_virt_release_full_gpu(adev, false); 3708 3709 return r; 3710 } 3711 3712 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3713 { 3714 int i, r; 3715 3716 static enum amd_ip_block_type ip_order[] = { 3717 AMD_IP_BLOCK_TYPE_COMMON, 3718 AMD_IP_BLOCK_TYPE_GMC, 3719 AMD_IP_BLOCK_TYPE_PSP, 3720 AMD_IP_BLOCK_TYPE_IH, 3721 }; 3722 3723 for (i = 0; i < adev->num_ip_blocks; i++) { 3724 int j; 3725 struct amdgpu_ip_block *block; 3726 3727 block = &adev->ip_blocks[i]; 3728 block->status.hw = false; 3729 3730 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3731 3732 if (block->version->type != ip_order[j] || 3733 !block->status.valid) 3734 continue; 3735 3736 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3737 if (r) { 3738 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3739 block->version->funcs->name); 3740 return r; 3741 } 3742 block->status.hw = true; 3743 } 3744 } 3745 3746 return 0; 3747 } 3748 3749 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3750 { 3751 struct amdgpu_ip_block *block; 3752 int i, r = 0; 3753 3754 static enum amd_ip_block_type ip_order[] = { 3755 AMD_IP_BLOCK_TYPE_SMC, 3756 AMD_IP_BLOCK_TYPE_DCE, 3757 AMD_IP_BLOCK_TYPE_GFX, 3758 AMD_IP_BLOCK_TYPE_SDMA, 3759 AMD_IP_BLOCK_TYPE_MES, 3760 AMD_IP_BLOCK_TYPE_UVD, 3761 AMD_IP_BLOCK_TYPE_VCE, 3762 AMD_IP_BLOCK_TYPE_VCN, 3763 AMD_IP_BLOCK_TYPE_JPEG 3764 }; 3765 3766 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3767 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3768 3769 if (!block) 3770 continue; 3771 3772 if (block->status.valid && !block->status.hw) { 3773 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3774 r = amdgpu_ip_block_resume(block); 3775 } else { 3776 r = block->version->funcs->hw_init(block); 3777 } 3778 3779 if (r) { 3780 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3781 block->version->funcs->name); 3782 break; 3783 } 3784 
block->status.hw = true; 3785 } 3786 } 3787 3788 return r; 3789 } 3790 3791 /** 3792 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3793 * 3794 * @adev: amdgpu_device pointer 3795 * 3796 * First resume function for hardware IPs. The list of all the hardware 3797 * IPs that make up the asic is walked and the resume callbacks are run for 3798 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3799 * after a suspend and updates the software state as necessary. This 3800 * function is also used for restoring the GPU after a GPU reset. 3801 * Returns 0 on success, negative error code on failure. 3802 */ 3803 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3804 { 3805 int i, r; 3806 3807 for (i = 0; i < adev->num_ip_blocks; i++) { 3808 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3809 continue; 3810 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3811 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3812 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3813 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3814 3815 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3816 if (r) 3817 return r; 3818 } 3819 } 3820 3821 return 0; 3822 } 3823 3824 /** 3825 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3826 * 3827 * @adev: amdgpu_device pointer 3828 * 3829 * Second resume function for hardware IPs. The list of all the hardware 3830 * IPs that make up the asic is walked and the resume callbacks are run for 3831 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3832 * functional state after a suspend and updates the software state as 3833 * necessary. This function is also used for restoring the GPU after a GPU 3834 * reset. 3835 * Returns 0 on success, negative error code on failure. 3836 */ 3837 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3838 { 3839 int i, r; 3840 3841 for (i = 0; i < adev->num_ip_blocks; i++) { 3842 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3843 continue; 3844 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3845 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3846 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3847 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3848 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3849 continue; 3850 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3851 if (r) 3852 return r; 3853 } 3854 3855 return 0; 3856 } 3857 3858 /** 3859 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3860 * 3861 * @adev: amdgpu_device pointer 3862 * 3863 * Third resume function for hardware IPs. The list of all the hardware 3864 * IPs that make up the asic is walked and the resume callbacks are run for 3865 * all DCE. resume puts the hardware into a functional state after a suspend 3866 * and updates the software state as necessary. This function is also used 3867 * for restoring the GPU after a GPU reset. 3868 * 3869 * Returns 0 on success, negative error code on failure. 
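 * Note that DCE is deliberately resumed last: amdgpu_device_ip_resume() calls
 * this phase only after firmware loading, the buffer functions and the fence
 * driver hw init have completed.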
 */
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into several resume functions because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	if (r)
		return r;

	amdgpu_fence_driver_hw_init(adev);

	r = amdgpu_device_ip_resume_phase3(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
#if defined(CONFIG_DRM_AMD_DC_SI)
		return amdgpu_dc > 0;
#else
		return false;
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * VGA support which is not supported with DC.
3991 * 3992 * Fallback to the non-DC driver here by default so as not to 3993 * cause regressions. 3994 */ 3995 return amdgpu_dc > 0; 3996 default: 3997 return amdgpu_dc != 0; 3998 #else 3999 default: 4000 if (amdgpu_dc > 0) 4001 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4002 return false; 4003 #endif 4004 } 4005 } 4006 4007 /** 4008 * amdgpu_device_has_dc_support - check if dc is supported 4009 * 4010 * @adev: amdgpu_device pointer 4011 * 4012 * Returns true for supported, false for not supported 4013 */ 4014 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4015 { 4016 if (adev->enable_virtual_display || 4017 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4018 return false; 4019 4020 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4021 } 4022 4023 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4024 { 4025 struct amdgpu_device *adev = 4026 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4027 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4028 4029 /* It's a bug to not have a hive within this function */ 4030 if (WARN_ON(!hive)) 4031 return; 4032 4033 /* 4034 * Use task barrier to synchronize all xgmi reset works across the 4035 * hive. task_barrier_enter and task_barrier_exit will block 4036 * until all the threads running the xgmi reset works reach 4037 * those points. task_barrier_full will do both blocks. 4038 */ 4039 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4040 4041 task_barrier_enter(&hive->tb); 4042 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4043 4044 if (adev->asic_reset_res) 4045 goto fail; 4046 4047 task_barrier_exit(&hive->tb); 4048 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4049 4050 if (adev->asic_reset_res) 4051 goto fail; 4052 4053 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4054 } else { 4055 4056 task_barrier_full(&hive->tb); 4057 adev->asic_reset_res = amdgpu_asic_reset(adev); 4058 } 4059 4060 fail: 4061 if (adev->asic_reset_res) 4062 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4063 adev->asic_reset_res, adev_to_drm(adev)->unique); 4064 amdgpu_put_xgmi_hive(hive); 4065 } 4066 4067 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4068 { 4069 char *input = amdgpu_lockup_timeout; 4070 char *timeout_setting = NULL; 4071 int index = 0; 4072 long timeout; 4073 int ret = 0; 4074 4075 /* 4076 * By default timeout for non compute jobs is 10000 4077 * and 60000 for compute jobs. 4078 * In SR-IOV or passthrough mode, timeout for compute 4079 * jobs are 60000 by default. 4080 */ 4081 adev->gfx_timeout = msecs_to_jiffies(10000); 4082 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4083 if (amdgpu_sriov_vf(adev)) 4084 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4085 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4086 else 4087 adev->compute_timeout = msecs_to_jiffies(60000); 4088 4089 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4090 while ((timeout_setting = strsep(&input, ",")) && 4091 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4092 ret = kstrtol(timeout_setting, 0, &timeout); 4093 if (ret) 4094 return ret; 4095 4096 if (timeout == 0) { 4097 index++; 4098 continue; 4099 } else if (timeout < 0) { 4100 timeout = MAX_SCHEDULE_TIMEOUT; 4101 dev_warn(adev->dev, "lockup timeout disabled"); 4102 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4103 } else { 4104 timeout = msecs_to_jiffies(timeout); 4105 } 4106 4107 switch (index++) { 4108 case 0: 4109 adev->gfx_timeout = timeout; 4110 break; 4111 case 1: 4112 adev->compute_timeout = timeout; 4113 break; 4114 case 2: 4115 adev->sdma_timeout = timeout; 4116 break; 4117 case 3: 4118 adev->video_timeout = timeout; 4119 break; 4120 default: 4121 break; 4122 } 4123 } 4124 /* 4125 * There is only one value specified and 4126 * it should apply to all non-compute jobs. 4127 */ 4128 if (index == 1) { 4129 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4130 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4131 adev->compute_timeout = adev->gfx_timeout; 4132 } 4133 } 4134 4135 return ret; 4136 } 4137 4138 /** 4139 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4140 * 4141 * @adev: amdgpu_device pointer 4142 * 4143 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4144 */ 4145 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4146 { 4147 struct iommu_domain *domain; 4148 4149 domain = iommu_get_domain_for_dev(adev->dev); 4150 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4151 adev->ram_is_direct_mapped = true; 4152 } 4153 4154 #if defined(CONFIG_HSA_AMD_P2P) 4155 /** 4156 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4157 * 4158 * @adev: amdgpu_device pointer 4159 * 4160 * return if IOMMU remapping bar address 4161 */ 4162 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4163 { 4164 struct iommu_domain *domain; 4165 4166 domain = iommu_get_domain_for_dev(adev->dev); 4167 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4168 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4169 return true; 4170 4171 return false; 4172 } 4173 #endif 4174 4175 static const struct attribute *amdgpu_dev_attributes[] = { 4176 &dev_attr_pcie_replay_count.attr, 4177 NULL 4178 }; 4179 4180 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4181 { 4182 if (amdgpu_mcbp == 1) 4183 adev->gfx.mcbp = true; 4184 else if (amdgpu_mcbp == 0) 4185 adev->gfx.mcbp = false; 4186 4187 if (amdgpu_sriov_vf(adev)) 4188 adev->gfx.mcbp = true; 4189 4190 if (adev->gfx.mcbp) 4191 DRM_INFO("MCBP is enabled\n"); 4192 } 4193 4194 /** 4195 * amdgpu_device_init - initialize the driver 4196 * 4197 * @adev: amdgpu_device pointer 4198 * @flags: driver flags 4199 * 4200 * Initializes the driver info and hw (all asics). 4201 * Returns 0 for success or an error on failure. 4202 * Called at driver startup. 
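 * The sequence is roughly: MMIO/doorbell mapping and early IP init, an
 * optional ASIC reset and vBIOS posting, fence driver and IP hw/sw init,
 * and finally late init together with the sysfs/pm interface registration.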
4203 */ 4204 int amdgpu_device_init(struct amdgpu_device *adev, 4205 uint32_t flags) 4206 { 4207 struct drm_device *ddev = adev_to_drm(adev); 4208 struct pci_dev *pdev = adev->pdev; 4209 int r, i; 4210 bool px = false; 4211 u32 max_MBps; 4212 int tmp; 4213 4214 adev->shutdown = false; 4215 adev->flags = flags; 4216 4217 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4218 adev->asic_type = amdgpu_force_asic_type; 4219 else 4220 adev->asic_type = flags & AMD_ASIC_MASK; 4221 4222 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4223 if (amdgpu_emu_mode == 1) 4224 adev->usec_timeout *= 10; 4225 adev->gmc.gart_size = 512 * 1024 * 1024; 4226 adev->accel_working = false; 4227 adev->num_rings = 0; 4228 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4229 adev->mman.buffer_funcs = NULL; 4230 adev->mman.buffer_funcs_ring = NULL; 4231 adev->vm_manager.vm_pte_funcs = NULL; 4232 adev->vm_manager.vm_pte_num_scheds = 0; 4233 adev->gmc.gmc_funcs = NULL; 4234 adev->harvest_ip_mask = 0x0; 4235 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4236 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4237 4238 adev->smc_rreg = &amdgpu_invalid_rreg; 4239 adev->smc_wreg = &amdgpu_invalid_wreg; 4240 adev->pcie_rreg = &amdgpu_invalid_rreg; 4241 adev->pcie_wreg = &amdgpu_invalid_wreg; 4242 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4243 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4244 adev->pciep_rreg = &amdgpu_invalid_rreg; 4245 adev->pciep_wreg = &amdgpu_invalid_wreg; 4246 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4247 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4248 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4249 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4250 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4251 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4252 adev->didt_rreg = &amdgpu_invalid_rreg; 4253 adev->didt_wreg = &amdgpu_invalid_wreg; 4254 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4255 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4256 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4257 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4258 4259 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4260 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4261 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4262 4263 /* mutex initialization are all done here so we 4264 * can recall function without having locking issues 4265 */ 4266 mutex_init(&adev->firmware.mutex); 4267 mutex_init(&adev->pm.mutex); 4268 mutex_init(&adev->gfx.gpu_clock_mutex); 4269 mutex_init(&adev->srbm_mutex); 4270 mutex_init(&adev->gfx.pipe_reserve_mutex); 4271 mutex_init(&adev->gfx.gfx_off_mutex); 4272 mutex_init(&adev->gfx.partition_mutex); 4273 mutex_init(&adev->grbm_idx_mutex); 4274 mutex_init(&adev->mn_lock); 4275 mutex_init(&adev->virt.vf_errors.lock); 4276 hash_init(adev->mn_hash); 4277 mutex_init(&adev->psp.mutex); 4278 mutex_init(&adev->notifier_lock); 4279 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4280 mutex_init(&adev->benchmark_mutex); 4281 mutex_init(&adev->gfx.reset_sem_mutex); 4282 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4283 mutex_init(&adev->enforce_isolation_mutex); 4284 mutex_init(&adev->gfx.kfd_sch_mutex); 4285 mutex_init(&adev->gfx.workload_profile_mutex); 4286 mutex_init(&adev->vcn.workload_profile_mutex); 4287 4288 amdgpu_device_init_apu_flags(adev); 4289 4290 r = amdgpu_device_check_arguments(adev); 4291 
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->virt.rlcg_reg_lock);
	spin_lock_init(&adev->wb.lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before the XGMI hive
	 * (if any) is discovered and initialized, so that the reset semaphore
	 * and the GPU-in-reset flag can be used early during init and before
	 * any call to RREG32.
 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_virt_init(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use the default init level where all blocks are expected
	 * to be initialized. At present, 'swinit' of the blocks must complete
	 * before the need for a different level can be detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/*
	 * No need to remove conflicting FBs for non-display class devices.
	 * This prevents the sysfb from being freed accidentally.
	 */
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
		/* Get rid of things like offb */
		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
		if (r)
			return r;
	}

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs with GFX9 and newer don't rely on PCIe atomics; their internal
	 * path natively supports atomics, so set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
						       PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
						       PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIE atomic ops is not supported\n");

	/* doorbell bar mapping and doorbell index init */
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect whether we are running with an SR-IOV vbios */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 * E.g., driver was not cleanly unloaded previously, etc.
4474 */ 4475 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4476 if (adev->gmc.xgmi.num_physical_nodes) { 4477 dev_info(adev->dev, "Pending hive reset.\n"); 4478 amdgpu_set_init_level(adev, 4479 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4480 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4481 !amdgpu_device_has_display_hardware(adev)) { 4482 r = psp_gpu_reset(adev); 4483 } else { 4484 tmp = amdgpu_reset_method; 4485 /* It should do a default reset when loading or reloading the driver, 4486 * regardless of the module parameter reset_method. 4487 */ 4488 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4489 r = amdgpu_asic_reset(adev); 4490 amdgpu_reset_method = tmp; 4491 } 4492 4493 if (r) { 4494 dev_err(adev->dev, "asic reset on init failed\n"); 4495 goto failed; 4496 } 4497 } 4498 4499 /* Post card if necessary */ 4500 if (amdgpu_device_need_post(adev)) { 4501 if (!adev->bios) { 4502 dev_err(adev->dev, "no vBIOS found\n"); 4503 r = -EINVAL; 4504 goto failed; 4505 } 4506 DRM_INFO("GPU posting now...\n"); 4507 r = amdgpu_device_asic_init(adev); 4508 if (r) { 4509 dev_err(adev->dev, "gpu post error!\n"); 4510 goto failed; 4511 } 4512 } 4513 4514 if (adev->bios) { 4515 if (adev->is_atom_fw) { 4516 /* Initialize clocks */ 4517 r = amdgpu_atomfirmware_get_clock_info(adev); 4518 if (r) { 4519 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4520 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4521 goto failed; 4522 } 4523 } else { 4524 /* Initialize clocks */ 4525 r = amdgpu_atombios_get_clock_info(adev); 4526 if (r) { 4527 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4528 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4529 goto failed; 4530 } 4531 /* init i2c buses */ 4532 amdgpu_i2c_init(adev); 4533 } 4534 } 4535 4536 fence_driver_init: 4537 /* Fence driver */ 4538 r = amdgpu_fence_driver_sw_init(adev); 4539 if (r) { 4540 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4541 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4542 goto failed; 4543 } 4544 4545 /* init the mode config */ 4546 drm_mode_config_init(adev_to_drm(adev)); 4547 4548 r = amdgpu_device_ip_init(adev); 4549 if (r) { 4550 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4551 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4552 goto release_ras_con; 4553 } 4554 4555 amdgpu_fence_driver_hw_init(adev); 4556 4557 dev_info(adev->dev, 4558 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4559 adev->gfx.config.max_shader_engines, 4560 adev->gfx.config.max_sh_per_se, 4561 adev->gfx.config.max_cu_per_sh, 4562 adev->gfx.cu_info.number); 4563 4564 adev->accel_working = true; 4565 4566 amdgpu_vm_check_compute_bug(adev); 4567 4568 /* Initialize the buffer migration limit. */ 4569 if (amdgpu_moverate >= 0) 4570 max_MBps = amdgpu_moverate; 4571 else 4572 max_MBps = 8; /* Allow 8 MB/s. */ 4573 /* Get a log2 for easy divisions. */ 4574 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4575 4576 /* 4577 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4578 * Otherwise the mgpu fan boost feature will be skipped due to the 4579 * gpu instance is counted less. 4580 */ 4581 amdgpu_register_gpu_instance(adev); 4582 4583 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4584 * explicit gating rather than handling it automatically. 
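	 * While the device is still at the minimal XGMI init level, late init
	 * and RAS resume are skipped here and are performed later, as part of
	 * the reset-on-init handling for the hive (see amdgpu_xgmi_reset_on_init()).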
4585 */ 4586 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4587 r = amdgpu_device_ip_late_init(adev); 4588 if (r) { 4589 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4590 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4591 goto release_ras_con; 4592 } 4593 /* must succeed. */ 4594 amdgpu_ras_resume(adev); 4595 queue_delayed_work(system_wq, &adev->delayed_init_work, 4596 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4597 } 4598 4599 if (amdgpu_sriov_vf(adev)) { 4600 amdgpu_virt_release_full_gpu(adev, true); 4601 flush_delayed_work(&adev->delayed_init_work); 4602 } 4603 4604 /* 4605 * Place those sysfs registering after `late_init`. As some of those 4606 * operations performed in `late_init` might affect the sysfs 4607 * interfaces creating. 4608 */ 4609 r = amdgpu_atombios_sysfs_init(adev); 4610 if (r) 4611 drm_err(&adev->ddev, 4612 "registering atombios sysfs failed (%d).\n", r); 4613 4614 r = amdgpu_pm_sysfs_init(adev); 4615 if (r) 4616 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4617 4618 r = amdgpu_ucode_sysfs_init(adev); 4619 if (r) { 4620 adev->ucode_sysfs_en = false; 4621 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4622 } else 4623 adev->ucode_sysfs_en = true; 4624 4625 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4626 if (r) 4627 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4628 4629 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4630 if (r) 4631 dev_err(adev->dev, 4632 "Could not create amdgpu board attributes\n"); 4633 4634 amdgpu_fru_sysfs_init(adev); 4635 amdgpu_reg_state_sysfs_init(adev); 4636 amdgpu_xcp_cfg_sysfs_init(adev); 4637 4638 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4639 r = amdgpu_pmu_init(adev); 4640 if (r) 4641 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4642 4643 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4644 if (amdgpu_device_cache_pci_state(adev->pdev)) 4645 pci_restore_state(pdev); 4646 4647 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4648 /* this will fail for cards that aren't VGA class devices, just 4649 * ignore it 4650 */ 4651 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4652 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4653 4654 px = amdgpu_device_supports_px(ddev); 4655 4656 if (px || (!dev_is_removable(&adev->pdev->dev) && 4657 apple_gmux_detect(NULL, NULL))) 4658 vga_switcheroo_register_client(adev->pdev, 4659 &amdgpu_switcheroo_ops, px); 4660 4661 if (px) 4662 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4663 4664 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4665 amdgpu_xgmi_reset_on_init(adev); 4666 4667 amdgpu_device_check_iommu_direct_map(adev); 4668 4669 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4670 r = register_pm_notifier(&adev->pm_nb); 4671 if (r) 4672 goto failed; 4673 4674 return 0; 4675 4676 release_ras_con: 4677 if (amdgpu_sriov_vf(adev)) 4678 amdgpu_virt_release_full_gpu(adev, true); 4679 4680 /* failed in exclusive mode due to timeout */ 4681 if (amdgpu_sriov_vf(adev) && 4682 !amdgpu_sriov_runtime(adev) && 4683 amdgpu_virt_mmio_blocked(adev) && 4684 !amdgpu_virt_wait_reset(adev)) { 4685 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4686 /* Don't send request since VF is inactive. 
*/ 4687 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4688 adev->virt.ops = NULL; 4689 r = -EAGAIN; 4690 } 4691 amdgpu_release_ras_context(adev); 4692 4693 failed: 4694 amdgpu_vf_error_trans_all(adev); 4695 4696 return r; 4697 } 4698 4699 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4700 { 4701 4702 /* Clear all CPU mappings pointing to this device */ 4703 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4704 4705 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4706 amdgpu_doorbell_fini(adev); 4707 4708 iounmap(adev->rmmio); 4709 adev->rmmio = NULL; 4710 if (adev->mman.aper_base_kaddr) 4711 iounmap(adev->mman.aper_base_kaddr); 4712 adev->mman.aper_base_kaddr = NULL; 4713 4714 /* Memory manager related */ 4715 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4716 arch_phys_wc_del(adev->gmc.vram_mtrr); 4717 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4718 } 4719 } 4720 4721 /** 4722 * amdgpu_device_fini_hw - tear down the driver 4723 * 4724 * @adev: amdgpu_device pointer 4725 * 4726 * Tear down the driver info (all asics). 4727 * Called at driver shutdown. 4728 */ 4729 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4730 { 4731 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4732 flush_delayed_work(&adev->delayed_init_work); 4733 4734 if (adev->mman.initialized) 4735 drain_workqueue(adev->mman.bdev.wq); 4736 adev->shutdown = true; 4737 4738 unregister_pm_notifier(&adev->pm_nb); 4739 4740 /* make sure IB test finished before entering exclusive mode 4741 * to avoid preemption on IB test 4742 */ 4743 if (amdgpu_sriov_vf(adev)) { 4744 amdgpu_virt_request_full_gpu(adev, false); 4745 amdgpu_virt_fini_data_exchange(adev); 4746 } 4747 4748 /* disable all interrupts */ 4749 amdgpu_irq_disable_all(adev); 4750 if (adev->mode_info.mode_config_initialized) { 4751 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4752 drm_helper_force_disable_all(adev_to_drm(adev)); 4753 else 4754 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4755 } 4756 amdgpu_fence_driver_hw_fini(adev); 4757 4758 if (adev->pm.sysfs_initialized) 4759 amdgpu_pm_sysfs_fini(adev); 4760 if (adev->ucode_sysfs_en) 4761 amdgpu_ucode_sysfs_fini(adev); 4762 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4763 amdgpu_fru_sysfs_fini(adev); 4764 4765 amdgpu_reg_state_sysfs_fini(adev); 4766 amdgpu_xcp_cfg_sysfs_fini(adev); 4767 4768 /* disable ras feature must before hw fini */ 4769 amdgpu_ras_pre_fini(adev); 4770 4771 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4772 4773 amdgpu_device_ip_fini_early(adev); 4774 4775 amdgpu_irq_fini_hw(adev); 4776 4777 if (adev->mman.initialized) 4778 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4779 4780 amdgpu_gart_dummy_page_fini(adev); 4781 4782 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4783 amdgpu_device_unmap_mmio(adev); 4784 4785 } 4786 4787 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4788 { 4789 int idx; 4790 bool px; 4791 4792 amdgpu_device_ip_fini(adev); 4793 amdgpu_fence_driver_sw_fini(adev); 4794 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4795 adev->accel_working = false; 4796 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4797 4798 amdgpu_reset_fini(adev); 4799 4800 /* free i2c buses */ 4801 amdgpu_i2c_fini(adev); 4802 4803 if (adev->bios) { 4804 if (amdgpu_emu_mode != 1) 4805 amdgpu_atombios_fini(adev); 4806 amdgpu_bios_release(adev); 4807 } 4808 4809 kfree(adev->fru_info); 4810 adev->fru_info = NULL; 4811 4812 kfree(adev->xcp_mgr); 4813 
adev->xcp_mgr = NULL; 4814 4815 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4816 4817 if (px || (!dev_is_removable(&adev->pdev->dev) && 4818 apple_gmux_detect(NULL, NULL))) 4819 vga_switcheroo_unregister_client(adev->pdev); 4820 4821 if (px) 4822 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4823 4824 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4825 vga_client_unregister(adev->pdev); 4826 4827 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4828 4829 iounmap(adev->rmmio); 4830 adev->rmmio = NULL; 4831 amdgpu_doorbell_fini(adev); 4832 drm_dev_exit(idx); 4833 } 4834 4835 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4836 amdgpu_pmu_fini(adev); 4837 if (adev->mman.discovery_bin) 4838 amdgpu_discovery_fini(adev); 4839 4840 amdgpu_reset_put_reset_domain(adev->reset_domain); 4841 adev->reset_domain = NULL; 4842 4843 kfree(adev->pci_state); 4844 4845 } 4846 4847 /** 4848 * amdgpu_device_evict_resources - evict device resources 4849 * @adev: amdgpu device object 4850 * 4851 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4852 * of the vram memory type. Mainly used for evicting device resources 4853 * at suspend time. 4854 * 4855 */ 4856 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4857 { 4858 int ret; 4859 4860 /* No need to evict vram on APUs unless going to S4 */ 4861 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4862 return 0; 4863 4864 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4865 if (ret) 4866 DRM_WARN("evicting device resources failed\n"); 4867 return ret; 4868 } 4869 4870 /* 4871 * Suspend & resume. 4872 */ 4873 /** 4874 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4875 * @nb: notifier block 4876 * @mode: suspend mode 4877 * @data: data 4878 * 4879 * This function is called when the system is about to suspend or hibernate. 4880 * It is used to evict resources from the device before the system goes to 4881 * sleep while there is still access to swap. 4882 */ 4883 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4884 void *data) 4885 { 4886 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4887 int r; 4888 4889 switch (mode) { 4890 case PM_HIBERNATION_PREPARE: 4891 adev->in_s4 = true; 4892 fallthrough; 4893 case PM_SUSPEND_PREPARE: 4894 r = amdgpu_device_evict_resources(adev); 4895 /* 4896 * This is considered non-fatal at this time because 4897 * amdgpu_device_prepare() will also fatally evict resources. 4898 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4899 */ 4900 if (r) 4901 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4902 break; 4903 } 4904 4905 return NOTIFY_DONE; 4906 } 4907 4908 /** 4909 * amdgpu_device_prepare - prepare for device suspend 4910 * 4911 * @dev: drm dev pointer 4912 * 4913 * Prepare to put the hw in the suspend state (all asics). 4914 * Returns 0 for success or an error on failure. 4915 * Called at driver suspend. 
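 * This evicts the bulk of the VRAM buffer objects and invokes each IP
 * block's prepare_suspend() callback where one is provided.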
4916 */ 4917 int amdgpu_device_prepare(struct drm_device *dev) 4918 { 4919 struct amdgpu_device *adev = drm_to_adev(dev); 4920 int i, r; 4921 4922 amdgpu_choose_low_power_state(adev); 4923 4924 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4925 return 0; 4926 4927 /* Evict the majority of BOs before starting suspend sequence */ 4928 r = amdgpu_device_evict_resources(adev); 4929 if (r) 4930 goto unprepare; 4931 4932 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4933 4934 for (i = 0; i < adev->num_ip_blocks; i++) { 4935 if (!adev->ip_blocks[i].status.valid) 4936 continue; 4937 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4938 continue; 4939 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4940 if (r) 4941 goto unprepare; 4942 } 4943 4944 return 0; 4945 4946 unprepare: 4947 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 4948 4949 return r; 4950 } 4951 4952 /** 4953 * amdgpu_device_suspend - initiate device suspend 4954 * 4955 * @dev: drm dev pointer 4956 * @notify_clients: notify in-kernel DRM clients 4957 * 4958 * Puts the hw in the suspend state (all asics). 4959 * Returns 0 for success or an error on failure. 4960 * Called at driver suspend. 4961 */ 4962 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4963 { 4964 struct amdgpu_device *adev = drm_to_adev(dev); 4965 int r = 0; 4966 4967 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4968 return 0; 4969 4970 adev->in_suspend = true; 4971 4972 if (amdgpu_sriov_vf(adev)) { 4973 amdgpu_virt_fini_data_exchange(adev); 4974 r = amdgpu_virt_request_full_gpu(adev, false); 4975 if (r) 4976 return r; 4977 } 4978 4979 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4980 DRM_WARN("smart shift update failed\n"); 4981 4982 if (notify_clients) 4983 drm_client_dev_suspend(adev_to_drm(adev), false); 4984 4985 cancel_delayed_work_sync(&adev->delayed_init_work); 4986 4987 amdgpu_ras_suspend(adev); 4988 4989 amdgpu_device_ip_suspend_phase1(adev); 4990 4991 if (!adev->in_s0ix) 4992 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4993 4994 r = amdgpu_device_evict_resources(adev); 4995 if (r) 4996 return r; 4997 4998 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4999 5000 amdgpu_fence_driver_hw_fini(adev); 5001 5002 amdgpu_device_ip_suspend_phase2(adev); 5003 5004 if (amdgpu_sriov_vf(adev)) 5005 amdgpu_virt_release_full_gpu(adev, false); 5006 5007 r = amdgpu_dpm_notify_rlc_state(adev, false); 5008 if (r) 5009 return r; 5010 5011 return 0; 5012 } 5013 5014 /** 5015 * amdgpu_device_resume - initiate device resume 5016 * 5017 * @dev: drm dev pointer 5018 * @notify_clients: notify in-kernel DRM clients 5019 * 5020 * Bring the hw back to operating state (all asics). 5021 * Returns 0 for success or an error on failure. 5022 * Called at driver resume. 
5023 */ 5024 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5025 { 5026 struct amdgpu_device *adev = drm_to_adev(dev); 5027 int r = 0; 5028 5029 if (amdgpu_sriov_vf(adev)) { 5030 r = amdgpu_virt_request_full_gpu(adev, true); 5031 if (r) 5032 return r; 5033 } 5034 5035 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5036 return 0; 5037 5038 if (adev->in_s0ix) 5039 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5040 5041 /* post card */ 5042 if (amdgpu_device_need_post(adev)) { 5043 r = amdgpu_device_asic_init(adev); 5044 if (r) 5045 dev_err(adev->dev, "amdgpu asic init failed\n"); 5046 } 5047 5048 r = amdgpu_device_ip_resume(adev); 5049 5050 if (r) { 5051 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5052 goto exit; 5053 } 5054 5055 if (!adev->in_s0ix) { 5056 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5057 if (r) 5058 goto exit; 5059 } 5060 5061 r = amdgpu_device_ip_late_init(adev); 5062 if (r) 5063 goto exit; 5064 5065 queue_delayed_work(system_wq, &adev->delayed_init_work, 5066 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5067 exit: 5068 if (amdgpu_sriov_vf(adev)) { 5069 amdgpu_virt_init_data_exchange(adev); 5070 amdgpu_virt_release_full_gpu(adev, true); 5071 } 5072 5073 if (r) 5074 return r; 5075 5076 /* Make sure IB tests flushed */ 5077 flush_delayed_work(&adev->delayed_init_work); 5078 5079 if (notify_clients) 5080 drm_client_dev_resume(adev_to_drm(adev), false); 5081 5082 amdgpu_ras_resume(adev); 5083 5084 if (adev->mode_info.num_crtc) { 5085 /* 5086 * Most of the connector probing functions try to acquire runtime pm 5087 * refs to ensure that the GPU is powered on when connector polling is 5088 * performed. Since we're calling this from a runtime PM callback, 5089 * trying to acquire rpm refs will cause us to deadlock. 5090 * 5091 * Since we're guaranteed to be holding the rpm lock, it's safe to 5092 * temporarily disable the rpm helpers so this doesn't deadlock us. 5093 */ 5094 #ifdef CONFIG_PM 5095 dev->dev->power.disable_depth++; 5096 #endif 5097 if (!adev->dc_enabled) 5098 drm_helper_hpd_irq_event(dev); 5099 else 5100 drm_kms_helper_hotplug_event(dev); 5101 #ifdef CONFIG_PM 5102 dev->dev->power.disable_depth--; 5103 #endif 5104 } 5105 adev->in_suspend = false; 5106 5107 if (adev->enable_mes) 5108 amdgpu_mes_self_test(adev); 5109 5110 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5111 DRM_WARN("smart shift update failed\n"); 5112 5113 return 0; 5114 } 5115 5116 /** 5117 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5118 * 5119 * @adev: amdgpu_device pointer 5120 * 5121 * The list of all the hardware IPs that make up the asic is walked and 5122 * the check_soft_reset callbacks are run. check_soft_reset determines 5123 * if the asic is still hung or not. 5124 * Returns true if any of the IPs are still in a hung state, false if not. 
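 * SR-IOV VFs and ASICs that require a full reset are always reported as
 * hung here, without consulting the per-IP callbacks.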
5125 */ 5126 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5127 { 5128 int i; 5129 bool asic_hang = false; 5130 5131 if (amdgpu_sriov_vf(adev)) 5132 return true; 5133 5134 if (amdgpu_asic_need_full_reset(adev)) 5135 return true; 5136 5137 for (i = 0; i < adev->num_ip_blocks; i++) { 5138 if (!adev->ip_blocks[i].status.valid) 5139 continue; 5140 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5141 adev->ip_blocks[i].status.hang = 5142 adev->ip_blocks[i].version->funcs->check_soft_reset( 5143 &adev->ip_blocks[i]); 5144 if (adev->ip_blocks[i].status.hang) { 5145 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5146 asic_hang = true; 5147 } 5148 } 5149 return asic_hang; 5150 } 5151 5152 /** 5153 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5154 * 5155 * @adev: amdgpu_device pointer 5156 * 5157 * The list of all the hardware IPs that make up the asic is walked and the 5158 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5159 * handles any IP specific hardware or software state changes that are 5160 * necessary for a soft reset to succeed. 5161 * Returns 0 on success, negative error code on failure. 5162 */ 5163 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5164 { 5165 int i, r = 0; 5166 5167 for (i = 0; i < adev->num_ip_blocks; i++) { 5168 if (!adev->ip_blocks[i].status.valid) 5169 continue; 5170 if (adev->ip_blocks[i].status.hang && 5171 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5172 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5173 if (r) 5174 return r; 5175 } 5176 } 5177 5178 return 0; 5179 } 5180 5181 /** 5182 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5183 * 5184 * @adev: amdgpu_device pointer 5185 * 5186 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5187 * reset is necessary to recover. 5188 * Returns true if a full asic reset is required, false if not. 5189 */ 5190 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5191 { 5192 int i; 5193 5194 if (amdgpu_asic_need_full_reset(adev)) 5195 return true; 5196 5197 for (i = 0; i < adev->num_ip_blocks; i++) { 5198 if (!adev->ip_blocks[i].status.valid) 5199 continue; 5200 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5201 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5202 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5203 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5204 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5205 if (adev->ip_blocks[i].status.hang) { 5206 dev_info(adev->dev, "Some block need full reset!\n"); 5207 return true; 5208 } 5209 } 5210 } 5211 return false; 5212 } 5213 5214 /** 5215 * amdgpu_device_ip_soft_reset - do a soft reset 5216 * 5217 * @adev: amdgpu_device pointer 5218 * 5219 * The list of all the hardware IPs that make up the asic is walked and the 5220 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5221 * IP specific hardware or software state changes that are necessary to soft 5222 * reset the IP. 5223 * Returns 0 on success, negative error code on failure. 
5224 */ 5225 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5226 { 5227 int i, r = 0; 5228 5229 for (i = 0; i < adev->num_ip_blocks; i++) { 5230 if (!adev->ip_blocks[i].status.valid) 5231 continue; 5232 if (adev->ip_blocks[i].status.hang && 5233 adev->ip_blocks[i].version->funcs->soft_reset) { 5234 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5235 if (r) 5236 return r; 5237 } 5238 } 5239 5240 return 0; 5241 } 5242 5243 /** 5244 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5245 * 5246 * @adev: amdgpu_device pointer 5247 * 5248 * The list of all the hardware IPs that make up the asic is walked and the 5249 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5250 * handles any IP specific hardware or software state changes that are 5251 * necessary after the IP has been soft reset. 5252 * Returns 0 on success, negative error code on failure. 5253 */ 5254 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5255 { 5256 int i, r = 0; 5257 5258 for (i = 0; i < adev->num_ip_blocks; i++) { 5259 if (!adev->ip_blocks[i].status.valid) 5260 continue; 5261 if (adev->ip_blocks[i].status.hang && 5262 adev->ip_blocks[i].version->funcs->post_soft_reset) 5263 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5264 if (r) 5265 return r; 5266 } 5267 5268 return 0; 5269 } 5270 5271 /** 5272 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5273 * 5274 * @adev: amdgpu_device pointer 5275 * @reset_context: amdgpu reset context pointer 5276 * 5277 * do VF FLR and reinitialize Asic 5278 * return 0 means succeeded otherwise failed 5279 */ 5280 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5281 struct amdgpu_reset_context *reset_context) 5282 { 5283 int r; 5284 struct amdgpu_hive_info *hive = NULL; 5285 5286 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5287 if (!amdgpu_ras_get_fed_status(adev)) 5288 amdgpu_virt_ready_to_reset(adev); 5289 amdgpu_virt_wait_reset(adev); 5290 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5291 r = amdgpu_virt_request_full_gpu(adev, true); 5292 } else { 5293 r = amdgpu_virt_reset_gpu(adev); 5294 } 5295 if (r) 5296 return r; 5297 5298 amdgpu_ras_clear_err_state(adev); 5299 amdgpu_irq_gpu_reset_resume_helper(adev); 5300 5301 /* some sw clean up VF needs to do before recover */ 5302 amdgpu_virt_post_reset(adev); 5303 5304 /* Resume IP prior to SMC */ 5305 r = amdgpu_device_ip_reinit_early_sriov(adev); 5306 if (r) 5307 return r; 5308 5309 amdgpu_virt_init_data_exchange(adev); 5310 5311 r = amdgpu_device_fw_loading(adev); 5312 if (r) 5313 return r; 5314 5315 /* now we are okay to resume SMC/CP/SDMA */ 5316 r = amdgpu_device_ip_reinit_late_sriov(adev); 5317 if (r) 5318 return r; 5319 5320 hive = amdgpu_get_xgmi_hive(adev); 5321 /* Update PSP FW topology after reset */ 5322 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5323 r = amdgpu_xgmi_update_topology(hive, adev); 5324 if (hive) 5325 amdgpu_put_xgmi_hive(hive); 5326 if (r) 5327 return r; 5328 5329 r = amdgpu_ib_ring_tests(adev); 5330 if (r) 5331 return r; 5332 5333 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5334 amdgpu_inc_vram_lost(adev); 5335 5336 /* need to be called during full access so we can't do it later like 5337 * bare-metal does. 
5338 */ 5339 amdgpu_amdkfd_post_reset(adev); 5340 amdgpu_virt_release_full_gpu(adev, true); 5341 5342 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5343 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5344 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5345 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5346 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5347 amdgpu_ras_resume(adev); 5348 5349 amdgpu_virt_ras_telemetry_post_reset(adev); 5350 5351 return 0; 5352 } 5353 5354 /** 5355 * amdgpu_device_has_job_running - check if there is any unfinished job 5356 * 5357 * @adev: amdgpu_device pointer 5358 * 5359 * check if there is any job running on the device when guest driver receives 5360 * FLR notification from host driver. If there are still jobs running, then 5361 * the guest driver will not respond the FLR reset. Instead, let the job hit 5362 * the timeout and guest driver then issue the reset request. 5363 */ 5364 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5365 { 5366 int i; 5367 5368 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5369 struct amdgpu_ring *ring = adev->rings[i]; 5370 5371 if (!amdgpu_ring_sched_ready(ring)) 5372 continue; 5373 5374 if (amdgpu_fence_count_emitted(ring)) 5375 return true; 5376 } 5377 return false; 5378 } 5379 5380 /** 5381 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5382 * 5383 * @adev: amdgpu_device pointer 5384 * 5385 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5386 * a hung GPU. 5387 */ 5388 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5389 { 5390 5391 if (amdgpu_gpu_recovery == 0) 5392 goto disabled; 5393 5394 /* Skip soft reset check in fatal error mode */ 5395 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5396 return true; 5397 5398 if (amdgpu_sriov_vf(adev)) 5399 return true; 5400 5401 if (amdgpu_gpu_recovery == -1) { 5402 switch (adev->asic_type) { 5403 #ifdef CONFIG_DRM_AMDGPU_SI 5404 case CHIP_VERDE: 5405 case CHIP_TAHITI: 5406 case CHIP_PITCAIRN: 5407 case CHIP_OLAND: 5408 case CHIP_HAINAN: 5409 #endif 5410 #ifdef CONFIG_DRM_AMDGPU_CIK 5411 case CHIP_KAVERI: 5412 case CHIP_KABINI: 5413 case CHIP_MULLINS: 5414 #endif 5415 case CHIP_CARRIZO: 5416 case CHIP_STONEY: 5417 case CHIP_CYAN_SKILLFISH: 5418 goto disabled; 5419 default: 5420 break; 5421 } 5422 } 5423 5424 return true; 5425 5426 disabled: 5427 dev_info(adev->dev, "GPU recovery disabled.\n"); 5428 return false; 5429 } 5430 5431 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5432 { 5433 u32 i; 5434 int ret = 0; 5435 5436 if (adev->bios) 5437 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5438 5439 dev_info(adev->dev, "GPU mode1 reset\n"); 5440 5441 /* Cache the state before bus master disable. The saved config space 5442 * values are used in other cases like restore after mode-2 reset. 
5443 */ 5444 amdgpu_device_cache_pci_state(adev->pdev); 5445 5446 /* disable BM */ 5447 pci_clear_master(adev->pdev); 5448 5449 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5450 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5451 ret = amdgpu_dpm_mode1_reset(adev); 5452 } else { 5453 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5454 ret = psp_gpu_reset(adev); 5455 } 5456 5457 if (ret) 5458 goto mode1_reset_failed; 5459 5460 amdgpu_device_load_pci_state(adev->pdev); 5461 ret = amdgpu_psp_wait_for_bootloader(adev); 5462 if (ret) 5463 goto mode1_reset_failed; 5464 5465 /* wait for asic to come out of reset */ 5466 for (i = 0; i < adev->usec_timeout; i++) { 5467 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5468 5469 if (memsize != 0xffffffff) 5470 break; 5471 udelay(1); 5472 } 5473 5474 if (i >= adev->usec_timeout) { 5475 ret = -ETIMEDOUT; 5476 goto mode1_reset_failed; 5477 } 5478 5479 if (adev->bios) 5480 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5481 5482 return 0; 5483 5484 mode1_reset_failed: 5485 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5486 return ret; 5487 } 5488 5489 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5490 struct amdgpu_reset_context *reset_context) 5491 { 5492 int i, r = 0; 5493 struct amdgpu_job *job = NULL; 5494 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5495 bool need_full_reset = 5496 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5497 5498 if (reset_context->reset_req_dev == adev) 5499 job = reset_context->job; 5500 5501 if (amdgpu_sriov_vf(adev)) 5502 amdgpu_virt_pre_reset(adev); 5503 5504 amdgpu_fence_driver_isr_toggle(adev, true); 5505 5506 /* block all schedulers and reset given job's ring */ 5507 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5508 struct amdgpu_ring *ring = adev->rings[i]; 5509 5510 if (!amdgpu_ring_sched_ready(ring)) 5511 continue; 5512 5513 /* Clear job fence from fence drv to avoid force_completion 5514 * leave NULL and vm flush fence in fence drv 5515 */ 5516 amdgpu_fence_driver_clear_job_fences(ring); 5517 5518 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5519 amdgpu_fence_driver_force_completion(ring); 5520 } 5521 5522 amdgpu_fence_driver_isr_toggle(adev, false); 5523 5524 if (job && job->vm) 5525 drm_sched_increase_karma(&job->base); 5526 5527 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5528 /* If reset handler not implemented, continue; otherwise return */ 5529 if (r == -EOPNOTSUPP) 5530 r = 0; 5531 else 5532 return r; 5533 5534 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5535 if (!amdgpu_sriov_vf(adev)) { 5536 5537 if (!need_full_reset) 5538 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5539 5540 if (!need_full_reset && amdgpu_gpu_recovery && 5541 amdgpu_device_ip_check_soft_reset(adev)) { 5542 amdgpu_device_ip_pre_soft_reset(adev); 5543 r = amdgpu_device_ip_soft_reset(adev); 5544 amdgpu_device_ip_post_soft_reset(adev); 5545 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5546 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5547 need_full_reset = true; 5548 } 5549 } 5550 5551 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5552 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5553 /* Trigger ip dump before we reset the asic */ 5554 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5555 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5556 tmp_adev->ip_blocks[i].version->funcs 5557 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5558 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5559 } 5560 5561 if (need_full_reset) 5562 r = amdgpu_device_ip_suspend(adev); 5563 if (need_full_reset) 5564 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5565 else 5566 clear_bit(AMDGPU_NEED_FULL_RESET, 5567 &reset_context->flags); 5568 } 5569 5570 return r; 5571 } 5572 5573 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5574 { 5575 struct list_head *device_list_handle; 5576 bool full_reset, vram_lost = false; 5577 struct amdgpu_device *tmp_adev; 5578 int r, init_level; 5579 5580 device_list_handle = reset_context->reset_device_list; 5581 5582 if (!device_list_handle) 5583 return -EINVAL; 5584 5585 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5586 5587 /** 5588 * If it's reset on init, it's default init level, otherwise keep level 5589 * as recovery level. 5590 */ 5591 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5592 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5593 else 5594 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5595 5596 r = 0; 5597 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5598 amdgpu_set_init_level(tmp_adev, init_level); 5599 if (full_reset) { 5600 /* post card */ 5601 amdgpu_ras_clear_err_state(tmp_adev); 5602 r = amdgpu_device_asic_init(tmp_adev); 5603 if (r) { 5604 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5605 } else { 5606 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5607 5608 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5609 if (r) 5610 goto out; 5611 5612 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5613 5614 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5615 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5616 5617 if (vram_lost) { 5618 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5619 amdgpu_inc_vram_lost(tmp_adev); 5620 } 5621 5622 r = amdgpu_device_fw_loading(tmp_adev); 5623 if (r) 5624 return r; 5625 5626 r = amdgpu_xcp_restore_partition_mode( 5627 tmp_adev->xcp_mgr); 5628 if (r) 5629 goto out; 5630 5631 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5632 if (r) 5633 goto out; 5634 5635 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5636 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5637 5638 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5639 if (r) 5640 goto out; 5641 5642 if (vram_lost) 5643 amdgpu_device_fill_reset_magic(tmp_adev); 5644 5645 /* 5646 * Add this ASIC as tracked as reset was already 5647 * complete successfully. 5648 */ 5649 amdgpu_register_gpu_instance(tmp_adev); 5650 5651 if (!reset_context->hive && 5652 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5653 amdgpu_xgmi_add_device(tmp_adev); 5654 5655 r = amdgpu_device_ip_late_init(tmp_adev); 5656 if (r) 5657 goto out; 5658 5659 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5660 5661 /* 5662 * The GPU enters bad state once faulty pages 5663 * by ECC has reached the threshold, and ras 5664 * recovery is scheduled next. So add one check 5665 * here to break recovery if it indeed exceeds 5666 * bad page threshold, and remind user to 5667 * retire this GPU or setting one bigger 5668 * bad_page_threshold value to fix this once 5669 * probing driver again. 5670 */ 5671 if (!amdgpu_ras_is_rma(tmp_adev)) { 5672 /* must succeed. 
*/ 5673 amdgpu_ras_resume(tmp_adev); 5674 } else { 5675 r = -EINVAL; 5676 goto out; 5677 } 5678 5679 /* Update PSP FW topology after reset */ 5680 if (reset_context->hive && 5681 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5682 r = amdgpu_xgmi_update_topology( 5683 reset_context->hive, tmp_adev); 5684 } 5685 } 5686 5687 out: 5688 if (!r) { 5689 /* IP init is complete now, set level as default */ 5690 amdgpu_set_init_level(tmp_adev, 5691 AMDGPU_INIT_LEVEL_DEFAULT); 5692 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5693 r = amdgpu_ib_ring_tests(tmp_adev); 5694 if (r) { 5695 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5696 r = -EAGAIN; 5697 goto end; 5698 } 5699 } 5700 5701 if (r) 5702 tmp_adev->asic_reset_res = r; 5703 } 5704 5705 end: 5706 return r; 5707 } 5708 5709 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5710 struct amdgpu_reset_context *reset_context) 5711 { 5712 struct amdgpu_device *tmp_adev = NULL; 5713 bool need_full_reset, skip_hw_reset; 5714 int r = 0; 5715 5716 /* Try reset handler method first */ 5717 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5718 reset_list); 5719 5720 reset_context->reset_device_list = device_list_handle; 5721 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5722 /* If reset handler not implemented, continue; otherwise return */ 5723 if (r == -EOPNOTSUPP) 5724 r = 0; 5725 else 5726 return r; 5727 5728 /* Reset handler not implemented, use the default method */ 5729 need_full_reset = 5730 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5731 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5732 5733 /* 5734 * ASIC reset has to be done on all XGMI hive nodes ASAP 5735 * to allow proper links negotiation in FW (within 1 sec) 5736 */ 5737 if (!skip_hw_reset && need_full_reset) { 5738 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5739 /* For XGMI run all resets in parallel to speed up the process */ 5740 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5741 if (!queue_work(system_unbound_wq, 5742 &tmp_adev->xgmi_reset_work)) 5743 r = -EALREADY; 5744 } else 5745 r = amdgpu_asic_reset(tmp_adev); 5746 5747 if (r) { 5748 dev_err(tmp_adev->dev, 5749 "ASIC reset failed with error, %d for drm dev, %s", 5750 r, adev_to_drm(tmp_adev)->unique); 5751 goto out; 5752 } 5753 } 5754 5755 /* For XGMI wait for all resets to complete before proceed */ 5756 if (!r) { 5757 list_for_each_entry(tmp_adev, device_list_handle, 5758 reset_list) { 5759 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5760 flush_work(&tmp_adev->xgmi_reset_work); 5761 r = tmp_adev->asic_reset_res; 5762 if (r) 5763 break; 5764 } 5765 } 5766 } 5767 } 5768 5769 if (!r && amdgpu_ras_intr_triggered()) { 5770 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5771 amdgpu_ras_reset_error_count(tmp_adev, 5772 AMDGPU_RAS_BLOCK__MMHUB); 5773 } 5774 5775 amdgpu_ras_intr_cleared(); 5776 } 5777 5778 r = amdgpu_device_reinit_after_reset(reset_context); 5779 if (r == -EAGAIN) 5780 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5781 else 5782 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5783 5784 out: 5785 return r; 5786 } 5787 5788 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5789 { 5790 5791 switch (amdgpu_asic_reset_method(adev)) { 5792 case AMD_RESET_METHOD_MODE1: 5793 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5794 break; 5795 case AMD_RESET_METHOD_MODE2: 5796 adev->mp1_state = PP_MP1_STATE_RESET; 5797 break; 5798 default: 5799 adev->mp1_state = 
PP_MP1_STATE_NONE; 5800 break; 5801 } 5802 } 5803 5804 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5805 { 5806 amdgpu_vf_error_trans_all(adev); 5807 adev->mp1_state = PP_MP1_STATE_NONE; 5808 } 5809 5810 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5811 { 5812 struct pci_dev *p = NULL; 5813 5814 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5815 adev->pdev->bus->number, 1); 5816 if (p) { 5817 pm_runtime_enable(&(p->dev)); 5818 pm_runtime_resume(&(p->dev)); 5819 } 5820 5821 pci_dev_put(p); 5822 } 5823 5824 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5825 { 5826 enum amd_reset_method reset_method; 5827 struct pci_dev *p = NULL; 5828 u64 expires; 5829 5830 /* 5831 * For now, only BACO and mode1 reset are confirmed 5832 * to suffer the audio issue without proper suspended. 5833 */ 5834 reset_method = amdgpu_asic_reset_method(adev); 5835 if ((reset_method != AMD_RESET_METHOD_BACO) && 5836 (reset_method != AMD_RESET_METHOD_MODE1)) 5837 return -EINVAL; 5838 5839 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5840 adev->pdev->bus->number, 1); 5841 if (!p) 5842 return -ENODEV; 5843 5844 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5845 if (!expires) 5846 /* 5847 * If we cannot get the audio device autosuspend delay, 5848 * a fixed 4S interval will be used. Considering 3S is 5849 * the audio controller default autosuspend delay setting. 5850 * 4S used here is guaranteed to cover that. 5851 */ 5852 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5853 5854 while (!pm_runtime_status_suspended(&(p->dev))) { 5855 if (!pm_runtime_suspend(&(p->dev))) 5856 break; 5857 5858 if (expires < ktime_get_mono_fast_ns()) { 5859 dev_warn(adev->dev, "failed to suspend display audio\n"); 5860 pci_dev_put(p); 5861 /* TODO: abort the succeeding gpu reset? */ 5862 return -ETIMEDOUT; 5863 } 5864 } 5865 5866 pm_runtime_disable(&(p->dev)); 5867 5868 pci_dev_put(p); 5869 return 0; 5870 } 5871 5872 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5873 { 5874 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5875 5876 #if defined(CONFIG_DEBUG_FS) 5877 if (!amdgpu_sriov_vf(adev)) 5878 cancel_work(&adev->reset_work); 5879 #endif 5880 5881 if (adev->kfd.dev) 5882 cancel_work(&adev->kfd.reset_work); 5883 5884 if (amdgpu_sriov_vf(adev)) 5885 cancel_work(&adev->virt.flr_work); 5886 5887 if (con && adev->ras_enabled) 5888 cancel_work(&con->recovery_work); 5889 5890 } 5891 5892 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5893 { 5894 struct amdgpu_device *tmp_adev; 5895 int ret = 0; 5896 u32 status; 5897 5898 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5899 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5900 if (PCI_POSSIBLE_ERROR(status)) { 5901 dev_err(tmp_adev->dev, "device lost from bus!"); 5902 ret = -ENODEV; 5903 } 5904 } 5905 5906 return ret; 5907 } 5908 5909 /** 5910 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5911 * 5912 * @adev: amdgpu_device pointer 5913 * @job: which job trigger hang 5914 * @reset_context: amdgpu reset context pointer 5915 * 5916 * Attempt to reset the GPU if it has hung (all asics). 5917 * Attempt to do soft-reset or full-reset and reinitialize Asic 5918 * Returns 0 for success or an error on failure. 
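 *
 * A caller normally fills an amdgpu_reset_context before invoking this
 * function; a minimal sketch (the reset source value is an assumption and
 * depends on the caller):
 *
 *   struct amdgpu_reset_context reset_context;
 *
 *   memset(&reset_context, 0, sizeof(reset_context));
 *   reset_context.method = AMD_RESET_METHOD_NONE;
 *   reset_context.reset_req_dev = adev;
 *   reset_context.src = AMDGPU_RESET_SRC_JOB;
 *   clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *   r = amdgpu_device_gpu_recover(adev, job, &reset_context);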
5919 */ 5920 5921 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5922 struct amdgpu_job *job, 5923 struct amdgpu_reset_context *reset_context) 5924 { 5925 struct list_head device_list, *device_list_handle = NULL; 5926 bool job_signaled = false; 5927 struct amdgpu_hive_info *hive = NULL; 5928 struct amdgpu_device *tmp_adev = NULL; 5929 int i, r = 0; 5930 bool need_emergency_restart = false; 5931 bool audio_suspended = false; 5932 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5933 5934 /* 5935 * If it reaches here because of hang/timeout and a RAS error is 5936 * detected at the same time, let RAS recovery take care of it. 5937 */ 5938 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 5939 !amdgpu_sriov_vf(adev) && 5940 reset_context->src != AMDGPU_RESET_SRC_RAS) { 5941 dev_dbg(adev->dev, 5942 "Gpu recovery from source: %d yielding to RAS error recovery handling", 5943 reset_context->src); 5944 return 0; 5945 } 5946 /* 5947 * Special case: RAS triggered and full reset isn't supported 5948 */ 5949 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5950 5951 /* 5952 * Flush RAM to disk so that after reboot 5953 * the user can read log and see why the system rebooted. 5954 */ 5955 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5956 amdgpu_ras_get_context(adev)->reboot) { 5957 DRM_WARN("Emergency reboot."); 5958 5959 ksys_sync_helper(); 5960 emergency_restart(); 5961 } 5962 5963 dev_info(adev->dev, "GPU %s begin!\n", 5964 need_emergency_restart ? "jobs stop":"reset"); 5965 5966 if (!amdgpu_sriov_vf(adev)) 5967 hive = amdgpu_get_xgmi_hive(adev); 5968 if (hive) 5969 mutex_lock(&hive->hive_lock); 5970 5971 reset_context->job = job; 5972 reset_context->hive = hive; 5973 /* 5974 * Build list of devices to reset. 5975 * In case we are in XGMI hive mode, resort the device list 5976 * to put adev in the 1st position. 5977 */ 5978 INIT_LIST_HEAD(&device_list); 5979 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5980 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5981 list_add_tail(&tmp_adev->reset_list, &device_list); 5982 if (adev->shutdown) 5983 tmp_adev->shutdown = true; 5984 } 5985 if (!list_is_first(&adev->reset_list, &device_list)) 5986 list_rotate_to_front(&adev->reset_list, &device_list); 5987 device_list_handle = &device_list; 5988 } else { 5989 list_add_tail(&adev->reset_list, &device_list); 5990 device_list_handle = &device_list; 5991 } 5992 5993 if (!amdgpu_sriov_vf(adev)) { 5994 r = amdgpu_device_health_check(device_list_handle); 5995 if (r) 5996 goto end_reset; 5997 } 5998 5999 /* We need to lock reset domain only once both for XGMI and single device */ 6000 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6001 reset_list); 6002 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6003 6004 /* block all schedulers and reset given job's ring */ 6005 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6006 6007 amdgpu_device_set_mp1_state(tmp_adev); 6008 6009 /* 6010 * Try to put the audio codec into suspend state 6011 * before gpu reset started. 6012 * 6013 * Due to the power domain of the graphics device 6014 * is shared with AZ power domain. Without this, 6015 * we may change the audio hardware from behind 6016 * the audio driver's back. That will trigger 6017 * some audio codec errors. 
6018 */ 6019 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6020 audio_suspended = true; 6021 6022 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6023 6024 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6025 6026 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6027 6028 /* 6029 * Mark these ASICs to be reset as untracked first 6030 * And add them back after reset completed 6031 */ 6032 amdgpu_unregister_gpu_instance(tmp_adev); 6033 6034 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6035 6036 /* disable ras on ALL IPs */ 6037 if (!need_emergency_restart && 6038 amdgpu_device_ip_need_full_reset(tmp_adev)) 6039 amdgpu_ras_suspend(tmp_adev); 6040 6041 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6042 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6043 6044 if (!amdgpu_ring_sched_ready(ring)) 6045 continue; 6046 6047 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6048 6049 if (need_emergency_restart) 6050 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6051 } 6052 atomic_inc(&tmp_adev->gpu_reset_counter); 6053 } 6054 6055 if (need_emergency_restart) 6056 goto skip_sched_resume; 6057 6058 /* 6059 * Must check guilty signal here since after this point all old 6060 * HW fences are force signaled. 6061 * 6062 * job->base holds a reference to parent fence 6063 */ 6064 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6065 job_signaled = true; 6066 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6067 goto skip_hw_reset; 6068 } 6069 6070 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6071 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6072 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6073 /*TODO Should we stop ?*/ 6074 if (r) { 6075 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6076 r, adev_to_drm(tmp_adev)->unique); 6077 tmp_adev->asic_reset_res = r; 6078 } 6079 } 6080 6081 /* Actual ASIC resets if needed.*/ 6082 /* Host driver will handle XGMI hive reset for SRIOV */ 6083 if (amdgpu_sriov_vf(adev)) { 6084 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6085 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6086 amdgpu_ras_set_fed(adev, true); 6087 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6088 } 6089 6090 r = amdgpu_device_reset_sriov(adev, reset_context); 6091 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6092 amdgpu_virt_release_full_gpu(adev, true); 6093 goto retry; 6094 } 6095 if (r) 6096 adev->asic_reset_res = r; 6097 } else { 6098 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 6099 if (r && r == -EAGAIN) 6100 goto retry; 6101 } 6102 6103 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6104 /* 6105 * Drop any pending non scheduler resets queued before reset is done. 6106 * Any reset scheduled after this point would be valid. Scheduler resets 6107 * were already dropped during drm_sched_stop and no new ones can come 6108 * in before drm_sched_start. 
6109 */ 6110 amdgpu_device_stop_pending_resets(tmp_adev); 6111 } 6112 6113 skip_hw_reset: 6114 6115 /* Post ASIC reset for all devs .*/ 6116 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6117 6118 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6119 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6120 6121 if (!amdgpu_ring_sched_ready(ring)) 6122 continue; 6123 6124 drm_sched_start(&ring->sched, 0); 6125 } 6126 6127 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6128 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6129 6130 if (tmp_adev->asic_reset_res) 6131 r = tmp_adev->asic_reset_res; 6132 6133 tmp_adev->asic_reset_res = 0; 6134 6135 if (r) { 6136 /* bad news, how to tell it to userspace ? 6137 * for ras error, we should report GPU bad status instead of 6138 * reset failure 6139 */ 6140 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6141 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6142 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6143 atomic_read(&tmp_adev->gpu_reset_counter)); 6144 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6145 } else { 6146 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6147 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6148 DRM_WARN("smart shift update failed\n"); 6149 } 6150 } 6151 6152 skip_sched_resume: 6153 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6154 /* unlock kfd: SRIOV would do it separately */ 6155 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6156 amdgpu_amdkfd_post_reset(tmp_adev); 6157 6158 /* kfd_post_reset will do nothing if kfd device is not initialized, 6159 * need to bring up kfd here if it's not be initialized before 6160 */ 6161 if (!adev->kfd.init_complete) 6162 amdgpu_amdkfd_device_init(adev); 6163 6164 if (audio_suspended) 6165 amdgpu_device_resume_display_audio(tmp_adev); 6166 6167 amdgpu_device_unset_mp1_state(tmp_adev); 6168 6169 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6170 } 6171 6172 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6173 reset_list); 6174 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6175 6176 end_reset: 6177 if (hive) { 6178 mutex_unlock(&hive->hive_lock); 6179 amdgpu_put_xgmi_hive(hive); 6180 } 6181 6182 if (r) 6183 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6184 6185 atomic_set(&adev->reset_domain->reset_res, r); 6186 6187 if (!r) 6188 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6189 6190 return r; 6191 } 6192 6193 /** 6194 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6195 * 6196 * @adev: amdgpu_device pointer 6197 * @speed: pointer to the speed of the link 6198 * @width: pointer to the width of the link 6199 * 6200 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6201 * first physical partner to an AMD dGPU. 6202 * This will exclude any virtual switches and links. 
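 * When dynamic link switching is supported, the capabilities of the first
 * upstream bridge that is not internal to the dGPU are reported; otherwise
 * the currently negotiated link speed and width are used.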
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
                                            enum pci_bus_speed *speed,
                                            enum pcie_link_width *width)
{
        struct pci_dev *parent = adev->pdev;

        if (!speed || !width)
                return;

        *speed = PCI_SPEED_UNKNOWN;
        *width = PCIE_LNK_WIDTH_UNKNOWN;

        if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
                while ((parent = pci_upstream_bridge(parent))) {
                        /* skip upstream/downstream switches internal to dGPU */
                        if (parent->vendor == PCI_VENDOR_ID_ATI)
                                continue;
                        *speed = pcie_get_speed_cap(parent);
                        *width = pcie_get_width_cap(parent);
                        break;
                }
        } else {
                /* use the current speeds rather than max if switching is not supported */
                pcie_bandwidth_available(adev->pdev, NULL, speed, width);
        }
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
                                        enum pci_bus_speed *speed,
                                        enum pcie_link_width *width)
{
        struct pci_dev *parent = adev->pdev;

        if (!speed || !width)
                return;

        parent = pci_upstream_bridge(parent);
        if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
                /* use the upstream/downstream switches internal to dGPU */
                *speed = pcie_get_speed_cap(parent);
                *width = pcie_get_width_cap(parent);
                while ((parent = pci_upstream_bridge(parent))) {
                        if (parent->vendor == PCI_VENDOR_ID_ATI) {
                                /* use the upstream/downstream switches internal to dGPU */
                                *speed = pcie_get_speed_cap(parent);
                                *width = pcie_get_width_cap(parent);
                        }
                }
        } else {
                /* use the device itself */
                *speed = pcie_get_speed_cap(adev->pdev);
                *width = pcie_get_width_cap(adev->pdev);
        }
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
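 *
 * The computed masks can be overridden with the pcie_gen_cap and
 * pcie_lane_cap module parameters (amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap checked below); when both masks are already
 * populated, the ASIC/platform probing is skipped.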
6277 */ 6278 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6279 { 6280 enum pci_bus_speed speed_cap, platform_speed_cap; 6281 enum pcie_link_width platform_link_width, link_width; 6282 6283 if (amdgpu_pcie_gen_cap) 6284 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6285 6286 if (amdgpu_pcie_lane_cap) 6287 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6288 6289 /* covers APUs as well */ 6290 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6291 if (adev->pm.pcie_gen_mask == 0) 6292 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6293 if (adev->pm.pcie_mlw_mask == 0) 6294 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6295 return; 6296 } 6297 6298 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6299 return; 6300 6301 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6302 &platform_link_width); 6303 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6304 6305 if (adev->pm.pcie_gen_mask == 0) { 6306 /* asic caps */ 6307 if (speed_cap == PCI_SPEED_UNKNOWN) { 6308 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6309 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6310 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6311 } else { 6312 if (speed_cap == PCIE_SPEED_32_0GT) 6313 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6314 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6315 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6316 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6317 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6318 else if (speed_cap == PCIE_SPEED_16_0GT) 6319 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6320 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6321 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6322 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6323 else if (speed_cap == PCIE_SPEED_8_0GT) 6324 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6325 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6326 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6327 else if (speed_cap == PCIE_SPEED_5_0GT) 6328 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6329 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6330 else 6331 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6332 } 6333 /* platform caps */ 6334 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6335 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6336 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6337 } else { 6338 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6339 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6340 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6341 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6342 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6343 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6344 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6345 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6346 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6347 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6348 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6349 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6350 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6351 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6352 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6353 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6354 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6355 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6356 else 6357 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6358 6359 } 6360 } 6361 if (adev->pm.pcie_mlw_mask == 0) { 6362 /* asic caps */ 6363 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6364 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6365 } else { 6366 switch (link_width) { 6367 case PCIE_LNK_X32: 6368 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6369 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6370 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6371 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6372 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6373 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6374 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6375 break; 6376 case PCIE_LNK_X16: 6377 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6378 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6379 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6380 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6381 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6382 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6383 break; 6384 case PCIE_LNK_X12: 6385 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6386 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6387 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6388 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6389 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6390 break; 6391 case PCIE_LNK_X8: 6392 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6393 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6394 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6395 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6396 break; 6397 case PCIE_LNK_X4: 6398 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6399 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6400 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6401 break; 6402 case PCIE_LNK_X2: 6403 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6404 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6405 break; 6406 case PCIE_LNK_X1: 6407 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6408 break; 6409 default: 6410 break; 6411 } 6412 } 6413 /* platform caps */ 6414 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6415 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6416 } else { 6417 switch (platform_link_width) { 6418 case PCIE_LNK_X32: 6419 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6420 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6421 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6422 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6423 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6424 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6425 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6426 break; 6427 case PCIE_LNK_X16: 6428 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6429 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6430 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6431 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6432 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6433 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6434 break; 6435 case PCIE_LNK_X12: 6436 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6437 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6438 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6439 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6440 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6441 break; 6442 case PCIE_LNK_X8: 6443 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6444 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6445 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6446 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6447 break; 6448 case PCIE_LNK_X4: 6449 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6450 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6451 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6452 break; 6453 case PCIE_LNK_X2: 6454 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6455 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6456 break; 6457 case PCIE_LNK_X1: 6458 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6459 break; 6460 
default: 6461 break; 6462 } 6463 } 6464 } 6465 } 6466 6467 /** 6468 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6469 * 6470 * @adev: amdgpu_device pointer 6471 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6472 * 6473 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6474 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6475 * @peer_adev. 6476 */ 6477 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6478 struct amdgpu_device *peer_adev) 6479 { 6480 #ifdef CONFIG_HSA_AMD_P2P 6481 bool p2p_access = 6482 !adev->gmc.xgmi.connected_to_cpu && 6483 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6484 if (!p2p_access) 6485 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6486 pci_name(peer_adev->pdev)); 6487 6488 bool is_large_bar = adev->gmc.visible_vram_size && 6489 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6490 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6491 6492 if (!p2p_addressable) { 6493 uint64_t address_mask = peer_adev->dev->dma_mask ? 6494 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6495 resource_size_t aper_limit = 6496 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6497 6498 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6499 aper_limit & address_mask); 6500 } 6501 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6502 #else 6503 return false; 6504 #endif 6505 } 6506 6507 int amdgpu_device_baco_enter(struct drm_device *dev) 6508 { 6509 struct amdgpu_device *adev = drm_to_adev(dev); 6510 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6511 6512 if (!amdgpu_device_supports_baco(dev)) 6513 return -ENOTSUPP; 6514 6515 if (ras && adev->ras_enabled && 6516 adev->nbio.funcs->enable_doorbell_interrupt) 6517 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6518 6519 return amdgpu_dpm_baco_enter(adev); 6520 } 6521 6522 int amdgpu_device_baco_exit(struct drm_device *dev) 6523 { 6524 struct amdgpu_device *adev = drm_to_adev(dev); 6525 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6526 int ret = 0; 6527 6528 if (!amdgpu_device_supports_baco(dev)) 6529 return -ENOTSUPP; 6530 6531 ret = amdgpu_dpm_baco_exit(adev); 6532 if (ret) 6533 return ret; 6534 6535 if (ras && adev->ras_enabled && 6536 adev->nbio.funcs->enable_doorbell_interrupt) 6537 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6538 6539 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6540 adev->nbio.funcs->clear_doorbell_interrupt) 6541 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6542 6543 return 0; 6544 } 6545 6546 /** 6547 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6548 * @pdev: PCI device struct 6549 * @state: PCI channel state 6550 * 6551 * Description: Called when a PCI error is detected. 6552 * 6553 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
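 * (PCI_ERS_RESULT_CAN_RECOVER is returned when @state is
 * pci_channel_io_normal.)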
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
        int i;

        DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

        if (adev->gmc.xgmi.num_physical_nodes > 1) {
                DRM_WARN("No support for XGMI hive yet...");
                return PCI_ERS_RESULT_DISCONNECT;
        }

        adev->pci_channel_state = state;

        switch (state) {
        case pci_channel_io_normal:
                return PCI_ERS_RESULT_CAN_RECOVER;
        /* Fatal error, prepare for slot reset */
        case pci_channel_io_frozen:
                /*
                 * Locking adev->reset_domain->sem will prevent any external access
                 * to GPU during PCI error recovery
                 */
                amdgpu_device_lock_reset_domain(adev->reset_domain);
                amdgpu_device_set_mp1_state(adev);

                /*
                 * Block any work scheduling as we do for regular GPU reset
                 * for the duration of the recovery
                 */
                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                        struct amdgpu_ring *ring = adev->rings[i];

                        if (!amdgpu_ring_sched_ready(ring))
                                continue;

                        drm_sched_stop(&ring->sched, NULL);
                }
                atomic_inc(&adev->gpu_reset_counter);
                return PCI_ERS_RESULT_NEED_RESET;
        case pci_channel_io_perm_failure:
                /* Permanent error, prepare for device removal */
                return PCI_ERS_RESULT_DISCONNECT;
        }

        return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
        DRM_INFO("PCI error: mmio enabled callback!!\n");

        /* TODO - dump whatever for debugging purposes */

        /* This is called only if amdgpu_pci_error_detected returns
         * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
         * works, no need to reset slot.
         */

        return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
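 *
 * Return: PCI_ERS_RESULT_RECOVERED if the reset and reinitialization
 * succeeded, PCI_ERS_RESULT_DISCONNECT otherwise.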
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        struct amdgpu_device *adev = drm_to_adev(dev);
        int r, i;
        struct amdgpu_reset_context reset_context;
        u32 memsize;
        struct list_head device_list;

        /* PCI error slot reset should be skipped during RAS recovery */
        if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
             amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
            amdgpu_ras_in_recovery(adev))
                return PCI_ERS_RESULT_RECOVERED;

        DRM_INFO("PCI error: slot reset callback!!\n");

        memset(&reset_context, 0, sizeof(reset_context));

        INIT_LIST_HEAD(&device_list);
        list_add_tail(&adev->reset_list, &device_list);

        /* wait for asic to come out of reset */
        msleep(500);

        /* Restore PCI config space */
        amdgpu_device_load_pci_state(pdev);

        /* confirm ASIC came out of reset */
        for (i = 0; i < adev->usec_timeout; i++) {
                memsize = amdgpu_asic_get_config_memsize(adev);

                if (memsize != 0xffffffff)
                        break;
                udelay(1);
        }
        if (memsize == 0xffffffff) {
                r = -ETIME;
                goto out;
        }

        reset_context.method = AMD_RESET_METHOD_NONE;
        reset_context.reset_req_dev = adev;
        set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
        set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

        adev->no_hw_access = true;
        r = amdgpu_device_pre_asic_reset(adev, &reset_context);
        adev->no_hw_access = false;
        if (r)
                goto out;

        r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
        if (!r) {
                if (amdgpu_device_cache_pci_state(adev->pdev))
                        pci_restore_state(adev->pdev);

                DRM_INFO("PCIe error recovery succeeded\n");
        } else {
                DRM_ERROR("PCIe error recovery failed, err:%d", r);
                amdgpu_device_unset_mp1_state(adev);
                amdgpu_device_unlock_reset_domain(adev->reset_domain);
        }

        return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
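 *
 * Only restarts the ring schedulers when the channel state recorded by
 * amdgpu_pci_error_detected() was pci_channel_io_frozen; otherwise this
 * callback is a no-op.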
6706 */ 6707 void amdgpu_pci_resume(struct pci_dev *pdev) 6708 { 6709 struct drm_device *dev = pci_get_drvdata(pdev); 6710 struct amdgpu_device *adev = drm_to_adev(dev); 6711 int i; 6712 6713 6714 DRM_INFO("PCI error: resume callback!!\n"); 6715 6716 /* Only continue execution for the case of pci_channel_io_frozen */ 6717 if (adev->pci_channel_state != pci_channel_io_frozen) 6718 return; 6719 6720 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6721 struct amdgpu_ring *ring = adev->rings[i]; 6722 6723 if (!amdgpu_ring_sched_ready(ring)) 6724 continue; 6725 6726 drm_sched_start(&ring->sched, 0); 6727 } 6728 6729 amdgpu_device_unset_mp1_state(adev); 6730 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6731 } 6732 6733 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6734 { 6735 struct drm_device *dev = pci_get_drvdata(pdev); 6736 struct amdgpu_device *adev = drm_to_adev(dev); 6737 int r; 6738 6739 if (amdgpu_sriov_vf(adev)) 6740 return false; 6741 6742 r = pci_save_state(pdev); 6743 if (!r) { 6744 kfree(adev->pci_state); 6745 6746 adev->pci_state = pci_store_saved_state(pdev); 6747 6748 if (!adev->pci_state) { 6749 DRM_ERROR("Failed to store PCI saved state"); 6750 return false; 6751 } 6752 } else { 6753 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6754 return false; 6755 } 6756 6757 return true; 6758 } 6759 6760 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6761 { 6762 struct drm_device *dev = pci_get_drvdata(pdev); 6763 struct amdgpu_device *adev = drm_to_adev(dev); 6764 int r; 6765 6766 if (!adev->pci_state) 6767 return false; 6768 6769 r = pci_load_saved_state(pdev, adev->pci_state); 6770 6771 if (!r) { 6772 pci_restore_state(pdev); 6773 } else { 6774 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6775 return false; 6776 } 6777 6778 return true; 6779 } 6780 6781 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6782 struct amdgpu_ring *ring) 6783 { 6784 #ifdef CONFIG_X86_64 6785 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6786 return; 6787 #endif 6788 if (adev->gmc.xgmi.connected_to_cpu) 6789 return; 6790 6791 if (ring && ring->funcs->emit_hdp_flush) 6792 amdgpu_ring_emit_hdp_flush(ring); 6793 else 6794 amdgpu_asic_flush_hdp(adev, ring); 6795 } 6796 6797 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6798 struct amdgpu_ring *ring) 6799 { 6800 #ifdef CONFIG_X86_64 6801 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6802 return; 6803 #endif 6804 if (adev->gmc.xgmi.connected_to_cpu) 6805 return; 6806 6807 amdgpu_asic_invalidate_hdp(adev, ring); 6808 } 6809 6810 int amdgpu_in_reset(struct amdgpu_device *adev) 6811 { 6812 return atomic_read(&adev->reset_domain->in_gpu_reset); 6813 } 6814 6815 /** 6816 * amdgpu_device_halt() - bring hardware to some kind of halt state 6817 * 6818 * @adev: amdgpu_device pointer 6819 * 6820 * Bring hardware to some kind of halt state so that no one can touch it 6821 * any more. It will help to maintain error context when error occurred. 6822 * Compare to a simple hang, the system will keep stable at least for SSH 6823 * access. Then it should be trivial to inspect the hardware state and 6824 * see what's going on. Implemented as following: 6825 * 6826 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6827 * clears all CPU mappings to device, disallows remappings through page faults 6828 * 2. amdgpu_irq_disable_all() disables all interrupts 6829 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6830 * 4. 
 *    set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
        struct pci_dev *pdev = adev->pdev;
        struct drm_device *ddev = adev_to_drm(adev);

        amdgpu_xcp_dev_unplug(adev);
        drm_dev_unplug(ddev);

        amdgpu_irq_disable_all(adev);

        amdgpu_fence_driver_hw_fini(adev);

        adev->no_hw_access = true;

        amdgpu_device_unmap_mmio(adev);

        pci_disable_device(pdev);
        pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
                                 u32 reg)
{
        unsigned long flags, address, data;
        u32 r;

        address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
        data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        WREG32(address, reg * 4);
        (void)RREG32(address);
        r = RREG32(data);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
        return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
                                  u32 reg, u32 v)
{
        unsigned long flags, address, data;

        address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
        data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        WREG32(address, reg * 4);
        (void)RREG32(address);
        WREG32(data, v);
        (void)RREG32(data);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
        struct dma_fence *fence;

        rcu_read_lock();
        fence = dma_fence_get_rcu_safe(&adev->gang_submit);
        rcu_read_unlock();
        return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
                                            struct dma_fence *gang)
{
        struct dma_fence *old = NULL;

        dma_fence_get(gang);
        do {
                dma_fence_put(old);
                old = amdgpu_device_get_gang(adev);
                if (old == gang)
                        break;

                if (!dma_fence_is_signaled(old)) {
                        dma_fence_put(gang);
                        return old;
                }

        } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
                         old, gang) != old);

        /*
         * Drop it once for the exchanged reference in adev and once for the
         * thread local reference acquired in amdgpu_device_get_gang().
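         *
         * Note for callers: when a non-NULL fence is returned instead, the
         * switch did not happen; the caller is expected to wait on that fence
         * (in practice it is typically handed back to the scheduler as a
         * dependency) and drop the reference itself. Illustrative sketch only:
         *
         *   old = amdgpu_device_switch_gang(adev, gang);
         *   if (old) {
         *           dma_fence_wait(old, false);
         *           dma_fence_put(old);
         *   }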
         */
        dma_fence_put(old);
        dma_fence_put(old);
        return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
        switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_HAINAN:
#endif
        case CHIP_TOPAZ:
                /* chips with no display hardware */
                return false;
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_TAHITI:
        case CHIP_PITCAIRN:
        case CHIP_VERDE:
        case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
        case CHIP_BONAIRE:
        case CHIP_HAWAII:
        case CHIP_KAVERI:
        case CHIP_KABINI:
        case CHIP_MULLINS:
#endif
        case CHIP_TONGA:
        case CHIP_FIJI:
        case CHIP_POLARIS10:
        case CHIP_POLARIS11:
        case CHIP_POLARIS12:
        case CHIP_VEGAM:
        case CHIP_CARRIZO:
        case CHIP_STONEY:
                /* chips with display hardware */
                return true;
        default:
                /* IP discovery */
                if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
                    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
                        return false;
                return true;
        }
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
                uint32_t inst, uint32_t reg_addr, char reg_name[],
                uint32_t expected_value, uint32_t mask)
{
        uint32_t ret = 0;
        uint32_t old_ = 0;
        uint32_t tmp_ = RREG32(reg_addr);
        uint32_t loop = adev->usec_timeout;

        while ((tmp_ & (mask)) != (expected_value)) {
                if (old_ != tmp_) {
                        loop = adev->usec_timeout;
                        old_ = tmp_;
                } else
                        udelay(1);
                tmp_ = RREG32(reg_addr);
                loop--;
                if (!loop) {
                        DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
                                 inst, reg_name, (uint32_t)expected_value,
                                 (uint32_t)(tmp_ & (mask)));
                        ret = -ETIMEDOUT;
                        break;
                }
        }
        return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
        ssize_t size = 0;

        if (!ring || !ring->adev)
                return size;

        if (amdgpu_device_should_recover_gpu(ring->adev))
                size |= AMDGPU_RESET_TYPE_FULL;

        if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
            !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
                size |= AMDGPU_RESET_TYPE_SOFT_RESET;

        return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
        ssize_t size = 0;

        if (supported_reset == 0) {
                size += sysfs_emit_at(buf, size, "unsupported");
                size += sysfs_emit_at(buf, size, "\n");
                return size;
        }

        if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
                size += sysfs_emit_at(buf, size, "soft ");

        if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
                size += sysfs_emit_at(buf, size, "queue ");

        if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
                size += sysfs_emit_at(buf, size, "pipe ");

        if (supported_reset & AMDGPU_RESET_TYPE_FULL)
                size += sysfs_emit_at(buf, size, "full ");

        size += sysfs_emit_at(buf, size, "\n");
        return size;
}
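
/*
 * Example (illustrative sketch only, not part of the driver proper): an IP
 * block that exposes its supported reset types through sysfs would typically
 * build a bitmask with helpers such as amdgpu_get_soft_full_reset_mask() and
 * format it with amdgpu_show_reset_mask(). The attribute callback and the
 * gfx_supported_reset field below are assumptions for illustration:
 *
 *   static ssize_t gfx_reset_mask_show(struct device *dev,
 *                                      struct device_attribute *attr,
 *                                      char *buf)
 *   {
 *           struct drm_device *ddev = dev_get_drvdata(dev);
 *           struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *           if (!adev)
 *                   return -ENODEV;
 *
 *           return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *   }
 */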