1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #endif 89 90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 97 98 #define AMDGPU_RESUME_MS 2000 99 #define AMDGPU_MAX_RETRY_LIMIT 2 100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 104 105 #define 
AMDGPU_VBIOS_SKIP (1U << 0) 106 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 107 108 static const struct drm_driver amdgpu_kms_driver; 109 110 const char *amdgpu_asic_name[] = { 111 "TAHITI", 112 "PITCAIRN", 113 "VERDE", 114 "OLAND", 115 "HAINAN", 116 "BONAIRE", 117 "KAVERI", 118 "KABINI", 119 "HAWAII", 120 "MULLINS", 121 "TOPAZ", 122 "TONGA", 123 "FIJI", 124 "CARRIZO", 125 "STONEY", 126 "POLARIS10", 127 "POLARIS11", 128 "POLARIS12", 129 "VEGAM", 130 "VEGA10", 131 "VEGA12", 132 "VEGA20", 133 "RAVEN", 134 "ARCTURUS", 135 "RENOIR", 136 "ALDEBARAN", 137 "NAVI10", 138 "CYAN_SKILLFISH", 139 "NAVI14", 140 "NAVI12", 141 "SIENNA_CICHLID", 142 "NAVY_FLOUNDER", 143 "VANGOGH", 144 "DIMGREY_CAVEFISH", 145 "BEIGE_GOBY", 146 "YELLOW_CARP", 147 "IP DISCOVERY", 148 "LAST", 149 }; 150 151 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 152 /* 153 * Default init level where all blocks are expected to be initialized. This is 154 * the level of initialization expected by default and also after a full reset 155 * of the device. 156 */ 157 struct amdgpu_init_level amdgpu_init_default = { 158 .level = AMDGPU_INIT_LEVEL_DEFAULT, 159 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 160 }; 161 162 struct amdgpu_init_level amdgpu_init_recovery = { 163 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 164 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 165 }; 166 167 /* 168 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 169 * is used for cases like reset on initialization where the entire hive needs to 170 * be reset before first use. 171 */ 172 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 173 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 174 .hwini_ip_block_mask = 175 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 176 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 177 BIT(AMD_IP_BLOCK_TYPE_PSP) 178 }; 179 180 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 181 enum amd_ip_block_type block) 182 { 183 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 184 } 185 186 void amdgpu_set_init_level(struct amdgpu_device *adev, 187 enum amdgpu_init_lvl_id lvl) 188 { 189 switch (lvl) { 190 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 191 adev->init_lvl = &amdgpu_init_minimal_xgmi; 192 break; 193 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 194 adev->init_lvl = &amdgpu_init_recovery; 195 break; 196 case AMDGPU_INIT_LEVEL_DEFAULT: 197 fallthrough; 198 default: 199 adev->init_lvl = &amdgpu_init_default; 200 break; 201 } 202 } 203 204 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 205 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 206 void *data); 207 208 /** 209 * DOC: pcie_replay_count 210 * 211 * The amdgpu driver provides a sysfs API for reporting the total number 212 * of PCIe replays (NAKs). 213 * The file pcie_replay_count is used for this and returns the total 214 * number of replays as a sum of the NAKs generated and NAKs received. 
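 *
 * A minimal userspace sketch for reading it (the card index in the sysfs
 * path is only an example and depends on the system):
 *
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *           unsigned long long replays = 0;
 *           FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *
 *           if (f) {
 *                   if (fscanf(f, "%llu", &replays) == 1)
 *                           printf("PCIe replays: %llu\n", replays);
 *                   fclose(f);
 *           }
 *           return 0;
 *   }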
215 */ 216 217 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 218 struct device_attribute *attr, char *buf) 219 { 220 struct drm_device *ddev = dev_get_drvdata(dev); 221 struct amdgpu_device *adev = drm_to_adev(ddev); 222 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 223 224 return sysfs_emit(buf, "%llu\n", cnt); 225 } 226 227 static DEVICE_ATTR(pcie_replay_count, 0444, 228 amdgpu_device_get_pcie_replay_count, NULL); 229 230 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 231 struct bin_attribute *attr, char *buf, 232 loff_t ppos, size_t count) 233 { 234 struct device *dev = kobj_to_dev(kobj); 235 struct drm_device *ddev = dev_get_drvdata(dev); 236 struct amdgpu_device *adev = drm_to_adev(ddev); 237 ssize_t bytes_read; 238 239 switch (ppos) { 240 case AMDGPU_SYS_REG_STATE_XGMI: 241 bytes_read = amdgpu_asic_get_reg_state( 242 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 243 break; 244 case AMDGPU_SYS_REG_STATE_WAFL: 245 bytes_read = amdgpu_asic_get_reg_state( 246 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 247 break; 248 case AMDGPU_SYS_REG_STATE_PCIE: 249 bytes_read = amdgpu_asic_get_reg_state( 250 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 251 break; 252 case AMDGPU_SYS_REG_STATE_USR: 253 bytes_read = amdgpu_asic_get_reg_state( 254 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 255 break; 256 case AMDGPU_SYS_REG_STATE_USR_1: 257 bytes_read = amdgpu_asic_get_reg_state( 258 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 259 break; 260 default: 261 return -EINVAL; 262 } 263 264 return bytes_read; 265 } 266 267 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 268 AMDGPU_SYS_REG_STATE_END); 269 270 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 271 { 272 int ret; 273 274 if (!amdgpu_asic_get_reg_state_supported(adev)) 275 return 0; 276 277 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 278 279 return ret; 280 } 281 282 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 283 { 284 if (!amdgpu_asic_get_reg_state_supported(adev)) 285 return; 286 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 287 } 288 289 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 290 { 291 int r; 292 293 if (ip_block->version->funcs->suspend) { 294 r = ip_block->version->funcs->suspend(ip_block); 295 if (r) { 296 dev_err(ip_block->adev->dev, 297 "suspend of IP block <%s> failed %d\n", 298 ip_block->version->funcs->name, r); 299 return r; 300 } 301 } 302 303 ip_block->status.hw = false; 304 return 0; 305 } 306 307 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 308 { 309 int r; 310 311 if (ip_block->version->funcs->resume) { 312 r = ip_block->version->funcs->resume(ip_block); 313 if (r) { 314 dev_err(ip_block->adev->dev, 315 "resume of IP block <%s> failed %d\n", 316 ip_block->version->funcs->name, r); 317 return r; 318 } 319 } 320 321 ip_block->status.hw = true; 322 return 0; 323 } 324 325 /** 326 * DOC: board_info 327 * 328 * The amdgpu driver provides a sysfs API for giving board related information. 
329 * It provides the form factor information in the format 330 * 331 * type : form factor 332 * 333 * Possible form factor values 334 * 335 * - "cem" - PCIE CEM card 336 * - "oam" - Open Compute Accelerator Module 337 * - "unknown" - Not known 338 * 339 */ 340 341 static ssize_t amdgpu_device_get_board_info(struct device *dev, 342 struct device_attribute *attr, 343 char *buf) 344 { 345 struct drm_device *ddev = dev_get_drvdata(dev); 346 struct amdgpu_device *adev = drm_to_adev(ddev); 347 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 348 const char *pkg; 349 350 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 351 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 352 353 switch (pkg_type) { 354 case AMDGPU_PKG_TYPE_CEM: 355 pkg = "cem"; 356 break; 357 case AMDGPU_PKG_TYPE_OAM: 358 pkg = "oam"; 359 break; 360 default: 361 pkg = "unknown"; 362 break; 363 } 364 365 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 366 } 367 368 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 369 370 static struct attribute *amdgpu_board_attrs[] = { 371 &dev_attr_board_info.attr, 372 NULL, 373 }; 374 375 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 376 struct attribute *attr, int n) 377 { 378 struct device *dev = kobj_to_dev(kobj); 379 struct drm_device *ddev = dev_get_drvdata(dev); 380 struct amdgpu_device *adev = drm_to_adev(ddev); 381 382 if (adev->flags & AMD_IS_APU) 383 return 0; 384 385 return attr->mode; 386 } 387 388 static const struct attribute_group amdgpu_board_attrs_group = { 389 .attrs = amdgpu_board_attrs, 390 .is_visible = amdgpu_board_attrs_is_visible 391 }; 392 393 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 394 395 396 /** 397 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 398 * 399 * @dev: drm_device pointer 400 * 401 * Returns true if the device is a dGPU with ATPX power control, 402 * otherwise return false. 403 */ 404 bool amdgpu_device_supports_px(struct drm_device *dev) 405 { 406 struct amdgpu_device *adev = drm_to_adev(dev); 407 408 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 409 return true; 410 return false; 411 } 412 413 /** 414 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 415 * 416 * @dev: drm_device pointer 417 * 418 * Returns true if the device is a dGPU with ACPI power control, 419 * otherwise return false. 420 */ 421 bool amdgpu_device_supports_boco(struct drm_device *dev) 422 { 423 struct amdgpu_device *adev = drm_to_adev(dev); 424 425 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 426 return false; 427 428 if (adev->has_pr3 || 429 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 430 return true; 431 return false; 432 } 433 434 /** 435 * amdgpu_device_supports_baco - Does the device support BACO 436 * 437 * @dev: drm_device pointer 438 * 439 * Return: 440 * 1 if the device supports BACO; 441 * 3 if the device supports MACO (only works if BACO is supported) 442 * otherwise return 0. 
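 *
 * Callers such as amdgpu_device_detect_runtime_pm_mode() below treat the
 * returned value as a bit mask: (ret & MACO_SUPPORT) selects BAMACO while
 * (ret & BACO_SUPPORT) alone selects plain BACO, which is consistent with
 * the 1/3 values documented above.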
443 */ 444 int amdgpu_device_supports_baco(struct drm_device *dev) 445 { 446 struct amdgpu_device *adev = drm_to_adev(dev); 447 448 return amdgpu_asic_supports_baco(adev); 449 } 450 451 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) 452 { 453 struct drm_device *dev; 454 int bamaco_support; 455 456 dev = adev_to_drm(adev); 457 458 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; 459 bamaco_support = amdgpu_device_supports_baco(dev); 460 461 switch (amdgpu_runtime_pm) { 462 case 2: 463 if (bamaco_support & MACO_SUPPORT) { 464 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 465 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n"); 466 } else if (bamaco_support == BACO_SUPPORT) { 467 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 468 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n"); 469 } 470 break; 471 case 1: 472 if (bamaco_support & BACO_SUPPORT) { 473 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 474 dev_info(adev->dev, "Forcing BACO for runtime pm\n"); 475 } 476 break; 477 case -1: 478 case -2: 479 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */ 480 adev->pm.rpm_mode = AMDGPU_RUNPM_PX; 481 dev_info(adev->dev, "Using ATPX for runtime pm\n"); 482 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */ 483 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; 484 dev_info(adev->dev, "Using BOCO for runtime pm\n"); 485 } else { 486 if (!bamaco_support) 487 goto no_runtime_pm; 488 489 switch (adev->asic_type) { 490 case CHIP_VEGA20: 491 case CHIP_ARCTURUS: 492 /* BACO are not supported on vega20 and arctrus */ 493 break; 494 case CHIP_VEGA10: 495 /* enable BACO as runpm mode if noretry=0 */ 496 if (!adev->gmc.noretry) 497 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 498 break; 499 default: 500 /* enable BACO as runpm mode on CI+ */ 501 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 502 break; 503 } 504 505 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { 506 if (bamaco_support & MACO_SUPPORT) { 507 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 508 dev_info(adev->dev, "Using BAMACO for runtime pm\n"); 509 } else { 510 dev_info(adev->dev, "Using BACO for runtime pm\n"); 511 } 512 } 513 } 514 break; 515 case 0: 516 dev_info(adev->dev, "runtime pm is manually disabled\n"); 517 break; 518 default: 519 break; 520 } 521 522 no_runtime_pm: 523 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) 524 dev_info(adev->dev, "Runtime PM not available\n"); 525 } 526 /** 527 * amdgpu_device_supports_smart_shift - Is the device dGPU with 528 * smart shift support 529 * 530 * @dev: drm_device pointer 531 * 532 * Returns true if the device is a dGPU with Smart Shift support, 533 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access VRAM through MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access VRAM through the VRAM aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
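 * amdgpu_device_vram_access() uses that count to decide how much of the
 * request still has to go through the register-based MM_INDEX/MM_DATA path;
 * on 32-bit kernels, or when no aperture mapping exists, 0 is returned and
 * the whole request falls back to that path.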
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any
			 * reordering after the system memory contents are sent
			 * over PCIe to the device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before
			 * issuing a read to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}
	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * Register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we
	 * assert here is that the GPU reset is not running on another thread in
	 * parallel.
	 *
	 * For this we trylock the read side of the reset semaphore. If that
	 * succeeds, we know that the reset is not running in parallel.
	 *
	 * If the trylock fails, we assert that we are either already holding the
	 * read side of the lock or are the reset thread itself and hold the
	 * write side of the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
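 *
 * How the access path is chosen (see the code below): registers inside the
 * MMIO BAR (reg * 4 < rmmio_size) are read with a direct readl(), except
 * under SR-IOV runtime where the read goes through the KIQ unless
 * AMDGPU_REGS_NO_KIQ is set in @acc_flags; anything beyond the MMIO range is
 * read through the indirect PCIE index/data pair via adev->pcie_rreg().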
703 */ 704 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 705 uint32_t reg, uint32_t acc_flags) 706 { 707 uint32_t ret; 708 709 if (amdgpu_device_skip_hw_access(adev)) 710 return 0; 711 712 if ((reg * 4) < adev->rmmio_size) { 713 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 714 amdgpu_sriov_runtime(adev) && 715 down_read_trylock(&adev->reset_domain->sem)) { 716 ret = amdgpu_kiq_rreg(adev, reg, 0); 717 up_read(&adev->reset_domain->sem); 718 } else { 719 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 720 } 721 } else { 722 ret = adev->pcie_rreg(adev, reg * 4); 723 } 724 725 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 726 727 return ret; 728 } 729 730 /* 731 * MMIO register read with bytes helper functions 732 * @offset:bytes offset from MMIO start 733 */ 734 735 /** 736 * amdgpu_mm_rreg8 - read a memory mapped IO register 737 * 738 * @adev: amdgpu_device pointer 739 * @offset: byte aligned register offset 740 * 741 * Returns the 8 bit value from the offset specified. 742 */ 743 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 744 { 745 if (amdgpu_device_skip_hw_access(adev)) 746 return 0; 747 748 if (offset < adev->rmmio_size) 749 return (readb(adev->rmmio + offset)); 750 BUG(); 751 } 752 753 754 /** 755 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 756 * 757 * @adev: amdgpu_device pointer 758 * @reg: dword aligned register offset 759 * @acc_flags: access flags which require special behavior 760 * @xcc_id: xcc accelerated compute core id 761 * 762 * Returns the 32 bit value from the offset specified. 763 */ 764 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 765 uint32_t reg, uint32_t acc_flags, 766 uint32_t xcc_id) 767 { 768 uint32_t ret, rlcg_flag; 769 770 if (amdgpu_device_skip_hw_access(adev)) 771 return 0; 772 773 if ((reg * 4) < adev->rmmio_size) { 774 if (amdgpu_sriov_vf(adev) && 775 !amdgpu_sriov_runtime(adev) && 776 adev->gfx.rlc.rlcg_reg_access_supported && 777 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 778 GC_HWIP, false, 779 &rlcg_flag)) { 780 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id)); 781 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 782 amdgpu_sriov_runtime(adev) && 783 down_read_trylock(&adev->reset_domain->sem)) { 784 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 785 up_read(&adev->reset_domain->sem); 786 } else { 787 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 788 } 789 } else { 790 ret = adev->pcie_rreg(adev, reg * 4); 791 } 792 793 return ret; 794 } 795 796 /* 797 * MMIO register write with bytes helper functions 798 * @offset:bytes offset from MMIO start 799 * @value: the value want to be written to the register 800 */ 801 802 /** 803 * amdgpu_mm_wreg8 - read a memory mapped IO register 804 * 805 * @adev: amdgpu_device pointer 806 * @offset: byte aligned register offset 807 * @value: 8 bit value to write 808 * 809 * Writes the value specified to the offset specified. 
810 */ 811 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 812 { 813 if (amdgpu_device_skip_hw_access(adev)) 814 return; 815 816 if (offset < adev->rmmio_size) 817 writeb(value, adev->rmmio + offset); 818 else 819 BUG(); 820 } 821 822 /** 823 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 824 * 825 * @adev: amdgpu_device pointer 826 * @reg: dword aligned register offset 827 * @v: 32 bit value to write to the register 828 * @acc_flags: access flags which require special behavior 829 * 830 * Writes the value specified to the offset specified. 831 */ 832 void amdgpu_device_wreg(struct amdgpu_device *adev, 833 uint32_t reg, uint32_t v, 834 uint32_t acc_flags) 835 { 836 if (amdgpu_device_skip_hw_access(adev)) 837 return; 838 839 if ((reg * 4) < adev->rmmio_size) { 840 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 841 amdgpu_sriov_runtime(adev) && 842 down_read_trylock(&adev->reset_domain->sem)) { 843 amdgpu_kiq_wreg(adev, reg, v, 0); 844 up_read(&adev->reset_domain->sem); 845 } else { 846 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 847 } 848 } else { 849 adev->pcie_wreg(adev, reg * 4, v); 850 } 851 852 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 853 } 854 855 /** 856 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 857 * 858 * @adev: amdgpu_device pointer 859 * @reg: mmio/rlc register 860 * @v: value to write 861 * @xcc_id: xcc accelerated compute core id 862 * 863 * this function is invoked only for the debugfs register access 864 */ 865 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 866 uint32_t reg, uint32_t v, 867 uint32_t xcc_id) 868 { 869 if (amdgpu_device_skip_hw_access(adev)) 870 return; 871 872 if (amdgpu_sriov_fullaccess(adev) && 873 adev->gfx.rlc.funcs && 874 adev->gfx.rlc.funcs->is_rlcg_access_range) { 875 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 876 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 877 } else if ((reg * 4) >= adev->rmmio_size) { 878 adev->pcie_wreg(adev, reg * 4, v); 879 } else { 880 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 881 } 882 } 883 884 /** 885 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 886 * 887 * @adev: amdgpu_device pointer 888 * @reg: dword aligned register offset 889 * @v: 32 bit value to write to the register 890 * @acc_flags: access flags which require special behavior 891 * @xcc_id: xcc accelerated compute core id 892 * 893 * Writes the value specified to the offset specified. 
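 *
 * Path selection mirrors amdgpu_device_xcc_rreg(): an SR-IOV VF that is not
 * in runtime mode and supports RLCG register access goes through
 * amdgpu_virt_rlcg_reg_rw(), SR-IOV runtime goes through the KIQ (unless
 * AMDGPU_REGS_NO_KIQ is set), and everything else is a direct MMIO write or,
 * for offsets beyond the MMIO BAR, an indirect write via adev->pcie_wreg().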
894 */ 895 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 896 uint32_t reg, uint32_t v, 897 uint32_t acc_flags, uint32_t xcc_id) 898 { 899 uint32_t rlcg_flag; 900 901 if (amdgpu_device_skip_hw_access(adev)) 902 return; 903 904 if ((reg * 4) < adev->rmmio_size) { 905 if (amdgpu_sriov_vf(adev) && 906 !amdgpu_sriov_runtime(adev) && 907 adev->gfx.rlc.rlcg_reg_access_supported && 908 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 909 GC_HWIP, true, 910 &rlcg_flag)) { 911 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 912 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 913 amdgpu_sriov_runtime(adev) && 914 down_read_trylock(&adev->reset_domain->sem)) { 915 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 916 up_read(&adev->reset_domain->sem); 917 } else { 918 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 919 } 920 } else { 921 adev->pcie_wreg(adev, reg * 4, v); 922 } 923 } 924 925 /** 926 * amdgpu_device_indirect_rreg - read an indirect register 927 * 928 * @adev: amdgpu_device pointer 929 * @reg_addr: indirect register address to read from 930 * 931 * Returns the value of indirect register @reg_addr 932 */ 933 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 934 u32 reg_addr) 935 { 936 unsigned long flags, pcie_index, pcie_data; 937 void __iomem *pcie_index_offset; 938 void __iomem *pcie_data_offset; 939 u32 r; 940 941 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 942 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 943 944 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 945 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 946 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 947 948 writel(reg_addr, pcie_index_offset); 949 readl(pcie_index_offset); 950 r = readl(pcie_data_offset); 951 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 952 953 return r; 954 } 955 956 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 957 u64 reg_addr) 958 { 959 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 960 u32 r; 961 void __iomem *pcie_index_offset; 962 void __iomem *pcie_index_hi_offset; 963 void __iomem *pcie_data_offset; 964 965 if (unlikely(!adev->nbio.funcs)) { 966 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 967 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 968 } else { 969 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 970 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 971 } 972 973 if (reg_addr >> 32) { 974 if (unlikely(!adev->nbio.funcs)) 975 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 976 else 977 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 978 } else { 979 pcie_index_hi = 0; 980 } 981 982 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 983 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 984 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 985 if (pcie_index_hi != 0) 986 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 987 pcie_index_hi * 4; 988 989 writel(reg_addr, pcie_index_offset); 990 readl(pcie_index_offset); 991 if (pcie_index_hi != 0) { 992 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 993 readl(pcie_index_hi_offset); 994 } 995 r = readl(pcie_data_offset); 996 997 /* clear the high bits */ 998 if (pcie_index_hi != 0) { 999 writel(0, pcie_index_hi_offset); 1000 readl(pcie_index_hi_offset); 1001 } 1002 1003 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1004 1005 return r; 1006 } 1007 1008 /** 1009 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 1010 * 
1011 * @adev: amdgpu_device pointer 1012 * @reg_addr: indirect register address to read from 1013 * 1014 * Returns the value of indirect register @reg_addr 1015 */ 1016 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1017 u32 reg_addr) 1018 { 1019 unsigned long flags, pcie_index, pcie_data; 1020 void __iomem *pcie_index_offset; 1021 void __iomem *pcie_data_offset; 1022 u64 r; 1023 1024 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1025 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1026 1027 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1028 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1029 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1030 1031 /* read low 32 bits */ 1032 writel(reg_addr, pcie_index_offset); 1033 readl(pcie_index_offset); 1034 r = readl(pcie_data_offset); 1035 /* read high 32 bits */ 1036 writel(reg_addr + 4, pcie_index_offset); 1037 readl(pcie_index_offset); 1038 r |= ((u64)readl(pcie_data_offset) << 32); 1039 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1040 1041 return r; 1042 } 1043 1044 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1045 u64 reg_addr) 1046 { 1047 unsigned long flags, pcie_index, pcie_data; 1048 unsigned long pcie_index_hi = 0; 1049 void __iomem *pcie_index_offset; 1050 void __iomem *pcie_index_hi_offset; 1051 void __iomem *pcie_data_offset; 1052 u64 r; 1053 1054 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1055 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1056 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1057 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1058 1059 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1060 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1061 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1062 if (pcie_index_hi != 0) 1063 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1064 pcie_index_hi * 4; 1065 1066 /* read low 32 bits */ 1067 writel(reg_addr, pcie_index_offset); 1068 readl(pcie_index_offset); 1069 if (pcie_index_hi != 0) { 1070 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1071 readl(pcie_index_hi_offset); 1072 } 1073 r = readl(pcie_data_offset); 1074 /* read high 32 bits */ 1075 writel(reg_addr + 4, pcie_index_offset); 1076 readl(pcie_index_offset); 1077 if (pcie_index_hi != 0) { 1078 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1079 readl(pcie_index_hi_offset); 1080 } 1081 r |= ((u64)readl(pcie_data_offset) << 32); 1082 1083 /* clear the high bits */ 1084 if (pcie_index_hi != 0) { 1085 writel(0, pcie_index_hi_offset); 1086 readl(pcie_index_hi_offset); 1087 } 1088 1089 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1090 1091 return r; 1092 } 1093 1094 /** 1095 * amdgpu_device_indirect_wreg - write an indirect register address 1096 * 1097 * @adev: amdgpu_device pointer 1098 * @reg_addr: indirect register offset 1099 * @reg_data: indirect register data 1100 * 1101 */ 1102 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1103 u32 reg_addr, u32 reg_data) 1104 { 1105 unsigned long flags, pcie_index, pcie_data; 1106 void __iomem *pcie_index_offset; 1107 void __iomem *pcie_data_offset; 1108 1109 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1110 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1111 1112 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1113 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1114 pcie_data_offset = (void 
__iomem *)adev->rmmio + pcie_data * 4; 1115 1116 writel(reg_addr, pcie_index_offset); 1117 readl(pcie_index_offset); 1118 writel(reg_data, pcie_data_offset); 1119 readl(pcie_data_offset); 1120 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1121 } 1122 1123 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1124 u64 reg_addr, u32 reg_data) 1125 { 1126 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1127 void __iomem *pcie_index_offset; 1128 void __iomem *pcie_index_hi_offset; 1129 void __iomem *pcie_data_offset; 1130 1131 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1132 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1133 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1134 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1135 else 1136 pcie_index_hi = 0; 1137 1138 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1139 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1140 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1141 if (pcie_index_hi != 0) 1142 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1143 pcie_index_hi * 4; 1144 1145 writel(reg_addr, pcie_index_offset); 1146 readl(pcie_index_offset); 1147 if (pcie_index_hi != 0) { 1148 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1149 readl(pcie_index_hi_offset); 1150 } 1151 writel(reg_data, pcie_data_offset); 1152 readl(pcie_data_offset); 1153 1154 /* clear the high bits */ 1155 if (pcie_index_hi != 0) { 1156 writel(0, pcie_index_hi_offset); 1157 readl(pcie_index_hi_offset); 1158 } 1159 1160 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1161 } 1162 1163 /** 1164 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1165 * 1166 * @adev: amdgpu_device pointer 1167 * @reg_addr: indirect register offset 1168 * @reg_data: indirect register data 1169 * 1170 */ 1171 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1172 u32 reg_addr, u64 reg_data) 1173 { 1174 unsigned long flags, pcie_index, pcie_data; 1175 void __iomem *pcie_index_offset; 1176 void __iomem *pcie_data_offset; 1177 1178 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1179 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1180 1181 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1182 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1183 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1184 1185 /* write low 32 bits */ 1186 writel(reg_addr, pcie_index_offset); 1187 readl(pcie_index_offset); 1188 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1189 readl(pcie_data_offset); 1190 /* write high 32 bits */ 1191 writel(reg_addr + 4, pcie_index_offset); 1192 readl(pcie_index_offset); 1193 writel((u32)(reg_data >> 32), pcie_data_offset); 1194 readl(pcie_data_offset); 1195 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1196 } 1197 1198 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1199 u64 reg_addr, u64 reg_data) 1200 { 1201 unsigned long flags, pcie_index, pcie_data; 1202 unsigned long pcie_index_hi = 0; 1203 void __iomem *pcie_index_offset; 1204 void __iomem *pcie_index_hi_offset; 1205 void __iomem *pcie_data_offset; 1206 1207 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1208 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1209 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1210 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1211 1212 
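	/*
	 * Same index/data protocol as the 32-bit variant above: under
	 * pcie_idx_lock, program the low index (and the high index byte when
	 * the register address has bits above 32), write the two data dwords,
	 * and finally clear the high index again.  Each writel() is followed
	 * by a readl() of the same register to flush the posted write before
	 * the next access.
	 */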
spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1213 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1214 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1215 if (pcie_index_hi != 0) 1216 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1217 pcie_index_hi * 4; 1218 1219 /* write low 32 bits */ 1220 writel(reg_addr, pcie_index_offset); 1221 readl(pcie_index_offset); 1222 if (pcie_index_hi != 0) { 1223 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1224 readl(pcie_index_hi_offset); 1225 } 1226 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1227 readl(pcie_data_offset); 1228 /* write high 32 bits */ 1229 writel(reg_addr + 4, pcie_index_offset); 1230 readl(pcie_index_offset); 1231 if (pcie_index_hi != 0) { 1232 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1233 readl(pcie_index_hi_offset); 1234 } 1235 writel((u32)(reg_data >> 32), pcie_data_offset); 1236 readl(pcie_data_offset); 1237 1238 /* clear the high bits */ 1239 if (pcie_index_hi != 0) { 1240 writel(0, pcie_index_hi_offset); 1241 readl(pcie_index_hi_offset); 1242 } 1243 1244 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1245 } 1246 1247 /** 1248 * amdgpu_device_get_rev_id - query device rev_id 1249 * 1250 * @adev: amdgpu_device pointer 1251 * 1252 * Return device rev_id 1253 */ 1254 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1255 { 1256 return adev->nbio.funcs->get_rev_id(adev); 1257 } 1258 1259 /** 1260 * amdgpu_invalid_rreg - dummy reg read function 1261 * 1262 * @adev: amdgpu_device pointer 1263 * @reg: offset of register 1264 * 1265 * Dummy register read function. Used for register blocks 1266 * that certain asics don't have (all asics). 1267 * Returns the value in the register. 1268 */ 1269 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1270 { 1271 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1272 BUG(); 1273 return 0; 1274 } 1275 1276 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1277 { 1278 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1279 BUG(); 1280 return 0; 1281 } 1282 1283 /** 1284 * amdgpu_invalid_wreg - dummy reg write function 1285 * 1286 * @adev: amdgpu_device pointer 1287 * @reg: offset of register 1288 * @v: value to write to the register 1289 * 1290 * Dummy register read function. Used for register blocks 1291 * that certain asics don't have (all asics). 1292 */ 1293 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1294 { 1295 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1296 reg, v); 1297 BUG(); 1298 } 1299 1300 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1301 { 1302 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1303 reg, v); 1304 BUG(); 1305 } 1306 1307 /** 1308 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1309 * 1310 * @adev: amdgpu_device pointer 1311 * @reg: offset of register 1312 * 1313 * Dummy register read function. Used for register blocks 1314 * that certain asics don't have (all asics). 1315 * Returns the value in the register. 
1316 */ 1317 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1318 { 1319 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1320 BUG(); 1321 return 0; 1322 } 1323 1324 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1325 { 1326 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1327 BUG(); 1328 return 0; 1329 } 1330 1331 /** 1332 * amdgpu_invalid_wreg64 - dummy reg write function 1333 * 1334 * @adev: amdgpu_device pointer 1335 * @reg: offset of register 1336 * @v: value to write to the register 1337 * 1338 * Dummy register read function. Used for register blocks 1339 * that certain asics don't have (all asics). 1340 */ 1341 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1342 { 1343 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1344 reg, v); 1345 BUG(); 1346 } 1347 1348 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1349 { 1350 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1351 reg, v); 1352 BUG(); 1353 } 1354 1355 /** 1356 * amdgpu_block_invalid_rreg - dummy reg read function 1357 * 1358 * @adev: amdgpu_device pointer 1359 * @block: offset of instance 1360 * @reg: offset of register 1361 * 1362 * Dummy register read function. Used for register blocks 1363 * that certain asics don't have (all asics). 1364 * Returns the value in the register. 1365 */ 1366 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1367 uint32_t block, uint32_t reg) 1368 { 1369 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1370 reg, block); 1371 BUG(); 1372 return 0; 1373 } 1374 1375 /** 1376 * amdgpu_block_invalid_wreg - dummy reg write function 1377 * 1378 * @adev: amdgpu_device pointer 1379 * @block: offset of instance 1380 * @reg: offset of register 1381 * @v: value to write to the register 1382 * 1383 * Dummy register read function. Used for register blocks 1384 * that certain asics don't have (all asics). 1385 */ 1386 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1387 uint32_t block, 1388 uint32_t reg, uint32_t v) 1389 { 1390 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1391 reg, block, v); 1392 BUG(); 1393 } 1394 1395 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev) 1396 { 1397 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1398 return AMDGPU_VBIOS_SKIP; 1399 1400 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev)) 1401 return AMDGPU_VBIOS_OPTIONAL; 1402 1403 return 0; 1404 } 1405 1406 /** 1407 * amdgpu_device_asic_init - Wrapper for atom asic_init 1408 * 1409 * @adev: amdgpu_device pointer 1410 * 1411 * Does any asic specific work and then calls atom asic init. 
1412 */ 1413 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1414 { 1415 uint32_t flags; 1416 bool optional; 1417 int ret; 1418 1419 amdgpu_asic_pre_asic_init(adev); 1420 flags = amdgpu_device_get_vbios_flags(adev); 1421 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 1422 1423 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1424 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1425 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1426 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1427 amdgpu_psp_wait_for_bootloader(adev); 1428 if (optional && !adev->bios) 1429 return 0; 1430 1431 ret = amdgpu_atomfirmware_asic_init(adev, true); 1432 return ret; 1433 } else { 1434 if (optional && !adev->bios) 1435 return 0; 1436 1437 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1438 } 1439 1440 return 0; 1441 } 1442 1443 /** 1444 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1445 * 1446 * @adev: amdgpu_device pointer 1447 * 1448 * Allocates a scratch page of VRAM for use by various things in the 1449 * driver. 1450 */ 1451 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1452 { 1453 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1454 AMDGPU_GEM_DOMAIN_VRAM | 1455 AMDGPU_GEM_DOMAIN_GTT, 1456 &adev->mem_scratch.robj, 1457 &adev->mem_scratch.gpu_addr, 1458 (void **)&adev->mem_scratch.ptr); 1459 } 1460 1461 /** 1462 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1463 * 1464 * @adev: amdgpu_device pointer 1465 * 1466 * Frees the VRAM scratch page. 1467 */ 1468 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1469 { 1470 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1471 } 1472 1473 /** 1474 * amdgpu_device_program_register_sequence - program an array of registers. 1475 * 1476 * @adev: amdgpu_device pointer 1477 * @registers: pointer to the register array 1478 * @array_size: size of the register array 1479 * 1480 * Programs an array or registers with and or masks. 1481 * This is a helper for setting golden registers. 1482 */ 1483 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1484 const u32 *registers, 1485 const u32 array_size) 1486 { 1487 u32 tmp, reg, and_mask, or_mask; 1488 int i; 1489 1490 if (array_size % 3) 1491 return; 1492 1493 for (i = 0; i < array_size; i += 3) { 1494 reg = registers[i + 0]; 1495 and_mask = registers[i + 1]; 1496 or_mask = registers[i + 2]; 1497 1498 if (and_mask == 0xffffffff) { 1499 tmp = or_mask; 1500 } else { 1501 tmp = RREG32(reg); 1502 tmp &= ~and_mask; 1503 if (adev->family >= AMDGPU_FAMILY_AI) 1504 tmp |= (or_mask & and_mask); 1505 else 1506 tmp |= or_mask; 1507 } 1508 WREG32(reg, tmp); 1509 } 1510 } 1511 1512 /** 1513 * amdgpu_device_pci_config_reset - reset the GPU 1514 * 1515 * @adev: amdgpu_device pointer 1516 * 1517 * Resets the GPU using the pci config reset sequence. 1518 * Only applicable to asics prior to vega10. 1519 */ 1520 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1521 { 1522 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1523 } 1524 1525 /** 1526 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1527 * 1528 * @adev: amdgpu_device pointer 1529 * 1530 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 
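 * This simply defers to pci_reset_function(), so whichever function-level
 * reset method the PCI core has determined to be available is the one used.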
1531 */ 1532 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1533 { 1534 return pci_reset_function(adev->pdev); 1535 } 1536 1537 /* 1538 * amdgpu_device_wb_*() 1539 * Writeback is the method by which the GPU updates special pages in memory 1540 * with the status of certain GPU events (fences, ring pointers,etc.). 1541 */ 1542 1543 /** 1544 * amdgpu_device_wb_fini - Disable Writeback and free memory 1545 * 1546 * @adev: amdgpu_device pointer 1547 * 1548 * Disables Writeback and frees the Writeback memory (all asics). 1549 * Used at driver shutdown. 1550 */ 1551 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1552 { 1553 if (adev->wb.wb_obj) { 1554 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1555 &adev->wb.gpu_addr, 1556 (void **)&adev->wb.wb); 1557 adev->wb.wb_obj = NULL; 1558 } 1559 } 1560 1561 /** 1562 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1563 * 1564 * @adev: amdgpu_device pointer 1565 * 1566 * Initializes writeback and allocates writeback memory (all asics). 1567 * Used at driver startup. 1568 * Returns 0 on success or an -error on failure. 1569 */ 1570 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1571 { 1572 int r; 1573 1574 if (adev->wb.wb_obj == NULL) { 1575 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1576 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1577 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1578 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1579 (void **)&adev->wb.wb); 1580 if (r) { 1581 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1582 return r; 1583 } 1584 1585 adev->wb.num_wb = AMDGPU_MAX_WB; 1586 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1587 1588 /* clear wb memory */ 1589 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1590 } 1591 1592 return 0; 1593 } 1594 1595 /** 1596 * amdgpu_device_wb_get - Allocate a wb entry 1597 * 1598 * @adev: amdgpu_device pointer 1599 * @wb: wb index 1600 * 1601 * Allocate a wb slot for use by the driver (all asics). 1602 * Returns 0 on success or -EINVAL on failure. 1603 */ 1604 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1605 { 1606 unsigned long flags, offset; 1607 1608 spin_lock_irqsave(&adev->wb.lock, flags); 1609 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1610 if (offset < adev->wb.num_wb) { 1611 __set_bit(offset, adev->wb.used); 1612 spin_unlock_irqrestore(&adev->wb.lock, flags); 1613 *wb = offset << 3; /* convert to dw offset */ 1614 return 0; 1615 } else { 1616 spin_unlock_irqrestore(&adev->wb.lock, flags); 1617 return -EINVAL; 1618 } 1619 } 1620 1621 /** 1622 * amdgpu_device_wb_free - Free a wb entry 1623 * 1624 * @adev: amdgpu_device pointer 1625 * @wb: wb index 1626 * 1627 * Free a wb slot allocated for use by the driver (all asics) 1628 */ 1629 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1630 { 1631 unsigned long flags; 1632 1633 wb >>= 3; 1634 spin_lock_irqsave(&adev->wb.lock, flags); 1635 if (wb < adev->wb.num_wb) 1636 __clear_bit(wb, adev->wb.used); 1637 spin_unlock_irqrestore(&adev->wb.lock, flags); 1638 } 1639 1640 /** 1641 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1642 * 1643 * @adev: amdgpu_device pointer 1644 * 1645 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1646 * to fail, but if any of the BARs is not accessible after the size we abort 1647 * driver loading by returning -ENODEV. 
1648 */ 1649 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1650 { 1651 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1652 struct pci_bus *root; 1653 struct resource *res; 1654 unsigned int i; 1655 u16 cmd; 1656 int r; 1657 1658 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1659 return 0; 1660 1661 /* Bypass for VF */ 1662 if (amdgpu_sriov_vf(adev)) 1663 return 0; 1664 1665 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1666 if ((amdgpu_runtime_pm != 0) && 1667 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1668 adev->pdev->device == 0x731f && 1669 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1670 return 0; 1671 1672 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1673 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1674 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1675 1676 /* skip if the bios has already enabled large BAR */ 1677 if (adev->gmc.real_vram_size && 1678 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1679 return 0; 1680 1681 /* Check if the root BUS has 64bit memory resources */ 1682 root = adev->pdev->bus; 1683 while (root->parent) 1684 root = root->parent; 1685 1686 pci_bus_for_each_resource(root, res, i) { 1687 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1688 res->start > 0x100000000ull) 1689 break; 1690 } 1691 1692 /* Trying to resize is pointless without a root hub window above 4GB */ 1693 if (!res) 1694 return 0; 1695 1696 /* Limit the BAR size to what is available */ 1697 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1698 rbar_size); 1699 1700 /* Disable memory decoding while we change the BAR addresses and size */ 1701 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1702 pci_write_config_word(adev->pdev, PCI_COMMAND, 1703 cmd & ~PCI_COMMAND_MEMORY); 1704 1705 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1706 amdgpu_doorbell_fini(adev); 1707 if (adev->asic_type >= CHIP_BONAIRE) 1708 pci_release_resource(adev->pdev, 2); 1709 1710 pci_release_resource(adev->pdev, 0); 1711 1712 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1713 if (r == -ENOSPC) 1714 DRM_INFO("Not enough PCI address space for a large BAR."); 1715 else if (r && r != -ENOTSUPP) 1716 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1717 1718 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1719 1720 /* When the doorbell or fb BAR isn't available we have no chance of 1721 * using the device. 1722 */ 1723 r = amdgpu_doorbell_init(adev); 1724 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1725 return -ENODEV; 1726 1727 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1728 1729 return 0; 1730 } 1731 1732 /* 1733 * GPU helpers function. 1734 */ 1735 /** 1736 * amdgpu_device_need_post - check if the hw need post or not 1737 * 1738 * @adev: amdgpu_device pointer 1739 * 1740 * Check if the asic has been initialized (all asics) at driver startup 1741 * or post is needed if hw reset is performed. 1742 * Returns true if need or false if not. 
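 *
 * As implemented below, SR-IOV VFs and parts whose vBIOS flags include
 * AMDGPU_VBIOS_SKIP never need a post; CIK and newer consult the ATOM BIOS
 * scratch registers, while older ASICs fall back to checking whether
 * CONFIG_MEMSIZE looks programmed.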
1743 */ 1744 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1745 { 1746 uint32_t reg, flags; 1747 1748 if (amdgpu_sriov_vf(adev)) 1749 return false; 1750 1751 flags = amdgpu_device_get_vbios_flags(adev); 1752 if (flags & AMDGPU_VBIOS_SKIP) 1753 return false; 1754 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1755 return false; 1756 1757 if (amdgpu_passthrough(adev)) { 1758 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1759 * some old smc fw still need driver do vPost otherwise gpu hang, while 1760 * those smc fw version above 22.15 doesn't have this flaw, so we force 1761 * vpost executed for smc version below 22.15 1762 */ 1763 if (adev->asic_type == CHIP_FIJI) { 1764 int err; 1765 uint32_t fw_ver; 1766 1767 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1768 /* force vPost if error occurred */ 1769 if (err) 1770 return true; 1771 1772 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1773 release_firmware(adev->pm.fw); 1774 if (fw_ver < 0x00160e00) 1775 return true; 1776 } 1777 } 1778 1779 /* Don't post if we need to reset whole hive on init */ 1780 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1781 return false; 1782 1783 if (adev->has_hw_reset) { 1784 adev->has_hw_reset = false; 1785 return true; 1786 } 1787 1788 /* bios scratch used on CIK+ */ 1789 if (adev->asic_type >= CHIP_BONAIRE) 1790 return amdgpu_atombios_scratch_need_asic_init(adev); 1791 1792 /* check MEM_SIZE for older asics */ 1793 reg = amdgpu_asic_get_config_memsize(adev); 1794 1795 if ((reg != 0) && (reg != 0xffffffff)) 1796 return false; 1797 1798 return true; 1799 } 1800 1801 /* 1802 * Check whether seamless boot is supported. 1803 * 1804 * So far we only support seamless boot on DCE 3.0 or later. 1805 * If users report that it works on older ASICS as well, we may 1806 * loosen this. 1807 */ 1808 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1809 { 1810 switch (amdgpu_seamless) { 1811 case -1: 1812 break; 1813 case 1: 1814 return true; 1815 case 0: 1816 return false; 1817 default: 1818 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1819 amdgpu_seamless); 1820 return false; 1821 } 1822 1823 if (!(adev->flags & AMD_IS_APU)) 1824 return false; 1825 1826 if (adev->mman.keep_stolen_vga_memory) 1827 return false; 1828 1829 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1830 } 1831 1832 /* 1833 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1834 * don't support dynamic speed switching. Until we have confirmation from Intel 1835 * that a specific host supports it, it's safer that we keep it disabled for all. 
1836 * 1837 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1838 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1839 */ 1840 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1841 { 1842 #if IS_ENABLED(CONFIG_X86) 1843 struct cpuinfo_x86 *c = &cpu_data(0); 1844 1845 /* eGPU change speeds based on USB4 fabric conditions */ 1846 if (dev_is_removable(adev->dev)) 1847 return true; 1848 1849 if (c->x86_vendor == X86_VENDOR_INTEL) 1850 return false; 1851 #endif 1852 return true; 1853 } 1854 1855 /** 1856 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1857 * 1858 * @adev: amdgpu_device pointer 1859 * 1860 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1861 * be set for this device. 1862 * 1863 * Returns true if it should be used or false if not. 1864 */ 1865 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1866 { 1867 switch (amdgpu_aspm) { 1868 case -1: 1869 break; 1870 case 0: 1871 return false; 1872 case 1: 1873 return true; 1874 default: 1875 return false; 1876 } 1877 if (adev->flags & AMD_IS_APU) 1878 return false; 1879 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1880 return false; 1881 return pcie_aspm_enabled(adev->pdev); 1882 } 1883 1884 /* if we get transitioned to only one device, take VGA back */ 1885 /** 1886 * amdgpu_device_vga_set_decode - enable/disable vga decode 1887 * 1888 * @pdev: PCI device pointer 1889 * @state: enable/disable vga decode 1890 * 1891 * Enable/disable vga decode (all asics). 1892 * Returns VGA resource flags. 1893 */ 1894 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1895 bool state) 1896 { 1897 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1898 1899 amdgpu_asic_set_vga_state(adev, state); 1900 if (state) 1901 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1902 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1903 else 1904 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1905 } 1906 1907 /** 1908 * amdgpu_device_check_block_size - validate the vm block size 1909 * 1910 * @adev: amdgpu_device pointer 1911 * 1912 * Validates the vm block size specified via module parameter. 1913 * The vm block size defines number of bits in page table versus page directory, 1914 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1915 * page table and the remaining bits are in the page directory. 1916 */ 1917 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1918 { 1919 /* defines number of bits in page table versus page directory, 1920 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1921 * page table and the remaining bits are in the page directory 1922 */ 1923 if (amdgpu_vm_block_size == -1) 1924 return; 1925 1926 if (amdgpu_vm_block_size < 9) { 1927 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1928 amdgpu_vm_block_size); 1929 amdgpu_vm_block_size = -1; 1930 } 1931 } 1932 1933 /** 1934 * amdgpu_device_check_vm_size - validate the vm size 1935 * 1936 * @adev: amdgpu_device pointer 1937 * 1938 * Validates the vm size in GB specified via module parameter. 1939 * The VM size is the size of the GPU virtual memory space in GB. 
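 * For example, booting with amdgpu.vm_size=256 requests a 256 GB per-VM
 * address space; values below 1 GB are rejected below and the driver
 * default (-1) is restored.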
1940 */ 1941 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1942 { 1943 /* no need to check the default value */ 1944 if (amdgpu_vm_size == -1) 1945 return; 1946 1947 if (amdgpu_vm_size < 1) { 1948 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1949 amdgpu_vm_size); 1950 amdgpu_vm_size = -1; 1951 } 1952 } 1953 1954 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1955 { 1956 struct sysinfo si; 1957 bool is_os_64 = (sizeof(void *) == 8); 1958 uint64_t total_memory; 1959 uint64_t dram_size_seven_GB = 0x1B8000000; 1960 uint64_t dram_size_three_GB = 0xB8000000; 1961 1962 if (amdgpu_smu_memory_pool_size == 0) 1963 return; 1964 1965 if (!is_os_64) { 1966 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1967 goto def_value; 1968 } 1969 si_meminfo(&si); 1970 total_memory = (uint64_t)si.totalram * si.mem_unit; 1971 1972 if ((amdgpu_smu_memory_pool_size == 1) || 1973 (amdgpu_smu_memory_pool_size == 2)) { 1974 if (total_memory < dram_size_three_GB) 1975 goto def_value1; 1976 } else if ((amdgpu_smu_memory_pool_size == 4) || 1977 (amdgpu_smu_memory_pool_size == 8)) { 1978 if (total_memory < dram_size_seven_GB) 1979 goto def_value1; 1980 } else { 1981 DRM_WARN("Smu memory pool size not supported\n"); 1982 goto def_value; 1983 } 1984 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1985 1986 return; 1987 1988 def_value1: 1989 DRM_WARN("No enough system memory\n"); 1990 def_value: 1991 adev->pm.smu_prv_buffer_size = 0; 1992 } 1993 1994 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1995 { 1996 if (!(adev->flags & AMD_IS_APU) || 1997 adev->asic_type < CHIP_RAVEN) 1998 return 0; 1999 2000 switch (adev->asic_type) { 2001 case CHIP_RAVEN: 2002 if (adev->pdev->device == 0x15dd) 2003 adev->apu_flags |= AMD_APU_IS_RAVEN; 2004 if (adev->pdev->device == 0x15d8) 2005 adev->apu_flags |= AMD_APU_IS_PICASSO; 2006 break; 2007 case CHIP_RENOIR: 2008 if ((adev->pdev->device == 0x1636) || 2009 (adev->pdev->device == 0x164c)) 2010 adev->apu_flags |= AMD_APU_IS_RENOIR; 2011 else 2012 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2013 break; 2014 case CHIP_VANGOGH: 2015 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2016 break; 2017 case CHIP_YELLOW_CARP: 2018 break; 2019 case CHIP_CYAN_SKILLFISH: 2020 if ((adev->pdev->device == 0x13FE) || 2021 (adev->pdev->device == 0x143F)) 2022 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2023 break; 2024 default: 2025 break; 2026 } 2027 2028 return 0; 2029 } 2030 2031 /** 2032 * amdgpu_device_check_arguments - validate module params 2033 * 2034 * @adev: amdgpu_device pointer 2035 * 2036 * Validates certain module parameters and updates 2037 * the associated values used by the driver (all asics). 
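 *
 * For example (illustrative), amdgpu.sched_jobs=2 is below the minimum and is
 * raised to 4, while amdgpu.sched_jobs=6 is not a power of two and is rounded
 * up to 8 by the checks below.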
2038 */ 2039 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2040 { 2041 int i; 2042 2043 if (amdgpu_sched_jobs < 4) { 2044 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2045 amdgpu_sched_jobs); 2046 amdgpu_sched_jobs = 4; 2047 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2048 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2049 amdgpu_sched_jobs); 2050 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2051 } 2052 2053 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2054 /* gart size must be greater or equal to 32M */ 2055 dev_warn(adev->dev, "gart size (%d) too small\n", 2056 amdgpu_gart_size); 2057 amdgpu_gart_size = -1; 2058 } 2059 2060 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2061 /* gtt size must be greater or equal to 32M */ 2062 dev_warn(adev->dev, "gtt size (%d) too small\n", 2063 amdgpu_gtt_size); 2064 amdgpu_gtt_size = -1; 2065 } 2066 2067 /* valid range is between 4 and 9 inclusive */ 2068 if (amdgpu_vm_fragment_size != -1 && 2069 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2070 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2071 amdgpu_vm_fragment_size = -1; 2072 } 2073 2074 if (amdgpu_sched_hw_submission < 2) { 2075 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2076 amdgpu_sched_hw_submission); 2077 amdgpu_sched_hw_submission = 2; 2078 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2079 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2080 amdgpu_sched_hw_submission); 2081 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2082 } 2083 2084 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2085 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2086 amdgpu_reset_method = -1; 2087 } 2088 2089 amdgpu_device_check_smu_prv_buffer_size(adev); 2090 2091 amdgpu_device_check_vm_size(adev); 2092 2093 amdgpu_device_check_block_size(adev); 2094 2095 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2096 2097 for (i = 0; i < MAX_XCP; i++) 2098 adev->enforce_isolation[i] = !!enforce_isolation; 2099 2100 return 0; 2101 } 2102 2103 /** 2104 * amdgpu_switcheroo_set_state - set switcheroo state 2105 * 2106 * @pdev: pci dev pointer 2107 * @state: vga_switcheroo state 2108 * 2109 * Callback for the switcheroo driver. Suspends or resumes 2110 * the asics before or after it is powered up using ACPI methods. 
2111 */ 2112 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2113 enum vga_switcheroo_state state) 2114 { 2115 struct drm_device *dev = pci_get_drvdata(pdev); 2116 int r; 2117 2118 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2119 return; 2120 2121 if (state == VGA_SWITCHEROO_ON) { 2122 pr_info("switched on\n"); 2123 /* don't suspend or resume card normally */ 2124 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2125 2126 pci_set_power_state(pdev, PCI_D0); 2127 amdgpu_device_load_pci_state(pdev); 2128 r = pci_enable_device(pdev); 2129 if (r) 2130 DRM_WARN("pci_enable_device failed (%d)\n", r); 2131 amdgpu_device_resume(dev, true); 2132 2133 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2134 } else { 2135 pr_info("switched off\n"); 2136 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2137 amdgpu_device_prepare(dev); 2138 amdgpu_device_suspend(dev, true); 2139 amdgpu_device_cache_pci_state(pdev); 2140 /* Shut down the device */ 2141 pci_disable_device(pdev); 2142 pci_set_power_state(pdev, PCI_D3cold); 2143 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2144 } 2145 } 2146 2147 /** 2148 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2149 * 2150 * @pdev: pci dev pointer 2151 * 2152 * Callback for the switcheroo driver. Check of the switcheroo 2153 * state can be changed. 2154 * Returns true if the state can be changed, false if not. 2155 */ 2156 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2157 { 2158 struct drm_device *dev = pci_get_drvdata(pdev); 2159 2160 /* 2161 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2162 * locking inversion with the driver load path. And the access here is 2163 * completely racy anyway. So don't bother with locking for now. 2164 */ 2165 return atomic_read(&dev->open_count) == 0; 2166 } 2167 2168 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2169 .set_gpu_state = amdgpu_switcheroo_set_state, 2170 .reprobe = NULL, 2171 .can_switch = amdgpu_switcheroo_can_switch, 2172 }; 2173 2174 /** 2175 * amdgpu_device_ip_set_clockgating_state - set the CG state 2176 * 2177 * @dev: amdgpu_device pointer 2178 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2179 * @state: clockgating state (gate or ungate) 2180 * 2181 * Sets the requested clockgating state for all instances of 2182 * the hardware IP specified. 2183 * Returns the error code from the last instance. 2184 */ 2185 int amdgpu_device_ip_set_clockgating_state(void *dev, 2186 enum amd_ip_block_type block_type, 2187 enum amd_clockgating_state state) 2188 { 2189 struct amdgpu_device *adev = dev; 2190 int i, r = 0; 2191 2192 for (i = 0; i < adev->num_ip_blocks; i++) { 2193 if (!adev->ip_blocks[i].status.valid) 2194 continue; 2195 if (adev->ip_blocks[i].version->type != block_type) 2196 continue; 2197 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2198 continue; 2199 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2200 &adev->ip_blocks[i], state); 2201 if (r) 2202 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2203 adev->ip_blocks[i].version->funcs->name, r); 2204 } 2205 return r; 2206 } 2207 2208 /** 2209 * amdgpu_device_ip_set_powergating_state - set the PG state 2210 * 2211 * @dev: amdgpu_device pointer 2212 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2213 * @state: powergating state (gate or ungate) 2214 * 2215 * Sets the requested powergating state for all instances of 2216 * the hardware IP specified. 
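 *
 * Illustrative usage (a sketch, not a new call site in this function): gate
 * VCN power when the engine goes idle with
 * amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                         AMD_PG_STATE_GATE);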
2217 * Returns the error code from the last instance. 2218 */ 2219 int amdgpu_device_ip_set_powergating_state(void *dev, 2220 enum amd_ip_block_type block_type, 2221 enum amd_powergating_state state) 2222 { 2223 struct amdgpu_device *adev = dev; 2224 int i, r = 0; 2225 2226 for (i = 0; i < adev->num_ip_blocks; i++) { 2227 if (!adev->ip_blocks[i].status.valid) 2228 continue; 2229 if (adev->ip_blocks[i].version->type != block_type) 2230 continue; 2231 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2232 continue; 2233 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2234 &adev->ip_blocks[i], state); 2235 if (r) 2236 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2237 adev->ip_blocks[i].version->funcs->name, r); 2238 } 2239 return r; 2240 } 2241 2242 /** 2243 * amdgpu_device_ip_get_clockgating_state - get the CG state 2244 * 2245 * @adev: amdgpu_device pointer 2246 * @flags: clockgating feature flags 2247 * 2248 * Walks the list of IPs on the device and updates the clockgating 2249 * flags for each IP. 2250 * Updates @flags with the feature flags for each hardware IP where 2251 * clockgating is enabled. 2252 */ 2253 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2254 u64 *flags) 2255 { 2256 int i; 2257 2258 for (i = 0; i < adev->num_ip_blocks; i++) { 2259 if (!adev->ip_blocks[i].status.valid) 2260 continue; 2261 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2262 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2263 &adev->ip_blocks[i], flags); 2264 } 2265 } 2266 2267 /** 2268 * amdgpu_device_ip_wait_for_idle - wait for idle 2269 * 2270 * @adev: amdgpu_device pointer 2271 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2272 * 2273 * Waits for the requested hardware IP to be idle. 2274 * Returns 0 for success or a negative error code on failure. 2275 */ 2276 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2277 enum amd_ip_block_type block_type) 2278 { 2279 int i, r; 2280 2281 for (i = 0; i < adev->num_ip_blocks; i++) { 2282 if (!adev->ip_blocks[i].status.valid) 2283 continue; 2284 if (adev->ip_blocks[i].version->type == block_type) { 2285 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2286 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2287 &adev->ip_blocks[i]); 2288 if (r) 2289 return r; 2290 } 2291 break; 2292 } 2293 } 2294 return 0; 2295 2296 } 2297 2298 /** 2299 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2300 * 2301 * @adev: amdgpu_device pointer 2302 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2303 * 2304 * Check if the hardware IP is enabled or not. 2305 * Returns true if the IP is enabled, false if not. 2306 */ 2307 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2308 enum amd_ip_block_type block_type) 2309 { 2310 int i; 2311 2312 for (i = 0; i < adev->num_ip_blocks; i++) { 2313 if (adev->ip_blocks[i].version->type == block_type) 2314 return adev->ip_blocks[i].status.valid; 2315 } 2316 return false; 2317 2318 } 2319 2320 /** 2321 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2322 * 2323 * @adev: amdgpu_device pointer 2324 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2325 * 2326 * Returns a pointer to the hardware IP block structure 2327 * if it exists for the asic, otherwise NULL.
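 *
 * Illustrative usage, mirroring the GFX lookup in amdgpu_device_ip_early_init()
 * below (the NULL check is an addition for the sketch):
 *   ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->status.valid)
 *           amdgpu_amdkfd_device_probe(adev);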
2328 */ 2329 struct amdgpu_ip_block * 2330 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2331 enum amd_ip_block_type type) 2332 { 2333 int i; 2334 2335 for (i = 0; i < adev->num_ip_blocks; i++) 2336 if (adev->ip_blocks[i].version->type == type) 2337 return &adev->ip_blocks[i]; 2338 2339 return NULL; 2340 } 2341 2342 /** 2343 * amdgpu_device_ip_block_version_cmp 2344 * 2345 * @adev: amdgpu_device pointer 2346 * @type: enum amd_ip_block_type 2347 * @major: major version 2348 * @minor: minor version 2349 * 2350 * return 0 if equal or greater 2351 * return 1 if smaller or the ip_block doesn't exist 2352 */ 2353 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2354 enum amd_ip_block_type type, 2355 u32 major, u32 minor) 2356 { 2357 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2358 2359 if (ip_block && ((ip_block->version->major > major) || 2360 ((ip_block->version->major == major) && 2361 (ip_block->version->minor >= minor)))) 2362 return 0; 2363 2364 return 1; 2365 } 2366 2367 /** 2368 * amdgpu_device_ip_block_add 2369 * 2370 * @adev: amdgpu_device pointer 2371 * @ip_block_version: pointer to the IP to add 2372 * 2373 * Adds the IP block driver information to the collection of IPs 2374 * on the asic. 2375 */ 2376 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2377 const struct amdgpu_ip_block_version *ip_block_version) 2378 { 2379 if (!ip_block_version) 2380 return -EINVAL; 2381 2382 switch (ip_block_version->type) { 2383 case AMD_IP_BLOCK_TYPE_VCN: 2384 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2385 return 0; 2386 break; 2387 case AMD_IP_BLOCK_TYPE_JPEG: 2388 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2389 return 0; 2390 break; 2391 default: 2392 break; 2393 } 2394 2395 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2396 adev->num_ip_blocks, ip_block_version->funcs->name); 2397 2398 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2399 2400 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2401 2402 return 0; 2403 } 2404 2405 /** 2406 * amdgpu_device_enable_virtual_display - enable virtual display feature 2407 * 2408 * @adev: amdgpu_device pointer 2409 * 2410 * Enabled the virtual display feature if the user has enabled it via 2411 * the module parameter virtual_display. This feature provides a virtual 2412 * display hardware on headless boards or in virtualized environments. 2413 * This function parses and validates the configuration string specified by 2414 * the user and configures the virtual display configuration (number of 2415 * virtual connectors, crtcs, etc.) specified. 
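 *
 * The string is a semicolon-separated list of <pci address>[,<crtc count>]
 * entries; for example (illustrative) amdgpu.virtual_display=0000:04:00.0,2
 * enables two virtual crtcs on that device, and "all" matches every device.
 * The crtc count is clamped to the 1..6 range by the parser below.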
2416 */ 2417 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2418 { 2419 adev->enable_virtual_display = false; 2420 2421 if (amdgpu_virtual_display) { 2422 const char *pci_address_name = pci_name(adev->pdev); 2423 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2424 2425 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2426 pciaddstr_tmp = pciaddstr; 2427 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2428 pciaddname = strsep(&pciaddname_tmp, ","); 2429 if (!strcmp("all", pciaddname) 2430 || !strcmp(pci_address_name, pciaddname)) { 2431 long num_crtc; 2432 int res = -1; 2433 2434 adev->enable_virtual_display = true; 2435 2436 if (pciaddname_tmp) 2437 res = kstrtol(pciaddname_tmp, 10, 2438 &num_crtc); 2439 2440 if (!res) { 2441 if (num_crtc < 1) 2442 num_crtc = 1; 2443 if (num_crtc > 6) 2444 num_crtc = 6; 2445 adev->mode_info.num_crtc = num_crtc; 2446 } else { 2447 adev->mode_info.num_crtc = 1; 2448 } 2449 break; 2450 } 2451 } 2452 2453 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2454 amdgpu_virtual_display, pci_address_name, 2455 adev->enable_virtual_display, adev->mode_info.num_crtc); 2456 2457 kfree(pciaddstr); 2458 } 2459 } 2460 2461 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2462 { 2463 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2464 adev->mode_info.num_crtc = 1; 2465 adev->enable_virtual_display = true; 2466 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2467 adev->enable_virtual_display, adev->mode_info.num_crtc); 2468 } 2469 } 2470 2471 /** 2472 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2473 * 2474 * @adev: amdgpu_device pointer 2475 * 2476 * Parses the asic configuration parameters specified in the gpu info 2477 * firmware and makes them available to the driver for use in configuring 2478 * the asic. 2479 * Returns 0 on success, -EINVAL on failure. 2480 */ 2481 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2482 { 2483 const char *chip_name; 2484 int err; 2485 const struct gpu_info_firmware_header_v1_0 *hdr; 2486 2487 adev->firmware.gpu_info_fw = NULL; 2488 2489 if (adev->mman.discovery_bin) 2490 return 0; 2491 2492 switch (adev->asic_type) { 2493 default: 2494 return 0; 2495 case CHIP_VEGA10: 2496 chip_name = "vega10"; 2497 break; 2498 case CHIP_VEGA12: 2499 chip_name = "vega12"; 2500 break; 2501 case CHIP_RAVEN: 2502 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2503 chip_name = "raven2"; 2504 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2505 chip_name = "picasso"; 2506 else 2507 chip_name = "raven"; 2508 break; 2509 case CHIP_ARCTURUS: 2510 chip_name = "arcturus"; 2511 break; 2512 case CHIP_NAVI12: 2513 chip_name = "navi12"; 2514 break; 2515 } 2516 2517 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2518 AMDGPU_UCODE_OPTIONAL, 2519 "amdgpu/%s_gpu_info.bin", chip_name); 2520 if (err) { 2521 dev_err(adev->dev, 2522 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2523 chip_name); 2524 goto out; 2525 } 2526 2527 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2528 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2529 2530 switch (hdr->version_major) { 2531 case 1: 2532 { 2533 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2534 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2535 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2536 2537 /* 2538 * Should be dropped when DAL no longer needs it. 
2539 */ 2540 if (adev->asic_type == CHIP_NAVI12) 2541 goto parse_soc_bounding_box; 2542 2543 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2544 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2545 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2546 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2547 adev->gfx.config.max_texture_channel_caches = 2548 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2549 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2550 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2551 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2552 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2553 adev->gfx.config.double_offchip_lds_buf = 2554 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2555 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2556 adev->gfx.cu_info.max_waves_per_simd = 2557 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2558 adev->gfx.cu_info.max_scratch_slots_per_cu = 2559 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2560 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2561 if (hdr->version_minor >= 1) { 2562 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2563 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2564 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2565 adev->gfx.config.num_sc_per_sh = 2566 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2567 adev->gfx.config.num_packer_per_sc = 2568 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2569 } 2570 2571 parse_soc_bounding_box: 2572 /* 2573 * soc bounding box info is not integrated into the discovery table, 2574 * so we always need to parse it from the gpu info firmware if needed. 2575 */ 2576 if (hdr->version_minor == 2) { 2577 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2578 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2579 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2580 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2581 } 2582 break; 2583 } 2584 default: 2585 dev_err(adev->dev, 2586 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2587 err = -EINVAL; 2588 goto out; 2589 } 2590 out: 2591 return err; 2592 } 2593 2594 /** 2595 * amdgpu_device_ip_early_init - run early init for hardware IPs 2596 * 2597 * @adev: amdgpu_device pointer 2598 * 2599 * Early initialization pass for hardware IPs. The hardware IPs that make 2600 * up each asic are discovered and each IP's early_init callback is run. This 2601 * is the first stage in initializing the asic. 2602 * Returns 0 on success, negative error code on failure.
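 *
 * Individual IPs can be skipped via the amdgpu.ip_block_mask module parameter:
 * bit i of the mask corresponds to adev->ip_blocks[i], and a cleared bit marks
 * that block invalid so none of its later callbacks run (see the mask check in
 * this function).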
2603 */ 2604 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2605 { 2606 struct amdgpu_ip_block *ip_block; 2607 struct pci_dev *parent; 2608 bool total, skip_bios; 2609 uint32_t bios_flags; 2610 int i, r; 2611 2612 amdgpu_device_enable_virtual_display(adev); 2613 2614 if (amdgpu_sriov_vf(adev)) { 2615 r = amdgpu_virt_request_full_gpu(adev, true); 2616 if (r) 2617 return r; 2618 } 2619 2620 switch (adev->asic_type) { 2621 #ifdef CONFIG_DRM_AMDGPU_SI 2622 case CHIP_VERDE: 2623 case CHIP_TAHITI: 2624 case CHIP_PITCAIRN: 2625 case CHIP_OLAND: 2626 case CHIP_HAINAN: 2627 adev->family = AMDGPU_FAMILY_SI; 2628 r = si_set_ip_blocks(adev); 2629 if (r) 2630 return r; 2631 break; 2632 #endif 2633 #ifdef CONFIG_DRM_AMDGPU_CIK 2634 case CHIP_BONAIRE: 2635 case CHIP_HAWAII: 2636 case CHIP_KAVERI: 2637 case CHIP_KABINI: 2638 case CHIP_MULLINS: 2639 if (adev->flags & AMD_IS_APU) 2640 adev->family = AMDGPU_FAMILY_KV; 2641 else 2642 adev->family = AMDGPU_FAMILY_CI; 2643 2644 r = cik_set_ip_blocks(adev); 2645 if (r) 2646 return r; 2647 break; 2648 #endif 2649 case CHIP_TOPAZ: 2650 case CHIP_TONGA: 2651 case CHIP_FIJI: 2652 case CHIP_POLARIS10: 2653 case CHIP_POLARIS11: 2654 case CHIP_POLARIS12: 2655 case CHIP_VEGAM: 2656 case CHIP_CARRIZO: 2657 case CHIP_STONEY: 2658 if (adev->flags & AMD_IS_APU) 2659 adev->family = AMDGPU_FAMILY_CZ; 2660 else 2661 adev->family = AMDGPU_FAMILY_VI; 2662 2663 r = vi_set_ip_blocks(adev); 2664 if (r) 2665 return r; 2666 break; 2667 default: 2668 r = amdgpu_discovery_set_ip_blocks(adev); 2669 if (r) 2670 return r; 2671 break; 2672 } 2673 2674 if (amdgpu_has_atpx() && 2675 (amdgpu_is_atpx_hybrid() || 2676 amdgpu_has_atpx_dgpu_power_cntl()) && 2677 ((adev->flags & AMD_IS_APU) == 0) && 2678 !dev_is_removable(&adev->pdev->dev)) 2679 adev->flags |= AMD_IS_PX; 2680 2681 if (!(adev->flags & AMD_IS_APU)) { 2682 parent = pcie_find_root_port(adev->pdev); 2683 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2684 } 2685 2686 2687 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2688 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2689 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2690 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2691 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2692 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2693 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2694 2695 total = true; 2696 for (i = 0; i < adev->num_ip_blocks; i++) { 2697 ip_block = &adev->ip_blocks[i]; 2698 2699 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2700 DRM_WARN("disabled ip block: %d <%s>\n", 2701 i, adev->ip_blocks[i].version->funcs->name); 2702 adev->ip_blocks[i].status.valid = false; 2703 } else if (ip_block->version->funcs->early_init) { 2704 r = ip_block->version->funcs->early_init(ip_block); 2705 if (r == -ENOENT) { 2706 adev->ip_blocks[i].status.valid = false; 2707 } else if (r) { 2708 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2709 adev->ip_blocks[i].version->funcs->name, r); 2710 total = false; 2711 } else { 2712 adev->ip_blocks[i].status.valid = true; 2713 } 2714 } else { 2715 adev->ip_blocks[i].status.valid = true; 2716 } 2717 /* get the vbios after the asic_funcs are set up */ 2718 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2719 r = amdgpu_device_parse_gpu_info_fw(adev); 2720 if (r) 2721 return r; 2722 2723 bios_flags = amdgpu_device_get_vbios_flags(adev); 2724 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2725 /* Read BIOS */ 2726 if (!skip_bios) { 2727 bool optional = 2728 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2729 if (!amdgpu_get_bios(adev) && !optional) 2730 return -EINVAL; 2731 2732 if (optional && !adev->bios) 2733 dev_info( 2734 adev->dev, 2735 "VBIOS image optional, proceeding without VBIOS image"); 2736 2737 if (adev->bios) { 2738 r = amdgpu_atombios_init(adev); 2739 if (r) { 2740 dev_err(adev->dev, 2741 "amdgpu_atombios_init failed\n"); 2742 amdgpu_vf_error_put( 2743 adev, 2744 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2745 0, 0); 2746 return r; 2747 } 2748 } 2749 } 2750 2751 /*get pf2vf msg info at it's earliest time*/ 2752 if (amdgpu_sriov_vf(adev)) 2753 amdgpu_virt_init_data_exchange(adev); 2754 2755 } 2756 } 2757 if (!total) 2758 return -ENODEV; 2759 2760 if (adev->gmc.xgmi.supported) 2761 amdgpu_xgmi_early_init(adev); 2762 2763 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2764 if (ip_block->status.valid != false) 2765 amdgpu_amdkfd_device_probe(adev); 2766 2767 adev->cg_flags &= amdgpu_cg_mask; 2768 adev->pg_flags &= amdgpu_pg_mask; 2769 2770 return 0; 2771 } 2772 2773 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2774 { 2775 int i, r; 2776 2777 for (i = 0; i < adev->num_ip_blocks; i++) { 2778 if (!adev->ip_blocks[i].status.sw) 2779 continue; 2780 if (adev->ip_blocks[i].status.hw) 2781 continue; 2782 if (!amdgpu_ip_member_of_hwini( 2783 adev, adev->ip_blocks[i].version->type)) 2784 continue; 2785 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2786 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2787 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2788 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2789 if (r) { 2790 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2791 adev->ip_blocks[i].version->funcs->name, r); 2792 return r; 2793 } 2794 adev->ip_blocks[i].status.hw = true; 2795 } 2796 } 2797 2798 return 0; 2799 } 
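/*
 * Hardware init is intentionally phased: phase 1 above only brings up the
 * blocks everything else depends on (COMMON, IH, and PSP when running as an
 * SR-IOV VF), firmware is loaded next, and phase 2 below initializes the
 * remaining blocks. A minimal sketch of the ordering, as driven by
 * amdgpu_device_ip_init() further down (error handling elided):
 *
 *   r = amdgpu_device_ip_hw_init_phase1(adev);
 *   r = amdgpu_device_fw_loading(adev);
 *   r = amdgpu_device_ip_hw_init_phase2(adev);
 */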
2800 2801 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2802 { 2803 int i, r; 2804 2805 for (i = 0; i < adev->num_ip_blocks; i++) { 2806 if (!adev->ip_blocks[i].status.sw) 2807 continue; 2808 if (adev->ip_blocks[i].status.hw) 2809 continue; 2810 if (!amdgpu_ip_member_of_hwini( 2811 adev, adev->ip_blocks[i].version->type)) 2812 continue; 2813 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2814 if (r) { 2815 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2816 adev->ip_blocks[i].version->funcs->name, r); 2817 return r; 2818 } 2819 adev->ip_blocks[i].status.hw = true; 2820 } 2821 2822 return 0; 2823 } 2824 2825 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2826 { 2827 int r = 0; 2828 int i; 2829 uint32_t smu_version; 2830 2831 if (adev->asic_type >= CHIP_VEGA10) { 2832 for (i = 0; i < adev->num_ip_blocks; i++) { 2833 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2834 continue; 2835 2836 if (!amdgpu_ip_member_of_hwini(adev, 2837 AMD_IP_BLOCK_TYPE_PSP)) 2838 break; 2839 2840 if (!adev->ip_blocks[i].status.sw) 2841 continue; 2842 2843 /* no need to do the fw loading again if already done*/ 2844 if (adev->ip_blocks[i].status.hw == true) 2845 break; 2846 2847 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2848 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2849 if (r) 2850 return r; 2851 } else { 2852 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2853 if (r) { 2854 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2855 adev->ip_blocks[i].version->funcs->name, r); 2856 return r; 2857 } 2858 adev->ip_blocks[i].status.hw = true; 2859 } 2860 break; 2861 } 2862 } 2863 2864 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2865 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2866 2867 return r; 2868 } 2869 2870 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2871 { 2872 struct drm_sched_init_args args = { 2873 .ops = &amdgpu_sched_ops, 2874 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2875 .timeout_wq = adev->reset_domain->wq, 2876 .dev = adev->dev, 2877 }; 2878 long timeout; 2879 int r, i; 2880 2881 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2882 struct amdgpu_ring *ring = adev->rings[i]; 2883 2884 /* No need to setup the GPU scheduler for rings that don't need it */ 2885 if (!ring || ring->no_scheduler) 2886 continue; 2887 2888 switch (ring->funcs->type) { 2889 case AMDGPU_RING_TYPE_GFX: 2890 timeout = adev->gfx_timeout; 2891 break; 2892 case AMDGPU_RING_TYPE_COMPUTE: 2893 timeout = adev->compute_timeout; 2894 break; 2895 case AMDGPU_RING_TYPE_SDMA: 2896 timeout = adev->sdma_timeout; 2897 break; 2898 default: 2899 timeout = adev->video_timeout; 2900 break; 2901 } 2902 2903 args.timeout = timeout; 2904 args.credit_limit = ring->num_hw_submission; 2905 args.score = ring->sched_score; 2906 args.name = ring->name; 2907 2908 r = drm_sched_init(&ring->sched, &args); 2909 if (r) { 2910 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2911 ring->name); 2912 return r; 2913 } 2914 r = amdgpu_uvd_entity_init(adev, ring); 2915 if (r) { 2916 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2917 ring->name); 2918 return r; 2919 } 2920 r = amdgpu_vce_entity_init(adev, ring); 2921 if (r) { 2922 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2923 ring->name); 2924 return r; 2925 } 2926 } 2927 2928 amdgpu_xcp_update_partition_sched_list(adev); 2929 2930 return 0; 2931 } 2932 2933 2934 /** 2935 * amdgpu_device_ip_init - run init for hardware 
IPs 2936 * 2937 * @adev: amdgpu_device pointer 2938 * 2939 * Main initialization pass for hardware IPs. The list of all the hardware 2940 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2941 * are run. sw_init initializes the software state associated with each IP 2942 * and hw_init initializes the hardware associated with each IP. 2943 * Returns 0 on success, negative error code on failure. 2944 */ 2945 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2946 { 2947 bool init_badpage; 2948 int i, r; 2949 2950 r = amdgpu_ras_init(adev); 2951 if (r) 2952 return r; 2953 2954 for (i = 0; i < adev->num_ip_blocks; i++) { 2955 if (!adev->ip_blocks[i].status.valid) 2956 continue; 2957 if (adev->ip_blocks[i].version->funcs->sw_init) { 2958 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2959 if (r) { 2960 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2961 adev->ip_blocks[i].version->funcs->name, r); 2962 goto init_failed; 2963 } 2964 } 2965 adev->ip_blocks[i].status.sw = true; 2966 2967 if (!amdgpu_ip_member_of_hwini( 2968 adev, adev->ip_blocks[i].version->type)) 2969 continue; 2970 2971 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2972 /* need to do common hw init early so everything is set up for gmc */ 2973 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2974 if (r) { 2975 DRM_ERROR("hw_init %d failed %d\n", i, r); 2976 goto init_failed; 2977 } 2978 adev->ip_blocks[i].status.hw = true; 2979 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2980 /* need to do gmc hw init early so we can allocate gpu mem */ 2981 /* Try to reserve bad pages early */ 2982 if (amdgpu_sriov_vf(adev)) 2983 amdgpu_virt_exchange_data(adev); 2984 2985 r = amdgpu_device_mem_scratch_init(adev); 2986 if (r) { 2987 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2988 goto init_failed; 2989 } 2990 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2991 if (r) { 2992 DRM_ERROR("hw_init %d failed %d\n", i, r); 2993 goto init_failed; 2994 } 2995 r = amdgpu_device_wb_init(adev); 2996 if (r) { 2997 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2998 goto init_failed; 2999 } 3000 adev->ip_blocks[i].status.hw = true; 3001 3002 /* right after GMC hw init, we create CSA */ 3003 if (adev->gfx.mcbp) { 3004 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3005 AMDGPU_GEM_DOMAIN_VRAM | 3006 AMDGPU_GEM_DOMAIN_GTT, 3007 AMDGPU_CSA_SIZE); 3008 if (r) { 3009 DRM_ERROR("allocate CSA failed %d\n", r); 3010 goto init_failed; 3011 } 3012 } 3013 3014 r = amdgpu_seq64_init(adev); 3015 if (r) { 3016 DRM_ERROR("allocate seq64 failed %d\n", r); 3017 goto init_failed; 3018 } 3019 } 3020 } 3021 3022 if (amdgpu_sriov_vf(adev)) 3023 amdgpu_virt_init_data_exchange(adev); 3024 3025 r = amdgpu_ib_pool_init(adev); 3026 if (r) { 3027 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3028 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3029 goto init_failed; 3030 } 3031 3032 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3033 if (r) 3034 goto init_failed; 3035 3036 r = amdgpu_device_ip_hw_init_phase1(adev); 3037 if (r) 3038 goto init_failed; 3039 3040 r = amdgpu_device_fw_loading(adev); 3041 if (r) 3042 goto init_failed; 3043 3044 r = amdgpu_device_ip_hw_init_phase2(adev); 3045 if (r) 3046 goto init_failed; 3047 3048 /* 3049 * retired pages will be loaded from eeprom and reserved here, 3050 * it should be called after 
amdgpu_device_ip_hw_init_phase2 since 3051 * for some ASICs the RAS EEPROM code relies on the SMU being fully functional 3052 * for I2C communication, which is only true at this point. 3053 * 3054 * amdgpu_ras_recovery_init may fail, but the upper layers only care about 3055 * failures caused by a bad gpu situation and stop the amdgpu init process 3056 * accordingly. For other failure cases, it still releases all 3057 * the resources and prints an error message, rather than returning a 3058 * negative value to the upper level. 3059 * 3060 * Note: theoretically, this should be called before all vram allocations 3061 * to protect retired pages from being reused. 3062 */ 3063 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3064 r = amdgpu_ras_recovery_init(adev, init_badpage); 3065 if (r) 3066 goto init_failed; 3067 3068 /** 3069 * In case of XGMI, grab an extra reference to the reset domain for this device 3070 */ 3071 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3072 if (amdgpu_xgmi_add_device(adev) == 0) { 3073 if (!amdgpu_sriov_vf(adev)) { 3074 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3075 3076 if (WARN_ON(!hive)) { 3077 r = -ENOENT; 3078 goto init_failed; 3079 } 3080 3081 if (!hive->reset_domain || 3082 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3083 r = -ENOENT; 3084 amdgpu_put_xgmi_hive(hive); 3085 goto init_failed; 3086 } 3087 3088 /* Drop the early temporary reset domain we created for the device */ 3089 amdgpu_reset_put_reset_domain(adev->reset_domain); 3090 adev->reset_domain = hive->reset_domain; 3091 amdgpu_put_xgmi_hive(hive); 3092 } 3093 } 3094 } 3095 3096 r = amdgpu_device_init_schedulers(adev); 3097 if (r) 3098 goto init_failed; 3099 3100 if (adev->mman.buffer_funcs_ring->sched.ready) 3101 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3102 3103 /* Don't init kfd if the whole hive needs to be reset during init */ 3104 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3105 kgd2kfd_init_zone_device(adev); 3106 amdgpu_amdkfd_device_init(adev); 3107 } 3108 3109 amdgpu_fru_get_product_info(adev); 3110 3111 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3112 r = amdgpu_cper_init(adev); 3113 3114 init_failed: 3115 3116 return r; 3117 } 3118 3119 /** 3120 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3121 * 3122 * @adev: amdgpu_device pointer 3123 * 3124 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3125 * this function before a GPU reset. If the value is retained after a 3126 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3127 */ 3128 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3129 { 3130 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3131 } 3132 3133 /** 3134 * amdgpu_device_check_vram_lost - check if vram is valid 3135 * 3136 * @adev: amdgpu_device pointer 3137 * 3138 * Checks the reset magic value written to the gart pointer in VRAM. 3139 * The driver calls this after a GPU reset to see if the contents of 3140 * VRAM are lost or not. 3141 * Returns true if vram is lost, false if not. 3142 */ 3143 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3144 { 3145 if (memcmp(adev->gart.ptr, adev->reset_magic, 3146 AMDGPU_RESET_MAGIC_NUM)) 3147 return true; 3148 3149 if (!amdgpu_in_reset(adev)) 3150 return false; 3151 3152 /* 3153 * For all ASICs with baco/mode1 reset, the VRAM is 3154 * always assumed to be lost.
3155 */ 3156 switch (amdgpu_asic_reset_method(adev)) { 3157 case AMD_RESET_METHOD_BACO: 3158 case AMD_RESET_METHOD_MODE1: 3159 return true; 3160 default: 3161 return false; 3162 } 3163 } 3164 3165 /** 3166 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3167 * 3168 * @adev: amdgpu_device pointer 3169 * @state: clockgating state (gate or ungate) 3170 * 3171 * The list of all the hardware IPs that make up the asic is walked and the 3172 * set_clockgating_state callbacks are run. 3173 * Late initialization pass enabling clockgating for hardware IPs. 3174 * Fini or suspend, pass disabling clockgating for hardware IPs. 3175 * Returns 0 on success, negative error code on failure. 3176 */ 3177 3178 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3179 enum amd_clockgating_state state) 3180 { 3181 int i, j, r; 3182 3183 if (amdgpu_emu_mode == 1) 3184 return 0; 3185 3186 for (j = 0; j < adev->num_ip_blocks; j++) { 3187 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3188 if (!adev->ip_blocks[i].status.late_initialized) 3189 continue; 3190 /* skip CG for GFX, SDMA on S0ix */ 3191 if (adev->in_s0ix && 3192 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3193 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3194 continue; 3195 /* skip CG for VCE/UVD, it's handled specially */ 3196 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3197 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3198 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3199 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3200 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3201 /* enable clockgating to save power */ 3202 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3203 state); 3204 if (r) { 3205 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3206 adev->ip_blocks[i].version->funcs->name, r); 3207 return r; 3208 } 3209 } 3210 } 3211 3212 return 0; 3213 } 3214 3215 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3216 enum amd_powergating_state state) 3217 { 3218 int i, j, r; 3219 3220 if (amdgpu_emu_mode == 1) 3221 return 0; 3222 3223 for (j = 0; j < adev->num_ip_blocks; j++) { 3224 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3225 if (!adev->ip_blocks[i].status.late_initialized) 3226 continue; 3227 /* skip PG for GFX, SDMA on S0ix */ 3228 if (adev->in_s0ix && 3229 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3230 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3231 continue; 3232 /* skip CG for VCE/UVD, it's handled specially */ 3233 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3234 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3235 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3236 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3237 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3238 /* enable powergating to save power */ 3239 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3240 state); 3241 if (r) { 3242 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3243 adev->ip_blocks[i].version->funcs->name, r); 3244 return r; 3245 } 3246 } 3247 } 3248 return 0; 3249 } 3250 3251 static int amdgpu_device_enable_mgpu_fan_boost(void) 3252 { 3253 struct amdgpu_gpu_instance *gpu_ins; 3254 struct amdgpu_device *adev; 3255 int i, ret = 0; 3256 3257 mutex_lock(&mgpu_info.mutex); 3258 3259 /* 3260 * MGPU fan boost feature should be enabled 3261 * only when there are two or more dGPUs in 3262 * the system 3263 */ 3264 if (mgpu_info.num_dgpu < 2) 3265 goto out; 3266 3267 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3268 gpu_ins = &(mgpu_info.gpu_ins[i]); 3269 adev = gpu_ins->adev; 3270 if (!(adev->flags & AMD_IS_APU) && 3271 !gpu_ins->mgpu_fan_enabled) { 3272 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3273 if (ret) 3274 break; 3275 3276 gpu_ins->mgpu_fan_enabled = 1; 3277 } 3278 } 3279 3280 out: 3281 mutex_unlock(&mgpu_info.mutex); 3282 3283 return ret; 3284 } 3285 3286 /** 3287 * amdgpu_device_ip_late_init - run late init for hardware IPs 3288 * 3289 * @adev: amdgpu_device pointer 3290 * 3291 * Late initialization pass for hardware IPs. The list of all the hardware 3292 * IPs that make up the asic is walked and the late_init callbacks are run. 3293 * late_init covers any special initialization that an IP requires 3294 * after all of the have been initialized or something that needs to happen 3295 * late in the init process. 3296 * Returns 0 on success, negative error code on failure. 
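 *
 * Typical work done here (see the body below): running each IP's late_init
 * callback, enabling clockgating and powergating, recording the VRAM reset
 * magic, enabling the multi-GPU fan boost, and lowering the XGMI p-state once
 * every device in the hive has initialized.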
3297 */ 3298 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3299 { 3300 struct amdgpu_gpu_instance *gpu_instance; 3301 int i = 0, r; 3302 3303 for (i = 0; i < adev->num_ip_blocks; i++) { 3304 if (!adev->ip_blocks[i].status.hw) 3305 continue; 3306 if (adev->ip_blocks[i].version->funcs->late_init) { 3307 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3308 if (r) { 3309 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3310 adev->ip_blocks[i].version->funcs->name, r); 3311 return r; 3312 } 3313 } 3314 adev->ip_blocks[i].status.late_initialized = true; 3315 } 3316 3317 r = amdgpu_ras_late_init(adev); 3318 if (r) { 3319 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3320 return r; 3321 } 3322 3323 if (!amdgpu_reset_in_recovery(adev)) 3324 amdgpu_ras_set_error_query_ready(adev, true); 3325 3326 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3327 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3328 3329 amdgpu_device_fill_reset_magic(adev); 3330 3331 r = amdgpu_device_enable_mgpu_fan_boost(); 3332 if (r) 3333 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3334 3335 /* For passthrough configurations on arcturus and aldebaran, enable special handling of SBR */ 3336 if (amdgpu_passthrough(adev) && 3337 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3338 adev->asic_type == CHIP_ALDEBARAN)) 3339 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3340 3341 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3342 mutex_lock(&mgpu_info.mutex); 3343 3344 /* 3345 * Reset the device p-state to low, as this was booted with high. 3346 * 3347 * This should be performed only after all devices from the same 3348 * hive get initialized. 3349 * 3350 * However, the number of devices in the hive is not known in advance, 3351 * as it is counted one by one while the devices initialize. 3352 * 3353 * So, we wait for all XGMI interlinked devices to be initialized. 3354 * This may add some delay, as those devices may come from 3355 * different hives. But that should be OK.
3356 */ 3357 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3358 for (i = 0; i < mgpu_info.num_gpu; i++) { 3359 gpu_instance = &(mgpu_info.gpu_ins[i]); 3360 if (gpu_instance->adev->flags & AMD_IS_APU) 3361 continue; 3362 3363 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3364 AMDGPU_XGMI_PSTATE_MIN); 3365 if (r) { 3366 DRM_ERROR("pstate setting failed (%d).\n", r); 3367 break; 3368 } 3369 } 3370 } 3371 3372 mutex_unlock(&mgpu_info.mutex); 3373 } 3374 3375 return 0; 3376 } 3377 3378 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3379 { 3380 int r; 3381 3382 if (!ip_block->version->funcs->hw_fini) { 3383 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3384 ip_block->version->funcs->name); 3385 } else { 3386 r = ip_block->version->funcs->hw_fini(ip_block); 3387 /* XXX handle errors */ 3388 if (r) { 3389 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3390 ip_block->version->funcs->name, r); 3391 } 3392 } 3393 3394 ip_block->status.hw = false; 3395 } 3396 3397 /** 3398 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3399 * 3400 * @adev: amdgpu_device pointer 3401 * 3402 * For ASICs need to disable SMC first 3403 */ 3404 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3405 { 3406 int i; 3407 3408 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3409 return; 3410 3411 for (i = 0; i < adev->num_ip_blocks; i++) { 3412 if (!adev->ip_blocks[i].status.hw) 3413 continue; 3414 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3415 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3416 break; 3417 } 3418 } 3419 } 3420 3421 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3422 { 3423 int i, r; 3424 3425 for (i = 0; i < adev->num_ip_blocks; i++) { 3426 if (!adev->ip_blocks[i].version->funcs->early_fini) 3427 continue; 3428 3429 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3430 if (r) { 3431 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3432 adev->ip_blocks[i].version->funcs->name, r); 3433 } 3434 } 3435 3436 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3437 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3438 3439 amdgpu_amdkfd_suspend(adev, false); 3440 3441 /* Workaround for ASICs need to disable SMC first */ 3442 amdgpu_device_smu_fini_early(adev); 3443 3444 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3445 if (!adev->ip_blocks[i].status.hw) 3446 continue; 3447 3448 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3449 } 3450 3451 if (amdgpu_sriov_vf(adev)) { 3452 if (amdgpu_virt_release_full_gpu(adev, false)) 3453 DRM_ERROR("failed to release exclusive mode on fini\n"); 3454 } 3455 3456 return 0; 3457 } 3458 3459 /** 3460 * amdgpu_device_ip_fini - run fini for hardware IPs 3461 * 3462 * @adev: amdgpu_device pointer 3463 * 3464 * Main teardown pass for hardware IPs. The list of all the hardware 3465 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3466 * are run. hw_fini tears down the hardware associated with each IP 3467 * and sw_fini tears down any software state associated with each IP. 3468 * Returns 0 on success, negative error code on failure. 
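 *
 * Teardown runs in the reverse of init order: both the sw_fini and late_fini
 * walks below iterate from the last IP block back to the first.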
3469 */ 3470 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3471 { 3472 int i, r; 3473 3474 amdgpu_cper_fini(adev); 3475 3476 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3477 amdgpu_virt_release_ras_err_handler_data(adev); 3478 3479 if (adev->gmc.xgmi.num_physical_nodes > 1) 3480 amdgpu_xgmi_remove_device(adev); 3481 3482 amdgpu_amdkfd_device_fini_sw(adev); 3483 3484 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3485 if (!adev->ip_blocks[i].status.sw) 3486 continue; 3487 3488 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3489 amdgpu_ucode_free_bo(adev); 3490 amdgpu_free_static_csa(&adev->virt.csa_obj); 3491 amdgpu_device_wb_fini(adev); 3492 amdgpu_device_mem_scratch_fini(adev); 3493 amdgpu_ib_pool_fini(adev); 3494 amdgpu_seq64_fini(adev); 3495 } 3496 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3497 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3498 /* XXX handle errors */ 3499 if (r) { 3500 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3501 adev->ip_blocks[i].version->funcs->name, r); 3502 } 3503 } 3504 adev->ip_blocks[i].status.sw = false; 3505 adev->ip_blocks[i].status.valid = false; 3506 } 3507 3508 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3509 if (!adev->ip_blocks[i].status.late_initialized) 3510 continue; 3511 if (adev->ip_blocks[i].version->funcs->late_fini) 3512 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3513 adev->ip_blocks[i].status.late_initialized = false; 3514 } 3515 3516 amdgpu_ras_fini(adev); 3517 3518 return 0; 3519 } 3520 3521 /** 3522 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3523 * 3524 * @work: work_struct. 3525 */ 3526 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3527 { 3528 struct amdgpu_device *adev = 3529 container_of(work, struct amdgpu_device, delayed_init_work.work); 3530 int r; 3531 3532 r = amdgpu_ib_ring_tests(adev); 3533 if (r) 3534 DRM_ERROR("ib ring test failed (%d).\n", r); 3535 } 3536 3537 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3538 { 3539 struct amdgpu_device *adev = 3540 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3541 3542 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3543 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3544 3545 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3546 adev->gfx.gfx_off_state = true; 3547 } 3548 3549 /** 3550 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3551 * 3552 * @adev: amdgpu_device pointer 3553 * 3554 * Main suspend function for hardware IPs. The list of all the hardware 3555 * IPs that make up the asic is walked, clockgating is disabled and the 3556 * suspend callbacks are run. suspend puts the hardware and software state 3557 * in each IP into a state suitable for suspend. 3558 * Returns 0 on success, negative error code on failure. 3559 */ 3560 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3561 { 3562 int i, r; 3563 3564 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3565 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3566 3567 /* 3568 * Per PMFW team's suggestion, driver needs to handle gfxoff 3569 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3570 * scenario. Add the missing df cstate disablement here. 
3571 */ 3572 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3573 dev_warn(adev->dev, "Failed to disallow df cstate"); 3574 3575 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3576 if (!adev->ip_blocks[i].status.valid) 3577 continue; 3578 3579 /* displays are handled separately */ 3580 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3581 continue; 3582 3583 /* XXX handle errors */ 3584 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3585 if (r) 3586 return r; 3587 } 3588 3589 return 0; 3590 } 3591 3592 /** 3593 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3594 * 3595 * @adev: amdgpu_device pointer 3596 * 3597 * Main suspend function for hardware IPs. The list of all the hardware 3598 * IPs that make up the asic is walked, clockgating is disabled and the 3599 * suspend callbacks are run. suspend puts the hardware and software state 3600 * in each IP into a state suitable for suspend. 3601 * Returns 0 on success, negative error code on failure. 3602 */ 3603 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3604 { 3605 int i, r; 3606 3607 if (adev->in_s0ix) 3608 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3609 3610 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3611 if (!adev->ip_blocks[i].status.valid) 3612 continue; 3613 /* displays are handled in phase1 */ 3614 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3615 continue; 3616 /* PSP lost connection when err_event_athub occurs */ 3617 if (amdgpu_ras_intr_triggered() && 3618 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3619 adev->ip_blocks[i].status.hw = false; 3620 continue; 3621 } 3622 3623 /* skip unnecessary suspend if we do not initialize them yet */ 3624 if (!amdgpu_ip_member_of_hwini( 3625 adev, adev->ip_blocks[i].version->type)) 3626 continue; 3627 3628 /* skip suspend of gfx/mes and psp for S0ix 3629 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3630 * like at runtime. PSP is also part of the always on hardware 3631 * so no need to suspend it. 3632 */ 3633 if (adev->in_s0ix && 3634 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3635 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3636 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3637 continue; 3638 3639 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3640 if (adev->in_s0ix && 3641 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3642 IP_VERSION(5, 0, 0)) && 3643 (adev->ip_blocks[i].version->type == 3644 AMD_IP_BLOCK_TYPE_SDMA)) 3645 continue; 3646 3647 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3648 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3649 * from this location and RLC Autoload automatically also gets loaded 3650 * from here based on PMFW -> PSP message during re-init sequence. 3651 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3652 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3653 */ 3654 if (amdgpu_in_reset(adev) && 3655 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3656 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3657 continue; 3658 3659 /* XXX handle errors */ 3660 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3661 adev->ip_blocks[i].status.hw = false; 3662 3663 /* handle putting the SMC in the appropriate state */ 3664 if (!amdgpu_sriov_vf(adev)) { 3665 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3666 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3667 if (r) { 3668 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3669 adev->mp1_state, r); 3670 return r; 3671 } 3672 } 3673 } 3674 } 3675 3676 return 0; 3677 } 3678 3679 /** 3680 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3681 * 3682 * @adev: amdgpu_device pointer 3683 * 3684 * Main suspend function for hardware IPs. The list of all the hardware 3685 * IPs that make up the asic is walked, clockgating is disabled and the 3686 * suspend callbacks are run. suspend puts the hardware and software state 3687 * in each IP into a state suitable for suspend. 3688 * Returns 0 on success, negative error code on failure. 3689 */ 3690 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3691 { 3692 int r; 3693 3694 if (amdgpu_sriov_vf(adev)) { 3695 amdgpu_virt_fini_data_exchange(adev); 3696 amdgpu_virt_request_full_gpu(adev, false); 3697 } 3698 3699 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3700 3701 r = amdgpu_device_ip_suspend_phase1(adev); 3702 if (r) 3703 return r; 3704 r = amdgpu_device_ip_suspend_phase2(adev); 3705 3706 if (amdgpu_sriov_vf(adev)) 3707 amdgpu_virt_release_full_gpu(adev, false); 3708 3709 return r; 3710 } 3711 3712 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3713 { 3714 int i, r; 3715 3716 static enum amd_ip_block_type ip_order[] = { 3717 AMD_IP_BLOCK_TYPE_COMMON, 3718 AMD_IP_BLOCK_TYPE_GMC, 3719 AMD_IP_BLOCK_TYPE_PSP, 3720 AMD_IP_BLOCK_TYPE_IH, 3721 }; 3722 3723 for (i = 0; i < adev->num_ip_blocks; i++) { 3724 int j; 3725 struct amdgpu_ip_block *block; 3726 3727 block = &adev->ip_blocks[i]; 3728 block->status.hw = false; 3729 3730 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3731 3732 if (block->version->type != ip_order[j] || 3733 !block->status.valid) 3734 continue; 3735 3736 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3737 if (r) { 3738 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3739 block->version->funcs->name); 3740 return r; 3741 } 3742 block->status.hw = true; 3743 } 3744 } 3745 3746 return 0; 3747 } 3748 3749 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3750 { 3751 struct amdgpu_ip_block *block; 3752 int i, r = 0; 3753 3754 static enum amd_ip_block_type ip_order[] = { 3755 AMD_IP_BLOCK_TYPE_SMC, 3756 AMD_IP_BLOCK_TYPE_DCE, 3757 AMD_IP_BLOCK_TYPE_GFX, 3758 AMD_IP_BLOCK_TYPE_SDMA, 3759 AMD_IP_BLOCK_TYPE_MES, 3760 AMD_IP_BLOCK_TYPE_UVD, 3761 AMD_IP_BLOCK_TYPE_VCE, 3762 AMD_IP_BLOCK_TYPE_VCN, 3763 AMD_IP_BLOCK_TYPE_JPEG 3764 }; 3765 3766 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3767 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3768 3769 if (!block) 3770 continue; 3771 3772 if (block->status.valid && !block->status.hw) { 3773 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3774 r = amdgpu_ip_block_resume(block); 3775 } else { 3776 r = block->version->funcs->hw_init(block); 3777 } 3778 3779 if (r) { 3780 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3781 block->version->funcs->name); 3782 break; 3783 } 3784 
block->status.hw = true; 3785 } 3786 } 3787 3788 return r; 3789 } 3790 3791 /** 3792 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3793 * 3794 * @adev: amdgpu_device pointer 3795 * 3796 * First resume function for hardware IPs. The list of all the hardware 3797 * IPs that make up the asic is walked and the resume callbacks are run for 3798 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3799 * after a suspend and updates the software state as necessary. This 3800 * function is also used for restoring the GPU after a GPU reset. 3801 * Returns 0 on success, negative error code on failure. 3802 */ 3803 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3804 { 3805 int i, r; 3806 3807 for (i = 0; i < adev->num_ip_blocks; i++) { 3808 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3809 continue; 3810 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3811 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3812 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3813 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3814 3815 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3816 if (r) 3817 return r; 3818 } 3819 } 3820 3821 return 0; 3822 } 3823 3824 /** 3825 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3826 * 3827 * @adev: amdgpu_device pointer 3828 * 3829 * Second resume function for hardware IPs. The list of all the hardware 3830 * IPs that make up the asic is walked and the resume callbacks are run for 3831 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3832 * functional state after a suspend and updates the software state as 3833 * necessary. This function is also used for restoring the GPU after a GPU 3834 * reset. 3835 * Returns 0 on success, negative error code on failure. 3836 */ 3837 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3838 { 3839 int i, r; 3840 3841 for (i = 0; i < adev->num_ip_blocks; i++) { 3842 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3843 continue; 3844 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3845 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3846 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3847 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3848 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3849 continue; 3850 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3851 if (r) 3852 return r; 3853 } 3854 3855 return 0; 3856 } 3857 3858 /** 3859 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3860 * 3861 * @adev: amdgpu_device pointer 3862 * 3863 * Third resume function for hardware IPs. The list of all the hardware 3864 * IPs that make up the asic is walked and the resume callbacks are run for 3865 * all DCE. resume puts the hardware into a functional state after a suspend 3866 * and updates the software state as necessary. This function is also used 3867 * for restoring the GPU after a GPU reset. 3868 * 3869 * Returns 0 on success, negative error code on failure. 
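 *
 * Taken together, the resume phases give the following ordering: phase 1
 * brings up COMMON, GMC and IH (plus PSP when running as an SR-IOV VF),
 * firmware is then loaded, phase 2 resumes the remaining blocks except the
 * display, and phase 3 finally resumes DCE once the fence driver and the
 * buffer functions are available again (see amdgpu_device_ip_resume()).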
3870 */ 3871 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3872 { 3873 int i, r; 3874 3875 for (i = 0; i < adev->num_ip_blocks; i++) { 3876 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3877 continue; 3878 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3879 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3880 if (r) 3881 return r; 3882 } 3883 } 3884 3885 return 0; 3886 } 3887 3888 /** 3889 * amdgpu_device_ip_resume - run resume for hardware IPs 3890 * 3891 * @adev: amdgpu_device pointer 3892 * 3893 * Main resume function for hardware IPs. The hardware IPs 3894 * are split into two resume functions because they are 3895 * also used in recovering from a GPU reset and some additional 3896 * steps need to be take between them. In this case (S3/S4) they are 3897 * run sequentially. 3898 * Returns 0 on success, negative error code on failure. 3899 */ 3900 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3901 { 3902 int r; 3903 3904 r = amdgpu_device_ip_resume_phase1(adev); 3905 if (r) 3906 return r; 3907 3908 r = amdgpu_device_fw_loading(adev); 3909 if (r) 3910 return r; 3911 3912 r = amdgpu_device_ip_resume_phase2(adev); 3913 3914 if (adev->mman.buffer_funcs_ring->sched.ready) 3915 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3916 3917 if (r) 3918 return r; 3919 3920 amdgpu_fence_driver_hw_init(adev); 3921 3922 r = amdgpu_device_ip_resume_phase3(adev); 3923 3924 return r; 3925 } 3926 3927 /** 3928 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3929 * 3930 * @adev: amdgpu_device pointer 3931 * 3932 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3933 */ 3934 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3935 { 3936 if (amdgpu_sriov_vf(adev)) { 3937 if (adev->is_atom_fw) { 3938 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3939 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3940 } else { 3941 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3942 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3943 } 3944 3945 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3946 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3947 } 3948 } 3949 3950 /** 3951 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3952 * 3953 * @asic_type: AMD asic type 3954 * 3955 * Check if there is DC (new modesetting infrastructre) support for an asic. 3956 * returns true if DC has support, false if not. 3957 */ 3958 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3959 { 3960 switch (asic_type) { 3961 #ifdef CONFIG_DRM_AMDGPU_SI 3962 case CHIP_HAINAN: 3963 #endif 3964 case CHIP_TOPAZ: 3965 /* chips with no display hardware */ 3966 return false; 3967 #if defined(CONFIG_DRM_AMD_DC) 3968 case CHIP_TAHITI: 3969 case CHIP_PITCAIRN: 3970 case CHIP_VERDE: 3971 case CHIP_OLAND: 3972 /* 3973 * We have systems in the wild with these ASICs that require 3974 * LVDS and VGA support which is not supported with DC. 3975 * 3976 * Fallback to the non-DC driver here by default so as not to 3977 * cause regressions. 3978 */ 3979 #if defined(CONFIG_DRM_AMD_DC_SI) 3980 return amdgpu_dc > 0; 3981 #else 3982 return false; 3983 #endif 3984 case CHIP_BONAIRE: 3985 case CHIP_KAVERI: 3986 case CHIP_KABINI: 3987 case CHIP_MULLINS: 3988 /* 3989 * We have systems in the wild with these ASICs that require 3990 * VGA support which is not supported with DC. 
3991 * 3992 * Fallback to the non-DC driver here by default so as not to 3993 * cause regressions. 3994 */ 3995 return amdgpu_dc > 0; 3996 default: 3997 return amdgpu_dc != 0; 3998 #else 3999 default: 4000 if (amdgpu_dc > 0) 4001 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4002 return false; 4003 #endif 4004 } 4005 } 4006 4007 /** 4008 * amdgpu_device_has_dc_support - check if dc is supported 4009 * 4010 * @adev: amdgpu_device pointer 4011 * 4012 * Returns true for supported, false for not supported 4013 */ 4014 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4015 { 4016 if (adev->enable_virtual_display || 4017 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4018 return false; 4019 4020 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4021 } 4022 4023 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4024 { 4025 struct amdgpu_device *adev = 4026 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4027 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4028 4029 /* It's a bug to not have a hive within this function */ 4030 if (WARN_ON(!hive)) 4031 return; 4032 4033 /* 4034 * Use task barrier to synchronize all xgmi reset works across the 4035 * hive. task_barrier_enter and task_barrier_exit will block 4036 * until all the threads running the xgmi reset works reach 4037 * those points. task_barrier_full will do both blocks. 4038 */ 4039 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4040 4041 task_barrier_enter(&hive->tb); 4042 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4043 4044 if (adev->asic_reset_res) 4045 goto fail; 4046 4047 task_barrier_exit(&hive->tb); 4048 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4049 4050 if (adev->asic_reset_res) 4051 goto fail; 4052 4053 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4054 } else { 4055 4056 task_barrier_full(&hive->tb); 4057 adev->asic_reset_res = amdgpu_asic_reset(adev); 4058 } 4059 4060 fail: 4061 if (adev->asic_reset_res) 4062 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4063 adev->asic_reset_res, adev_to_drm(adev)->unique); 4064 amdgpu_put_xgmi_hive(hive); 4065 } 4066 4067 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4068 { 4069 char *input = amdgpu_lockup_timeout; 4070 char *timeout_setting = NULL; 4071 int index = 0; 4072 long timeout; 4073 int ret = 0; 4074 4075 /* 4076 * By default timeout for non compute jobs is 10000 4077 * and 60000 for compute jobs. 4078 * In SR-IOV or passthrough mode, timeout for compute 4079 * jobs are 60000 by default. 4080 */ 4081 adev->gfx_timeout = msecs_to_jiffies(10000); 4082 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4083 if (amdgpu_sriov_vf(adev)) 4084 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4085 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4086 else 4087 adev->compute_timeout = msecs_to_jiffies(60000); 4088 4089 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4090 while ((timeout_setting = strsep(&input, ",")) && 4091 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4092 ret = kstrtol(timeout_setting, 0, &timeout); 4093 if (ret) 4094 return ret; 4095 4096 if (timeout == 0) { 4097 index++; 4098 continue; 4099 } else if (timeout < 0) { 4100 timeout = MAX_SCHEDULE_TIMEOUT; 4101 dev_warn(adev->dev, "lockup timeout disabled"); 4102 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4103 } else { 4104 timeout = msecs_to_jiffies(timeout); 4105 } 4106 4107 switch (index++) { 4108 case 0: 4109 adev->gfx_timeout = timeout; 4110 break; 4111 case 1: 4112 adev->compute_timeout = timeout; 4113 break; 4114 case 2: 4115 adev->sdma_timeout = timeout; 4116 break; 4117 case 3: 4118 adev->video_timeout = timeout; 4119 break; 4120 default: 4121 break; 4122 } 4123 } 4124 /* 4125 * There is only one value specified and 4126 * it should apply to all non-compute jobs. 4127 */ 4128 if (index == 1) { 4129 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4130 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4131 adev->compute_timeout = adev->gfx_timeout; 4132 } 4133 } 4134 4135 return ret; 4136 } 4137 4138 /** 4139 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4140 * 4141 * @adev: amdgpu_device pointer 4142 * 4143 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4144 */ 4145 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4146 { 4147 struct iommu_domain *domain; 4148 4149 domain = iommu_get_domain_for_dev(adev->dev); 4150 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4151 adev->ram_is_direct_mapped = true; 4152 } 4153 4154 #if defined(CONFIG_HSA_AMD_P2P) 4155 /** 4156 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4157 * 4158 * @adev: amdgpu_device pointer 4159 * 4160 * return if IOMMU remapping bar address 4161 */ 4162 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4163 { 4164 struct iommu_domain *domain; 4165 4166 domain = iommu_get_domain_for_dev(adev->dev); 4167 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4168 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4169 return true; 4170 4171 return false; 4172 } 4173 #endif 4174 4175 static const struct attribute *amdgpu_dev_attributes[] = { 4176 &dev_attr_pcie_replay_count.attr, 4177 NULL 4178 }; 4179 4180 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4181 { 4182 if (amdgpu_mcbp == 1) 4183 adev->gfx.mcbp = true; 4184 else if (amdgpu_mcbp == 0) 4185 adev->gfx.mcbp = false; 4186 4187 if (amdgpu_sriov_vf(adev)) 4188 adev->gfx.mcbp = true; 4189 4190 if (adev->gfx.mcbp) 4191 DRM_INFO("MCBP is enabled\n"); 4192 } 4193 4194 /** 4195 * amdgpu_device_init - initialize the driver 4196 * 4197 * @adev: amdgpu_device pointer 4198 * @flags: driver flags 4199 * 4200 * Initializes the driver info and hw (all asics). 4201 * Returns 0 for success or an error on failure. 4202 * Called at driver startup. 
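 *
 * One of the early init steps is amdgpu_device_get_job_timeout_settings()
 * (defined above), which parses the lockup_timeout module parameter as an
 * ordered gfx,compute,sdma,video list of milliseconds. A hedged usage
 * sketch (the values are purely illustrative, not recommended settings):
 *
 *   modprobe amdgpu lockup_timeout=5000,120000,5000,5000
 *
 * A single value applies to gfx, sdma and video (and also to compute under
 * SR-IOV or passthrough), 0 keeps the built-in default, and a negative
 * value disables the timeout for that queue type.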
4203 */ 4204 int amdgpu_device_init(struct amdgpu_device *adev, 4205 uint32_t flags) 4206 { 4207 struct drm_device *ddev = adev_to_drm(adev); 4208 struct pci_dev *pdev = adev->pdev; 4209 int r, i; 4210 bool px = false; 4211 u32 max_MBps; 4212 int tmp; 4213 4214 adev->shutdown = false; 4215 adev->flags = flags; 4216 4217 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4218 adev->asic_type = amdgpu_force_asic_type; 4219 else 4220 adev->asic_type = flags & AMD_ASIC_MASK; 4221 4222 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4223 if (amdgpu_emu_mode == 1) 4224 adev->usec_timeout *= 10; 4225 adev->gmc.gart_size = 512 * 1024 * 1024; 4226 adev->accel_working = false; 4227 adev->num_rings = 0; 4228 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4229 adev->mman.buffer_funcs = NULL; 4230 adev->mman.buffer_funcs_ring = NULL; 4231 adev->vm_manager.vm_pte_funcs = NULL; 4232 adev->vm_manager.vm_pte_num_scheds = 0; 4233 adev->gmc.gmc_funcs = NULL; 4234 adev->harvest_ip_mask = 0x0; 4235 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4236 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4237 4238 adev->smc_rreg = &amdgpu_invalid_rreg; 4239 adev->smc_wreg = &amdgpu_invalid_wreg; 4240 adev->pcie_rreg = &amdgpu_invalid_rreg; 4241 adev->pcie_wreg = &amdgpu_invalid_wreg; 4242 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4243 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4244 adev->pciep_rreg = &amdgpu_invalid_rreg; 4245 adev->pciep_wreg = &amdgpu_invalid_wreg; 4246 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4247 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4248 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4249 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4250 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4251 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4252 adev->didt_rreg = &amdgpu_invalid_rreg; 4253 adev->didt_wreg = &amdgpu_invalid_wreg; 4254 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4255 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4256 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4257 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4258 4259 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4260 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4261 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4262 4263 /* mutex initialization are all done here so we 4264 * can recall function without having locking issues 4265 */ 4266 mutex_init(&adev->firmware.mutex); 4267 mutex_init(&adev->pm.mutex); 4268 mutex_init(&adev->gfx.gpu_clock_mutex); 4269 mutex_init(&adev->srbm_mutex); 4270 mutex_init(&adev->gfx.pipe_reserve_mutex); 4271 mutex_init(&adev->gfx.gfx_off_mutex); 4272 mutex_init(&adev->gfx.partition_mutex); 4273 mutex_init(&adev->grbm_idx_mutex); 4274 mutex_init(&adev->mn_lock); 4275 mutex_init(&adev->virt.vf_errors.lock); 4276 hash_init(adev->mn_hash); 4277 mutex_init(&adev->psp.mutex); 4278 mutex_init(&adev->notifier_lock); 4279 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4280 mutex_init(&adev->benchmark_mutex); 4281 mutex_init(&adev->gfx.reset_sem_mutex); 4282 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4283 mutex_init(&adev->enforce_isolation_mutex); 4284 mutex_init(&adev->gfx.kfd_sch_mutex); 4285 4286 amdgpu_device_init_apu_flags(adev); 4287 4288 r = amdgpu_device_check_arguments(adev); 4289 if (r) 4290 return r; 4291 4292 spin_lock_init(&adev->mmio_idx_lock); 4293 
spin_lock_init(&adev->smc_idx_lock); 4294 spin_lock_init(&adev->pcie_idx_lock); 4295 spin_lock_init(&adev->uvd_ctx_idx_lock); 4296 spin_lock_init(&adev->didt_idx_lock); 4297 spin_lock_init(&adev->gc_cac_idx_lock); 4298 spin_lock_init(&adev->se_cac_idx_lock); 4299 spin_lock_init(&adev->audio_endpt_idx_lock); 4300 spin_lock_init(&adev->mm_stats.lock); 4301 spin_lock_init(&adev->virt.rlcg_reg_lock); 4302 spin_lock_init(&adev->wb.lock); 4303 4304 INIT_LIST_HEAD(&adev->reset_list); 4305 4306 INIT_LIST_HEAD(&adev->ras_list); 4307 4308 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4309 4310 INIT_DELAYED_WORK(&adev->delayed_init_work, 4311 amdgpu_device_delayed_init_work_handler); 4312 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4313 amdgpu_device_delay_enable_gfx_off); 4314 /* 4315 * Initialize the enforce_isolation work structures for each XCP 4316 * partition. This work handler is responsible for enforcing shader 4317 * isolation on AMD GPUs. It counts the number of emitted fences for 4318 * each GFX and compute ring. If there are any fences, it schedules 4319 * the `enforce_isolation_work` to be run after a delay. If there are 4320 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4321 * runqueue. 4322 */ 4323 for (i = 0; i < MAX_XCP; i++) { 4324 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4325 amdgpu_gfx_enforce_isolation_handler); 4326 adev->gfx.enforce_isolation[i].adev = adev; 4327 adev->gfx.enforce_isolation[i].xcp_id = i; 4328 } 4329 4330 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4331 4332 adev->gfx.gfx_off_req_count = 1; 4333 adev->gfx.gfx_off_residency = 0; 4334 adev->gfx.gfx_off_entrycount = 0; 4335 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4336 4337 atomic_set(&adev->throttling_logging_enabled, 1); 4338 /* 4339 * If throttling continues, logging will be performed every minute 4340 * to avoid log flooding. "-1" is subtracted since the thermal 4341 * throttling interrupt comes every second. Thus, the total logging 4342 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4343 * for throttling interrupt) = 60 seconds. 4344 */ 4345 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4346 4347 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4348 4349 /* Registers mapping */ 4350 /* TODO: block userspace mapping of io register */ 4351 if (adev->asic_type >= CHIP_BONAIRE) { 4352 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4353 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4354 } else { 4355 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4356 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4357 } 4358 4359 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4360 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4361 4362 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4363 if (!adev->rmmio) 4364 return -ENOMEM; 4365 4366 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4367 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4368 4369 /* 4370 * Reset domain needs to be present early, before XGMI hive discovered 4371 * (if any) and initialized to use reset sem and in_gpu reset flag 4372 * early on during init and before calling to RREG32. 
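 *
 * The reset domain created below owns the reset workqueue and the reset
 * semaphore for this single device; devices that later turn out to be part
 * of an XGMI hive are switched over to a shared, hive-wide domain so that
 * the whole hive is reset as one unit.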
4373 */ 4374 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4375 if (!adev->reset_domain) 4376 return -ENOMEM; 4377 4378 /* detect hw virtualization here */ 4379 amdgpu_virt_init(adev); 4380 4381 amdgpu_device_get_pcie_info(adev); 4382 4383 r = amdgpu_device_get_job_timeout_settings(adev); 4384 if (r) { 4385 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4386 return r; 4387 } 4388 4389 amdgpu_device_set_mcbp(adev); 4390 4391 /* 4392 * By default, use the default init level where all blocks are expected to 4393 * be initialized. At present, 'swinit' of all blocks must complete before 4394 * the need for a different level can be detected. 4395 */ 4396 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4397 /* early init functions */ 4398 r = amdgpu_device_ip_early_init(adev); 4399 if (r) 4400 return r; 4401 4402 /* 4403 * No need to remove conflicting FBs for non-display class devices. 4404 * This prevents the sysfb from being freed accidentally. 4405 */ 4406 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4407 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4408 /* Get rid of things like offb */ 4409 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4410 if (r) 4411 return r; 4412 } 4413 4414 /* Enable TMZ based on IP_VERSION */ 4415 amdgpu_gmc_tmz_set(adev); 4416 4417 if (amdgpu_sriov_vf(adev) && 4418 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4419 /* VF MMIO access from the CPU (except for the mailbox range) 4420 * will be blocked during SR-IOV runtime 4421 */ 4422 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4423 4424 amdgpu_gmc_noretry_set(adev); 4425 /* Need to get xgmi info early to decide the reset behavior */ 4426 if (adev->gmc.xgmi.supported) { 4427 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4428 if (r) 4429 return r; 4430 } 4431 4432 /* enable PCIE atomic ops */ 4433 if (amdgpu_sriov_vf(adev)) { 4434 if (adev->virt.fw_reserve.p_pf2vf) 4435 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4436 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4437 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4438 /* APUs with gfx9 onwards don't rely on PCIe atomics; their internal 4439 * path natively supports atomics, so set have_atomics_support to true. 4440 */ 4441 } else if ((adev->flags & AMD_IS_APU) && 4442 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4443 IP_VERSION(9, 0, 0))) { 4444 adev->have_atomics_support = true; 4445 } else { 4446 adev->have_atomics_support = 4447 !pci_enable_atomic_ops_to_root(adev->pdev, 4448 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4449 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4450 } 4451 4452 if (!adev->have_atomics_support) 4453 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4454 4455 /* doorbell bar mapping and doorbell index init */ 4456 amdgpu_doorbell_init(adev); 4457 4458 if (amdgpu_emu_mode == 1) { 4459 /* post the asic in emulation mode */ 4460 emu_soc_asic_init(adev); 4461 goto fence_driver_init; 4462 } 4463 4464 amdgpu_reset_init(adev); 4465 4466 /* detect if we have an SR-IOV vBIOS */ 4467 if (adev->bios) 4468 amdgpu_device_detect_sriov_bios(adev); 4469 4470 /* check if we need to reset the asic 4471 * E.g., driver was not cleanly unloaded previously, etc.
4472 */ 4473 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4474 if (adev->gmc.xgmi.num_physical_nodes) { 4475 dev_info(adev->dev, "Pending hive reset.\n"); 4476 amdgpu_set_init_level(adev, 4477 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4478 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4479 !amdgpu_device_has_display_hardware(adev)) { 4480 r = psp_gpu_reset(adev); 4481 } else { 4482 tmp = amdgpu_reset_method; 4483 /* It should do a default reset when loading or reloading the driver, 4484 * regardless of the module parameter reset_method. 4485 */ 4486 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4487 r = amdgpu_asic_reset(adev); 4488 amdgpu_reset_method = tmp; 4489 } 4490 4491 if (r) { 4492 dev_err(adev->dev, "asic reset on init failed\n"); 4493 goto failed; 4494 } 4495 } 4496 4497 /* Post card if necessary */ 4498 if (amdgpu_device_need_post(adev)) { 4499 if (!adev->bios) { 4500 dev_err(adev->dev, "no vBIOS found\n"); 4501 r = -EINVAL; 4502 goto failed; 4503 } 4504 DRM_INFO("GPU posting now...\n"); 4505 r = amdgpu_device_asic_init(adev); 4506 if (r) { 4507 dev_err(adev->dev, "gpu post error!\n"); 4508 goto failed; 4509 } 4510 } 4511 4512 if (adev->bios) { 4513 if (adev->is_atom_fw) { 4514 /* Initialize clocks */ 4515 r = amdgpu_atomfirmware_get_clock_info(adev); 4516 if (r) { 4517 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4518 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4519 goto failed; 4520 } 4521 } else { 4522 /* Initialize clocks */ 4523 r = amdgpu_atombios_get_clock_info(adev); 4524 if (r) { 4525 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4526 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4527 goto failed; 4528 } 4529 /* init i2c buses */ 4530 amdgpu_i2c_init(adev); 4531 } 4532 } 4533 4534 fence_driver_init: 4535 /* Fence driver */ 4536 r = amdgpu_fence_driver_sw_init(adev); 4537 if (r) { 4538 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4539 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4540 goto failed; 4541 } 4542 4543 /* init the mode config */ 4544 drm_mode_config_init(adev_to_drm(adev)); 4545 4546 r = amdgpu_device_ip_init(adev); 4547 if (r) { 4548 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4549 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4550 goto release_ras_con; 4551 } 4552 4553 amdgpu_fence_driver_hw_init(adev); 4554 4555 dev_info(adev->dev, 4556 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4557 adev->gfx.config.max_shader_engines, 4558 adev->gfx.config.max_sh_per_se, 4559 adev->gfx.config.max_cu_per_sh, 4560 adev->gfx.cu_info.number); 4561 4562 adev->accel_working = true; 4563 4564 amdgpu_vm_check_compute_bug(adev); 4565 4566 /* Initialize the buffer migration limit. */ 4567 if (amdgpu_moverate >= 0) 4568 max_MBps = amdgpu_moverate; 4569 else 4570 max_MBps = 8; /* Allow 8 MB/s. */ 4571 /* Get a log2 for easy divisions. */ 4572 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4573 4574 /* 4575 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4576 * Otherwise the mgpu fan boost feature will be skipped due to the 4577 * gpu instance is counted less. 4578 */ 4579 amdgpu_register_gpu_instance(adev); 4580 4581 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4582 * explicit gating rather than handling it automatically. 
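 *
 * When the init level is AMDGPU_INIT_LEVEL_MINIMAL_XGMI (a hive reset is
 * still pending), late init, RAS resume and the delayed init work are
 * skipped here; the post-reset reinit path performs them once the hive
 * reset has completed.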
4583 */ 4584 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4585 r = amdgpu_device_ip_late_init(adev); 4586 if (r) { 4587 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4588 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4589 goto release_ras_con; 4590 } 4591 /* must succeed. */ 4592 amdgpu_ras_resume(adev); 4593 queue_delayed_work(system_wq, &adev->delayed_init_work, 4594 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4595 } 4596 4597 if (amdgpu_sriov_vf(adev)) { 4598 amdgpu_virt_release_full_gpu(adev, true); 4599 flush_delayed_work(&adev->delayed_init_work); 4600 } 4601 4602 /* 4603 * Place those sysfs registering after `late_init`. As some of those 4604 * operations performed in `late_init` might affect the sysfs 4605 * interfaces creating. 4606 */ 4607 r = amdgpu_atombios_sysfs_init(adev); 4608 if (r) 4609 drm_err(&adev->ddev, 4610 "registering atombios sysfs failed (%d).\n", r); 4611 4612 r = amdgpu_pm_sysfs_init(adev); 4613 if (r) 4614 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4615 4616 r = amdgpu_ucode_sysfs_init(adev); 4617 if (r) { 4618 adev->ucode_sysfs_en = false; 4619 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4620 } else 4621 adev->ucode_sysfs_en = true; 4622 4623 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4624 if (r) 4625 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4626 4627 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4628 if (r) 4629 dev_err(adev->dev, 4630 "Could not create amdgpu board attributes\n"); 4631 4632 amdgpu_fru_sysfs_init(adev); 4633 amdgpu_reg_state_sysfs_init(adev); 4634 amdgpu_xcp_cfg_sysfs_init(adev); 4635 4636 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4637 r = amdgpu_pmu_init(adev); 4638 if (r) 4639 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4640 4641 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4642 if (amdgpu_device_cache_pci_state(adev->pdev)) 4643 pci_restore_state(pdev); 4644 4645 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4646 /* this will fail for cards that aren't VGA class devices, just 4647 * ignore it 4648 */ 4649 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4650 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4651 4652 px = amdgpu_device_supports_px(ddev); 4653 4654 if (px || (!dev_is_removable(&adev->pdev->dev) && 4655 apple_gmux_detect(NULL, NULL))) 4656 vga_switcheroo_register_client(adev->pdev, 4657 &amdgpu_switcheroo_ops, px); 4658 4659 if (px) 4660 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4661 4662 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4663 amdgpu_xgmi_reset_on_init(adev); 4664 4665 amdgpu_device_check_iommu_direct_map(adev); 4666 4667 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4668 r = register_pm_notifier(&adev->pm_nb); 4669 if (r) 4670 goto failed; 4671 4672 return 0; 4673 4674 release_ras_con: 4675 if (amdgpu_sriov_vf(adev)) 4676 amdgpu_virt_release_full_gpu(adev, true); 4677 4678 /* failed in exclusive mode due to timeout */ 4679 if (amdgpu_sriov_vf(adev) && 4680 !amdgpu_sriov_runtime(adev) && 4681 amdgpu_virt_mmio_blocked(adev) && 4682 !amdgpu_virt_wait_reset(adev)) { 4683 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4684 /* Don't send request since VF is inactive. 
*/ 4685 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4686 adev->virt.ops = NULL; 4687 r = -EAGAIN; 4688 } 4689 amdgpu_release_ras_context(adev); 4690 4691 failed: 4692 amdgpu_vf_error_trans_all(adev); 4693 4694 return r; 4695 } 4696 4697 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4698 { 4699 4700 /* Clear all CPU mappings pointing to this device */ 4701 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4702 4703 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4704 amdgpu_doorbell_fini(adev); 4705 4706 iounmap(adev->rmmio); 4707 adev->rmmio = NULL; 4708 if (adev->mman.aper_base_kaddr) 4709 iounmap(adev->mman.aper_base_kaddr); 4710 adev->mman.aper_base_kaddr = NULL; 4711 4712 /* Memory manager related */ 4713 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4714 arch_phys_wc_del(adev->gmc.vram_mtrr); 4715 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4716 } 4717 } 4718 4719 /** 4720 * amdgpu_device_fini_hw - tear down the driver 4721 * 4722 * @adev: amdgpu_device pointer 4723 * 4724 * Tear down the driver info (all asics). 4725 * Called at driver shutdown. 4726 */ 4727 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4728 { 4729 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4730 flush_delayed_work(&adev->delayed_init_work); 4731 4732 if (adev->mman.initialized) 4733 drain_workqueue(adev->mman.bdev.wq); 4734 adev->shutdown = true; 4735 4736 unregister_pm_notifier(&adev->pm_nb); 4737 4738 /* make sure IB test finished before entering exclusive mode 4739 * to avoid preemption on IB test 4740 */ 4741 if (amdgpu_sriov_vf(adev)) { 4742 amdgpu_virt_request_full_gpu(adev, false); 4743 amdgpu_virt_fini_data_exchange(adev); 4744 } 4745 4746 /* disable all interrupts */ 4747 amdgpu_irq_disable_all(adev); 4748 if (adev->mode_info.mode_config_initialized) { 4749 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4750 drm_helper_force_disable_all(adev_to_drm(adev)); 4751 else 4752 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4753 } 4754 amdgpu_fence_driver_hw_fini(adev); 4755 4756 if (adev->pm.sysfs_initialized) 4757 amdgpu_pm_sysfs_fini(adev); 4758 if (adev->ucode_sysfs_en) 4759 amdgpu_ucode_sysfs_fini(adev); 4760 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4761 amdgpu_fru_sysfs_fini(adev); 4762 4763 amdgpu_reg_state_sysfs_fini(adev); 4764 amdgpu_xcp_cfg_sysfs_fini(adev); 4765 4766 /* disable ras feature must before hw fini */ 4767 amdgpu_ras_pre_fini(adev); 4768 4769 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4770 4771 amdgpu_device_ip_fini_early(adev); 4772 4773 amdgpu_irq_fini_hw(adev); 4774 4775 if (adev->mman.initialized) 4776 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4777 4778 amdgpu_gart_dummy_page_fini(adev); 4779 4780 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4781 amdgpu_device_unmap_mmio(adev); 4782 4783 } 4784 4785 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4786 { 4787 int idx; 4788 bool px; 4789 4790 amdgpu_device_ip_fini(adev); 4791 amdgpu_fence_driver_sw_fini(adev); 4792 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4793 adev->accel_working = false; 4794 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4795 4796 amdgpu_reset_fini(adev); 4797 4798 /* free i2c buses */ 4799 amdgpu_i2c_fini(adev); 4800 4801 if (adev->bios) { 4802 if (amdgpu_emu_mode != 1) 4803 amdgpu_atombios_fini(adev); 4804 amdgpu_bios_release(adev); 4805 } 4806 4807 kfree(adev->fru_info); 4808 adev->fru_info = NULL; 4809 4810 kfree(adev->xcp_mgr); 4811 
adev->xcp_mgr = NULL; 4812 4813 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4814 4815 if (px || (!dev_is_removable(&adev->pdev->dev) && 4816 apple_gmux_detect(NULL, NULL))) 4817 vga_switcheroo_unregister_client(adev->pdev); 4818 4819 if (px) 4820 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4821 4822 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4823 vga_client_unregister(adev->pdev); 4824 4825 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4826 4827 iounmap(adev->rmmio); 4828 adev->rmmio = NULL; 4829 amdgpu_doorbell_fini(adev); 4830 drm_dev_exit(idx); 4831 } 4832 4833 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4834 amdgpu_pmu_fini(adev); 4835 if (adev->mman.discovery_bin) 4836 amdgpu_discovery_fini(adev); 4837 4838 amdgpu_reset_put_reset_domain(adev->reset_domain); 4839 adev->reset_domain = NULL; 4840 4841 kfree(adev->pci_state); 4842 4843 } 4844 4845 /** 4846 * amdgpu_device_evict_resources - evict device resources 4847 * @adev: amdgpu device object 4848 * 4849 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4850 * of the vram memory type. Mainly used for evicting device resources 4851 * at suspend time. 4852 * 4853 */ 4854 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4855 { 4856 int ret; 4857 4858 /* No need to evict vram on APUs unless going to S4 */ 4859 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4860 return 0; 4861 4862 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4863 if (ret) 4864 DRM_WARN("evicting device resources failed\n"); 4865 return ret; 4866 } 4867 4868 /* 4869 * Suspend & resume. 4870 */ 4871 /** 4872 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4873 * @nb: notifier block 4874 * @mode: suspend mode 4875 * @data: data 4876 * 4877 * This function is called when the system is about to suspend or hibernate. 4878 * It is used to evict resources from the device before the system goes to 4879 * sleep while there is still access to swap. 4880 */ 4881 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4882 void *data) 4883 { 4884 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4885 int r; 4886 4887 switch (mode) { 4888 case PM_HIBERNATION_PREPARE: 4889 adev->in_s4 = true; 4890 fallthrough; 4891 case PM_SUSPEND_PREPARE: 4892 r = amdgpu_device_evict_resources(adev); 4893 /* 4894 * This is considered non-fatal at this time because 4895 * amdgpu_device_prepare() will also fatally evict resources. 4896 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4897 */ 4898 if (r) 4899 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4900 break; 4901 } 4902 4903 return NOTIFY_DONE; 4904 } 4905 4906 /** 4907 * amdgpu_device_prepare - prepare for device suspend 4908 * 4909 * @dev: drm dev pointer 4910 * 4911 * Prepare to put the hw in the suspend state (all asics). 4912 * Returns 0 for success or an error on failure. 4913 * Called at driver suspend. 
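 *
 * A minimal sketch of how this pairs with the system sleep callbacks is
 * shown below. The names are illustrative only, and it assumes the
 * drm_device is stored as the PCI device's drvdata (as DRM PCI drivers
 * commonly do); the real dev_pm_ops wiring carries additional state
 * handling:
 *
 *   static int example_pmops_prepare(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_prepare(drm_dev);
 *   }
 *
 *   static int example_pmops_suspend(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_suspend(drm_dev, true);
 *   }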
4914 */ 4915 int amdgpu_device_prepare(struct drm_device *dev) 4916 { 4917 struct amdgpu_device *adev = drm_to_adev(dev); 4918 int i, r; 4919 4920 amdgpu_choose_low_power_state(adev); 4921 4922 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4923 return 0; 4924 4925 /* Evict the majority of BOs before starting suspend sequence */ 4926 r = amdgpu_device_evict_resources(adev); 4927 if (r) 4928 goto unprepare; 4929 4930 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4931 4932 for (i = 0; i < adev->num_ip_blocks; i++) { 4933 if (!adev->ip_blocks[i].status.valid) 4934 continue; 4935 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4936 continue; 4937 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4938 if (r) 4939 goto unprepare; 4940 } 4941 4942 return 0; 4943 4944 unprepare: 4945 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 4946 4947 return r; 4948 } 4949 4950 /** 4951 * amdgpu_device_suspend - initiate device suspend 4952 * 4953 * @dev: drm dev pointer 4954 * @notify_clients: notify in-kernel DRM clients 4955 * 4956 * Puts the hw in the suspend state (all asics). 4957 * Returns 0 for success or an error on failure. 4958 * Called at driver suspend. 4959 */ 4960 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4961 { 4962 struct amdgpu_device *adev = drm_to_adev(dev); 4963 int r = 0; 4964 4965 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4966 return 0; 4967 4968 adev->in_suspend = true; 4969 4970 if (amdgpu_sriov_vf(adev)) { 4971 amdgpu_virt_fini_data_exchange(adev); 4972 r = amdgpu_virt_request_full_gpu(adev, false); 4973 if (r) 4974 return r; 4975 } 4976 4977 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4978 DRM_WARN("smart shift update failed\n"); 4979 4980 if (notify_clients) 4981 drm_client_dev_suspend(adev_to_drm(adev), false); 4982 4983 cancel_delayed_work_sync(&adev->delayed_init_work); 4984 4985 amdgpu_ras_suspend(adev); 4986 4987 amdgpu_device_ip_suspend_phase1(adev); 4988 4989 if (!adev->in_s0ix) 4990 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4991 4992 r = amdgpu_device_evict_resources(adev); 4993 if (r) 4994 return r; 4995 4996 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4997 4998 amdgpu_fence_driver_hw_fini(adev); 4999 5000 amdgpu_device_ip_suspend_phase2(adev); 5001 5002 if (amdgpu_sriov_vf(adev)) 5003 amdgpu_virt_release_full_gpu(adev, false); 5004 5005 r = amdgpu_dpm_notify_rlc_state(adev, false); 5006 if (r) 5007 return r; 5008 5009 return 0; 5010 } 5011 5012 /** 5013 * amdgpu_device_resume - initiate device resume 5014 * 5015 * @dev: drm dev pointer 5016 * @notify_clients: notify in-kernel DRM clients 5017 * 5018 * Bring the hw back to operating state (all asics). 5019 * Returns 0 for success or an error on failure. 5020 * Called at driver resume. 
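 *
 * Note that for S0ix the GFX, PSP and MES blocks (and SDMA 5.x+) were never
 * suspended (see amdgpu_device_ip_suspend_phase2()), so on resume only the
 * DPM gfx state change back to D0 is needed for them, and the KFD resume is
 * skipped just as its suspend was.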
5021 */ 5022 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5023 { 5024 struct amdgpu_device *adev = drm_to_adev(dev); 5025 int r = 0; 5026 5027 if (amdgpu_sriov_vf(adev)) { 5028 r = amdgpu_virt_request_full_gpu(adev, true); 5029 if (r) 5030 return r; 5031 } 5032 5033 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5034 return 0; 5035 5036 if (adev->in_s0ix) 5037 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5038 5039 /* post card */ 5040 if (amdgpu_device_need_post(adev)) { 5041 r = amdgpu_device_asic_init(adev); 5042 if (r) 5043 dev_err(adev->dev, "amdgpu asic init failed\n"); 5044 } 5045 5046 r = amdgpu_device_ip_resume(adev); 5047 5048 if (r) { 5049 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5050 goto exit; 5051 } 5052 5053 if (!adev->in_s0ix) { 5054 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5055 if (r) 5056 goto exit; 5057 } 5058 5059 r = amdgpu_device_ip_late_init(adev); 5060 if (r) 5061 goto exit; 5062 5063 queue_delayed_work(system_wq, &adev->delayed_init_work, 5064 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5065 exit: 5066 if (amdgpu_sriov_vf(adev)) { 5067 amdgpu_virt_init_data_exchange(adev); 5068 amdgpu_virt_release_full_gpu(adev, true); 5069 } 5070 5071 if (r) 5072 return r; 5073 5074 /* Make sure IB tests flushed */ 5075 flush_delayed_work(&adev->delayed_init_work); 5076 5077 if (notify_clients) 5078 drm_client_dev_resume(adev_to_drm(adev), false); 5079 5080 amdgpu_ras_resume(adev); 5081 5082 if (adev->mode_info.num_crtc) { 5083 /* 5084 * Most of the connector probing functions try to acquire runtime pm 5085 * refs to ensure that the GPU is powered on when connector polling is 5086 * performed. Since we're calling this from a runtime PM callback, 5087 * trying to acquire rpm refs will cause us to deadlock. 5088 * 5089 * Since we're guaranteed to be holding the rpm lock, it's safe to 5090 * temporarily disable the rpm helpers so this doesn't deadlock us. 5091 */ 5092 #ifdef CONFIG_PM 5093 dev->dev->power.disable_depth++; 5094 #endif 5095 if (!adev->dc_enabled) 5096 drm_helper_hpd_irq_event(dev); 5097 else 5098 drm_kms_helper_hotplug_event(dev); 5099 #ifdef CONFIG_PM 5100 dev->dev->power.disable_depth--; 5101 #endif 5102 } 5103 adev->in_suspend = false; 5104 5105 if (adev->enable_mes) 5106 amdgpu_mes_self_test(adev); 5107 5108 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5109 DRM_WARN("smart shift update failed\n"); 5110 5111 return 0; 5112 } 5113 5114 /** 5115 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5116 * 5117 * @adev: amdgpu_device pointer 5118 * 5119 * The list of all the hardware IPs that make up the asic is walked and 5120 * the check_soft_reset callbacks are run. check_soft_reset determines 5121 * if the asic is still hung or not. 5122 * Returns true if any of the IPs are still in a hung state, false if not. 
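 *
 * Under SR-IOV, or when amdgpu_asic_need_full_reset() reports that a full
 * reset is required, the per-IP checks are skipped and a hang is assumed so
 * that recovery always takes the full reset path.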
5123 */ 5124 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5125 { 5126 int i; 5127 bool asic_hang = false; 5128 5129 if (amdgpu_sriov_vf(adev)) 5130 return true; 5131 5132 if (amdgpu_asic_need_full_reset(adev)) 5133 return true; 5134 5135 for (i = 0; i < adev->num_ip_blocks; i++) { 5136 if (!adev->ip_blocks[i].status.valid) 5137 continue; 5138 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5139 adev->ip_blocks[i].status.hang = 5140 adev->ip_blocks[i].version->funcs->check_soft_reset( 5141 &adev->ip_blocks[i]); 5142 if (adev->ip_blocks[i].status.hang) { 5143 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5144 asic_hang = true; 5145 } 5146 } 5147 return asic_hang; 5148 } 5149 5150 /** 5151 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5152 * 5153 * @adev: amdgpu_device pointer 5154 * 5155 * The list of all the hardware IPs that make up the asic is walked and the 5156 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5157 * handles any IP specific hardware or software state changes that are 5158 * necessary for a soft reset to succeed. 5159 * Returns 0 on success, negative error code on failure. 5160 */ 5161 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5162 { 5163 int i, r = 0; 5164 5165 for (i = 0; i < adev->num_ip_blocks; i++) { 5166 if (!adev->ip_blocks[i].status.valid) 5167 continue; 5168 if (adev->ip_blocks[i].status.hang && 5169 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5170 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5171 if (r) 5172 return r; 5173 } 5174 } 5175 5176 return 0; 5177 } 5178 5179 /** 5180 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5181 * 5182 * @adev: amdgpu_device pointer 5183 * 5184 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5185 * reset is necessary to recover. 5186 * Returns true if a full asic reset is required, false if not. 5187 */ 5188 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5189 { 5190 int i; 5191 5192 if (amdgpu_asic_need_full_reset(adev)) 5193 return true; 5194 5195 for (i = 0; i < adev->num_ip_blocks; i++) { 5196 if (!adev->ip_blocks[i].status.valid) 5197 continue; 5198 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5199 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5200 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5201 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5202 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5203 if (adev->ip_blocks[i].status.hang) { 5204 dev_info(adev->dev, "Some block need full reset!\n"); 5205 return true; 5206 } 5207 } 5208 } 5209 return false; 5210 } 5211 5212 /** 5213 * amdgpu_device_ip_soft_reset - do a soft reset 5214 * 5215 * @adev: amdgpu_device pointer 5216 * 5217 * The list of all the hardware IPs that make up the asic is walked and the 5218 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5219 * IP specific hardware or software state changes that are necessary to soft 5220 * reset the IP. 5221 * Returns 0 on success, negative error code on failure. 
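 *
 * A condensed sketch of how the soft reset helpers are sequenced by
 * amdgpu_device_pre_asic_reset() further down (bare metal only):
 *
 *   if (!need_full_reset && amdgpu_gpu_recovery &&
 *       amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           r = amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *           if (r || amdgpu_device_ip_check_soft_reset(adev))
 *                   need_full_reset = true; /* fall back to a full reset */
 *   }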
5222 */ 5223 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5224 { 5225 int i, r = 0; 5226 5227 for (i = 0; i < adev->num_ip_blocks; i++) { 5228 if (!adev->ip_blocks[i].status.valid) 5229 continue; 5230 if (adev->ip_blocks[i].status.hang && 5231 adev->ip_blocks[i].version->funcs->soft_reset) { 5232 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5233 if (r) 5234 return r; 5235 } 5236 } 5237 5238 return 0; 5239 } 5240 5241 /** 5242 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5243 * 5244 * @adev: amdgpu_device pointer 5245 * 5246 * The list of all the hardware IPs that make up the asic is walked and the 5247 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5248 * handles any IP specific hardware or software state changes that are 5249 * necessary after the IP has been soft reset. 5250 * Returns 0 on success, negative error code on failure. 5251 */ 5252 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5253 { 5254 int i, r = 0; 5255 5256 for (i = 0; i < adev->num_ip_blocks; i++) { 5257 if (!adev->ip_blocks[i].status.valid) 5258 continue; 5259 if (adev->ip_blocks[i].status.hang && 5260 adev->ip_blocks[i].version->funcs->post_soft_reset) 5261 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5262 if (r) 5263 return r; 5264 } 5265 5266 return 0; 5267 } 5268 5269 /** 5270 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5271 * 5272 * @adev: amdgpu_device pointer 5273 * @reset_context: amdgpu reset context pointer 5274 * 5275 * do VF FLR and reinitialize Asic 5276 * return 0 means succeeded otherwise failed 5277 */ 5278 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5279 struct amdgpu_reset_context *reset_context) 5280 { 5281 int r; 5282 struct amdgpu_hive_info *hive = NULL; 5283 5284 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5285 if (!amdgpu_ras_get_fed_status(adev)) 5286 amdgpu_virt_ready_to_reset(adev); 5287 amdgpu_virt_wait_reset(adev); 5288 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5289 r = amdgpu_virt_request_full_gpu(adev, true); 5290 } else { 5291 r = amdgpu_virt_reset_gpu(adev); 5292 } 5293 if (r) 5294 return r; 5295 5296 amdgpu_ras_clear_err_state(adev); 5297 amdgpu_irq_gpu_reset_resume_helper(adev); 5298 5299 /* some sw clean up VF needs to do before recover */ 5300 amdgpu_virt_post_reset(adev); 5301 5302 /* Resume IP prior to SMC */ 5303 r = amdgpu_device_ip_reinit_early_sriov(adev); 5304 if (r) 5305 return r; 5306 5307 amdgpu_virt_init_data_exchange(adev); 5308 5309 r = amdgpu_device_fw_loading(adev); 5310 if (r) 5311 return r; 5312 5313 /* now we are okay to resume SMC/CP/SDMA */ 5314 r = amdgpu_device_ip_reinit_late_sriov(adev); 5315 if (r) 5316 return r; 5317 5318 hive = amdgpu_get_xgmi_hive(adev); 5319 /* Update PSP FW topology after reset */ 5320 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5321 r = amdgpu_xgmi_update_topology(hive, adev); 5322 if (hive) 5323 amdgpu_put_xgmi_hive(hive); 5324 if (r) 5325 return r; 5326 5327 r = amdgpu_ib_ring_tests(adev); 5328 if (r) 5329 return r; 5330 5331 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5332 amdgpu_inc_vram_lost(adev); 5333 5334 /* need to be called during full access so we can't do it later like 5335 * bare-metal does. 
5336 */ 5337 amdgpu_amdkfd_post_reset(adev); 5338 amdgpu_virt_release_full_gpu(adev, true); 5339 5340 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5341 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5342 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5343 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5344 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5345 amdgpu_ras_resume(adev); 5346 5347 amdgpu_virt_ras_telemetry_post_reset(adev); 5348 5349 return 0; 5350 } 5351 5352 /** 5353 * amdgpu_device_has_job_running - check if there is any unfinished job 5354 * 5355 * @adev: amdgpu_device pointer 5356 * 5357 * check if there is any job running on the device when guest driver receives 5358 * FLR notification from host driver. If there are still jobs running, then 5359 * the guest driver will not respond the FLR reset. Instead, let the job hit 5360 * the timeout and guest driver then issue the reset request. 5361 */ 5362 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5363 { 5364 int i; 5365 5366 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5367 struct amdgpu_ring *ring = adev->rings[i]; 5368 5369 if (!amdgpu_ring_sched_ready(ring)) 5370 continue; 5371 5372 if (amdgpu_fence_count_emitted(ring)) 5373 return true; 5374 } 5375 return false; 5376 } 5377 5378 /** 5379 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5380 * 5381 * @adev: amdgpu_device pointer 5382 * 5383 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5384 * a hung GPU. 5385 */ 5386 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5387 { 5388 5389 if (amdgpu_gpu_recovery == 0) 5390 goto disabled; 5391 5392 /* Skip soft reset check in fatal error mode */ 5393 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5394 return true; 5395 5396 if (amdgpu_sriov_vf(adev)) 5397 return true; 5398 5399 if (amdgpu_gpu_recovery == -1) { 5400 switch (adev->asic_type) { 5401 #ifdef CONFIG_DRM_AMDGPU_SI 5402 case CHIP_VERDE: 5403 case CHIP_TAHITI: 5404 case CHIP_PITCAIRN: 5405 case CHIP_OLAND: 5406 case CHIP_HAINAN: 5407 #endif 5408 #ifdef CONFIG_DRM_AMDGPU_CIK 5409 case CHIP_KAVERI: 5410 case CHIP_KABINI: 5411 case CHIP_MULLINS: 5412 #endif 5413 case CHIP_CARRIZO: 5414 case CHIP_STONEY: 5415 case CHIP_CYAN_SKILLFISH: 5416 goto disabled; 5417 default: 5418 break; 5419 } 5420 } 5421 5422 return true; 5423 5424 disabled: 5425 dev_info(adev->dev, "GPU recovery disabled.\n"); 5426 return false; 5427 } 5428 5429 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5430 { 5431 u32 i; 5432 int ret = 0; 5433 5434 if (adev->bios) 5435 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5436 5437 dev_info(adev->dev, "GPU mode1 reset\n"); 5438 5439 /* Cache the state before bus master disable. The saved config space 5440 * values are used in other cases like restore after mode-2 reset. 
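 * Within this function the cached state is re-applied with
 * amdgpu_device_load_pci_state() as soon as the mode-1 reset itself has
 * completed, before waiting for the PSP bootloader.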
5441 */ 5442 amdgpu_device_cache_pci_state(adev->pdev); 5443 5444 /* disable BM */ 5445 pci_clear_master(adev->pdev); 5446 5447 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5448 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5449 ret = amdgpu_dpm_mode1_reset(adev); 5450 } else { 5451 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5452 ret = psp_gpu_reset(adev); 5453 } 5454 5455 if (ret) 5456 goto mode1_reset_failed; 5457 5458 amdgpu_device_load_pci_state(adev->pdev); 5459 ret = amdgpu_psp_wait_for_bootloader(adev); 5460 if (ret) 5461 goto mode1_reset_failed; 5462 5463 /* wait for asic to come out of reset */ 5464 for (i = 0; i < adev->usec_timeout; i++) { 5465 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5466 5467 if (memsize != 0xffffffff) 5468 break; 5469 udelay(1); 5470 } 5471 5472 if (i >= adev->usec_timeout) { 5473 ret = -ETIMEDOUT; 5474 goto mode1_reset_failed; 5475 } 5476 5477 if (adev->bios) 5478 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5479 5480 return 0; 5481 5482 mode1_reset_failed: 5483 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5484 return ret; 5485 } 5486 5487 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5488 struct amdgpu_reset_context *reset_context) 5489 { 5490 int i, r = 0; 5491 struct amdgpu_job *job = NULL; 5492 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5493 bool need_full_reset = 5494 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5495 5496 if (reset_context->reset_req_dev == adev) 5497 job = reset_context->job; 5498 5499 if (amdgpu_sriov_vf(adev)) 5500 amdgpu_virt_pre_reset(adev); 5501 5502 amdgpu_fence_driver_isr_toggle(adev, true); 5503 5504 /* block all schedulers and reset given job's ring */ 5505 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5506 struct amdgpu_ring *ring = adev->rings[i]; 5507 5508 if (!amdgpu_ring_sched_ready(ring)) 5509 continue; 5510 5511 /* Clear job fence from fence drv to avoid force_completion 5512 * leave NULL and vm flush fence in fence drv 5513 */ 5514 amdgpu_fence_driver_clear_job_fences(ring); 5515 5516 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5517 amdgpu_fence_driver_force_completion(ring); 5518 } 5519 5520 amdgpu_fence_driver_isr_toggle(adev, false); 5521 5522 if (job && job->vm) 5523 drm_sched_increase_karma(&job->base); 5524 5525 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5526 /* If reset handler not implemented, continue; otherwise return */ 5527 if (r == -EOPNOTSUPP) 5528 r = 0; 5529 else 5530 return r; 5531 5532 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5533 if (!amdgpu_sriov_vf(adev)) { 5534 5535 if (!need_full_reset) 5536 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5537 5538 if (!need_full_reset && amdgpu_gpu_recovery && 5539 amdgpu_device_ip_check_soft_reset(adev)) { 5540 amdgpu_device_ip_pre_soft_reset(adev); 5541 r = amdgpu_device_ip_soft_reset(adev); 5542 amdgpu_device_ip_post_soft_reset(adev); 5543 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5544 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5545 need_full_reset = true; 5546 } 5547 } 5548 5549 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5550 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5551 /* Trigger ip dump before we reset the asic */ 5552 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5553 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5554 tmp_adev->ip_blocks[i].version->funcs 5555 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5556 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5557 } 5558 5559 if (need_full_reset) 5560 r = amdgpu_device_ip_suspend(adev); 5561 if (need_full_reset) 5562 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5563 else 5564 clear_bit(AMDGPU_NEED_FULL_RESET, 5565 &reset_context->flags); 5566 } 5567 5568 return r; 5569 } 5570 5571 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5572 { 5573 struct list_head *device_list_handle; 5574 bool full_reset, vram_lost = false; 5575 struct amdgpu_device *tmp_adev; 5576 int r, init_level; 5577 5578 device_list_handle = reset_context->reset_device_list; 5579 5580 if (!device_list_handle) 5581 return -EINVAL; 5582 5583 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5584 5585 /** 5586 * If it's reset on init, it's default init level, otherwise keep level 5587 * as recovery level. 5588 */ 5589 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5590 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5591 else 5592 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5593 5594 r = 0; 5595 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5596 amdgpu_set_init_level(tmp_adev, init_level); 5597 if (full_reset) { 5598 /* post card */ 5599 amdgpu_ras_clear_err_state(tmp_adev); 5600 r = amdgpu_device_asic_init(tmp_adev); 5601 if (r) { 5602 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5603 } else { 5604 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5605 5606 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5607 if (r) 5608 goto out; 5609 5610 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5611 5612 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5613 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5614 5615 if (vram_lost) { 5616 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5617 amdgpu_inc_vram_lost(tmp_adev); 5618 } 5619 5620 r = amdgpu_device_fw_loading(tmp_adev); 5621 if (r) 5622 return r; 5623 5624 r = amdgpu_xcp_restore_partition_mode( 5625 tmp_adev->xcp_mgr); 5626 if (r) 5627 goto out; 5628 5629 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5630 if (r) 5631 goto out; 5632 5633 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5634 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5635 5636 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5637 if (r) 5638 goto out; 5639 5640 if (vram_lost) 5641 amdgpu_device_fill_reset_magic(tmp_adev); 5642 5643 /* 5644 * Add this ASIC as tracked as reset was already 5645 * complete successfully. 5646 */ 5647 amdgpu_register_gpu_instance(tmp_adev); 5648 5649 if (!reset_context->hive && 5650 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5651 amdgpu_xgmi_add_device(tmp_adev); 5652 5653 r = amdgpu_device_ip_late_init(tmp_adev); 5654 if (r) 5655 goto out; 5656 5657 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5658 5659 /* 5660 * The GPU enters bad state once faulty pages 5661 * by ECC has reached the threshold, and ras 5662 * recovery is scheduled next. So add one check 5663 * here to break recovery if it indeed exceeds 5664 * bad page threshold, and remind user to 5665 * retire this GPU or setting one bigger 5666 * bad_page_threshold value to fix this once 5667 * probing driver again. 5668 */ 5669 if (!amdgpu_ras_is_rma(tmp_adev)) { 5670 /* must succeed. 
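 * amdgpu_ras_resume() re-arms RAS handling after the reset; if the
 * device has instead been declared RMA (bad page threshold exceeded),
 * recovery is aborted with -EINVAL rather than resumed.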
*/ 5671 amdgpu_ras_resume(tmp_adev); 5672 } else { 5673 r = -EINVAL; 5674 goto out; 5675 } 5676 5677 /* Update PSP FW topology after reset */ 5678 if (reset_context->hive && 5679 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5680 r = amdgpu_xgmi_update_topology( 5681 reset_context->hive, tmp_adev); 5682 } 5683 } 5684 5685 out: 5686 if (!r) { 5687 /* IP init is complete now, set level as default */ 5688 amdgpu_set_init_level(tmp_adev, 5689 AMDGPU_INIT_LEVEL_DEFAULT); 5690 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5691 r = amdgpu_ib_ring_tests(tmp_adev); 5692 if (r) { 5693 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5694 r = -EAGAIN; 5695 goto end; 5696 } 5697 } 5698 5699 if (r) 5700 tmp_adev->asic_reset_res = r; 5701 } 5702 5703 end: 5704 return r; 5705 } 5706 5707 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5708 struct amdgpu_reset_context *reset_context) 5709 { 5710 struct amdgpu_device *tmp_adev = NULL; 5711 bool need_full_reset, skip_hw_reset; 5712 int r = 0; 5713 5714 /* Try reset handler method first */ 5715 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5716 reset_list); 5717 5718 reset_context->reset_device_list = device_list_handle; 5719 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5720 /* If reset handler not implemented, continue; otherwise return */ 5721 if (r == -EOPNOTSUPP) 5722 r = 0; 5723 else 5724 return r; 5725 5726 /* Reset handler not implemented, use the default method */ 5727 need_full_reset = 5728 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5729 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5730 5731 /* 5732 * ASIC reset has to be done on all XGMI hive nodes ASAP 5733 * to allow proper links negotiation in FW (within 1 sec) 5734 */ 5735 if (!skip_hw_reset && need_full_reset) { 5736 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5737 /* For XGMI run all resets in parallel to speed up the process */ 5738 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5739 if (!queue_work(system_unbound_wq, 5740 &tmp_adev->xgmi_reset_work)) 5741 r = -EALREADY; 5742 } else 5743 r = amdgpu_asic_reset(tmp_adev); 5744 5745 if (r) { 5746 dev_err(tmp_adev->dev, 5747 "ASIC reset failed with error, %d for drm dev, %s", 5748 r, adev_to_drm(tmp_adev)->unique); 5749 goto out; 5750 } 5751 } 5752 5753 /* For XGMI wait for all resets to complete before proceed */ 5754 if (!r) { 5755 list_for_each_entry(tmp_adev, device_list_handle, 5756 reset_list) { 5757 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5758 flush_work(&tmp_adev->xgmi_reset_work); 5759 r = tmp_adev->asic_reset_res; 5760 if (r) 5761 break; 5762 } 5763 } 5764 } 5765 } 5766 5767 if (!r && amdgpu_ras_intr_triggered()) { 5768 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5769 amdgpu_ras_reset_error_count(tmp_adev, 5770 AMDGPU_RAS_BLOCK__MMHUB); 5771 } 5772 5773 amdgpu_ras_intr_cleared(); 5774 } 5775 5776 r = amdgpu_device_reinit_after_reset(reset_context); 5777 if (r == -EAGAIN) 5778 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5779 else 5780 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5781 5782 out: 5783 return r; 5784 } 5785 5786 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5787 { 5788 5789 switch (amdgpu_asic_reset_method(adev)) { 5790 case AMD_RESET_METHOD_MODE1: 5791 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5792 break; 5793 case AMD_RESET_METHOD_MODE2: 5794 adev->mp1_state = PP_MP1_STATE_RESET; 5795 break; 5796 default: 5797 adev->mp1_state = 
PP_MP1_STATE_NONE; 5798 break; 5799 } 5800 } 5801 5802 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5803 { 5804 amdgpu_vf_error_trans_all(adev); 5805 adev->mp1_state = PP_MP1_STATE_NONE; 5806 } 5807 5808 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5809 { 5810 struct pci_dev *p = NULL; 5811 5812 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5813 adev->pdev->bus->number, 1); 5814 if (p) { 5815 pm_runtime_enable(&(p->dev)); 5816 pm_runtime_resume(&(p->dev)); 5817 } 5818 5819 pci_dev_put(p); 5820 } 5821 5822 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5823 { 5824 enum amd_reset_method reset_method; 5825 struct pci_dev *p = NULL; 5826 u64 expires; 5827 5828 /* 5829 * For now, only BACO and mode1 reset are confirmed 5830 * to suffer the audio issue when the audio device is not properly suspended. 5831 */ 5832 reset_method = amdgpu_asic_reset_method(adev); 5833 if ((reset_method != AMD_RESET_METHOD_BACO) && 5834 (reset_method != AMD_RESET_METHOD_MODE1)) 5835 return -EINVAL; 5836 5837 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5838 adev->pdev->bus->number, 1); 5839 if (!p) 5840 return -ENODEV; 5841 5842 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5843 if (!expires) 5844 /* 5845 * If we cannot get the audio device autosuspend delay, 5846 * use a fixed 4s interval. The audio controller's 5847 * default autosuspend delay is 3s, so the 5848 * 4s used here is guaranteed to cover it. 5849 */ 5850 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5851 5852 while (!pm_runtime_status_suspended(&(p->dev))) { 5853 if (!pm_runtime_suspend(&(p->dev))) 5854 break; 5855 5856 if (expires < ktime_get_mono_fast_ns()) { 5857 dev_warn(adev->dev, "failed to suspend display audio\n"); 5858 pci_dev_put(p); 5859 /* TODO: abort the succeeding gpu reset? */ 5860 return -ETIMEDOUT; 5861 } 5862 } 5863 5864 pm_runtime_disable(&(p->dev)); 5865 5866 pci_dev_put(p); 5867 return 0; 5868 } 5869 5870 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5871 { 5872 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5873 5874 #if defined(CONFIG_DEBUG_FS) 5875 if (!amdgpu_sriov_vf(adev)) 5876 cancel_work(&adev->reset_work); 5877 #endif 5878 5879 if (adev->kfd.dev) 5880 cancel_work(&adev->kfd.reset_work); 5881 5882 if (amdgpu_sriov_vf(adev)) 5883 cancel_work(&adev->virt.flr_work); 5884 5885 if (con && adev->ras_enabled) 5886 cancel_work(&con->recovery_work); 5887 5888 } 5889 5890 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5891 { 5892 struct amdgpu_device *tmp_adev; 5893 int ret = 0; 5894 u32 status; 5895 5896 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5897 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5898 if (PCI_POSSIBLE_ERROR(status)) { 5899 dev_err(tmp_adev->dev, "device lost from bus!"); 5900 ret = -ENODEV; 5901 } 5902 } 5903 5904 return ret; 5905 } 5906 5907 /** 5908 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5909 * 5910 * @adev: amdgpu_device pointer 5911 * @job: the job that triggered the hang 5912 * @reset_context: amdgpu reset context pointer 5913 * 5914 * Attempt to reset the GPU if it has hung (all ASICs). 5915 * Attempt a soft reset or full reset and reinitialize the ASIC. 5916 * Returns 0 for success or an error on failure.
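 *
 * Illustrative sketch only (not taken from this file) of how a caller such as
 * a job timeout handler might fill in a reset context before invoking this
 * function; AMDGPU_RESET_SRC_JOB is an assumed source identifier here:
 *
 *   struct amdgpu_reset_context reset_context;
 *
 *   memset(&reset_context, 0, sizeof(reset_context));
 *   reset_context.method = AMD_RESET_METHOD_NONE;
 *   reset_context.reset_req_dev = adev;
 *   reset_context.src = AMDGPU_RESET_SRC_JOB;
 *   clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *   r = amdgpu_device_gpu_recover(adev, job, &reset_context);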
5917 */ 5918 5919 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5920 struct amdgpu_job *job, 5921 struct amdgpu_reset_context *reset_context) 5922 { 5923 struct list_head device_list, *device_list_handle = NULL; 5924 bool job_signaled = false; 5925 struct amdgpu_hive_info *hive = NULL; 5926 struct amdgpu_device *tmp_adev = NULL; 5927 int i, r = 0; 5928 bool need_emergency_restart = false; 5929 bool audio_suspended = false; 5930 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5931 5932 /* 5933 * If it reaches here because of hang/timeout and a RAS error is 5934 * detected at the same time, let RAS recovery take care of it. 5935 */ 5936 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 5937 !amdgpu_sriov_vf(adev) && 5938 reset_context->src != AMDGPU_RESET_SRC_RAS) { 5939 dev_dbg(adev->dev, 5940 "Gpu recovery from source: %d yielding to RAS error recovery handling", 5941 reset_context->src); 5942 return 0; 5943 } 5944 /* 5945 * Special case: RAS triggered and full reset isn't supported 5946 */ 5947 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5948 5949 /* 5950 * Flush RAM to disk so that after reboot 5951 * the user can read log and see why the system rebooted. 5952 */ 5953 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5954 amdgpu_ras_get_context(adev)->reboot) { 5955 DRM_WARN("Emergency reboot."); 5956 5957 ksys_sync_helper(); 5958 emergency_restart(); 5959 } 5960 5961 dev_info(adev->dev, "GPU %s begin!\n", 5962 need_emergency_restart ? "jobs stop":"reset"); 5963 5964 if (!amdgpu_sriov_vf(adev)) 5965 hive = amdgpu_get_xgmi_hive(adev); 5966 if (hive) 5967 mutex_lock(&hive->hive_lock); 5968 5969 reset_context->job = job; 5970 reset_context->hive = hive; 5971 /* 5972 * Build list of devices to reset. 5973 * In case we are in XGMI hive mode, resort the device list 5974 * to put adev in the 1st position. 5975 */ 5976 INIT_LIST_HEAD(&device_list); 5977 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5978 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5979 list_add_tail(&tmp_adev->reset_list, &device_list); 5980 if (adev->shutdown) 5981 tmp_adev->shutdown = true; 5982 } 5983 if (!list_is_first(&adev->reset_list, &device_list)) 5984 list_rotate_to_front(&adev->reset_list, &device_list); 5985 device_list_handle = &device_list; 5986 } else { 5987 list_add_tail(&adev->reset_list, &device_list); 5988 device_list_handle = &device_list; 5989 } 5990 5991 if (!amdgpu_sriov_vf(adev)) { 5992 r = amdgpu_device_health_check(device_list_handle); 5993 if (r) 5994 goto end_reset; 5995 } 5996 5997 /* We need to lock reset domain only once both for XGMI and single device */ 5998 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5999 reset_list); 6000 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6001 6002 /* block all schedulers and reset given job's ring */ 6003 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6004 6005 amdgpu_device_set_mp1_state(tmp_adev); 6006 6007 /* 6008 * Try to put the audio codec into suspend state 6009 * before gpu reset started. 6010 * 6011 * Due to the power domain of the graphics device 6012 * is shared with AZ power domain. Without this, 6013 * we may change the audio hardware from behind 6014 * the audio driver's back. That will trigger 6015 * some audio codec errors. 
6016 */ 6017 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6018 audio_suspended = true; 6019 6020 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6021 6022 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6023 6024 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6025 6026 /* 6027 * Mark these ASICs to be reset as untracked first 6028 * And add them back after reset completed 6029 */ 6030 amdgpu_unregister_gpu_instance(tmp_adev); 6031 6032 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6033 6034 /* disable ras on ALL IPs */ 6035 if (!need_emergency_restart && 6036 amdgpu_device_ip_need_full_reset(tmp_adev)) 6037 amdgpu_ras_suspend(tmp_adev); 6038 6039 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6040 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6041 6042 if (!amdgpu_ring_sched_ready(ring)) 6043 continue; 6044 6045 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6046 6047 if (need_emergency_restart) 6048 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6049 } 6050 atomic_inc(&tmp_adev->gpu_reset_counter); 6051 } 6052 6053 if (need_emergency_restart) 6054 goto skip_sched_resume; 6055 6056 /* 6057 * Must check guilty signal here since after this point all old 6058 * HW fences are force signaled. 6059 * 6060 * job->base holds a reference to parent fence 6061 */ 6062 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6063 job_signaled = true; 6064 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6065 goto skip_hw_reset; 6066 } 6067 6068 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6069 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6070 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6071 /*TODO Should we stop ?*/ 6072 if (r) { 6073 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6074 r, adev_to_drm(tmp_adev)->unique); 6075 tmp_adev->asic_reset_res = r; 6076 } 6077 } 6078 6079 /* Actual ASIC resets if needed.*/ 6080 /* Host driver will handle XGMI hive reset for SRIOV */ 6081 if (amdgpu_sriov_vf(adev)) { 6082 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6083 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6084 amdgpu_ras_set_fed(adev, true); 6085 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6086 } 6087 6088 r = amdgpu_device_reset_sriov(adev, reset_context); 6089 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6090 amdgpu_virt_release_full_gpu(adev, true); 6091 goto retry; 6092 } 6093 if (r) 6094 adev->asic_reset_res = r; 6095 } else { 6096 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 6097 if (r && r == -EAGAIN) 6098 goto retry; 6099 } 6100 6101 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6102 /* 6103 * Drop any pending non scheduler resets queued before reset is done. 6104 * Any reset scheduled after this point would be valid. Scheduler resets 6105 * were already dropped during drm_sched_stop and no new ones can come 6106 * in before drm_sched_start. 
6107 */ 6108 amdgpu_device_stop_pending_resets(tmp_adev); 6109 } 6110 6111 skip_hw_reset: 6112 6113 /* Post ASIC reset for all devs .*/ 6114 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6115 6116 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6117 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6118 6119 if (!amdgpu_ring_sched_ready(ring)) 6120 continue; 6121 6122 drm_sched_start(&ring->sched, 0); 6123 } 6124 6125 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6126 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6127 6128 if (tmp_adev->asic_reset_res) 6129 r = tmp_adev->asic_reset_res; 6130 6131 tmp_adev->asic_reset_res = 0; 6132 6133 if (r) { 6134 /* bad news, how to tell it to userspace ? 6135 * for ras error, we should report GPU bad status instead of 6136 * reset failure 6137 */ 6138 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6139 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6140 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6141 atomic_read(&tmp_adev->gpu_reset_counter)); 6142 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6143 } else { 6144 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6145 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6146 DRM_WARN("smart shift update failed\n"); 6147 } 6148 } 6149 6150 skip_sched_resume: 6151 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6152 /* unlock kfd: SRIOV would do it separately */ 6153 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6154 amdgpu_amdkfd_post_reset(tmp_adev); 6155 6156 /* kfd_post_reset will do nothing if kfd device is not initialized, 6157 * need to bring up kfd here if it's not be initialized before 6158 */ 6159 if (!adev->kfd.init_complete) 6160 amdgpu_amdkfd_device_init(adev); 6161 6162 if (audio_suspended) 6163 amdgpu_device_resume_display_audio(tmp_adev); 6164 6165 amdgpu_device_unset_mp1_state(tmp_adev); 6166 6167 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6168 } 6169 6170 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6171 reset_list); 6172 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6173 6174 end_reset: 6175 if (hive) { 6176 mutex_unlock(&hive->hive_lock); 6177 amdgpu_put_xgmi_hive(hive); 6178 } 6179 6180 if (r) 6181 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6182 6183 atomic_set(&adev->reset_domain->reset_res, r); 6184 6185 if (!r) 6186 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6187 6188 return r; 6189 } 6190 6191 /** 6192 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6193 * 6194 * @adev: amdgpu_device pointer 6195 * @speed: pointer to the speed of the link 6196 * @width: pointer to the width of the link 6197 * 6198 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6199 * first physical partner to an AMD dGPU. 6200 * This will exclude any virtual switches and links. 
6201 */ 6202 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6203 enum pci_bus_speed *speed, 6204 enum pcie_link_width *width) 6205 { 6206 struct pci_dev *parent = adev->pdev; 6207 6208 if (!speed || !width) 6209 return; 6210 6211 *speed = PCI_SPEED_UNKNOWN; 6212 *width = PCIE_LNK_WIDTH_UNKNOWN; 6213 6214 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6215 while ((parent = pci_upstream_bridge(parent))) { 6216 /* skip upstream/downstream switches internal to dGPU */ 6217 if (parent->vendor == PCI_VENDOR_ID_ATI) 6218 continue; 6219 *speed = pcie_get_speed_cap(parent); 6220 *width = pcie_get_width_cap(parent); 6221 break; 6222 } 6223 } else { 6224 /* use the current speeds rather than max if switching is not supported */ 6225 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6226 } 6227 } 6228 6229 /** 6230 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6231 * 6232 * @adev: amdgpu_device pointer 6233 * @speed: pointer to the speed of the link 6234 * @width: pointer to the width of the link 6235 * 6236 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6237 * AMD dGPU which may be a virtual upstream bridge. 6238 */ 6239 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6240 enum pci_bus_speed *speed, 6241 enum pcie_link_width *width) 6242 { 6243 struct pci_dev *parent = adev->pdev; 6244 6245 if (!speed || !width) 6246 return; 6247 6248 parent = pci_upstream_bridge(parent); 6249 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6250 /* use the upstream/downstream switches internal to dGPU */ 6251 *speed = pcie_get_speed_cap(parent); 6252 *width = pcie_get_width_cap(parent); 6253 while ((parent = pci_upstream_bridge(parent))) { 6254 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6255 /* use the upstream/downstream switches internal to dGPU */ 6256 *speed = pcie_get_speed_cap(parent); 6257 *width = pcie_get_width_cap(parent); 6258 } 6259 } 6260 } else { 6261 /* use the device itself */ 6262 *speed = pcie_get_speed_cap(adev->pdev); 6263 *width = pcie_get_width_cap(adev->pdev); 6264 } 6265 } 6266 6267 /** 6268 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 6269 * 6270 * @adev: amdgpu_device pointer 6271 * 6272 * Fetches and stores in the driver the PCIE capabilities (gen speed 6273 * and lanes) of the slot the device is in. Handles APUs and 6274 * virtualized environments where PCIE config space may not be available.
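 *
 * A minimal sketch of how the cached masks can be consumed afterwards
 * (illustrative only; gen3_ok and x16_ok are hypothetical locals, the CAIL_*
 * flags are the same ones set below):
 *
 *   amdgpu_device_get_pcie_info(adev);
 *   gen3_ok = !!(adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
 *   x16_ok = !!(adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X16);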
6275 */ 6276 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6277 { 6278 enum pci_bus_speed speed_cap, platform_speed_cap; 6279 enum pcie_link_width platform_link_width, link_width; 6280 6281 if (amdgpu_pcie_gen_cap) 6282 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6283 6284 if (amdgpu_pcie_lane_cap) 6285 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6286 6287 /* covers APUs as well */ 6288 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6289 if (adev->pm.pcie_gen_mask == 0) 6290 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6291 if (adev->pm.pcie_mlw_mask == 0) 6292 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6293 return; 6294 } 6295 6296 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6297 return; 6298 6299 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6300 &platform_link_width); 6301 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6302 6303 if (adev->pm.pcie_gen_mask == 0) { 6304 /* asic caps */ 6305 if (speed_cap == PCI_SPEED_UNKNOWN) { 6306 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6307 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6308 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6309 } else { 6310 if (speed_cap == PCIE_SPEED_32_0GT) 6311 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6312 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6313 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6314 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6315 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6316 else if (speed_cap == PCIE_SPEED_16_0GT) 6317 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6318 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6319 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6320 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6321 else if (speed_cap == PCIE_SPEED_8_0GT) 6322 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6323 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6324 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6325 else if (speed_cap == PCIE_SPEED_5_0GT) 6326 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6327 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6328 else 6329 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6330 } 6331 /* platform caps */ 6332 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6333 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6334 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6335 } else { 6336 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6337 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6338 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6339 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6340 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6341 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6342 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6343 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6344 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6345 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6346 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6347 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6348 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6349 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6350 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6351 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6352 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6353 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6354 else 6355 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6356 6357 } 6358 } 6359 if (adev->pm.pcie_mlw_mask == 0) { 6360 /* asic caps */ 6361 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6362 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6363 } else { 6364 switch (link_width) { 6365 case PCIE_LNK_X32: 6366 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6367 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6368 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6369 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6370 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6371 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6372 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6373 break; 6374 case PCIE_LNK_X16: 6375 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6376 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6377 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6378 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6379 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6380 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6381 break; 6382 case PCIE_LNK_X12: 6383 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6384 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6385 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6386 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6387 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6388 break; 6389 case PCIE_LNK_X8: 6390 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6391 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6392 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6393 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6394 break; 6395 case PCIE_LNK_X4: 6396 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6397 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6398 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6399 break; 6400 case PCIE_LNK_X2: 6401 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6402 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6403 break; 6404 case PCIE_LNK_X1: 6405 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6406 break; 6407 default: 6408 break; 6409 } 6410 } 6411 /* platform caps */ 6412 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6413 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6414 } else { 6415 switch (platform_link_width) { 6416 case PCIE_LNK_X32: 6417 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6418 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6419 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6420 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6421 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6422 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6423 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6424 break; 6425 case PCIE_LNK_X16: 6426 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6427 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6428 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6429 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6430 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6431 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6432 break; 6433 case PCIE_LNK_X12: 6434 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6435 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6436 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6437 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6438 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6439 break; 6440 case PCIE_LNK_X8: 6441 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6442 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6443 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6444 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6445 break; 6446 case PCIE_LNK_X4: 6447 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6448 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6449 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6450 break; 6451 case PCIE_LNK_X2: 6452 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6453 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6454 break; 6455 case PCIE_LNK_X1: 6456 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6457 break; 6458 
default: 6459 break; 6460 } 6461 } 6462 } 6463 } 6464 6465 /** 6466 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6467 * 6468 * @adev: amdgpu_device pointer 6469 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6470 * 6471 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6472 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6473 * @peer_adev. 6474 */ 6475 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6476 struct amdgpu_device *peer_adev) 6477 { 6478 #ifdef CONFIG_HSA_AMD_P2P 6479 bool p2p_access = 6480 !adev->gmc.xgmi.connected_to_cpu && 6481 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6482 if (!p2p_access) 6483 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6484 pci_name(peer_adev->pdev)); 6485 6486 bool is_large_bar = adev->gmc.visible_vram_size && 6487 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6488 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6489 6490 if (!p2p_addressable) { 6491 uint64_t address_mask = peer_adev->dev->dma_mask ? 6492 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6493 resource_size_t aper_limit = 6494 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6495 6496 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6497 aper_limit & address_mask); 6498 } 6499 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6500 #else 6501 return false; 6502 #endif 6503 } 6504 6505 int amdgpu_device_baco_enter(struct drm_device *dev) 6506 { 6507 struct amdgpu_device *adev = drm_to_adev(dev); 6508 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6509 6510 if (!amdgpu_device_supports_baco(dev)) 6511 return -ENOTSUPP; 6512 6513 if (ras && adev->ras_enabled && 6514 adev->nbio.funcs->enable_doorbell_interrupt) 6515 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6516 6517 return amdgpu_dpm_baco_enter(adev); 6518 } 6519 6520 int amdgpu_device_baco_exit(struct drm_device *dev) 6521 { 6522 struct amdgpu_device *adev = drm_to_adev(dev); 6523 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6524 int ret = 0; 6525 6526 if (!amdgpu_device_supports_baco(dev)) 6527 return -ENOTSUPP; 6528 6529 ret = amdgpu_dpm_baco_exit(adev); 6530 if (ret) 6531 return ret; 6532 6533 if (ras && adev->ras_enabled && 6534 adev->nbio.funcs->enable_doorbell_interrupt) 6535 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6536 6537 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6538 adev->nbio.funcs->clear_doorbell_interrupt) 6539 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6540 6541 return 0; 6542 } 6543 6544 /** 6545 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6546 * @pdev: PCI device struct 6547 * @state: PCI channel state 6548 * 6549 * Description: Called when a PCI error is detected. 6550 * 6551 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
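 *
 * For context, this callback (together with the mmio_enabled, slot_reset and
 * resume handlers below) is hooked into PCI AER recovery through a
 * &struct pci_error_handlers in the PCI driver; a sketch of that wiring,
 * assuming the handler table lives in amdgpu_drv.c:
 *
 *   static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled = amdgpu_pci_mmio_enabled,
 *           .slot_reset = amdgpu_pci_slot_reset,
 *           .resume = amdgpu_pci_resume,
 *   };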
6552 */ 6553 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6554 { 6555 struct drm_device *dev = pci_get_drvdata(pdev); 6556 struct amdgpu_device *adev = drm_to_adev(dev); 6557 int i; 6558 6559 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6560 6561 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6562 DRM_WARN("No support for XGMI hive yet..."); 6563 return PCI_ERS_RESULT_DISCONNECT; 6564 } 6565 6566 adev->pci_channel_state = state; 6567 6568 switch (state) { 6569 case pci_channel_io_normal: 6570 return PCI_ERS_RESULT_CAN_RECOVER; 6571 /* Fatal error, prepare for slot reset */ 6572 case pci_channel_io_frozen: 6573 /* 6574 * Locking adev->reset_domain->sem will prevent any external access 6575 * to GPU during PCI error recovery 6576 */ 6577 amdgpu_device_lock_reset_domain(adev->reset_domain); 6578 amdgpu_device_set_mp1_state(adev); 6579 6580 /* 6581 * Block any work scheduling as we do for regular GPU reset 6582 * for the duration of the recovery 6583 */ 6584 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6585 struct amdgpu_ring *ring = adev->rings[i]; 6586 6587 if (!amdgpu_ring_sched_ready(ring)) 6588 continue; 6589 6590 drm_sched_stop(&ring->sched, NULL); 6591 } 6592 atomic_inc(&adev->gpu_reset_counter); 6593 return PCI_ERS_RESULT_NEED_RESET; 6594 case pci_channel_io_perm_failure: 6595 /* Permanent error, prepare for device removal */ 6596 return PCI_ERS_RESULT_DISCONNECT; 6597 } 6598 6599 return PCI_ERS_RESULT_NEED_RESET; 6600 } 6601 6602 /** 6603 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6604 * @pdev: pointer to PCI device 6605 */ 6606 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6607 { 6608 6609 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6610 6611 /* TODO - dump whatever for debugging purposes */ 6612 6613 /* This is called only if amdgpu_pci_error_detected returns 6614 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6615 * works, no need to reset slot. 6616 */ 6617 6618 return PCI_ERS_RESULT_RECOVERED; 6619 } 6620 6621 /** 6622 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6623 * @pdev: PCI device struct 6624 * 6625 * Description: This routine is called by the pci error recovery 6626 * code after the PCI slot has been reset, just before we 6627 * should resume normal operations.
6628 */ 6629 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6630 { 6631 struct drm_device *dev = pci_get_drvdata(pdev); 6632 struct amdgpu_device *adev = drm_to_adev(dev); 6633 int r, i; 6634 struct amdgpu_reset_context reset_context; 6635 u32 memsize; 6636 struct list_head device_list; 6637 6638 /* PCI error slot reset should be skipped during RAS recovery */ 6639 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6640 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6641 amdgpu_ras_in_recovery(adev)) 6642 return PCI_ERS_RESULT_RECOVERED; 6643 6644 DRM_INFO("PCI error: slot reset callback!!\n"); 6645 6646 memset(&reset_context, 0, sizeof(reset_context)); 6647 6648 INIT_LIST_HEAD(&device_list); 6649 list_add_tail(&adev->reset_list, &device_list); 6650 6651 /* wait for asic to come out of reset */ 6652 msleep(500); 6653 6654 /* Restore PCI config space */ 6655 amdgpu_device_load_pci_state(pdev); 6656 6657 /* confirm ASIC came out of reset */ 6658 for (i = 0; i < adev->usec_timeout; i++) { 6659 memsize = amdgpu_asic_get_config_memsize(adev); 6660 6661 if (memsize != 0xffffffff) 6662 break; 6663 udelay(1); 6664 } 6665 if (memsize == 0xffffffff) { 6666 r = -ETIME; 6667 goto out; 6668 } 6669 6670 reset_context.method = AMD_RESET_METHOD_NONE; 6671 reset_context.reset_req_dev = adev; 6672 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6673 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6674 6675 adev->no_hw_access = true; 6676 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6677 adev->no_hw_access = false; 6678 if (r) 6679 goto out; 6680 6681 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6682 6683 out: 6684 if (!r) { 6685 if (amdgpu_device_cache_pci_state(adev->pdev)) 6686 pci_restore_state(adev->pdev); 6687 6688 DRM_INFO("PCIe error recovery succeeded\n"); 6689 } else { 6690 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6691 amdgpu_device_unset_mp1_state(adev); 6692 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6693 } 6694 6695 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6696 } 6697 6698 /** 6699 * amdgpu_pci_resume() - resume normal ops after PCI reset 6700 * @pdev: pointer to PCI device 6701 * 6702 * Called when the error recovery driver tells us that it's 6703 * OK to resume normal operation.
6704 */ 6705 void amdgpu_pci_resume(struct pci_dev *pdev) 6706 { 6707 struct drm_device *dev = pci_get_drvdata(pdev); 6708 struct amdgpu_device *adev = drm_to_adev(dev); 6709 int i; 6710 6711 6712 DRM_INFO("PCI error: resume callback!!\n"); 6713 6714 /* Only continue execution for the case of pci_channel_io_frozen */ 6715 if (adev->pci_channel_state != pci_channel_io_frozen) 6716 return; 6717 6718 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6719 struct amdgpu_ring *ring = adev->rings[i]; 6720 6721 if (!amdgpu_ring_sched_ready(ring)) 6722 continue; 6723 6724 drm_sched_start(&ring->sched, 0); 6725 } 6726 6727 amdgpu_device_unset_mp1_state(adev); 6728 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6729 } 6730 6731 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6732 { 6733 struct drm_device *dev = pci_get_drvdata(pdev); 6734 struct amdgpu_device *adev = drm_to_adev(dev); 6735 int r; 6736 6737 if (amdgpu_sriov_vf(adev)) 6738 return false; 6739 6740 r = pci_save_state(pdev); 6741 if (!r) { 6742 kfree(adev->pci_state); 6743 6744 adev->pci_state = pci_store_saved_state(pdev); 6745 6746 if (!adev->pci_state) { 6747 DRM_ERROR("Failed to store PCI saved state"); 6748 return false; 6749 } 6750 } else { 6751 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6752 return false; 6753 } 6754 6755 return true; 6756 } 6757 6758 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6759 { 6760 struct drm_device *dev = pci_get_drvdata(pdev); 6761 struct amdgpu_device *adev = drm_to_adev(dev); 6762 int r; 6763 6764 if (!adev->pci_state) 6765 return false; 6766 6767 r = pci_load_saved_state(pdev, adev->pci_state); 6768 6769 if (!r) { 6770 pci_restore_state(pdev); 6771 } else { 6772 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6773 return false; 6774 } 6775 6776 return true; 6777 } 6778 6779 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6780 struct amdgpu_ring *ring) 6781 { 6782 #ifdef CONFIG_X86_64 6783 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6784 return; 6785 #endif 6786 if (adev->gmc.xgmi.connected_to_cpu) 6787 return; 6788 6789 if (ring && ring->funcs->emit_hdp_flush) 6790 amdgpu_ring_emit_hdp_flush(ring); 6791 else 6792 amdgpu_asic_flush_hdp(adev, ring); 6793 } 6794 6795 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6796 struct amdgpu_ring *ring) 6797 { 6798 #ifdef CONFIG_X86_64 6799 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6800 return; 6801 #endif 6802 if (adev->gmc.xgmi.connected_to_cpu) 6803 return; 6804 6805 amdgpu_asic_invalidate_hdp(adev, ring); 6806 } 6807 6808 int amdgpu_in_reset(struct amdgpu_device *adev) 6809 { 6810 return atomic_read(&adev->reset_domain->in_gpu_reset); 6811 } 6812 6813 /** 6814 * amdgpu_device_halt() - bring hardware to some kind of halt state 6815 * 6816 * @adev: amdgpu_device pointer 6817 * 6818 * Bring hardware to some kind of halt state so that no one can touch it 6819 * any more. It will help to maintain error context when error occurred. 6820 * Compare to a simple hang, the system will keep stable at least for SSH 6821 * access. Then it should be trivial to inspect the hardware state and 6822 * see what's going on. Implemented as following: 6823 * 6824 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6825 * clears all CPU mappings to device, disallows remappings through page faults 6826 * 2. amdgpu_irq_disable_all() disables all interrupts 6827 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6828 * 4. 
set adev->no_hw_access to avoid potential crashes after step 5 6829 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 6830 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6831 * flush any in-flight DMA operations 6832 */ 6833 void amdgpu_device_halt(struct amdgpu_device *adev) 6834 { 6835 struct pci_dev *pdev = adev->pdev; 6836 struct drm_device *ddev = adev_to_drm(adev); 6837 6838 amdgpu_xcp_dev_unplug(adev); 6839 drm_dev_unplug(ddev); 6840 6841 amdgpu_irq_disable_all(adev); 6842 6843 amdgpu_fence_driver_hw_fini(adev); 6844 6845 adev->no_hw_access = true; 6846 6847 amdgpu_device_unmap_mmio(adev); 6848 6849 pci_disable_device(pdev); 6850 pci_wait_for_pending_transaction(pdev); 6851 } 6852 6853 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6854 u32 reg) 6855 { 6856 unsigned long flags, address, data; 6857 u32 r; 6858 6859 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6860 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6861 6862 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6863 WREG32(address, reg * 4); 6864 (void)RREG32(address); 6865 r = RREG32(data); 6866 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6867 return r; 6868 } 6869 6870 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6871 u32 reg, u32 v) 6872 { 6873 unsigned long flags, address, data; 6874 6875 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6876 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6877 6878 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6879 WREG32(address, reg * 4); 6880 (void)RREG32(address); 6881 WREG32(data, v); 6882 (void)RREG32(data); 6883 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6884 } 6885 6886 /** 6887 * amdgpu_device_get_gang - return a reference to the current gang 6888 * @adev: amdgpu_device pointer 6889 * 6890 * Returns: A new reference to the current gang leader. 6891 */ 6892 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6893 { 6894 struct dma_fence *fence; 6895 6896 rcu_read_lock(); 6897 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6898 rcu_read_unlock(); 6899 return fence; 6900 } 6901 6902 /** 6903 * amdgpu_device_switch_gang - switch to a new gang 6904 * @adev: amdgpu_device pointer 6905 * @gang: the gang to switch to 6906 * 6907 * Try to switch to a new gang. 6908 * Returns: NULL if we switched to the new gang or a reference to the current 6909 * gang leader.
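 *
 * Minimal usage sketch (illustrative, not from this file): a caller retries
 * the switch until it succeeds, waiting for the old gang leader in between:
 *
 *   struct dma_fence *old;
 *
 *   while ((old = amdgpu_device_switch_gang(adev, gang))) {
 *           dma_fence_wait(old, false);
 *           dma_fence_put(old);
 *   }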
6910 */ 6911 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6912 struct dma_fence *gang) 6913 { 6914 struct dma_fence *old = NULL; 6915 6916 do { 6917 dma_fence_put(old); 6918 old = amdgpu_device_get_gang(adev); 6919 if (old == gang) 6920 break; 6921 6922 if (!dma_fence_is_signaled(old)) 6923 return old; 6924 6925 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6926 old, gang) != old); 6927 6928 dma_fence_put(old); 6929 return NULL; 6930 } 6931 6932 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6933 { 6934 switch (adev->asic_type) { 6935 #ifdef CONFIG_DRM_AMDGPU_SI 6936 case CHIP_HAINAN: 6937 #endif 6938 case CHIP_TOPAZ: 6939 /* chips with no display hardware */ 6940 return false; 6941 #ifdef CONFIG_DRM_AMDGPU_SI 6942 case CHIP_TAHITI: 6943 case CHIP_PITCAIRN: 6944 case CHIP_VERDE: 6945 case CHIP_OLAND: 6946 #endif 6947 #ifdef CONFIG_DRM_AMDGPU_CIK 6948 case CHIP_BONAIRE: 6949 case CHIP_HAWAII: 6950 case CHIP_KAVERI: 6951 case CHIP_KABINI: 6952 case CHIP_MULLINS: 6953 #endif 6954 case CHIP_TONGA: 6955 case CHIP_FIJI: 6956 case CHIP_POLARIS10: 6957 case CHIP_POLARIS11: 6958 case CHIP_POLARIS12: 6959 case CHIP_VEGAM: 6960 case CHIP_CARRIZO: 6961 case CHIP_STONEY: 6962 /* chips with display hardware */ 6963 return true; 6964 default: 6965 /* IP discovery */ 6966 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6967 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6968 return false; 6969 return true; 6970 } 6971 } 6972 6973 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6974 uint32_t inst, uint32_t reg_addr, char reg_name[], 6975 uint32_t expected_value, uint32_t mask) 6976 { 6977 uint32_t ret = 0; 6978 uint32_t old_ = 0; 6979 uint32_t tmp_ = RREG32(reg_addr); 6980 uint32_t loop = adev->usec_timeout; 6981 6982 while ((tmp_ & (mask)) != (expected_value)) { 6983 if (old_ != tmp_) { 6984 loop = adev->usec_timeout; 6985 old_ = tmp_; 6986 } else 6987 udelay(1); 6988 tmp_ = RREG32(reg_addr); 6989 loop--; 6990 if (!loop) { 6991 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 6992 inst, reg_name, (uint32_t)expected_value, 6993 (uint32_t)(tmp_ & (mask))); 6994 ret = -ETIMEDOUT; 6995 break; 6996 } 6997 } 6998 return ret; 6999 } 7000 7001 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7002 { 7003 ssize_t size = 0; 7004 7005 if (!ring || !ring->adev) 7006 return size; 7007 7008 if (amdgpu_device_should_recover_gpu(ring->adev)) 7009 size |= AMDGPU_RESET_TYPE_FULL; 7010 7011 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7012 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7013 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7014 7015 return size; 7016 } 7017 7018 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7019 { 7020 ssize_t size = 0; 7021 7022 if (supported_reset == 0) { 7023 size += sysfs_emit_at(buf, size, "unsupported"); 7024 size += sysfs_emit_at(buf, size, "\n"); 7025 return size; 7026 7027 } 7028 7029 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7030 size += sysfs_emit_at(buf, size, "soft "); 7031 7032 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7033 size += sysfs_emit_at(buf, size, "queue "); 7034 7035 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7036 size += sysfs_emit_at(buf, size, "pipe "); 7037 7038 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7039 size += sysfs_emit_at(buf, size, "full "); 7040 7041 size += sysfs_emit_at(buf, size, "\n"); 7042 return size; 7043 } 7044