1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/pci-p2pdma.h>
36 #include <linux/apple-gmux.h>
37
38 #include <drm/drm_aperture.h>
39 #include <drm/drm_atomic_helper.h>
40 #include <drm/drm_crtc_helper.h>
41 #include <drm/drm_fb_helper.h>
42 #include <drm/drm_probe_helper.h>
43 #include <drm/amdgpu_drm.h>
44 #include <linux/device.h>
45 #include <linux/vgaarb.h>
46 #include <linux/vga_switcheroo.h>
47 #include <linux/efi.h>
48 #include "amdgpu.h"
49 #include "amdgpu_trace.h"
50 #include "amdgpu_i2c.h"
51 #include "atom.h"
52 #include "amdgpu_atombios.h"
53 #include "amdgpu_atomfirmware.h"
54 #include "amd_pcie.h"
55 #ifdef CONFIG_DRM_AMDGPU_SI
56 #include "si.h"
57 #endif
58 #ifdef CONFIG_DRM_AMDGPU_CIK
59 #include "cik.h"
60 #endif
61 #include "vi.h"
62 #include "soc15.h"
63 #include "nv.h"
64 #include "bif/bif_4_1_d.h"
65 #include <linux/firmware.h>
66 #include "amdgpu_vf_error.h"
67
68 #include "amdgpu_amdkfd.h"
69 #include "amdgpu_pm.h"
70
71 #include "amdgpu_xgmi.h"
72 #include "amdgpu_ras.h"
73 #include "amdgpu_pmu.h"
74 #include "amdgpu_fru_eeprom.h"
75 #include "amdgpu_reset.h"
76 #include "amdgpu_virt.h"
77 #include "amdgpu_dev_coredump.h"
78
79 #include <linux/suspend.h>
80 #include <drm/task_barrier.h>
81 #include <linux/pm_runtime.h>
82
83 #include <drm/drm_drv.h>
84
85 #if IS_ENABLED(CONFIG_X86) && defined(__linux__)
86 #include <asm/intel-family.h>
87 #endif
88
/* gpu_info firmware images this module declares via MODULE_FIRMWARE(). */
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

/* NOTE(review): presumably a resume timeout in milliseconds - confirm at the use site. */
#define AMDGPU_RESUME_MS 2000
/* NOTE(review): presumably the maximum number of reset retries - confirm at the use site. */
#define AMDGPU_MAX_RETRY_LIMIT 2
/* True when error code @r is -EBUSY, -ETIMEDOUT or -EINVAL. */
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
/*
 * Dword indices (byte offset >> 2) of the PCIE index, index-high and data
 * registers.  amdgpu_device_indirect_rreg_ext() falls back to these when
 * adev->nbio.funcs is not set.
 */
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

/* Forward declaration; the definition is not in this part of the file. */
static const struct drm_driver amdgpu_kms_driver;
105
/*
 * Human readable ASIC names.
 * NOTE(review): presumably indexed by the ASIC type enumeration (the CHIP_*
 * values), so the order must match that enum - confirm against its definition.
 */
const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/* Forward declaration; the definition is not in this part of the file. */
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
148
/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */
157
amdgpu_device_get_pcie_replay_count(struct device * dev,struct device_attribute * attr,char * buf)158 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
159 struct device_attribute *attr, char *buf)
160 {
161 struct drm_device *ddev = dev_get_drvdata(dev);
162 struct amdgpu_device *adev = drm_to_adev(ddev);
163 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
164
165 return sysfs_emit(buf, "%llu\n", cnt);
166 }
167
168 static DEVICE_ATTR(pcie_replay_count, 0444,
169 amdgpu_device_get_pcie_replay_count, NULL);
170
171 #ifdef __linux__
172
amdgpu_sysfs_reg_state_get(struct file * f,struct kobject * kobj,struct bin_attribute * attr,char * buf,loff_t ppos,size_t count)173 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
174 struct bin_attribute *attr, char *buf,
175 loff_t ppos, size_t count)
176 {
177 struct device *dev = kobj_to_dev(kobj);
178 struct drm_device *ddev = dev_get_drvdata(dev);
179 struct amdgpu_device *adev = drm_to_adev(ddev);
180 ssize_t bytes_read;
181
182 switch (ppos) {
183 case AMDGPU_SYS_REG_STATE_XGMI:
184 bytes_read = amdgpu_asic_get_reg_state(
185 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
186 break;
187 case AMDGPU_SYS_REG_STATE_WAFL:
188 bytes_read = amdgpu_asic_get_reg_state(
189 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
190 break;
191 case AMDGPU_SYS_REG_STATE_PCIE:
192 bytes_read = amdgpu_asic_get_reg_state(
193 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
194 break;
195 case AMDGPU_SYS_REG_STATE_USR:
196 bytes_read = amdgpu_asic_get_reg_state(
197 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
198 break;
199 case AMDGPU_SYS_REG_STATE_USR_1:
200 bytes_read = amdgpu_asic_get_reg_state(
201 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
202 break;
203 default:
204 return -EINVAL;
205 }
206
207 return bytes_read;
208 }
209
210 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
211 AMDGPU_SYS_REG_STATE_END);
212
213 #endif /* __linux__ */
214
amdgpu_reg_state_sysfs_init(struct amdgpu_device * adev)215 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
216 {
217 int ret;
218
219 if (!amdgpu_asic_get_reg_state_supported(adev))
220 return 0;
221
222 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
223
224 return ret;
225 }
226
amdgpu_reg_state_sysfs_fini(struct amdgpu_device * adev)227 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
228 {
229 if (!amdgpu_asic_get_reg_state_supported(adev))
230 return;
231 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
232 }
233
234 /**
235 * DOC: board_info
236 *
237 * The amdgpu driver provides a sysfs API for giving board related information.
238 * It provides the form factor information in the format
239 *
240 * type : form factor
241 *
242 * Possible form factor values
243 *
244 * - "cem" - PCIE CEM card
245 * - "oam" - Open Compute Accelerator Module
246 * - "unknown" - Not known
247 *
248 */
249
amdgpu_device_get_board_info(struct device * dev,struct device_attribute * attr,char * buf)250 static ssize_t amdgpu_device_get_board_info(struct device *dev,
251 struct device_attribute *attr,
252 char *buf)
253 {
254 struct drm_device *ddev = dev_get_drvdata(dev);
255 struct amdgpu_device *adev = drm_to_adev(ddev);
256 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
257 const char *pkg;
258
259 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
260 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
261
262 switch (pkg_type) {
263 case AMDGPU_PKG_TYPE_CEM:
264 pkg = "cem";
265 break;
266 case AMDGPU_PKG_TYPE_OAM:
267 pkg = "oam";
268 break;
269 default:
270 pkg = "unknown";
271 break;
272 }
273
274 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
275 }
276
277 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
278
279 static struct attribute *amdgpu_board_attrs[] = {
280 &dev_attr_board_info.attr,
281 NULL,
282 };
283
284 #ifdef notyet
amdgpu_board_attrs_is_visible(struct kobject * kobj,struct attribute * attr,int n)285 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
286 struct attribute *attr, int n)
287 {
288 struct device *dev = kobj_to_dev(kobj);
289 struct drm_device *ddev = dev_get_drvdata(dev);
290 struct amdgpu_device *adev = drm_to_adev(ddev);
291
292 if (adev->flags & AMD_IS_APU)
293 return 0;
294
295 return attr->mode;
296 }
297 #endif
298
/*
 * Attribute group wrapping amdgpu_board_attrs.  The is_visible callback is
 * only wired up when "notyet" is defined; otherwise no visibility filter is
 * installed.
 */
static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
#ifdef notyet
	.is_visible = amdgpu_board_attrs_is_visible
#endif
};
305
306 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
307
308
309 /**
310 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
311 *
312 * @dev: drm_device pointer
313 *
314 * Returns true if the device is a dGPU with ATPX power control,
315 * otherwise return false.
316 */
amdgpu_device_supports_px(struct drm_device * dev)317 bool amdgpu_device_supports_px(struct drm_device *dev)
318 {
319 struct amdgpu_device *adev = drm_to_adev(dev);
320
321 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
322 return true;
323 return false;
324 }
325
326 /**
327 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
328 *
329 * @dev: drm_device pointer
330 *
331 * Returns true if the device is a dGPU with ACPI power control,
332 * otherwise return false.
333 */
amdgpu_device_supports_boco(struct drm_device * dev)334 bool amdgpu_device_supports_boco(struct drm_device *dev)
335 {
336 struct amdgpu_device *adev = drm_to_adev(dev);
337
338 if (adev->has_pr3 ||
339 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
340 return true;
341 return false;
342 }
343
/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Thin wrapper that forwards to amdgpu_asic_supports_baco().
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported);
 * otherwise 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev;

	adev = drm_to_adev(dev);
	return amdgpu_asic_supports_baco(adev);
}
360
amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device * adev)361 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
362 {
363 struct drm_device *dev;
364 int bamaco_support;
365
366 dev = adev_to_drm(adev);
367
368 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
369 bamaco_support = amdgpu_device_supports_baco(dev);
370
371 switch (amdgpu_runtime_pm) {
372 case 2:
373 if (bamaco_support & MACO_SUPPORT) {
374 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
375 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
376 } else if (bamaco_support == BACO_SUPPORT) {
377 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
378 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
379 }
380 break;
381 case 1:
382 if (bamaco_support & BACO_SUPPORT) {
383 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
384 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
385 }
386 break;
387 case -1:
388 case -2:
389 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
390 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
391 dev_info(adev->dev, "Using ATPX for runtime pm\n");
392 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
393 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
394 dev_info(adev->dev, "Using BOCO for runtime pm\n");
395 } else {
396 if (!bamaco_support)
397 goto no_runtime_pm;
398
399 switch (adev->asic_type) {
400 case CHIP_VEGA20:
401 case CHIP_ARCTURUS:
402 /* BACO are not supported on vega20 and arctrus */
403 break;
404 case CHIP_VEGA10:
405 /* enable BACO as runpm mode if noretry=0 */
406 if (!adev->gmc.noretry)
407 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
408 break;
409 default:
410 /* enable BACO as runpm mode on CI+ */
411 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
412 break;
413 }
414
415 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
416 if (bamaco_support & MACO_SUPPORT) {
417 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
418 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
419 } else {
420 dev_info(adev->dev, "Using BACO for runtime pm\n");
421 }
422 }
423 }
424 break;
425 case 0:
426 dev_info(adev->dev, "runtime pm is manually disabled\n");
427 break;
428 default:
429 break;
430 }
431
432 no_runtime_pm:
433 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
434 dev_info(adev->dev, "Runtime PM not available\n");
435 }
436 /**
437 * amdgpu_device_supports_smart_shift - Is the device dGPU with
438 * smart shift support
439 *
440 * @dev: drm_device pointer
441 *
442 * Returns true if the device is a dGPU with Smart Shift support,
443 * otherwise returns false.
444 */
amdgpu_device_supports_smart_shift(struct drm_device * dev)445 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
446 {
447 return (amdgpu_device_supports_boco(dev) &&
448 amdgpu_acpi_is_power_shift_control_supported());
449 }
450
451 /*
452 * VRAM access helper functions
453 */
454
455 /**
456 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
457 *
458 * @adev: amdgpu_device pointer
459 * @pos: offset of the buffer in vram
460 * @buf: virtual address of the buffer in system memory
461 * @size: read/write size, sizeof(@buf) must > @size
462 * @write: true - write to vram, otherwise - read from vram
463 */
amdgpu_device_mm_access(struct amdgpu_device * adev,loff_t pos,void * buf,size_t size,bool write)464 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
465 void *buf, size_t size, bool write)
466 {
467 unsigned long flags;
468 uint32_t hi = ~0, tmp = 0;
469 uint32_t *data = buf;
470 uint64_t last;
471 int idx;
472
473 if (!drm_dev_enter(adev_to_drm(adev), &idx))
474 return;
475
476 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
477
478 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
479 for (last = pos + size; pos < last; pos += 4) {
480 tmp = pos >> 31;
481
482 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
483 if (tmp != hi) {
484 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
485 hi = tmp;
486 }
487 if (write)
488 WREG32_NO_KIQ(mmMM_DATA, *data++);
489 else
490 *data++ = RREG32_NO_KIQ(mmMM_DATA);
491 }
492
493 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
494 drm_dev_exit(idx);
495 }
496
497 /**
498 * amdgpu_device_aper_access - access vram by vram aperature
499 *
500 * @adev: amdgpu_device pointer
501 * @pos: offset of the buffer in vram
502 * @buf: virtual address of the buffer in system memory
503 * @size: read/write size, sizeof(@buf) must > @size
504 * @write: true - write to vram, otherwise - read from vram
505 *
506 * The return value means how many bytes have been transferred.
507 */
amdgpu_device_aper_access(struct amdgpu_device * adev,loff_t pos,void * buf,size_t size,bool write)508 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
509 void *buf, size_t size, bool write)
510 {
511 #ifdef CONFIG_64BIT
512 void __iomem *addr;
513 size_t count = 0;
514 uint64_t last;
515
516 if (!adev->mman.aper_base_kaddr)
517 return 0;
518
519 last = min(pos + size, adev->gmc.visible_vram_size);
520 if (last > pos) {
521 addr = adev->mman.aper_base_kaddr + pos;
522 count = last - pos;
523
524 if (write) {
525 memcpy_toio(addr, buf, count);
526 /* Make sure HDP write cache flush happens without any reordering
527 * after the system memory contents are sent over PCIe device
528 */
529 mb();
530 amdgpu_device_flush_hdp(adev, NULL);
531 } else {
532 amdgpu_device_invalidate_hdp(adev, NULL);
533 /* Make sure HDP read cache is invalidated before issuing a read
534 * to the PCIe device
535 */
536 mb();
537 memcpy_fromio(buf, addr, count);
538 }
539
540 }
541
542 return count;
543 #else
544 return 0;
545 #endif
546 }
547
548 /**
549 * amdgpu_device_vram_access - read/write a buffer in vram
550 *
551 * @adev: amdgpu_device pointer
552 * @pos: offset of the buffer in vram
553 * @buf: virtual address of the buffer in system memory
554 * @size: read/write size, sizeof(@buf) must > @size
555 * @write: true - write to vram, otherwise - read from vram
556 */
amdgpu_device_vram_access(struct amdgpu_device * adev,loff_t pos,void * buf,size_t size,bool write)557 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
558 void *buf, size_t size, bool write)
559 {
560 size_t count;
561
562 /* try to using vram apreature to access vram first */
563 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
564 size -= count;
565 if (size) {
566 /* using MM to access rest vram */
567 pos += count;
568 buf += count;
569 amdgpu_device_mm_access(adev, pos, buf, size, write);
570 }
571 }
572
573 /*
574 * register access helper functions.
575 */
576
577 /* Check if hw access should be skipped because of hotplug or device error */
amdgpu_device_skip_hw_access(struct amdgpu_device * adev)578 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
579 {
580 if (adev->no_hw_access)
581 return true;
582
583 #ifdef CONFIG_LOCKDEP
584 /*
585 * This is a bit complicated to understand, so worth a comment. What we assert
586 * here is that the GPU reset is not running on another thread in parallel.
587 *
588 * For this we trylock the read side of the reset semaphore, if that succeeds
589 * we know that the reset is not running in paralell.
590 *
591 * If the trylock fails we assert that we are either already holding the read
592 * side of the lock or are the reset thread itself and hold the write side of
593 * the lock.
594 */
595 if (in_task()) {
596 if (down_read_trylock(&adev->reset_domain->sem))
597 up_read(&adev->reset_domain->sem);
598 else
599 lockdep_assert_held(&adev->reset_domain->sem);
600 }
601 #endif
602 return false;
603 }
604
605 /**
606 * amdgpu_device_rreg - read a memory mapped IO or indirect register
607 *
608 * @adev: amdgpu_device pointer
609 * @reg: dword aligned register offset
610 * @acc_flags: access flags which require special behavior
611 *
612 * Returns the 32 bit value from the offset specified.
613 */
amdgpu_device_rreg(struct amdgpu_device * adev,uint32_t reg,uint32_t acc_flags)614 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
615 uint32_t reg, uint32_t acc_flags)
616 {
617 uint32_t ret;
618
619 if (amdgpu_device_skip_hw_access(adev))
620 return 0;
621
622 if ((reg * 4) < adev->rmmio_size) {
623 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
624 amdgpu_sriov_runtime(adev) &&
625 down_read_trylock(&adev->reset_domain->sem)) {
626 ret = amdgpu_kiq_rreg(adev, reg, 0);
627 up_read(&adev->reset_domain->sem);
628 } else {
629 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
630 }
631 } else {
632 ret = adev->pcie_rreg(adev, reg * 4);
633 }
634
635 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
636
637 return ret;
638 }
639
/*
 * Byte-wide MMIO register read helper.
 * @offset: byte offset from the start of the MMIO region
 */
644
645 /**
646 * amdgpu_mm_rreg8 - read a memory mapped IO register
647 *
648 * @adev: amdgpu_device pointer
649 * @offset: byte aligned register offset
650 *
651 * Returns the 8 bit value from the offset specified.
652 */
amdgpu_mm_rreg8(struct amdgpu_device * adev,uint32_t offset)653 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
654 {
655 if (amdgpu_device_skip_hw_access(adev))
656 return 0;
657
658 if (offset < adev->rmmio_size)
659 return (readb(adev->rmmio + offset));
660 BUG();
661 }
662
663
664 /**
665 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
666 *
667 * @adev: amdgpu_device pointer
668 * @reg: dword aligned register offset
669 * @acc_flags: access flags which require special behavior
670 * @xcc_id: xcc accelerated compute core id
671 *
672 * Returns the 32 bit value from the offset specified.
673 */
amdgpu_device_xcc_rreg(struct amdgpu_device * adev,uint32_t reg,uint32_t acc_flags,uint32_t xcc_id)674 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
675 uint32_t reg, uint32_t acc_flags,
676 uint32_t xcc_id)
677 {
678 uint32_t ret, rlcg_flag;
679
680 if (amdgpu_device_skip_hw_access(adev))
681 return 0;
682
683 if ((reg * 4) < adev->rmmio_size) {
684 if (amdgpu_sriov_vf(adev) &&
685 !amdgpu_sriov_runtime(adev) &&
686 adev->gfx.rlc.rlcg_reg_access_supported &&
687 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
688 GC_HWIP, false,
689 &rlcg_flag)) {
690 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
691 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
692 amdgpu_sriov_runtime(adev) &&
693 down_read_trylock(&adev->reset_domain->sem)) {
694 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
695 up_read(&adev->reset_domain->sem);
696 } else {
697 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
698 }
699 } else {
700 ret = adev->pcie_rreg(adev, reg * 4);
701 }
702
703 return ret;
704 }
705
/*
 * Byte-wide MMIO register write helper.
 * @offset: byte offset from the start of the MMIO region
 * @value: the value to be written to the register
 */
711
712 /**
713 * amdgpu_mm_wreg8 - read a memory mapped IO register
714 *
715 * @adev: amdgpu_device pointer
716 * @offset: byte aligned register offset
717 * @value: 8 bit value to write
718 *
719 * Writes the value specified to the offset specified.
720 */
amdgpu_mm_wreg8(struct amdgpu_device * adev,uint32_t offset,uint8_t value)721 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
722 {
723 if (amdgpu_device_skip_hw_access(adev))
724 return;
725
726 if (offset < adev->rmmio_size)
727 writeb(value, adev->rmmio + offset);
728 else
729 BUG();
730 }
731
732 /**
733 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
734 *
735 * @adev: amdgpu_device pointer
736 * @reg: dword aligned register offset
737 * @v: 32 bit value to write to the register
738 * @acc_flags: access flags which require special behavior
739 *
740 * Writes the value specified to the offset specified.
741 */
amdgpu_device_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t acc_flags)742 void amdgpu_device_wreg(struct amdgpu_device *adev,
743 uint32_t reg, uint32_t v,
744 uint32_t acc_flags)
745 {
746 if (amdgpu_device_skip_hw_access(adev))
747 return;
748
749 if ((reg * 4) < adev->rmmio_size) {
750 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
751 amdgpu_sriov_runtime(adev) &&
752 down_read_trylock(&adev->reset_domain->sem)) {
753 amdgpu_kiq_wreg(adev, reg, v, 0);
754 up_read(&adev->reset_domain->sem);
755 } else {
756 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
757 }
758 } else {
759 adev->pcie_wreg(adev, reg * 4, v);
760 }
761
762 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
763 }
764
765 /**
766 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
767 *
768 * @adev: amdgpu_device pointer
769 * @reg: mmio/rlc register
770 * @v: value to write
771 * @xcc_id: xcc accelerated compute core id
772 *
773 * this function is invoked only for the debugfs register access
774 */
amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t xcc_id)775 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
776 uint32_t reg, uint32_t v,
777 uint32_t xcc_id)
778 {
779 if (amdgpu_device_skip_hw_access(adev))
780 return;
781
782 if (amdgpu_sriov_fullaccess(adev) &&
783 adev->gfx.rlc.funcs &&
784 adev->gfx.rlc.funcs->is_rlcg_access_range) {
785 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
786 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
787 } else if ((reg * 4) >= adev->rmmio_size) {
788 adev->pcie_wreg(adev, reg * 4, v);
789 } else {
790 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
791 }
792 }
793
794 /**
795 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
796 *
797 * @adev: amdgpu_device pointer
798 * @reg: dword aligned register offset
799 * @v: 32 bit value to write to the register
800 * @acc_flags: access flags which require special behavior
801 * @xcc_id: xcc accelerated compute core id
802 *
803 * Writes the value specified to the offset specified.
804 */
amdgpu_device_xcc_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t acc_flags,uint32_t xcc_id)805 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
806 uint32_t reg, uint32_t v,
807 uint32_t acc_flags, uint32_t xcc_id)
808 {
809 uint32_t rlcg_flag;
810
811 if (amdgpu_device_skip_hw_access(adev))
812 return;
813
814 if ((reg * 4) < adev->rmmio_size) {
815 if (amdgpu_sriov_vf(adev) &&
816 !amdgpu_sriov_runtime(adev) &&
817 adev->gfx.rlc.rlcg_reg_access_supported &&
818 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
819 GC_HWIP, true,
820 &rlcg_flag)) {
821 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
822 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
823 amdgpu_sriov_runtime(adev) &&
824 down_read_trylock(&adev->reset_domain->sem)) {
825 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
826 up_read(&adev->reset_domain->sem);
827 } else {
828 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
829 }
830 } else {
831 adev->pcie_wreg(adev, reg * 4, v);
832 }
833 }
834
835 /**
836 * amdgpu_device_indirect_rreg - read an indirect register
837 *
838 * @adev: amdgpu_device pointer
839 * @reg_addr: indirect register address to read from
840 *
841 * Returns the value of indirect register @reg_addr
842 */
amdgpu_device_indirect_rreg(struct amdgpu_device * adev,u32 reg_addr)843 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
844 u32 reg_addr)
845 {
846 unsigned long flags, pcie_index, pcie_data;
847 void __iomem *pcie_index_offset;
848 void __iomem *pcie_data_offset;
849 u32 r;
850
851 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
852 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
853
854 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
855 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
856 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
857
858 writel(reg_addr, pcie_index_offset);
859 readl(pcie_index_offset);
860 r = readl(pcie_data_offset);
861 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
862
863 return r;
864 }
865
amdgpu_device_indirect_rreg_ext(struct amdgpu_device * adev,u64 reg_addr)866 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
867 u64 reg_addr)
868 {
869 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
870 u32 r;
871 void __iomem *pcie_index_offset;
872 void __iomem *pcie_index_hi_offset;
873 void __iomem *pcie_data_offset;
874
875 if (unlikely(!adev->nbio.funcs)) {
876 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
877 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
878 } else {
879 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
880 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
881 }
882
883 if (reg_addr >> 32) {
884 if (unlikely(!adev->nbio.funcs))
885 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
886 else
887 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
888 } else {
889 pcie_index_hi = 0;
890 }
891
892 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
893 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
894 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
895 if (pcie_index_hi != 0)
896 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
897 pcie_index_hi * 4;
898
899 writel(reg_addr, pcie_index_offset);
900 readl(pcie_index_offset);
901 if (pcie_index_hi != 0) {
902 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
903 readl(pcie_index_hi_offset);
904 }
905 r = readl(pcie_data_offset);
906
907 /* clear the high bits */
908 if (pcie_index_hi != 0) {
909 writel(0, pcie_index_hi_offset);
910 readl(pcie_index_hi_offset);
911 }
912
913 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
914
915 return r;
916 }
917
918 /**
919 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
920 *
921 * @adev: amdgpu_device pointer
922 * @reg_addr: indirect register address to read from
923 *
924 * Returns the value of indirect register @reg_addr
925 */
amdgpu_device_indirect_rreg64(struct amdgpu_device * adev,u32 reg_addr)926 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
927 u32 reg_addr)
928 {
929 unsigned long flags, pcie_index, pcie_data;
930 void __iomem *pcie_index_offset;
931 void __iomem *pcie_data_offset;
932 u64 r;
933
934 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
935 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
936
937 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
938 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
939 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
940
941 /* read low 32 bits */
942 writel(reg_addr, pcie_index_offset);
943 readl(pcie_index_offset);
944 r = readl(pcie_data_offset);
945 /* read high 32 bits */
946 writel(reg_addr + 4, pcie_index_offset);
947 readl(pcie_index_offset);
948 r |= ((u64)readl(pcie_data_offset) << 32);
949 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
950
951 return r;
952 }
953
/**
 * amdgpu_device_indirect_rreg64_ext - read a 64bits indirect register with a 64 bit address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Reads the low 32 bits at @reg_addr and the high 32 bits at @reg_addr + 4.
 * When @reg_addr has bits set above bit 31 and the nbio functions provide
 * get_pcie_index_hi_offset(), bits 32-39 of @reg_addr are written to the
 * index-high register before each data read, and the index-high register is
 * cleared again at the end.  The sequence runs under adev->pcie_idx_lock.
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long irq_flags, index_dw, data_dw;
	unsigned long index_hi_dw = 0;
	void __iomem *index_addr;
	void __iomem *index_hi_addr;
	void __iomem *data_addr;
	u64 val;

	index_dw = adev->nbio.funcs->get_pcie_index_offset(adev);
	data_dw = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		index_hi_dw = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, irq_flags);
	index_addr = (void __iomem *)adev->rmmio + index_dw * 4;
	data_addr = (void __iomem *)adev->rmmio + data_dw * 4;
	if (index_hi_dw != 0)
		index_hi_addr = (void __iomem *)adev->rmmio + index_hi_dw * 4;

	/* low dword */
	writel(reg_addr, index_addr);
	readl(index_addr);
	if (index_hi_dw != 0) {
		writel((reg_addr >> 32) & 0xff, index_hi_addr);
		readl(index_hi_addr);
	}
	val = readl(data_addr);

	/* high dword */
	writel(reg_addr + 4, index_addr);
	readl(index_addr);
	if (index_hi_dw != 0) {
		writel((reg_addr >> 32) & 0xff, index_hi_addr);
		readl(index_hi_addr);
	}
	val |= ((u64)readl(data_addr) << 32);

	/* leave the index-high register cleared */
	if (index_hi_dw != 0) {
		writel(0, index_hi_addr);
		readl(index_hi_addr);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, irq_flags);

	return val;
}
1003
1004 /**
1005 * amdgpu_device_indirect_wreg - write an indirect register address
1006 *
1007 * @adev: amdgpu_device pointer
1008 * @reg_addr: indirect register offset
1009 * @reg_data: indirect register data
1010 *
1011 */
amdgpu_device_indirect_wreg(struct amdgpu_device * adev,u32 reg_addr,u32 reg_data)1012 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1013 u32 reg_addr, u32 reg_data)
1014 {
1015 unsigned long flags, pcie_index, pcie_data;
1016 void __iomem *pcie_index_offset;
1017 void __iomem *pcie_data_offset;
1018
1019 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1020 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1021
1022 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1023 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1024 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1025
1026 writel(reg_addr, pcie_index_offset);
1027 readl(pcie_index_offset);
1028 writel(reg_data, pcie_data_offset);
1029 readl(pcie_data_offset);
1030 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1031 }
1032
/**
 * amdgpu_device_indirect_wreg_ext - write a 32-bit indirect register at a
 *                                   64-bit address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset (may use bits above 31)
 * @reg_data: value to store in the indirect register
 *
 * Same index/data sequence as the 32-bit address variant.  When @reg_addr has
 * bits set above bit 31 and the nbio layer provides get_pcie_index_hi_offset,
 * bits 39:32 of the address are additionally written to the "index hi"
 * register before the data access and that register is cleared again
 * afterwards.  Runs under adev->pcie_idx_lock.
 */
void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long idx_dw, data_dw;
	unsigned long idx_hi_dw = 0;
	void __iomem *idx_reg;
	void __iomem *idx_hi_reg;
	void __iomem *data_reg;
	unsigned long irq_flags;

	idx_dw = adev->nbio.funcs->get_pcie_index_offset(adev);
	data_dw = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* the high index register is only needed for addresses above 4G */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		idx_hi_dw = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, irq_flags);

	idx_reg = (void __iomem *)adev->rmmio + idx_dw * 4;
	data_reg = (void __iomem *)adev->rmmio + data_dw * 4;
	if (idx_hi_dw != 0)
		idx_hi_reg = (void __iomem *)adev->rmmio + idx_hi_dw * 4;

	/* low 32 address bits, then (optionally) bits 39:32 */
	writel(reg_addr, idx_reg);
	readl(idx_reg);
	if (idx_hi_dw != 0) {
		writel((reg_addr >> 32) & 0xff, idx_hi_reg);
		readl(idx_hi_reg);
	}

	writel(reg_data, data_reg);
	readl(data_reg);

	/* clear the high bits */
	if (idx_hi_dw != 0) {
		writel(0, idx_hi_reg);
		readl(idx_hi_reg);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, irq_flags);
}
1072
1073 /**
1074 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
1075 *
1076 * @adev: amdgpu_device pointer
1077 * @reg_addr: indirect register offset
1078 * @reg_data: indirect register data
1079 *
1080 */
amdgpu_device_indirect_wreg64(struct amdgpu_device * adev,u32 reg_addr,u64 reg_data)1081 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1082 u32 reg_addr, u64 reg_data)
1083 {
1084 unsigned long flags, pcie_index, pcie_data;
1085 void __iomem *pcie_index_offset;
1086 void __iomem *pcie_data_offset;
1087
1088 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1089 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1090
1091 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1092 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1093 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1094
1095 /* write low 32 bits */
1096 writel(reg_addr, pcie_index_offset);
1097 readl(pcie_index_offset);
1098 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1099 readl(pcie_data_offset);
1100 /* write high 32 bits */
1101 writel(reg_addr + 4, pcie_index_offset);
1102 readl(pcie_index_offset);
1103 writel((u32)(reg_data >> 32), pcie_data_offset);
1104 readl(pcie_data_offset);
1105 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1106 }
1107
/**
 * amdgpu_device_indirect_wreg64_ext - write a 64-bit indirect register at a
 *                                     64-bit address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset (may use bits above 31)
 * @reg_data: 64-bit value to store
 *
 * Writes the low 32 bits of @reg_data at @reg_addr and the high 32 bits at
 * @reg_addr + 4 through the index/data register pair.  When @reg_addr has
 * bits set above bit 31 and the nbio layer provides get_pcie_index_hi_offset,
 * bits 39:32 of the address are programmed into the "index hi" register for
 * both halves and that register is cleared once the write is complete.
 * Runs under adev->pcie_idx_lock.
 */
void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	const u32 data_lo = (u32)(reg_data & 0xffffffffULL);
	const u32 data_hi = (u32)(reg_data >> 32);
	unsigned long idx_dw, data_dw;
	unsigned long idx_hi_dw = 0;
	void __iomem *idx_reg;
	void __iomem *idx_hi_reg;
	void __iomem *data_reg;
	unsigned long irq_flags;

	idx_dw = adev->nbio.funcs->get_pcie_index_offset(adev);
	data_dw = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		idx_hi_dw = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, irq_flags);

	idx_reg = (void __iomem *)adev->rmmio + idx_dw * 4;
	data_reg = (void __iomem *)adev->rmmio + data_dw * 4;
	if (idx_hi_dw != 0)
		idx_hi_reg = (void __iomem *)adev->rmmio + idx_hi_dw * 4;

	/* low half: select reg_addr, then store the low dword */
	writel(reg_addr, idx_reg);
	readl(idx_reg);
	if (idx_hi_dw != 0) {
		writel((reg_addr >> 32) & 0xff, idx_hi_reg);
		readl(idx_hi_reg);
	}
	writel(data_lo, data_reg);
	readl(data_reg);

	/* high half: select reg_addr + 4, then store the high dword */
	writel(reg_addr + 4, idx_reg);
	readl(idx_reg);
	if (idx_hi_dw != 0) {
		writel((reg_addr >> 32) & 0xff, idx_hi_reg);
		readl(idx_hi_reg);
	}
	writel(data_hi, data_reg);
	readl(data_reg);

	/* clear the high bits */
	if (idx_hi_dw != 0) {
		writel(0, idx_hi_reg);
		readl(idx_hi_reg);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, irq_flags);
}
1156
1157 /**
1158 * amdgpu_device_get_rev_id - query device rev_id
1159 *
1160 * @adev: amdgpu_device pointer
1161 *
1162 * Return device rev_id
1163 */
amdgpu_device_get_rev_id(struct amdgpu_device * adev)1164 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1165 {
1166 return adev->nbio.funcs->get_rev_id(adev);
1167 }
1168
1169 /**
1170 * amdgpu_invalid_rreg - dummy reg read function
1171 *
1172 * @adev: amdgpu_device pointer
1173 * @reg: offset of register
1174 *
1175 * Dummy register read function. Used for register blocks
1176 * that certain asics don't have (all asics).
1177 * Returns the value in the register.
1178 */
amdgpu_invalid_rreg(struct amdgpu_device * adev,uint32_t reg)1179 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1180 {
1181 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1182 BUG();
1183 return 0;
1184 }
1185
amdgpu_invalid_rreg_ext(struct amdgpu_device * adev,uint64_t reg)1186 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1187 {
1188 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1189 BUG();
1190 return 0;
1191 }
1192
1193 /**
1194 * amdgpu_invalid_wreg - dummy reg write function
1195 *
1196 * @adev: amdgpu_device pointer
1197 * @reg: offset of register
1198 * @v: value to write to the register
1199 *
1200 * Dummy register read function. Used for register blocks
1201 * that certain asics don't have (all asics).
1202 */
amdgpu_invalid_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v)1203 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1204 {
1205 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1206 reg, v);
1207 BUG();
1208 }
1209
/*
 * Placeholder 32-bit register write callback for 64-bit register offsets.
 * Logs the offset and value and raises BUG(); nothing is written.
 */
static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev,
				    uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", reg, v);
	BUG();
}
1216
1217 /**
1218 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1219 *
1220 * @adev: amdgpu_device pointer
1221 * @reg: offset of register
1222 *
1223 * Dummy register read function. Used for register blocks
1224 * that certain asics don't have (all asics).
1225 * Returns the value in the register.
1226 */
amdgpu_invalid_rreg64(struct amdgpu_device * adev,uint32_t reg)1227 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1228 {
1229 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1230 BUG();
1231 return 0;
1232 }
1233
/*
 * Placeholder 64-bit register read callback for 64-bit register offsets.
 * Logs the offending offset and raises BUG(); the 0 return value only exists
 * to satisfy the callback signature.
 */
static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev,
					  uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n",
		  reg);
	BUG();

	return 0;
}
1240
1241 /**
1242 * amdgpu_invalid_wreg64 - dummy reg write function
1243 *
1244 * @adev: amdgpu_device pointer
1245 * @reg: offset of register
1246 * @v: value to write to the register
1247 *
1248 * Dummy register read function. Used for register blocks
1249 * that certain asics don't have (all asics).
1250 */
amdgpu_invalid_wreg64(struct amdgpu_device * adev,uint32_t reg,uint64_t v)1251 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1252 {
1253 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1254 reg, v);
1255 BUG();
1256 }
1257
/*
 * Placeholder 64-bit register write callback for 64-bit register offsets.
 * Logs the offset and value and raises BUG(); nothing is written.
 */
static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev,
				      uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", reg, v);
	BUG();
}
1264
1265 /**
1266 * amdgpu_block_invalid_rreg - dummy reg read function
1267 *
1268 * @adev: amdgpu_device pointer
1269 * @block: offset of instance
1270 * @reg: offset of register
1271 *
1272 * Dummy register read function. Used for register blocks
1273 * that certain asics don't have (all asics).
1274 * Returns the value in the register.
1275 */
amdgpu_block_invalid_rreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg)1276 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1277 uint32_t block, uint32_t reg)
1278 {
1279 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1280 reg, block);
1281 BUG();
1282 return 0;
1283 }
1284
1285 /**
1286 * amdgpu_block_invalid_wreg - dummy reg write function
1287 *
1288 * @adev: amdgpu_device pointer
1289 * @block: offset of instance
1290 * @reg: offset of register
1291 * @v: value to write to the register
1292 *
1293 * Dummy register read function. Used for register blocks
1294 * that certain asics don't have (all asics).
1295 */
amdgpu_block_invalid_wreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg,uint32_t v)1296 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1297 uint32_t block,
1298 uint32_t reg, uint32_t v)
1299 {
1300 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1301 reg, block, v);
1302 BUG();
1303 }
1304
1305 /**
1306 * amdgpu_device_asic_init - Wrapper for atom asic_init
1307 *
1308 * @adev: amdgpu_device pointer
1309 *
1310 * Does any asic specific work and then calls atom asic init.
1311 */
amdgpu_device_asic_init(struct amdgpu_device * adev)1312 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1313 {
1314 int ret;
1315
1316 amdgpu_asic_pre_asic_init(adev);
1317
1318 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1319 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1320 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1321 amdgpu_psp_wait_for_bootloader(adev);
1322 ret = amdgpu_atomfirmware_asic_init(adev, true);
1323 return ret;
1324 } else {
1325 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1326 }
1327
1328 return 0;
1329 }
1330
1331 /**
1332 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1333 *
1334 * @adev: amdgpu_device pointer
1335 *
1336 * Allocates a scratch page of VRAM for use by various things in the
1337 * driver.
1338 */
amdgpu_device_mem_scratch_init(struct amdgpu_device * adev)1339 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1340 {
1341 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1342 AMDGPU_GEM_DOMAIN_VRAM |
1343 AMDGPU_GEM_DOMAIN_GTT,
1344 &adev->mem_scratch.robj,
1345 &adev->mem_scratch.gpu_addr,
1346 (void **)&adev->mem_scratch.ptr);
1347 }
1348
1349 /**
1350 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1351 *
1352 * @adev: amdgpu_device pointer
1353 *
1354 * Frees the VRAM scratch page.
1355 */
amdgpu_device_mem_scratch_fini(struct amdgpu_device * adev)1356 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1357 {
1358 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1359 }
1360
1361 /**
1362 * amdgpu_device_program_register_sequence - program an array of registers.
1363 *
1364 * @adev: amdgpu_device pointer
1365 * @registers: pointer to the register array
1366 * @array_size: size of the register array
1367 *
1368 * Programs an array or registers with and or masks.
1369 * This is a helper for setting golden registers.
1370 */
amdgpu_device_program_register_sequence(struct amdgpu_device * adev,const u32 * registers,const u32 array_size)1371 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1372 const u32 *registers,
1373 const u32 array_size)
1374 {
1375 u32 tmp, reg, and_mask, or_mask;
1376 int i;
1377
1378 if (array_size % 3)
1379 return;
1380
1381 for (i = 0; i < array_size; i += 3) {
1382 reg = registers[i + 0];
1383 and_mask = registers[i + 1];
1384 or_mask = registers[i + 2];
1385
1386 if (and_mask == 0xffffffff) {
1387 tmp = or_mask;
1388 } else {
1389 tmp = RREG32(reg);
1390 tmp &= ~and_mask;
1391 if (adev->family >= AMDGPU_FAMILY_AI)
1392 tmp |= (or_mask & and_mask);
1393 else
1394 tmp |= or_mask;
1395 }
1396 WREG32(reg, tmp);
1397 }
1398 }
1399
1400 /**
1401 * amdgpu_device_pci_config_reset - reset the GPU
1402 *
1403 * @adev: amdgpu_device pointer
1404 *
1405 * Resets the GPU using the pci config reset sequence.
1406 * Only applicable to asics prior to vega10.
1407 */
amdgpu_device_pci_config_reset(struct amdgpu_device * adev)1408 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1409 {
1410 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1411 }
1412
/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Intended to reset the GPU using generic pci reset interfaces (FLR, SBR,
 * etc.).  In this build the function is a stub: it invokes STUB() and fails.
 *
 * Return: always -ENOSYS.
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	STUB();
	return -ENOSYS;
	/*
	 * The real implementation is only compiled when "notyet" is defined,
	 * and even then it sits after the unconditional return above.
	 */
#ifdef notyet
	return pci_reset_function(adev->pdev);
#endif
}
1428
1429 /*
1430 * amdgpu_device_wb_*()
1431 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
1433 */
1434
1435 /**
1436 * amdgpu_device_wb_fini - Disable Writeback and free memory
1437 *
1438 * @adev: amdgpu_device pointer
1439 *
1440 * Disables Writeback and frees the Writeback memory (all asics).
1441 * Used at driver shutdown.
1442 */
amdgpu_device_wb_fini(struct amdgpu_device * adev)1443 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1444 {
1445 if (adev->wb.wb_obj) {
1446 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1447 &adev->wb.gpu_addr,
1448 (void **)&adev->wb.wb);
1449 adev->wb.wb_obj = NULL;
1450 }
1451 }
1452
1453 /**
1454 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1455 *
1456 * @adev: amdgpu_device pointer
1457 *
1458 * Initializes writeback and allocates writeback memory (all asics).
1459 * Used at driver startup.
1460 * Returns 0 on success or an -error on failure.
1461 */
amdgpu_device_wb_init(struct amdgpu_device * adev)1462 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1463 {
1464 int r;
1465
1466 if (adev->wb.wb_obj == NULL) {
1467 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1468 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1469 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1470 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1471 (void **)&adev->wb.wb);
1472 if (r) {
1473 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1474 return r;
1475 }
1476
1477 adev->wb.num_wb = AMDGPU_MAX_WB;
1478 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1479
1480 /* clear wb memory */
1481 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1482 }
1483
1484 return 0;
1485 }
1486
1487 /**
1488 * amdgpu_device_wb_get - Allocate a wb entry
1489 *
1490 * @adev: amdgpu_device pointer
1491 * @wb: wb index
1492 *
1493 * Allocate a wb slot for use by the driver (all asics).
1494 * Returns 0 on success or -EINVAL on failure.
1495 */
amdgpu_device_wb_get(struct amdgpu_device * adev,u32 * wb)1496 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1497 {
1498 unsigned long flags, offset;
1499
1500 spin_lock_irqsave(&adev->wb.lock, flags);
1501 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1502 if (offset < adev->wb.num_wb) {
1503 __set_bit(offset, adev->wb.used);
1504 spin_unlock_irqrestore(&adev->wb.lock, flags);
1505 *wb = offset << 3; /* convert to dw offset */
1506 return 0;
1507 } else {
1508 spin_unlock_irqrestore(&adev->wb.lock, flags);
1509 return -EINVAL;
1510 }
1511 }
1512
1513 /**
1514 * amdgpu_device_wb_free - Free a wb entry
1515 *
1516 * @adev: amdgpu_device pointer
1517 * @wb: wb index
1518 *
1519 * Free a wb slot allocated for use by the driver (all asics)
1520 */
amdgpu_device_wb_free(struct amdgpu_device * adev,u32 wb)1521 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1522 {
1523 unsigned long flags;
1524
1525 wb >>= 3;
1526 spin_lock_irqsave(&adev->wb.lock, flags);
1527 if (wb < adev->wb.num_wb)
1528 __clear_bit(wb, adev->wb.used);
1529 spin_unlock_irqrestore(&adev->wb.lock, flags);
1530 }
1531
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 *
 * The whole body is only compiled when __linux__ is defined; otherwise the
 * function simply returns 0.
 *
 * A failing pci_resize_resource() is only logged; it does not by itself make
 * this function fail.
 *
 * Return: 0 when the resize was skipped or the BARs are usable afterwards,
 * -ENODEV when doorbell init fails or BAR 0 is left unset.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
#ifdef __linux__
	/* requested BAR size, encoded as a resizable-BAR size value */
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	/* nothing to gain without 64-bit physical addresses */
	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	/* stop at the first memory window that starts above 4GB */
	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	/* resize failures are reported but not fatal here */
	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	/* re-assign the BARs that were released above */
	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 *
	 * NOTE(review): this early return leaves PCI_COMMAND with memory
	 * decoding disabled - presumably acceptable because the device is
	 * given up on; confirm.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	/* restore the original command word (re-enables memory decoding) */
	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
#endif /* __linux__ */

	return 0;
}
1618
amdgpu_device_read_bios(struct amdgpu_device * adev)1619 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1620 {
1621 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1622 return false;
1623
1624 return true;
1625 }
1626
1627 /*
1628 * GPU helpers function.
1629 */
1630 /**
1631 * amdgpu_device_need_post - check if the hw need post or not
1632 *
1633 * @adev: amdgpu_device pointer
1634 *
1635 * Check if the asic has been initialized (all asics) at driver startup
1636 * or post is needed if hw reset is performed.
1637 * Returns true if need or false if not.
1638 */
amdgpu_device_need_post(struct amdgpu_device * adev)1639 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1640 {
1641 uint32_t reg;
1642
1643 if (amdgpu_sriov_vf(adev))
1644 return false;
1645
1646 if (!amdgpu_device_read_bios(adev))
1647 return false;
1648
1649 if (amdgpu_passthrough(adev)) {
1650 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1651 * some old smc fw still need driver do vPost otherwise gpu hang, while
1652 * those smc fw version above 22.15 doesn't have this flaw, so we force
1653 * vpost executed for smc version below 22.15
1654 */
1655 if (adev->asic_type == CHIP_FIJI) {
1656 int err;
1657 uint32_t fw_ver;
1658
1659 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1660 /* force vPost if error occured */
1661 if (err)
1662 return true;
1663
1664 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1665 release_firmware(adev->pm.fw);
1666 if (fw_ver < 0x00160e00)
1667 return true;
1668 }
1669 }
1670
1671 /* Don't post if we need to reset whole hive on init */
1672 if (adev->gmc.xgmi.pending_reset)
1673 return false;
1674
1675 if (adev->has_hw_reset) {
1676 adev->has_hw_reset = false;
1677 return true;
1678 }
1679
1680 /* bios scratch used on CIK+ */
1681 if (adev->asic_type >= CHIP_BONAIRE)
1682 return amdgpu_atombios_scratch_need_asic_init(adev);
1683
1684 /* check MEM_SIZE for older asics */
1685 reg = amdgpu_asic_get_config_memsize(adev);
1686
1687 if ((reg != 0) && (reg != 0xffffffff))
1688 return false;
1689
1690 return true;
1691 }
1692
1693 /*
1694 * Check whether seamless boot is supported.
1695 *
1696 * So far we only support seamless boot on DCE 3.0 or later.
1697 * If users report that it works on older ASICS as well, we may
1698 * loosen this.
1699 */
amdgpu_device_seamless_boot_supported(struct amdgpu_device * adev)1700 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1701 {
1702 switch (amdgpu_seamless) {
1703 case -1:
1704 break;
1705 case 1:
1706 return true;
1707 case 0:
1708 return false;
1709 default:
1710 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1711 amdgpu_seamless);
1712 return false;
1713 }
1714
1715 if (!(adev->flags & AMD_IS_APU))
1716 return false;
1717
1718 if (adev->mman.keep_stolen_vga_memory)
1719 return false;
1720
1721 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1722 }
1723
/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 *
 * Returns false on x86 builds when the host CPU vendor is Intel, true
 * otherwise.  On Linux, removable devices are reported as supported before
 * the vendor is looked at.
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
#ifdef __linux__
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	/* Linux: vendor taken from the cpuinfo of CPU 0 */
	if (c->x86_vendor == X86_VENDOR_INTEL)
#else
	/* non-Linux build: vendor taken from the cpu_vendor string */
	if (strcmp(cpu_vendor, "GenuineIntel") == 0)
#endif
		/* shared body of whichever "if" the preprocessor kept above */
		return false;
#endif
	return true;
}
1750
1751 /**
1752 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1753 *
1754 * @adev: amdgpu_device pointer
1755 *
1756 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1757 * be set for this device.
1758 *
1759 * Returns true if it should be used or false if not.
1760 */
amdgpu_device_should_use_aspm(struct amdgpu_device * adev)1761 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1762 {
1763 switch (amdgpu_aspm) {
1764 case -1:
1765 break;
1766 case 0:
1767 return false;
1768 case 1:
1769 return true;
1770 default:
1771 return false;
1772 }
1773 if (adev->flags & AMD_IS_APU)
1774 return false;
1775 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1776 return false;
1777 return pcie_aspm_enabled(adev->pdev);
1778 }
1779
1780 /* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Forwards @state to the asic's set_vga_state hook (all asics).
 * Only compiled when "notyet" is defined.
 *
 * Returns the VGA resource flags: the normal IO and MEM resources always,
 * plus the legacy IO and MEM resources when @state is true.
 */
#ifdef notyet
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
	unsigned int rsrc = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	amdgpu_asic_set_vga_state(adev, state);

	if (state)
		rsrc |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	return rsrc;
}
#endif
1804
1805 /**
1806 * amdgpu_device_check_block_size - validate the vm block size
1807 *
1808 * @adev: amdgpu_device pointer
1809 *
1810 * Validates the vm block size specified via module parameter.
1811 * The vm block size defines number of bits in page table versus page directory,
1812 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1813 * page table and the remaining bits are in the page directory.
1814 */
amdgpu_device_check_block_size(struct amdgpu_device * adev)1815 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1816 {
1817 /* defines number of bits in page table versus page directory,
1818 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1819 * page table and the remaining bits are in the page directory
1820 */
1821 if (amdgpu_vm_block_size == -1)
1822 return;
1823
1824 if (amdgpu_vm_block_size < 9) {
1825 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1826 amdgpu_vm_block_size);
1827 amdgpu_vm_block_size = -1;
1828 }
1829 }
1830
1831 /**
1832 * amdgpu_device_check_vm_size - validate the vm size
1833 *
1834 * @adev: amdgpu_device pointer
1835 *
1836 * Validates the vm size in GB specified via module parameter.
1837 * The VM size is the size of the GPU virtual memory space in GB.
1838 */
amdgpu_device_check_vm_size(struct amdgpu_device * adev)1839 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1840 {
1841 /* no need to check the default value */
1842 if (amdgpu_vm_size == -1)
1843 return;
1844
1845 if (amdgpu_vm_size < 1) {
1846 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1847 amdgpu_vm_size);
1848 amdgpu_vm_size = -1;
1849 }
1850 }
1851
amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device * adev)1852 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1853 {
1854 #ifdef __linux__
1855 struct sysinfo si;
1856 #endif
1857 bool is_os_64 = (sizeof(void *) == 8);
1858 uint64_t total_memory;
1859 uint64_t dram_size_seven_GB = 0x1B8000000;
1860 uint64_t dram_size_three_GB = 0xB8000000;
1861
1862 if (amdgpu_smu_memory_pool_size == 0)
1863 return;
1864
1865 if (!is_os_64) {
1866 DRM_WARN("Not 64-bit OS, feature not supported\n");
1867 goto def_value;
1868 }
1869 #ifdef __linux__
1870 si_meminfo(&si);
1871 total_memory = (uint64_t)si.totalram * si.mem_unit;
1872 #else
1873 total_memory = ptoa(physmem);
1874 #endif
1875
1876 if ((amdgpu_smu_memory_pool_size == 1) ||
1877 (amdgpu_smu_memory_pool_size == 2)) {
1878 if (total_memory < dram_size_three_GB)
1879 goto def_value1;
1880 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1881 (amdgpu_smu_memory_pool_size == 8)) {
1882 if (total_memory < dram_size_seven_GB)
1883 goto def_value1;
1884 } else {
1885 DRM_WARN("Smu memory pool size not supported\n");
1886 goto def_value;
1887 }
1888 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1889
1890 return;
1891
1892 def_value1:
1893 DRM_WARN("No enough system memory\n");
1894 def_value:
1895 adev->pm.smu_prv_buffer_size = 0;
1896 }
1897
amdgpu_device_init_apu_flags(struct amdgpu_device * adev)1898 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1899 {
1900 if (!(adev->flags & AMD_IS_APU) ||
1901 adev->asic_type < CHIP_RAVEN)
1902 return 0;
1903
1904 switch (adev->asic_type) {
1905 case CHIP_RAVEN:
1906 if (adev->pdev->device == 0x15dd)
1907 adev->apu_flags |= AMD_APU_IS_RAVEN;
1908 if (adev->pdev->device == 0x15d8)
1909 adev->apu_flags |= AMD_APU_IS_PICASSO;
1910 break;
1911 case CHIP_RENOIR:
1912 if ((adev->pdev->device == 0x1636) ||
1913 (adev->pdev->device == 0x164c))
1914 adev->apu_flags |= AMD_APU_IS_RENOIR;
1915 else
1916 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1917 break;
1918 case CHIP_VANGOGH:
1919 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1920 break;
1921 case CHIP_YELLOW_CARP:
1922 break;
1923 case CHIP_CYAN_SKILLFISH:
1924 if ((adev->pdev->device == 0x13FE) ||
1925 (adev->pdev->device == 0x143F))
1926 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1927 break;
1928 default:
1929 break;
1930 }
1931
1932 return 0;
1933 }
1934
1935 /**
1936 * amdgpu_device_check_arguments - validate module params
1937 *
1938 * @adev: amdgpu_device pointer
1939 *
1940 * Validates certain module parameters and updates
1941 * the associated values used by the driver (all asics).
1942 */
amdgpu_device_check_arguments(struct amdgpu_device * adev)1943 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1944 {
1945 int i;
1946
1947 if (amdgpu_sched_jobs < 4) {
1948 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1949 amdgpu_sched_jobs);
1950 amdgpu_sched_jobs = 4;
1951 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1952 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1953 amdgpu_sched_jobs);
1954 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1955 }
1956
1957 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1958 /* gart size must be greater or equal to 32M */
1959 dev_warn(adev->dev, "gart size (%d) too small\n",
1960 amdgpu_gart_size);
1961 amdgpu_gart_size = -1;
1962 }
1963
1964 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1965 /* gtt size must be greater or equal to 32M */
1966 dev_warn(adev->dev, "gtt size (%d) too small\n",
1967 amdgpu_gtt_size);
1968 amdgpu_gtt_size = -1;
1969 }
1970
1971 /* valid range is between 4 and 9 inclusive */
1972 if (amdgpu_vm_fragment_size != -1 &&
1973 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1974 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1975 amdgpu_vm_fragment_size = -1;
1976 }
1977
1978 if (amdgpu_sched_hw_submission < 2) {
1979 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1980 amdgpu_sched_hw_submission);
1981 amdgpu_sched_hw_submission = 2;
1982 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1983 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1984 amdgpu_sched_hw_submission);
1985 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1986 }
1987
1988 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1989 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1990 amdgpu_reset_method = -1;
1991 }
1992
1993 amdgpu_device_check_smu_prv_buffer_size(adev);
1994
1995 amdgpu_device_check_vm_size(adev);
1996
1997 amdgpu_device_check_block_size(adev);
1998
1999 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2000
2001 for (i = 0; i < MAX_XCP; i++)
2002 adev->enforce_isolation[i] = !!enforce_isolation;
2003
2004 return 0;
2005 }
2006
2007 #ifdef __linux__
2008 /**
2009 * amdgpu_switcheroo_set_state - set switcheroo state
2010 *
2011 * @pdev: pci dev pointer
2012 * @state: vga_switcheroo state
2013 *
2014 * Callback for the switcheroo driver. Suspends or resumes
2015 * the asics before or after it is powered up using ACPI methods.
2016 */
amdgpu_switcheroo_set_state(struct pci_dev * pdev,enum vga_switcheroo_state state)2017 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2018 enum vga_switcheroo_state state)
2019 {
2020 struct drm_device *dev = pci_get_drvdata(pdev);
2021 int r;
2022
2023 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2024 return;
2025
2026 if (state == VGA_SWITCHEROO_ON) {
2027 pr_info("switched on\n");
2028 /* don't suspend or resume card normally */
2029 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2030
2031 pci_set_power_state(pdev, PCI_D0);
2032 amdgpu_device_load_pci_state(pdev);
2033 r = pci_enable_device(pdev);
2034 if (r)
2035 DRM_WARN("pci_enable_device failed (%d)\n", r);
2036 amdgpu_device_resume(dev, true);
2037
2038 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2039 } else {
2040 pr_info("switched off\n");
2041 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2042 amdgpu_device_prepare(dev);
2043 amdgpu_device_suspend(dev, true);
2044 amdgpu_device_cache_pci_state(pdev);
2045 /* Shut down the device */
2046 pci_disable_device(pdev);
2047 pci_set_power_state(pdev, PCI_D3cold);
2048 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2049 }
2050 }
2051
2052 /**
2053 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2054 *
2055 * @pdev: pci dev pointer
2056 *
2057 * Callback for the switcheroo driver. Check of the switcheroo
2058 * state can be changed.
2059 * Returns true if the state can be changed, false if not.
2060 */
amdgpu_switcheroo_can_switch(struct pci_dev * pdev)2061 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2062 {
2063 struct drm_device *dev = pci_get_drvdata(pdev);
2064
2065 /*
2066 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2067 * locking inversion with the driver load path. And the access here is
2068 * completely racy anyway. So don't bother with locking for now.
2069 */
2070 return atomic_read(&dev->open_count) == 0;
2071 }
2072 #endif /* __linux__ */
2073
/*
 * vga_switcheroo client callbacks for amdgpu.
 *
 * The initializers are only compiled when "notyet" is defined; without it
 * the structure is zero-initialized, i.e. no callback is installed. Note
 * that the two callbacks referenced here are themselves defined under
 * "#ifdef __linux__".
 */
static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
#ifdef notyet
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
#endif
};
2081
2082 /**
2083 * amdgpu_device_ip_set_clockgating_state - set the CG state
2084 *
2085 * @dev: amdgpu_device pointer
2086 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2087 * @state: clockgating state (gate or ungate)
2088 *
2089 * Sets the requested clockgating state for all instances of
2090 * the hardware IP specified.
2091 * Returns the error code from the last instance.
2092 */
amdgpu_device_ip_set_clockgating_state(void * dev,enum amd_ip_block_type block_type,enum amd_clockgating_state state)2093 int amdgpu_device_ip_set_clockgating_state(void *dev,
2094 enum amd_ip_block_type block_type,
2095 enum amd_clockgating_state state)
2096 {
2097 struct amdgpu_device *adev = dev;
2098 int i, r = 0;
2099
2100 for (i = 0; i < adev->num_ip_blocks; i++) {
2101 if (!adev->ip_blocks[i].status.valid)
2102 continue;
2103 if (adev->ip_blocks[i].version->type != block_type)
2104 continue;
2105 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2106 continue;
2107 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2108 (void *)adev, state);
2109 if (r)
2110 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2111 adev->ip_blocks[i].version->funcs->name, r);
2112 }
2113 return r;
2114 }
2115
2116 /**
2117 * amdgpu_device_ip_set_powergating_state - set the PG state
2118 *
2119 * @dev: amdgpu_device pointer
2120 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2121 * @state: powergating state (gate or ungate)
2122 *
2123 * Sets the requested powergating state for all instances of
2124 * the hardware IP specified.
2125 * Returns the error code from the last instance.
2126 */
amdgpu_device_ip_set_powergating_state(void * dev,enum amd_ip_block_type block_type,enum amd_powergating_state state)2127 int amdgpu_device_ip_set_powergating_state(void *dev,
2128 enum amd_ip_block_type block_type,
2129 enum amd_powergating_state state)
2130 {
2131 struct amdgpu_device *adev = dev;
2132 int i, r = 0;
2133
2134 for (i = 0; i < adev->num_ip_blocks; i++) {
2135 if (!adev->ip_blocks[i].status.valid)
2136 continue;
2137 if (adev->ip_blocks[i].version->type != block_type)
2138 continue;
2139 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2140 continue;
2141 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2142 (void *)adev, state);
2143 if (r)
2144 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2145 adev->ip_blocks[i].version->funcs->name, r);
2146 }
2147 return r;
2148 }
2149
2150 /**
2151 * amdgpu_device_ip_get_clockgating_state - get the CG state
2152 *
2153 * @adev: amdgpu_device pointer
2154 * @flags: clockgating feature flags
2155 *
2156 * Walks the list of IPs on the device and updates the clockgating
2157 * flags for each IP.
2158 * Updates @flags with the feature flags for each hardware IP where
2159 * clockgating is enabled.
2160 */
amdgpu_device_ip_get_clockgating_state(struct amdgpu_device * adev,u64 * flags)2161 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2162 u64 *flags)
2163 {
2164 int i;
2165
2166 for (i = 0; i < adev->num_ip_blocks; i++) {
2167 if (!adev->ip_blocks[i].status.valid)
2168 continue;
2169 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2170 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
2171 }
2172 }
2173
2174 /**
2175 * amdgpu_device_ip_wait_for_idle - wait for idle
2176 *
2177 * @adev: amdgpu_device pointer
2178 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2179 *
2180 * Waits for the request hardware IP to be idle.
2181 * Returns 0 for success or a negative error code on failure.
2182 */
amdgpu_device_ip_wait_for_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)2183 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2184 enum amd_ip_block_type block_type)
2185 {
2186 int i, r;
2187
2188 for (i = 0; i < adev->num_ip_blocks; i++) {
2189 if (!adev->ip_blocks[i].status.valid)
2190 continue;
2191 if (adev->ip_blocks[i].version->type == block_type) {
2192 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
2193 if (r)
2194 return r;
2195 break;
2196 }
2197 }
2198 return 0;
2199
2200 }
2201
2202 /**
2203 * amdgpu_device_ip_is_idle - is the hardware IP idle
2204 *
2205 * @adev: amdgpu_device pointer
2206 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2207 *
2208 * Check if the hardware IP is idle or not.
2209 * Returns true if it the IP is idle, false if not.
2210 */
amdgpu_device_ip_is_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)2211 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
2212 enum amd_ip_block_type block_type)
2213 {
2214 int i;
2215
2216 for (i = 0; i < adev->num_ip_blocks; i++) {
2217 if (!adev->ip_blocks[i].status.valid)
2218 continue;
2219 if (adev->ip_blocks[i].version->type == block_type)
2220 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
2221 }
2222 return true;
2223
2224 }
2225
2226 /**
2227 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2228 *
2229 * @adev: amdgpu_device pointer
2230 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2231 *
2232 * Returns a pointer to the hardware IP block structure
2233 * if it exists for the asic, otherwise NULL.
2234 */
2235 struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device * adev,enum amd_ip_block_type type)2236 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2237 enum amd_ip_block_type type)
2238 {
2239 int i;
2240
2241 for (i = 0; i < adev->num_ip_blocks; i++)
2242 if (adev->ip_blocks[i].version->type == type)
2243 return &adev->ip_blocks[i];
2244
2245 return NULL;
2246 }
2247
2248 /**
2249 * amdgpu_device_ip_block_version_cmp
2250 *
2251 * @adev: amdgpu_device pointer
2252 * @type: enum amd_ip_block_type
2253 * @major: major version
2254 * @minor: minor version
2255 *
2256 * return 0 if equal or greater
2257 * return 1 if smaller or the ip_block doesn't exist
2258 */
amdgpu_device_ip_block_version_cmp(struct amdgpu_device * adev,enum amd_ip_block_type type,u32 major,u32 minor)2259 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2260 enum amd_ip_block_type type,
2261 u32 major, u32 minor)
2262 {
2263 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2264
2265 if (ip_block && ((ip_block->version->major > major) ||
2266 ((ip_block->version->major == major) &&
2267 (ip_block->version->minor >= minor))))
2268 return 0;
2269
2270 return 1;
2271 }
2272
2273 /**
2274 * amdgpu_device_ip_block_add
2275 *
2276 * @adev: amdgpu_device pointer
2277 * @ip_block_version: pointer to the IP to add
2278 *
2279 * Adds the IP block driver information to the collection of IPs
2280 * on the asic.
2281 */
amdgpu_device_ip_block_add(struct amdgpu_device * adev,const struct amdgpu_ip_block_version * ip_block_version)2282 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2283 const struct amdgpu_ip_block_version *ip_block_version)
2284 {
2285 if (!ip_block_version)
2286 return -EINVAL;
2287
2288 switch (ip_block_version->type) {
2289 case AMD_IP_BLOCK_TYPE_VCN:
2290 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2291 return 0;
2292 break;
2293 case AMD_IP_BLOCK_TYPE_JPEG:
2294 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2295 return 0;
2296 break;
2297 default:
2298 break;
2299 }
2300
2301 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2302 ip_block_version->funcs->name);
2303
2304 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2305
2306 return 0;
2307 }
2308
2309 /**
2310 * amdgpu_device_enable_virtual_display - enable virtual display feature
2311 *
2312 * @adev: amdgpu_device pointer
2313 *
2314 * Enabled the virtual display feature if the user has enabled it via
2315 * the module parameter virtual_display. This feature provides a virtual
2316 * display hardware on headless boards or in virtualized environments.
2317 * This function parses and validates the configuration string specified by
2318 * the user and configues the virtual display configuration (number of
2319 * virtual connectors, crtcs, etc.) specified.
2320 */
amdgpu_device_enable_virtual_display(struct amdgpu_device * adev)2321 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2322 {
2323 adev->enable_virtual_display = false;
2324
2325 #ifdef notyet
2326 if (amdgpu_virtual_display) {
2327 const char *pci_address_name = pci_name(adev->pdev);
2328 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2329
2330 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2331 pciaddstr_tmp = pciaddstr;
2332 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2333 pciaddname = strsep(&pciaddname_tmp, ",");
2334 if (!strcmp("all", pciaddname)
2335 || !strcmp(pci_address_name, pciaddname)) {
2336 long num_crtc;
2337 int res = -1;
2338
2339 adev->enable_virtual_display = true;
2340
2341 if (pciaddname_tmp)
2342 res = kstrtol(pciaddname_tmp, 10,
2343 &num_crtc);
2344
2345 if (!res) {
2346 if (num_crtc < 1)
2347 num_crtc = 1;
2348 if (num_crtc > 6)
2349 num_crtc = 6;
2350 adev->mode_info.num_crtc = num_crtc;
2351 } else {
2352 adev->mode_info.num_crtc = 1;
2353 }
2354 break;
2355 }
2356 }
2357
2358 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2359 amdgpu_virtual_display, pci_address_name,
2360 adev->enable_virtual_display, adev->mode_info.num_crtc);
2361
2362 kfree(pciaddstr);
2363 }
2364 #endif
2365 }
2366
amdgpu_device_set_sriov_virtual_display(struct amdgpu_device * adev)2367 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2368 {
2369 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2370 adev->mode_info.num_crtc = 1;
2371 adev->enable_virtual_display = true;
2372 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2373 adev->enable_virtual_display, adev->mode_info.num_crtc);
2374 }
2375 }
2376
/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic. Nothing is loaded when adev->mman.discovery_bin is set or when
 * the asic type is not one of those listed in the switch below.
 *
 * Returns 0 on success (including the "nothing to load" cases), the error
 * from amdgpu_ucode_request() if the firmware cannot be obtained, or
 * -EINVAL if the firmware's major version is not supported.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	/* Skip the gpu_info firmware entirely when a discovery binary is set. */
	if (adev->mman.discovery_bin)
		return 0;

	/* Pick the firmware file name; asics not listed have no gpu_info file. */
	switch (adev->asic_type) {
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	}

	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
				   "amdgpu/%s_gpu_info.bin", chip_name);
	if (err) {
		dev_err(adev->dev,
			"Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
			chip_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		/* The payload starts ucode_array_offset_bytes into the image. */
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 * For NAVI12 only the soc bounding box is taken from this
		 * firmware; the gfx configuration below is skipped.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		/* Minor version 1 and later carry two additional fields. */
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			/* Points into the firmware image rather than copying it. */
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}
2498
2499 /**
2500 * amdgpu_device_ip_early_init - run early init for hardware IPs
2501 *
2502 * @adev: amdgpu_device pointer
2503 *
2504 * Early initialization pass for hardware IPs. The hardware IPs that make
2505 * up each asic are discovered each IP's early_init callback is run. This
2506 * is the first stage in initializing the asic.
2507 * Returns 0 on success, negative error code on failure.
2508 */
amdgpu_device_ip_early_init(struct amdgpu_device * adev)2509 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2510 {
2511 struct amdgpu_ip_block *ip_block;
2512 struct pci_dev *parent;
2513 int i, r;
2514 bool total;
2515
2516 amdgpu_device_enable_virtual_display(adev);
2517
2518 if (amdgpu_sriov_vf(adev)) {
2519 r = amdgpu_virt_request_full_gpu(adev, true);
2520 if (r)
2521 return r;
2522 }
2523
2524 switch (adev->asic_type) {
2525 #ifdef CONFIG_DRM_AMDGPU_SI
2526 case CHIP_VERDE:
2527 case CHIP_TAHITI:
2528 case CHIP_PITCAIRN:
2529 case CHIP_OLAND:
2530 case CHIP_HAINAN:
2531 adev->family = AMDGPU_FAMILY_SI;
2532 r = si_set_ip_blocks(adev);
2533 if (r)
2534 return r;
2535 break;
2536 #endif
2537 #ifdef CONFIG_DRM_AMDGPU_CIK
2538 case CHIP_BONAIRE:
2539 case CHIP_HAWAII:
2540 case CHIP_KAVERI:
2541 case CHIP_KABINI:
2542 case CHIP_MULLINS:
2543 if (adev->flags & AMD_IS_APU)
2544 adev->family = AMDGPU_FAMILY_KV;
2545 else
2546 adev->family = AMDGPU_FAMILY_CI;
2547
2548 r = cik_set_ip_blocks(adev);
2549 if (r)
2550 return r;
2551 break;
2552 #endif
2553 case CHIP_TOPAZ:
2554 case CHIP_TONGA:
2555 case CHIP_FIJI:
2556 case CHIP_POLARIS10:
2557 case CHIP_POLARIS11:
2558 case CHIP_POLARIS12:
2559 case CHIP_VEGAM:
2560 case CHIP_CARRIZO:
2561 case CHIP_STONEY:
2562 if (adev->flags & AMD_IS_APU)
2563 adev->family = AMDGPU_FAMILY_CZ;
2564 else
2565 adev->family = AMDGPU_FAMILY_VI;
2566
2567 r = vi_set_ip_blocks(adev);
2568 if (r)
2569 return r;
2570 break;
2571 default:
2572 r = amdgpu_discovery_set_ip_blocks(adev);
2573 if (r)
2574 return r;
2575 break;
2576 }
2577
2578 if (amdgpu_has_atpx() &&
2579 (amdgpu_is_atpx_hybrid() ||
2580 amdgpu_has_atpx_dgpu_power_cntl()) &&
2581 ((adev->flags & AMD_IS_APU) == 0) &&
2582 !dev_is_removable(&adev->pdev->dev))
2583 adev->flags |= AMD_IS_PX;
2584
2585 if (!(adev->flags & AMD_IS_APU)) {
2586 #ifdef notyet
2587 parent = pcie_find_root_port(adev->pdev);
2588 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2589 #else
2590 adev->has_pr3 = false;
2591 #endif
2592 }
2593
2594
2595 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2596 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2597 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2598 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2599 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2600 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2601 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2602
2603 total = true;
2604 for (i = 0; i < adev->num_ip_blocks; i++) {
2605 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2606 DRM_WARN("disabled ip block: %d <%s>\n",
2607 i, adev->ip_blocks[i].version->funcs->name);
2608 adev->ip_blocks[i].status.valid = false;
2609 } else {
2610 if (adev->ip_blocks[i].version->funcs->early_init) {
2611 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2612 if (r == -ENOENT) {
2613 adev->ip_blocks[i].status.valid = false;
2614 } else if (r) {
2615 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2616 adev->ip_blocks[i].version->funcs->name, r);
2617 total = false;
2618 } else {
2619 adev->ip_blocks[i].status.valid = true;
2620 }
2621 } else {
2622 adev->ip_blocks[i].status.valid = true;
2623 }
2624 }
2625 /* get the vbios after the asic_funcs are set up */
2626 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2627 r = amdgpu_device_parse_gpu_info_fw(adev);
2628 if (r)
2629 return r;
2630
2631 /* Read BIOS */
2632 if (amdgpu_device_read_bios(adev)) {
2633 if (!amdgpu_get_bios(adev))
2634 return -EINVAL;
2635
2636 r = amdgpu_atombios_init(adev);
2637 if (r) {
2638 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2639 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2640 return r;
2641 }
2642 }
2643
2644 /*get pf2vf msg info at it's earliest time*/
2645 if (amdgpu_sriov_vf(adev))
2646 amdgpu_virt_init_data_exchange(adev);
2647
2648 }
2649 }
2650 if (!total)
2651 return -ENODEV;
2652
2653 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2654 if (ip_block->status.valid != false)
2655 amdgpu_amdkfd_device_probe(adev);
2656
2657 adev->cg_flags &= amdgpu_cg_mask;
2658 adev->pg_flags &= amdgpu_pg_mask;
2659
2660 return 0;
2661 }
2662
amdgpu_device_ip_hw_init_phase1(struct amdgpu_device * adev)2663 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2664 {
2665 int i, r;
2666
2667 for (i = 0; i < adev->num_ip_blocks; i++) {
2668 if (!adev->ip_blocks[i].status.sw)
2669 continue;
2670 if (adev->ip_blocks[i].status.hw)
2671 continue;
2672 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2673 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2674 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2675 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2676 if (r) {
2677 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2678 adev->ip_blocks[i].version->funcs->name, r);
2679 return r;
2680 }
2681 adev->ip_blocks[i].status.hw = true;
2682 }
2683 }
2684
2685 return 0;
2686 }
2687
amdgpu_device_ip_hw_init_phase2(struct amdgpu_device * adev)2688 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2689 {
2690 int i, r;
2691
2692 for (i = 0; i < adev->num_ip_blocks; i++) {
2693 if (!adev->ip_blocks[i].status.sw)
2694 continue;
2695 if (adev->ip_blocks[i].status.hw)
2696 continue;
2697 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2698 if (r) {
2699 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2700 adev->ip_blocks[i].version->funcs->name, r);
2701 return r;
2702 }
2703 adev->ip_blocks[i].status.hw = true;
2704 }
2705
2706 return 0;
2707 }
2708
amdgpu_device_fw_loading(struct amdgpu_device * adev)2709 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2710 {
2711 int r = 0;
2712 int i;
2713 uint32_t smu_version;
2714
2715 if (adev->asic_type >= CHIP_VEGA10) {
2716 for (i = 0; i < adev->num_ip_blocks; i++) {
2717 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2718 continue;
2719
2720 if (!adev->ip_blocks[i].status.sw)
2721 continue;
2722
2723 /* no need to do the fw loading again if already done*/
2724 if (adev->ip_blocks[i].status.hw == true)
2725 break;
2726
2727 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2728 r = adev->ip_blocks[i].version->funcs->resume(adev);
2729 if (r) {
2730 DRM_ERROR("resume of IP block <%s> failed %d\n",
2731 adev->ip_blocks[i].version->funcs->name, r);
2732 return r;
2733 }
2734 } else {
2735 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2736 if (r) {
2737 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2738 adev->ip_blocks[i].version->funcs->name, r);
2739 return r;
2740 }
2741 }
2742
2743 adev->ip_blocks[i].status.hw = true;
2744 break;
2745 }
2746 }
2747
2748 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2749 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2750
2751 return r;
2752 }
2753
amdgpu_device_init_schedulers(struct amdgpu_device * adev)2754 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2755 {
2756 long timeout;
2757 int r, i;
2758
2759 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2760 struct amdgpu_ring *ring = adev->rings[i];
2761
2762 /* No need to setup the GPU scheduler for rings that don't need it */
2763 if (!ring || ring->no_scheduler)
2764 continue;
2765
2766 switch (ring->funcs->type) {
2767 case AMDGPU_RING_TYPE_GFX:
2768 timeout = adev->gfx_timeout;
2769 break;
2770 case AMDGPU_RING_TYPE_COMPUTE:
2771 timeout = adev->compute_timeout;
2772 break;
2773 case AMDGPU_RING_TYPE_SDMA:
2774 timeout = adev->sdma_timeout;
2775 break;
2776 default:
2777 timeout = adev->video_timeout;
2778 break;
2779 }
2780
2781 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2782 DRM_SCHED_PRIORITY_COUNT,
2783 ring->num_hw_submission, 0,
2784 timeout, adev->reset_domain->wq,
2785 ring->sched_score, ring->name,
2786 adev->dev);
2787 if (r) {
2788 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2789 ring->name);
2790 return r;
2791 }
2792 r = amdgpu_uvd_entity_init(adev, ring);
2793 if (r) {
2794 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2795 ring->name);
2796 return r;
2797 }
2798 r = amdgpu_vce_entity_init(adev, ring);
2799 if (r) {
2800 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2801 ring->name);
2802 return r;
2803 }
2804 }
2805
2806 amdgpu_xcp_update_partition_sched_list(adev);
2807
2808 return 0;
2809 }
2810
2811
/**
 * amdgpu_device_ip_init - run init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
 * are run. sw_init initializes the software state associated with each IP
 * and hw_init initializes the hardware associated with each IP.
 *
 * The COMMON and GMC blocks get their hw_init immediately after their
 * sw_init; all remaining blocks are hardware-initialized afterwards in two
 * phases with firmware loading in between. No cleanup is performed here on
 * failure: the init_failed label only returns the error.
 *
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	/* sw_init for every valid block, with early hw_init for COMMON and GMC */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
		if (r) {
			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			goto init_failed;
		}
		adev->ip_blocks[i].status.sw = true;

		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* need to do common hw init early so everything is set up for gmc */
			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;
		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			/* need to do gmc hw init early so we can allocate gpu mem */
			/* Try to reserve bad pages early */
			if (amdgpu_sriov_vf(adev))
				amdgpu_virt_exchange_data(adev);

			r = amdgpu_device_mem_scratch_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (adev->gfx.mcbp) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM |
							       AMDGPU_GEM_DOMAIN_GTT,
							       AMDGPU_CSA_SIZE);
				if (r) {
					DRM_ERROR("allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}

			r = amdgpu_seq64_init(adev);
			if (r) {
				DRM_ERROR("allocate seq64 failed %d\n", r);
				goto init_failed;
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
	if (r)
		goto init_failed;

	/* Remaining hardware init: phase 1, firmware loading, then phase 2. */
	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * Retired pages will be loaded from eeprom and reserved here.
	 * This should be called after amdgpu_device_ip_hw_init_phase2 since
	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
	 * functional for I2C communication, which is only true at this point.
	 *
	 * amdgpu_ras_recovery_init may fail, but the upper level only cares
	 * about failures caused by a bad gpu and stops the amdgpu init
	 * process accordingly. For other failures, it will still release all
	 * the resources and print an error message, rather than returning a
	 * negative value to the upper level.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired pages from being used
	 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	/*
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if whole hive need to be reset during init */
	if (!adev->gmc.xgmi.pending_reset) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

	/* The success path falls through here with r == 0. */
init_failed:

	return r;
}
2985
2986 /**
2987 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2988 *
2989 * @adev: amdgpu_device pointer
2990 *
2991 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2992 * this function before a GPU reset. If the value is retained after a
2993 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents.
2994 */
amdgpu_device_fill_reset_magic(struct amdgpu_device * adev)2995 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2996 {
2997 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2998 }
2999
3000 /**
3001 * amdgpu_device_check_vram_lost - check if vram is valid
3002 *
3003 * @adev: amdgpu_device pointer
3004 *
3005 * Checks the reset magic value written to the gart pointer in VRAM.
3006 * The driver calls this after a GPU reset to see if the contents of
3007 * VRAM is lost or now.
3008 * returns true if vram is lost, false if not.
3009 */
amdgpu_device_check_vram_lost(struct amdgpu_device * adev)3010 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3011 {
3012 if (memcmp(adev->gart.ptr, adev->reset_magic,
3013 AMDGPU_RESET_MAGIC_NUM))
3014 return true;
3015
3016 if (!amdgpu_in_reset(adev))
3017 return false;
3018
3019 /*
3020 * For all ASICs with baco/mode1 reset, the VRAM is
3021 * always assumed to be lost.
3022 */
3023 switch (amdgpu_asic_reset_method(adev)) {
3024 case AMD_RESET_METHOD_BACO:
3025 case AMD_RESET_METHOD_MODE1:
3026 return true;
3027 default:
3028 return false;
3029 }
3030 }
3031
3032 /**
3033 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3034 *
3035 * @adev: amdgpu_device pointer
3036 * @state: clockgating state (gate or ungate)
3037 *
3038 * The list of all the hardware IPs that make up the asic is walked and the
3039 * set_clockgating_state callbacks are run.
3040 * Late initialization pass enabling clockgating for hardware IPs.
3041 * Fini or suspend, pass disabling clockgating for hardware IPs.
3042 * Returns 0 on success, negative error code on failure.
3043 */
3044
amdgpu_device_set_cg_state(struct amdgpu_device * adev,enum amd_clockgating_state state)3045 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3046 enum amd_clockgating_state state)
3047 {
3048 int i, j, r;
3049
3050 if (amdgpu_emu_mode == 1)
3051 return 0;
3052
3053 for (j = 0; j < adev->num_ip_blocks; j++) {
3054 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3055 if (!adev->ip_blocks[i].status.late_initialized)
3056 continue;
3057 /* skip CG for GFX, SDMA on S0ix */
3058 if (adev->in_s0ix &&
3059 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3060 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3061 continue;
3062 /* skip CG for VCE/UVD, it's handled specially */
3063 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3064 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3065 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3066 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3067 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3068 /* enable clockgating to save power */
3069 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
3070 state);
3071 if (r) {
3072 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3073 adev->ip_blocks[i].version->funcs->name, r);
3074 return r;
3075 }
3076 }
3077 }
3078
3079 return 0;
3080 }
3081
amdgpu_device_set_pg_state(struct amdgpu_device * adev,enum amd_powergating_state state)3082 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3083 enum amd_powergating_state state)
3084 {
3085 int i, j, r;
3086
3087 if (amdgpu_emu_mode == 1)
3088 return 0;
3089
3090 for (j = 0; j < adev->num_ip_blocks; j++) {
3091 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3092 if (!adev->ip_blocks[i].status.late_initialized)
3093 continue;
3094 /* skip PG for GFX, SDMA on S0ix */
3095 if (adev->in_s0ix &&
3096 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3097 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3098 continue;
3099 /* skip CG for VCE/UVD, it's handled specially */
3100 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3101 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3102 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3103 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3104 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3105 /* enable powergating to save power */
3106 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
3107 state);
3108 if (r) {
3109 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3110 adev->ip_blocks[i].version->funcs->name, r);
3111 return r;
3112 }
3113 }
3114 }
3115 return 0;
3116 }
3117
amdgpu_device_enable_mgpu_fan_boost(void)3118 static int amdgpu_device_enable_mgpu_fan_boost(void)
3119 {
3120 struct amdgpu_gpu_instance *gpu_ins;
3121 struct amdgpu_device *adev;
3122 int i, ret = 0;
3123
3124 mutex_lock(&mgpu_info.mutex);
3125
3126 /*
3127 * MGPU fan boost feature should be enabled
3128 * only when there are two or more dGPUs in
3129 * the system
3130 */
3131 if (mgpu_info.num_dgpu < 2)
3132 goto out;
3133
3134 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3135 gpu_ins = &(mgpu_info.gpu_ins[i]);
3136 adev = gpu_ins->adev;
3137 if (!(adev->flags & AMD_IS_APU) &&
3138 !gpu_ins->mgpu_fan_enabled) {
3139 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3140 if (ret)
3141 break;
3142
3143 gpu_ins->mgpu_fan_enabled = 1;
3144 }
3145 }
3146
3147 out:
3148 mutex_unlock(&mgpu_info.mutex);
3149
3150 return ret;
3151 }
3152
3153 /**
3154 * amdgpu_device_ip_late_init - run late init for hardware IPs
3155 *
3156 * @adev: amdgpu_device pointer
3157 *
3158 * Late initialization pass for hardware IPs. The list of all the hardware
3159 * IPs that make up the asic is walked and the late_init callbacks are run.
3160 * late_init covers any special initialization that an IP requires
3161 * after all of the have been initialized or something that needs to happen
3162 * late in the init process.
3163 * Returns 0 on success, negative error code on failure.
3164 */
amdgpu_device_ip_late_init(struct amdgpu_device * adev)3165 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3166 {
3167 struct amdgpu_gpu_instance *gpu_instance;
3168 int i = 0, r;
3169
3170 for (i = 0; i < adev->num_ip_blocks; i++) {
3171 if (!adev->ip_blocks[i].status.hw)
3172 continue;
3173 if (adev->ip_blocks[i].version->funcs->late_init) {
3174 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
3175 if (r) {
3176 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3177 adev->ip_blocks[i].version->funcs->name, r);
3178 return r;
3179 }
3180 }
3181 adev->ip_blocks[i].status.late_initialized = true;
3182 }
3183
3184 r = amdgpu_ras_late_init(adev);
3185 if (r) {
3186 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3187 return r;
3188 }
3189
3190 if (!amdgpu_in_reset(adev))
3191 amdgpu_ras_set_error_query_ready(adev, true);
3192
3193 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3194 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3195
3196 amdgpu_device_fill_reset_magic(adev);
3197
3198 r = amdgpu_device_enable_mgpu_fan_boost();
3199 if (r)
3200 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3201
3202 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
3203 if (amdgpu_passthrough(adev) &&
3204 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3205 adev->asic_type == CHIP_ALDEBARAN))
3206 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3207
3208 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3209 mutex_lock(&mgpu_info.mutex);
3210
3211 /*
3212 * Reset device p-state to low as this was booted with high.
3213 *
3214 * This should be performed only after all devices from the same
3215 * hive get initialized.
3216 *
3217 * However, it's unknown how many device in the hive in advance.
3218 * As this is counted one by one during devices initializations.
3219 *
3220 * So, we wait for all XGMI interlinked devices initialized.
3221 * This may bring some delays as those devices may come from
3222 * different hives. But that should be OK.
3223 */
3224 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3225 for (i = 0; i < mgpu_info.num_gpu; i++) {
3226 gpu_instance = &(mgpu_info.gpu_ins[i]);
3227 if (gpu_instance->adev->flags & AMD_IS_APU)
3228 continue;
3229
3230 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3231 AMDGPU_XGMI_PSTATE_MIN);
3232 if (r) {
3233 DRM_ERROR("pstate setting failed (%d).\n", r);
3234 break;
3235 }
3236 }
3237 }
3238
3239 mutex_unlock(&mgpu_info.mutex);
3240 }
3241
3242 return 0;
3243 }
3244
3245 /**
3246 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3247 *
3248 * @adev: amdgpu_device pointer
3249 *
3250 * For ASICs need to disable SMC first
3251 */
amdgpu_device_smu_fini_early(struct amdgpu_device * adev)3252 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3253 {
3254 int i, r;
3255
3256 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3257 return;
3258
3259 for (i = 0; i < adev->num_ip_blocks; i++) {
3260 if (!adev->ip_blocks[i].status.hw)
3261 continue;
3262 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3263 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3264 /* XXX handle errors */
3265 if (r) {
3266 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3267 adev->ip_blocks[i].version->funcs->name, r);
3268 }
3269 adev->ip_blocks[i].status.hw = false;
3270 break;
3271 }
3272 }
3273 }
3274
amdgpu_device_ip_fini_early(struct amdgpu_device * adev)3275 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3276 {
3277 int i, r;
3278
3279 for (i = 0; i < adev->num_ip_blocks; i++) {
3280 if (!adev->ip_blocks[i].version->funcs->early_fini)
3281 continue;
3282
3283 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
3284 if (r) {
3285 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3286 adev->ip_blocks[i].version->funcs->name, r);
3287 }
3288 }
3289
3290 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3291 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3292
3293 amdgpu_amdkfd_suspend(adev, false);
3294
3295 /* Workaroud for ASICs need to disable SMC first */
3296 amdgpu_device_smu_fini_early(adev);
3297
3298 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3299 if (!adev->ip_blocks[i].status.hw)
3300 continue;
3301
3302 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3303 /* XXX handle errors */
3304 if (r) {
3305 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3306 adev->ip_blocks[i].version->funcs->name, r);
3307 }
3308
3309 adev->ip_blocks[i].status.hw = false;
3310 }
3311
3312 if (amdgpu_sriov_vf(adev)) {
3313 if (amdgpu_virt_release_full_gpu(adev, false))
3314 DRM_ERROR("failed to release exclusive mode on fini\n");
3315 }
3316
3317 return 0;
3318 }
3319
3320 /**
3321 * amdgpu_device_ip_fini - run fini for hardware IPs
3322 *
3323 * @adev: amdgpu_device pointer
3324 *
3325 * Main teardown pass for hardware IPs. The list of all the hardware
3326 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3327 * are run. hw_fini tears down the hardware associated with each IP
3328 * and sw_fini tears down any software state associated with each IP.
3329 * Returns 0 on success, negative error code on failure.
3330 */
amdgpu_device_ip_fini(struct amdgpu_device * adev)3331 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3332 {
3333 int i, r;
3334
3335 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3336 amdgpu_virt_release_ras_err_handler_data(adev);
3337
3338 if (adev->gmc.xgmi.num_physical_nodes > 1)
3339 amdgpu_xgmi_remove_device(adev);
3340
3341 amdgpu_amdkfd_device_fini_sw(adev);
3342
3343 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3344 if (!adev->ip_blocks[i].status.sw)
3345 continue;
3346
3347 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3348 amdgpu_ucode_free_bo(adev);
3349 amdgpu_free_static_csa(&adev->virt.csa_obj);
3350 amdgpu_device_wb_fini(adev);
3351 amdgpu_device_mem_scratch_fini(adev);
3352 amdgpu_ib_pool_fini(adev);
3353 amdgpu_seq64_fini(adev);
3354 }
3355
3356 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
3357 /* XXX handle errors */
3358 if (r) {
3359 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3360 adev->ip_blocks[i].version->funcs->name, r);
3361 }
3362 adev->ip_blocks[i].status.sw = false;
3363 adev->ip_blocks[i].status.valid = false;
3364 }
3365
3366 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3367 if (!adev->ip_blocks[i].status.late_initialized)
3368 continue;
3369 if (adev->ip_blocks[i].version->funcs->late_fini)
3370 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
3371 adev->ip_blocks[i].status.late_initialized = false;
3372 }
3373
3374 amdgpu_ras_fini(adev);
3375
3376 return 0;
3377 }
3378
3379 /**
3380 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3381 *
3382 * @work: work_struct.
3383 */
amdgpu_device_delayed_init_work_handler(struct work_struct * work)3384 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3385 {
3386 struct amdgpu_device *adev =
3387 container_of(work, struct amdgpu_device, delayed_init_work.work);
3388 int r;
3389
3390 r = amdgpu_ib_ring_tests(adev);
3391 if (r)
3392 DRM_ERROR("ib ring test failed (%d).\n", r);
3393 }
3394
amdgpu_device_delay_enable_gfx_off(struct work_struct * work)3395 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3396 {
3397 struct amdgpu_device *adev =
3398 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3399
3400 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3401 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3402
3403 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
3404 adev->gfx.gfx_off_state = true;
3405 }
3406
3407 /**
3408 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3409 *
3410 * @adev: amdgpu_device pointer
3411 *
3412 * Main suspend function for hardware IPs. The list of all the hardware
3413 * IPs that make up the asic is walked, clockgating is disabled and the
3414 * suspend callbacks are run. suspend puts the hardware and software state
3415 * in each IP into a state suitable for suspend.
3416 * Returns 0 on success, negative error code on failure.
3417 */
amdgpu_device_ip_suspend_phase1(struct amdgpu_device * adev)3418 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3419 {
3420 int i, r;
3421
3422 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3423 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3424
3425 /*
3426 * Per PMFW team's suggestion, driver needs to handle gfxoff
3427 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
3428 * scenario. Add the missing df cstate disablement here.
3429 */
3430 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3431 dev_warn(adev->dev, "Failed to disallow df cstate");
3432
3433 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3434 if (!adev->ip_blocks[i].status.valid)
3435 continue;
3436
3437 /* displays are handled separately */
3438 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3439 continue;
3440
3441 /* XXX handle errors */
3442 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3443 /* XXX handle errors */
3444 if (r) {
3445 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3446 adev->ip_blocks[i].version->funcs->name, r);
3447 return r;
3448 }
3449
3450 adev->ip_blocks[i].status.hw = false;
3451 }
3452
3453 return 0;
3454 }
3455
3456 /**
3457 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3458 *
3459 * @adev: amdgpu_device pointer
3460 *
3461 * Main suspend function for hardware IPs. The list of all the hardware
3462 * IPs that make up the asic is walked, clockgating is disabled and the
3463 * suspend callbacks are run. suspend puts the hardware and software state
3464 * in each IP into a state suitable for suspend.
3465 * Returns 0 on success, negative error code on failure.
3466 */
amdgpu_device_ip_suspend_phase2(struct amdgpu_device * adev)3467 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3468 {
3469 int i, r;
3470
3471 if (adev->in_s0ix)
3472 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3473
3474 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3475 if (!adev->ip_blocks[i].status.valid)
3476 continue;
3477 /* displays are handled in phase1 */
3478 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3479 continue;
3480 /* PSP lost connection when err_event_athub occurs */
3481 if (amdgpu_ras_intr_triggered() &&
3482 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3483 adev->ip_blocks[i].status.hw = false;
3484 continue;
3485 }
3486
3487 /* skip unnecessary suspend if we do not initialize them yet */
3488 if (adev->gmc.xgmi.pending_reset &&
3489 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3490 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3491 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3492 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3493 adev->ip_blocks[i].status.hw = false;
3494 continue;
3495 }
3496
3497 /* skip suspend of gfx/mes and psp for S0ix
3498 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3499 * like at runtime. PSP is also part of the always on hardware
3500 * so no need to suspend it.
3501 */
3502 if (adev->in_s0ix &&
3503 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3504 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3505 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3506 continue;
3507
3508 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3509 if (adev->in_s0ix &&
3510 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3511 IP_VERSION(5, 0, 0)) &&
3512 (adev->ip_blocks[i].version->type ==
3513 AMD_IP_BLOCK_TYPE_SDMA))
3514 continue;
3515
3516 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
3517 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3518 * from this location and RLC Autoload automatically also gets loaded
3519 * from here based on PMFW -> PSP message during re-init sequence.
3520 * Therefore, the psp suspend & resume should be skipped to avoid destroy
3521 * the TMR and reload FWs again for IMU enabled APU ASICs.
3522 */
3523 if (amdgpu_in_reset(adev) &&
3524 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3525 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3526 continue;
3527
3528 /* XXX handle errors */
3529 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3530 /* XXX handle errors */
3531 if (r) {
3532 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3533 adev->ip_blocks[i].version->funcs->name, r);
3534 }
3535 adev->ip_blocks[i].status.hw = false;
3536 /* handle putting the SMC in the appropriate state */
3537 if (!amdgpu_sriov_vf(adev)) {
3538 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3539 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3540 if (r) {
3541 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3542 adev->mp1_state, r);
3543 return r;
3544 }
3545 }
3546 }
3547 }
3548
3549 return 0;
3550 }
3551
3552 /**
3553 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3554 *
3555 * @adev: amdgpu_device pointer
3556 *
3557 * Main suspend function for hardware IPs. The list of all the hardware
3558 * IPs that make up the asic is walked, clockgating is disabled and the
3559 * suspend callbacks are run. suspend puts the hardware and software state
3560 * in each IP into a state suitable for suspend.
3561 * Returns 0 on success, negative error code on failure.
3562 */
amdgpu_device_ip_suspend(struct amdgpu_device * adev)3563 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3564 {
3565 int r;
3566
3567 if (amdgpu_sriov_vf(adev)) {
3568 amdgpu_virt_fini_data_exchange(adev);
3569 amdgpu_virt_request_full_gpu(adev, false);
3570 }
3571
3572 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3573
3574 r = amdgpu_device_ip_suspend_phase1(adev);
3575 if (r)
3576 return r;
3577 r = amdgpu_device_ip_suspend_phase2(adev);
3578
3579 if (amdgpu_sriov_vf(adev))
3580 amdgpu_virt_release_full_gpu(adev, false);
3581
3582 return r;
3583 }
3584
amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device * adev)3585 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3586 {
3587 int i, r;
3588
3589 static enum amd_ip_block_type ip_order[] = {
3590 AMD_IP_BLOCK_TYPE_COMMON,
3591 AMD_IP_BLOCK_TYPE_GMC,
3592 AMD_IP_BLOCK_TYPE_PSP,
3593 AMD_IP_BLOCK_TYPE_IH,
3594 };
3595
3596 for (i = 0; i < adev->num_ip_blocks; i++) {
3597 int j;
3598 struct amdgpu_ip_block *block;
3599
3600 block = &adev->ip_blocks[i];
3601 block->status.hw = false;
3602
3603 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3604
3605 if (block->version->type != ip_order[j] ||
3606 !block->status.valid)
3607 continue;
3608
3609 r = block->version->funcs->hw_init(adev);
3610 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3611 if (r)
3612 return r;
3613 block->status.hw = true;
3614 }
3615 }
3616
3617 return 0;
3618 }
3619
amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device * adev)3620 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3621 {
3622 int i, r;
3623
3624 static enum amd_ip_block_type ip_order[] = {
3625 AMD_IP_BLOCK_TYPE_SMC,
3626 AMD_IP_BLOCK_TYPE_DCE,
3627 AMD_IP_BLOCK_TYPE_GFX,
3628 AMD_IP_BLOCK_TYPE_SDMA,
3629 AMD_IP_BLOCK_TYPE_MES,
3630 AMD_IP_BLOCK_TYPE_UVD,
3631 AMD_IP_BLOCK_TYPE_VCE,
3632 AMD_IP_BLOCK_TYPE_VCN,
3633 AMD_IP_BLOCK_TYPE_JPEG
3634 };
3635
3636 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3637 int j;
3638 struct amdgpu_ip_block *block;
3639
3640 for (j = 0; j < adev->num_ip_blocks; j++) {
3641 block = &adev->ip_blocks[j];
3642
3643 if (block->version->type != ip_order[i] ||
3644 !block->status.valid ||
3645 block->status.hw)
3646 continue;
3647
3648 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3649 r = block->version->funcs->resume(adev);
3650 else
3651 r = block->version->funcs->hw_init(adev);
3652
3653 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3654 if (r)
3655 return r;
3656 block->status.hw = true;
3657 }
3658 }
3659
3660 return 0;
3661 }
3662
3663 /**
3664 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3665 *
3666 * @adev: amdgpu_device pointer
3667 *
3668 * First resume function for hardware IPs. The list of all the hardware
3669 * IPs that make up the asic is walked and the resume callbacks are run for
3670 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3671 * after a suspend and updates the software state as necessary. This
3672 * function is also used for restoring the GPU after a GPU reset.
3673 * Returns 0 on success, negative error code on failure.
3674 */
amdgpu_device_ip_resume_phase1(struct amdgpu_device * adev)3675 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3676 {
3677 int i, r;
3678
3679 for (i = 0; i < adev->num_ip_blocks; i++) {
3680 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3681 continue;
3682 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3683 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3684 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3685 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3686
3687 r = adev->ip_blocks[i].version->funcs->resume(adev);
3688 if (r) {
3689 DRM_ERROR("resume of IP block <%s> failed %d\n",
3690 adev->ip_blocks[i].version->funcs->name, r);
3691 return r;
3692 }
3693 adev->ip_blocks[i].status.hw = true;
3694 }
3695 }
3696
3697 return 0;
3698 }
3699
3700 /**
3701 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3702 *
3703 * @adev: amdgpu_device pointer
3704 *
3705 * Second resume function for hardware IPs. The list of all the hardware
3706 * IPs that make up the asic is walked and the resume callbacks are run for
3707 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3708 * functional state after a suspend and updates the software state as
3709 * necessary. This function is also used for restoring the GPU after a GPU
3710 * reset.
3711 * Returns 0 on success, negative error code on failure.
3712 */
amdgpu_device_ip_resume_phase2(struct amdgpu_device * adev)3713 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3714 {
3715 int i, r;
3716
3717 for (i = 0; i < adev->num_ip_blocks; i++) {
3718 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3719 continue;
3720 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3721 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3722 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3723 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3724 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3725 continue;
3726 r = adev->ip_blocks[i].version->funcs->resume(adev);
3727 if (r) {
3728 DRM_ERROR("resume of IP block <%s> failed %d\n",
3729 adev->ip_blocks[i].version->funcs->name, r);
3730 return r;
3731 }
3732 adev->ip_blocks[i].status.hw = true;
3733 }
3734
3735 return 0;
3736 }
3737
3738 /**
3739 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3740 *
3741 * @adev: amdgpu_device pointer
3742 *
3743 * Third resume function for hardware IPs. The list of all the hardware
3744 * IPs that make up the asic is walked and the resume callbacks are run for
3745 * all DCE. resume puts the hardware into a functional state after a suspend
3746 * and updates the software state as necessary. This function is also used
3747 * for restoring the GPU after a GPU reset.
3748 *
3749 * Returns 0 on success, negative error code on failure.
3750 */
amdgpu_device_ip_resume_phase3(struct amdgpu_device * adev)3751 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3752 {
3753 int i, r;
3754
3755 for (i = 0; i < adev->num_ip_blocks; i++) {
3756 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3757 continue;
3758 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3759 r = adev->ip_blocks[i].version->funcs->resume(adev);
3760 if (r) {
3761 DRM_ERROR("resume of IP block <%s> failed %d\n",
3762 adev->ip_blocks[i].version->funcs->name, r);
3763 return r;
3764 }
3765 adev->ip_blocks[i].status.hw = true;
3766 }
3767 }
3768
3769 return 0;
3770 }
3771
3772 /**
3773 * amdgpu_device_ip_resume - run resume for hardware IPs
3774 *
3775 * @adev: amdgpu_device pointer
3776 *
3777 * Main resume function for hardware IPs. The hardware IPs
3778 * are split into two resume functions because they are
3779 * also used in recovering from a GPU reset and some additional
3780 * steps need to be take between them. In this case (S3/S4) they are
3781 * run sequentially.
3782 * Returns 0 on success, negative error code on failure.
3783 */
amdgpu_device_ip_resume(struct amdgpu_device * adev)3784 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3785 {
3786 int r;
3787
3788 r = amdgpu_device_ip_resume_phase1(adev);
3789 if (r)
3790 return r;
3791
3792 r = amdgpu_device_fw_loading(adev);
3793 if (r)
3794 return r;
3795
3796 r = amdgpu_device_ip_resume_phase2(adev);
3797
3798 if (adev->mman.buffer_funcs_ring->sched.ready)
3799 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3800
3801 if (r)
3802 return r;
3803
3804 amdgpu_fence_driver_hw_init(adev);
3805
3806 r = amdgpu_device_ip_resume_phase3(adev);
3807
3808 return r;
3809 }
3810
3811 /**
3812 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3813 *
3814 * @adev: amdgpu_device pointer
3815 *
3816 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3817 */
amdgpu_device_detect_sriov_bios(struct amdgpu_device * adev)3818 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3819 {
3820 if (amdgpu_sriov_vf(adev)) {
3821 if (adev->is_atom_fw) {
3822 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3823 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3824 } else {
3825 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3826 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3827 }
3828
3829 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3830 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3831 }
3832 }
3833
3834 /**
3835 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3836 *
3837 * @asic_type: AMD asic type
3838 *
3839 * Check if there is DC (new modesetting infrastructre) support for an asic.
3840 * returns true if DC has support, false if not.
3841 */
amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)3842 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3843 {
3844 switch (asic_type) {
3845 #ifdef CONFIG_DRM_AMDGPU_SI
3846 case CHIP_HAINAN:
3847 #endif
3848 case CHIP_TOPAZ:
3849 /* chips with no display hardware */
3850 return false;
3851 #if defined(CONFIG_DRM_AMD_DC)
3852 case CHIP_TAHITI:
3853 case CHIP_PITCAIRN:
3854 case CHIP_VERDE:
3855 case CHIP_OLAND:
3856 /*
3857 * We have systems in the wild with these ASICs that require
3858 * LVDS and VGA support which is not supported with DC.
3859 *
3860 * Fallback to the non-DC driver here by default so as not to
3861 * cause regressions.
3862 */
3863 #if defined(CONFIG_DRM_AMD_DC_SI)
3864 return amdgpu_dc > 0;
3865 #else
3866 return false;
3867 #endif
3868 case CHIP_BONAIRE:
3869 case CHIP_KAVERI:
3870 case CHIP_KABINI:
3871 case CHIP_MULLINS:
3872 /*
3873 * We have systems in the wild with these ASICs that require
3874 * VGA support which is not supported with DC.
3875 *
3876 * Fallback to the non-DC driver here by default so as not to
3877 * cause regressions.
3878 */
3879 return amdgpu_dc > 0;
3880 default:
3881 return amdgpu_dc != 0;
3882 #else
3883 default:
3884 if (amdgpu_dc > 0)
3885 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3886 return false;
3887 #endif
3888 }
3889 }
3890
3891 /**
3892 * amdgpu_device_has_dc_support - check if dc is supported
3893 *
3894 * @adev: amdgpu_device pointer
3895 *
3896 * Returns true for supported, false for not supported
3897 */
amdgpu_device_has_dc_support(struct amdgpu_device * adev)3898 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3899 {
3900 if (adev->enable_virtual_display ||
3901 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3902 return false;
3903
3904 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3905 }
3906
amdgpu_device_xgmi_reset_func(struct work_struct * __work)3907 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3908 {
3909 struct amdgpu_device *adev =
3910 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3911 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3912
3913 /* It's a bug to not have a hive within this function */
3914 if (WARN_ON(!hive))
3915 return;
3916
3917 /*
3918 * Use task barrier to synchronize all xgmi reset works across the
3919 * hive. task_barrier_enter and task_barrier_exit will block
3920 * until all the threads running the xgmi reset works reach
3921 * those points. task_barrier_full will do both blocks.
3922 */
3923 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3924
3925 task_barrier_enter(&hive->tb);
3926 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3927
3928 if (adev->asic_reset_res)
3929 goto fail;
3930
3931 task_barrier_exit(&hive->tb);
3932 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3933
3934 if (adev->asic_reset_res)
3935 goto fail;
3936
3937 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
3938 } else {
3939
3940 task_barrier_full(&hive->tb);
3941 adev->asic_reset_res = amdgpu_asic_reset(adev);
3942 }
3943
3944 fail:
3945 if (adev->asic_reset_res)
3946 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3947 adev->asic_reset_res, adev_to_drm(adev)->unique);
3948 amdgpu_put_xgmi_hive(hive);
3949 }
3950
amdgpu_device_get_job_timeout_settings(struct amdgpu_device * adev)3951 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3952 {
3953 char *input = amdgpu_lockup_timeout;
3954 char *timeout_setting = NULL;
3955 int index = 0;
3956 long timeout;
3957 int ret = 0;
3958
3959 /*
3960 * By default timeout for non compute jobs is 10000
3961 * and 60000 for compute jobs.
3962 * In SR-IOV or passthrough mode, timeout for compute
3963 * jobs are 60000 by default.
3964 */
3965 adev->gfx_timeout = msecs_to_jiffies(10000);
3966 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3967 if (amdgpu_sriov_vf(adev))
3968 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3969 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3970 else
3971 adev->compute_timeout = msecs_to_jiffies(60000);
3972
3973 #ifdef notyet
3974 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3975 while ((timeout_setting = strsep(&input, ",")) &&
3976 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3977 ret = kstrtol(timeout_setting, 0, &timeout);
3978 if (ret)
3979 return ret;
3980
3981 if (timeout == 0) {
3982 index++;
3983 continue;
3984 } else if (timeout < 0) {
3985 timeout = MAX_SCHEDULE_TIMEOUT;
3986 dev_warn(adev->dev, "lockup timeout disabled");
3987 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3988 } else {
3989 timeout = msecs_to_jiffies(timeout);
3990 }
3991
3992 switch (index++) {
3993 case 0:
3994 adev->gfx_timeout = timeout;
3995 break;
3996 case 1:
3997 adev->compute_timeout = timeout;
3998 break;
3999 case 2:
4000 adev->sdma_timeout = timeout;
4001 break;
4002 case 3:
4003 adev->video_timeout = timeout;
4004 break;
4005 default:
4006 break;
4007 }
4008 }
4009 /*
4010 * There is only one value specified and
4011 * it should apply to all non-compute jobs.
4012 */
4013 if (index == 1) {
4014 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4015 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4016 adev->compute_timeout = adev->gfx_timeout;
4017 }
4018 }
4019 #endif
4020
4021 return ret;
4022 }
4023
4024 /**
4025 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4026 *
4027 * @adev: amdgpu_device pointer
4028 *
4029 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
4030 */
amdgpu_device_check_iommu_direct_map(struct amdgpu_device * adev)4031 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4032 {
4033 #ifdef notyet
4034 struct iommu_domain *domain;
4035
4036 domain = iommu_get_domain_for_dev(adev->dev);
4037 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4038 #endif
4039 adev->ram_is_direct_mapped = true;
4040 }
4041
#if defined(CONFIG_HSA_AMD_P2P)
/**
 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true when the device has an IOMMU domain of type
 * IOMMU_DOMAIN_DMA or IOMMU_DOMAIN_DMA_FQ, false otherwise
 * (including when there is no domain at all).
 */
static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
{
	struct iommu_domain *dom = iommu_get_domain_for_dev(adev->dev);

	return dom && (dom->type == IOMMU_DOMAIN_DMA ||
		       dom->type == IOMMU_DOMAIN_DMA_FQ);
}
#endif
4062
/*
 * NULL-terminated list of device attributes.  The list is handed to
 * sysfs_create_files() in amdgpu_device_init() and to
 * sysfs_remove_files() in amdgpu_device_fini_hw().
 */
static const struct attribute *amdgpu_dev_attributes[] = {
	&dev_attr_pcie_replay_count.attr,
	NULL
};
4067
amdgpu_device_set_mcbp(struct amdgpu_device * adev)4068 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4069 {
4070 if (amdgpu_mcbp == 1)
4071 adev->gfx.mcbp = true;
4072 else if (amdgpu_mcbp == 0)
4073 adev->gfx.mcbp = false;
4074
4075 if (amdgpu_sriov_vf(adev))
4076 adev->gfx.mcbp = true;
4077
4078 if (adev->gfx.mcbp)
4079 DRM_INFO("MCBP is enabled\n");
4080 }
4081
4082 /**
4083 * amdgpu_device_init - initialize the driver
4084 *
4085 * @adev: amdgpu_device pointer
4086 * @flags: driver flags
4087 *
4088 * Initializes the driver info and hw (all asics).
4089 * Returns 0 for success or an error on failure.
4090 * Called at driver startup.
4091 */
amdgpu_device_init(struct amdgpu_device * adev,uint32_t flags)4092 int amdgpu_device_init(struct amdgpu_device *adev,
4093 uint32_t flags)
4094 {
4095 struct drm_device *ddev = adev_to_drm(adev);
4096 struct pci_dev *pdev = adev->pdev;
4097 int r, i;
4098 bool px = false;
4099 u32 max_MBps;
4100 int tmp;
4101
4102 adev->shutdown = false;
4103 adev->flags = flags;
4104
4105 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4106 adev->asic_type = amdgpu_force_asic_type;
4107 else
4108 adev->asic_type = flags & AMD_ASIC_MASK;
4109
4110 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4111 if (amdgpu_emu_mode == 1)
4112 adev->usec_timeout *= 10;
4113 adev->gmc.gart_size = 512 * 1024 * 1024;
4114 adev->accel_working = false;
4115 adev->num_rings = 0;
4116 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4117 adev->mman.buffer_funcs = NULL;
4118 adev->mman.buffer_funcs_ring = NULL;
4119 adev->vm_manager.vm_pte_funcs = NULL;
4120 adev->vm_manager.vm_pte_num_scheds = 0;
4121 adev->gmc.gmc_funcs = NULL;
4122 adev->harvest_ip_mask = 0x0;
4123 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4124 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4125
4126 adev->smc_rreg = &amdgpu_invalid_rreg;
4127 adev->smc_wreg = &amdgpu_invalid_wreg;
4128 adev->pcie_rreg = &amdgpu_invalid_rreg;
4129 adev->pcie_wreg = &amdgpu_invalid_wreg;
4130 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4131 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4132 adev->pciep_rreg = &amdgpu_invalid_rreg;
4133 adev->pciep_wreg = &amdgpu_invalid_wreg;
4134 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4135 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4136 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4137 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4138 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4139 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4140 adev->didt_rreg = &amdgpu_invalid_rreg;
4141 adev->didt_wreg = &amdgpu_invalid_wreg;
4142 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4143 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4144 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4145 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4146
4147 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4148 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4149 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4150
4151 /* mutex initialization are all done here so we
4152 * can recall function without having locking issues
4153 */
4154 rw_init(&adev->firmware.mutex, "agfw");
4155 rw_init(&adev->pm.mutex, "agpm");
4156 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk");
4157 rw_init(&adev->srbm_mutex, "srbm");
4158 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers");
4159 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff");
4160 rw_init(&adev->gfx.partition_mutex, "gfxpar");
4161 rw_init(&adev->grbm_idx_mutex, "grbmidx");
4162 rw_init(&adev->mn_lock, "agpumn");
4163 rw_init(&adev->virt.vf_errors.lock, "vferr");
4164 rw_init(&adev->virt.rlcg_reg_lock, "vrlcg");
4165 hash_init(adev->mn_hash);
4166 rw_init(&adev->psp.mutex, "agpsp");
4167 rw_init(&adev->notifier_lock, "agnf");
4168 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps");
4169 rw_init(&adev->benchmark_mutex, "agbm");
4170 rw_init(&adev->gfx.reset_sem_mutex, "agrsem");
4171 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4172 rw_init(&adev->enforce_isolation_mutex, "agim");
4173 rw_init(&adev->gfx.kfd_sch_mutex, "kfdsch");
4174
4175 amdgpu_device_init_apu_flags(adev);
4176
4177 r = amdgpu_device_check_arguments(adev);
4178 if (r)
4179 return r;
4180
4181 mtx_init(&adev->mmio_idx_lock, IPL_TTY);
4182 mtx_init(&adev->smc_idx_lock, IPL_TTY);
4183 mtx_init(&adev->pcie_idx_lock, IPL_TTY);
4184 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY);
4185 mtx_init(&adev->didt_idx_lock, IPL_TTY);
4186 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY);
4187 mtx_init(&adev->se_cac_idx_lock, IPL_TTY);
4188 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY);
4189 mtx_init(&adev->mm_stats.lock, IPL_NONE);
4190 mtx_init(&adev->wb.lock, IPL_TTY);
4191
4192 INIT_LIST_HEAD(&adev->reset_list);
4193
4194 INIT_LIST_HEAD(&adev->ras_list);
4195
4196 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4197
4198 INIT_DELAYED_WORK(&adev->delayed_init_work,
4199 amdgpu_device_delayed_init_work_handler);
4200 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4201 amdgpu_device_delay_enable_gfx_off);
4202 /*
4203 * Initialize the enforce_isolation work structures for each XCP
4204 * partition. This work handler is responsible for enforcing shader
4205 * isolation on AMD GPUs. It counts the number of emitted fences for
4206 * each GFX and compute ring. If there are any fences, it schedules
4207 * the `enforce_isolation_work` to be run after a delay. If there are
4208 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4209 * runqueue.
4210 */
4211 for (i = 0; i < MAX_XCP; i++) {
4212 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4213 amdgpu_gfx_enforce_isolation_handler);
4214 adev->gfx.enforce_isolation[i].adev = adev;
4215 adev->gfx.enforce_isolation[i].xcp_id = i;
4216 }
4217
4218 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4219
4220 adev->gfx.gfx_off_req_count = 1;
4221 adev->gfx.gfx_off_residency = 0;
4222 adev->gfx.gfx_off_entrycount = 0;
4223 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4224
4225 atomic_set(&adev->throttling_logging_enabled, 1);
4226 /*
4227 * If throttling continues, logging will be performed every minute
4228 * to avoid log flooding. "-1" is subtracted since the thermal
4229 * throttling interrupt comes every second. Thus, the total logging
4230 * interval is 59 seconds(retelimited printk interval) + 1(waiting
4231 * for throttling interrupt) = 60 seconds.
4232 */
4233 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4234 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4235
4236 #ifdef __linux__
4237 /* Registers mapping */
4238 /* TODO: block userspace mapping of io register */
4239 if (adev->asic_type >= CHIP_BONAIRE) {
4240 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4241 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4242 } else {
4243 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4244 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4245 }
4246 #endif
4247
4248 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4249 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4250
4251 #ifdef __linux__
4252 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4253 if (!adev->rmmio)
4254 return -ENOMEM;
4255 #endif
4256 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4257 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4258
4259 /*
4260 * Reset domain needs to be present early, before XGMI hive discovered
4261 * (if any) and intitialized to use reset sem and in_gpu reset flag
4262 * early on during init and before calling to RREG32.
4263 */
4264 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4265 if (!adev->reset_domain)
4266 return -ENOMEM;
4267
4268 /* detect hw virtualization here */
4269 amdgpu_detect_virtualization(adev);
4270
4271 amdgpu_device_get_pcie_info(adev);
4272
4273 r = amdgpu_device_get_job_timeout_settings(adev);
4274 if (r) {
4275 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4276 return r;
4277 }
4278
4279 amdgpu_device_set_mcbp(adev);
4280
4281 /* early init functions */
4282 r = amdgpu_device_ip_early_init(adev);
4283 if (r)
4284 return r;
4285
4286 /* Get rid of things like offb */
4287 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
4288 if (r)
4289 return r;
4290
4291 /* Enable TMZ based on IP_VERSION */
4292 amdgpu_gmc_tmz_set(adev);
4293
4294 if (amdgpu_sriov_vf(adev) &&
4295 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4296 /* VF MMIO access (except mailbox range) from CPU
4297 * will be blocked during sriov runtime
4298 */
4299 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4300
4301 amdgpu_gmc_noretry_set(adev);
4302 /* Need to get xgmi info early to decide the reset behavior*/
4303 if (adev->gmc.xgmi.supported) {
4304 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4305 if (r)
4306 return r;
4307 }
4308
4309 /* enable PCIE atomic ops */
4310 #ifdef notyet
4311 if (amdgpu_sriov_vf(adev)) {
4312 if (adev->virt.fw_reserve.p_pf2vf)
4313 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4314 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4315 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4316 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a
4317 * internal path natively support atomics, set have_atomics_support to true.
4318 */
4319 } else if ((adev->flags & AMD_IS_APU) &&
4320 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4321 IP_VERSION(9, 0, 0))) {
4322 adev->have_atomics_support = true;
4323 } else {
4324 adev->have_atomics_support =
4325 !pci_enable_atomic_ops_to_root(adev->pdev,
4326 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4327 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4328 }
4329
4330 if (!adev->have_atomics_support)
4331 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
4332 #else
4333 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a
4334 * internal path natively support atomics, set have_atomics_support to true.
4335 */
4336 if ((adev->flags & AMD_IS_APU) &&
4337 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)))
4338 adev->have_atomics_support = true;
4339 else
4340 adev->have_atomics_support = false;
4341 #endif
4342
4343 /* doorbell bar mapping and doorbell index init*/
4344 amdgpu_doorbell_init(adev);
4345
4346 if (amdgpu_emu_mode == 1) {
4347 /* post the asic on emulation mode */
4348 emu_soc_asic_init(adev);
4349 goto fence_driver_init;
4350 }
4351
4352 amdgpu_reset_init(adev);
4353
4354 /* detect if we are with an SRIOV vbios */
4355 if (adev->bios)
4356 amdgpu_device_detect_sriov_bios(adev);
4357
4358 /* check if we need to reset the asic
4359 * E.g., driver was not cleanly unloaded previously, etc.
4360 */
4361 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4362 if (adev->gmc.xgmi.num_physical_nodes) {
4363 dev_info(adev->dev, "Pending hive reset.\n");
4364 adev->gmc.xgmi.pending_reset = true;
4365 /* Only need to init necessary block for SMU to handle the reset */
4366 for (i = 0; i < adev->num_ip_blocks; i++) {
4367 if (!adev->ip_blocks[i].status.valid)
4368 continue;
4369 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
4370 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
4371 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
4372 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
4373 DRM_DEBUG("IP %s disabled for hw_init.\n",
4374 adev->ip_blocks[i].version->funcs->name);
4375 adev->ip_blocks[i].status.hw = true;
4376 }
4377 }
4378 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4379 !amdgpu_device_has_display_hardware(adev)) {
4380 r = psp_gpu_reset(adev);
4381 } else {
4382 tmp = amdgpu_reset_method;
4383 /* It should do a default reset when loading or reloading the driver,
4384 * regardless of the module parameter reset_method.
4385 */
4386 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4387 r = amdgpu_asic_reset(adev);
4388 amdgpu_reset_method = tmp;
4389 }
4390
4391 if (r) {
4392 dev_err(adev->dev, "asic reset on init failed\n");
4393 goto failed;
4394 }
4395 }
4396
4397 /* Post card if necessary */
4398 if (amdgpu_device_need_post(adev)) {
4399 if (!adev->bios) {
4400 dev_err(adev->dev, "no vBIOS found\n");
4401 r = -EINVAL;
4402 goto failed;
4403 }
4404 DRM_INFO("GPU posting now...\n");
4405 r = amdgpu_device_asic_init(adev);
4406 if (r) {
4407 dev_err(adev->dev, "gpu post error!\n");
4408 goto failed;
4409 }
4410 }
4411
4412 if (adev->bios) {
4413 if (adev->is_atom_fw) {
4414 /* Initialize clocks */
4415 r = amdgpu_atomfirmware_get_clock_info(adev);
4416 if (r) {
4417 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4418 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4419 goto failed;
4420 }
4421 } else {
4422 /* Initialize clocks */
4423 r = amdgpu_atombios_get_clock_info(adev);
4424 if (r) {
4425 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4426 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4427 goto failed;
4428 }
4429 /* init i2c buses */
4430 if (!amdgpu_device_has_dc_support(adev))
4431 amdgpu_atombios_i2c_init(adev);
4432 }
4433 }
4434
4435 fence_driver_init:
4436 /* Fence driver */
4437 r = amdgpu_fence_driver_sw_init(adev);
4438 if (r) {
4439 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4440 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4441 goto failed;
4442 }
4443
4444 /* init the mode config */
4445 drm_mode_config_init(adev_to_drm(adev));
4446
4447 r = amdgpu_device_ip_init(adev);
4448 if (r) {
4449 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4450 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4451 goto release_ras_con;
4452 }
4453
4454 amdgpu_fence_driver_hw_init(adev);
4455
4456 dev_info(adev->dev,
4457 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4458 adev->gfx.config.max_shader_engines,
4459 adev->gfx.config.max_sh_per_se,
4460 adev->gfx.config.max_cu_per_sh,
4461 adev->gfx.cu_info.number);
4462
4463 #ifdef __OpenBSD__
4464 {
4465 const char *chip_name;
4466 uint32_t version = adev->ip_versions[GC_HWIP][0];
4467 int maj, min, rev;
4468
4469 switch (adev->asic_type) {
4470 case CHIP_RAVEN:
4471 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
4472 chip_name = "RAVEN2";
4473 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
4474 chip_name = "PICASSO";
4475 else
4476 chip_name = "RAVEN";
4477 break;
4478 case CHIP_RENOIR:
4479 if (adev->apu_flags & AMD_APU_IS_RENOIR)
4480 chip_name = "RENOIR";
4481 else
4482 chip_name = "GREEN_SARDINE";
4483 break;
4484 default:
4485 chip_name = amdgpu_asic_name[adev->asic_type];
4486 }
4487
4488 printf("%s: %s", adev->self.dv_xname, chip_name);
4489 /* show graphics/compute ip block version, not set on < GFX9 */
4490 if (version) {
4491 maj = IP_VERSION_MAJ(version);
4492 min = IP_VERSION_MIN(version);
4493 rev = IP_VERSION_REV(version);
4494 printf(" GC %d.%d.%d", maj, min, rev);
4495 }
4496 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id);
4497 }
4498 #endif
4499
4500 adev->accel_working = true;
4501
4502 amdgpu_vm_check_compute_bug(adev);
4503
4504 /* Initialize the buffer migration limit. */
4505 if (amdgpu_moverate >= 0)
4506 max_MBps = amdgpu_moverate;
4507 else
4508 max_MBps = 8; /* Allow 8 MB/s. */
4509 /* Get a log2 for easy divisions. */
4510 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4511
4512 /*
4513 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4514 * Otherwise the mgpu fan boost feature will be skipped due to the
4515 * gpu instance is counted less.
4516 */
4517 amdgpu_register_gpu_instance(adev);
4518
4519 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4520 * explicit gating rather than handling it automatically.
4521 */
4522 if (!adev->gmc.xgmi.pending_reset) {
4523 r = amdgpu_device_ip_late_init(adev);
4524 if (r) {
4525 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4526 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4527 goto release_ras_con;
4528 }
4529 /* must succeed. */
4530 amdgpu_ras_resume(adev);
4531 queue_delayed_work(system_wq, &adev->delayed_init_work,
4532 msecs_to_jiffies(AMDGPU_RESUME_MS));
4533 }
4534
4535 if (amdgpu_sriov_vf(adev)) {
4536 amdgpu_virt_release_full_gpu(adev, true);
4537 flush_delayed_work(&adev->delayed_init_work);
4538 }
4539
4540 /*
4541 * Place those sysfs registering after `late_init`. As some of those
4542 * operations performed in `late_init` might affect the sysfs
4543 * interfaces creating.
4544 */
4545 r = amdgpu_atombios_sysfs_init(adev);
4546 if (r)
4547 drm_err(&adev->ddev,
4548 "registering atombios sysfs failed (%d).\n", r);
4549
4550 r = amdgpu_pm_sysfs_init(adev);
4551 if (r)
4552 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4553
4554 r = amdgpu_ucode_sysfs_init(adev);
4555 if (r) {
4556 adev->ucode_sysfs_en = false;
4557 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4558 } else
4559 adev->ucode_sysfs_en = true;
4560
4561 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4562 if (r)
4563 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4564
4565 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4566 if (r)
4567 dev_err(adev->dev,
4568 "Could not create amdgpu board attributes\n");
4569
4570 amdgpu_fru_sysfs_init(adev);
4571 amdgpu_reg_state_sysfs_init(adev);
4572
4573 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4574 r = amdgpu_pmu_init(adev);
4575 if (r)
4576 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4577
4578 /* Have stored pci confspace at hand for restore in sudden PCI error */
4579 if (amdgpu_device_cache_pci_state(adev->pdev))
4580 pci_restore_state(pdev);
4581
4582 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4583 /* this will fail for cards that aren't VGA class devices, just
4584 * ignore it
4585 */
4586 #ifdef notyet
4587 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4588 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4589 #endif
4590
4591 px = amdgpu_device_supports_px(ddev);
4592
4593 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4594 apple_gmux_detect(NULL, NULL)))
4595 vga_switcheroo_register_client(adev->pdev,
4596 &amdgpu_switcheroo_ops, px);
4597
4598 if (px)
4599 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4600
4601 if (adev->gmc.xgmi.pending_reset)
4602 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
4603 msecs_to_jiffies(AMDGPU_RESUME_MS));
4604
4605 amdgpu_device_check_iommu_direct_map(adev);
4606
4607 return 0;
4608
4609 release_ras_con:
4610 if (amdgpu_sriov_vf(adev))
4611 amdgpu_virt_release_full_gpu(adev, true);
4612
4613 /* failed in exclusive mode due to timeout */
4614 if (amdgpu_sriov_vf(adev) &&
4615 !amdgpu_sriov_runtime(adev) &&
4616 amdgpu_virt_mmio_blocked(adev) &&
4617 !amdgpu_virt_wait_reset(adev)) {
4618 dev_err(adev->dev, "VF exclusive mode timeout\n");
4619 /* Don't send request since VF is inactive. */
4620 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4621 adev->virt.ops = NULL;
4622 r = -EAGAIN;
4623 }
4624 amdgpu_release_ras_context(adev);
4625
4626 failed:
4627 amdgpu_vf_error_trans_all(adev);
4628
4629 return r;
4630 }
4631
amdgpu_device_unmap_mmio(struct amdgpu_device * adev)4632 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4633 {
4634 STUB();
4635 #ifdef notyet
4636
4637 /* Clear all CPU mappings pointing to this device */
4638 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4639 #endif
4640
4641 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4642 amdgpu_doorbell_fini(adev);
4643
4644 #ifdef __linux__
4645 iounmap(adev->rmmio);
4646 adev->rmmio = NULL;
4647 if (adev->mman.aper_base_kaddr)
4648 iounmap(adev->mman.aper_base_kaddr);
4649 adev->mman.aper_base_kaddr = NULL;
4650 #else
4651 if (adev->rmmio_size > 0)
4652 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4653 adev->rmmio_size);
4654 adev->rmmio_size = 0;
4655 adev->rmmio = NULL;
4656 if (adev->mman.aper_base_kaddr)
4657 bus_space_unmap(adev->memt, adev->mman.aper_bsh,
4658 adev->gmc.visible_vram_size);
4659 adev->mman.aper_base_kaddr = NULL;
4660 #endif
4661
4662 /* Memory manager related */
4663 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4664 #ifdef __linux__
4665 arch_phys_wc_del(adev->gmc.vram_mtrr);
4666 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4667 #else
4668 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
4669 #endif
4670 }
4671 }
4672
4673 /**
4674 * amdgpu_device_fini_hw - tear down the driver
4675 *
4676 * @adev: amdgpu_device pointer
4677 *
4678 * Tear down the driver info (all asics).
4679 * Called at driver shutdown.
4680 */
amdgpu_device_fini_hw(struct amdgpu_device * adev)4681 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4682 {
4683 dev_info(adev->dev, "amdgpu: finishing device.\n");
4684 flush_delayed_work(&adev->delayed_init_work);
4685
4686 if (adev->mman.initialized)
4687 drain_workqueue(adev->mman.bdev.wq);
4688 adev->shutdown = true;
4689
4690 /* make sure IB test finished before entering exclusive mode
4691 * to avoid preemption on IB test
4692 */
4693 if (amdgpu_sriov_vf(adev)) {
4694 amdgpu_virt_request_full_gpu(adev, false);
4695 amdgpu_virt_fini_data_exchange(adev);
4696 }
4697
4698 /* disable all interrupts */
4699 amdgpu_irq_disable_all(adev);
4700 if (adev->mode_info.mode_config_initialized) {
4701 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4702 drm_helper_force_disable_all(adev_to_drm(adev));
4703 else
4704 drm_atomic_helper_shutdown(adev_to_drm(adev));
4705 }
4706 amdgpu_fence_driver_hw_fini(adev);
4707
4708 if (adev->pm.sysfs_initialized)
4709 amdgpu_pm_sysfs_fini(adev);
4710 if (adev->ucode_sysfs_en)
4711 amdgpu_ucode_sysfs_fini(adev);
4712 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4713 amdgpu_fru_sysfs_fini(adev);
4714
4715 amdgpu_reg_state_sysfs_fini(adev);
4716
4717 /* disable ras feature must before hw fini */
4718 amdgpu_ras_pre_fini(adev);
4719
4720 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4721
4722 amdgpu_device_ip_fini_early(adev);
4723
4724 amdgpu_irq_fini_hw(adev);
4725
4726 if (adev->mman.initialized)
4727 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4728
4729 amdgpu_gart_dummy_page_fini(adev);
4730
4731 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4732 amdgpu_device_unmap_mmio(adev);
4733
4734 }
4735
amdgpu_device_fini_sw(struct amdgpu_device * adev)4736 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4737 {
4738 int idx;
4739 bool px;
4740
4741 amdgpu_device_ip_fini(adev);
4742 amdgpu_fence_driver_sw_fini(adev);
4743 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4744 adev->accel_working = false;
4745 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4746
4747 amdgpu_reset_fini(adev);
4748
4749 /* free i2c buses */
4750 if (!amdgpu_device_has_dc_support(adev))
4751 amdgpu_i2c_fini(adev);
4752
4753 if (amdgpu_emu_mode != 1)
4754 amdgpu_atombios_fini(adev);
4755
4756 kfree(adev->bios);
4757 adev->bios = NULL;
4758
4759 kfree(adev->fru_info);
4760 adev->fru_info = NULL;
4761
4762 px = amdgpu_device_supports_px(adev_to_drm(adev));
4763
4764 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4765 apple_gmux_detect(NULL, NULL)))
4766 vga_switcheroo_unregister_client(adev->pdev);
4767
4768 if (px)
4769 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4770
4771 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4772 vga_client_unregister(adev->pdev);
4773
4774 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4775 #ifdef __linux__
4776 iounmap(adev->rmmio);
4777 adev->rmmio = NULL;
4778 #else
4779 if (adev->rmmio_size > 0)
4780 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4781 adev->rmmio_size);
4782 adev->rmmio_size = 0;
4783 adev->rmmio = NULL;
4784 #endif
4785 amdgpu_doorbell_fini(adev);
4786 drm_dev_exit(idx);
4787 }
4788
4789 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4790 amdgpu_pmu_fini(adev);
4791 if (adev->mman.discovery_bin)
4792 amdgpu_discovery_fini(adev);
4793
4794 amdgpu_reset_put_reset_domain(adev->reset_domain);
4795 adev->reset_domain = NULL;
4796
4797 kfree(adev->pci_state);
4798
4799 }
4800
4801 /**
4802 * amdgpu_device_evict_resources - evict device resources
4803 * @adev: amdgpu device object
4804 *
4805 * Evicts all ttm device resources(vram BOs, gart table) from the lru list
4806 * of the vram memory type. Mainly used for evicting device resources
4807 * at suspend time.
4808 *
4809 */
amdgpu_device_evict_resources(struct amdgpu_device * adev)4810 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4811 {
4812 int ret;
4813
4814 /* No need to evict vram on APUs for suspend to ram or s2idle */
4815 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4816 return 0;
4817
4818 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4819 if (ret)
4820 DRM_WARN("evicting device resources failed\n");
4821 return ret;
4822 }
4823
4824 /*
4825 * Suspend & resume.
4826 */
4827 /**
4828 * amdgpu_device_prepare - prepare for device suspend
4829 *
4830 * @dev: drm dev pointer
4831 *
4832 * Prepare to put the hw in the suspend state (all asics).
4833 * Returns 0 for success or an error on failure.
4834 * Called at driver suspend.
4835 */
amdgpu_device_prepare(struct drm_device * dev)4836 int amdgpu_device_prepare(struct drm_device *dev)
4837 {
4838 struct amdgpu_device *adev = drm_to_adev(dev);
4839 int i, r;
4840
4841 amdgpu_choose_low_power_state(adev);
4842
4843 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4844 return 0;
4845
4846 /* Evict the majority of BOs before starting suspend sequence */
4847 r = amdgpu_device_evict_resources(adev);
4848 if (r)
4849 goto unprepare;
4850
4851 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4852
4853 for (i = 0; i < adev->num_ip_blocks; i++) {
4854 if (!adev->ip_blocks[i].status.valid)
4855 continue;
4856 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4857 continue;
4858 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4859 if (r)
4860 goto unprepare;
4861 }
4862
4863 return 0;
4864
4865 unprepare:
4866 adev->in_s0ix = adev->in_s3 = false;
4867
4868 return r;
4869 }
4870
4871 /**
4872 * amdgpu_device_suspend - initiate device suspend
4873 *
4874 * @dev: drm dev pointer
4875 * @fbcon : notify the fbdev of suspend
4876 *
4877 * Puts the hw in the suspend state (all asics).
4878 * Returns 0 for success or an error on failure.
4879 * Called at driver suspend.
4880 */
amdgpu_device_suspend(struct drm_device * dev,bool fbcon)4881 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4882 {
4883 struct amdgpu_device *adev = drm_to_adev(dev);
4884 int r = 0;
4885
4886 if (adev->shutdown)
4887 return 0;
4888
4889 #ifdef notyet
4890 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4891 return 0;
4892 #endif
4893
4894 adev->in_suspend = true;
4895
4896 if (amdgpu_sriov_vf(adev)) {
4897 amdgpu_virt_fini_data_exchange(adev);
4898 r = amdgpu_virt_request_full_gpu(adev, false);
4899 if (r)
4900 return r;
4901 }
4902
4903 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4904 DRM_WARN("smart shift update failed\n");
4905
4906 if (fbcon)
4907 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4908
4909 cancel_delayed_work_sync(&adev->delayed_init_work);
4910
4911 amdgpu_ras_suspend(adev);
4912
4913 amdgpu_device_ip_suspend_phase1(adev);
4914
4915 if (!adev->in_s0ix)
4916 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4917
4918 r = amdgpu_device_evict_resources(adev);
4919 if (r)
4920 return r;
4921
4922 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4923
4924 amdgpu_fence_driver_hw_fini(adev);
4925
4926 amdgpu_device_ip_suspend_phase2(adev);
4927
4928 if (amdgpu_sriov_vf(adev))
4929 amdgpu_virt_release_full_gpu(adev, false);
4930
4931 r = amdgpu_dpm_notify_rlc_state(adev, false);
4932 if (r)
4933 return r;
4934
4935 return 0;
4936 }
4937
4938 /**
4939 * amdgpu_device_resume - initiate device resume
4940 *
4941 * @dev: drm dev pointer
4942 * @fbcon : notify the fbdev of resume
4943 *
4944 * Bring the hw back to operating state (all asics).
4945 * Returns 0 for success or an error on failure.
4946 * Called at driver resume.
4947 */
amdgpu_device_resume(struct drm_device * dev,bool fbcon)4948 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4949 {
4950 struct amdgpu_device *adev = drm_to_adev(dev);
4951 int r = 0;
4952
4953 if (amdgpu_sriov_vf(adev)) {
4954 r = amdgpu_virt_request_full_gpu(adev, true);
4955 if (r)
4956 return r;
4957 }
4958
4959 #ifdef notyet
4960 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4961 return 0;
4962 #endif
4963
4964 if (adev->in_s0ix)
4965 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4966
4967 /* post card */
4968 if (amdgpu_device_need_post(adev)) {
4969 r = amdgpu_device_asic_init(adev);
4970 if (r)
4971 dev_err(adev->dev, "amdgpu asic init failed\n");
4972 }
4973
4974 r = amdgpu_device_ip_resume(adev);
4975
4976 if (r) {
4977 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4978 goto exit;
4979 }
4980
4981 if (!adev->in_s0ix) {
4982 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4983 if (r)
4984 goto exit;
4985 }
4986
4987 r = amdgpu_device_ip_late_init(adev);
4988 if (r)
4989 goto exit;
4990
4991 queue_delayed_work(system_wq, &adev->delayed_init_work,
4992 msecs_to_jiffies(AMDGPU_RESUME_MS));
4993 exit:
4994 if (amdgpu_sriov_vf(adev)) {
4995 amdgpu_virt_init_data_exchange(adev);
4996 amdgpu_virt_release_full_gpu(adev, true);
4997 }
4998
4999 if (r)
5000 return r;
5001
5002 /* Make sure IB tests flushed */
5003 flush_delayed_work(&adev->delayed_init_work);
5004
5005 if (fbcon)
5006 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
5007
5008 amdgpu_ras_resume(adev);
5009
5010 if (adev->mode_info.num_crtc) {
5011 /*
5012 * Most of the connector probing functions try to acquire runtime pm
5013 * refs to ensure that the GPU is powered on when connector polling is
5014 * performed. Since we're calling this from a runtime PM callback,
5015 * trying to acquire rpm refs will cause us to deadlock.
5016 *
5017 * Since we're guaranteed to be holding the rpm lock, it's safe to
5018 * temporarily disable the rpm helpers so this doesn't deadlock us.
5019 */
5020 #if defined(CONFIG_PM) && defined(__linux__)
5021 dev->dev->power.disable_depth++;
5022 #endif
5023 if (!adev->dc_enabled)
5024 drm_helper_hpd_irq_event(dev);
5025 else
5026 drm_kms_helper_hotplug_event(dev);
5027 #if defined(CONFIG_PM) && defined(__linux__)
5028 dev->dev->power.disable_depth--;
5029 #endif
5030 }
5031 adev->in_suspend = false;
5032
5033 if (adev->enable_mes)
5034 amdgpu_mes_self_test(adev);
5035
5036 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
5037 DRM_WARN("smart shift update failed\n");
5038
5039 return 0;
5040 }
5041
5042 /**
5043 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5044 *
5045 * @adev: amdgpu_device pointer
5046 *
5047 * The list of all the hardware IPs that make up the asic is walked and
5048 * the check_soft_reset callbacks are run. check_soft_reset determines
5049 * if the asic is still hung or not.
5050 * Returns true if any of the IPs are still in a hung state, false if not.
5051 */
amdgpu_device_ip_check_soft_reset(struct amdgpu_device * adev)5052 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5053 {
5054 int i;
5055 bool asic_hang = false;
5056
5057 if (amdgpu_sriov_vf(adev))
5058 return true;
5059
5060 if (amdgpu_asic_need_full_reset(adev))
5061 return true;
5062
5063 for (i = 0; i < adev->num_ip_blocks; i++) {
5064 if (!adev->ip_blocks[i].status.valid)
5065 continue;
5066 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5067 adev->ip_blocks[i].status.hang =
5068 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
5069 if (adev->ip_blocks[i].status.hang) {
5070 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5071 asic_hang = true;
5072 }
5073 }
5074 return asic_hang;
5075 }
5076
5077 /**
5078 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5079 *
5080 * @adev: amdgpu_device pointer
5081 *
5082 * The list of all the hardware IPs that make up the asic is walked and the
5083 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5084 * handles any IP specific hardware or software state changes that are
5085 * necessary for a soft reset to succeed.
5086 * Returns 0 on success, negative error code on failure.
5087 */
amdgpu_device_ip_pre_soft_reset(struct amdgpu_device * adev)5088 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5089 {
5090 int i, r = 0;
5091
5092 for (i = 0; i < adev->num_ip_blocks; i++) {
5093 if (!adev->ip_blocks[i].status.valid)
5094 continue;
5095 if (adev->ip_blocks[i].status.hang &&
5096 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5097 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
5098 if (r)
5099 return r;
5100 }
5101 }
5102
5103 return 0;
5104 }
5105
5106 /**
5107 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5108 *
5109 * @adev: amdgpu_device pointer
5110 *
5111 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5112 * reset is necessary to recover.
5113 * Returns true if a full asic reset is required, false if not.
5114 */
amdgpu_device_ip_need_full_reset(struct amdgpu_device * adev)5115 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5116 {
5117 int i;
5118
5119 if (amdgpu_asic_need_full_reset(adev))
5120 return true;
5121
5122 for (i = 0; i < adev->num_ip_blocks; i++) {
5123 if (!adev->ip_blocks[i].status.valid)
5124 continue;
5125 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5126 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5127 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5128 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5129 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5130 if (adev->ip_blocks[i].status.hang) {
5131 dev_info(adev->dev, "Some block need full reset!\n");
5132 return true;
5133 }
5134 }
5135 }
5136 return false;
5137 }
5138
5139 /**
5140 * amdgpu_device_ip_soft_reset - do a soft reset
5141 *
5142 * @adev: amdgpu_device pointer
5143 *
5144 * The list of all the hardware IPs that make up the asic is walked and the
5145 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5146 * IP specific hardware or software state changes that are necessary to soft
5147 * reset the IP.
5148 * Returns 0 on success, negative error code on failure.
5149 */
amdgpu_device_ip_soft_reset(struct amdgpu_device * adev)5150 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5151 {
5152 int i, r = 0;
5153
5154 for (i = 0; i < adev->num_ip_blocks; i++) {
5155 if (!adev->ip_blocks[i].status.valid)
5156 continue;
5157 if (adev->ip_blocks[i].status.hang &&
5158 adev->ip_blocks[i].version->funcs->soft_reset) {
5159 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
5160 if (r)
5161 return r;
5162 }
5163 }
5164
5165 return 0;
5166 }
5167
5168 /**
5169 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5170 *
5171 * @adev: amdgpu_device pointer
5172 *
5173 * The list of all the hardware IPs that make up the asic is walked and the
5174 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5175 * handles any IP specific hardware or software state changes that are
5176 * necessary after the IP has been soft reset.
5177 * Returns 0 on success, negative error code on failure.
5178 */
amdgpu_device_ip_post_soft_reset(struct amdgpu_device * adev)5179 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5180 {
5181 int i, r = 0;
5182
5183 for (i = 0; i < adev->num_ip_blocks; i++) {
5184 if (!adev->ip_blocks[i].status.valid)
5185 continue;
5186 if (adev->ip_blocks[i].status.hang &&
5187 adev->ip_blocks[i].version->funcs->post_soft_reset)
5188 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
5189 if (r)
5190 goto unprepare;
5191 }
5192
5193 return 0;
5194
5195 unprepare:
5196 adev->in_s0ix = adev->in_s3 = false;
5197
5198 return r;
5199 }
5200
5201 /**
5202 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5203 *
5204 * @adev: amdgpu_device pointer
5205 * @reset_context: amdgpu reset context pointer
5206 *
5207 * do VF FLR and reinitialize Asic
5208 * return 0 means succeeded otherwise failed
5209 */
amdgpu_device_reset_sriov(struct amdgpu_device * adev,struct amdgpu_reset_context * reset_context)5210 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5211 struct amdgpu_reset_context *reset_context)
5212 {
5213 int r;
5214 struct amdgpu_hive_info *hive = NULL;
5215
5216 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5217 if (!amdgpu_ras_get_fed_status(adev))
5218 amdgpu_virt_ready_to_reset(adev);
5219 amdgpu_virt_wait_reset(adev);
5220 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5221 r = amdgpu_virt_request_full_gpu(adev, true);
5222 } else {
5223 r = amdgpu_virt_reset_gpu(adev);
5224 }
5225 if (r)
5226 return r;
5227
5228 amdgpu_ras_set_fed(adev, false);
5229 amdgpu_irq_gpu_reset_resume_helper(adev);
5230
5231 /* some sw clean up VF needs to do before recover */
5232 amdgpu_virt_post_reset(adev);
5233
5234 /* Resume IP prior to SMC */
5235 r = amdgpu_device_ip_reinit_early_sriov(adev);
5236 if (r)
5237 return r;
5238
5239 amdgpu_virt_init_data_exchange(adev);
5240
5241 r = amdgpu_device_fw_loading(adev);
5242 if (r)
5243 return r;
5244
5245 /* now we are okay to resume SMC/CP/SDMA */
5246 r = amdgpu_device_ip_reinit_late_sriov(adev);
5247 if (r)
5248 return r;
5249
5250 hive = amdgpu_get_xgmi_hive(adev);
5251 /* Update PSP FW topology after reset */
5252 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5253 r = amdgpu_xgmi_update_topology(hive, adev);
5254 if (hive)
5255 amdgpu_put_xgmi_hive(hive);
5256 if (r)
5257 return r;
5258
5259 r = amdgpu_ib_ring_tests(adev);
5260 if (r)
5261 return r;
5262
5263 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5264 amdgpu_inc_vram_lost(adev);
5265
5266 /* need to be called during full access so we can't do it later like
5267 * bare-metal does.
5268 */
5269 amdgpu_amdkfd_post_reset(adev);
5270 amdgpu_virt_release_full_gpu(adev, true);
5271
5272 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5273 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5274 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5275 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5276 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5277 amdgpu_ras_resume(adev);
5278 return 0;
5279 }
5280
5281 /**
5282 * amdgpu_device_has_job_running - check if there is any job in mirror list
5283 *
5284 * @adev: amdgpu_device pointer
5285 *
5286 * check if there is any job in mirror list
5287 */
amdgpu_device_has_job_running(struct amdgpu_device * adev)5288 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5289 {
5290 int i;
5291 struct drm_sched_job *job;
5292
5293 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5294 struct amdgpu_ring *ring = adev->rings[i];
5295
5296 if (!amdgpu_ring_sched_ready(ring))
5297 continue;
5298
5299 spin_lock(&ring->sched.job_list_lock);
5300 job = list_first_entry_or_null(&ring->sched.pending_list,
5301 struct drm_sched_job, list);
5302 spin_unlock(&ring->sched.job_list_lock);
5303 if (job)
5304 return true;
5305 }
5306 return false;
5307 }
5308
5309 /**
5310 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5311 *
5312 * @adev: amdgpu_device pointer
5313 *
5314 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5315 * a hung GPU.
5316 */
amdgpu_device_should_recover_gpu(struct amdgpu_device * adev)5317 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5318 {
5319
5320 if (amdgpu_gpu_recovery == 0)
5321 goto disabled;
5322
5323 /* Skip soft reset check in fatal error mode */
5324 if (!amdgpu_ras_is_poison_mode_supported(adev))
5325 return true;
5326
5327 if (amdgpu_sriov_vf(adev))
5328 return true;
5329
5330 if (amdgpu_gpu_recovery == -1) {
5331 switch (adev->asic_type) {
5332 #ifdef CONFIG_DRM_AMDGPU_SI
5333 case CHIP_VERDE:
5334 case CHIP_TAHITI:
5335 case CHIP_PITCAIRN:
5336 case CHIP_OLAND:
5337 case CHIP_HAINAN:
5338 #endif
5339 #ifdef CONFIG_DRM_AMDGPU_CIK
5340 case CHIP_KAVERI:
5341 case CHIP_KABINI:
5342 case CHIP_MULLINS:
5343 #endif
5344 case CHIP_CARRIZO:
5345 case CHIP_STONEY:
5346 case CHIP_CYAN_SKILLFISH:
5347 goto disabled;
5348 default:
5349 break;
5350 }
5351 }
5352
5353 return true;
5354
5355 disabled:
5356 dev_info(adev->dev, "GPU recovery disabled.\n");
5357 return false;
5358 }
5359
amdgpu_device_mode1_reset(struct amdgpu_device * adev)5360 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5361 {
5362 u32 i;
5363 int ret = 0;
5364
5365 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5366
5367 dev_info(adev->dev, "GPU mode1 reset\n");
5368
5369 /* Cache the state before bus master disable. The saved config space
5370 * values are used in other cases like restore after mode-2 reset.
5371 */
5372 amdgpu_device_cache_pci_state(adev->pdev);
5373
5374 /* disable BM */
5375 pci_clear_master(adev->pdev);
5376
5377 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5378 dev_info(adev->dev, "GPU smu mode1 reset\n");
5379 ret = amdgpu_dpm_mode1_reset(adev);
5380 } else {
5381 dev_info(adev->dev, "GPU psp mode1 reset\n");
5382 ret = psp_gpu_reset(adev);
5383 }
5384
5385 if (ret)
5386 goto mode1_reset_failed;
5387
5388 amdgpu_device_load_pci_state(adev->pdev);
5389 ret = amdgpu_psp_wait_for_bootloader(adev);
5390 if (ret)
5391 goto mode1_reset_failed;
5392
5393 /* wait for asic to come out of reset */
5394 for (i = 0; i < adev->usec_timeout; i++) {
5395 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5396
5397 if (memsize != 0xffffffff)
5398 break;
5399 udelay(1);
5400 }
5401
5402 if (i >= adev->usec_timeout) {
5403 ret = -ETIMEDOUT;
5404 goto mode1_reset_failed;
5405 }
5406
5407 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5408
5409 return 0;
5410
5411 mode1_reset_failed:
5412 dev_err(adev->dev, "GPU mode1 reset failed\n");
5413 return ret;
5414 }
5415
/**
 * amdgpu_device_pre_asic_reset - per-device preparation before an asic reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Force-completes the hardware fences of every ready ring, bumps the karma of
 * the guilty job (when this device requested the reset) and then prepares the
 * hardware context. If a reset handler implements the prepare step, its
 * result is returned directly. Otherwise, on bare metal, a soft reset is
 * attempted when possible, the IP state is dumped, and the IPs are suspended
 * if a full reset is needed. AMDGPU_NEED_FULL_RESET in @reset_context is
 * updated to reflect the final decision.
 * Returns 0 on success or an error code.
 */
int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
				 struct amdgpu_reset_context *reset_context)
{
	int i, r = 0;
	struct amdgpu_job *job = NULL;
	/* Device that requested the reset; used below for the IP state dump. */
	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
	bool need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/* The job only matters for the device that requested the reset. */
	if (reset_context->reset_req_dev == adev)
		job = reset_context->job;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_pre_reset(adev);

	amdgpu_fence_driver_isr_toggle(adev, true);

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		/* Clear job fence from fence drv to avoid force_completion
		 * leave NULL and vm flush fence in fence drv
		 */
		amdgpu_fence_driver_clear_job_fences(ring);

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	amdgpu_fence_driver_isr_toggle(adev, false);

	if (job && job->vm)
		drm_sched_increase_karma(&job->base);

	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	/* Note: any other result, including success (0), returns right here. */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		/*
		 * Try a soft reset first; fall back to a full reset when the
		 * soft reset fails or a block is still reported as hung.
		 */
		if (!need_full_reset && amdgpu_gpu_recovery &&
		    amdgpu_device_ip_check_soft_reset(adev)) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		/* The dump walks the requesting device (tmp_adev), not adev. */
		if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
			dev_info(tmp_adev->dev, "Dumping IP State\n");
			/* Trigger ip dump before we reset the asic */
			for (i = 0; i < tmp_adev->num_ip_blocks; i++)
				if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
					tmp_adev->ip_blocks[i].version->funcs
						->dump_ip_state((void *)tmp_adev);
			dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
		}

		/* Suspend the IPs for a full reset and publish the decision. */
		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);
		if (need_full_reset)
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
		else
			clear_bit(AMDGPU_NEED_FULL_RESET,
				  &reset_context->flags);
	}

	return r;
}
5499
/**
 * amdgpu_do_asic_reset - reset and re-initialize every device on a reset list
 *
 * @device_list_handle: list of amdgpu devices (linked through reset_list)
 * @reset_context: amdgpu reset context pointer
 *
 * First offers the reset to the reset handler of the first device on the
 * list; if that handler is implemented its result is returned directly.
 * Otherwise the default method is used: when a full reset is needed and the
 * HW reset is not skipped, every device is reset (XGMI nodes through queued
 * work items that are flushed afterwards), and then every device is brought
 * back up and IB-tested. AMDGPU_NEED_FULL_RESET in @reset_context is updated
 * before returning through the "end" label.
 * Returns 0 on success, -EAGAIN when the IB ring tests fail (a full reset is
 * then flagged), or another error code.
 */
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
			 struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset, skip_hw_reset, vram_lost = false;
	int r = 0;

	/* Try reset handler method first */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);

	reset_context->reset_device_list = device_list_handle;
	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Reset handler not implemented, use the default method */
	need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				tmp_adev->gmc.xgmi.pending_reset = false;
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			/*
			 * NOTE(review): this jumps to the "out" label, which
			 * sits inside the body of the re-init loop further
			 * down, with tmp_adev still pointing at the device
			 * that failed here.
			 */
			if (r) {
				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
					 r, adev_to_drm(tmp_adev)->unique);
				goto out;
			}
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	/* After a RAS interrupt, clear the MMHUB error counts on every device. */
	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
		}

		amdgpu_ras_intr_cleared();
	}

	/* Bring every device back up; the IB tests run at the "out" label. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (need_full_reset) {
			/* post card */
			amdgpu_ras_set_fed(tmp_adev, false);
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);

				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				/*
				 * NOTE(review): unlike the other failures in
				 * this loop, this returns directly, so neither
				 * asic_reset_res nor AMDGPU_NEED_FULL_RESET is
				 * updated on this path - confirm intended.
				 */
				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_xcp_restore_partition_mode(
					tmp_adev->xcp_mgr);
				if (r)
					goto out;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);

				r = amdgpu_device_ip_resume_phase3(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked as reset was already
				 * complete successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);

				/*
				 * The GPU enters bad state once faulty pages
				 * by ECC has reached the threshold, and ras
				 * recovery is scheduled next. So add one check
				 * here to break recovery if it indeed exceeds
				 * bad page threshold, and remind user to
				 * retire this GPU or setting one bigger
				 * bad_page_threshold value to fix this once
				 * probing driver again.
				 */
				if (!amdgpu_ras_is_rma(tmp_adev)) {
					/* must succeed. */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(
						reset_context->hive, tmp_adev);
			}
		}

out:
		/*
		 * On success so far, resume interrupts and run the IB tests.
		 * A test failure requests another pass with a full reset.
		 */
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		/* Record the failure on the device; the loop keeps going. */
		if (r)
			tmp_adev->asic_reset_res = r;
	}

end:
	/* Publish the final full-reset decision back into the context. */
	if (need_full_reset)
		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	else
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	return r;
}
5678
amdgpu_device_set_mp1_state(struct amdgpu_device * adev)5679 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5680 {
5681
5682 switch (amdgpu_asic_reset_method(adev)) {
5683 case AMD_RESET_METHOD_MODE1:
5684 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5685 break;
5686 case AMD_RESET_METHOD_MODE2:
5687 adev->mp1_state = PP_MP1_STATE_RESET;
5688 break;
5689 default:
5690 adev->mp1_state = PP_MP1_STATE_NONE;
5691 break;
5692 }
5693
5694 pci_dev_put(p);
5695 }
5696
/**
 * amdgpu_device_unset_mp1_state - clear the MP1 state set for a reset
 *
 * @adev: amdgpu_device pointer
 *
 * Calls amdgpu_vf_error_trans_all() and then puts adev->mp1_state back to
 * PP_MP1_STATE_NONE.
 */
static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}
5702
/**
 * amdgpu_device_resume_display_audio - re-enable the display audio function
 *
 * @adev: amdgpu_device pointer
 *
 * Currently a stub: only STUB() runs unless "notyet" is defined. The disabled
 * implementation looks up devfn 1 on the GPU's bus, re-enables runtime PM on
 * it, resumes it, and drops the reference taken by the lookup.
 */
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	STUB();
#ifdef notyet
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));

		/* Balance the reference taken by the lookup above. */
		pci_dev_put(p);
	}
#endif
}
5717
/**
 * amdgpu_device_suspend_display_audio - suspend the display audio function
 *
 * @adev: amdgpu_device pointer
 *
 * Returns -EINVAL when the asic reset method is neither BACO nor mode1.
 * Otherwise this is currently a stub that returns -ENOSYS; the code after
 * that return is compiled only when "notyet" is defined.
 * The disabled implementation returns 0 once the audio device (devfn 1 on
 * the GPU's bus) is runtime suspended, -ENODEV when it cannot be found and
 * -ETIMEDOUT when it does not suspend in time.
 */
static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	/* p and expires are only used by the disabled code below. */
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer the audio issue without proper suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	     (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	STUB();
	return -ENOSYS;
#ifdef notyet

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4S interval will be used. Considering 3S is
		 * the audio controller default autosuspend delay setting.
		 * 4S used here is guaranteed to cover that.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	/* Keep trying to suspend the device until it succeeds or time runs out. */
	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	/* Drop the reference taken by the lookup above. */
	pci_dev_put(p);
	return 0;
#endif
}
5770
/**
 * amdgpu_device_stop_pending_resets - cancel queued reset work items
 *
 * @adev: amdgpu_device pointer
 *
 * Cancels the reset-related work items that apply to this device: the
 * device reset_work (non-VF, CONFIG_DEBUG_FS builds only), the KFD
 * reset_work when a KFD device exists, the FLR work for SR-IOV VFs, and the
 * RAS recovery work when a RAS context exists and RAS is enabled.
 */
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

#if defined(CONFIG_DEBUG_FS)
	if (!amdgpu_sriov_vf(adev))
		cancel_work(&adev->reset_work);
#endif

	if (adev->kfd.dev)
		cancel_work(&adev->kfd.reset_work);

	if (amdgpu_sriov_vf(adev))
		cancel_work(&adev->virt.flr_work);

	if (con && adev->ras_enabled)
		cancel_work(&con->recovery_work);

}
5790
/**
 * amdgpu_device_health_check - check that the devices are still on the bus
 *
 * @device_list_handle: list of amdgpu devices (linked through reset_list)
 *
 * The actual check is compiled only when "notyet" is defined: it reads
 * PCI_COMMAND from every device and reports -ENODEV if any read looks like a
 * PCI error response. Without "notyet" the function always returns 0 and
 * tmp_adev/status are unused.
 * Returns 0 when all devices look healthy, -ENODEV otherwise.
 */
static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
	struct amdgpu_device *tmp_adev;
	int ret = 0;
	u32 status;

#ifdef notyet
	/* Every device is checked; one lost device fails the whole list. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
		if (PCI_POSSIBLE_ERROR(status)) {
			dev_err(tmp_adev->dev, "device lost from bus!");
			ret = -ENODEV;
		}
	}
#endif

	return ret;
}
5809
5810 /**
5811 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5812 *
5813 * @adev: amdgpu_device pointer
5814 * @job: which job trigger hang
5815 * @reset_context: amdgpu reset context pointer
5816 *
5817 * Attempt to reset the GPU if it has hung (all asics).
5818 * Attempt to do soft-reset or full-reset and reinitialize Asic
5819 * Returns 0 for success or an error on failure.
5820 */
5821
amdgpu_device_gpu_recover(struct amdgpu_device * adev,struct amdgpu_job * job,struct amdgpu_reset_context * reset_context)5822 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5823 struct amdgpu_job *job,
5824 struct amdgpu_reset_context *reset_context)
5825 {
5826 struct list_head device_list, *device_list_handle = NULL;
5827 bool job_signaled = false;
5828 struct amdgpu_hive_info *hive = NULL;
5829 struct amdgpu_device *tmp_adev = NULL;
5830 int i, r = 0;
5831 bool need_emergency_restart = false;
5832 bool audio_suspended = false;
5833 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5834
5835 /*
5836 * Special case: RAS triggered and full reset isn't supported
5837 */
5838 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5839
5840 /*
5841 * Flush RAM to disk so that after reboot
5842 * the user can read log and see why the system rebooted.
5843 */
5844 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5845 amdgpu_ras_get_context(adev)->reboot) {
5846 DRM_WARN("Emergency reboot.");
5847
5848 #ifdef notyet
5849 ksys_sync_helper();
5850 emergency_restart();
5851 #else
5852 panic("emergency_restart");
5853 #endif
5854 }
5855
5856 dev_info(adev->dev, "GPU %s begin!\n",
5857 need_emergency_restart ? "jobs stop":"reset");
5858
5859 if (!amdgpu_sriov_vf(adev))
5860 hive = amdgpu_get_xgmi_hive(adev);
5861 if (hive)
5862 mutex_lock(&hive->hive_lock);
5863
5864 reset_context->job = job;
5865 reset_context->hive = hive;
5866 /*
5867 * Build list of devices to reset.
5868 * In case we are in XGMI hive mode, resort the device list
5869 * to put adev in the 1st position.
5870 */
5871 INIT_LIST_HEAD(&device_list);
5872 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5873 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5874 list_add_tail(&tmp_adev->reset_list, &device_list);
5875 if (adev->shutdown)
5876 tmp_adev->shutdown = true;
5877 }
5878 if (!list_is_first(&adev->reset_list, &device_list))
5879 list_rotate_to_front(&adev->reset_list, &device_list);
5880 device_list_handle = &device_list;
5881 } else {
5882 list_add_tail(&adev->reset_list, &device_list);
5883 device_list_handle = &device_list;
5884 }
5885
5886 if (!amdgpu_sriov_vf(adev)) {
5887 r = amdgpu_device_health_check(device_list_handle);
5888 if (r)
5889 goto end_reset;
5890 }
5891
5892 /* We need to lock reset domain only once both for XGMI and single device */
5893 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5894 reset_list);
5895 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5896
5897 /* block all schedulers and reset given job's ring */
5898 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5899
5900 amdgpu_device_set_mp1_state(tmp_adev);
5901
5902 /*
5903 * Try to put the audio codec into suspend state
5904 * before gpu reset started.
5905 *
5906 * Due to the power domain of the graphics device
5907 * is shared with AZ power domain. Without this,
5908 * we may change the audio hardware from behind
5909 * the audio driver's back. That will trigger
5910 * some audio codec errors.
5911 */
5912 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5913 audio_suspended = true;
5914
5915 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5916
5917 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5918
5919 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5920
5921 /*
5922 * Mark these ASICs to be reseted as untracked first
5923 * And add them back after reset completed
5924 */
5925 amdgpu_unregister_gpu_instance(tmp_adev);
5926
5927 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5928
5929 /* disable ras on ALL IPs */
5930 if (!need_emergency_restart &&
5931 amdgpu_device_ip_need_full_reset(tmp_adev))
5932 amdgpu_ras_suspend(tmp_adev);
5933
5934 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5935 struct amdgpu_ring *ring = tmp_adev->rings[i];
5936
5937 if (!amdgpu_ring_sched_ready(ring))
5938 continue;
5939
5940 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5941
5942 if (need_emergency_restart)
5943 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5944 }
5945 atomic_inc(&tmp_adev->gpu_reset_counter);
5946 }
5947
5948 if (need_emergency_restart)
5949 goto skip_sched_resume;
5950
5951 /*
5952 * Must check guilty signal here since after this point all old
5953 * HW fences are force signaled.
5954 *
5955 * job->base holds a reference to parent fence
5956 */
5957 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5958 job_signaled = true;
5959 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5960 goto skip_hw_reset;
5961 }
5962
5963 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5964 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5965 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5966 /*TODO Should we stop ?*/
5967 if (r) {
5968 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5969 r, adev_to_drm(tmp_adev)->unique);
5970 tmp_adev->asic_reset_res = r;
5971 }
5972 }
5973
5974 /* Actual ASIC resets if needed.*/
5975 /* Host driver will handle XGMI hive reset for SRIOV */
5976 if (amdgpu_sriov_vf(adev)) {
5977 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
5978 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
5979 amdgpu_ras_set_fed(adev, true);
5980 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5981 }
5982
5983 r = amdgpu_device_reset_sriov(adev, reset_context);
5984 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
5985 amdgpu_virt_release_full_gpu(adev, true);
5986 goto retry;
5987 }
5988 if (r)
5989 adev->asic_reset_res = r;
5990 } else {
5991 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5992 if (r && r == -EAGAIN)
5993 goto retry;
5994 }
5995
5996 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5997 /*
5998 * Drop any pending non scheduler resets queued before reset is done.
5999 * Any reset scheduled after this point would be valid. Scheduler resets
6000 * were already dropped during drm_sched_stop and no new ones can come
6001 * in before drm_sched_start.
6002 */
6003 amdgpu_device_stop_pending_resets(tmp_adev);
6004 }
6005
6006 skip_hw_reset:
6007
6008 /* Post ASIC reset for all devs .*/
6009 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6010
6011 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6012 struct amdgpu_ring *ring = tmp_adev->rings[i];
6013
6014 if (!amdgpu_ring_sched_ready(ring))
6015 continue;
6016
6017 drm_sched_start(&ring->sched);
6018 }
6019
6020 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6021 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6022
6023 if (tmp_adev->asic_reset_res)
6024 r = tmp_adev->asic_reset_res;
6025
6026 tmp_adev->asic_reset_res = 0;
6027
6028 if (r) {
6029 /* bad news, how to tell it to userspace ?
6030 * for ras error, we should report GPU bad status instead of
6031 * reset failure
6032 */
6033 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6034 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6035 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6036 atomic_read(&tmp_adev->gpu_reset_counter));
6037 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6038 } else {
6039 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6040 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6041 DRM_WARN("smart shift update failed\n");
6042 }
6043 }
6044
6045 skip_sched_resume:
6046 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6047 /* unlock kfd: SRIOV would do it separately */
6048 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6049 amdgpu_amdkfd_post_reset(tmp_adev);
6050
6051 /* kfd_post_reset will do nothing if kfd device is not initialized,
6052 * need to bring up kfd here if it's not be initialized before
6053 */
6054 if (!adev->kfd.init_complete)
6055 amdgpu_amdkfd_device_init(adev);
6056
6057 if (audio_suspended)
6058 amdgpu_device_resume_display_audio(tmp_adev);
6059
6060 amdgpu_device_unset_mp1_state(tmp_adev);
6061
6062 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6063 }
6064
6065 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6066 reset_list);
6067 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6068
6069 end_reset:
6070 if (hive) {
6071 mutex_unlock(&hive->hive_lock);
6072 amdgpu_put_xgmi_hive(hive);
6073 }
6074
6075 if (r)
6076 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6077
6078 atomic_set(&adev->reset_domain->reset_res, r);
6079 return r;
6080 }
6081
6082 /**
6083 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
6084 *
6085 * @adev: amdgpu_device pointer
6086 * @speed: pointer to the speed of the link
6087 * @width: pointer to the width of the link
6088 *
6089 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6090 * first physical partner to an AMD dGPU.
6091 * This will exclude any virtual switches and links.
6092 */
amdgpu_device_partner_bandwidth(struct amdgpu_device * adev,enum pci_bus_speed * speed,enum pcie_link_width * width)6093 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6094 enum pci_bus_speed *speed,
6095 enum pcie_link_width *width)
6096 {
6097 struct pci_dev *parent = adev->pdev;
6098
6099 if (!speed || !width)
6100 return;
6101
6102 *speed = PCI_SPEED_UNKNOWN;
6103 *width = PCIE_LNK_WIDTH_UNKNOWN;
6104
6105 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6106 while ((parent = pci_upstream_bridge(parent))) {
6107 /* skip upstream/downstream switches internal to dGPU*/
6108 if (parent->vendor == PCI_VENDOR_ID_ATI)
6109 continue;
6110 *speed = pcie_get_speed_cap(parent);
6111 *width = pcie_get_width_cap(parent);
6112 break;
6113 }
6114 } else {
6115 /* use the current speeds rather than max if switching is not supported */
6116 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6117 }
6118 }
6119
6120 /**
6121 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
6122 *
6123 * @adev: amdgpu_device pointer
6124 *
6125 * Fetchs and stores in the driver the PCIE capabilities (gen speed
6126 * and lanes) of the slot the device is in. Handles APUs and
6127 * virtualized environments where PCIE config space may not be available.
6128 */
amdgpu_device_get_pcie_info(struct amdgpu_device * adev)6129 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6130 {
6131 struct pci_dev *pdev;
6132 enum pci_bus_speed speed_cap, platform_speed_cap;
6133 enum pcie_link_width platform_link_width;
6134
6135 if (amdgpu_pcie_gen_cap)
6136 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6137
6138 if (amdgpu_pcie_lane_cap)
6139 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6140
6141 /* covers APUs as well */
6142 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6143 if (adev->pm.pcie_gen_mask == 0)
6144 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6145 if (adev->pm.pcie_mlw_mask == 0)
6146 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6147 return;
6148 }
6149
6150 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6151 return;
6152
6153 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6154 &platform_link_width);
6155
6156 if (adev->pm.pcie_gen_mask == 0) {
6157 /* asic caps */
6158 pdev = adev->pdev;
6159 speed_cap = pcie_get_speed_cap(pdev);
6160 if (speed_cap == PCI_SPEED_UNKNOWN) {
6161 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6162 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6163 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6164 } else {
6165 if (speed_cap == PCIE_SPEED_32_0GT)
6166 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6167 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6168 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6169 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6170 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6171 else if (speed_cap == PCIE_SPEED_16_0GT)
6172 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6173 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6174 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6175 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6176 else if (speed_cap == PCIE_SPEED_8_0GT)
6177 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6178 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6179 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6180 else if (speed_cap == PCIE_SPEED_5_0GT)
6181 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6182 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6183 else
6184 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6185 }
6186 /* platform caps */
6187 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6188 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6189 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6190 } else {
6191 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6192 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6193 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6194 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6195 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6196 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6197 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6198 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6199 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6200 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6201 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6202 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6203 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6204 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6205 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6206 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6207 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6208 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6209 else
6210 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6211
6212 }
6213 }
6214 if (adev->pm.pcie_mlw_mask == 0) {
6215 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6216 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6217 } else {
6218 switch (platform_link_width) {
6219 case PCIE_LNK_X32:
6220 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6221 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6222 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6223 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6224 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6225 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6226 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6227 break;
6228 case PCIE_LNK_X16:
6229 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6230 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6231 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6232 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6233 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6234 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6235 break;
6236 case PCIE_LNK_X12:
6237 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6238 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6239 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6240 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6241 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6242 break;
6243 case PCIE_LNK_X8:
6244 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6245 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6246 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6247 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6248 break;
6249 case PCIE_LNK_X4:
6250 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6251 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6252 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6253 break;
6254 case PCIE_LNK_X2:
6255 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6256 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6257 break;
6258 case PCIE_LNK_X1:
6259 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6260 break;
6261 default:
6262 break;
6263 }
6264 }
6265 }
6266 }
6267
6268 /**
6269 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6270 *
6271 * @adev: amdgpu_device pointer
6272 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6273 *
6274 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6275 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6276 * @peer_adev.
6277 */
amdgpu_device_is_peer_accessible(struct amdgpu_device * adev,struct amdgpu_device * peer_adev)6278 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6279 struct amdgpu_device *peer_adev)
6280 {
6281 #ifdef CONFIG_HSA_AMD_P2P
6282 bool p2p_access =
6283 !adev->gmc.xgmi.connected_to_cpu &&
6284 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6285
6286 bool is_large_bar = adev->gmc.visible_vram_size &&
6287 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6288 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6289
6290 if (!p2p_addressable) {
6291 uint64_t address_mask = peer_adev->dev->dma_mask ?
6292 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6293 resource_size_t aper_limit =
6294 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6295
6296 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6297 aper_limit & address_mask);
6298 }
6299 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6300 #else
6301 return false;
6302 #endif
6303 }
6304
amdgpu_device_baco_enter(struct drm_device * dev)6305 int amdgpu_device_baco_enter(struct drm_device *dev)
6306 {
6307 struct amdgpu_device *adev = drm_to_adev(dev);
6308 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6309
6310 if (!amdgpu_device_supports_baco(dev))
6311 return -ENOTSUPP;
6312
6313 if (ras && adev->ras_enabled &&
6314 adev->nbio.funcs->enable_doorbell_interrupt)
6315 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6316
6317 return amdgpu_dpm_baco_enter(adev);
6318 }
6319
amdgpu_device_baco_exit(struct drm_device * dev)6320 int amdgpu_device_baco_exit(struct drm_device *dev)
6321 {
6322 struct amdgpu_device *adev = drm_to_adev(dev);
6323 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6324 int ret = 0;
6325
6326 if (!amdgpu_device_supports_baco(dev))
6327 return -ENOTSUPP;
6328
6329 ret = amdgpu_dpm_baco_exit(adev);
6330 if (ret)
6331 return ret;
6332
6333 if (ras && adev->ras_enabled &&
6334 adev->nbio.funcs->enable_doorbell_interrupt)
6335 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6336
6337 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6338 adev->nbio.funcs->clear_doorbell_interrupt)
6339 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6340
6341 return 0;
6342 }
6343
6344 /**
6345 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6346 * @pdev: PCI device struct
6347 * @state: PCI channel state
6348 *
6349 * Description: Called when a PCI error is detected.
6350 *
6351 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6352 */
amdgpu_pci_error_detected(struct pci_dev * pdev,pci_channel_state_t state)6353 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6354 {
6355 STUB();
6356 return 0;
6357 #ifdef notyet
6358 struct drm_device *dev = pci_get_drvdata(pdev);
6359 struct amdgpu_device *adev = drm_to_adev(dev);
6360 int i;
6361
6362 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6363
6364 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6365 DRM_WARN("No support for XGMI hive yet...");
6366 return PCI_ERS_RESULT_DISCONNECT;
6367 }
6368
6369 adev->pci_channel_state = state;
6370
6371 switch (state) {
6372 case pci_channel_io_normal:
6373 return PCI_ERS_RESULT_CAN_RECOVER;
6374 /* Fatal error, prepare for slot reset */
6375 case pci_channel_io_frozen:
6376 /*
6377 * Locking adev->reset_domain->sem will prevent any external access
6378 * to GPU during PCI error recovery
6379 */
6380 amdgpu_device_lock_reset_domain(adev->reset_domain);
6381 amdgpu_device_set_mp1_state(adev);
6382
6383 /*
6384 * Block any work scheduling as we do for regular GPU reset
6385 * for the duration of the recovery
6386 */
6387 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6388 struct amdgpu_ring *ring = adev->rings[i];
6389
6390 if (!amdgpu_ring_sched_ready(ring))
6391 continue;
6392
6393 drm_sched_stop(&ring->sched, NULL);
6394 }
6395 atomic_inc(&adev->gpu_reset_counter);
6396 return PCI_ERS_RESULT_NEED_RESET;
6397 case pci_channel_io_perm_failure:
6398 /* Permanent error, prepare for device removal */
6399 return PCI_ERS_RESULT_DISCONNECT;
6400 }
6401
6402 return PCI_ERS_RESULT_NEED_RESET;
6403 #endif
6404 }
6405
6406 /**
6407 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6408 * @pdev: pointer to PCI device
6409 */
amdgpu_pci_mmio_enabled(struct pci_dev * pdev)6410 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6411 {
6412
6413 DRM_INFO("PCI error: mmio enabled callback!!\n");
6414
6415 /* TODO - dump whatever for debugging purposes */
6416
6417 /* This called only if amdgpu_pci_error_detected returns
6418 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6419 * works, no need to reset slot.
6420 */
6421
6422 return PCI_ERS_RESULT_RECOVERED;
6423 }
6424
6425 /**
6426 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6427 * @pdev: PCI device struct
6428 *
6429 * Description: This routine is called by the pci error recovery
6430 * code after the PCI slot has been reset, just before we
6431 * should resume normal operations.
6432 */
amdgpu_pci_slot_reset(struct pci_dev * pdev)6433 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6434 {
6435 STUB();
6436 return PCI_ERS_RESULT_RECOVERED;
6437 #ifdef notyet
6438 struct drm_device *dev = pci_get_drvdata(pdev);
6439 struct amdgpu_device *adev = drm_to_adev(dev);
6440 int r, i;
6441 struct amdgpu_reset_context reset_context;
6442 u32 memsize;
6443 struct list_head device_list;
6444
6445 /* PCI error slot reset should be skipped During RAS recovery */
6446 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6447 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6448 amdgpu_ras_in_recovery(adev))
6449 return PCI_ERS_RESULT_RECOVERED;
6450
6451 DRM_INFO("PCI error: slot reset callback!!\n");
6452
6453 memset(&reset_context, 0, sizeof(reset_context));
6454
6455 INIT_LIST_HEAD(&device_list);
6456 list_add_tail(&adev->reset_list, &device_list);
6457
6458 /* wait for asic to come out of reset */
6459 drm_msleep(500);
6460
6461 /* Restore PCI confspace */
6462 amdgpu_device_load_pci_state(pdev);
6463
6464 /* confirm ASIC came out of reset */
6465 for (i = 0; i < adev->usec_timeout; i++) {
6466 memsize = amdgpu_asic_get_config_memsize(adev);
6467
6468 if (memsize != 0xffffffff)
6469 break;
6470 udelay(1);
6471 }
6472 if (memsize == 0xffffffff) {
6473 r = -ETIME;
6474 goto out;
6475 }
6476
6477 reset_context.method = AMD_RESET_METHOD_NONE;
6478 reset_context.reset_req_dev = adev;
6479 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6480 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6481
6482 adev->no_hw_access = true;
6483 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6484 adev->no_hw_access = false;
6485 if (r)
6486 goto out;
6487
6488 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6489
6490 out:
6491 if (!r) {
6492 if (amdgpu_device_cache_pci_state(adev->pdev))
6493 pci_restore_state(adev->pdev);
6494
6495 DRM_INFO("PCIe error recovery succeeded\n");
6496 } else {
6497 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6498 amdgpu_device_unset_mp1_state(adev);
6499 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6500 }
6501
6502 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6503 #endif
6504 }
6505
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it is
 * OK to resume normal operation.
 *
 * The function is currently stubbed out: it only calls STUB().  The
 * implementation kept under "notyet" is not compiled unless that macro is
 * defined; it restarts the ring schedulers and releases the reset domain,
 * but only when the recorded channel state is pci_channel_io_frozen.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	STUB();
#ifdef notyet
	struct drm_device *ddev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	int idx;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (idx = 0; idx < AMDGPU_MAX_RINGS; idx++) {
		struct amdgpu_ring *ring = adev->rings[idx];

		if (amdgpu_ring_sched_ready(ring))
			drm_sched_start(&ring->sched);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
#endif
}
6541
amdgpu_device_cache_pci_state(struct pci_dev * pdev)6542 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6543 {
6544 return false;
6545 #ifdef notyet
6546 struct drm_device *dev = pci_get_drvdata(pdev);
6547 struct amdgpu_device *adev = drm_to_adev(dev);
6548 int r;
6549
6550 if (amdgpu_sriov_vf(adev))
6551 return false;
6552
6553 r = pci_save_state(pdev);
6554 if (!r) {
6555 kfree(adev->pci_state);
6556
6557 adev->pci_state = pci_store_saved_state(pdev);
6558
6559 if (!adev->pci_state) {
6560 DRM_ERROR("Failed to store PCI saved state");
6561 return false;
6562 }
6563 } else {
6564 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6565 return false;
6566 }
6567
6568 return true;
6569 #endif
6570 }
6571
amdgpu_device_load_pci_state(struct pci_dev * pdev)6572 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6573 {
6574 STUB();
6575 return false;
6576 #ifdef notyet
6577 struct drm_device *dev = pci_get_drvdata(pdev);
6578 struct amdgpu_device *adev = drm_to_adev(dev);
6579 int r;
6580
6581 if (!adev->pci_state)
6582 return false;
6583
6584 r = pci_load_saved_state(pdev, adev->pci_state);
6585
6586 if (!r) {
6587 pci_restore_state(pdev);
6588 } else {
6589 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6590 return false;
6591 }
6592
6593 return true;
6594 #endif
6595 }
6596
amdgpu_device_flush_hdp(struct amdgpu_device * adev,struct amdgpu_ring * ring)6597 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6598 struct amdgpu_ring *ring)
6599 {
6600 #ifdef CONFIG_X86_64
6601 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6602 return;
6603 #endif
6604 if (adev->gmc.xgmi.connected_to_cpu)
6605 return;
6606
6607 if (ring && ring->funcs->emit_hdp_flush)
6608 amdgpu_ring_emit_hdp_flush(ring);
6609 else
6610 amdgpu_asic_flush_hdp(adev, ring);
6611 }
6612
amdgpu_device_invalidate_hdp(struct amdgpu_device * adev,struct amdgpu_ring * ring)6613 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6614 struct amdgpu_ring *ring)
6615 {
6616 #ifdef CONFIG_X86_64
6617 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6618 return;
6619 #endif
6620 if (adev->gmc.xgmi.connected_to_cpu)
6621 return;
6622
6623 amdgpu_asic_invalidate_hdp(adev, ring);
6624 }
6625
amdgpu_in_reset(struct amdgpu_device * adev)6626 int amdgpu_in_reset(struct amdgpu_device *adev)
6627 {
6628 return atomic_read(&adev->reset_domain->in_gpu_reset);
6629 }
6630
6631 /**
6632 * amdgpu_device_halt() - bring hardware to some kind of halt state
6633 *
6634 * @adev: amdgpu_device pointer
6635 *
6636 * Bring hardware to some kind of halt state so that no one can touch it
6637 * any more. It will help to maintain error context when error occurred.
6638 * Compare to a simple hang, the system will keep stable at least for SSH
6639 * access. Then it should be trivial to inspect the hardware state and
6640 * see what's going on. Implemented as following:
6641 *
6642 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
6643 * clears all CPU mappings to device, disallows remappings through page faults
6644 * 2. amdgpu_irq_disable_all() disables all interrupts
6645 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6646 * 4. set adev->no_hw_access to avoid potential crashes after setp 5
6647 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6648 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6649 * flush any in flight DMA operations
6650 */
amdgpu_device_halt(struct amdgpu_device * adev)6651 void amdgpu_device_halt(struct amdgpu_device *adev)
6652 {
6653 struct pci_dev *pdev = adev->pdev;
6654 struct drm_device *ddev = adev_to_drm(adev);
6655
6656 amdgpu_xcp_dev_unplug(adev);
6657 drm_dev_unplug(ddev);
6658
6659 amdgpu_irq_disable_all(adev);
6660
6661 amdgpu_fence_driver_hw_fini(adev);
6662
6663 adev->no_hw_access = true;
6664
6665 amdgpu_device_unmap_mmio(adev);
6666
6667 pci_disable_device(pdev);
6668 pci_wait_for_pending_transaction(pdev);
6669 }
6670
/*
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 *
 * The register is reached through the index/data register pair whose
 * offsets are supplied by the nbio callbacks.  @reg * 4 is written to the
 * index register, and the access is serialized with pcie_idx_lock.
 *
 * Returns the value read from the data register.
 */
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long irq_flags, index_offset, data_offset;
	u32 value;

	index_offset = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data_offset = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, irq_flags);
	WREG32(index_offset, reg * 4);
	/* read the index back before touching data (presumably to post the write) */
	(void)RREG32(index_offset);
	value = RREG32(data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, irq_flags);

	return value;
}
6687
/*
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 *
 * The register is reached through the index/data register pair whose
 * offsets are supplied by the nbio callbacks.  @reg * 4 is written to the
 * index register and @v to the data register; the access is serialized
 * with pcie_idx_lock.
 */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long irq_flags, index_offset, data_offset;

	index_offset = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data_offset = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, irq_flags);
	WREG32(index_offset, reg * 4);
	/* each write is followed by a read back (presumably to post it) */
	(void)RREG32(index_offset);
	WREG32(data_offset, v);
	(void)RREG32(data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, irq_flags);
}
6703
6704 /**
6705 * amdgpu_device_get_gang - return a reference to the current gang
6706 * @adev: amdgpu_device pointer
6707 *
6708 * Returns: A new reference to the current gang leader.
6709 */
amdgpu_device_get_gang(struct amdgpu_device * adev)6710 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
6711 {
6712 struct dma_fence *fence;
6713
6714 rcu_read_lock();
6715 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
6716 rcu_read_unlock();
6717 return fence;
6718 }
6719
6720 /**
6721 * amdgpu_device_switch_gang - switch to a new gang
6722 * @adev: amdgpu_device pointer
6723 * @gang: the gang to switch to
6724 *
6725 * Try to switch to a new gang.
6726 * Returns: NULL if we switched to the new gang or a reference to the current
6727 * gang leader.
6728 */
amdgpu_device_switch_gang(struct amdgpu_device * adev,struct dma_fence * gang)6729 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6730 struct dma_fence *gang)
6731 {
6732 struct dma_fence *old = NULL;
6733
6734 do {
6735 dma_fence_put(old);
6736 old = amdgpu_device_get_gang(adev);
6737 if (old == gang)
6738 break;
6739
6740 if (!dma_fence_is_signaled(old))
6741 return old;
6742
6743 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6744 old, gang) != old);
6745
6746 dma_fence_put(old);
6747 return NULL;
6748 }
6749
amdgpu_device_has_display_hardware(struct amdgpu_device * adev)6750 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6751 {
6752 switch (adev->asic_type) {
6753 #ifdef CONFIG_DRM_AMDGPU_SI
6754 case CHIP_HAINAN:
6755 #endif
6756 case CHIP_TOPAZ:
6757 /* chips with no display hardware */
6758 return false;
6759 #ifdef CONFIG_DRM_AMDGPU_SI
6760 case CHIP_TAHITI:
6761 case CHIP_PITCAIRN:
6762 case CHIP_VERDE:
6763 case CHIP_OLAND:
6764 #endif
6765 #ifdef CONFIG_DRM_AMDGPU_CIK
6766 case CHIP_BONAIRE:
6767 case CHIP_HAWAII:
6768 case CHIP_KAVERI:
6769 case CHIP_KABINI:
6770 case CHIP_MULLINS:
6771 #endif
6772 case CHIP_TONGA:
6773 case CHIP_FIJI:
6774 case CHIP_POLARIS10:
6775 case CHIP_POLARIS11:
6776 case CHIP_POLARIS12:
6777 case CHIP_VEGAM:
6778 case CHIP_CARRIZO:
6779 case CHIP_STONEY:
6780 /* chips with display hardware */
6781 return true;
6782 default:
6783 /* IP discovery */
6784 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
6785 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6786 return false;
6787 return true;
6788 }
6789 }
6790
/*
 * amdgpu_device_wait_on_rreg - poll a register until it reaches a value
 *
 * @adev: amdgpu_device pointer
 * @inst: instance number, only used in the warning message
 * @reg_addr: register to poll
 * @reg_name: register name, only used in the warning message
 * @expected_value: value the masked register content has to reach
 * @mask: bits of the register that are compared
 *
 * Re-reads @reg_addr until (value & @mask) equals @expected_value.  The
 * retry budget of adev->usec_timeout iterations starts over every time the
 * register content changes; while it stays the same, each iteration delays
 * for one microsecond.
 *
 * Returns 0 on success.  On timeout a warning is logged and -ETIMEDOUT is
 * returned, converted to the unsigned return type.
 */
uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			/* the register is still changing: restart the timeout */
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			/* terminate the message with a real newline; inst is unsigned */
			DRM_WARN("Register(%u) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
6818