1 /*
2  * Copyright 2021 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 #include "amdgpu_ras.h"
24 #include "amdgpu.h"
25 #include "amdgpu_mca.h"
26 
27 #include "umc/umc_6_7_0_offset.h"
28 #include "umc/umc_6_7_0_sh_mask.h"
29 
amdgpu_mca_is_deferred_error(struct amdgpu_device * adev,uint64_t mc_status)30 static bool amdgpu_mca_is_deferred_error(struct amdgpu_device *adev,
31 					uint64_t mc_status)
32 {
33 	if (adev->umc.ras->check_ecc_err_status)
34 		return adev->umc.ras->check_ecc_err_status(adev,
35 				AMDGPU_MCA_ERROR_TYPE_DE, &mc_status);
36 
37 	return false;
38 }
39 
/*
 * Read the 64-bit MCA status register at @mc_status_addr and add one to
 * *@error_count when both its Val and CECC fields read as 1.
 */
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
					      uint64_t mc_status_addr,
					      unsigned long *error_count)
{
	uint64_t status = RREG64_PCIE(mc_status_addr);

	/* Nothing to count unless the status is flagged valid. */
	if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) != 1)
		return;

	if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) != 1)
		return;

	*error_count += 1;
}
50 
/*
 * Read the 64-bit MCA status register at @mc_status_addr and add one to
 * *@error_count when its Val field is 1 and at least one of the Deferred,
 * UECC, PCC, UC or TCC fields is 1.
 */
void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
						uint64_t mc_status_addr,
						unsigned long *error_count)
{
	uint64_t status = RREG64_PCIE(mc_status_addr);
	bool flagged;

	/* Nothing to count unless the status is flagged valid. */
	if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) != 1)
		return;

	flagged = REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
		  REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
		  REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
		  REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
		  REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1;

	if (flagged)
		*error_count += 1;
}
65 
/* Clear the MCA status register at @mc_status_addr by writing zero to it. */
void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
				  uint64_t mc_status_addr)
{
	const uint64_t cleared = 0x0ULL;

	WREG64_PCIE(mc_status_addr, cleared);
}
71 
/*
 * Accumulate correctable and uncorrectable error counts for the status
 * register at @mc_status_addr into the struct ras_err_data that
 * @ras_error_status points to, then clear the register.
 */
void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
				      uint64_t mc_status_addr,
				      void *ras_error_status)
{
	struct ras_err_data *data = ras_error_status;

	amdgpu_mca_query_correctable_error_count(adev, mc_status_addr,
						 &data->ce_count);
	amdgpu_mca_query_uncorrectable_error_count(adev, mc_status_addr,
						   &data->ue_count);

	/* Both counters have been updated; reset the status register. */
	amdgpu_mca_reset_error_count(adev, mc_status_addr);
}
83 
amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device * adev)84 int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev)
85 {
86 	int err;
87 	struct amdgpu_mca_ras_block *ras;
88 
89 	if (!adev->mca.mp0.ras)
90 		return 0;
91 
92 	ras = adev->mca.mp0.ras;
93 
94 	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
95 	if (err) {
96 		dev_err(adev->dev, "Failed to register mca.mp0 ras block!\n");
97 		return err;
98 	}
99 
100 	strlcpy(ras->ras_block.ras_comm.name, "mca.mp0",
101 	    sizeof(ras->ras_block.ras_comm.name));
102 	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
103 	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
104 	adev->mca.mp0.ras_if = &ras->ras_block.ras_comm;
105 
106 	return 0;
107 }
108 
amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device * adev)109 int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev)
110 {
111 	int err;
112 	struct amdgpu_mca_ras_block *ras;
113 
114 	if (!adev->mca.mp1.ras)
115 		return 0;
116 
117 	ras = adev->mca.mp1.ras;
118 
119 	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
120 	if (err) {
121 		dev_err(adev->dev, "Failed to register mca.mp1 ras block!\n");
122 		return err;
123 	}
124 
125 	strlcpy(ras->ras_block.ras_comm.name, "mca.mp1",
126 	    sizeof(ras->ras_block.ras_comm.name));
127 	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
128 	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
129 	adev->mca.mp1.ras_if = &ras->ras_block.ras_comm;
130 
131 	return 0;
132 }
133 
amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device * adev)134 int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
135 {
136 	int err;
137 	struct amdgpu_mca_ras_block *ras;
138 
139 	if (!adev->mca.mpio.ras)
140 		return 0;
141 
142 	ras = adev->mca.mpio.ras;
143 
144 	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
145 	if (err) {
146 		dev_err(adev->dev, "Failed to register mca.mpio ras block!\n");
147 		return err;
148 	}
149 
150 	strlcpy(ras->ras_block.ras_comm.name, "mca.mpio",
151 	    sizeof(ras->ras_block.ras_comm.name));
152 	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
153 	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
154 	adev->mca.mpio.ras_if = &ras->ras_block.ras_comm;
155 
156 	return 0;
157 }
158 
amdgpu_mca_bank_set_init(struct mca_bank_set * mca_set)159 static void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set)
160 {
161 	if (!mca_set)
162 		return;
163 
164 	memset(mca_set, 0, sizeof(*mca_set));
165 	INIT_LIST_HEAD(&mca_set->list);
166 }
167 
amdgpu_mca_bank_set_add_entry(struct mca_bank_set * mca_set,struct mca_bank_entry * entry)168 static int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry)
169 {
170 	struct mca_bank_node *node;
171 
172 	if (!entry)
173 		return -EINVAL;
174 
175 	node = kvzalloc(sizeof(*node), GFP_KERNEL);
176 	if (!node)
177 		return -ENOMEM;
178 
179 	memcpy(&node->entry, entry, sizeof(*entry));
180 
181 	INIT_LIST_HEAD(&node->node);
182 	list_add_tail(&node->node, &mca_set->list);
183 
184 	mca_set->nr_entries++;
185 
186 	return 0;
187 }
188 
amdgpu_mca_bank_set_merge(struct mca_bank_set * mca_set,struct mca_bank_set * new)189 static int amdgpu_mca_bank_set_merge(struct mca_bank_set *mca_set, struct mca_bank_set *new)
190 {
191 	struct mca_bank_node *node;
192 
193 	list_for_each_entry(node, &new->list, node)
194 		amdgpu_mca_bank_set_add_entry(mca_set, &node->entry);
195 
196 	return 0;
197 }
198 
amdgpu_mca_bank_set_remove_node(struct mca_bank_set * mca_set,struct mca_bank_node * node)199 static void amdgpu_mca_bank_set_remove_node(struct mca_bank_set *mca_set, struct mca_bank_node *node)
200 {
201 	if (!node)
202 		return;
203 
204 	list_del(&node->node);
205 	kvfree(node);
206 
207 	mca_set->nr_entries--;
208 }
209 
amdgpu_mca_bank_set_release(struct mca_bank_set * mca_set)210 static void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set)
211 {
212 	struct mca_bank_node *node, *tmp;
213 
214 	if (list_empty(&mca_set->list))
215 		return;
216 
217 	list_for_each_entry_safe(node, tmp, &mca_set->list, node)
218 		amdgpu_mca_bank_set_remove_node(mca_set, node);
219 }
220 
amdgpu_mca_smu_init_funcs(struct amdgpu_device * adev,const struct amdgpu_mca_smu_funcs * mca_funcs)221 void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs)
222 {
223 	struct amdgpu_mca *mca = &adev->mca;
224 
225 	mca->mca_funcs = mca_funcs;
226 }
227 
amdgpu_mca_init(struct amdgpu_device * adev)228 int amdgpu_mca_init(struct amdgpu_device *adev)
229 {
230 	struct amdgpu_mca *mca = &adev->mca;
231 	struct mca_bank_cache *mca_cache;
232 	int i;
233 
234 	atomic_set(&mca->ue_update_flag, 0);
235 
236 	for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
237 		mca_cache = &mca->mca_caches[i];
238 		rw_init(&mca_cache->lock, "mcacac");
239 		amdgpu_mca_bank_set_init(&mca_cache->mca_set);
240 	}
241 
242 	return 0;
243 }
244 
amdgpu_mca_fini(struct amdgpu_device * adev)245 void amdgpu_mca_fini(struct amdgpu_device *adev)
246 {
247 	struct amdgpu_mca *mca = &adev->mca;
248 	struct mca_bank_cache *mca_cache;
249 	int i;
250 
251 	atomic_set(&mca->ue_update_flag, 0);
252 
253 	for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
254 		mca_cache = &mca->mca_caches[i];
255 		amdgpu_mca_bank_set_release(&mca_cache->mca_set);
256 		mutex_destroy(&mca_cache->lock);
257 	}
258 }
259 
/*
 * Tear down and re-create the MCA state of @adev.
 * Returns the result of amdgpu_mca_init().
 */
int amdgpu_mca_reset(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_mca_fini(adev);
	ret = amdgpu_mca_init(adev);

	return ret;
}
266 
/*
 * Forward the debug-mode request to the registered MCA SMU backend.
 *
 * Returns the backend's result, or -EOPNOTSUPP when no backend or no
 * mca_set_debug_mode() callback is registered.
 */
int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
{
	const struct amdgpu_mca_smu_funcs *funcs = adev->mca.mca_funcs;

	if (!funcs || !funcs->mca_set_debug_mode)
		return -EOPNOTSUPP;

	return funcs->mca_set_debug_mode(adev, enable);
}
276 
amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device * adev,int idx,struct mca_bank_entry * entry,struct ras_query_context * qctx)277 static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry,
278 					 struct ras_query_context *qctx)
279 {
280 	u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;
281 
282 	RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
283 	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
284 		      idx, entry->regs[MCA_REG_IDX_STATUS]);
285 	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
286 		      idx, entry->regs[MCA_REG_IDX_ADDR]);
287 	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
288 		      idx, entry->regs[MCA_REG_IDX_MISC0]);
289 	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
290 		      idx, entry->regs[MCA_REG_IDX_IPID]);
291 	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
292 		      idx, entry->regs[MCA_REG_IDX_SYND]);
293 }
294 
/*
 * Ask the MCA SMU backend how many valid banks of @type exist and store
 * the answer in *@count.
 *
 * Returns the backend's result, -EINVAL when @count is NULL, or
 * -EOPNOTSUPP when no backend or no mca_get_valid_mca_count() callback is
 * registered.
 */
static int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count)
{
	const struct amdgpu_mca_smu_funcs *funcs = adev->mca.mca_funcs;

	if (!count)
		return -EINVAL;

	if (!funcs || !funcs->mca_get_valid_mca_count)
		return -EOPNOTSUPP;

	return funcs->mca_get_valid_mca_count(adev, type, count);
}
307 
amdgpu_mca_smu_get_mca_entry(struct amdgpu_device * adev,enum amdgpu_mca_error_type type,int idx,struct mca_bank_entry * entry)308 static int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
309 					int idx, struct mca_bank_entry *entry)
310 {
311 	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
312 	int count;
313 
314 	if (!mca_funcs || !mca_funcs->mca_get_mca_entry)
315 		return -EOPNOTSUPP;
316 
317 	switch (type) {
318 	case AMDGPU_MCA_ERROR_TYPE_UE:
319 		count = mca_funcs->max_ue_count;
320 		break;
321 	case AMDGPU_MCA_ERROR_TYPE_CE:
322 		count = mca_funcs->max_ce_count;
323 		break;
324 	default:
325 		return -EINVAL;
326 	}
327 
328 	if (idx >= count)
329 		return -EINVAL;
330 
331 	return mca_funcs->mca_get_mca_entry(adev, type, idx, entry);
332 }
333 
amdgpu_mca_bank_should_update(struct amdgpu_device * adev,enum amdgpu_mca_error_type type)334 static bool amdgpu_mca_bank_should_update(struct amdgpu_device *adev, enum amdgpu_mca_error_type type)
335 {
336 	struct amdgpu_mca *mca = &adev->mca;
337 	bool ret = true;
338 
339 	/*
340 	 * Because the UE Valid MCA count will only be cleared after reset,
341 	 * in order to avoid repeated counting of the error count,
342 	 * the aca bank is only updated once during the gpu recovery stage.
343 	 */
344 	if (type == AMDGPU_MCA_ERROR_TYPE_UE) {
345 		if (amdgpu_ras_intr_triggered())
346 			ret = atomic_cmpxchg(&mca->ue_update_flag, 0, 1) == 0;
347 		else
348 			atomic_set(&mca->ue_update_flag, 0);
349 	}
350 
351 	return ret;
352 }
353 
amdgpu_mca_smu_get_mca_set(struct amdgpu_device * adev,enum amdgpu_mca_error_type type,struct mca_bank_set * mca_set,struct ras_query_context * qctx)354 static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set,
355 				      struct ras_query_context *qctx)
356 {
357 	struct mca_bank_entry entry;
358 	uint32_t count = 0, i;
359 	int ret;
360 
361 	if (!mca_set)
362 		return -EINVAL;
363 
364 	if (!amdgpu_mca_bank_should_update(adev, type))
365 		return 0;
366 
367 	ret = amdgpu_mca_smu_get_valid_mca_count(adev, type, &count);
368 	if (ret)
369 		return ret;
370 
371 	for (i = 0; i < count; i++) {
372 		memset(&entry, 0, sizeof(entry));
373 		ret = amdgpu_mca_smu_get_mca_entry(adev, type, i, &entry);
374 		if (ret)
375 			return ret;
376 
377 		amdgpu_mca_bank_set_add_entry(mca_set, &entry);
378 
379 		amdgpu_mca_smu_mca_bank_dump(adev, i, &entry, qctx);
380 	}
381 
382 	return 0;
383 }
384 
/*
 * Ask the MCA SMU backend to parse @entry for RAS block @blk and error
 * type @type, storing the resulting error count in *@count.
 *
 * Returns the backend's result, -EINVAL when @count or @entry is NULL, or
 * -EOPNOTSUPP when no backend or no mca_parse_mca_error_count() callback is
 * registered.
 */
static int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
						enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count)
{
	const struct amdgpu_mca_smu_funcs *funcs = adev->mca.mca_funcs;

	if (!entry || !count)
		return -EINVAL;

	if (funcs && funcs->mca_parse_mca_error_count)
		return funcs->mca_parse_mca_error_count(adev, blk, type, entry, count);

	return -EOPNOTSUPP;
}
398 
amdgpu_mca_dispatch_mca_set(struct amdgpu_device * adev,enum amdgpu_ras_block blk,enum amdgpu_mca_error_type type,struct mca_bank_set * mca_set,struct ras_err_data * err_data)399 static int amdgpu_mca_dispatch_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
400 				       struct mca_bank_set *mca_set, struct ras_err_data *err_data)
401 {
402 	struct amdgpu_smuio_mcm_config_info mcm_info;
403 	struct mca_bank_node *node, *tmp;
404 	struct mca_bank_entry *entry;
405 	uint32_t count;
406 	int ret;
407 
408 	if (!mca_set)
409 		return -EINVAL;
410 
411 	if (!mca_set->nr_entries)
412 		return 0;
413 
414 	list_for_each_entry_safe(node, tmp, &mca_set->list, node) {
415 		entry = &node->entry;
416 
417 		count = 0;
418 		ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
419 		if (ret && ret != -EOPNOTSUPP)
420 			return ret;
421 
422 		if (!count)
423 			continue;
424 
425 		memset(&mcm_info, 0, sizeof(mcm_info));
426 
427 		mcm_info.socket_id = entry->info.socket_id;
428 		mcm_info.die_id = entry->info.aid;
429 
430 		if (type == AMDGPU_MCA_ERROR_TYPE_UE) {
431 			amdgpu_ras_error_statistic_ue_count(err_data,
432 							    &mcm_info, (uint64_t)count);
433 		} else {
434 			if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]))
435 				amdgpu_ras_error_statistic_de_count(err_data,
436 								    &mcm_info, (uint64_t)count);
437 			else
438 				amdgpu_ras_error_statistic_ce_count(err_data,
439 								    &mcm_info, (uint64_t)count);
440 		}
441 
442 		amdgpu_mca_bank_set_remove_node(mca_set, node);
443 	}
444 
445 	return 0;
446 }
447 
amdgpu_mca_add_mca_set_to_cache(struct amdgpu_device * adev,enum amdgpu_mca_error_type type,struct mca_bank_set * new)448 static int amdgpu_mca_add_mca_set_to_cache(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *new)
449 {
450 	struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type];
451 	int ret;
452 
453 	mutex_lock(&mca_cache->lock);
454 	ret = amdgpu_mca_bank_set_merge(&mca_cache->mca_set, new);
455 	mutex_unlock(&mca_cache->lock);
456 
457 	return ret;
458 }
459 
amdgpu_mca_smu_log_ras_error(struct amdgpu_device * adev,enum amdgpu_ras_block blk,enum amdgpu_mca_error_type type,struct ras_err_data * err_data,struct ras_query_context * qctx)460 int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
461 				 struct ras_err_data *err_data, struct ras_query_context *qctx)
462 {
463 	struct mca_bank_set mca_set;
464 	struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type];
465 	int ret;
466 
467 	amdgpu_mca_bank_set_init(&mca_set);
468 
469 	ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, qctx);
470 	if (ret)
471 		goto out_mca_release;
472 
473 	ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_set, err_data);
474 	if (ret)
475 		goto out_mca_release;
476 
477 	/* add remain mca bank to mca cache */
478 	if (mca_set.nr_entries) {
479 		ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set);
480 		if (ret)
481 			goto out_mca_release;
482 	}
483 
484 	/* dispatch mca set again if mca cache has valid data */
485 	mutex_lock(&mca_cache->lock);
486 	if (mca_cache->mca_set.nr_entries)
487 		ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_cache->mca_set, err_data);
488 	mutex_unlock(&mca_cache->lock);
489 
490 out_mca_release:
491 	amdgpu_mca_bank_set_release(&mca_set);
492 
493 	return ret;
494 }
495 
496 #if defined(CONFIG_DEBUG_FS)
/*
 * Setter used by the mca_debug_mode attribute: a non-zero @val enables MCA
 * debug mode, zero disables it. @data is the struct amdgpu_device.
 *
 * Returns 0 on success or the error from amdgpu_ras_set_mca_debug_mode().
 */
static int amdgpu_mca_smu_debug_mode_set(void *data, u64 val)
{
	struct amdgpu_device *adev = data;
	int ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);

	if (!ret)
		dev_info(adev->dev, "amdgpu set smu mca debug mode %s success\n", val ? "on" : "off");

	return ret;
}
510 
mca_dump_entry(struct seq_file * m,struct mca_bank_entry * entry)511 static void mca_dump_entry(struct seq_file *m, struct mca_bank_entry *entry)
512 {
513 	int i, idx = entry->idx;
514 	int reg_idx_array[] = {
515 		MCA_REG_IDX_STATUS,
516 		MCA_REG_IDX_ADDR,
517 		MCA_REG_IDX_MISC0,
518 		MCA_REG_IDX_IPID,
519 		MCA_REG_IDX_SYND,
520 	};
521 
522 	seq_printf(m, "mca entry[%d].type: %s\n", idx, entry->type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE");
523 	seq_printf(m, "mca entry[%d].ip: %d\n", idx, entry->ip);
524 	seq_printf(m, "mca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",
525 		   idx, entry->info.socket_id, entry->info.aid, entry->info.hwid, entry->info.mcatype);
526 
527 	for (i = 0; i < ARRAY_SIZE(reg_idx_array); i++)
528 		seq_printf(m, "mca entry[%d].regs[%d]: 0x%016llx\n", idx, reg_idx_array[i], entry->regs[reg_idx_array[i]]);
529 }
530 
mca_dump_show(struct seq_file * m,enum amdgpu_mca_error_type type)531 static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type)
532 {
533 	struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
534 	struct mca_bank_node *node;
535 	struct mca_bank_set mca_set;
536 	struct ras_query_context qctx;
537 	int ret;
538 
539 	amdgpu_mca_bank_set_init(&mca_set);
540 
541 	qctx.evid.event_id = RAS_EVENT_INVALID_ID;
542 	ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, &qctx);
543 	if (ret)
544 		goto err_free_mca_set;
545 
546 	seq_printf(m, "amdgpu smu %s valid mca count: %d\n",
547 		   type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE", mca_set.nr_entries);
548 
549 	if (!mca_set.nr_entries)
550 		goto err_free_mca_set;
551 
552 	list_for_each_entry(node, &mca_set.list, node)
553 		mca_dump_entry(m, &node->entry);
554 
555 	/* add mca bank to mca bank cache */
556 	ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set);
557 
558 err_free_mca_set:
559 	amdgpu_mca_bank_set_release(&mca_set);
560 
561 	return ret;
562 }
563 
mca_dump_ce_show(struct seq_file * m,void * unused)564 static int mca_dump_ce_show(struct seq_file *m, void *unused)
565 {
566 	return mca_dump_show(m, AMDGPU_MCA_ERROR_TYPE_CE);
567 }
568 
mca_dump_ce_open(struct inode * inode,struct file * file)569 static int mca_dump_ce_open(struct inode *inode, struct file *file)
570 {
571 	return single_open(file, mca_dump_ce_show, inode->i_private);
572 }
573 
574 static const struct file_operations mca_ce_dump_debug_fops = {
575 	.owner = THIS_MODULE,
576 	.open = mca_dump_ce_open,
577 	.read = seq_read,
578 	.llseek = seq_lseek,
579 	.release = single_release,
580 };
581 
mca_dump_ue_show(struct seq_file * m,void * unused)582 static int mca_dump_ue_show(struct seq_file *m, void *unused)
583 {
584 	return mca_dump_show(m, AMDGPU_MCA_ERROR_TYPE_UE);
585 }
586 
mca_dump_ue_open(struct inode * inode,struct file * file)587 static int mca_dump_ue_open(struct inode *inode, struct file *file)
588 {
589 	return single_open(file, mca_dump_ue_show, inode->i_private);
590 }
591 
592 static const struct file_operations mca_ue_dump_debug_fops = {
593 	.owner = THIS_MODULE,
594 	.open = mca_dump_ue_open,
595 	.read = seq_read,
596 	.llseek = seq_lseek,
597 	.release = single_release,
598 };
599 
/*
 * Attribute fops for the debug-mode control: the getter slot is NULL, the
 * setter is amdgpu_mca_smu_debug_mode_set(), and values use "%llu\n".
 */
DEFINE_DEBUGFS_ATTRIBUTE(mca_debug_mode_fops, NULL, amdgpu_mca_smu_debug_mode_set, "%llu\n");
601 #endif
602 
/*
 * Create the MCA debugfs files under @root: "mca_debug_mode" (mode 0200),
 * "mca_ue_dump" and "mca_ce_dump" (both mode 0400), each with @adev as its
 * private data. Does nothing when @root is NULL or CONFIG_DEBUG_FS is not
 * defined.
 */
void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root)
{
#if defined(CONFIG_DEBUG_FS)
	if (root) {
		debugfs_create_file("mca_debug_mode", 0200, root, adev, &mca_debug_mode_fops);
		debugfs_create_file("mca_ue_dump", 0400, root, adev, &mca_ue_dump_debug_fops);
		debugfs_create_file("mca_ce_dump", 0400, root, adev, &mca_ce_dump_debug_fops);
	}
#endif
}
614 
615