mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-04-14 09:57:39 +02:00
In error scenarios (e.g., malformed commands), user queue fences may never
be signaled, causing processes to wait indefinitely. To address this while
preserving the requirement of infinite fence waits, implement an independent
timeout detection mechanism:
1. Initialize a hang detect work when creating a user queue (one-time setup)
2. Start the work with queue-type-specific timeout (gfx/compute/sdma) when
the last fence is created via amdgpu_userq_signal_ioctl (per-fence timing)
3. Trigger queue reset logic if the timer expires before the fence is signaled
v2: make timeout per queue type (adev->gfx_timeout vs adev->compute_timeout vs adev->sdma_timeout) to be consistent with kernel queues. (Alex)
v3: The timeout detection must be independent from the fence, e.g. you don't wait for a timeout on the fence
but rather have the timeout start as soon as the fence is initialized. (Christian)
v4: replace the timer with the `hang_detect_work` delayed work.
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
161 lines · 5.4 KiB · C
/* SPDX-License-Identifier: MIT */
|
|
/*
|
|
* Copyright 2023 Advanced Micro Devices, Inc.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
|
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
|
*
|
|
*/
|
|
|
|
#ifndef AMDGPU_USERQ_H_
|
|
#define AMDGPU_USERQ_H_
|
|
#include "amdgpu_eviction_fence.h"
|
|
|
|
#define AMDGPU_MAX_USERQ_COUNT 512
|
|
|
|
#define to_ev_fence(f) container_of(f, struct amdgpu_eviction_fence, base)
|
|
#define uq_mgr_to_fpriv(u) container_of(u, struct amdgpu_fpriv, userq_mgr)
|
|
#define work_to_uq_mgr(w, name) container_of(w, struct amdgpu_userq_mgr, name)
|
|
|
|
enum amdgpu_userq_state {
|
|
AMDGPU_USERQ_STATE_UNMAPPED = 0,
|
|
AMDGPU_USERQ_STATE_MAPPED,
|
|
AMDGPU_USERQ_STATE_PREEMPTED,
|
|
AMDGPU_USERQ_STATE_HUNG,
|
|
AMDGPU_USERQ_STATE_INVALID_VA,
|
|
};
|
|
|
|
struct amdgpu_mqd_prop;
|
|
|
|
struct amdgpu_userq_obj {
|
|
void *cpu_ptr;
|
|
uint64_t gpu_addr;
|
|
struct amdgpu_bo *obj;
|
|
};
|
|
|
|
struct amdgpu_userq_va_cursor {
|
|
u64 gpu_addr;
|
|
struct list_head list;
|
|
};
|
|
|
|
struct amdgpu_usermode_queue {
|
|
int queue_type;
|
|
enum amdgpu_userq_state state;
|
|
uint64_t doorbell_handle;
|
|
uint64_t doorbell_index;
|
|
uint64_t flags;
|
|
struct amdgpu_mqd_prop *userq_prop;
|
|
struct amdgpu_userq_mgr *userq_mgr;
|
|
struct amdgpu_vm *vm;
|
|
struct amdgpu_userq_obj mqd;
|
|
struct amdgpu_userq_obj db_obj;
|
|
struct amdgpu_userq_obj fw_obj;
|
|
struct amdgpu_userq_obj wptr_obj;
|
|
struct xarray fence_drv_xa;
|
|
struct amdgpu_userq_fence_driver *fence_drv;
|
|
struct dma_fence *last_fence;
|
|
u32 xcp_id;
|
|
int priority;
|
|
struct dentry *debugfs_queue;
|
|
struct delayed_work hang_detect_work;
|
|
struct dma_fence *hang_detect_fence;
|
|
|
|
struct list_head userq_va_list;
|
|
};
|
|
|
|
struct amdgpu_userq_funcs {
|
|
int (*mqd_create)(struct amdgpu_usermode_queue *queue,
|
|
struct drm_amdgpu_userq_in *args);
|
|
void (*mqd_destroy)(struct amdgpu_usermode_queue *uq);
|
|
int (*unmap)(struct amdgpu_usermode_queue *queue);
|
|
int (*map)(struct amdgpu_usermode_queue *queue);
|
|
int (*preempt)(struct amdgpu_usermode_queue *queue);
|
|
int (*restore)(struct amdgpu_usermode_queue *queue);
|
|
int (*detect_and_reset)(struct amdgpu_device *adev,
|
|
int queue_type);
|
|
};
|
|
|
|
/* Usermode queues for gfx */
|
|
struct amdgpu_userq_mgr {
|
|
/**
|
|
* @userq_xa: Per-process user queue map (queue ID → queue)
|
|
* Key: queue_id (unique ID within the process's userq manager)
|
|
* Value: struct amdgpu_usermode_queue
|
|
*/
|
|
struct xarray userq_xa;
|
|
struct mutex userq_mutex;
|
|
struct amdgpu_device *adev;
|
|
struct delayed_work resume_work;
|
|
struct drm_file *file;
|
|
atomic_t userq_count[AMDGPU_RING_TYPE_MAX];
|
|
};
|
|
|
|
struct amdgpu_db_info {
|
|
uint64_t doorbell_handle;
|
|
uint32_t queue_type;
|
|
uint32_t doorbell_offset;
|
|
struct amdgpu_userq_obj *db_obj;
|
|
};
|
|
|
|
int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
|
|
|
|
int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
|
|
struct amdgpu_device *adev);
|
|
|
|
void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);
|
|
|
|
int amdgpu_userq_create_object(struct amdgpu_userq_mgr *uq_mgr,
|
|
struct amdgpu_userq_obj *userq_obj,
|
|
int size);
|
|
|
|
void amdgpu_userq_destroy_object(struct amdgpu_userq_mgr *uq_mgr,
|
|
struct amdgpu_userq_obj *userq_obj);
|
|
|
|
void amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
|
|
struct amdgpu_eviction_fence *ev_fence);
|
|
|
|
void amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *userq_mgr,
|
|
struct amdgpu_eviction_fence_mgr *evf_mgr);
|
|
|
|
uint64_t amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr,
|
|
struct amdgpu_db_info *db_info,
|
|
struct drm_file *filp);
|
|
|
|
u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev);
|
|
bool amdgpu_userq_enabled(struct drm_device *dev);
|
|
|
|
int amdgpu_userq_suspend(struct amdgpu_device *adev);
|
|
int amdgpu_userq_resume(struct amdgpu_device *adev);
|
|
|
|
int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
|
|
u32 idx);
|
|
int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
|
|
u32 idx);
|
|
void amdgpu_userq_reset_work(struct work_struct *work);
|
|
void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
|
|
int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
|
|
void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue);
|
|
|
|
int amdgpu_userq_input_va_validate(struct amdgpu_device *adev,
|
|
struct amdgpu_usermode_queue *queue,
|
|
u64 addr, u64 expected_size);
|
|
int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
|
|
struct amdgpu_bo_va_mapping *mapping,
|
|
uint64_t saddr);
|
|
#endif
|