mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-06-21 15:43:21 +02:00
81905b5acb
A deadlock occurs in the audit subsystem when duplicating
executable-related rules.
When a file is moved (e.g., via do_renameat2()), the VFS layer locks
the parent directory (I_MUTEX_PARENT), which synchronously triggers an
fsnotify_move event. If an existing executable audit rule matches the
file being moved, the audit subsystem catches this event and calls
audit_dupe_exe() to duplicate the watch and update the rule. Then,
audit_alloc_mark() would call kern_path_parent() to resolve the path,
leading to a blind attempt to acquire the exact same I_MUTEX_PARENT lock
already held by the task, resulting in the following recursive locking
deadlock:
============================================
WARNING: possible recursive locking detected
6.12.0-55.27.1.el10_0.x86_64+debug #1 Not tainted
--------------------------------------------
mv/5099 is trying to acquire lock:
ffff888132845358 (&inode->i_sb->s_type->i_mutex_dir_key/1){+.+.}-{3:3},
at: __kern_path_locked+0x10a/0x2f0
but task is already holding lock:
ffff888132846b58 (&inode->i_sb->s_type->i_mutex_dir_key/1){+.+.}-{3:3},
at: lock_two_directories+0x13f/0x2b0
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&inode->i_sb->s_type->i_mutex_dir_key/1);
lock(&inode->i_sb->s_type->i_mutex_dir_key/1);
*** DEADLOCK ***
May be due to missing lock nesting notation
6 locks held by mv/5099:
#0: ffff888112a9c440 (sb_writers#13)
at: do_renameat2+0x34c/0xbc0
#1: ffff888112a9c790 (&type->s_vfs_rename_key#3)
at: do_renameat2+0x415/0xbc0
#2: ffff888132846b58 (&inode->i_sb->s_type->i_mutex_dir_key/1)
at: lock_two_directories+0x13f/0x2b0
#3: ffff888132845358 (&inode->i_sb->s_type->i_mutex_dir_key/5)
at: lock_two_directories+0x175/0x2b0
#4: ffffffffb3a1fb10 (&fsnotify_mark_srcu)
at: fsnotify+0x454/0x28a0
#5: ffffffffaf886230 (audit_filter_mutex)
at: audit_update_watch+0x36/0x11e0
stack backtrace:
Call Trace:
<TASK>
dump_stack_lvl+0x6f/0xb0
print_deadlock_bug.cold+0xbd/0xca
validate_chain+0x83a/0xf00
__lock_acquire+0xcac/0x1d20
lock_acquire.part.0+0x11b/0x360
down_write_nested+0x9f/0x230
__kern_path_locked+0x10a/0x2f0
kern_path_locked+0x26/0x40
audit_alloc_mark+0xfb/0x4f0
audit_dupe_exe+0x6c/0xe0
audit_dupe_rule+0x6c2/0xc00
audit_update_watch+0x4cc/0x11e0
audit_watch_handle_event+0x12c/0x1b0
send_to_group+0x5d0/0x8b0
fsnotify+0x615/0x28a0
fsnotify_move+0x1d8/0x630
vfs_rename+0xdcd/0x1df0
do_renameat2+0x9d4/0xbc0
__x64_sys_renameat+0x192/0x260
do_syscall_64+0x92/0x180
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f0491fe8c4e
Code: 0f 1f 40 00 48 8b 15 c1 e1 16 00 f7 d8 64 89 02 b8 ff ff ff ff
c3 66 0f 1f 44 00 00 f3 0f 1e fa 49 89 ca b8 08 01 00 00 0f 05 <48>
3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 89
RSP: 002b:00007ffc7210bf38 EFLAGS: 00000246 ORIG_RAX: 0000000000000108
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0491fe8c4e
RDX: 0000000000000003 RSI: 00007ffc7210e6c8 RDI: 00000000ffffff9c
RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
R10: 00005575eb2dae2a R11: 0000000000000246 R12: 00005575eb2dae2a
R13: 00007ffc7210e6c8 R14: 0000000000000003 R15: 00000000ffffff9c
</TASK>
The aforementioned deadlock can be consistently reproduced by running
the script below:
audit-dupe-exe-deadlock.sh
--------------------------
#!/bin/bash
auditctl -D
mkdir -p /tmp/foo
touch /tmp/file
auditctl -a always,exit -F exe=/tmp/file -F path=/tmp/file -S all -k dr
mv /tmp/file /tmp/foo/file
rm -Rf /tmp/foo
This patch fixes the issue by introducing struct audit_watch_ctx to pass
the fsnotify event context down to audit_alloc_mark(). By utilizing the
already-resolved directory inode provided by the event, we bypass the
kern_path_parent() path resolution entirely, safely avoiding the
recursive lock. Furthermore, it explicitly allows duplicate fsnotify
marks (allow_dups = 1) during the rename update, allowing the new rule's
mark to safely coexist with the old rule's mark until the old rule is
freed.
P.S.: This issue was identified and reproduced during a comprehensive
code coverage analysis of the audit subsystem. The full report is
available at the link below:
https://people.redhat.com/rrobaina/audit-code-coverage-analysis.pdf
P.P.S: With the permission of both Ricardo and Nathan, I've squashed a
fixup patch from Nathan that addresses a compile time error when
CONFIG_AUDITSYSCALL=n.
Cc: stable@kernel.org
Fixes: 34d99af52a ("audit: implement audit by executable")
Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Ricardo Robaina <rrobaina@redhat.com>
[PM: move link metadata into the msg, apply fix from NC]
Signed-off-by: Paul Moore <paul@paul-moore.com>
360 lines
11 KiB
C
360 lines
11 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
/* audit -- definition of audit_context structure and supporting types
|
|
*
|
|
* Copyright 2003-2004 Red Hat, Inc.
|
|
* Copyright 2005 Hewlett-Packard Development Company, L.P.
|
|
* Copyright 2005 IBM Corporation
|
|
*/
|
|
|
|
#ifndef _KERNEL_AUDIT_H_
|
|
#define _KERNEL_AUDIT_H_
|
|
|
|
#include <linux/fs.h>
|
|
#include <linux/audit.h>
|
|
#include <linux/security.h>
|
|
#include <linux/skbuff.h>
|
|
#include <uapi/linux/mqueue.h>
|
|
#include <linux/tty.h>
|
|
#include <uapi/linux/openat2.h> // struct open_how
|
|
|
|
/*
 * AUDIT_NAMES is the number of slots we reserve in the audit_context
 * for saving names from getname().  If we get more names we will allocate
 * a name dynamically and also add those to the list anchored by names_list.
 */
#define AUDIT_NAMES	5
|
|
|
|
/*
 * At task start time, the audit_state is set in the audit_context using
 * a per-task filter.  At syscall entry, the audit_state is augmented by
 * the syscall filter.
 */
enum audit_state {
	AUDIT_STATE_DISABLED,	/* Do not create per-task audit_context.
				 * No syscall-specific audit records can
				 * be generated. */
	AUDIT_STATE_BUILD,	/* Create the per-task audit_context,
				 * and fill it in at syscall
				 * entry time.  This makes a full
				 * syscall record available if some
				 * other part of the kernel decides it
				 * should be recorded. */
	AUDIT_STATE_RECORD	/* Create the per-task audit_context,
				 * always fill it in at syscall entry
				 * time, and always write out the audit
				 * record at syscall exit time.  */
};
|
|
|
|
/*
 * Rule lists
 *
 * Forward declarations only: this header refers to these types solely
 * through pointers, so their layouts are not needed here.
 */
struct audit_watch;
struct audit_fsnotify_mark;
struct audit_tree;
struct audit_chunk;
|
|
|
|
struct audit_entry {
|
|
struct list_head list;
|
|
struct rcu_head rcu;
|
|
struct audit_krule rule;
|
|
};
|
|
|
|
struct audit_cap_data {
|
|
kernel_cap_t permitted;
|
|
kernel_cap_t inheritable;
|
|
union {
|
|
unsigned int fE; /* effective bit of file cap */
|
|
kernel_cap_t effective; /* effective set of process */
|
|
};
|
|
kernel_cap_t ambient;
|
|
kuid_t rootid;
|
|
};
|
|
|
|
/* When fs/namei.c:getname() is called, we store the pointer in name and bump
|
|
* the refcnt in the associated filename struct.
|
|
*
|
|
* Further, in fs/namei.c:path_lookup() we store the inode and device.
|
|
*/
|
|
struct audit_names {
|
|
struct list_head list; /* audit_context->names_list */
|
|
|
|
struct filename *name;
|
|
int name_len; /* number of chars to log */
|
|
bool hidden; /* don't log this record */
|
|
|
|
u64 ino;
|
|
dev_t dev;
|
|
umode_t mode;
|
|
kuid_t uid;
|
|
kgid_t gid;
|
|
dev_t rdev;
|
|
struct lsm_prop oprop;
|
|
struct audit_cap_data fcap;
|
|
unsigned int fcap_ver;
|
|
unsigned char type; /* record type */
|
|
/*
|
|
* This was an allocated audit_names and not from the array of
|
|
* names allocated in the task audit context. Thus this name
|
|
* should be freed on syscall exit.
|
|
*/
|
|
bool should_free;
|
|
};
|
|
|
|
/* Length/value pair holding the cmdline field. */
struct audit_proctitle {
	int	len;	/* length of the cmdline field. */
	char	*value;	/* the cmdline field */
};
|
|
|
|
/* A timestamp/serial pair to identify an event */
|
|
struct audit_stamp {
|
|
struct timespec64 ctime; /* time of syscall entry */
|
|
unsigned int serial; /* serial number for record */
|
|
};
|
|
|
|
/* The per-task audit context. */
|
|
struct audit_context {
|
|
int dummy; /* must be the first element */
|
|
enum {
|
|
AUDIT_CTX_UNUSED, /* audit_context is currently unused */
|
|
AUDIT_CTX_SYSCALL, /* in use by syscall */
|
|
AUDIT_CTX_URING, /* in use by io_uring */
|
|
} context;
|
|
enum audit_state state, current_state;
|
|
struct audit_stamp stamp; /* event identifier */
|
|
int major; /* syscall number */
|
|
int uring_op; /* uring operation */
|
|
unsigned long argv[4]; /* syscall arguments */
|
|
long return_code;/* syscall return code */
|
|
u64 prio;
|
|
int return_valid; /* return code is valid */
|
|
/*
|
|
* The names_list is the list of all audit_names collected during this
|
|
* syscall. The first AUDIT_NAMES entries in the names_list will
|
|
* actually be from the preallocated_names array for performance
|
|
* reasons. Except during allocation they should never be referenced
|
|
* through the preallocated_names array and should only be found/used
|
|
* by running the names_list.
|
|
*/
|
|
struct audit_names preallocated_names[AUDIT_NAMES];
|
|
int name_count; /* total records in names_list */
|
|
struct list_head names_list; /* struct audit_names->list anchor */
|
|
char *filterkey; /* key for rule that triggered record */
|
|
struct path pwd;
|
|
struct audit_aux_data *aux;
|
|
struct audit_aux_data *aux_pids;
|
|
struct sockaddr_storage *sockaddr;
|
|
size_t sockaddr_len;
|
|
/* Save things to print about task_struct */
|
|
pid_t ppid;
|
|
kuid_t uid, euid, suid, fsuid;
|
|
kgid_t gid, egid, sgid, fsgid;
|
|
unsigned long personality;
|
|
int arch;
|
|
|
|
pid_t target_pid;
|
|
kuid_t target_auid;
|
|
kuid_t target_uid;
|
|
unsigned int target_sessionid;
|
|
struct lsm_prop target_ref;
|
|
char target_comm[TASK_COMM_LEN];
|
|
|
|
struct audit_tree_refs *trees, *first_trees;
|
|
struct list_head killed_trees;
|
|
int tree_count;
|
|
|
|
int type;
|
|
union {
|
|
struct {
|
|
int nargs;
|
|
long args[6];
|
|
} socketcall;
|
|
struct {
|
|
kuid_t uid;
|
|
kgid_t gid;
|
|
umode_t mode;
|
|
struct lsm_prop oprop;
|
|
int has_perm;
|
|
uid_t perm_uid;
|
|
gid_t perm_gid;
|
|
umode_t perm_mode;
|
|
unsigned long qbytes;
|
|
} ipc;
|
|
struct {
|
|
mqd_t mqdes;
|
|
struct mq_attr mqstat;
|
|
} mq_getsetattr;
|
|
struct {
|
|
mqd_t mqdes;
|
|
int sigev_signo;
|
|
} mq_notify;
|
|
struct {
|
|
mqd_t mqdes;
|
|
size_t msg_len;
|
|
unsigned int msg_prio;
|
|
struct timespec64 abs_timeout;
|
|
} mq_sendrecv;
|
|
struct {
|
|
int oflag;
|
|
umode_t mode;
|
|
struct mq_attr attr;
|
|
} mq_open;
|
|
struct {
|
|
pid_t pid;
|
|
struct audit_cap_data cap;
|
|
} capset;
|
|
struct {
|
|
int fd;
|
|
int flags;
|
|
} mmap;
|
|
struct open_how openat2;
|
|
struct {
|
|
int argc;
|
|
} execve;
|
|
struct {
|
|
const char *name;
|
|
} module;
|
|
struct {
|
|
struct audit_ntp_data ntp_data;
|
|
struct timespec64 tk_injoffset;
|
|
} time;
|
|
};
|
|
int fds[2];
|
|
struct audit_proctitle proctitle;
|
|
};
|
|
|
|
/* Declarations only; the definitions live outside this header. */
extern bool audit_ever_enabled;

extern void audit_log_session_info(struct audit_buffer *ab);

extern int auditd_test_task(struct task_struct *task);
|
|
|
|
#define AUDIT_INODE_BUCKETS 32
|
|
extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
|
|
|
|
/*
 * Reduce an inode number to a value in [0, AUDIT_INODE_BUCKETS - 1] by
 * keeping the low bits of its lower 32 bits.  Masking (rather than taking
 * a modulus) only covers every bucket because AUDIT_INODE_BUCKETS is a
 * power of two.
 * NOTE(review): the result is presumably used as an index into
 * audit_inode_hash[] -- confirm at the call sites.
 */
static inline int audit_hash_ino(u64 ino)
{
	return ((u32)ino & (AUDIT_INODE_BUCKETS-1));
}
|
|
|
|
/* Indicates that audit should log the full pathname. */
|
|
#define AUDIT_NAME_FULL -1
|
|
|
|
extern int audit_match_class(int class, unsigned int syscall);
|
|
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
|
|
extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
|
|
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
|
|
extern int parent_len(const char *path);
|
|
extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen);
|
|
extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi,
|
|
const void *payload, int size);
|
|
extern void audit_panic(const char *message);
|
|
|
|
struct audit_netlink_list {
|
|
__u32 portid;
|
|
struct net *net;
|
|
struct sk_buff_head q;
|
|
};
|
|
|
|
int audit_send_list_thread(void *_dest);
|
|
|
|
extern struct mutex audit_filter_mutex;
|
|
extern int audit_del_rule(struct audit_entry *entry);
|
|
extern void audit_free_rule_rcu(struct rcu_head *head);
|
|
extern struct list_head audit_filter_list[];
|
|
|
|
/*
 * Context carried from an fsnotify event down to audit_dupe_rule(),
 * audit_dupe_exe() and audit_alloc_mark(), so that the directory inode the
 * event already resolved can be used instead of resolving the path again.
 *
 * @dir:   directory inode supplied by the event.
 * @child: NOTE(review): presumably the inode of the affected entry inside
 *         @dir -- confirm against the code that fills this structure.
 */
struct audit_watch_ctx {
	struct inode *dir;
	struct inode *child;
};
|
|
|
|
/* Takes the rule to duplicate and a struct audit_watch_ctx pointer. */
extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
					   struct audit_watch_ctx *ctx);
extern void audit_log_d_path_exe(struct audit_buffer *ab,
				 struct mm_struct *mm);

extern struct tty_struct *audit_get_tty(void);
extern void audit_put_tty(struct tty_struct *tty);

/* audit watch/mark/tree functions */
extern unsigned int audit_serial(void);
|
|
#ifdef CONFIG_AUDITSYSCALL
|
|
extern int auditsc_get_stamp(struct audit_context *ctx,
|
|
struct audit_stamp *stamp);
|
|
|
|
extern void audit_put_watch(struct audit_watch *watch);
|
|
extern void audit_get_watch(struct audit_watch *watch);
|
|
extern int audit_to_watch(struct audit_krule *krule, char *path, int len,
|
|
u32 op);
|
|
extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
|
|
extern void audit_remove_watch_rule(struct audit_krule *krule);
|
|
extern char *audit_watch_path(struct audit_watch *watch);
|
|
extern int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev);
|
|
|
|
extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule,
|
|
char *pathname, int len,
|
|
struct audit_watch_ctx *ctx);
|
|
extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
|
|
extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
|
|
extern void audit_remove_mark_rule(struct audit_krule *krule);
|
|
extern int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino,
|
|
dev_t dev);
|
|
extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old,
|
|
struct audit_watch_ctx *ctx);
|
|
extern int audit_exe_compare(struct task_struct *tsk,
|
|
struct audit_fsnotify_mark *mark);
|
|
|
|
extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
|
|
extern void audit_put_chunk(struct audit_chunk *chunk);
|
|
extern bool audit_tree_match(struct audit_chunk *chunk,
|
|
struct audit_tree *tree);
|
|
extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
|
|
extern int audit_add_tree_rule(struct audit_krule *rule);
|
|
extern int audit_remove_tree_rule(struct audit_krule *rule);
|
|
extern void audit_trim_trees(void);
|
|
extern int audit_tag_tree(char *old, char *new);
|
|
extern const char *audit_tree_path(struct audit_tree *tree);
|
|
extern void audit_put_tree(struct audit_tree *tree);
|
|
extern void audit_kill_trees(struct audit_context *context);
|
|
|
|
extern int audit_signal_info_syscall(struct task_struct *t);
|
|
extern void audit_filter_inodes(struct task_struct *tsk,
|
|
struct audit_context *ctx);
|
|
extern struct list_head *audit_killed_trees(void);
|
|
#else /* CONFIG_AUDITSYSCALL */
|
|
/*
 * Stubs used when CONFIG_AUDITSYSCALL is disabled.  They are macros, so
 * their arguments are never evaluated.  Every value-producing stub that
 * expands to a negative errno is parenthesized so the expansion behaves as
 * a single operand wherever it is used.
 */
#define auditsc_get_stamp(c, s) 0
#define audit_put_watch(w) do { } while (0)
#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
#define audit_add_watch(k, l) (-EINVAL)
#define audit_remove_watch_rule(k) BUG()
#define audit_watch_path(w) ""
#define audit_watch_compare(w, i, d) 0

#define audit_alloc_mark(k, p, l, c) (ERR_PTR(-EINVAL))
#define audit_mark_path(m) ""
#define audit_remove_mark(m) do { } while (0)
#define audit_remove_mark_rule(k) do { } while (0)
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
#define audit_dupe_exe(n, o, c) (-EINVAL)

#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) (-EINVAL)
#define audit_make_tree(rule, str, op) (-EINVAL)
#define audit_trim_trees() do { } while (0)
#define audit_put_tree(tree) do { } while (0)
#define audit_tag_tree(old, new) (-EINVAL)
#define audit_tree_path(rule) ""	/* never called */
#define audit_kill_trees(context) BUG()
|
|
|
|
/*
 * Stub for builds without CONFIG_AUDITSYSCALL: ignores @t and always
 * returns 0.
 */
static inline int audit_signal_info_syscall(struct task_struct *t)
{
	return 0;
}

/* Stub for builds without CONFIG_AUDITSYSCALL: expands to an empty statement. */
#define audit_filter_inodes(t, c) do { } while (0)
|
|
#endif /* CONFIG_AUDITSYSCALL */
|
|
|
|
/* Declarations only; the definitions live outside this header. */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);

extern int audit_filter(int msgtype, unsigned int listtype);

extern void audit_ctl_lock(void);
extern void audit_ctl_unlock(void);
|
|
|
|
#endif
|