Files
linux-stable-mirror/kernel/audit.h
T
Ricardo Robaina 81905b5acb audit: fix recursive locking deadlock in audit_dupe_exe()
A deadlock occurs in the audit subsystem when duplicating
executable-related rules.

When a file is moved (e.g., via do_renameat2()), the VFS layer locks
the parent directory (I_MUTEX_PARENT), which synchronously triggers an
fsnotify_move event. If an existing executable audit rule matches the
file being moved, the audit subsystem catches this event and calls
audit_dupe_exe() to duplicate the watch and update the rule. Then,
audit_alloc_mark() would call kern_path_parent() to resolve the path,
leading to a blind attempt to acquire the exact same I_MUTEX_PARENT lock
already held by the task, resulting in the following recursive locking
deadlock:

 ============================================
 WARNING: possible recursive locking detected
 6.12.0-55.27.1.el10_0.x86_64+debug #1 Not tainted
 --------------------------------------------
 mv/5099 is trying to acquire lock:
 ffff888132845358 (&inode->i_sb->s_type->i_mutex_dir_key/1){+.+.}-{3:3},
 at: __kern_path_locked+0x10a/0x2f0

 but task is already holding lock:
 ffff888132846b58 (&inode->i_sb->s_type->i_mutex_dir_key/1){+.+.}-{3:3},
 at: lock_two_directories+0x13f/0x2b0

 other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(&inode->i_sb->s_type->i_mutex_dir_key/1);
   lock(&inode->i_sb->s_type->i_mutex_dir_key/1);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

  6 locks held by mv/5099:
  #0: ffff888112a9c440 (sb_writers#13)
  at: do_renameat2+0x34c/0xbc0
  #1: ffff888112a9c790 (&type->s_vfs_rename_key#3)
  at: do_renameat2+0x415/0xbc0
  #2: ffff888132846b58 (&inode->i_sb->s_type->i_mutex_dir_key/1)
  at: lock_two_directories+0x13f/0x2b0
  #3: ffff888132845358 (&inode->i_sb->s_type->i_mutex_dir_key/5)
  at: lock_two_directories+0x175/0x2b0
  #4: ffffffffb3a1fb10 (&fsnotify_mark_srcu)
  at: fsnotify+0x454/0x28a0
  #5: ffffffffaf886230 (audit_filter_mutex)
  at: audit_update_watch+0x36/0x11e0

 stack backtrace:
 Call Trace:
  <TASK>
  dump_stack_lvl+0x6f/0xb0
  print_deadlock_bug.cold+0xbd/0xca
  validate_chain+0x83a/0xf00
  __lock_acquire+0xcac/0x1d20
  lock_acquire.part.0+0x11b/0x360
  down_write_nested+0x9f/0x230
  __kern_path_locked+0x10a/0x2f0
  kern_path_locked+0x26/0x40
  audit_alloc_mark+0xfb/0x4f0
  audit_dupe_exe+0x6c/0xe0
  audit_dupe_rule+0x6c2/0xc00
  audit_update_watch+0x4cc/0x11e0
  audit_watch_handle_event+0x12c/0x1b0
  send_to_group+0x5d0/0x8b0
  fsnotify+0x615/0x28a0
  fsnotify_move+0x1d8/0x630
  vfs_rename+0xdcd/0x1df0
  do_renameat2+0x9d4/0xbc0
  __x64_sys_renameat+0x192/0x260
  do_syscall_64+0x92/0x180
  entry_SYSCALL_64_after_hwframe+0x76/0x7e
 RIP: 0033:0x7f0491fe8c4e
 Code: 0f 1f 40 00 48 8b 15 c1 e1 16 00 f7 d8 64 89 02 b8 ff ff ff ff
 c3 66 0f 1f 44 00 00 f3 0f 1e fa 49 89 ca b8 08 01 00 00 0f 05 <48>
 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 89
 RSP: 002b:00007ffc7210bf38 EFLAGS: 00000246 ORIG_RAX: 0000000000000108
 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0491fe8c4e
 RDX: 0000000000000003 RSI: 00007ffc7210e6c8 RDI: 00000000ffffff9c
 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
 R10: 00005575eb2dae2a R11: 0000000000000246 R12: 00005575eb2dae2a
 R13: 00007ffc7210e6c8 R14: 0000000000000003 R15: 00000000ffffff9c
  </TASK>

The aforementioned deadlock can be consistently reproduced by running
the script below:

 audit-dupe-exe-deadlock.sh
 --------------------------
 #!/bin/bash
 auditctl -D
 mkdir -p /tmp/foo
 touch /tmp/file
 auditctl -a always,exit -F exe=/tmp/file -F path=/tmp/file -S all -k dr
 mv /tmp/file /tmp/foo/file
 rm -Rf /tmp/foo

This patch fixes the issue by introducing struct audit_watch_ctx to pass
the fsnotify event context down to audit_alloc_mark(). By utilizing the
already-resolved directory inode provided by the event, we bypass the
kern_path_parent() path resolution entirely, safely avoiding the
recursive lock. Furthermore, it explicitly allows duplicate fsnotify
marks (allow_dups = 1) during the rename update, allowing the new rule's
mark to safely coexist with the old rule's mark until the old rule is
freed.

P.S.: This issue was identified and reproduced during a comprehensive
code coverage analysis of the audit subsystem. The full report is
available at the link below:

https://people.redhat.com/rrobaina/audit-code-coverage-analysis.pdf

P.P.S: With the permission of both Ricardo and Nathan, I've squashed a
fixup patch from Nathan that addresses a compile time error when
CONFIG_AUDITSYSCALL=n.

Cc: stable@kernel.org
Fixes: 34d99af52a ("audit: implement audit by executable")
Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Ricardo Robaina <rrobaina@redhat.com>
[PM: move link metadata into the msg, apply fix from NC]
Signed-off-by: Paul Moore <paul@paul-moore.com>
2026-05-27 19:15:34 -04:00

360 lines
11 KiB
C

/* SPDX-License-Identifier: GPL-2.0-or-later */
/* audit -- definition of audit_context structure and supporting types
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*/
#ifndef _KERNEL_AUDIT_H_
#define _KERNEL_AUDIT_H_
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
#include <linux/tty.h>
#include <uapi/linux/openat2.h> // struct open_how
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
* a name dynamically and also add those to the list anchored by names_list. */
#define AUDIT_NAMES 5
/* At task start time, the audit_state is set in the audit_context using
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
enum audit_state {
AUDIT_STATE_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
AUDIT_STATE_BUILD, /* Create the per-task audit_context,
* and fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
* should be recorded. */
AUDIT_STATE_RECORD /* Create the per-task audit_context,
* always fill it in at syscall entry
* time, and always write out the audit
* record at syscall exit time. */
};
/* Rule lists */
struct audit_watch;
struct audit_fsnotify_mark;
struct audit_tree;
struct audit_chunk;
struct audit_entry {
struct list_head list;
struct rcu_head rcu;
struct audit_krule rule;
};
struct audit_cap_data {
kernel_cap_t permitted;
kernel_cap_t inheritable;
union {
unsigned int fE; /* effective bit of file cap */
kernel_cap_t effective; /* effective set of process */
};
kernel_cap_t ambient;
kuid_t rootid;
};
/* When fs/namei.c:getname() is called, we store the pointer in name and bump
* the refcnt in the associated filename struct.
*
* Further, in fs/namei.c:path_lookup() we store the inode and device.
*/
struct audit_names {
struct list_head list; /* audit_context->names_list */
struct filename *name;
int name_len; /* number of chars to log */
bool hidden; /* don't log this record */
u64 ino;
dev_t dev;
umode_t mode;
kuid_t uid;
kgid_t gid;
dev_t rdev;
struct lsm_prop oprop;
struct audit_cap_data fcap;
unsigned int fcap_ver;
unsigned char type; /* record type */
/*
* This was an allocated audit_names and not from the array of
* names allocated in the task audit context. Thus this name
* should be freed on syscall exit.
*/
bool should_free;
};
struct audit_proctitle {
int len; /* length of the cmdline field. */
char *value; /* the cmdline field */
};
/* A timestamp/serial pair to identify an event */
struct audit_stamp {
struct timespec64 ctime; /* time of syscall entry */
unsigned int serial; /* serial number for record */
};
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
enum {
AUDIT_CTX_UNUSED, /* audit_context is currently unused */
AUDIT_CTX_SYSCALL, /* in use by syscall */
AUDIT_CTX_URING, /* in use by io_uring */
} context;
enum audit_state state, current_state;
struct audit_stamp stamp; /* event identifier */
int major; /* syscall number */
int uring_op; /* uring operation */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
u64 prio;
int return_valid; /* return code is valid */
/*
* The names_list is the list of all audit_names collected during this
* syscall. The first AUDIT_NAMES entries in the names_list will
* actually be from the preallocated_names array for performance
* reasons. Except during allocation they should never be referenced
* through the preallocated_names array and should only be found/used
* by running the names_list.
*/
struct audit_names preallocated_names[AUDIT_NAMES];
int name_count; /* total records in names_list */
struct list_head names_list; /* struct audit_names->list anchor */
char *filterkey; /* key for rule that triggered record */
struct path pwd;
struct audit_aux_data *aux;
struct audit_aux_data *aux_pids;
struct sockaddr_storage *sockaddr;
size_t sockaddr_len;
/* Save things to print about task_struct */
pid_t ppid;
kuid_t uid, euid, suid, fsuid;
kgid_t gid, egid, sgid, fsgid;
unsigned long personality;
int arch;
pid_t target_pid;
kuid_t target_auid;
kuid_t target_uid;
unsigned int target_sessionid;
struct lsm_prop target_ref;
char target_comm[TASK_COMM_LEN];
struct audit_tree_refs *trees, *first_trees;
struct list_head killed_trees;
int tree_count;
int type;
union {
struct {
int nargs;
long args[6];
} socketcall;
struct {
kuid_t uid;
kgid_t gid;
umode_t mode;
struct lsm_prop oprop;
int has_perm;
uid_t perm_uid;
gid_t perm_gid;
umode_t perm_mode;
unsigned long qbytes;
} ipc;
struct {
mqd_t mqdes;
struct mq_attr mqstat;
} mq_getsetattr;
struct {
mqd_t mqdes;
int sigev_signo;
} mq_notify;
struct {
mqd_t mqdes;
size_t msg_len;
unsigned int msg_prio;
struct timespec64 abs_timeout;
} mq_sendrecv;
struct {
int oflag;
umode_t mode;
struct mq_attr attr;
} mq_open;
struct {
pid_t pid;
struct audit_cap_data cap;
} capset;
struct {
int fd;
int flags;
} mmap;
struct open_how openat2;
struct {
int argc;
} execve;
struct {
const char *name;
} module;
struct {
struct audit_ntp_data ntp_data;
struct timespec64 tk_injoffset;
} time;
};
int fds[2];
struct audit_proctitle proctitle;
};
extern bool audit_ever_enabled;
extern void audit_log_session_info(struct audit_buffer *ab);
extern int auditd_test_task(struct task_struct *task);
#define AUDIT_INODE_BUCKETS 32
extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
static inline int audit_hash_ino(u64 ino)
{
return ((u32)ino & (AUDIT_INODE_BUCKETS-1));
}
/* Indicates that audit should log the full pathname. */
#define AUDIT_NAME_FULL -1
extern int audit_match_class(int class, unsigned int syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
extern int parent_len(const char *path);
extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen);
extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi,
const void *payload, int size);
extern void audit_panic(const char *message);
struct audit_netlink_list {
__u32 portid;
struct net *net;
struct sk_buff_head q;
};
int audit_send_list_thread(void *_dest);
extern struct mutex audit_filter_mutex;
extern int audit_del_rule(struct audit_entry *entry);
extern void audit_free_rule_rcu(struct rcu_head *head);
extern struct list_head audit_filter_list[];
struct audit_watch_ctx {
struct inode *dir;
struct inode *child;
};
extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
struct audit_watch_ctx *ctx);
extern void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm);
extern struct tty_struct *audit_get_tty(void);
extern void audit_put_tty(struct tty_struct *tty);
/* audit watch/mark/tree functions */
extern unsigned int audit_serial(void);
#ifdef CONFIG_AUDITSYSCALL
extern int auditsc_get_stamp(struct audit_context *ctx,
struct audit_stamp *stamp);
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
extern int audit_to_watch(struct audit_krule *krule, char *path, int len,
u32 op);
extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
extern int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev);
extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule,
char *pathname, int len,
struct audit_watch_ctx *ctx);
extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
extern void audit_remove_mark_rule(struct audit_krule *krule);
extern int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino,
dev_t dev);
extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old,
struct audit_watch_ctx *ctx);
extern int audit_exe_compare(struct task_struct *tsk,
struct audit_fsnotify_mark *mark);
extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
extern void audit_put_chunk(struct audit_chunk *chunk);
extern bool audit_tree_match(struct audit_chunk *chunk,
struct audit_tree *tree);
extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
extern int audit_add_tree_rule(struct audit_krule *rule);
extern int audit_remove_tree_rule(struct audit_krule *rule);
extern void audit_trim_trees(void);
extern int audit_tag_tree(char *old, char *new);
extern const char *audit_tree_path(struct audit_tree *tree);
extern void audit_put_tree(struct audit_tree *tree);
extern void audit_kill_trees(struct audit_context *context);
extern int audit_signal_info_syscall(struct task_struct *t);
extern void audit_filter_inodes(struct task_struct *tsk,
struct audit_context *ctx);
extern struct list_head *audit_killed_trees(void);
#else /* CONFIG_AUDITSYSCALL */
#define auditsc_get_stamp(c, s) 0
#define audit_put_watch(w) do { } while (0)
#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
#define audit_add_watch(k, l) (-EINVAL)
#define audit_remove_watch_rule(k) BUG()
#define audit_watch_path(w) ""
#define audit_watch_compare(w, i, d) 0
#define audit_alloc_mark(k, p, l, c) (ERR_PTR(-EINVAL))
#define audit_mark_path(m) ""
#define audit_remove_mark(m) do { } while (0)
#define audit_remove_mark_rule(k) do { } while (0)
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
#define audit_dupe_exe(n, o, c) (-EINVAL)
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
#define audit_trim_trees() do { } while (0)
#define audit_put_tree(tree) do { } while (0)
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
#define audit_kill_trees(context) BUG()
static inline int audit_signal_info_syscall(struct task_struct *t)
{
return 0;
}
#define audit_filter_inodes(t, c) do { } while (0)
#endif /* CONFIG_AUDITSYSCALL */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
extern int audit_filter(int msgtype, unsigned int listtype);
extern void audit_ctl_lock(void);
extern void audit_ctl_unlock(void);
#endif