Files
linux-stable-mirror/include/linux/pid.h
Christian Brauner cb12fd8e0d pidfd: add pidfs
This moves pidfds from the anonymous inode infrastructure to a tiny
pseudo filesystem. This has been on my todo for quite a while as it will
unblock further work that we weren't able to do simply because of the
very justified limitations of anonymous inodes. Moving pidfds to a tiny
pseudo filesystem allows:

* statx() on pidfds becomes useful for the first time.
* pidfds can be compared simply via statx() and then comparing inode
  numbers.
* pidfds have unique inode numbers for the system lifetime.
* struct pid is now stashed in inode->i_private instead of
  file->private_data. This means it is now possible to introduce
  concepts that operate on a process once all file descriptors have been
  closed. A concrete example is kill-on-last-close.
* file->private_data is freed up for per-file options for pidfds.
* Each struct pid will refer to a different inode but the same struct
  pid will refer to the same inode if it's opened multiple times. In
  contrast to now where each struct pid refers to the same inode. Even
  if we were to move to anon_inode_create_getfile() which creates new
  inodes we'd still be associating the same struct pid with multiple
  different inodes.

The tiny pseudo filesystem is not visible anywhere in userspace exactly
like e.g., pipefs and sockfs. There's no lookup, there's no complex
inode operations, nothing. Dentries and inodes are always deleted when
the last pidfd is closed.

We allocate a new inode for each struct pid and we reuse that inode for
all pidfds. We use iget_locked() to find that inode again based on the
inode number which isn't recycled. We allocate a new dentry for each
pidfd that uses the same inode. That is similar to anonymous inodes
which reuse the same inode for thousands of dentries. For pidfds we're
talking way less than that. There usually won't be a lot of concurrent
openers of the same struct pid. They can probably often be counted on
two hands. I know that systemd does use separate pidfd for the same
struct pid for various complex process tracking issues. So I think with
that things actually become way simpler. Especially because we don't
have to care about lookup. Dentries and inodes continue to be always
deleted.

The code is entirely optional and fairly small. If it's not selected we
fallback to anonymous inodes. Heavily inspired by nsfs which uses a
similar stashing mechanism just for namespaces.

Link: https://lore.kernel.org/r/20240213-vfs-pidfd_fs-v1-2-f863f58cfce1@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-03-01 12:23:37 +01:00

334 lines
9.4 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_H
#define _LINUX_PID_H
#include <linux/pid_types.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/wait.h>
/*
* What is struct pid?
*
* A struct pid is the kernel's internal notion of a process identifier.
* It refers to individual tasks, process groups, and sessions. While
* there are processes attached to it the struct pid lives in a hash
* table, so it and then the processes that it refers to can be found
* quickly from the numeric pid value. The attached processes may be
* quickly accessed by following pointers from struct pid.
*
* Storing pid_t values in the kernel and referring to them later has a
* problem. The process originally with that pid may have exited and the
* pid allocator wrapped, and another process could have come along
* and been assigned that pid.
*
* Referring to user space processes by holding a reference to struct
* task_struct has a problem. When the user space process exits
* the now useless task_struct is still kept. A task_struct plus a
* stack consumes around 10K of low kernel memory. More precisely
* this is THREAD_SIZE + sizeof(struct task_struct). By comparison
* a struct pid is about 64 bytes.
*
* Holding a reference to struct pid solves both of these problems.
* It is small so holding a reference does not consume a lot of
* resources, and since a new struct pid is allocated when the numeric pid
* value is reused (when pids wrap around) we don't mistakenly refer to new
* processes.
*/
/*
* struct upid is used to get the id of the struct pid, as it is
* seen in particular namespace. Later the struct pid is found with
* find_pid_ns() using the int nr and struct pid_namespace *ns.
*/
struct upid {
int nr;
struct pid_namespace *ns;
};
struct pid
{
refcount_t count;
unsigned int level;
spinlock_t lock;
#ifdef CONFIG_FS_PID
unsigned long ino;
#endif
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
/* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
struct upid numbers[];
};
extern struct pid init_struct_pid;
struct file;
struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
void do_notify_pidfd(struct task_struct *task);
static inline struct pid *get_pid(struct pid *pid)
{
if (pid)
refcount_inc(&pid->count);
return pid;
}
extern void put_pid(struct pid *pid);
extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
{
return !hlist_empty(&pid->tasks[type]);
}
extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
/*
* these helpers must be called with the tasklist_lock write-held.
*/
extern void attach_pid(struct task_struct *task, enum pid_type);
extern void detach_pid(struct task_struct *task, enum pid_type);
extern void change_pid(struct task_struct *task, enum pid_type,
struct pid *pid);
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type);
extern int pid_max;
extern int pid_max_min, pid_max_max;
/*
* look up a PID in the hash table. Must be called with the tasklist_lock
* or rcu_read_lock() held.
*
* find_pid_ns() finds the pid in the namespace specified
* find_vpid() finds the pid by its virtual id, i.e. in the current namespace
*
* see also find_task_by_vpid() set in include/linux/sched.h
*/
extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
extern struct pid *find_vpid(int nr);
/*
* Lookup a PID in the hash table, and return with it's count elevated.
*/
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);
/*
* ns_of_pid() returns the pid namespace in which the specified pid was
* allocated.
*
* NOTE:
* ns_of_pid() is expected to be called for a process (task) that has
* an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid
* is expected to be non-NULL. If @pid is NULL, caller should handle
* the resulting NULL pid-ns.
*/
static inline struct pid_namespace *ns_of_pid(struct pid *pid)
{
struct pid_namespace *ns = NULL;
if (pid)
ns = pid->numbers[pid->level].ns;
return ns;
}
/*
* is_child_reaper returns true if the pid is the init process
* of the current namespace. As this one could be checked before
* pid_ns->child_reaper is assigned in copy_process, we check
* with the pid number.
*/
static inline bool is_child_reaper(struct pid *pid)
{
return pid->numbers[pid->level].nr == 1;
}
/*
* the helpers to get the pid's id seen from different namespaces
*
* pid_nr() : global id, i.e. the id seen from the init namespace;
* pid_vnr() : virtual id, i.e. the id seen from the pid namespace of
* current.
* pid_nr_ns() : id seen from the ns specified.
*
* see also task_xid_nr() etc in include/linux/sched.h
*/
static inline pid_t pid_nr(struct pid *pid)
{
pid_t nr = 0;
if (pid)
nr = pid->numbers[0].nr;
return nr;
}
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
pid_t pid_vnr(struct pid *pid);
#define do_each_pid_task(pid, type, task) \
do { \
if ((pid) != NULL) \
hlist_for_each_entry_rcu((task), \
&(pid)->tasks[type], pid_links[type]) {
/*
* Both old and new leaders may be attached to
* the same pid in the middle of de_thread().
*/
#define while_each_pid_task(pid, type, task) \
if (type == PIDTYPE_PID) \
break; \
} \
} while (0)
#define do_each_pid_thread(pid, type, task) \
do_each_pid_task(pid, type, task) { \
struct task_struct *tg___ = task; \
for_each_thread(tg___, task) {
#define while_each_pid_thread(pid, type, task) \
} \
task = tg___; \
} while_each_pid_task(pid, type, task)
static inline struct pid *task_pid(struct task_struct *task)
{
return task->thread_pid;
}
/*
* the helpers to get the task's different pids as they are seen
* from various namespaces
*
* task_xid_nr() : global id, i.e. the id seen from the init namespace;
* task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
* current.
* task_xid_nr_ns() : id seen from the ns specified;
*
* see also pid_nr() etc in include/linux/pid.h
*/
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
static inline pid_t task_pid_nr(struct task_struct *tsk)
{
return tsk->pid;
}
static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
}
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
}
static inline pid_t task_tgid_nr(struct task_struct *tsk)
{
return tsk->tgid;
}
/**
* pid_alive - check that a task structure is not stale
* @p: Task structure to be checked.
*
* Test if a process is not yet dead (at most zombie state)
* If pid_alive fails, then pointers within the task structure
* can be stale and must not be dereferenced.
*
* Return: 1 if the process is alive. 0 otherwise.
*/
static inline int pid_alive(const struct task_struct *p)
{
return p->thread_pid != NULL;
}
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
}
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
}
static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
}
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
}
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
}
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
}
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
{
pid_t pid = 0;
rcu_read_lock();
if (pid_alive(tsk))
pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
rcu_read_unlock();
return pid;
}
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
{
return task_ppid_nr_ns(tsk, &init_pid_ns);
}
/* Obsolete, do not use: */
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
{
return task_pgrp_nr_ns(tsk, &init_pid_ns);
}
/**
* is_global_init - check if a task structure is init. Since init
* is free to have sub-threads we need to check tgid.
* @tsk: Task structure to be checked.
*
* Check if a task structure is the first user space task the kernel created.
*
* Return: 1 if the task structure is init. 0 otherwise.
*/
static inline int is_global_init(struct task_struct *tsk)
{
return task_tgid_nr(tsk) == 1;
}
#endif /* _LINUX_PID_H */