mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-04-14 09:57:39 +02:00
Add a new listns() system call that allows userspace to iterate through
namespaces in the system. This provides a programmatic interface to
discover and inspect namespaces, enhancing existing namespace APIs.
Currently, there is no direct way for userspace to enumerate namespaces
in the system. Applications must resort to scanning /proc/<pid>/ns/
across all processes, which is:
1. Inefficient - requires iterating over all processes
2. Incomplete - misses inactive namespaces that aren't attached to any
running process but are kept alive by file descriptors, bind mounts,
or parent namespace references
3. Permission-heavy - requires access to /proc for many processes
4. Unstructured - provides no ordering or ownership information
5. No filtering per namespace type: Must always iterate and check all
namespaces.
The list goes on. The listns() system call solves these problems by
providing direct kernel-level enumeration of namespaces. It is similar
to listmount() but obviously tailored to namespaces.
/*
* listns - iterate through namespaces in the system
* @req: Pointer to struct ns_id_req specifying search parameters
* @ns_ids: User buffer to receive namespace IDs
* @nr_ns_ids: Size of ns_ids buffer (maximum number of IDs to return)
* @flags: Reserved for future use (must be 0)
*
* Return: number of namespace IDs written to @ns_ids on success, a
* negative error code on failure.
*/
ssize_t listns(const struct ns_id_req *req, u64 *ns_ids,
size_t nr_ns_ids, unsigned int flags);
Returns:
- On success: Number of namespace IDs written to ns_ids
- On error: Negative error code
/*
* struct ns_id_req - search parameters passed to listns()
* @size: Structure size
* @spare: Reserved, must be 0
* @ns_id: Starting point for iteration; use 0 for first call, then
* use the last returned ID for subsequent calls to paginate
* @ns_type: Bitmask of namespace types to include (from enum ns_type):
* 0: Return all namespace types
* MNT_NS: Mount namespaces
* NET_NS: Network namespaces
* USER_NS: User namespaces
* etc. Can be OR'd together
* @spare2: Reserved, must be 0
* @user_ns_id: Filter results to namespaces owned by this user namespace:
* 0: Return all namespaces (subject to permission checks)
* LISTNS_CURRENT_USER: Namespaces owned by caller's user namespace
* Other value: Namespaces owned by the specified user namespace ID
*/
struct ns_id_req {
__u32 size; /* sizeof(struct ns_id_req) */
__u32 spare; /* Reserved, must be 0 */
__u64 ns_id; /* Last seen namespace ID (for pagination) */
__u32 ns_type; /* Filter by namespace type(s) */
__u32 spare2; /* Reserved, must be 0 */
__u64 user_ns_id; /* Filter by owning user namespace */
};
Example 1: List all namespaces
/*
 * Print the ID of every namespace listns() reports, fetching up to 100 IDs
 * per call and resuming from the last ID seen.
 */
void list_all_namespaces(void)
{
	uint64_t ids[100];
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0, /* Start from beginning */
		.ns_type = 0, /* All types */
		.user_ns_id = 0, /* All user namespaces */
	};

	printf("All namespaces in the system:\n");

	for (;;) {
		ssize_t nr = listns(&req, ids, 100, 0);

		if (nr < 0) {
			perror("listns");
			return;
		}

		for (ssize_t pos = 0; pos < nr; pos++)
			printf(" Namespace ID: %llu\n", (unsigned long long)ids[pos]);

		/* Only a completely filled buffer suggests more may exist. */
		if (nr != 100)
			return;

		/* Continue from last seen ID */
		req.ns_id = ids[nr - 1];
	}
}
Example 2: List network namespaces only
/*
 * Issue a single listns() call restricted to network namespaces and print
 * the count followed by each returned ID.
 */
void list_network_namespaces(void)
{
	uint64_t ids[100];
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = NET_NS, /* Only network namespaces */
		.user_ns_id = 0,
	};
	ssize_t nr = listns(&req, ids, 100, 0);

	if (nr >= 0) {
		ssize_t pos = 0;

		printf("Network namespaces: %zd found\n", nr);
		while (pos < nr) {
			printf(" netns ID: %llu\n", (unsigned long long)ids[pos]);
			pos++;
		}
		return;
	}

	perror("listns");
}
Example 3: List namespaces owned by current user namespace
/*
 * Issue a single listns() call filtered by LISTNS_CURRENT_USER and print
 * the count followed by each returned ID.
 */
void list_owned_namespaces(void)
{
	uint64_t ids[100];
	ssize_t nr, pos;
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = 0, /* All types */
		.user_ns_id = LISTNS_CURRENT_USER, /* Current userns */
	};

	nr = listns(&req, ids, 100, 0);
	if (nr < 0) {
		perror("listns");
		return;
	}

	printf("Namespaces owned by my user namespace: %zd\n", nr);
	for (pos = 0; pos != nr; pos++)
		printf(" ns ID: %llu\n", (unsigned long long)ids[pos]);
}
Example 4: List multiple namespace types
/*
 * Issue a single listns() call with NET_NS and MNT_NS OR'd together in
 * ns_type and print how many IDs were returned.
 */
void list_network_and_mount_namespaces(void)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = NET_NS | MNT_NS, /* Network and mount */
		.user_ns_id = 0,
	};
	uint64_t ids[100];
	ssize_t ret;

	ret = listns(&req, ids, 100, 0);
	/* Report a failure instead of printing a negative count as "found". */
	if (ret < 0) {
		perror("listns");
		return;
	}
	printf("Network and mount namespaces: %zd found\n", ret);
}
Example 5: Pagination through large namespace sets
/*
 * Walk all namespaces in batches of up to 50 IDs, printing each batch size
 * and finally the running total. Stops on error, on an empty batch, or
 * after a batch that did not fill the buffer.
 */
void list_all_with_pagination(void)
{
	uint64_t ids[50];
	size_t total = 0;
	ssize_t nr;
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = 0,
		.user_ns_id = 0,
	};

	printf("Enumerating all namespaces with pagination:\n");

	do {
		nr = listns(&req, ids, 50, 0);
		if (nr <= 0) {
			/* Zero means no more namespaces; negative is an error. */
			if (nr < 0)
				perror("listns");
			break;
		}

		total += nr;
		printf(" Batch: %zd namespaces\n", nr);

		/* Last ID in this batch becomes start of next batch */
		req.ns_id = ids[nr - 1];
	} while (nr >= 50); /* Partial batch = end of results */

	printf("Total: %zu namespaces\n", total);
}
Permission Model
listns() respects namespace isolation and capabilities:
(1) Global listing (user_ns_id = 0):
- Requires CAP_SYS_ADMIN in the namespace's owning user namespace
- OR the namespace must be in the caller's namespace context (e.g.,
a namespace the caller is currently using)
- User namespaces additionally allow listing if the caller has
CAP_SYS_ADMIN in that user namespace itself
(2) Owner-filtered listing (user_ns_id != 0):
- Requires CAP_SYS_ADMIN in the specified owner user namespace
- OR the namespace must be in the caller's namespace context
- This allows unprivileged processes to enumerate namespaces they own
(3) Visibility:
- Only "active" namespaces are listed
- A namespace is active if it has a non-zero __ns_ref_active count
- This includes namespaces used by running processes, held by open
file descriptors, or kept active by bind mounts
- Inactive namespaces (kept alive only by internal kernel
references) are not visible via listns()
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-19-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
335 lines
13 KiB
C
335 lines
13 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_NS_COMMON_H
|
|
#define _LINUX_NS_COMMON_H
|
|
|
|
#include <linux/refcount.h>
|
|
#include <linux/rbtree.h>
|
|
#include <linux/vfsdebug.h>
|
|
#include <uapi/linux/sched.h>
|
|
#include <uapi/linux/nsfs.h>
|
|
|
|
struct proc_ns_operations;
|
|
|
|
struct cgroup_namespace;
|
|
struct ipc_namespace;
|
|
struct mnt_namespace;
|
|
struct net;
|
|
struct pid_namespace;
|
|
struct time_namespace;
|
|
struct user_namespace;
|
|
struct uts_namespace;
|
|
|
|
extern struct cgroup_namespace init_cgroup_ns;
|
|
extern struct ipc_namespace init_ipc_ns;
|
|
extern struct mnt_namespace init_mnt_ns;
|
|
extern struct net init_net;
|
|
extern struct pid_namespace init_pid_ns;
|
|
extern struct time_namespace init_time_ns;
|
|
extern struct user_namespace init_user_ns;
|
|
extern struct uts_namespace init_uts_ns;
|
|
|
|
extern const struct proc_ns_operations netns_operations;
|
|
extern const struct proc_ns_operations utsns_operations;
|
|
extern const struct proc_ns_operations ipcns_operations;
|
|
extern const struct proc_ns_operations pidns_operations;
|
|
extern const struct proc_ns_operations pidns_for_children_operations;
|
|
extern const struct proc_ns_operations userns_operations;
|
|
extern const struct proc_ns_operations mntns_operations;
|
|
extern const struct proc_ns_operations cgroupns_operations;
|
|
extern const struct proc_ns_operations timens_operations;
|
|
extern const struct proc_ns_operations timens_for_children_operations;
|
|
|
|
/*
|
|
* Namespace lifetimes are managed via a two-tier reference counting model:
|
|
*
|
|
* (1) __ns_ref (refcount_t): Main reference count tracking memory
|
|
* lifetime. Controls when the namespace structure itself is freed.
|
|
* It also pins the namespace on the namespace trees whereas (2)
|
|
* only regulates their visibility to userspace.
|
|
*
|
|
* (2) __ns_ref_active (atomic_t): Reference count tracking active users.
|
|
* Controls visibility of the namespace in the namespace trees.
|
|
* Any live task that uses the namespace (via nsproxy or cred) holds
|
|
* an active reference. Any open file descriptor or bind-mount of
|
|
* the namespace holds an active reference. Once all tasks have
|
|
* exited their namespaces and all file descriptors and
|
|
* bind-mounts have been released the active reference count drops
|
|
* to zero and the namespace becomes inactive. IOW, the namespace
|
|
* cannot be listed or opened via file handles anymore.
|
|
*
|
|
* Note that it is valid to transition from active to inactive and
|
|
* back from inactive to active e.g., when resurrecting an inactive
|
|
* namespace tree via the SIOCGSKNS ioctl().
|
|
*
|
|
* Relationship and lifecycle states:
|
|
*
|
|
* - Active (__ns_ref_active > 0):
|
|
* Namespace is actively used and visible to userspace. The namespace
|
|
* can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file
|
|
* handles, or discovered via listns().
|
|
*
|
|
* - Inactive (__ns_ref_active == 0, __ns_ref > 0):
|
|
* No tasks are actively using the namespace and it isn't pinned by
|
|
* any bind-mounts or open file descriptors anymore. But the namespace
|
|
* is still kept alive by internal references. For example, the user
|
|
* namespace could be pinned by an open file through file->f_cred
|
|
* references when one of the now defunct tasks had opened a file and
|
|
* handed the file descriptor off to another process via a UNIX
|
|
* socket. Such references keep the namespace structure alive through
|
|
* __ns_ref but will not hold an active reference.
|
|
*
|
|
* - Destroyed (__ns_ref == 0):
|
|
* No references remain. The namespace is removed from the tree and freed.
|
|
*
|
|
* State transitions:
|
|
*
|
|
* Active -> Inactive:
|
|
* When the last task using the namespace exits it drops its active
|
|
* references to all namespaces. However, user and pid namespaces
|
|
* remain accessible until the task has been reaped.
|
|
*
|
|
* Inactive -> Active:
|
|
* An inactive namespace tree might be resurrected due to e.g., the
|
|
* SIOCGSKNS ioctl() on a socket.
|
|
*
|
|
* Inactive -> Destroyed:
|
|
* When __ns_ref drops to zero the namespace is removed from the
|
|
* namespace trees and the memory is freed (after an RCU grace period).
|
|
*
|
|
* Initial namespaces:
|
|
* Boot-time namespaces (init_net, init_pid_ns, etc.) start with
|
|
* __ns_ref_active = 1 and remain active forever.
|
|
*/
|
|
struct ns_common {
|
|
u32 ns_type;
|
|
struct dentry *stashed;
|
|
const struct proc_ns_operations *ops;
|
|
unsigned int inum;
|
|
refcount_t __ns_ref; /* do not use directly */
|
|
union {
|
|
struct {
|
|
u64 ns_id;
|
|
struct /* global namespace rbtree and list */ {
|
|
struct rb_node ns_unified_tree_node;
|
|
struct list_head ns_unified_list_node;
|
|
};
|
|
struct /* per type rbtree and list */ {
|
|
struct rb_node ns_tree_node;
|
|
struct list_head ns_list_node;
|
|
};
|
|
struct /* namespace ownership rbtree and list */ {
|
|
struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */
|
|
struct list_head ns_owner; /* list of namespaces owned by this namespace */
|
|
struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */
|
|
struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */
|
|
};
|
|
atomic_t __ns_ref_active; /* do not use directly */
|
|
};
|
|
struct rcu_head ns_rcu;
|
|
};
|
|
};
|
|
|
|
bool is_current_namespace(struct ns_common *ns);
|
|
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
|
|
void __ns_common_free(struct ns_common *ns);
|
|
struct ns_common *__must_check ns_owner(struct ns_common *ns);
|
|
|
|
/*
 * is_initial_namespace - test whether @ns uses a reserved initial inode number
 * @ns: namespace to inspect (only read, hence const)
 *
 * Warns (under VFS debugging) if @ns has no inode number assigned yet.
 *
 * Return: true if ns->inum lies in the range that starts at MNT_NS_INIT_INO
 * and ends at IPC_NS_INIT_INO inclusive, false otherwise.
 */
static __always_inline bool is_initial_namespace(const struct ns_common *ns)
{
	VFS_WARN_ON_ONCE(ns->inum == 0);
	return unlikely(in_range(ns->inum, MNT_NS_INIT_INO,
				 IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1));
}
|
|
|
|
/*
 * to_ns_common - yield the address of the ->ns member of a namespace
 *
 * Dispatches on the static type of the argument. Both const and non-const
 * pointers to each of the eight namespace structures are accepted; there is
 * no default association, so any other type is a compile error.
 */
#define to_ns_common(__nsp) \
	_Generic((__nsp), \
		struct cgroup_namespace *: &(__nsp)->ns, \
		struct ipc_namespace *: &(__nsp)->ns, \
		struct mnt_namespace *: &(__nsp)->ns, \
		struct net *: &(__nsp)->ns, \
		struct pid_namespace *: &(__nsp)->ns, \
		struct time_namespace *: &(__nsp)->ns, \
		struct user_namespace *: &(__nsp)->ns, \
		struct uts_namespace *: &(__nsp)->ns, \
		const struct cgroup_namespace *: &(__nsp)->ns, \
		const struct ipc_namespace *: &(__nsp)->ns, \
		const struct mnt_namespace *: &(__nsp)->ns, \
		const struct net *: &(__nsp)->ns, \
		const struct pid_namespace *: &(__nsp)->ns, \
		const struct time_namespace *: &(__nsp)->ns, \
		const struct user_namespace *: &(__nsp)->ns, \
		const struct uts_namespace *: &(__nsp)->ns)
|
|
|
|
/*
 * ns_init_inum - select the *_NS_INIT_INO constant matching the namespace
 * type of the (non-const) pointer argument.
 */
#define ns_init_inum(__nsp) \
	_Generic((__nsp), \
		struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
		struct ipc_namespace *: IPC_NS_INIT_INO, \
		struct mnt_namespace *: MNT_NS_INIT_INO, \
		struct net *: NET_NS_INIT_INO, \
		struct pid_namespace *: PID_NS_INIT_INO, \
		struct time_namespace *: TIME_NS_INIT_INO, \
		struct user_namespace *: USER_NS_INIT_INO, \
		struct uts_namespace *: UTS_NS_INIT_INO)
|
|
|
|
/*
 * ns_init_ns - select the address of the init_* namespace object whose type
 * matches the (non-const) pointer argument.
 */
#define ns_init_ns(__nsp) \
	_Generic((__nsp), \
		struct cgroup_namespace *: &init_cgroup_ns, \
		struct ipc_namespace *: &init_ipc_ns, \
		struct mnt_namespace *: &init_mnt_ns, \
		struct net *: &init_net, \
		struct pid_namespace *: &init_pid_ns, \
		struct time_namespace *: &init_time_ns, \
		struct user_namespace *: &init_user_ns, \
		struct uts_namespace *: &init_uts_ns)
|
|
|
|
/*
 * ns_init_id - select the *_NS_INIT_ID constant matching the namespace type
 * of the (non-const) pointer argument.
 */
#define ns_init_id(__nsp) \
	_Generic((__nsp), \
		struct cgroup_namespace *: CGROUP_NS_INIT_ID, \
		struct ipc_namespace *: IPC_NS_INIT_ID, \
		struct mnt_namespace *: MNT_NS_INIT_ID, \
		struct net *: NET_NS_INIT_ID, \
		struct pid_namespace *: PID_NS_INIT_ID, \
		struct time_namespace *: TIME_NS_INIT_ID, \
		struct user_namespace *: USER_NS_INIT_ID, \
		struct uts_namespace *: UTS_NS_INIT_ID)
|
|
|
|
/*
 * to_ns_operations - select the proc_ns_operations table for a namespace type
 *
 * Mount namespaces always map to &mntns_operations. Every other type maps to
 * its operations table only when the listed CONFIG_* option is enabled and to
 * NULL otherwise.
 */
#define to_ns_operations(__nsp) \
	_Generic((__nsp), \
		struct cgroup_namespace *: \
			(IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
		struct ipc_namespace *: \
			(IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
		struct mnt_namespace *: \
			&mntns_operations, \
		struct net *: \
			(IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
		struct pid_namespace *: \
			(IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
		struct time_namespace *: \
			(IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
		struct user_namespace *: \
			(IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
		struct uts_namespace *: \
			(IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
|
|
|
|
/*
 * ns_common_type - select the CLONE_NEW* flag matching the namespace type of
 * the (non-const) pointer argument.
 */
#define ns_common_type(__nsp) \
	_Generic((__nsp), \
		struct cgroup_namespace *: CLONE_NEWCGROUP, \
		struct ipc_namespace *: CLONE_NEWIPC, \
		struct mnt_namespace *: CLONE_NEWNS, \
		struct net *: CLONE_NEWNET, \
		struct pid_namespace *: CLONE_NEWPID, \
		struct time_namespace *: CLONE_NEWTIME, \
		struct user_namespace *: CLONE_NEWUSER, \
		struct uts_namespace *: CLONE_NEWUTS)
|
|
|
|
/*
 * NS_COMMON_INIT - static initializer for the ->ns member of namespace object
 * @nsname, with @refs as the initial __ns_ref value.
 *
 * Type, ID, inode number and operations are derived from the type of
 * &nsname. The active count starts at 1 and each list head is initialized
 * to point at itself. Initializers follow the member order of
 * struct ns_common.
 */
#define NS_COMMON_INIT(nsname, refs) \
{ \
	.ns_type = ns_common_type(&nsname), \
	.stashed = NULL, \
	.ops = to_ns_operations(&nsname), \
	.inum = ns_init_inum(&nsname), \
	.__ns_ref = REFCOUNT_INIT(refs), \
	.ns_id = ns_init_id(&nsname), \
	.ns_unified_list_node = LIST_HEAD_INIT(nsname.ns.ns_unified_list_node), \
	.ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \
	.ns_owner = LIST_HEAD_INIT(nsname.ns.ns_owner), \
	.ns_owner_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_entry), \
	.__ns_ref_active = ATOMIC_INIT(1), \
}
|
|
|
|
/*
 * ns_common_init - call __ns_common_init() with the type and operations
 * derived from the pointer type of the argument.
 *
 * The inode number passed is the reserved initial one when the argument is
 * the matching init_* namespace object, and 0 otherwise.
 */
#define ns_common_init(__nsp) \
	ns_common_init_inum(__nsp, \
		(((__nsp) == ns_init_ns(__nsp)) ? ns_init_inum(__nsp) : 0))
|
|
|
|
/*
 * ns_common_init_inum - call __ns_common_init() with the type and operations
 * derived from the pointer type of the first argument and a caller-supplied
 * inode number.
 */
#define ns_common_init_inum(__nsp, __ino) \
	__ns_common_init(to_ns_common(__nsp), ns_common_type(__nsp), \
			 to_ns_operations(__nsp), (__ino))
|
|
|
|
/* Typed wrapper: pass the embedded ns_common of the argument to __ns_common_free(). */
#define ns_common_free(__nsp) \
	__ns_common_free(to_ns_common(__nsp))
|
|
|
|
static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns)
|
|
{
|
|
return atomic_read(&ns->__ns_ref_active);
|
|
}
|
|
|
|
/*
 * Drop one __ns_ref reference on @ns.
 *
 * Return: true if this dropped the count to zero, false otherwise. On the
 * final put a VFS debug warning fires if the active count is still non-zero.
 */
static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
{
	if (!refcount_dec_and_test(&ns->__ns_ref))
		return false;

	VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
	return true;
}
|
|
|
|
/*
 * Take one __ns_ref reference on @ns unless the count is already zero.
 *
 * Return: true if the reference was taken, false otherwise. On failure a VFS
 * debug warning fires if the active count is still non-zero.
 */
static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
{
	bool acquired = refcount_inc_not_zero(&ns->__ns_ref);

	if (!acquired)
		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));

	return acquired;
}
|
|
|
|
static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns)
|
|
{
|
|
return refcount_read(&ns->__ns_ref);
|
|
}
|
|
|
|
/*
 * Typed wrappers around the __ns_ref helpers: each converts its argument
 * with to_ns_common() and operates on the embedded __ns_ref count.
 */
#define ns_ref_read(__nsp) \
	__ns_ref_read(to_ns_common(__nsp))
#define ns_ref_inc(__nsp) \
	refcount_inc(&to_ns_common(__nsp)->__ns_ref)
#define ns_ref_get(__nsp) \
	__ns_ref_get(to_ns_common(__nsp))
#define ns_ref_put(__nsp) \
	__ns_ref_put(to_ns_common(__nsp))
#define ns_ref_put_and_lock(__nsp, __lockp) \
	refcount_dec_and_lock(&to_ns_common(__nsp)->__ns_ref, (__lockp))
|
|
|
|
/*
 * Read the active reference count of a typed namespace pointer; a NULL
 * pointer reads as 0. The argument is evaluated more than once.
 */
#define ns_ref_active_read(__nsp) \
	((__nsp) ? __ns_ref_active_read(to_ns_common(__nsp)) : 0)
|
|
|
|
void __ns_ref_active_get_owner(struct ns_common *ns);
|
|
|
|
static __always_inline void __ns_ref_active_get(struct ns_common *ns)
|
|
{
|
|
WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
|
|
VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0);
|
|
}
|
|
/* NULL-tolerant typed wrapper around __ns_ref_active_get(). */
#define ns_ref_active_get(__nsp) \
	do { \
		if (__nsp) \
			__ns_ref_active_get(to_ns_common(__nsp)); \
	} while (0)
|
|
|
|
/*
 * Take one active reference on @ns unless the active count is zero.
 *
 * Return: true if the reference was taken, false otherwise. On success a VFS
 * debug warning fires if __ns_ref is zero.
 */
static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns)
{
	if (!atomic_inc_not_zero(&ns->__ns_ref_active))
		return false;

	VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
	return true;
}
|
|
|
|
/* NULL-tolerant typed wrapper around __ns_ref_active_get_owner(). */
#define ns_ref_active_get_owner(__nsp) \
	do { \
		if (__nsp) \
			__ns_ref_active_get_owner(to_ns_common(__nsp)); \
	} while (0)
|
|
|
|
void __ns_ref_active_put_owner(struct ns_common *ns);
|
|
|
|
static __always_inline void __ns_ref_active_put(struct ns_common *ns)
|
|
{
|
|
if (atomic_dec_and_test(&ns->__ns_ref_active)) {
|
|
VFS_WARN_ON_ONCE(is_initial_namespace(ns));
|
|
VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
|
|
__ns_ref_active_put_owner(ns);
|
|
}
|
|
}
|
|
/* NULL-tolerant typed wrapper around __ns_ref_active_put(). */
#define ns_ref_active_put(__nsp) \
	do { \
		if (__nsp) \
			__ns_ref_active_put(to_ns_common(__nsp)); \
	} while (0)
|
|
|
|
/*
 * Take a __ns_ref reference on @ns, but only if its active count is non-zero.
 *
 * Return: @ns with an extra __ns_ref reference held, or NULL if the active
 * count is zero or the reference could not be taken. A VFS debug warning
 * fires if the active count is non-zero while __ns_ref is zero.
 */
static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns)
{
	VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns));

	/* Short-circuit: __ns_ref is only touched when the namespace is active. */
	if (__ns_ref_active_read(ns) && __ns_ref_get(ns))
		return ns;

	return NULL;
}
|
|
|
|
void __ns_ref_active_resurrect(struct ns_common *ns);
|
|
|
|
/* NULL-tolerant typed wrapper around __ns_ref_active_resurrect(). */
#define ns_ref_active_resurrect(__nsp) \
	do { \
		if (__nsp) \
			__ns_ref_active_resurrect(to_ns_common(__nsp)); \
	} while (0)
|
|
|
|
#endif
|