Files
linux-stable-mirror/include/linux/netlink.h
Linus Torvalds fcc79e1714 Merge tag 'net-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni:
 "The most significant set of changes is the per netns RTNL. The new
  behavior is disabled by default, regression risk should be contained.

  Notably the new config knob PTP_1588_CLOCK_VMCLOCK will inherit its
  default value from PTP_1588_CLOCK_KVM, as the first is intended to be
  a more reliable replacement for the latter.

  Core:

   - Started a very large, in-progress, effort to make the RTNL lock
     scope per network-namespace, thus reducing the lock contention
     significantly in the containerized use-case, comprising:
       - RCU-ified some relevant slices of the FIB control path
       - introduce basic per netns locking helpers
       - namespacified the IPv4 address hash table
       - remove rtnl_register{,_module}() in favour of
         rtnl_register_many()
       - refactor rtnl_{new,del,set}link() moving as much validation as
         possible out of RTNL lock
       - convert all phonet doit() and dumpit() handlers to RCU
       - convert IPv4 addresses manipulation to per-netns RTNL
       - convert virtual interface creation to per-netns RTNL
     the per-netns lock infrastructure is guarded by the
     CONFIG_DEBUG_NET_SMALL_RTNL knob, disabled by default ad interim.

   - Introduce NAPI suspension, to efficiently switching between busy
     polling (NAPI processing suspended) and normal processing.

   - Migrate the IPv4 routing input, output and control path from direct
     ToS usage to DSCP macros. This is a work in progress to make ECN
     handling consistent and reliable.

   - Add drop reasons support to the IPv4 rotue input path, allowing
     better introspection in case of packets drop.

   - Make FIB seqnum lockless, dropping RTNL protection for read access.

   - Make inet{,v6} addresses hashing less predicable.

   - Allow providing timestamp OPT_ID via cmsg, to correlate TX packets
     and timestamps

  Things we sprinkled into general kernel code:

   - Add small file operations for debugfs, to reduce the struct ops
     size.

   - Refactoring and optimization for the implementation of page_frag
     API, This is a preparatory work to consolidate the page_frag
     implementation.

  Netfilter:

   - Optimize set element transactions to reduce memory consumption

   - Extended netlink error reporting for attribute parser failure.

   - Make legacy xtables configs user selectable, giving users the
     option to configure iptables without enabling any other config.

   - Address a lot of false-positive RCU issues, pointed by recent CI
     improvements.

  BPF:

   - Put xsk sockets on a struct diet and add various cleanups. Overall,
     this helps to bump performance by 12% for some workloads.

   - Extend BPF selftests to increase coverage of XDP features in
     combination with BPF cpumap.

   - Optimize and homogenize bpf_csum_diff helper for all archs and also
     add a batch of new BPF selftests for it.

   - Extend netkit with an option to delegate skb->{mark,priority}
     scrubbing to its BPF program.

   - Make the bpf_get_netns_cookie() helper available also to tc(x) BPF
     programs.

  Protocols:

   - Introduces 4-tuple hash for connected udp sockets, speeding-up
     significantly connected sockets lookup.

   - Add a fastpath for some TCP timers that usually expires after
     close, the socket lock contention.

   - Add inbound and outbound xfrm state caches to speed up state
     lookups.

   - Avoid sending MPTCP advertisements on stale subflows, reducing
     risks on loosing them.

   - Make neighbours table flushing more scalable, maintaining per
     device neigh lists.

  Driver API:

   - Introduce a unified interface to configure transmission H/W
     shaping, and expose it to user-space via generic-netlink.

   - Add support for per-NAPI config via netlink. This makes napi
     configuration persistent across queues removal and re-creation.
     Requires driver updates, currently supported drivers are:
     nVidia/Mellanox mlx4 and mlx5, Broadcom brcm and Intel ice.

   - Add ethtool support for writing SFP / PHY firmware blocks.

   - Track RSS context allocation from ethtool core.

   - Implement support for mirroring to DSA CPU port, via TC mirror
     offload.

   - Consolidate FDB updates notification, to avoid duplicates on
     device-specific entries.

   - Expose DPLL clock quality level to the user-space.

   - Support master-slave PHY config via device tree.

  Tests and tooling:

   - forwarding: introduce deferred commands, to simplify the cleanup
     phase

  Drivers:

   - Updated several drivers - Amazon vNic, Google vNic, Microsoft vNic,
     Intel e1000e and Broadcom Tigon3 - to use netdev-genl to link the
     IRQs and queues to NAPI IDs, allowing busy polling and better
     introspection.

   - Ethernet high-speed NICs:
      - nVidia/Mellanox:
         - mlx5:
           - a large refactor to implement support for cross E-Switch
             scheduling
           - refactor H/W conter management to let it scale better
           - H/W GRO cleanups
      - Intel (100G, ice)::
         - add support for ethtool reset
         - implement support for per TX queue H/W shaping
      - AMD/Solarflare:
         - implement per device queue stats support
      - Broadcom (bnxt):
         - improve wildcard l4proto on IPv4/IPv6 ntuple rules
      - Marvell Octeon:
         - Add representor support for each Resource Virtualization Unit
           (RVU) device.
      - Hisilicon:
         - add support for the BMC Gigabit Ethernet
      - IBM (EMAC):
         - driver cleanup and modernization
      - Cisco (VIC):
         - raise the queues number limit to 256

   - Ethernet virtual:
      - Google vNIC:
         - implement page pool support
      - macsec:
         - inherit lower device's features and TSO limits when
           offloading
      - virtio_net:
         - enable premapped mode by default
         - support for XDP socket(AF_XDP) zerocopy TX
      - wireguard:
         - set the TSO max size to be GSO_MAX_SIZE, to aggregate larger
           packets.

   - Ethernet NICs embedded and virtual:
      - Broadcom ASP:
         - enable software timestamping
      - Freescale:
         - add enetc4 PF driver
      - MediaTek: Airoha SoC:
         - implement BQL support
      - RealTek r8169:
         - enable TSO by default on r8168/r8125
         - implement extended ethtool stats
      - Renesas AVB:
         - enable TX checksum offload
      - Synopsys (stmmac):
         - support header splitting for vlan tagged packets
         - move common code for DWMAC4 and DWXGMAC into a separate FPE
           module.
         - add dwmac driver support for T-HEAD TH1520 SoC
      - Synopsys (xpcs):
         - driver refactor and cleanup
      - TI:
         - icssg_prueth: add VLAN offload support
      - Xilinx emaclite:
         - add clock support

   - Ethernet switches:
      - Microchip:
         - implement support for the lan969x Ethernet switch family
         - add LAN9646 switch support to KSZ DSA driver

   - Ethernet PHYs:
      - Marvel: 88q2x: enable auto negotiation
      - Microchip: add support for LAN865X Rev B1 and LAN867X Rev C1/C2

   - PTP:
      - Add support for the Amazon virtual clock device
      - Add PtP driver for s390 clocks

   - WiFi:
      - mac80211
         - EHT 1024 aggregation size for transmissions
         - new operation to indicate that a new interface is to be added
         - support radio separation of multi-band devices
         - move wireless extension spy implementation to libiw
      - Broadcom:
         - brcmfmac: optional LPO clock support
      - Microchip:
         - add support for Atmel WILC3000
      - Qualcomm (ath12k):
         - firmware coredump collection support
         - add debugfs support for a multitude of statistics
      - Qualcomm (ath5k):
         -  Arcadyan ARV45XX AR2417 & Gigaset SX76[23] AR241[34]A support
      - Realtek:
         - rtw88: 8821au and 8812au USB adapters support
         - rtw89: add thermal protection
         - rtw89: fine tune BT-coexsitence to improve user experience
         - rtw89: firmware secure boot for WiFi 6 chip

   - Bluetooth
      - add Qualcomm WCN785x support for ids Foxconn 0xe0fc/0xe0f3 and
        0x13d3:0x3623
      - add Realtek RTL8852BE support for id Foxconn 0xe123
      - add MediaTek MT7920 support for wireless module ids
      - btintel_pcie: add handshake between driver and firmware
      - btintel_pcie: add recovery mechanism
      - btnxpuart: add GPIO support to power save feature"

* tag 'net-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1475 commits)
  mm: page_frag: fix a compile error when kernel is not compiled
  Documentation: tipc: fix formatting issue in tipc.rst
  selftests: nic_performance: Add selftest for performance of NIC driver
  selftests: nic_link_layer: Add selftest case for speed and duplex states
  selftests: nic_link_layer: Add link layer selftest for NIC driver
  bnxt_en: Add FW trace coredump segments to the coredump
  bnxt_en: Add a new ethtool -W dump flag
  bnxt_en: Add 2 parameters to bnxt_fill_coredump_seg_hdr()
  bnxt_en: Add functions to copy host context memory
  bnxt_en: Do not free FW log context memory
  bnxt_en: Manage the FW trace context memory
  bnxt_en: Allocate backing store memory for FW trace logs
  bnxt_en: Add a 'force' parameter to bnxt_free_ctx_mem()
  bnxt_en: Refactor bnxt_free_ctx_mem()
  bnxt_en: Add mem_valid bit to struct bnxt_ctx_mem_type
  bnxt_en: Update firmware interface spec to 1.10.3.85
  selftests/bpf: Add some tests with sockmap SK_PASS
  bpf: fix recursive lock when verdict program return SK_PASS
  wireguard: device: support big tcp GSO
  wireguard: selftests: load nf_conntrack if not present
  ...
2024-11-21 08:28:08 -08:00

361 lines
11 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NETLINK_H
#define __LINUX_NETLINK_H
#include <linux/capability.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <net/scm.h>
#include <uapi/linux/netlink.h>
struct net;
void do_trace_netlink_extack(const char *msg);
static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
{
return (struct nlmsghdr *)skb->data;
}
enum netlink_skb_flags {
NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */
};
struct netlink_skb_parms {
struct scm_creds creds; /* Skb credentials */
__u32 portid;
__u32 dst_group;
__u32 flags;
struct sock *sk;
bool nsid_is_set;
int nsid;
};
#define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb))
#define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds)
#define NETLINK_CTX_SIZE 48
void netlink_table_grab(void);
void netlink_table_ungrab(void);
#define NL_CFG_F_NONROOT_RECV (1 << 0)
#define NL_CFG_F_NONROOT_SEND (1 << 1)
/* optional Netlink kernel configuration parameters */
struct netlink_kernel_cfg {
unsigned int groups;
unsigned int flags;
void (*input)(struct sk_buff *skb);
int (*bind)(struct net *net, int group);
void (*unbind)(struct net *net, int group);
void (*release) (struct sock *sk, unsigned long *groups);
};
struct sock *__netlink_kernel_create(struct net *net, int unit,
struct module *module,
struct netlink_kernel_cfg *cfg);
static inline struct sock *
netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg)
{
return __netlink_kernel_create(net, unit, THIS_MODULE, cfg);
}
/* this can be increased when necessary - don't expose to userland */
#define NETLINK_MAX_COOKIE_LEN 20
#define NETLINK_MAX_FMTMSG_LEN 80
/**
* struct netlink_ext_ack - netlink extended ACK report struct
* @_msg: message string to report - don't access directly, use
* %NL_SET_ERR_MSG
* @bad_attr: attribute with error
* @policy: policy for a bad attribute
* @miss_type: attribute type which was missing
* @miss_nest: nest missing an attribute (%NULL if missing top level attr)
* @cookie: cookie data to return to userspace (for success)
* @cookie_len: actual cookie data length
* @_msg_buf: output buffer for formatted message strings - don't access
* directly, use %NL_SET_ERR_MSG_FMT
*/
struct netlink_ext_ack {
const char *_msg;
const struct nlattr *bad_attr;
const struct nla_policy *policy;
const struct nlattr *miss_nest;
u16 miss_type;
u8 cookie[NETLINK_MAX_COOKIE_LEN];
u8 cookie_len;
char _msg_buf[NETLINK_MAX_FMTMSG_LEN];
};
/* Always use this macro, this allows later putting the
* message into a separate section or such for things
* like translation or listing all possible messages.
* If string formatting is needed use NL_SET_ERR_MSG_FMT.
*/
#define NL_SET_ERR_MSG(extack, msg) do { \
static const char __msg[] = msg; \
struct netlink_ext_ack *__extack = (extack); \
\
do_trace_netlink_extack(__msg); \
\
if (__extack) \
__extack->_msg = __msg; \
} while (0)
/* We splice fmt with %s at each end even in the snprintf so that both calls
* can use the same string constant, avoiding its duplication in .ro
*/
#define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \
struct netlink_ext_ack *__extack = (extack); \
\
if (!__extack) \
break; \
if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \
"%s" fmt "%s", "", ##args, "") >= \
NETLINK_MAX_FMTMSG_LEN) \
net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \
##args, "\n"); \
\
do_trace_netlink_extack(__extack->_msg_buf); \
\
__extack->_msg = __extack->_msg_buf; \
} while (0)
#define NL_SET_ERR_MSG_MOD(extack, msg) \
NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg)
#define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \
NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args)
#define NL_SET_ERR_MSG_WEAK(extack, msg) do { \
if ((extack) && !(extack)->_msg) \
NL_SET_ERR_MSG((extack), msg); \
} while (0)
#define NL_SET_ERR_MSG_WEAK_MOD(extack, msg) do { \
if ((extack) && !(extack)->_msg) \
NL_SET_ERR_MSG_MOD((extack), msg); \
} while (0)
#define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \
if ((extack)) { \
(extack)->bad_attr = (attr); \
(extack)->policy = (pol); \
} \
} while (0)
#define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL)
#define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \
static const char __msg[] = msg; \
struct netlink_ext_ack *__extack = (extack); \
\
do_trace_netlink_extack(__msg); \
\
if (__extack) { \
__extack->_msg = __msg; \
__extack->bad_attr = (attr); \
__extack->policy = (pol); \
} \
} while (0)
#define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do { \
struct netlink_ext_ack *__extack = (extack); \
\
if (!__extack) \
break; \
\
if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \
"%s" fmt "%s", "", ##args, "") >= \
NETLINK_MAX_FMTMSG_LEN) \
net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \
##args, "\n"); \
\
do_trace_netlink_extack(__extack->_msg_buf); \
\
__extack->_msg = __extack->_msg_buf; \
__extack->bad_attr = (attr); \
__extack->policy = (pol); \
} while (0)
#define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \
NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg)
#define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) \
NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args)
#define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \
struct netlink_ext_ack *__extack = (extack); \
\
if (__extack) { \
__extack->miss_nest = (nest); \
__extack->miss_type = (type); \
} \
} while (0)
#define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \
struct nlattr **__tb = (tb); \
u32 __attr = (type); \
int __retval; \
\
__retval = !__tb[__attr]; \
if (__retval) \
NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \
__retval; \
})
static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack,
u64 cookie)
{
if (!extack)
return;
memcpy(extack->cookie, &cookie, sizeof(cookie));
extack->cookie_len = sizeof(cookie);
}
void netlink_kernel_release(struct sock *sk);
int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
int netlink_change_ngroups(struct sock *sk, unsigned int groups);
void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
const struct netlink_ext_ack *extack);
int netlink_has_listeners(struct sock *sk, unsigned int group);
bool netlink_strict_get_check(struct sk_buff *skb);
int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
__u32 group, gfp_t allocation);
typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data);
int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
__u32 portid, __u32 group, gfp_t allocation,
netlink_filter_fn filter,
void *filter_data);
int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code);
int netlink_register_notifier(struct notifier_block *nb);
int netlink_unregister_notifier(struct notifier_block *nb);
/* finegrained unicast helpers: */
struct sock *netlink_getsockbyfd(int fd);
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
long *timeo, struct sock *ssk);
void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
int netlink_sendskb(struct sock *sk, struct sk_buff *skb);
static inline struct sk_buff *
netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
struct sk_buff *nskb;
nskb = skb_clone(skb, gfp_mask);
if (!nskb)
return NULL;
/* This is a large skb, set destructor callback to release head */
if (is_vmalloc_addr(skb->head))
nskb->destructor = skb->destructor;
return nskb;
}
/*
* skb should fit one page. This choice is good for headerless malloc.
* But we should limit to 8K so that userspace does not have to
* use enormous buffer sizes on recvmsg() calls just to avoid
* MSG_TRUNC when PAGE_SIZE is very large.
*/
#if PAGE_SIZE < 8192UL
#define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(PAGE_SIZE)
#else
#define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(8192UL)
#endif
#define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN)
struct netlink_callback {
struct sk_buff *skb;
const struct nlmsghdr *nlh;
int (*dump)(struct sk_buff * skb,
struct netlink_callback *cb);
int (*done)(struct netlink_callback *cb);
void *data;
/* the module that dump function belong to */
struct module *module;
struct netlink_ext_ack *extack;
u16 family;
u16 answer_flags;
u32 min_dump_alloc;
unsigned int prev_seq, seq;
int flags;
bool strict_check;
union {
u8 ctx[NETLINK_CTX_SIZE];
/* args is deprecated. Cast a struct over ctx instead
* for proper type safety.
*/
long args[6];
};
};
#define NL_ASSERT_CTX_FITS(type_name) \
BUILD_BUG_ON(sizeof(type_name) > \
sizeof_field(struct netlink_callback, ctx))
struct netlink_notify {
struct net *net;
u32 portid;
int protocol;
};
struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags);
struct netlink_dump_control {
int (*start)(struct netlink_callback *);
int (*dump)(struct sk_buff *skb, struct netlink_callback *);
int (*done)(struct netlink_callback *);
struct netlink_ext_ack *extack;
void *data;
struct module *module;
u32 min_dump_alloc;
int flags;
};
int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
struct netlink_dump_control *control);
static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
struct netlink_dump_control *control)
{
if (!control->module)
control->module = THIS_MODULE;
return __netlink_dump_start(ssk, skb, nlh, control);
}
struct netlink_tap {
struct net_device *dev;
struct module *module;
struct list_head list;
};
int netlink_add_tap(struct netlink_tap *nt);
int netlink_remove_tap(struct netlink_tap *nt);
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
struct user_namespace *ns, int cap);
bool netlink_ns_capable(const struct sk_buff *skb,
struct user_namespace *ns, int cap);
bool netlink_capable(const struct sk_buff *skb, int cap);
bool netlink_net_capable(const struct sk_buff *skb, int cap);
struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast);
#endif /* __LINUX_NETLINK_H */