mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2026-03-03 18:28:01 +01:00
The current implementation of CXL memory hotplug notifier gets called
before the HMAT memory hotplug notifier. The CXL driver calculates the
access coordinates (bandwidth and latency values) for the CXL end to
end path (i.e. CPU to endpoint). When the CXL region is onlined, the CXL
memory hotplug notifier writes the access coordinates to the HMAT target
structs. Then the HMAT memory hotplug notifier is called and it creates
the access coordinates for the node sysfs attributes.
During testing on an Intel platform, it was found that although the
newly calculated coordinates were pushed to sysfs, the sysfs attributes for
the access coordinates showed up with the wrong initiator. The system has
4 nodes (0, 1, 2, 3) where node 0 and 1 are CPU nodes and node 2 and 3 are
CXL nodes. The expectation is that node 2 would show up as a target to node
0:
/sys/devices/system/node/node2/access0/initiators/node0
However it was observed that node 2 showed up as a target under node 1:
/sys/devices/system/node/node2/access0/initiators/node1
The original intent of the 'ext_updated' flag in HMAT handling code was to
stop HMAT memory hotplug callback from clobbering the access coordinates
after CXL has injected its calculated coordinates and replaced the generic
target access coordinates provided by the HMAT table in the HMAT target
structs. However the flag is hacky at best and blocks the updates from
other CXL regions that are onlined in the same node later on. Remove the
'ext_updated' flag usage and just update the access coordinates for the
nodes directly without touching HMAT target data.
The hotplug memory callback ordering is changed. Instead of changing CXL,
move HMAT back so there's room for the levels, rather than having CXL share
the same level as SLAB_CALLBACK_PRI. The change will result in the CXL
callback being executed after the HMAT callback.
With the change, the CXL hotplug memory notifier runs after the HMAT
callback. The HMAT callback will create the node sysfs attributes for
access coordinates. The CXL callback will write the access coordinates to
the now created node sysfs attributes directly and will not pollute the
HMAT target values.
A nodemask is introduced to keep track if a node has been updated and
prevents further updates.
Fixes: 067353a46d ("cxl/region: Add memory hotplug notifier for cxl region")
Cc: stable@vger.kernel.org
Tested-by: Marc Herbert <marc.herbert@linux.intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20250829222907.1290912-4-dave.jiang@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
219 lines
7.1 KiB
C
219 lines
7.1 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* include/linux/memory.h - generic memory definition
|
|
*
|
|
* This is mainly for topological representation. We define the
|
|
* basic "struct memory_block" here, which can be embedded in per-arch
|
|
* definitions or NUMA information.
|
|
*
|
|
* Basic handling of the devices is done in drivers/base/memory.c
|
|
* and system devices are handled in drivers/base/sys.c.
|
|
*
|
|
* Memory block are exported via sysfs in the class/memory/devices/
|
|
* directory.
|
|
*
|
|
*/
|
|
#ifndef _LINUX_MEMORY_H_
|
|
#define _LINUX_MEMORY_H_
|
|
|
|
#include <linux/node.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/mutex.h>
|
|
|
|
/* Smallest possible memory block size: one sparsemem section. */
#define MIN_MEMORY_BLOCK_SIZE     (1UL << SECTION_SIZE_BITS)
|
|
/**
 * struct memory_group - a logical group of memory blocks
 * @nid: The node id for all memory blocks inside the memory group.
 * @memory_blocks: List of all memory blocks belonging to this memory group.
 * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this
 *			  memory group.
 * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this
 *			   memory group.
 * @is_dynamic: The memory group type: static vs. dynamic
 * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum
 *		 number of pages we'll have in this static memory group.
 * @d.unit_pages: Valid with &memory_group.is_dynamic == true. Unit in pages
 *		  in which memory is added/removed in this dynamic memory group.
 *		  This granularity defines the alignment of a unit in physical
 *		  address space; it has to be at least as big as a single
 *		  memory block.
 *
 * A memory group logically groups memory blocks; each memory block
 * belongs to at most one memory group. A memory group corresponds to
 * a memory device, such as a DIMM or a NUMA node, which spans multiple
 * memory blocks and might even span multiple non-contiguous physical memory
 * ranges.
 *
 * Modification of members after registration is serialized by memory
 * hot(un)plug code.
 */
struct memory_group {
	int nid;
	struct list_head memory_blocks;
	unsigned long present_kernel_pages;
	unsigned long present_movable_pages;
	bool is_dynamic;
	/* Anonymous union: exactly one arm is valid, selected by @is_dynamic. */
	union {
		struct {
			unsigned long max_pages;
		} s;
		struct {
			unsigned long unit_pages;
		} d;
	};
};
|
|
|
|
/*
 * struct memory_block - one hot(un)pluggable memory block, exported via
 * sysfs in the class/memory/devices/ directory (see the file header).
 */
struct memory_block {
	unsigned long start_section_nr;	/* first sparsemem section of this block */
	unsigned long state;		/* serialized by the dev->lock */
	int online_type;		/* for passing data to online routine */
	int nid;			/* NID for this memory block */
	/*
	 * The single zone of this memory block if all PFNs of this memory block
	 * that are System RAM (not a memory hole, not ZONE_DEVICE ranges) are
	 * managed by a single zone. NULL if multiple zones (including nodes)
	 * apply.
	 */
	struct zone *zone;
	struct device dev;		/* sysfs device for this block */
	struct vmem_altmap *altmap;	/* altmap for this block, if any -- NOTE(review): semantics defined in drivers/base/memory.c */
	struct memory_group *group;	/* group (if any) for this block */
	struct list_head group_next;	/* next block inside memory group */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
	atomic_long_t nr_hwpoison;	/* hwpoisoned page count -- only tracked with memory failure + hotplug */
#endif
};
|
|
|
|
/* Arch hook: physical device for the memory starting at @start_pfn. */
int arch_get_memory_phys_device(unsigned long start_pfn);
/* Size in bytes of a single memory block. */
unsigned long memory_block_size_bytes(void);
/* Request a memory block size of 1 << @order bytes -- presumably only
 * honored early in boot; confirm against the definition. */
int set_memory_block_size_order(unsigned int order);
|
|
|
|
/* These states are exposed to userspace as text strings in sysfs */
#define	MEM_ONLINE		(1<<0) /* exposed to userspace */
#define	MEM_GOING_OFFLINE	(1<<1) /* exposed to userspace */
#define	MEM_OFFLINE		(1<<2) /* exposed to userspace */
/* The remaining states are notifier-chain events only, never shown in sysfs. */
#define	MEM_GOING_ONLINE	(1<<3)
#define	MEM_CANCEL_ONLINE	(1<<4)
#define	MEM_CANCEL_OFFLINE	(1<<5)
#define	MEM_PREPARE_ONLINE	(1<<6)
#define	MEM_FINISH_OFFLINE	(1<<7)
|
|
|
|
/*
 * Payload passed to memory hotplug notifier callbacks (the @v argument of
 * memory_notify()).
 */
struct memory_notify {
	/*
	 * The altmap_start_pfn and altmap_nr_pages fields are designated for
	 * specifying the altmap range and are exclusively intended for use in
	 * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
	 */
	unsigned long altmap_start_pfn;
	unsigned long altmap_nr_pages;
	unsigned long start_pfn;	/* first pfn of the affected range */
	unsigned long nr_pages;		/* number of pages in the affected range */
};
|
|
|
|
struct notifier_block;
struct mem_section;

/*
 * Priorities for the hotplug memory callback routines. Invoked from
 * high to low. Higher priorities correspond to higher numbers.
 *
 * HMAT_CALLBACK_PRI is deliberately higher than CXL_CALLBACK_PRI so that
 * the HMAT callback runs first and creates the node sysfs access
 * attributes before the CXL callback writes its calculated access
 * coordinates to them (see the changelog accompanying this change).
 */
#define DEFAULT_CALLBACK_PRI	0
#define SLAB_CALLBACK_PRI	1
#define CXL_CALLBACK_PRI	5
#define HMAT_CALLBACK_PRI	6
#define MM_COMPUTE_BATCH_PRI	10
#define CPUSET_CALLBACK_PRI	10
#define MEMTIER_HOTPLUG_PRI	100
#define KSM_CALLBACK_PRI	100
|
|
|
|
#ifndef CONFIG_MEMORY_HOTPLUG

/*
 * Stubs for !CONFIG_MEMORY_HOTPLUG: initialization and notifier
 * (un)registration become successful no-ops so callers need no #ifdefs.
 */
static inline void memory_dev_init(void)
{
	return;
}
static inline int register_memory_notifier(struct notifier_block *nb)
{
	return 0;
}
static inline void unregister_memory_notifier(struct notifier_block *nb)
{
}
static inline int memory_notify(unsigned long val, void *v)
{
	return 0;
}
static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
{
	return 0;
}
/* Block-size advice is unavailable without hotplug support. */
static inline int memory_block_advise_max_size(unsigned long size)
{
	return -ENODEV;
}
static inline unsigned long memory_block_advised_max_size(void)
{
	return 0;
}
|
|
#else /* CONFIG_MEMORY_HOTPLUG */
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
int create_memory_block_devices(unsigned long start, unsigned long size,
				struct vmem_altmap *altmap,
				struct memory_group *group);
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
extern struct memory_block *find_memory_block(unsigned long section_nr);
typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
extern int walk_memory_blocks(unsigned long start, unsigned long size,
			      void *arg, walk_memory_blocks_func_t func);
extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func);

/* Memory group registration/lookup (see struct memory_group above). */
extern int memory_group_register_static(int nid, unsigned long max_pages);
extern int memory_group_register_dynamic(int nid, unsigned long unit_pages);
extern int memory_group_unregister(int mgid);
struct memory_group *memory_group_find_by_id(int mgid);
typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *);
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg);
struct memory_block *find_memory_block_by_id(unsigned long block_id);
/*
 * Define a static notifier block for @fn at priority @pri and register it.
 * This is a GNU statement expression: it evaluates to the return value of
 * register_memory_notifier().
 */
#define hotplug_memory_notifier(fn, pri) ({		\
	static __meminitdata struct notifier_block fn##_mem_nb =\
		{ .notifier_call = fn, .priority = pri };\
	register_memory_notifier(&fn##_mem_nb);		\
})

/* Number of sparsemem sections per memory block (defined in drivers/base/memory.c). */
extern int sections_per_block;
|
|
|
|
static inline unsigned long memory_block_id(unsigned long section_nr)
|
|
{
|
|
return section_nr / sections_per_block;
|
|
}
|
|
|
|
/* Translate a pfn into the id of the memory block containing it. */
static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	unsigned long section_nr = pfn_to_section_nr(pfn);

	return memory_block_id(section_nr);
}
|
|
|
|
/* Translate a physical address into the id of the memory block containing it. */
static inline unsigned long phys_to_block_id(unsigned long phys)
{
	unsigned long pfn = PFN_DOWN(phys);

	return pfn_to_block_id(pfn);
}
|
|
|
|
#ifdef CONFIG_NUMA
/* Associate memory block @mem with NUMA node @nid. */
void memory_block_add_nid(struct memory_block *mem, int nid,
			  enum meminit_context context);
#endif /* CONFIG_NUMA */
/* Advise/query a maximum memory block size (the !CONFIG_MEMORY_HOTPLUG
 * stubs earlier in this file return -ENODEV and 0 respectively). */
int memory_block_advise_max_size(unsigned long size);
unsigned long memory_block_advised_max_size(void);
#endif	/* CONFIG_MEMORY_HOTPLUG */
|
|
|
|
/*
 * Kernel text modification mutex, used for code patching. Users of this lock
 * can sleep.
 */
extern struct mutex text_mutex;

#endif /* _LINUX_MEMORY_H_ */
|