diff --git a/src/3rdparty/hwloc/NEWS b/src/3rdparty/hwloc/NEWS
index 0bf74d44..71f858a3 100644
--- a/src/3rdparty/hwloc/NEWS
+++ b/src/3rdparty/hwloc/NEWS
@@ -17,6 +17,76 @@ bug fixes (and other actions) for each version of hwloc since version
0.9.
+Version 2.7.0
+-------------
+* Backends
+ + Add support for NUMA nodes and caches with more than 64 PUs across
+ multiple processor groups on Windows 11 and Windows Server 2022.
+ + Group objects are not created for Windows processor groups anymore,
+ except if HWLOC_WINDOWS_PROCESSOR_GROUP_OBJS=1 in the environment.
+ + Expose "Cluster" group objects on Linux kernel 5.16+ for CPUs
+ that share some internal cache or bus. This can be equivalent
+ to the L2 Cache level on some platforms (e.g. x86) or a specific
+ level between L2 and L3 on others (e.g. ARM Kunpeng 920).
+ Thanks to Jonathan Cameron for the help.
+ - HWLOC_DONT_MERGE_CLUSTER_GROUPS=1 may be set in the environment
+ to prevent these groups from being merged with identical caches, etc.
+ + Improve the oneAPI LevelZero backend:
+ - Expose subdevices such as "ze0.1" inside root OS devices ("ze0")
+ when the hardware contains multiple subdevices.
+ - Add many new attributes to describe device type, and the
+ numbers of slices, subslices, execution units and threads.
+ - Expose the memory information as LevelZeroHBM/DDR/MemorySize infos.
+ + Ignore the max frequencies of cores in Linux cpukinds when the
+ base frequencies are available (to avoid exposing hybrid CPUs
+ when Intel Turbo Boost Max 3.0 gives slightly different max
+ frequencies to CPU cores).
+ - May be reverted by setting HWLOC_CPUKINDS_MAXFREQ=1 in the environment.
+* Tools
+ + Add --grey and --palette options to switch lstopo to greyscale or
+ white-background-only graphics, or to tune individual colors.
+* Build
+ + Windows CMake builds now support non-MSVC compilers, detect several
+ features at build time, can build/run tests, etc.
+ Thanks to Michael Hirsch and Alexander Neumann.
+
+
+Version 2.6.0
+-------------
+* Backends
+ + Expose two cpukinds for energy-efficient cores (icestorm) and
+ high-performance cores (firestorm) on Apple M1 on Mac OS X.
+ + Use sysfs CPU "capacity" to rank hybrid cores by efficiency
+ on Linux when available (mostly on recent ARM platforms for now).
+ + Improve HWLOC_MEMBIND_BIND (without the STRICT flag) on Linux kernel
+ >= 5.15: If more than one node is given, the kernel may now use all
+ of them instead of only the first one before falling back to others.
+ + Expose cache os_index when available on Linux, it may be needed
+ when using resctrl to configure cache partitioning, memory bandwidth
+ monitoring, etc.
+ + Add a "XGMIHops" distances matrix in the RSMI backend for AMD GPU
+ interconnected through XGMI links.
+ + Expose AMD GPU memory information (VRAM and GTT) in the RSMI backend.
+ + Add OS devices such as "bxi0" for Atos/Bull BXI HCAs on Linux.
+* Tools
+ + lstopo has a better placement algorithm with respect to I/O
+ objects, see --children-order in the manpage for details.
+ + hwloc-annotate may now change object subtypes and cache or memory
+ sizes.
+* Build
+ + Allow to specify the ROCm installation for building the RSMI backend:
+ - Use a custom installation path if specified with --with-rocm=<dir>.
+ - Use /opt/rocm-<version> if specified with --with-rocm-version=<version>
+ or the ROCM_VERSION environment variable.
+ - Try /opt/rocm if it exists.
+ - See "How do I enable ROCm SMI and select which version to use?"
+ in the FAQ for details.
+ + Add a CMakeLists for Windows under contrib/windows-cmake/ .
+* Documentation
+ + Add FAQ entry "How do I create a custom heterogeneous and
+ asymmetric topology?"
+
+
Version 2.5.0
-------------
* API
diff --git a/src/3rdparty/hwloc/VERSION b/src/3rdparty/hwloc/VERSION
index a74f0a53..7486ae04 100644
--- a/src/3rdparty/hwloc/VERSION
+++ b/src/3rdparty/hwloc/VERSION
@@ -8,7 +8,7 @@
# Please update HWLOC_VERSION* in contrib/windows/hwloc_config.h too.
major=2
-minor=5
+minor=7
release=0
# greek is used for alpha or beta release tags. If it is non-empty,
@@ -22,7 +22,7 @@ greek=
# The date when this release was created
-date="Jun 14, 2021"
+date="Dec 06, 2021"
# If snapshot=1, then use the value from snapshot_version as the
# entire hwloc version (i.e., ignore major, minor, release, and
@@ -41,7 +41,7 @@ snapshot_version=${major}.${minor}.${release}${greek}-git
# 2. Version numbers are described in the Libtool current:revision:age
# format.
-libhwloc_so_version=20:0:5
+libhwloc_so_version=20:2:5
libnetloc_so_version=0:0:0
# Please also update the lines in contrib/windows/libhwloc.vcxproj
diff --git a/src/3rdparty/hwloc/include/hwloc.h b/src/3rdparty/hwloc/include/hwloc.h
index 88fac968..b5f0f48a 100644
--- a/src/3rdparty/hwloc/include/hwloc.h
+++ b/src/3rdparty/hwloc/include/hwloc.h
@@ -346,7 +346,8 @@ typedef enum hwloc_obj_osdev_type_e {
* For instance the "eth0" interface on Linux. */
HWLOC_OBJ_OSDEV_OPENFABRICS, /**< \brief Operating system openfabrics device.
* For instance the "mlx4_0" InfiniBand HCA,
- * or "hfi1_0" Omni-Path interface on Linux. */
+ * "hfi1_0" Omni-Path interface,
+ * or "bxi0" Atos/Bull BXI HCA on Linux. */
HWLOC_OBJ_OSDEV_DMA, /**< \brief Operating system dma engine device.
* For instance the "dma0chan0" DMA channel on Linux. */
HWLOC_OBJ_OSDEV_COPROC /**< \brief Operating system co-processor device.
@@ -1212,8 +1213,9 @@ HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpus
/** \brief Get current process or thread binding.
*
- * Writes into \p set the physical cpuset which the process or thread (according to \e
- * flags) was last bound to.
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the process or
+ * thread (according to \e flags) was last bound to.
*/
HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
@@ -1231,6 +1233,10 @@ HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t s
HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
/** \brief Get the current physical binding of process \p pid.
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the process
+ * was last bound to.
*
* \note \p hwloc_pid_t is \p pid_t on Unix platforms,
* and \p HANDLE on native Windows platforms.
@@ -1256,6 +1262,10 @@ HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thr
#ifdef hwloc_thread_t
/** \brief Get the current physical binding of thread \p tid.
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the thread
+ * was last bound to.
*
* \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
* and \p HANDLE on native Windows platforms.
@@ -1266,6 +1276,10 @@ HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thr
#endif
/** \brief Get the last physical CPU where the current process or thread ran.
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the process or
+ * thread (according to \e flags) last ran on.
*
* The operating system may move some tasks from one processor
* to another at any time according to their binding,
@@ -1281,6 +1295,10 @@ HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thr
HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
/** \brief Get the last physical CPU where a process ran.
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the process
+ * last ran on.
*
* The operating system may move some tasks from one processor
* to another at any time according to their binding,
@@ -1511,6 +1529,9 @@ HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitm
/** \brief Query the default memory binding policy and physical locality of the
* current process or thread.
*
+ * The bitmap \p set (previously allocated by the caller)
+ * is filled with the process or thread memory binding.
+ *
* This function has two output parameters: \p set and \p policy.
* The values returned in these parameters depend on both the \p flags
* passed in and the current memory binding policies and nodesets in
@@ -1571,6 +1592,9 @@ HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t
/** \brief Query the default memory binding policy and physical locality of the
* specified process.
*
+ * The bitmap \p set (previously allocated by the caller)
+ * is filled with the process memory binding.
+ *
* This function has two output parameters: \p set and \p policy.
* The values returned in these parameters depend on both the \p flags
* passed in and the current memory binding policies and nodesets in
@@ -1624,6 +1648,9 @@ HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void
/** \brief Query the CPUs near the physical NUMA node(s) and binding policy of
* the memory identified by (\p addr, \p len ).
*
+ * The bitmap \p set (previously allocated by the caller)
+ * is filled with the memory area binding.
+ *
* This function has two output parameters: \p set and \p policy.
* The values returned in these parameters depend on both the \p flags
* passed in and the memory binding policies and nodesets of the pages
@@ -1652,7 +1679,8 @@ HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void
/** \brief Get the NUMA nodes where memory identified by (\p addr, \p len ) is physically allocated.
*
- * Fills \p set according to the NUMA nodes where the memory area pages
+ * The bitmap \p set (previously allocated by the caller)
+ * is filled according to the NUMA nodes where the memory area pages
* are physically allocated. If no page is actually allocated yet,
* \p set may be empty.
*
@@ -1698,9 +1726,12 @@ HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len,
/** \brief Allocate some memory on NUMA memory nodes specified by \p set
*
- * This is similar to hwloc_alloc_membind_nodeset() except that it is allowed to change
- * the current memory binding policy, thus providing more binding support, at
- * the expense of changing the current state.
+ * First, try to allocate properly with hwloc_alloc_membind().
+ * On failure, the current process or thread memory binding policy
+ * is changed with hwloc_set_membind() before allocating memory.
+ * Thus this function works in more cases, at the expense of changing
+ * the current state (possibly affecting future allocations that
+ * would not specify any policy).
*
* If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
* Otherwise it's a cpuset.
diff --git a/src/3rdparty/hwloc/include/hwloc/autogen/config.h b/src/3rdparty/hwloc/include/hwloc/autogen/config.h
index eb70ba49..8d89fa25 100644
--- a/src/3rdparty/hwloc/include/hwloc/autogen/config.h
+++ b/src/3rdparty/hwloc/include/hwloc/autogen/config.h
@@ -1,6 +1,6 @@
/*
* Copyright © 2009 CNRS
- * Copyright © 2009-2020 Inria. All rights reserved.
+ * Copyright © 2009-2021 Inria. All rights reserved.
* Copyright © 2009-2012 Université Bordeaux
* Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
* See COPYING in top-level directory.
@@ -11,9 +11,9 @@
#ifndef HWLOC_CONFIG_H
#define HWLOC_CONFIG_H
-#define HWLOC_VERSION "2.5.0"
+#define HWLOC_VERSION "2.7.0"
#define HWLOC_VERSION_MAJOR 2
-#define HWLOC_VERSION_MINOR 5
+#define HWLOC_VERSION_MINOR 7
#define HWLOC_VERSION_RELEASE 0
#define HWLOC_VERSION_GREEK ""
diff --git a/src/3rdparty/hwloc/include/hwloc/cpukinds.h b/src/3rdparty/hwloc/include/hwloc/cpukinds.h
index f240baf3..524a05af 100644
--- a/src/3rdparty/hwloc/include/hwloc/cpukinds.h
+++ b/src/3rdparty/hwloc/include/hwloc/cpukinds.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2020 Inria. All rights reserved.
+ * Copyright © 2020-2021 Inria. All rights reserved.
* See COPYING in top-level directory.
*/
@@ -42,18 +42,23 @@ extern "C" {
* (for instance the "CoreType" and "FrequencyMaxMHz",
* see \ref topoattrs_cpukinds).
*
- * A higher efficiency value means intrinsic greater performance
+ * A higher efficiency value means greater intrinsic performance
* (and possibly less performance/power efficiency).
- * Kinds with lower efficiency are ranked first:
+ * Kinds with lower efficiency values are ranked first:
* Passing 0 as \p kind_index to hwloc_cpukinds_get_info() will
- * return information about the less efficient CPU kind.
+ * return information about the CPU kind with lower performance
+ * but higher energy-efficiency.
+ * Higher \p kind_index values would rather return information
+ * about power-hungry high-performance cores.
*
- * When available, efficiency values are gathered from the operating
- * system (when \p cpukind_efficiency is set in the
- * struct hwloc_topology_discovery_support array, only on Windows 10 for now).
- * Otherwise hwloc tries to compute efficiencies
- * by comparing CPU kinds using frequencies (on ARM),
- * or core types and frequencies (on other architectures).
+ * When available, efficiency values are gathered from the operating system.
+ * If so, \p cpukind_efficiency is set in the struct hwloc_topology_discovery_support array.
+ * This is currently available on Windows 10, Mac OS X (Darwin),
+ * and on some Linux platforms where core "capacity" is exposed in sysfs.
+ *
+ * If the operating system does not expose core efficiencies natively,
+ * hwloc tries to compute efficiencies by comparing CPU kinds using
+ * frequencies (on ARM), or core types and frequencies (on other architectures).
* The environment variable HWLOC_CPUKINDS_RANKING may be used
* to change this heuristics, see \ref envvar.
*
diff --git a/src/3rdparty/hwloc/include/hwloc/distances.h b/src/3rdparty/hwloc/include/hwloc/distances.h
index 6eac94e9..44cd7ea1 100644
--- a/src/3rdparty/hwloc/include/hwloc/distances.h
+++ b/src/3rdparty/hwloc/include/hwloc/distances.h
@@ -35,7 +35,8 @@ extern "C" {
* from a core in another node.
* The corresponding kind is ::HWLOC_DISTANCES_KIND_FROM_OS | ::HWLOC_DISTANCES_KIND_FROM_USER.
* The name of this distances structure is "NUMALatency".
- * Others distance structures include and "XGMIBandwidth" and "NVLinkBandwidth".
+ * Other distance structures include "XGMIBandwidth", "XGMIHops"
+ * and "NVLinkBandwidth".
*
* The matrix may also contain bandwidths between random sets of objects,
* possibly provided by the user, as specified in the \p kind attribute.
@@ -159,7 +160,7 @@ hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type,
* Usually only one distances structure may match a given name.
*
* The name of the most common structure is "NUMALatency".
- * Others include "XGMIBandwidth" and "NVLinkBandwidth".
+ * Others include "XGMIBandwidth", "XGMIHops" and "NVLinkBandwidth".
*/
HWLOC_DECLSPEC int
hwloc_distances_get_by_name(hwloc_topology_t topology, const char *name,
diff --git a/src/3rdparty/hwloc/include/hwloc/linux.h b/src/3rdparty/hwloc/include/hwloc/linux.h
index ecc86be3..d76633b0 100644
--- a/src/3rdparty/hwloc/include/hwloc/linux.h
+++ b/src/3rdparty/hwloc/include/hwloc/linux.h
@@ -1,6 +1,6 @@
/*
* Copyright © 2009 CNRS
- * Copyright © 2009-2016 Inria. All rights reserved.
+ * Copyright © 2009-2021 Inria. All rights reserved.
* Copyright © 2009-2011 Université Bordeaux
* See COPYING in top-level directory.
*/
@@ -44,6 +44,10 @@ extern "C" {
HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_const_cpuset_t set);
/** \brief Get the current binding of thread \p tid
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the thread
+ * was last bound to.
*
* The behavior is exactly the same as the Linux sched_getaffinity system call,
* but uses a hwloc cpuset.
@@ -54,6 +58,9 @@ HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t
HWLOC_DECLSPEC int hwloc_linux_get_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_cpuset_t set);
/** \brief Get the last physical CPU where thread \p tid ran.
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the PU which the thread last ran on.
*
* \note This is equivalent to calling hwloc_get_proc_last_cpu_location() with
* ::HWLOC_CPUBIND_THREAD as flags.
diff --git a/src/3rdparty/hwloc/include/hwloc/plugins.h b/src/3rdparty/hwloc/include/hwloc/plugins.h
index 6e4f1291..ed4b833d 100644
--- a/src/3rdparty/hwloc/include/hwloc/plugins.h
+++ b/src/3rdparty/hwloc/include/hwloc/plugins.h
@@ -497,6 +497,7 @@ hwloc_filter_check_pcidev_subtype_important(unsigned classid)
return (baseclass == 0x03 /* PCI_BASE_CLASS_DISPLAY */
|| baseclass == 0x02 /* PCI_BASE_CLASS_NETWORK */
|| baseclass == 0x01 /* PCI_BASE_CLASS_STORAGE */
+ || baseclass == 0x00 /* Unclassified, for Atos/Bull BXI */
|| baseclass == 0x0b /* PCI_BASE_CLASS_PROCESSOR */
|| classid == 0x0c04 /* PCI_CLASS_SERIAL_FIBER */
|| classid == 0x0c06 /* PCI_CLASS_SERIAL_INFINIBAND */
diff --git a/src/3rdparty/hwloc/include/private/autogen/config.h b/src/3rdparty/hwloc/include/private/autogen/config.h
index 687e82bc..5bf22fac 100644
--- a/src/3rdparty/hwloc/include/private/autogen/config.h
+++ b/src/3rdparty/hwloc/include/private/autogen/config.h
@@ -1,6 +1,6 @@
/*
* Copyright © 2009, 2011, 2012 CNRS. All rights reserved.
- * Copyright © 2009-2020 Inria. All rights reserved.
+ * Copyright © 2009-2021 Inria. All rights reserved.
* Copyright © 2009, 2011, 2012, 2015 Université Bordeaux. All rights reserved.
* Copyright © 2009-2020 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
@@ -290,10 +290,6 @@
/* Define to '1' if sysctlbyname is present and usable */
/* #undef HAVE_SYSCTLBYNAME */
-/* Define to 1 if the system has the type
- `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */
-#define HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION 1
-
/* Define to 1 if the system has the type
`SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */
#define HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX 1
diff --git a/src/3rdparty/hwloc/include/private/misc.h b/src/3rdparty/hwloc/include/private/misc.h
index 6c02d793..bc57e98e 100644
--- a/src/3rdparty/hwloc/include/private/misc.h
+++ b/src/3rdparty/hwloc/include/private/misc.h
@@ -504,7 +504,7 @@ hwloc__obj_type_is_icache(hwloc_obj_type_t type)
} \
} while(0)
#else /* HAVE_USELOCALE */
-#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+#if HWLOC_HAVE_ATTRIBUTE_UNUSED
#define hwloc_localeswitch_declare int __dummy_nolocale __hwloc_attribute_unused
#define hwloc_localeswitch_init()
#else
diff --git a/src/3rdparty/hwloc/include/private/private.h b/src/3rdparty/hwloc/include/private/private.h
index 5e216632..131b0796 100644
--- a/src/3rdparty/hwloc/include/private/private.h
+++ b/src/3rdparty/hwloc/include/private/private.h
@@ -480,6 +480,7 @@ extern char * hwloc_progname(struct hwloc_topology *topology);
#define HWLOC_GROUP_KIND_AIX_SDL_UNKNOWN 210 /* subkind is SDL level */
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220 /* no subkind */
#define HWLOC_GROUP_KIND_WINDOWS_RELATIONSHIP_UNKNOWN 221 /* no subkind */
+#define HWLOC_GROUP_KIND_LINUX_CLUSTER 222 /* no subkind */
/* distance groups */
#define HWLOC_GROUP_KIND_DISTANCE 900 /* subkind is round of adding these groups during distance based grouping */
/* finally, hwloc-specific groups required to insert something else, should disappear as soon as possible */
diff --git a/src/3rdparty/hwloc/include/private/windows.h b/src/3rdparty/hwloc/include/private/windows.h
new file mode 100644
index 00000000..0a061b09
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/windows.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright © 2009 Université Bordeaux
+ * Copyright © 2020 Inria. All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_PRIVATE_WINDOWS_H
+#define HWLOC_PRIVATE_WINDOWS_H
+
+#ifdef __GNUC__
+#define _ANONYMOUS_UNION __extension__
+#define _ANONYMOUS_STRUCT __extension__
+#else
+#define _ANONYMOUS_UNION
+#define _ANONYMOUS_STRUCT
+#endif /* __GNUC__ */
+#define DUMMYUNIONNAME
+#define DUMMYSTRUCTNAME
+
+#endif /* HWLOC_PRIVATE_WINDOWS_H */
diff --git a/src/3rdparty/hwloc/src/cpukinds.c b/src/3rdparty/hwloc/src/cpukinds.c
index 074b7a73..fc05f17e 100644
--- a/src/3rdparty/hwloc/src/cpukinds.c
+++ b/src/3rdparty/hwloc/src/cpukinds.c
@@ -42,6 +42,9 @@ hwloc_internal_cpukinds_dup(hwloc_topology_t new, hwloc_topology_t old)
struct hwloc_internal_cpukind_s *kinds;
unsigned i;
+ if (!old->nr_cpukinds)
+ return 0;
+
kinds = hwloc_tma_malloc(tma, old->nr_cpukinds * sizeof(*kinds));
if (!kinds)
return -1;
@@ -445,7 +448,9 @@ static int hwloc__cpukinds_compare_ranking_values(const void *_a, const void *_b
{
const struct hwloc_internal_cpukind_s *a = _a;
const struct hwloc_internal_cpukind_s *b = _b;
- return a->ranking_value - b->ranking_value;
+ uint64_t arv = a->ranking_value;
+ uint64_t brv = b->ranking_value;
+ return arv < brv ? -1 : arv > brv ? 1 : 0;
}
/* this function requires ranking values to be unique */
diff --git a/src/3rdparty/hwloc/src/memattrs.c b/src/3rdparty/hwloc/src/memattrs.c
index 16e9896e..92efe575 100644
--- a/src/3rdparty/hwloc/src/memattrs.c
+++ b/src/3rdparty/hwloc/src/memattrs.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2020 Inria. All rights reserved.
+ * Copyright © 2020-2021 Inria. All rights reserved.
* See COPYING in top-level directory.
*/
@@ -127,6 +127,8 @@ hwloc_internal_memattrs_dup(struct hwloc_topology *new, struct hwloc_topology *o
struct hwloc_internal_memattr_s *imattrs;
hwloc_memattr_id_t id;
+ /* old->nr_memattrs is always > 0 thanks to default memattrs */
+
imattrs = hwloc_tma_malloc(tma, old->nr_memattrs * sizeof(*imattrs));
if (!imattrs)
return -1;
diff --git a/src/3rdparty/hwloc/src/pci-common.c b/src/3rdparty/hwloc/src/pci-common.c
index 24626860..977475eb 100644
--- a/src/3rdparty/hwloc/src/pci-common.c
+++ b/src/3rdparty/hwloc/src/pci-common.c
@@ -810,13 +810,14 @@ hwloc_pcidisc_find_linkspeed(const unsigned char *config,
* PCIe Gen3 = 8 GT/s signal-rate per lane with 128/130 encoding = 1 GB/s data-rate per lane
* PCIe Gen4 = 16 GT/s signal-rate per lane with 128/130 encoding = 2 GB/s data-rate per lane
* PCIe Gen5 = 32 GT/s signal-rate per lane with 128/130 encoding = 4 GB/s data-rate per lane
+ * PCIe Gen6 = 64 GT/s signal-rate per lane with 128/130 encoding = 8 GB/s data-rate per lane
*/
/* lanespeed in Gbit/s */
if (speed <= 2)
lanespeed = 2.5f * speed * 0.8f;
else
- lanespeed = 8.0f * (1<<(speed-3)) * 128/130; /* assume Gen6 will be 64 GT/s and so on */
+ lanespeed = 8.0f * (1<<(speed-3)) * 128/130; /* assume Gen7 will be 128 GT/s and so on */
/* linkspeed in GB/s */
*linkspeed = lanespeed * width / 8;
diff --git a/src/3rdparty/hwloc/src/topology-windows.c b/src/3rdparty/hwloc/src/topology-windows.c
index d67c6b99..df93c5e9 100644
--- a/src/3rdparty/hwloc/src/topology-windows.c
+++ b/src/3rdparty/hwloc/src/topology-windows.c
@@ -13,6 +13,7 @@
#include "hwloc.h"
#include "hwloc/windows.h"
#include "private/private.h"
+#include "private/windows.h" /* must be before windows.h */
#include "private/debug.h"
#include
@@ -65,26 +66,6 @@ typedef enum _LOGICAL_PROCESSOR_RELATIONSHIP {
# endif /* HAVE_RELATIONPROCESSORPACKAGE */
#endif /* HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */
-#ifndef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION
-typedef struct _SYSTEM_LOGICAL_PROCESSOR_INFORMATION {
- ULONG_PTR ProcessorMask;
- LOGICAL_PROCESSOR_RELATIONSHIP Relationship;
- _ANONYMOUS_UNION
- union {
- struct {
- BYTE flags;
- } ProcessorCore;
- struct {
- DWORD NodeNumber;
- } NumaNode;
- CACHE_DESCRIPTOR Cache;
- ULONGLONG Reserved[2];
- } DUMMYUNIONNAME;
-} SYSTEM_LOGICAL_PROCESSOR_INFORMATION, *PSYSTEM_LOGICAL_PROCESSOR_INFORMATION;
-#endif
-
-/* Extended interface, for group support */
-
#ifndef HAVE_GROUP_AFFINITY
typedef struct _GROUP_AFFINITY {
KAFFINITY Mask;
@@ -93,35 +74,40 @@ typedef struct _GROUP_AFFINITY {
} GROUP_AFFINITY, *PGROUP_AFFINITY;
#endif
-#ifndef HAVE_PROCESSOR_RELATIONSHIP
+/* always use our own structure because the EfficiencyClass field didn't exist before Win10 */
typedef struct HWLOC_PROCESSOR_RELATIONSHIP {
BYTE Flags;
- BYTE EfficiencyClass; /* for RelationProcessorCore, higher means greater performance but less efficiency, only available in Win10+ */
+ BYTE EfficiencyClass; /* for RelationProcessorCore, higher means greater performance but less efficiency */
BYTE Reserved[20];
WORD GroupCount;
GROUP_AFFINITY GroupMask[ANYSIZE_ARRAY];
-} PROCESSOR_RELATIONSHIP, *PPROCESSOR_RELATIONSHIP;
-#endif
+} HWLOC_PROCESSOR_RELATIONSHIP;
-#ifndef HAVE_NUMA_NODE_RELATIONSHIP
-typedef struct _NUMA_NODE_RELATIONSHIP {
+/* always use our own structure because the GroupCount and GroupMasks fields didn't exist in some Win10 */
+typedef struct HWLOC_NUMA_NODE_RELATIONSHIP {
DWORD NodeNumber;
- BYTE Reserved[20];
- GROUP_AFFINITY GroupMask;
-} NUMA_NODE_RELATIONSHIP, *PNUMA_NODE_RELATIONSHIP;
-#endif
+ BYTE Reserved[18];
+ WORD GroupCount;
+ _ANONYMOUS_UNION
+ union {
+ GROUP_AFFINITY GroupMask;
+ GROUP_AFFINITY GroupMasks[ANYSIZE_ARRAY];
+ } DUMMYUNIONNAME;
+} HWLOC_NUMA_NODE_RELATIONSHIP;
-#ifndef HAVE_CACHE_RELATIONSHIP
-typedef struct _CACHE_RELATIONSHIP {
+typedef struct HWLOC_CACHE_RELATIONSHIP {
BYTE Level;
BYTE Associativity;
WORD LineSize;
DWORD CacheSize;
PROCESSOR_CACHE_TYPE Type;
- BYTE Reserved[20];
- GROUP_AFFINITY GroupMask;
-} CACHE_RELATIONSHIP, *PCACHE_RELATIONSHIP;
-#endif
+ BYTE Reserved[18];
+ WORD GroupCount;
+ union {
+ GROUP_AFFINITY GroupMask;
+ GROUP_AFFINITY GroupMasks[ANYSIZE_ARRAY];
+ } DUMMYUNIONNAME;
+} HWLOC_CACHE_RELATIONSHIP;
#ifndef HAVE_PROCESSOR_GROUP_INFO
typedef struct _PROCESSOR_GROUP_INFO {
@@ -141,20 +127,19 @@ typedef struct _GROUP_RELATIONSHIP {
} GROUP_RELATIONSHIP, *PGROUP_RELATIONSHIP;
#endif
-#ifndef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX
-typedef struct _SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX {
+/* always use our own structure because we need our own HWLOC_PROCESSOR/CACHE/NUMA_NODE_RELATIONSHIP */
+typedef struct HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX {
LOGICAL_PROCESSOR_RELATIONSHIP Relationship;
DWORD Size;
_ANONYMOUS_UNION
union {
- PROCESSOR_RELATIONSHIP Processor;
- NUMA_NODE_RELATIONSHIP NumaNode;
- CACHE_RELATIONSHIP Cache;
+ HWLOC_PROCESSOR_RELATIONSHIP Processor;
+ HWLOC_NUMA_NODE_RELATIONSHIP NumaNode;
+ HWLOC_CACHE_RELATIONSHIP Cache;
GROUP_RELATIONSHIP Group;
/* Odd: no member to tell the cpu mask of the package... */
} DUMMYUNIONNAME;
-} SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, *PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;
-#endif
+} HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;
#ifndef HAVE_PSAPI_WORKING_SET_EX_BLOCK
typedef union _PSAPI_WORKING_SET_EX_BLOCK {
@@ -200,10 +185,7 @@ static PFN_GETCURRENTPROCESSORNUMBER GetCurrentProcessorNumberProc;
typedef VOID (WINAPI *PFN_GETCURRENTPROCESSORNUMBEREX)(PPROCESSOR_NUMBER);
static PFN_GETCURRENTPROCESSORNUMBEREX GetCurrentProcessorNumberExProc;
-typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATION)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION Buffer, PDWORD ReturnLength);
-static PFN_GETLOGICALPROCESSORINFORMATION GetLogicalProcessorInformationProc;
-
-typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATIONEX)(LOGICAL_PROCESSOR_RELATIONSHIP relationship, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX Buffer, PDWORD ReturnLength);
+typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATIONEX)(LOGICAL_PROCESSOR_RELATIONSHIP relationship, HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *Buffer, PDWORD ReturnLength);
static PFN_GETLOGICALPROCESSORINFORMATIONEX GetLogicalProcessorInformationExProc;
typedef BOOL (WINAPI *PFN_SETTHREADGROUPAFFINITY)(HANDLE hThread, const GROUP_AFFINITY *GroupAffinity, PGROUP_AFFINITY PreviousGroupAffinity);
@@ -244,8 +226,6 @@ static void hwloc_win_get_function_ptrs(void)
(PFN_GETACTIVEPROCESSORGROUPCOUNT) GetProcAddress(kernel32, "GetActiveProcessorGroupCount");
GetActiveProcessorCountProc =
(PFN_GETACTIVEPROCESSORCOUNT) GetProcAddress(kernel32, "GetActiveProcessorCount");
- GetLogicalProcessorInformationProc =
- (PFN_GETLOGICALPROCESSORINFORMATION) GetProcAddress(kernel32, "GetLogicalProcessorInformation");
GetCurrentProcessorNumberProc =
(PFN_GETCURRENTPROCESSORNUMBER) GetProcAddress(kernel32, "GetCurrentProcessorNumber");
GetCurrentProcessorNumberExProc =
@@ -370,13 +350,13 @@ static hwloc_cpuset_t * processor_group_cpusets = NULL;
static void
hwloc_win_get_processor_groups(void)
{
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX procInfoTotal, tmpprocInfoTotal, procInfo;
+ HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *procInfoTotal, *tmpprocInfoTotal, *procInfo;
DWORD length;
unsigned i;
hwloc_debug("querying windows processor groups\n");
- if (!GetActiveProcessorGroupCountProc || !GetLogicalProcessorInformationExProc)
+ if (!GetLogicalProcessorInformationExProc)
goto error;
nr_processor_groups = GetActiveProcessorGroupCountProc();
@@ -415,6 +395,8 @@ hwloc_win_get_processor_groups(void)
assert(procInfo->Relationship == RelationGroup);
+ hwloc_debug("Found %u active windows processor groups\n",
+ (unsigned) procInfo->Group.ActiveGroupCount);
for (id = 0; id < procInfo->Group.ActiveGroupCount; id++) {
KAFFINITY mask;
hwloc_bitmap_t set;
@@ -424,8 +406,8 @@ hwloc_win_get_processor_groups(void)
goto error_with_cpusets;
mask = procInfo->Group.GroupInfo[id].ActiveProcessorMask;
- hwloc_debug("group %u %d cpus mask %lx\n", id,
- procInfo->Group.GroupInfo[id].ActiveProcessorCount, mask);
+ hwloc_debug("group %u with %u cpus mask 0x%llx\n", id,
+ (unsigned) procInfo->Group.GroupInfo[id].ActiveProcessorCount, (unsigned long long) mask);
/* KAFFINITY is ULONG_PTR */
hwloc_bitmap_set_ith_ULONG_PTR(set, id, mask);
/* FIXME: what if running 32bits on a 64bits windows with 64-processor groups?
@@ -1008,6 +990,8 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta
unsigned hostname_size = sizeof(hostname);
int has_efficiencyclass = 0;
struct hwloc_win_efficiency_classes eclasses;
+ char *env = getenv("HWLOC_WINDOWS_PROCESSOR_GROUP_OBJS");
+ int keep_pgroup_objs = (env && atoi(env));
assert(dstatus->phase == HWLOC_DISC_PHASE_CPU);
@@ -1038,137 +1022,8 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta
GetSystemInfo(&SystemInfo);
- if (!GetLogicalProcessorInformationExProc && GetLogicalProcessorInformationProc) {
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION procInfo, tmpprocInfo;
- unsigned id;
- unsigned i;
- struct hwloc_obj *obj;
- hwloc_obj_type_t type;
-
- length = 0;
- procInfo = NULL;
-
- while (1) {
- if (GetLogicalProcessorInformationProc(procInfo, &length))
- break;
- if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
- return -1;
- tmpprocInfo = realloc(procInfo, length);
- if (!tmpprocInfo) {
- free(procInfo);
- goto out;
- }
- procInfo = tmpprocInfo;
- }
-
- assert(!length || procInfo);
-
- for (i = 0; i < length / sizeof(*procInfo); i++) {
-
- /* Ignore unknown caches */
- if (procInfo->Relationship == RelationCache
- && procInfo->Cache.Type != CacheUnified
- && procInfo->Cache.Type != CacheData
- && procInfo->Cache.Type != CacheInstruction)
- continue;
-
- id = HWLOC_UNKNOWN_INDEX;
- switch (procInfo[i].Relationship) {
- case RelationNumaNode:
- type = HWLOC_OBJ_NUMANODE;
- id = procInfo[i].NumaNode.NodeNumber;
- gotnuma++;
- if (id > max_numanode_index)
- max_numanode_index = id;
- break;
- case RelationProcessorPackage:
- type = HWLOC_OBJ_PACKAGE;
- break;
- case RelationCache:
- type = (procInfo[i].Cache.Type == CacheInstruction ? HWLOC_OBJ_L1ICACHE : HWLOC_OBJ_L1CACHE) + procInfo[i].Cache.Level - 1;
- break;
- case RelationProcessorCore:
- type = HWLOC_OBJ_CORE;
- break;
- case RelationGroup:
- default:
- type = HWLOC_OBJ_GROUP;
- break;
- }
-
- if (!hwloc_filter_check_keep_object_type(topology, type))
- continue;
-
- obj = hwloc_alloc_setup_object(topology, type, id);
- obj->cpuset = hwloc_bitmap_alloc();
- hwloc_debug("%s#%u mask %llx\n", hwloc_obj_type_string(type), id, (unsigned long long) procInfo[i].ProcessorMask);
- /* ProcessorMask is a ULONG_PTR */
- hwloc_bitmap_set_ith_ULONG_PTR(obj->cpuset, 0, procInfo[i].ProcessorMask);
- hwloc_debug_2args_bitmap("%s#%u bitmap %s\n", hwloc_obj_type_string(type), id, obj->cpuset);
-
- switch (type) {
- case HWLOC_OBJ_NUMANODE:
- {
- ULONGLONG avail;
- obj->nodeset = hwloc_bitmap_alloc();
- hwloc_bitmap_set(obj->nodeset, id);
- if ((GetNumaAvailableMemoryNodeExProc && GetNumaAvailableMemoryNodeExProc(id, &avail))
- || (GetNumaAvailableMemoryNodeProc && GetNumaAvailableMemoryNodeProc(id, &avail))) {
- obj->attr->numanode.local_memory = avail;
- gotnumamemory++;
- }
- obj->attr->numanode.page_types_len = 2;
- obj->attr->numanode.page_types = malloc(2 * sizeof(*obj->attr->numanode.page_types));
- memset(obj->attr->numanode.page_types, 0, 2 * sizeof(*obj->attr->numanode.page_types));
- obj->attr->numanode.page_types_len = 1;
- obj->attr->numanode.page_types[0].size = SystemInfo.dwPageSize;
-#if HAVE_DECL__SC_LARGE_PAGESIZE
- obj->attr->numanode.page_types_len++;
- obj->attr->numanode.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
-#endif
- break;
- }
- case HWLOC_OBJ_L1CACHE:
- case HWLOC_OBJ_L2CACHE:
- case HWLOC_OBJ_L3CACHE:
- case HWLOC_OBJ_L4CACHE:
- case HWLOC_OBJ_L5CACHE:
- case HWLOC_OBJ_L1ICACHE:
- case HWLOC_OBJ_L2ICACHE:
- case HWLOC_OBJ_L3ICACHE:
- obj->attr->cache.size = procInfo[i].Cache.Size;
- obj->attr->cache.associativity = procInfo[i].Cache.Associativity == CACHE_FULLY_ASSOCIATIVE ? -1 : procInfo[i].Cache.Associativity ;
- obj->attr->cache.linesize = procInfo[i].Cache.LineSize;
- obj->attr->cache.depth = procInfo[i].Cache.Level;
- switch (procInfo->Cache.Type) {
- case CacheUnified:
- obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
- break;
- case CacheData:
- obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
- break;
- case CacheInstruction:
- obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
- break;
- default:
- hwloc_free_unlinked_object(obj);
- continue;
- }
- break;
- case HWLOC_OBJ_GROUP:
- obj->attr->group.kind = procInfo[i].Relationship == RelationGroup ? HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP : HWLOC_GROUP_KIND_WINDOWS_RELATIONSHIP_UNKNOWN;
- break;
- default:
- break;
- }
- hwloc__insert_object_by_cpuset(topology, NULL, obj, "windows:GetLogicalProcessorInformation");
- }
-
- free(procInfo);
- }
-
if (GetLogicalProcessorInformationExProc) {
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX procInfoTotal, tmpprocInfoTotal, procInfo;
+ HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *procInfoTotal, *tmpprocInfoTotal, *procInfo;
unsigned id;
struct hwloc_obj *obj;
hwloc_obj_type_t type;
@@ -1207,8 +1062,16 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta
switch (procInfo->Relationship) {
case RelationNumaNode:
type = HWLOC_OBJ_NUMANODE;
- num = 1;
- GroupMask = &procInfo->NumaNode.GroupMask;
+ /* Starting with Windows 11 and Server 2022, the GroupCount field is valid and >=1
+ * and we may read GroupMasks[]. Older releases have GroupCount==0 and we must read GroupMask.
+ */
+ if (procInfo->NumaNode.GroupCount) {
+ num = procInfo->NumaNode.GroupCount;
+ GroupMask = procInfo->NumaNode.GroupMasks;
+ } else {
+ num = 1;
+ GroupMask = &procInfo->NumaNode.GroupMask;
+ }
id = procInfo->NumaNode.NodeNumber;
gotnuma++;
if (id > max_numanode_index)
@@ -1221,18 +1084,20 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta
break;
case RelationCache:
type = (procInfo->Cache.Type == CacheInstruction ? HWLOC_OBJ_L1ICACHE : HWLOC_OBJ_L1CACHE) + procInfo->Cache.Level - 1;
- num = 1;
- GroupMask = &procInfo->Cache.GroupMask;
+ /* GroupCount added approximately with NumaNode.GroupCount above */
+ if (procInfo->Cache.GroupCount) {
+ num = procInfo->Cache.GroupCount;
+ GroupMask = procInfo->Cache.GroupMasks;
+ } else {
+ num = 1;
+ GroupMask = &procInfo->Cache.GroupMask;
+ }
break;
case RelationProcessorCore:
type = HWLOC_OBJ_CORE;
num = procInfo->Processor.GroupCount;
GroupMask = procInfo->Processor.GroupMask;
- if (has_efficiencyclass)
- /* the EfficiencyClass field didn't exist before Windows10 and recent MSVC headers,
- * so just access it manually instead of trying to detect it.
- */
- efficiency_class = * ((&procInfo->Processor.Flags) + 1);
+ efficiency_class = procInfo->Processor.EfficiencyClass;
break;
case RelationGroup:
/* So strange an interface... */
@@ -1257,11 +1122,12 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta
groups_pu_set = hwloc_bitmap_alloc();
hwloc_bitmap_or(groups_pu_set, groups_pu_set, set);
- if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
+ /* Ignore processor groups unless requested and filtered-in */
+ if (keep_pgroup_objs && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, id);
obj->cpuset = set;
obj->attr->group.kind = HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP;
- hwloc__insert_object_by_cpuset(topology, NULL, obj, "windows:GetLogicalProcessorInformation:ProcessorGroup");
+ hwloc__insert_object_by_cpuset(topology, NULL, obj, "windows:GetLogicalProcessorInformationEx:ProcessorGroup");
} else
hwloc_bitmap_free(set);
}
diff --git a/src/3rdparty/hwloc/src/topology-x86.c b/src/3rdparty/hwloc/src/topology-x86.c
index 94f9d453..42172eca 100644
--- a/src/3rdparty/hwloc/src/topology-x86.c
+++ b/src/3rdparty/hwloc/src/topology-x86.c
@@ -500,7 +500,8 @@ static void read_amd_cores_topoext(struct procinfo *infos, unsigned long flags,
nodes_per_proc = ((ecx >> 8) & 7) + 1;
}
if ((infos->cpufamilynumber == 0x15 && nodes_per_proc > 2)
- || ((infos->cpufamilynumber == 0x17 || infos->cpufamilynumber == 0x18) && nodes_per_proc > 4)) {
+ || ((infos->cpufamilynumber == 0x17 || infos->cpufamilynumber == 0x18) && nodes_per_proc > 4)
+ || (infos->cpufamilynumber == 0x19 && nodes_per_proc > 1)) {
hwloc_debug("warning: undefined nodes_per_proc value %u, assuming it means %u\n", nodes_per_proc, nodes_per_proc);
}
}
@@ -775,13 +776,19 @@ static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, uns
} else if (cpuid_type == amd) {
/* AMD quirks */
- if (infos->cpufamilynumber == 0x17
- && cache->level == 3 && cache->nbthreads_sharing == 6) {
- /* AMD family 0x17 always shares L3 between 8 APIC ids,
- * even when only 6 APIC ids are enabled and reported in nbthreads_sharing
- * (on 24-core CPUs).
+ if (infos->cpufamilynumber >= 0x17 && cache->level == 3) {
+ /* AMD family 0x19 always shares L3 between 16 APIC ids (8 HT cores),
+ * while Family 0x17 shares between 8 APIC ids (4 HT cores).
+ * But many models have fewer APIC ids enabled and reported in nbthreads_sharing.
+ * It means we must round up nbthreads_sharing to the next power of 2
+ * before computing cacheid.
*/
- cache->cacheid = infos->apicid / 8;
+ unsigned nbapics_sharing = cache->nbthreads_sharing;
+ if (nbapics_sharing & (nbapics_sharing-1))
+ /* not a power of two, round-up */
+ nbapics_sharing = 1U<<(1+hwloc_ffsl(nbapics_sharing));
+
+ cache->cacheid = infos->apicid / nbapics_sharing;
} else if (infos->cpufamilynumber== 0x10 && infos->cpumodelnumber == 0x9
&& cache->level == 3
@@ -807,7 +814,7 @@ static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, uns
} else if (infos->cpufamilynumber == 0x15
&& (infos->cpumodelnumber == 0x1 /* Bulldozer */ || infos->cpumodelnumber == 0x2 /* Piledriver */)
&& cache->level == 3 && cache->nbthreads_sharing == 6) {
- /* AMD Bulldozer and Piledriver 12-core processors have same APIC ids as Magny-Cours below,
+ /* AMD Bulldozer and Piledriver 12-core processors have same APIC ids as Magny-Cours above,
* but we can't merge the checks because the original nbthreads_sharing must be exactly 6 here.
*/
cache->cacheid = (infos->apicid % legacy_max_log_proc) / cache->nbthreads_sharing /* cacheid within the package */
@@ -1231,6 +1238,18 @@ static void summarize(struct hwloc_backend *backend, struct procinfo *infos, uns
}
}
cache = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX);
+ /* We don't specify the os_index of caches because we want to be
+ * 100% sure they are identical to what the Linux kernel reports
+ * (so that things like resctrl work).
+ * However, vendor/model-specific quirks in the x86 code above
+ * make this difficult.
+ *
+ * Caveat: if the x86 backend is used on Linux to avoid kernel bugs,
+ * IDs won't be available to resctrl users. But resctrl heavily
+ * relies on the kernel x86 discovery being non-buggy anyway.
+ *
+ * TODO: make this optional? or only disable it on Linux?
+ */
cache->attr->cache.depth = level;
cache->attr->cache.size = infos[i].cache[l].size;
cache->attr->cache.linesize = infos[i].cache[l].linesize;
@@ -1688,7 +1707,7 @@ hwloc_x86_check_cpuiddump_input(const char *src_cpuiddump_path, hwloc_bitmap_t s
char line [32];
dir = opendir(src_cpuiddump_path);
- if (!dir)
+ if (!dir)
return -1;
path = malloc(strlen(src_cpuiddump_path) + strlen("/hwloc-cpuid-info") + 1);
diff --git a/src/3rdparty/hwloc/src/topology-xml.c b/src/3rdparty/hwloc/src/topology-xml.c
index 87e91010..2075d6fa 100644
--- a/src/3rdparty/hwloc/src/topology-xml.c
+++ b/src/3rdparty/hwloc/src/topology-xml.c
@@ -243,7 +243,7 @@ hwloc__xml_import_object_attr(struct hwloc_topology *topology,
else if (!strcmp(name, "dont_merge")) {
unsigned long lvalue = strtoul(value, NULL, 10);
if (obj->type == HWLOC_OBJ_GROUP)
- obj->attr->group.dont_merge = lvalue;
+ obj->attr->group.dont_merge = (unsigned char) lvalue;
else if (hwloc__xml_verbose())
fprintf(stderr, "%s: ignoring dont_merge attribute for non-group object type\n",
state->global->msgprefix);
@@ -2825,6 +2825,7 @@ hwloc__xml_v1export_object_with_memory(hwloc__xml_export_state_t parentstate, hw
/* child has sibling, we must add a Group around those memory children */
hwloc_obj_t group = parentstate->global->v1_memory_group;
parentstate->new_child(parentstate, &gstate, "object");
+ group->parent = obj->parent;
group->cpuset = obj->cpuset;
group->complete_cpuset = obj->complete_cpuset;
group->nodeset = obj->nodeset;
diff --git a/src/3rdparty/hwloc/src/topology.c b/src/3rdparty/hwloc/src/topology.c
index 01e5a863..c0f39c77 100644
--- a/src/3rdparty/hwloc/src/topology.c
+++ b/src/3rdparty/hwloc/src/topology.c
@@ -69,7 +69,7 @@
* it will break in cygwin, we'll have to use both putenv() and SetEnvironmentVariable().
* Hopefully L0 will be provide a way to enable Sysman without env vars before it happens.
*/
-#ifdef HWLOC_HAVE_ATTRIBUTE_CONSTRUCTOR
+#if HWLOC_HAVE_ATTRIBUTE_CONSTRUCTOR
static void hwloc_constructor(void) __attribute__((constructor));
static void hwloc_constructor(void)
{
@@ -1901,6 +1901,9 @@ hwloc_topology_alloc_group_object(struct hwloc_topology *topology)
static void hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root);
static void propagate_total_memory(hwloc_obj_t obj);
static void hwloc_set_group_depth(hwloc_topology_t topology);
+static void hwloc_connect_children(hwloc_obj_t parent);
+static int hwloc_connect_levels(hwloc_topology_t topology);
+static int hwloc_connect_special_levels(hwloc_topology_t topology);
hwloc_obj_t
hwloc_topology_insert_group_object(struct hwloc_topology *topology, hwloc_obj_t obj)
@@ -2474,13 +2477,26 @@ hwloc_compare_levels_structure(hwloc_topology_t topology, unsigned i)
return 0;
}
-/* return > 0 if any level was removed, which means reconnect is needed */
-static void
+/* return 0 on success, -1 if reconnecting levels failed.
+ * performs its own reconnect internally if needed
+ */
+static int
hwloc_filter_levels_keep_structure(hwloc_topology_t topology)
{
unsigned i, j;
int res = 0;
+ if (topology->modified) {
+ /* WARNING: hwloc_topology_reconnect() is duplicated partially here
+ * and at the end of this function:
+ * - we need normal levels before merging.
+ * - and we'll need to update special levels after merging.
+ */
+ hwloc_connect_children(topology->levels[0][0]);
+ if (hwloc_connect_levels(topology) < 0)
+ return -1;
+ }
+
/* start from the bottom since we'll remove intermediate levels */
for(i=topology->nb_levels-1; i>0; i--) {
int replacechild = 0, replaceparent = 0;
@@ -2646,6 +2662,22 @@ hwloc_filter_levels_keep_structure(hwloc_topology_t topology)
topology->type_depth[type] = HWLOC_TYPE_DEPTH_MULTIPLE;
}
}
+
+
+ if (res > 0 || topology-> modified) {
+ /* WARNING: hwloc_topology_reconnect() is duplicated partially here
+ * and at the beginning of this function.
+ * If we merged some levels, some child+parent special children lists
+ * may have been merged, hence special levels might need reordering.
+ * So reconnect special levels only here at the end
+ * (it's not needed at the beginning of this function).
+ */
+ if (hwloc_connect_special_levels(topology) < 0)
+ return -1;
+ topology->modified = 0;
+ }
+
+ return 0;
}
static void
@@ -2963,9 +2995,9 @@ hwloc_list_special_objects(hwloc_topology_t topology, hwloc_obj_t obj)
}
}
-/* Build I/O levels */
+/* Build Memory, I/O and Misc levels */
static int
-hwloc_connect_io_misc_levels(hwloc_topology_t topology)
+hwloc_connect_special_levels(hwloc_topology_t topology)
{
unsigned i;
@@ -3176,6 +3208,10 @@ hwloc_connect_levels(hwloc_topology_t topology)
int
hwloc_topology_reconnect(struct hwloc_topology *topology, unsigned long flags)
{
+ /* WARNING: when updating this function, the replicated code must
+ * also be updated inside hwloc_filter_levels_keep_structure()
+ */
+
if (flags) {
errno = EINVAL;
return -1;
@@ -3188,7 +3224,7 @@ hwloc_topology_reconnect(struct hwloc_topology *topology, unsigned long flags)
if (hwloc_connect_levels(topology) < 0)
return -1;
- if (hwloc_connect_io_misc_levels(topology) < 0)
+ if (hwloc_connect_special_levels(topology) < 0)
return -1;
topology->modified = 0;
@@ -3529,15 +3565,12 @@ hwloc_discover(struct hwloc_topology *topology,
}
hwloc_debug_print_objects(0, topology->levels[0][0]);
- /* Reconnect things after all these changes.
- * Often needed because of Groups inserted for I/Os.
- * And required for KEEP_STRUCTURE below.
- */
- if (hwloc_topology_reconnect(topology, 0) < 0)
- return -1;
-
hwloc_debug("%s", "\nRemoving levels with HWLOC_TYPE_FILTER_KEEP_STRUCTURE\n");
- hwloc_filter_levels_keep_structure(topology);
+ if (hwloc_filter_levels_keep_structure(topology) < 0)
+ return -1;
+ /* takes care of reconnecting children/levels internally,
+ * because it needs normal levels.
+ * and it's often needed below because of Groups inserted for I/Os anyway */
hwloc_debug_print_objects(0, topology->levels[0][0]);
/* accumulate children memory in total_memory fields (only once parent is set) */
@@ -4360,14 +4393,13 @@ hwloc_topology_restrict(struct hwloc_topology *topology, hwloc_const_bitmap_t se
hwloc_bitmap_free(droppedcpuset);
hwloc_bitmap_free(droppednodeset);
- if (hwloc_topology_reconnect(topology, 0) < 0)
+ if (hwloc_filter_levels_keep_structure(topology) < 0) /* takes care of reconnecting internally */
goto out;
/* some objects may have disappeared, we need to update distances objs arrays */
hwloc_internal_distances_invalidate_cached_objs(topology);
hwloc_internal_memattrs_need_refresh(topology);
- hwloc_filter_levels_keep_structure(topology);
hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
propagate_total_memory(topology->levels[0][0]);
hwloc_internal_cpukinds_restrict(topology);