Update hwloc for MSVC builds.

2024-12-23 03:59:41 +00:00 · 2022-03-07 04:29:13 +07:00 · 2022-03-07 04:29:13 +07:00 · f25e65b5ac
commit f25e65b5ac
parent bbb19ea2f9
19 changed files with 314 additions and 255 deletions
--- a/src/3rdparty/hwloc/NEWS
+++ b/src/3rdparty/hwloc/NEWS
@ -17,6 +17,76 @@ bug fixes (and other actions) for each version of hwloc since version
 0.9.


+Version 2.7.0
+-------------
+* Backends
+  + Add support for NUMA nodes and caches with more than 64 PUs across
+    multiple processor groups on Windows 11 and Windows Server 2022.
+  + Group objects are not created for Windows processor groups anymore,
+    except if HWLOC_WINDOWS_PROCESSOR_GROUP_OBJS=1 in the environment.
+  + Expose "Cluster" group objects on Linux kernel 5.16+ for CPUs
+    that share some internal cache or bus. This can be equivalent
+    to the L2 Cache level on some platforms (e.g. x86) or a specific
+    level between L2 and L3 on others (e.g. ARM Kunpeng 920).
+    Thanks to Jonathan Cameron for the help.
+    - HWLOC_DONT_MERGE_CLUSTER_GROUPS=1 may be set in the environment
+      to prevent these groups from being merged with identical caches, etc.
+  + Improve the oneAPI LevelZero backend:
+    - Expose subdevices such as "ze0.1" inside root OS devices ("ze0")
+      when the hardware contains multiple subdevices.
+    - Add many new attributes to describe device type, and the
+      numbers of slices, subslices, execution units and threads.
+    - Expose the memory information as LevelZeroHBM/DDR/MemorySize infos.
+  + Ignore the max frequencies of cores in Linux cpukinds when the
+    base frequencies are available (to avoid exposing hybrid CPUs
+    when Intel Turbo Boost Max 3.0 gives slightly different max
+    frequencies to CPU cores).
+    - May be reverted by setting HWLOC_CPUKINDS_MAXFREQ=1 in the environment.
+* Tools
+  + Add --grey and --palette options to switch lstopo to greyscale or
+    white-background-only graphics, or to tune individual colors.
+* Build
+  + Windows CMake builds now support non-MSVC compilers, detect several
+    features at build time, can build/run tests, etc.
+    Thanks to Michael Hirsch and Alexander Neumann.
+
+
+Version 2.6.0
+-------------
+* Backends
+  + Expose two cpukinds for energy-efficient cores (icestorm) and
+    high-performance cores (firestorm) on Apple M1 on Mac OS X.
+  + Use sysfs CPU "capacity" to rank hybrid cores by efficiency
+    on Linux when available (mostly on recent ARM platforms for now).
+  + Improve HWLOC_MEMBIND_BIND (without the STRICT flag) on Linux kernel
+    >= 5.15: If more than one node is given, the kernel may now use all
+    of them instead of only the first one before falling back to others.
+  + Expose cache os_index when available on Linux, it may be needed
+    when using resctrl to configure cache partitioning, memory bandwidth
+    monitoring, etc.
+  + Add an "XGMIHops" distances matrix in the RSMI backend for AMD GPUs
+    interconnected through XGMI links.
+  + Expose AMD GPU memory information (VRAM and GTT) in the RSMI backend.
+  + Add OS devices such as "bxi0" for Atos/Bull BXI HCAs on Linux.
+* Tools
+  + lstopo has a better placement algorithm with respect to I/O
+    objects, see --children-order in the manpage for details.
+  + hwloc-annotate may now change object subtypes and cache or memory
+    sizes.
+* Build
+  + Allow specifying the ROCm installation for building the RSMI backend:
+    - Use a custom installation path if specified with --with-rocm=<dir>.
+    - Use /opt/rocm-<version> if specified with --with-rocm-version=<version>
+      or the ROCM_VERSION environment variable.
+    - Try /opt/rocm if it exists.
+    - See "How do I enable ROCm SMI and select which version to use?"
+      in the FAQ for details.
+  + Add a CMakeLists for Windows under contrib/windows-cmake/ .
+* Documentation
+  + Add FAQ entry "How do I create a custom heterogeneous and
+    asymmetric topology?"
+
+
 Version 2.5.0
 -------------
 * API
--- a/src/3rdparty/hwloc/VERSION
+++ b/src/3rdparty/hwloc/VERSION
@ -8,7 +8,7 @@
 # Please update HWLOC_VERSION* in contrib/windows/hwloc_config.h too.

 major=2
-minor=5
+minor=7
 release=0

 # greek is used for alpha or beta release tags.  If it is non-empty,
@ -22,7 +22,7 @@ greek=

 # The date when this release was created

-date="Jun 14, 2021"
+date="Dec 06, 2021"

 # If snapshot=1, then use the value from snapshot_version as the
 # entire hwloc version (i.e., ignore major, minor, release, and
@ -41,7 +41,7 @@ snapshot_version=${major}.${minor}.${release}${greek}-git
 # 2. Version numbers are described in the Libtool current:revision:age
 # format.

-libhwloc_so_version=20:0:5
+libhwloc_so_version=20:2:5
 libnetloc_so_version=0:0:0

 # Please also update the <TargetName> lines in contrib/windows/libhwloc.vcxproj
--- a/src/3rdparty/hwloc/include/hwloc.h
+++ b/src/3rdparty/hwloc/include/hwloc.h
@ -346,7 +346,8 @@ typedef enum hwloc_obj_osdev_type_e {
 				  * For instance the "eth0" interface on Linux. */
  HWLOC_OBJ_OSDEV_OPENFABRICS,	/**< \brief Operating system openfabrics device.
 				  * For instance the "mlx4_0" InfiniBand HCA,
-				  * or "hfi1_0" Omni-Path interface on Linux. */
+				  * "hfi1_0" Omni-Path interface,
+				  * or "bxi0" Atos/Bull BXI HCA on Linux. */
  HWLOC_OBJ_OSDEV_DMA,		/**< \brief Operating system dma engine device.
 				  * For instance the "dma0chan0" DMA channel on Linux. */
  HWLOC_OBJ_OSDEV_COPROC	/**< \brief Operating system co-processor device.
@ -1212,8 +1213,9 @@ HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpus

 /** \brief Get current process or thread binding.
 *
- * Writes into \p set the physical cpuset which the process or thread (according to \e
- * flags) was last bound to.
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the process or
+ * thread (according to \e flags) was last bound to.
 */
 HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);

@ -1231,6 +1233,10 @@ HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t s
 HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);

 /** \brief Get the current physical binding of process \p pid.
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the process
+ * was last bound to.
 *
 * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
 * and \p HANDLE on native Windows platforms.
@ -1256,6 +1262,10 @@ HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thr

 #ifdef hwloc_thread_t
 /** \brief Get the current physical binding of thread \p tid.
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the thread
+ * was last bound to.
 *
 * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
 * and \p HANDLE on native Windows platforms.
@ -1266,6 +1276,10 @@ HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thr
 #endif

 /** \brief Get the last physical CPU where the current process or thread ran.
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the process or
+ * thread (according to \e flags) last ran on.
 *
 * The operating system may move some tasks from one processor
 * to another at any time according to their binding,
@ -1281,6 +1295,10 @@ HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thr
 HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);

 /** \brief Get the last physical CPU where a process ran.
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the process
+ * last ran on.
 *
 * The operating system may move some tasks from one processor
 * to another at any time according to their binding,
@ -1511,6 +1529,9 @@ HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitm
 /** \brief Query the default memory binding policy and physical locality of the
 * current process or thread.
 *
+ * The bitmap \p set (previously allocated by the caller)
+ * is filled with the process or thread memory binding.
+ *
 * This function has two output parameters: \p set and \p policy.
 * The values returned in these parameters depend on both the \p flags
 * passed in and the current memory binding policies and nodesets in
@ -1571,6 +1592,9 @@ HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t
 /** \brief Query the default memory binding policy and physical locality of the
 * specified process.
 *
+ * The bitmap \p set (previously allocated by the caller)
+ * is filled with the process memory binding.
+ *
 * This function has two output parameters: \p set and \p policy.
 * The values returned in these parameters depend on both the \p flags
 * passed in and the current memory binding policies and nodesets in
@ -1624,6 +1648,9 @@ HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void
 /** \brief Query the CPUs near the physical NUMA node(s) and binding policy of
 * the memory identified by (\p addr, \p len ).
 *
+ * The bitmap \p set (previously allocated by the caller)
+ * is filled with the memory area binding.
+ *
 * This function has two output parameters: \p set and \p policy.
 * The values returned in these parameters depend on both the \p flags
 * passed in and the memory binding policies and nodesets of the pages
@ -1652,7 +1679,8 @@ HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void

 /** \brief Get the NUMA nodes where memory identified by (\p addr, \p len ) is physically allocated.
 *
- * Fills \p set according to the NUMA nodes where the memory area pages
+ * The bitmap \p set (previously allocated by the caller)
+ * is filled according to the NUMA nodes where the memory area pages
 * are physically allocated. If no page is actually allocated yet,
 * \p set may be empty.
 *
@ -1698,9 +1726,12 @@ HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len,

 /** \brief Allocate some memory on NUMA memory nodes specified by \p set
 *
- * This is similar to hwloc_alloc_membind_nodeset() except that it is allowed to change
- * the current memory binding policy, thus providing more binding support, at
- * the expense of changing the current state.
+ * First, try to allocate properly with hwloc_alloc_membind().
+ * On failure, the current process or thread memory binding policy
+ * is changed with hwloc_set_membind() before allocating memory.
+ * Thus this function works in more cases, at the expense of changing
+ * the current state (possibly affecting future allocations that
+ * would not specify any policy).
 *
 * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
 * Otherwise it's a cpuset.
--- a/src/3rdparty/hwloc/include/hwloc/autogen/config.h
+++ b/src/3rdparty/hwloc/include/hwloc/autogen/config.h
@ -1,6 +1,6 @@
 /*
 * Copyright © 2009 CNRS
- * Copyright © 2009-2020 Inria.  All rights reserved.
+ * Copyright © 2009-2021 Inria.  All rights reserved.
 * Copyright © 2009-2012 Université Bordeaux
 * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
 * See COPYING in top-level directory.
@ -11,9 +11,9 @@
 #ifndef HWLOC_CONFIG_H
 #define HWLOC_CONFIG_H

-#define HWLOC_VERSION "2.5.0"
+#define HWLOC_VERSION "2.7.0"
 #define HWLOC_VERSION_MAJOR 2
-#define HWLOC_VERSION_MINOR 5
+#define HWLOC_VERSION_MINOR 7
 #define HWLOC_VERSION_RELEASE 0
 #define HWLOC_VERSION_GREEK ""

--- a/src/3rdparty/hwloc/include/hwloc/cpukinds.h
+++ b/src/3rdparty/hwloc/include/hwloc/cpukinds.h
@ -1,5 +1,5 @@
 /*
- * Copyright © 2020 Inria.  All rights reserved.
+ * Copyright © 2020-2021 Inria.  All rights reserved.
 * See COPYING in top-level directory.
 */

@ -42,18 +42,23 @@ extern "C" {
 * (for instance the "CoreType" and "FrequencyMaxMHz",
 *  see \ref topoattrs_cpukinds).
 *
- * A higher efficiency value means intrinsic greater performance
+ * A higher efficiency value means greater intrinsic performance
 * (and possibly less performance/power efficiency).
- * Kinds with lower efficiency are ranked first:
+ * Kinds with lower efficiency values are ranked first:
 * Passing 0 as \p kind_index to hwloc_cpukinds_get_info() will
- * return information about the less efficient CPU kind.
+ * return information about the CPU kind with lower performance
+ * but higher energy-efficiency.
+ * Higher \p kind_index values instead return information
+ * about power-hungry high-performance cores.
 *
- * When available, efficiency values are gathered from the operating
- * system (when \p cpukind_efficiency is set in the
- * struct hwloc_topology_discovery_support array, only on Windows 10 for now).
- * Otherwise hwloc tries to compute efficiencies
- * by comparing CPU kinds using frequencies (on ARM),
- * or core types and frequencies (on other architectures).
+ * When available, efficiency values are gathered from the operating system.
+ * If so, \p cpukind_efficiency is set in the struct hwloc_topology_discovery_support array.
+ * This is currently available on Windows 10, Mac OS X (Darwin),
+ * and on some Linux platforms where core "capacity" is exposed in sysfs.
+ *
+ * If the operating system does not expose core efficiencies natively,
+ * hwloc tries to compute efficiencies by comparing CPU kinds using
+ * frequencies (on ARM), or core types and frequencies (on other architectures).
 * The environment variable HWLOC_CPUKINDS_RANKING may be used
 * to change this heuristics, see \ref envvar.
 *
--- a/src/3rdparty/hwloc/include/hwloc/distances.h
+++ b/src/3rdparty/hwloc/include/hwloc/distances.h
@ -35,7 +35,8 @@ extern "C" {
 * from a core in another node.
 * The corresponding kind is ::HWLOC_DISTANCES_KIND_FROM_OS | ::HWLOC_DISTANCES_KIND_FROM_USER.
 * The name of this distances structure is "NUMALatency".
- * Others distance structures include and "XGMIBandwidth" and "NVLinkBandwidth".
+ * Other distance structures include "XGMIBandwidth", "XGMIHops"
+ * and "NVLinkBandwidth".
 *
 * The matrix may also contain bandwidths between random sets of objects,
 * possibly provided by the user, as specified in the \p kind attribute.
@ -159,7 +160,7 @@ hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type,
 * Usually only one distances structure may match a given name.
 *
 * The name of the most common structure is "NUMALatency".
- * Others include "XGMIBandwidth" and "NVLinkBandwidth".
+ * Others include "XGMIBandwidth", "XGMIHops" and "NVLinkBandwidth".
 */
 HWLOC_DECLSPEC int
 hwloc_distances_get_by_name(hwloc_topology_t topology, const char *name,
--- a/src/3rdparty/hwloc/include/hwloc/linux.h
+++ b/src/3rdparty/hwloc/include/hwloc/linux.h
@ -1,6 +1,6 @@
 /*
 * Copyright © 2009 CNRS
- * Copyright © 2009-2016 Inria.  All rights reserved.
+ * Copyright © 2009-2021 Inria.  All rights reserved.
 * Copyright © 2009-2011 Université Bordeaux
 * See COPYING in top-level directory.
 */
@ -44,6 +44,10 @@ extern "C" {
 HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_const_cpuset_t set);

 /** \brief Get the current binding of thread \p tid
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the list of PUs which the thread
+ * was last bound to.
 *
 * The behavior is exactly the same as the Linux sched_getaffinity system call,
 * but uses a hwloc cpuset.
@ -54,6 +58,9 @@ HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t
 HWLOC_DECLSPEC int hwloc_linux_get_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_cpuset_t set);

 /** \brief Get the last physical CPU where thread \p tid ran.
+ *
+ * The CPU-set \p set (previously allocated by the caller)
+ * is filled with the PU which the thread last ran on.
 *
 * \note This is equivalent to calling hwloc_get_proc_last_cpu_location() with
 * ::HWLOC_CPUBIND_THREAD as flags.
--- a/src/3rdparty/hwloc/include/hwloc/plugins.h
+++ b/src/3rdparty/hwloc/include/hwloc/plugins.h
@ -497,6 +497,7 @@ hwloc_filter_check_pcidev_subtype_important(unsigned classid)
  return (baseclass == 0x03 /* PCI_BASE_CLASS_DISPLAY */
 	  || baseclass == 0x02 /* PCI_BASE_CLASS_NETWORK */
 	  || baseclass == 0x01 /* PCI_BASE_CLASS_STORAGE */
+	  || baseclass == 0x00 /* Unclassified, for Atos/Bull BXI */
 	  || baseclass == 0x0b /* PCI_BASE_CLASS_PROCESSOR */
 	  || classid == 0x0c04 /* PCI_CLASS_SERIAL_FIBER */
 	  || classid == 0x0c06 /* PCI_CLASS_SERIAL_INFINIBAND */
--- a/src/3rdparty/hwloc/include/private/autogen/config.h
+++ b/src/3rdparty/hwloc/include/private/autogen/config.h
@ -1,6 +1,6 @@
 /*
 * Copyright © 2009, 2011, 2012 CNRS.  All rights reserved.
- * Copyright © 2009-2020 Inria.  All rights reserved.
+ * Copyright © 2009-2021 Inria.  All rights reserved.
 * Copyright © 2009, 2011, 2012, 2015 Université Bordeaux.  All rights reserved.
 * Copyright © 2009-2020 Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
@ -290,10 +290,6 @@
 /* Define to '1' if sysctlbyname is present and usable */
 /* #undef HAVE_SYSCTLBYNAME */

-/* Define to 1 if the system has the type
-   `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */
-#define HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION 1
-
 /* Define to 1 if the system has the type
   `SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */
 #define HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX 1
--- a/src/3rdparty/hwloc/include/private/misc.h
+++ b/src/3rdparty/hwloc/include/private/misc.h
@ -504,7 +504,7 @@ hwloc__obj_type_is_icache(hwloc_obj_type_t type)
  }                                    \
 } while(0)
 #else /* HAVE_USELOCALE */
-#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+#if HWLOC_HAVE_ATTRIBUTE_UNUSED
 #define hwloc_localeswitch_declare int __dummy_nolocale __hwloc_attribute_unused
 #define hwloc_localeswitch_init()
 #else
--- a/src/3rdparty/hwloc/include/private/private.h
+++ b/src/3rdparty/hwloc/include/private/private.h
@ -480,6 +480,7 @@ extern char * hwloc_progname(struct hwloc_topology *topology);
 #define HWLOC_GROUP_KIND_AIX_SDL_UNKNOWN		210	/* subkind is SDL level */
 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP	220	/* no subkind */
 #define HWLOC_GROUP_KIND_WINDOWS_RELATIONSHIP_UNKNOWN	221	/* no subkind */
+#define HWLOC_GROUP_KIND_LINUX_CLUSTER                  222     /* no subkind */
 /* distance groups */
 #define HWLOC_GROUP_KIND_DISTANCE			900	/* subkind is round of adding these groups during distance based grouping */
 /* finally, hwloc-specific groups required to insert something else, should disappear as soon as possible */
--- a/src/3rdparty/hwloc/include/private/windows.h
+++ b/src/3rdparty/hwloc/include/private/windows.h
@ -0,0 +1,21 @@
+/*
+ * Copyright © 2009 Université Bordeaux
+ * Copyright © 2020 Inria.  All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_PRIVATE_WINDOWS_H
+#define HWLOC_PRIVATE_WINDOWS_H
+
+#ifdef __GNUC__
+#define _ANONYMOUS_UNION __extension__
+#define _ANONYMOUS_STRUCT __extension__
+#else
+#define _ANONYMOUS_UNION
+#define _ANONYMOUS_STRUCT
+#endif /* __GNUC__ */
+#define DUMMYUNIONNAME
+#define DUMMYSTRUCTNAME
+
+#endif /* HWLOC_PRIVATE_WINDOWS_H */
--- a/src/3rdparty/hwloc/src/cpukinds.c
+++ b/src/3rdparty/hwloc/src/cpukinds.c
@ -42,6 +42,9 @@ hwloc_internal_cpukinds_dup(hwloc_topology_t new, hwloc_topology_t old)
  struct hwloc_internal_cpukind_s *kinds;
  unsigned i;

+  if (!old->nr_cpukinds)
+    return 0;
+
  kinds = hwloc_tma_malloc(tma, old->nr_cpukinds * sizeof(*kinds));
  if (!kinds)
    return -1;
@ -445,7 +448,9 @@ static int hwloc__cpukinds_compare_ranking_values(const void *_a, const void *_b
 {
  const struct hwloc_internal_cpukind_s *a = _a;
  const struct hwloc_internal_cpukind_s *b = _b;
-  return a->ranking_value - b->ranking_value;
+  uint64_t arv = a->ranking_value;
+  uint64_t brv = b->ranking_value;
+  return arv < brv ? -1 : arv > brv ? 1 : 0;
 }

 /* this function requires ranking values to be unique */
--- a/src/3rdparty/hwloc/src/memattrs.c
+++ b/src/3rdparty/hwloc/src/memattrs.c
@ -1,5 +1,5 @@
 /*
- * Copyright © 2020 Inria.  All rights reserved.
+ * Copyright © 2020-2021 Inria.  All rights reserved.
 * See COPYING in top-level directory.
 */

@ -127,6 +127,8 @@ hwloc_internal_memattrs_dup(struct hwloc_topology *new, struct hwloc_topology *o
  struct hwloc_internal_memattr_s *imattrs;
  hwloc_memattr_id_t id;

+  /* old->nr_memattrs is always > 0 thanks to default memattrs */
+
  imattrs = hwloc_tma_malloc(tma, old->nr_memattrs * sizeof(*imattrs));
  if (!imattrs)
    return -1;
--- a/src/3rdparty/hwloc/src/pci-common.c
+++ b/src/3rdparty/hwloc/src/pci-common.c
@ -810,13 +810,14 @@ hwloc_pcidisc_find_linkspeed(const unsigned char *config,
   * PCIe Gen3 = 8  GT/s signal-rate per lane with 128/130 encoding = 1   GB/s data-rate per lane
   * PCIe Gen4 = 16 GT/s signal-rate per lane with 128/130 encoding = 2   GB/s data-rate per lane
   * PCIe Gen5 = 32 GT/s signal-rate per lane with 128/130 encoding = 4   GB/s data-rate per lane
+   * PCIe Gen6 = 64 GT/s signal-rate per lane with 128/130 encoding = 8   GB/s data-rate per lane
   */

  /* lanespeed in Gbit/s */
  if (speed <= 2)
    lanespeed = 2.5f * speed * 0.8f;
  else
-    lanespeed = 8.0f * (1<<(speed-3)) * 128/130; /* assume Gen6 will be 64 GT/s and so on */
+    lanespeed = 8.0f * (1<<(speed-3)) * 128/130; /* assume Gen7 will be 128 GT/s and so on */

  /* linkspeed in GB/s */
  *linkspeed = lanespeed * width / 8;
--- a/src/3rdparty/hwloc/src/topology-windows.c
+++ b/src/3rdparty/hwloc/src/topology-windows.c
@ -13,6 +13,7 @@
 #include "hwloc.h"
 #include "hwloc/windows.h"
 #include "private/private.h"
+#include "private/windows.h" /* must be before windows.h */
 #include "private/debug.h"

 #include <windows.h>
@ -65,26 +66,6 @@ typedef enum _LOGICAL_PROCESSOR_RELATIONSHIP {
 #  endif /* HAVE_RELATIONPROCESSORPACKAGE */
 #endif /* HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */

-#ifndef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION
-typedef struct _SYSTEM_LOGICAL_PROCESSOR_INFORMATION {
-  ULONG_PTR ProcessorMask;
-  LOGICAL_PROCESSOR_RELATIONSHIP Relationship;
-  _ANONYMOUS_UNION
-  union {
-    struct {
-      BYTE flags;
-    } ProcessorCore;
-    struct {
-      DWORD NodeNumber;
-    } NumaNode;
-    CACHE_DESCRIPTOR Cache;
-    ULONGLONG Reserved[2];
-  } DUMMYUNIONNAME;
-} SYSTEM_LOGICAL_PROCESSOR_INFORMATION, *PSYSTEM_LOGICAL_PROCESSOR_INFORMATION;
-#endif
-
-/* Extended interface, for group support */
-
 #ifndef HAVE_GROUP_AFFINITY
 typedef struct _GROUP_AFFINITY {
  KAFFINITY Mask;
@ -93,35 +74,40 @@ typedef struct _GROUP_AFFINITY {
 } GROUP_AFFINITY, *PGROUP_AFFINITY;
 #endif

-#ifndef HAVE_PROCESSOR_RELATIONSHIP
+/* always use our own structure because the EfficiencyClass field didn't exist before Win10 */
 typedef struct HWLOC_PROCESSOR_RELATIONSHIP {
  BYTE Flags;
-  BYTE EfficiencyClass; /* for RelationProcessorCore, higher means greater performance but less efficiency, only available in Win10+ */
+  BYTE EfficiencyClass; /* for RelationProcessorCore, higher means greater performance but less efficiency */
  BYTE Reserved[20];
  WORD GroupCount;
  GROUP_AFFINITY GroupMask[ANYSIZE_ARRAY];
-} PROCESSOR_RELATIONSHIP, *PPROCESSOR_RELATIONSHIP;
-#endif
+} HWLOC_PROCESSOR_RELATIONSHIP;

-#ifndef HAVE_NUMA_NODE_RELATIONSHIP
-typedef struct _NUMA_NODE_RELATIONSHIP {
+/* always use our own structure because the GroupCount and GroupMasks fields didn't exist in some Win10 releases */
+typedef struct HWLOC_NUMA_NODE_RELATIONSHIP {
  DWORD NodeNumber;
-  BYTE Reserved[20];
-  GROUP_AFFINITY GroupMask;
-} NUMA_NODE_RELATIONSHIP, *PNUMA_NODE_RELATIONSHIP;
-#endif
+  BYTE Reserved[18];
+  WORD GroupCount;
+  _ANONYMOUS_UNION
+  union {
+    GROUP_AFFINITY GroupMask;
+    GROUP_AFFINITY GroupMasks[ANYSIZE_ARRAY];
+  } DUMMYUNIONNAME;
+} HWLOC_NUMA_NODE_RELATIONSHIP;

-#ifndef HAVE_CACHE_RELATIONSHIP
-typedef struct _CACHE_RELATIONSHIP {
+typedef struct HWLOC_CACHE_RELATIONSHIP {
  BYTE Level;
  BYTE Associativity;
  WORD LineSize;
  DWORD CacheSize;
  PROCESSOR_CACHE_TYPE Type;
-  BYTE Reserved[20];
-  GROUP_AFFINITY GroupMask;
-} CACHE_RELATIONSHIP, *PCACHE_RELATIONSHIP;
-#endif
+  BYTE Reserved[18];
+  WORD GroupCount;
+  union {
+    GROUP_AFFINITY GroupMask;
+    GROUP_AFFINITY GroupMasks[ANYSIZE_ARRAY];
+  } DUMMYUNIONNAME;
+} HWLOC_CACHE_RELATIONSHIP;

 #ifndef HAVE_PROCESSOR_GROUP_INFO
 typedef struct _PROCESSOR_GROUP_INFO {
@ -141,20 +127,19 @@ typedef struct _GROUP_RELATIONSHIP {
 } GROUP_RELATIONSHIP, *PGROUP_RELATIONSHIP;
 #endif

-#ifndef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX
-typedef struct _SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX {
+/* always use our own structure because we need our own HWLOC_PROCESSOR/CACHE/NUMA_NODE_RELATIONSHIP */
+typedef struct HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX {
  LOGICAL_PROCESSOR_RELATIONSHIP Relationship;
  DWORD Size;
  _ANONYMOUS_UNION
  union {
-    PROCESSOR_RELATIONSHIP Processor;
-    NUMA_NODE_RELATIONSHIP NumaNode;
-    CACHE_RELATIONSHIP Cache;
+    HWLOC_PROCESSOR_RELATIONSHIP Processor;
+    HWLOC_NUMA_NODE_RELATIONSHIP NumaNode;
+    HWLOC_CACHE_RELATIONSHIP Cache;
    GROUP_RELATIONSHIP Group;
    /* Odd: no member to tell the cpu mask of the package... */
  } DUMMYUNIONNAME;
-} SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, *PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;
-#endif
+} HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;

 #ifndef HAVE_PSAPI_WORKING_SET_EX_BLOCK
 typedef union _PSAPI_WORKING_SET_EX_BLOCK {
@ -200,10 +185,7 @@ static PFN_GETCURRENTPROCESSORNUMBER GetCurrentProcessorNumberProc;
 typedef VOID (WINAPI *PFN_GETCURRENTPROCESSORNUMBEREX)(PPROCESSOR_NUMBER);
 static PFN_GETCURRENTPROCESSORNUMBEREX GetCurrentProcessorNumberExProc;

-typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATION)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION Buffer, PDWORD ReturnLength);
-static PFN_GETLOGICALPROCESSORINFORMATION GetLogicalProcessorInformationProc;
-
-typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATIONEX)(LOGICAL_PROCESSOR_RELATIONSHIP relationship, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX Buffer, PDWORD ReturnLength);
+typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATIONEX)(LOGICAL_PROCESSOR_RELATIONSHIP relationship, HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *Buffer, PDWORD ReturnLength);
 static PFN_GETLOGICALPROCESSORINFORMATIONEX GetLogicalProcessorInformationExProc;

 typedef BOOL (WINAPI *PFN_SETTHREADGROUPAFFINITY)(HANDLE hThread, const GROUP_AFFINITY *GroupAffinity, PGROUP_AFFINITY PreviousGroupAffinity);
@ -244,8 +226,6 @@ static void hwloc_win_get_function_ptrs(void)
 	(PFN_GETACTIVEPROCESSORGROUPCOUNT) GetProcAddress(kernel32, "GetActiveProcessorGroupCount");
      GetActiveProcessorCountProc =
 	(PFN_GETACTIVEPROCESSORCOUNT) GetProcAddress(kernel32, "GetActiveProcessorCount");
-      GetLogicalProcessorInformationProc =
-	(PFN_GETLOGICALPROCESSORINFORMATION) GetProcAddress(kernel32, "GetLogicalProcessorInformation");
      GetCurrentProcessorNumberProc =
 	(PFN_GETCURRENTPROCESSORNUMBER) GetProcAddress(kernel32, "GetCurrentProcessorNumber");
      GetCurrentProcessorNumberExProc =
@ -370,13 +350,13 @@ static hwloc_cpuset_t * processor_group_cpusets = NULL;
 static void
 hwloc_win_get_processor_groups(void)
 {
-  PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX procInfoTotal, tmpprocInfoTotal, procInfo;
+  HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *procInfoTotal, *tmpprocInfoTotal, *procInfo;
  DWORD length;
  unsigned i;

  hwloc_debug("querying windows processor groups\n");

-  if (!GetActiveProcessorGroupCountProc || !GetLogicalProcessorInformationExProc)
+  if (!GetLogicalProcessorInformationExProc)
    goto error;

  nr_processor_groups = GetActiveProcessorGroupCountProc();
@ -415,6 +395,8 @@ hwloc_win_get_processor_groups(void)

    assert(procInfo->Relationship == RelationGroup);

+    hwloc_debug("Found %u active windows processor groups\n",
+                (unsigned) procInfo->Group.ActiveGroupCount);
    for (id = 0; id < procInfo->Group.ActiveGroupCount; id++) {
      KAFFINITY mask;
      hwloc_bitmap_t set;
@ -424,8 +406,8 @@ hwloc_win_get_processor_groups(void)
        goto error_with_cpusets;

      mask = procInfo->Group.GroupInfo[id].ActiveProcessorMask;
-      hwloc_debug("group %u %d cpus mask %lx\n", id,
-                  procInfo->Group.GroupInfo[id].ActiveProcessorCount, mask);
+      hwloc_debug("group %u with %u cpus mask 0x%llx\n", id,
+                  (unsigned) procInfo->Group.GroupInfo[id].ActiveProcessorCount, (unsigned long long) mask);
      /* KAFFINITY is ULONG_PTR */
      hwloc_bitmap_set_ith_ULONG_PTR(set, id, mask);
      /* FIXME: what if running 32bits on a 64bits windows with 64-processor groups?
@ -1008,6 +990,8 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta
  unsigned hostname_size = sizeof(hostname);
  int has_efficiencyclass = 0;
  struct hwloc_win_efficiency_classes eclasses;
+  char *env = getenv("HWLOC_WINDOWS_PROCESSOR_GROUP_OBJS");
+  int keep_pgroup_objs = (env && atoi(env));

  assert(dstatus->phase == HWLOC_DISC_PHASE_CPU);

@ -1038,137 +1022,8 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta

  GetSystemInfo(&SystemInfo);

-  if (!GetLogicalProcessorInformationExProc && GetLogicalProcessorInformationProc) {
-      PSYSTEM_LOGICAL_PROCESSOR_INFORMATION procInfo, tmpprocInfo;
-      unsigned id;
-      unsigned i;
-      struct hwloc_obj *obj;
-      hwloc_obj_type_t type;
-
-      length = 0;
-      procInfo = NULL;
-
-      while (1) {
-	if (GetLogicalProcessorInformationProc(procInfo, &length))
-	  break;
-	if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
-	  return -1;
-	tmpprocInfo = realloc(procInfo, length);
-	if (!tmpprocInfo) {
-	  free(procInfo);
-	  goto out;
-	}
-	procInfo = tmpprocInfo;
-      }
-
-      assert(!length || procInfo);
-
-      for (i = 0; i < length / sizeof(*procInfo); i++) {
-
-        /* Ignore unknown caches */
-	if (procInfo->Relationship == RelationCache
-		&& procInfo->Cache.Type != CacheUnified
-		&& procInfo->Cache.Type != CacheData
-		&& procInfo->Cache.Type != CacheInstruction)
-	  continue;
-
-	id = HWLOC_UNKNOWN_INDEX;
-	switch (procInfo[i].Relationship) {
-	  case RelationNumaNode:
-	    type = HWLOC_OBJ_NUMANODE;
-	    id = procInfo[i].NumaNode.NodeNumber;
-	    gotnuma++;
-	    if (id > max_numanode_index)
-	      max_numanode_index = id;
-	    break;
-	  case RelationProcessorPackage:
-	    type = HWLOC_OBJ_PACKAGE;
-	    break;
-	  case RelationCache:
-	    type = (procInfo[i].Cache.Type == CacheInstruction ? HWLOC_OBJ_L1ICACHE : HWLOC_OBJ_L1CACHE) + procInfo[i].Cache.Level - 1;
-	    break;
-	  case RelationProcessorCore:
-	    type = HWLOC_OBJ_CORE;
-	    break;
-	  case RelationGroup:
-	  default:
-	    type = HWLOC_OBJ_GROUP;
-	    break;
-	}
-
-	if (!hwloc_filter_check_keep_object_type(topology, type))
-	  continue;
-
-	obj = hwloc_alloc_setup_object(topology, type, id);
-        obj->cpuset = hwloc_bitmap_alloc();
-	hwloc_debug("%s#%u mask %llx\n", hwloc_obj_type_string(type), id, (unsigned long long) procInfo[i].ProcessorMask);
-	/* ProcessorMask is a ULONG_PTR */
-	hwloc_bitmap_set_ith_ULONG_PTR(obj->cpuset, 0, procInfo[i].ProcessorMask);
-	hwloc_debug_2args_bitmap("%s#%u bitmap %s\n", hwloc_obj_type_string(type), id, obj->cpuset);
-
-	switch (type) {
-	  case HWLOC_OBJ_NUMANODE:
-	    {
-	      ULONGLONG avail;
-	      obj->nodeset = hwloc_bitmap_alloc();
-	      hwloc_bitmap_set(obj->nodeset, id);
-	      if ((GetNumaAvailableMemoryNodeExProc && GetNumaAvailableMemoryNodeExProc(id, &avail))
-		  || (GetNumaAvailableMemoryNodeProc && GetNumaAvailableMemoryNodeProc(id, &avail))) {
-		obj->attr->numanode.local_memory = avail;
-		gotnumamemory++;
-	      }
-	      obj->attr->numanode.page_types_len = 2;
-	      obj->attr->numanode.page_types = malloc(2 * sizeof(*obj->attr->numanode.page_types));
-	      memset(obj->attr->numanode.page_types, 0, 2 * sizeof(*obj->attr->numanode.page_types));
-	      obj->attr->numanode.page_types_len = 1;
-	      obj->attr->numanode.page_types[0].size = SystemInfo.dwPageSize;
-#if HAVE_DECL__SC_LARGE_PAGESIZE
-	      obj->attr->numanode.page_types_len++;
-	      obj->attr->numanode.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
-#endif
-	      break;
-	    }
-	  case HWLOC_OBJ_L1CACHE:
-	  case HWLOC_OBJ_L2CACHE:
-	  case HWLOC_OBJ_L3CACHE:
-	  case HWLOC_OBJ_L4CACHE:
-	  case HWLOC_OBJ_L5CACHE:
-	  case HWLOC_OBJ_L1ICACHE:
-	  case HWLOC_OBJ_L2ICACHE:
-	  case HWLOC_OBJ_L3ICACHE:
-	    obj->attr->cache.size = procInfo[i].Cache.Size;
-	    obj->attr->cache.associativity = procInfo[i].Cache.Associativity == CACHE_FULLY_ASSOCIATIVE ? -1 : procInfo[i].Cache.Associativity ;
-	    obj->attr->cache.linesize = procInfo[i].Cache.LineSize;
-	    obj->attr->cache.depth = procInfo[i].Cache.Level;
-	    switch (procInfo->Cache.Type) {
-	      case CacheUnified:
-		obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
-		break;
-	      case CacheData:
-		obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
-		break;
-	      case CacheInstruction:
-		obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
-		break;
-	      default:
-		hwloc_free_unlinked_object(obj);
-		continue;
-	    }
-	    break;
-	  case HWLOC_OBJ_GROUP:
-	    obj->attr->group.kind = procInfo[i].Relationship == RelationGroup ? HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP : HWLOC_GROUP_KIND_WINDOWS_RELATIONSHIP_UNKNOWN;
-	    break;
-	  default:
-	    break;
-	}
-	hwloc__insert_object_by_cpuset(topology, NULL, obj, "windows:GetLogicalProcessorInformation");
-      }
-
-      free(procInfo);
-  }
-
  if (GetLogicalProcessorInformationExProc) {
-      PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX procInfoTotal, tmpprocInfoTotal, procInfo;
+      HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *procInfoTotal, *tmpprocInfoTotal, *procInfo;
      unsigned id;
      struct hwloc_obj *obj;
      hwloc_obj_type_t type;
@ -1207,8 +1062,16 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta
 	switch (procInfo->Relationship) {
 	  case RelationNumaNode:
 	    type = HWLOC_OBJ_NUMANODE;
-            num = 1;
-            GroupMask = &procInfo->NumaNode.GroupMask;
+            /* Starting with Windows 11 and Server 2022, the GroupCount field is valid and >=1
+             * and we may read GroupMasks[]. Older releases have GroupCount==0 and we must read GroupMask.
+             */
+            if (procInfo->NumaNode.GroupCount) {
+              num = procInfo->NumaNode.GroupCount;
+              GroupMask = procInfo->NumaNode.GroupMasks;
+            } else {
+              num = 1;
+              GroupMask = &procInfo->NumaNode.GroupMask;
+            }
 	    id = procInfo->NumaNode.NodeNumber;
 	    gotnuma++;
 	    if (id > max_numanode_index)
@ -1221,18 +1084,20 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta
 	    break;
 	  case RelationCache:
 	    type = (procInfo->Cache.Type == CacheInstruction ? HWLOC_OBJ_L1ICACHE : HWLOC_OBJ_L1CACHE) + procInfo->Cache.Level - 1;
-            num = 1;
-            GroupMask = &procInfo->Cache.GroupMask;
+            /* GroupCount added approximately with NumaNode.GroupCount above */
+            if (procInfo->Cache.GroupCount) {
+              num = procInfo->Cache.GroupCount;
+              GroupMask = procInfo->Cache.GroupMasks;
+            } else {
+              num = 1;
+              GroupMask = &procInfo->Cache.GroupMask;
+            }
 	    break;
 	  case RelationProcessorCore:
 	    type = HWLOC_OBJ_CORE;
            num = procInfo->Processor.GroupCount;
            GroupMask = procInfo->Processor.GroupMask;
-            if (has_efficiencyclass)
-              /* the EfficiencyClass field didn't exist before Windows10 and recent MSVC headers,
-               * so just access it manually instead of trying to detect it.
-               */
-              efficiency_class = * ((&procInfo->Processor.Flags) + 1);
+            efficiency_class = procInfo->Processor.EfficiencyClass;
 	    break;
 	  case RelationGroup:
 	    /* So strange an interface... */
@ -1257,11 +1122,12 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta
 		groups_pu_set = hwloc_bitmap_alloc();
 	      hwloc_bitmap_or(groups_pu_set, groups_pu_set, set);

-	      if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
+              /* Ignore processor groups unless requested and filtered-in */
+              if (keep_pgroup_objs && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
 		obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, id);
 		obj->cpuset = set;
 		obj->attr->group.kind = HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP;
-		hwloc__insert_object_by_cpuset(topology, NULL, obj, "windows:GetLogicalProcessorInformation:ProcessorGroup");
+		hwloc__insert_object_by_cpuset(topology, NULL, obj, "windows:GetLogicalProcessorInformationEx:ProcessorGroup");
 	      } else
 		hwloc_bitmap_free(set);
 	    }
--- a/src/3rdparty/hwloc/src/topology-x86.c
+++ b/src/3rdparty/hwloc/src/topology-x86.c
@ -500,7 +500,8 @@ static void read_amd_cores_topoext(struct procinfo *infos, unsigned long flags,
      nodes_per_proc = ((ecx >> 8) & 7) + 1;
    }
    if ((infos->cpufamilynumber == 0x15 && nodes_per_proc > 2)
-	|| ((infos->cpufamilynumber == 0x17 || infos->cpufamilynumber == 0x18) && nodes_per_proc > 4)) {
+	|| ((infos->cpufamilynumber == 0x17 || infos->cpufamilynumber == 0x18) && nodes_per_proc > 4)
+        || (infos->cpufamilynumber == 0x19 && nodes_per_proc > 1)) {
      hwloc_debug("warning: undefined nodes_per_proc value %u, assuming it means %u\n", nodes_per_proc, nodes_per_proc);
    }
  }
@ -775,13 +776,19 @@ static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, uns

    } else if (cpuid_type == amd) {
      /* AMD quirks */
-      if (infos->cpufamilynumber == 0x17
-	  && cache->level == 3 && cache->nbthreads_sharing == 6) {
-	/* AMD family 0x17 always shares L3 between 8 APIC ids,
-	 * even when only 6 APIC ids are enabled and reported in nbthreads_sharing
-	 * (on 24-core CPUs).
+      if (infos->cpufamilynumber >= 0x17 && cache->level == 3) {
+	/* AMD family 0x19 always shares L3 between 16 APIC ids (8 HT cores),
+         * while Family 0x17 shares between 8 APIC ids (4 HT cores).
+         * But many models have fewer APIC ids enabled and reported in nbthreads_sharing.
+         * It means we must round up nbthreads_sharing to the next power of 2
+         * before computing cacheid.
 	 */
-	cache->cacheid = infos->apicid / 8;
+        unsigned nbapics_sharing = cache->nbthreads_sharing;
+        if (nbapics_sharing & (nbapics_sharing-1))
+          /* not a power of two, round-up */
+          nbapics_sharing = 1U<<(1+hwloc_ffsl(nbapics_sharing));
+
+	cache->cacheid = infos->apicid / nbapics_sharing;

      } else if (infos->cpufamilynumber== 0x10 && infos->cpumodelnumber == 0x9
 	  && cache->level == 3
@ -807,7 +814,7 @@ static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, uns
      } else if (infos->cpufamilynumber == 0x15
 		 && (infos->cpumodelnumber == 0x1 /* Bulldozer */ || infos->cpumodelnumber == 0x2 /* Piledriver */)
 		 && cache->level == 3 && cache->nbthreads_sharing == 6) {
-	/* AMD Bulldozer and Piledriver 12-core processors have same APIC ids as Magny-Cours below,
+	/* AMD Bulldozer and Piledriver 12-core processors have same APIC ids as Magny-Cours above,
 	 * but we can't merge the checks because the original nbthreads_sharing must be exactly 6 here.
 	 */
 	cache->cacheid = (infos->apicid % legacy_max_log_proc) / cache->nbthreads_sharing /* cacheid within the package */
@ -1231,6 +1238,18 @@ static void summarize(struct hwloc_backend *backend, struct procinfo *infos, uns
 	    }
 	  }
 	  cache = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX);
+          /* We don't specify the os_index of caches because we want to be
+           * 100% sure they are identical to what the Linux kernel reports
+           * (so that things like resctrl work).
+           * However, vendor/model-specific quirks in the x86 code above
+           * make this difficult.
+           *
+           * Caveat: if the x86 backend is used on Linux to avoid kernel bugs,
+           * IDs won't be available to resctrl users. But resctrl heavily
+           * relies on the kernel x86 discovery being non-buggy anyway.
+           *
+           * TODO: make this optional? or only disable it on Linux?
+           */
 	  cache->attr->cache.depth = level;
 	  cache->attr->cache.size = infos[i].cache[l].size;
 	  cache->attr->cache.linesize = infos[i].cache[l].linesize;
--- a/src/3rdparty/hwloc/src/topology-xml.c
+++ b/src/3rdparty/hwloc/src/topology-xml.c
@ -243,7 +243,7 @@ hwloc__xml_import_object_attr(struct hwloc_topology *topology,
  else if (!strcmp(name, "dont_merge")) {
    unsigned long lvalue = strtoul(value, NULL, 10);
    if (obj->type == HWLOC_OBJ_GROUP)
-      obj->attr->group.dont_merge = lvalue;
+      obj->attr->group.dont_merge = (unsigned char) lvalue;
    else if (hwloc__xml_verbose())
      fprintf(stderr, "%s: ignoring dont_merge attribute for non-group object type\n",
 	      state->global->msgprefix);
@ -2825,6 +2825,7 @@ hwloc__xml_v1export_object_with_memory(hwloc__xml_export_state_t parentstate, hw
    /* child has sibling, we must add a Group around those memory children */
    hwloc_obj_t group = parentstate->global->v1_memory_group;
    parentstate->new_child(parentstate, &gstate, "object");
+    group->parent = obj->parent;
    group->cpuset = obj->cpuset;
    group->complete_cpuset = obj->complete_cpuset;
    group->nodeset = obj->nodeset;
--- a/src/3rdparty/hwloc/src/topology.c
+++ b/src/3rdparty/hwloc/src/topology.c
@ -69,7 +69,7 @@
 * it will break in cygwin, we'll have to use both putenv() and SetEnvironmentVariable().
 * Hopefully L0 will be provide a way to enable Sysman without env vars before it happens.
 */
-#ifdef HWLOC_HAVE_ATTRIBUTE_CONSTRUCTOR
+#if HWLOC_HAVE_ATTRIBUTE_CONSTRUCTOR
 static void hwloc_constructor(void) __attribute__((constructor));
 static void hwloc_constructor(void)
 {
@ -1901,6 +1901,9 @@ hwloc_topology_alloc_group_object(struct hwloc_topology *topology)
 static void hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root);
 static void propagate_total_memory(hwloc_obj_t obj);
 static void hwloc_set_group_depth(hwloc_topology_t topology);
+static void hwloc_connect_children(hwloc_obj_t parent);
+static int hwloc_connect_levels(hwloc_topology_t topology);
+static int hwloc_connect_special_levels(hwloc_topology_t topology);

 hwloc_obj_t
 hwloc_topology_insert_group_object(struct hwloc_topology *topology, hwloc_obj_t obj)
@ -2474,13 +2477,26 @@ hwloc_compare_levels_structure(hwloc_topology_t topology, unsigned i)
  return 0;
 }

-/* return > 0 if any level was removed, which means reconnect is needed */
-static void
+/* return > 0 if any level was removed.
+ * performs its own reconnect internally if needed
+ */
+static int
 hwloc_filter_levels_keep_structure(hwloc_topology_t topology)
 {
  unsigned i, j;
  int res = 0;

+  if (topology->modified) {
+    /* WARNING: hwloc_topology_reconnect() is duplicated partially here
+     * and at the end of this function:
+     * - we need normal levels before merging.
+     * - and we'll need to update special levels after merging.
+     */
+    hwloc_connect_children(topology->levels[0][0]);
+    if (hwloc_connect_levels(topology) < 0)
+      return -1;
+  }
+
  /* start from the bottom since we'll remove intermediate levels */
  for(i=topology->nb_levels-1; i>0; i--) {
    int replacechild = 0, replaceparent = 0;
@ -2646,6 +2662,22 @@ hwloc_filter_levels_keep_structure(hwloc_topology_t topology)
 	topology->type_depth[type] = HWLOC_TYPE_DEPTH_MULTIPLE;
    }
  }
+
+
+  if (res > 0 || topology-> modified) {
+    /* WARNING: hwloc_topology_reconnect() is duplicated partially here
+     * and at the beginning of this function.
+     * If we merged some levels, some child+parent special children lists
+     * may have been merged, hence special levels might need reordering.
+     * So reconnect special levels only here at the end
+     * (it's not needed at the beginning of this function).
+     */
+    if (hwloc_connect_special_levels(topology) < 0)
+      return -1;
+    topology->modified = 0;
+  }
+
+  return 0;
 }

 static void
@ -2963,9 +2995,9 @@ hwloc_list_special_objects(hwloc_topology_t topology, hwloc_obj_t obj)
  }
 }

-/* Build I/O levels */
+/* Build Memory, I/O and Misc levels */
 static int
-hwloc_connect_io_misc_levels(hwloc_topology_t topology)
+hwloc_connect_special_levels(hwloc_topology_t topology)
 {
  unsigned i;

@ -3176,6 +3208,10 @@ hwloc_connect_levels(hwloc_topology_t topology)
 int
 hwloc_topology_reconnect(struct hwloc_topology *topology, unsigned long flags)
 {
+  /* WARNING: when updating this function, the replicated code must
+   * also be updated inside hwloc_filter_levels_keep_structure()
+   */
+
  if (flags) {
    errno = EINVAL;
    return -1;
@ -3188,7 +3224,7 @@ hwloc_topology_reconnect(struct hwloc_topology *topology, unsigned long flags)
  if (hwloc_connect_levels(topology) < 0)
    return -1;

-  if (hwloc_connect_io_misc_levels(topology) < 0)
+  if (hwloc_connect_special_levels(topology) < 0)
    return -1;

  topology->modified = 0;
@ -3529,15 +3565,12 @@ hwloc_discover(struct hwloc_topology *topology,
  }
  hwloc_debug_print_objects(0, topology->levels[0][0]);

-  /* Reconnect things after all these changes.
-   * Often needed because of Groups inserted for I/Os.
-   * And required for KEEP_STRUCTURE below.
-   */
-  if (hwloc_topology_reconnect(topology, 0) < 0)
-    return -1;
-
  hwloc_debug("%s", "\nRemoving levels with HWLOC_TYPE_FILTER_KEEP_STRUCTURE\n");
-  hwloc_filter_levels_keep_structure(topology);
+  if (hwloc_filter_levels_keep_structure(topology) < 0)
+    return -1;
+  /* takes care of reconnecting children/levels internally,
+   * because it needs normal levels,
+   * and it's often needed below because of Groups inserted for I/Os anyway */
  hwloc_debug_print_objects(0, topology->levels[0][0]);

  /* accumulate children memory in total_memory fields (only once parent is set) */
@ -4360,14 +4393,13 @@ hwloc_topology_restrict(struct hwloc_topology *topology, hwloc_const_bitmap_t se
  hwloc_bitmap_free(droppedcpuset);
  hwloc_bitmap_free(droppednodeset);

-  if (hwloc_topology_reconnect(topology, 0) < 0)
+  if (hwloc_filter_levels_keep_structure(topology) < 0) /* takes care of reconnecting internally */
    goto out;

  /* some objects may have disappeared, we need to update distances objs arrays */
  hwloc_internal_distances_invalidate_cached_objs(topology);
  hwloc_internal_memattrs_need_refresh(topology);

-  hwloc_filter_levels_keep_structure(topology);
  hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
  propagate_total_memory(topology->levels[0][0]);
  hwloc_internal_cpukinds_restrict(topology);