diff --git a/src/3rdparty/hwloc/NEWS b/src/3rdparty/hwloc/NEWS index 0bf74d44..71f858a3 100644 --- a/src/3rdparty/hwloc/NEWS +++ b/src/3rdparty/hwloc/NEWS @@ -17,6 +17,76 @@ bug fixes (and other actions) for each version of hwloc since version 0.9. +Version 2.7.0 +------------- +* Backends + + Add support for NUMA nodes and caches with more than 64 PUs across + multiple processor groups on Windows 11 and Windows Server 2022. + + Group objects are not created for Windows processor groups anymore, + except if HWLOC_WINDOWS_PROCESSOR_GROUP_OBJS=1 in the environment. + + Expose "Cluster" group objects on Linux kernel 5.16+ for CPUs + that share some internal cache or bus. This can be equivalent + to the L2 Cache level on some platforms (e.g. x86) or a specific + level between L2 and L3 on others (e.g. ARM Kunpeng 920). + Thanks to Jonathan Cameron for the help. + - HWLOC_DONT_MERGE_CLUSTER_GROUPS=1 may be set in the environment + to prevent these groups from being merged with identical caches, etc. + + Improve the oneAPI LevelZero backend: + - Expose subdevices such as "ze0.1" inside root OS devices ("ze0") + when the hardware contains multiple subdevices. + - Add many new attributes to describe device type, and the + numbers of slices, subslices, execution units and threads. + - Expose the memory information as LevelZeroHBM/DDR/MemorySize infos. + + Ignore the max frequencies of cores in Linux cpukinds when the + base frequencies are available (to avoid exposing hybrid CPUs + when Intel Turbo Boost Max 3.0 gives slightly different max + frequencies to CPU cores). + - May be reverted by setting HWLOC_CPUKINDS_MAXFREQ=1 in the environment. +* Tools + + Add --grey and --palette options to switch lstopo to greyscale or + white-background-only graphics, or to tune individual colors. +* Build + + Windows CMake builds now support non-MSVC compilers, detect several + features at build time, can build/run tests, etc. + Thanks to Michael Hirsch and Alexander Neumann. + + +Version 2.6.0 +------------- +* Backends + + Expose two cpukinds for energy-efficient cores (icestorm) and + high-performance cores (firestorm) on Apple M1 on Mac OS X. + + Use sysfs CPU "capacity" to rank hybrid cores by efficiency + on Linux when available (mostly on recent ARM platforms for now). + + Improve HWLOC_MEMBIND_BIND (without the STRICT flag) on Linux kernel + >= 5.15: If more than one node is given, the kernel may now use all + of them instead of only the first one before falling back to others. + + Expose cache os_index when available on Linux, it may be needed + when using resctrl to configure cache partitioning, memory bandwidth + monitoring, etc. + + Add a "XGMIHops" distances matrix in the RSMI backend for AMD GPU + interconnected through XGMI links. + + Expose AMD GPU memory information (VRAM and GTT) in the RSMI backend. + + Add OS devices such as "bxi0" for Atos/Bull BXI HCAs on Linux. +* Tools + + lstopo has a better placement algorithm with respect to I/O + objects, see --children-order in the manpage for details. + + hwloc-annotate may now change object subtypes and cache or memory + sizes. +* Build + + Allow to specify the ROCm installation for building the RSMI backend: + - Use a custom installation path if specified with --with-rocm=<dir>. + - Use /opt/rocm-<version> if specified with --with-rocm-version=<version> + or the ROCM_VERSION environment variable. + - Try /opt/rocm if it exists. + - See "How do I enable ROCm SMI and select which version to use?" + in the FAQ for details.
+ + Add a CMakeLists for Windows under contrib/windows-cmake/ . +* Documentation + + Add FAQ entry "How do I create a custom heterogeneous and + asymmetric topology?" + + Version 2.5.0 ------------- * API diff --git a/src/3rdparty/hwloc/VERSION b/src/3rdparty/hwloc/VERSION index a74f0a53..7486ae04 100644 --- a/src/3rdparty/hwloc/VERSION +++ b/src/3rdparty/hwloc/VERSION @@ -8,7 +8,7 @@ # Please update HWLOC_VERSION* in contrib/windows/hwloc_config.h too. major=2 -minor=5 +minor=7 release=0 # greek is used for alpha or beta release tags. If it is non-empty, @@ -22,7 +22,7 @@ greek= # The date when this release was created -date="Jun 14, 2021" +date="Dec 06, 2021" # If snapshot=1, then use the value from snapshot_version as the # entire hwloc version (i.e., ignore major, minor, release, and @@ -41,7 +41,7 @@ snapshot_version=${major}.${minor}.${release}${greek}-git # 2. Version numbers are described in the Libtool current:revision:age # format. -libhwloc_so_version=20:0:5 +libhwloc_so_version=20:2:5 libnetloc_so_version=0:0:0 # Please also update the lines in contrib/windows/libhwloc.vcxproj diff --git a/src/3rdparty/hwloc/include/hwloc.h b/src/3rdparty/hwloc/include/hwloc.h index 88fac968..b5f0f48a 100644 --- a/src/3rdparty/hwloc/include/hwloc.h +++ b/src/3rdparty/hwloc/include/hwloc.h @@ -346,7 +346,8 @@ typedef enum hwloc_obj_osdev_type_e { * For instance the "eth0" interface on Linux. */ HWLOC_OBJ_OSDEV_OPENFABRICS, /**< \brief Operating system openfabrics device. * For instance the "mlx4_0" InfiniBand HCA, - * or "hfi1_0" Omni-Path interface on Linux. */ + * "hfi1_0" Omni-Path interface, + * or "bxi0" Atos/Bull BXI HCA on Linux. */ HWLOC_OBJ_OSDEV_DMA, /**< \brief Operating system dma engine device. * For instance the "dma0chan0" DMA channel on Linux. */ HWLOC_OBJ_OSDEV_COPROC /**< \brief Operating system co-processor device. @@ -1212,8 +1213,9 @@ HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpus /** \brief Get current process or thread binding. * - * Writes into \p set the physical cpuset which the process or thread (according to \e - * flags) was last bound to. + * The CPU-set \p set (previously allocated by the caller) + * is filled with the list of PUs which the process or + * thread (according to \e flags) was last bound to. */ HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags); @@ -1231,6 +1233,10 @@ HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t s HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags); /** \brief Get the current physical binding of process \p pid. + * + * The CPU-set \p set (previously allocated by the caller) + * is filled with the list of PUs which the process + * was last bound to. * * \note \p hwloc_pid_t is \p pid_t on Unix platforms, * and \p HANDLE on native Windows platforms. @@ -1256,6 +1262,10 @@ HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thr #ifdef hwloc_thread_t /** \brief Get the current physical binding of thread \p tid. + * + * The CPU-set \p set (previously allocated by the caller) + * is filled with the list of PUs which the thread + * was last bound to. * * \note \p hwloc_thread_t is \p pthread_t on Unix platforms, * and \p HANDLE on native Windows platforms. 
@@ -1266,6 +1276,10 @@ HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thr #endif /** \brief Get the last physical CPU where the current process or thread ran. + * + * The CPU-set \p set (previously allocated by the caller) + * is filled with the list of PUs which the process or + * thread (according to \e flags) last ran on. * * The operating system may move some tasks from one processor * to another at any time according to their binding, @@ -1281,6 +1295,10 @@ HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thr HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_cpuset_t set, int flags); /** \brief Get the last physical CPU where a process ran. + * + * The CPU-set \p set (previously allocated by the caller) + * is filled with the list of PUs which the process + * last ran on. * * The operating system may move some tasks from one processor * to another at any time according to their binding, @@ -1511,6 +1529,9 @@ HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitm /** \brief Query the default memory binding policy and physical locality of the * current process or thread. * + * The bitmap \p set (previously allocated by the caller) + * is filled with the process or thread memory binding. + * * This function has two output parameters: \p set and \p policy. * The values returned in these parameters depend on both the \p flags * passed in and the current memory binding policies and nodesets in @@ -1571,6 +1592,9 @@ HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t /** \brief Query the default memory binding policy and physical locality of the * specified process. * + * The bitmap \p set (previously allocated by the caller) + * is filled with the process memory binding. + * * This function has two output parameters: \p set and \p policy. * The values returned in these parameters depend on both the \p flags * passed in and the current memory binding policies and nodesets in @@ -1624,6 +1648,9 @@ HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void /** \brief Query the CPUs near the physical NUMA node(s) and binding policy of * the memory identified by (\p addr, \p len ). * + * The bitmap \p set (previously allocated by the caller) + * is filled with the memory area binding. + * * This function has two output parameters: \p set and \p policy. * The values returned in these parameters depend on both the \p flags * passed in and the memory binding policies and nodesets of the pages @@ -1652,7 +1679,8 @@ HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void /** \brief Get the NUMA nodes where memory identified by (\p addr, \p len ) is physically allocated. * - * Fills \p set according to the NUMA nodes where the memory area pages + * The bitmap \p set (previously allocated by the caller) + * is filled according to the NUMA nodes where the memory area pages * are physically allocated. If no page is actually allocated yet, * \p set may be empty. * @@ -1698,9 +1726,12 @@ HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len, /** \brief Allocate some memory on NUMA memory nodes specified by \p set * - * This is similar to hwloc_alloc_membind_nodeset() except that it is allowed to change - * the current memory binding policy, thus providing more binding support, at - * the expense of changing the current state. + * First, try to allocate properly with hwloc_alloc_membind(). 
+ * On failure, the current process or thread memory binding policy + * is changed with hwloc_set_membind() before allocating memory. + * Thus this function works in more cases, at the expense of changing + * the current state (possibly affecting future allocations that + * would not specify any policy). * * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset. * Otherwise it's a cpuset. diff --git a/src/3rdparty/hwloc/include/hwloc/autogen/config.h b/src/3rdparty/hwloc/include/hwloc/autogen/config.h index eb70ba49..8d89fa25 100644 --- a/src/3rdparty/hwloc/include/hwloc/autogen/config.h +++ b/src/3rdparty/hwloc/include/hwloc/autogen/config.h @@ -1,6 +1,6 @@ /* * Copyright © 2009 CNRS - * Copyright © 2009-2020 Inria. All rights reserved. + * Copyright © 2009-2021 Inria. All rights reserved. * Copyright © 2009-2012 Université Bordeaux * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved. * See COPYING in top-level directory. @@ -11,9 +11,9 @@ #ifndef HWLOC_CONFIG_H #define HWLOC_CONFIG_H -#define HWLOC_VERSION "2.5.0" +#define HWLOC_VERSION "2.7.0" #define HWLOC_VERSION_MAJOR 2 -#define HWLOC_VERSION_MINOR 5 +#define HWLOC_VERSION_MINOR 7 #define HWLOC_VERSION_RELEASE 0 #define HWLOC_VERSION_GREEK "" diff --git a/src/3rdparty/hwloc/include/hwloc/cpukinds.h b/src/3rdparty/hwloc/include/hwloc/cpukinds.h index f240baf3..524a05af 100644 --- a/src/3rdparty/hwloc/include/hwloc/cpukinds.h +++ b/src/3rdparty/hwloc/include/hwloc/cpukinds.h @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Inria. All rights reserved. + * Copyright © 2020-2021 Inria. All rights reserved. * See COPYING in top-level directory. */ @@ -42,18 +42,23 @@ extern "C" { * (for instance the "CoreType" and "FrequencyMaxMHz", * see \ref topoattrs_cpukinds). * - * A higher efficiency value means intrinsic greater performance + * A higher efficiency value means greater intrinsic performance * (and possibly less performance/power efficiency). - * Kinds with lower efficiency are ranked first: + * Kinds with lower efficiency values are ranked first: * Passing 0 as \p kind_index to hwloc_cpukinds_get_info() will - * return information about the less efficient CPU kind. + * return information about the CPU kind with lower performance + * but higher energy-efficiency. + * Higher \p kind_index values would rather return information + * about power-hungry high-performance cores. * - * When available, efficiency values are gathered from the operating - * system (when \p cpukind_efficiency is set in the - * struct hwloc_topology_discovery_support array, only on Windows 10 for now). - * Otherwise hwloc tries to compute efficiencies - * by comparing CPU kinds using frequencies (on ARM), - * or core types and frequencies (on other architectures). + * When available, efficiency values are gathered from the operating system. + * If so, \p cpukind_efficiency is set in the struct hwloc_topology_discovery_support array. + * This is currently available on Windows 10, Mac OS X (Darwin), + * and on some Linux platforms where core "capacity" is exposed in sysfs. + * + * If the operating system does not expose core efficiencies natively, + * hwloc tries to compute efficiencies by comparing CPU kinds using + * frequencies (on ARM), or core types and frequencies (on other architectures). * The environment variable HWLOC_CPUKINDS_RANKING may be used * to change this heuristics, see \ref envvar. 
* diff --git a/src/3rdparty/hwloc/include/hwloc/distances.h b/src/3rdparty/hwloc/include/hwloc/distances.h index 6eac94e9..44cd7ea1 100644 --- a/src/3rdparty/hwloc/include/hwloc/distances.h +++ b/src/3rdparty/hwloc/include/hwloc/distances.h @@ -35,7 +35,8 @@ extern "C" { * from a core in another node. * The corresponding kind is ::HWLOC_DISTANCES_KIND_FROM_OS | ::HWLOC_DISTANCES_KIND_FROM_USER. * The name of this distances structure is "NUMALatency". - * Others distance structures include and "XGMIBandwidth" and "NVLinkBandwidth". + * Others distance structures include and "XGMIBandwidth", "XGMIHops" + * and "NVLinkBandwidth". * * The matrix may also contain bandwidths between random sets of objects, * possibly provided by the user, as specified in the \p kind attribute. @@ -159,7 +160,7 @@ hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type, * Usually only one distances structure may match a given name. * * The name of the most common structure is "NUMALatency". - * Others include "XGMIBandwidth" and "NVLinkBandwidth". + * Others include "XGMIBandwidth", "XGMIHops" and "NVLinkBandwidth". */ HWLOC_DECLSPEC int hwloc_distances_get_by_name(hwloc_topology_t topology, const char *name, diff --git a/src/3rdparty/hwloc/include/hwloc/linux.h b/src/3rdparty/hwloc/include/hwloc/linux.h index ecc86be3..d76633b0 100644 --- a/src/3rdparty/hwloc/include/hwloc/linux.h +++ b/src/3rdparty/hwloc/include/hwloc/linux.h @@ -1,6 +1,6 @@ /* * Copyright © 2009 CNRS - * Copyright © 2009-2016 Inria. All rights reserved. + * Copyright © 2009-2021 Inria. All rights reserved. * Copyright © 2009-2011 Université Bordeaux * See COPYING in top-level directory. */ @@ -44,6 +44,10 @@ extern "C" { HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_const_cpuset_t set); /** \brief Get the current binding of thread \p tid + * + * The CPU-set \p set (previously allocated by the caller) + * is filled with the list of PUs which the thread + * was last bound to. * * The behavior is exactly the same as the Linux sched_getaffinity system call, * but uses a hwloc cpuset. @@ -54,6 +58,9 @@ HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t HWLOC_DECLSPEC int hwloc_linux_get_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_cpuset_t set); /** \brief Get the last physical CPU where thread \p tid ran. + * + * The CPU-set \p set (previously allocated by the caller) + * is filled with the PU which the thread last ran on. * * \note This is equivalent to calling hwloc_get_proc_last_cpu_location() with * ::HWLOC_CPUBIND_THREAD as flags. 
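The cpubind and last-cpu-location documentation updates above all spell out the same caller-allocates pattern: the application creates the bitmap with hwloc_bitmap_alloc() and hwloc fills it with the PUs the task is bound to (or last ran on). The following is a minimal illustrative sketch of that pattern, not part of the patch, with error handling reduced to the essentials:

#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
  hwloc_topology_t topology;
  hwloc_cpuset_t set;
  char *str;

  hwloc_topology_init(&topology);
  hwloc_topology_load(topology);

  /* the caller allocates the cpuset, hwloc_get_cpubind() only fills it */
  set = hwloc_bitmap_alloc();
  if (hwloc_get_cpubind(topology, set, HWLOC_CPUBIND_THREAD) == 0) {
    hwloc_bitmap_asprintf(&str, set);
    printf("current thread bound to PUs %s\n", str);
    free(str);
  }

  /* same caller-allocates pattern for the last-run location */
  if (hwloc_get_last_cpu_location(topology, set, HWLOC_CPUBIND_THREAD) == 0) {
    hwloc_bitmap_asprintf(&str, set);
    printf("current thread last ran on PU(s) %s\n", str);
    free(str);
  }

  hwloc_bitmap_free(set);
  hwloc_topology_destroy(topology);
  return 0;
}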
diff --git a/src/3rdparty/hwloc/include/hwloc/plugins.h b/src/3rdparty/hwloc/include/hwloc/plugins.h index 6e4f1291..ed4b833d 100644 --- a/src/3rdparty/hwloc/include/hwloc/plugins.h +++ b/src/3rdparty/hwloc/include/hwloc/plugins.h @@ -497,6 +497,7 @@ hwloc_filter_check_pcidev_subtype_important(unsigned classid) return (baseclass == 0x03 /* PCI_BASE_CLASS_DISPLAY */ || baseclass == 0x02 /* PCI_BASE_CLASS_NETWORK */ || baseclass == 0x01 /* PCI_BASE_CLASS_STORAGE */ + || baseclass == 0x00 /* Unclassified, for Atos/Bull BXI */ || baseclass == 0x0b /* PCI_BASE_CLASS_PROCESSOR */ || classid == 0x0c04 /* PCI_CLASS_SERIAL_FIBER */ || classid == 0x0c06 /* PCI_CLASS_SERIAL_INFINIBAND */ diff --git a/src/3rdparty/hwloc/include/private/autogen/config.h b/src/3rdparty/hwloc/include/private/autogen/config.h index 687e82bc..5bf22fac 100644 --- a/src/3rdparty/hwloc/include/private/autogen/config.h +++ b/src/3rdparty/hwloc/include/private/autogen/config.h @@ -1,6 +1,6 @@ /* * Copyright © 2009, 2011, 2012 CNRS. All rights reserved. - * Copyright © 2009-2020 Inria. All rights reserved. + * Copyright © 2009-2021 Inria. All rights reserved. * Copyright © 2009, 2011, 2012, 2015 Université Bordeaux. All rights reserved. * Copyright © 2009-2020 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ @@ -290,10 +290,6 @@ /* Define to '1' if sysctlbyname is present and usable */ /* #undef HAVE_SYSCTLBYNAME */ -/* Define to 1 if the system has the type - `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */ -#define HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION 1 - /* Define to 1 if the system has the type `SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */ #define HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX 1 diff --git a/src/3rdparty/hwloc/include/private/misc.h b/src/3rdparty/hwloc/include/private/misc.h index 6c02d793..bc57e98e 100644 --- a/src/3rdparty/hwloc/include/private/misc.h +++ b/src/3rdparty/hwloc/include/private/misc.h @@ -504,7 +504,7 @@ hwloc__obj_type_is_icache(hwloc_obj_type_t type) } \ } while(0) #else /* HAVE_USELOCALE */ -#if __HWLOC_HAVE_ATTRIBUTE_UNUSED +#if HWLOC_HAVE_ATTRIBUTE_UNUSED #define hwloc_localeswitch_declare int __dummy_nolocale __hwloc_attribute_unused #define hwloc_localeswitch_init() #else diff --git a/src/3rdparty/hwloc/include/private/private.h b/src/3rdparty/hwloc/include/private/private.h index 5e216632..131b0796 100644 --- a/src/3rdparty/hwloc/include/private/private.h +++ b/src/3rdparty/hwloc/include/private/private.h @@ -480,6 +480,7 @@ extern char * hwloc_progname(struct hwloc_topology *topology); #define HWLOC_GROUP_KIND_AIX_SDL_UNKNOWN 210 /* subkind is SDL level */ #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220 /* no subkind */ #define HWLOC_GROUP_KIND_WINDOWS_RELATIONSHIP_UNKNOWN 221 /* no subkind */ +#define HWLOC_GROUP_KIND_LINUX_CLUSTER 222 /* no subkind */ /* distance groups */ #define HWLOC_GROUP_KIND_DISTANCE 900 /* subkind is round of adding these groups during distance based grouping */ /* finally, hwloc-specific groups required to insert something else, should disappear as soon as possible */ diff --git a/src/3rdparty/hwloc/include/private/windows.h b/src/3rdparty/hwloc/include/private/windows.h new file mode 100644 index 00000000..0a061b09 --- /dev/null +++ b/src/3rdparty/hwloc/include/private/windows.h @@ -0,0 +1,21 @@ +/* + * Copyright © 2009 Université Bordeaux + * Copyright © 2020 Inria. All rights reserved. + * + * See COPYING in top-level directory. 
+ */ + +#ifndef HWLOC_PRIVATE_WINDOWS_H +#define HWLOC_PRIVATE_WINDOWS_H + +#ifdef __GNUC__ +#define _ANONYMOUS_UNION __extension__ +#define _ANONYMOUS_STRUCT __extension__ +#else +#define _ANONYMOUS_UNION +#define _ANONYMOUS_STRUCT +#endif /* __GNUC__ */ +#define DUMMYUNIONNAME +#define DUMMYSTRUCTNAME + +#endif /* HWLOC_PRIVATE_WINDOWS_H */ diff --git a/src/3rdparty/hwloc/src/cpukinds.c b/src/3rdparty/hwloc/src/cpukinds.c index 074b7a73..fc05f17e 100644 --- a/src/3rdparty/hwloc/src/cpukinds.c +++ b/src/3rdparty/hwloc/src/cpukinds.c @@ -42,6 +42,9 @@ hwloc_internal_cpukinds_dup(hwloc_topology_t new, hwloc_topology_t old) struct hwloc_internal_cpukind_s *kinds; unsigned i; + if (!old->nr_cpukinds) + return 0; + kinds = hwloc_tma_malloc(tma, old->nr_cpukinds * sizeof(*kinds)); if (!kinds) return -1; @@ -445,7 +448,9 @@ static int hwloc__cpukinds_compare_ranking_values(const void *_a, const void *_b { const struct hwloc_internal_cpukind_s *a = _a; const struct hwloc_internal_cpukind_s *b = _b; - return a->ranking_value - b->ranking_value; + uint64_t arv = a->ranking_value; + uint64_t brv = b->ranking_value; + return arv < brv ? -1 : arv > brv ? 1 : 0; } /* this function requires ranking values to be unique */ diff --git a/src/3rdparty/hwloc/src/memattrs.c b/src/3rdparty/hwloc/src/memattrs.c index 16e9896e..92efe575 100644 --- a/src/3rdparty/hwloc/src/memattrs.c +++ b/src/3rdparty/hwloc/src/memattrs.c @@ -1,5 +1,5 @@ /* - * Copyright © 2020 Inria. All rights reserved. + * Copyright © 2020-2021 Inria. All rights reserved. * See COPYING in top-level directory. */ @@ -127,6 +127,8 @@ hwloc_internal_memattrs_dup(struct hwloc_topology *new, struct hwloc_topology *o struct hwloc_internal_memattr_s *imattrs; hwloc_memattr_id_t id; + /* old->nr_memattrs is always > 0 thanks to default memattrs */ + imattrs = hwloc_tma_malloc(tma, old->nr_memattrs * sizeof(*imattrs)); if (!imattrs) return -1; diff --git a/src/3rdparty/hwloc/src/pci-common.c b/src/3rdparty/hwloc/src/pci-common.c index 24626860..977475eb 100644 --- a/src/3rdparty/hwloc/src/pci-common.c +++ b/src/3rdparty/hwloc/src/pci-common.c @@ -810,13 +810,14 @@ hwloc_pcidisc_find_linkspeed(const unsigned char *config, * PCIe Gen3 = 8 GT/s signal-rate per lane with 128/130 encoding = 1 GB/s data-rate per lane * PCIe Gen4 = 16 GT/s signal-rate per lane with 128/130 encoding = 2 GB/s data-rate per lane * PCIe Gen5 = 32 GT/s signal-rate per lane with 128/130 encoding = 4 GB/s data-rate per lane + * PCIe Gen6 = 64 GT/s signal-rate per lane with 128/130 encoding = 8 GB/s data-rate per lane */ /* lanespeed in Gbit/s */ if (speed <= 2) lanespeed = 2.5f * speed * 0.8f; else - lanespeed = 8.0f * (1<<(speed-3)) * 128/130; /* assume Gen6 will be 64 GT/s and so on */ + lanespeed = 8.0f * (1<<(speed-3)) * 128/130; /* assume Gen7 will be 128 GT/s and so on */ /* linkspeed in GB/s */ *linkspeed = lanespeed * width / 8; diff --git a/src/3rdparty/hwloc/src/topology-windows.c b/src/3rdparty/hwloc/src/topology-windows.c index d67c6b99..df93c5e9 100644 --- a/src/3rdparty/hwloc/src/topology-windows.c +++ b/src/3rdparty/hwloc/src/topology-windows.c @@ -13,6 +13,7 @@ #include "hwloc.h" #include "hwloc/windows.h" #include "private/private.h" +#include "private/windows.h" /* must be before windows.h */ #include "private/debug.h" #include @@ -65,26 +66,6 @@ typedef enum _LOGICAL_PROCESSOR_RELATIONSHIP { # endif /* HAVE_RELATIONPROCESSORPACKAGE */ #endif /* HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */ -#ifndef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION -typedef struct 
_SYSTEM_LOGICAL_PROCESSOR_INFORMATION { - ULONG_PTR ProcessorMask; - LOGICAL_PROCESSOR_RELATIONSHIP Relationship; - _ANONYMOUS_UNION - union { - struct { - BYTE flags; - } ProcessorCore; - struct { - DWORD NodeNumber; - } NumaNode; - CACHE_DESCRIPTOR Cache; - ULONGLONG Reserved[2]; - } DUMMYUNIONNAME; -} SYSTEM_LOGICAL_PROCESSOR_INFORMATION, *PSYSTEM_LOGICAL_PROCESSOR_INFORMATION; -#endif - -/* Extended interface, for group support */ - #ifndef HAVE_GROUP_AFFINITY typedef struct _GROUP_AFFINITY { KAFFINITY Mask; @@ -93,35 +74,40 @@ typedef struct _GROUP_AFFINITY { } GROUP_AFFINITY, *PGROUP_AFFINITY; #endif -#ifndef HAVE_PROCESSOR_RELATIONSHIP +/* always use our own structure because the EfficiencyClass field didn't exist before Win10 */ typedef struct HWLOC_PROCESSOR_RELATIONSHIP { BYTE Flags; - BYTE EfficiencyClass; /* for RelationProcessorCore, higher means greater performance but less efficiency, only available in Win10+ */ + BYTE EfficiencyClass; /* for RelationProcessorCore, higher means greater performance but less efficiency */ BYTE Reserved[20]; WORD GroupCount; GROUP_AFFINITY GroupMask[ANYSIZE_ARRAY]; -} PROCESSOR_RELATIONSHIP, *PPROCESSOR_RELATIONSHIP; -#endif +} HWLOC_PROCESSOR_RELATIONSHIP; -#ifndef HAVE_NUMA_NODE_RELATIONSHIP -typedef struct _NUMA_NODE_RELATIONSHIP { +/* always use our own structure because the GroupCount and GroupMasks fields didn't exist in some Win10 */ +typedef struct HWLOC_NUMA_NODE_RELATIONSHIP { DWORD NodeNumber; - BYTE Reserved[20]; - GROUP_AFFINITY GroupMask; -} NUMA_NODE_RELATIONSHIP, *PNUMA_NODE_RELATIONSHIP; -#endif + BYTE Reserved[18]; + WORD GroupCount; + _ANONYMOUS_UNION + union { + GROUP_AFFINITY GroupMask; + GROUP_AFFINITY GroupMasks[ANYSIZE_ARRAY]; + } DUMMYUNIONNAME; +} HWLOC_NUMA_NODE_RELATIONSHIP; -#ifndef HAVE_CACHE_RELATIONSHIP -typedef struct _CACHE_RELATIONSHIP { +typedef struct HWLOC_CACHE_RELATIONSHIP { BYTE Level; BYTE Associativity; WORD LineSize; DWORD CacheSize; PROCESSOR_CACHE_TYPE Type; - BYTE Reserved[20]; - GROUP_AFFINITY GroupMask; -} CACHE_RELATIONSHIP, *PCACHE_RELATIONSHIP; -#endif + BYTE Reserved[18]; + WORD GroupCount; + union { + GROUP_AFFINITY GroupMask; + GROUP_AFFINITY GroupMasks[ANYSIZE_ARRAY]; + } DUMMYUNIONNAME; +} HWLOC_CACHE_RELATIONSHIP; #ifndef HAVE_PROCESSOR_GROUP_INFO typedef struct _PROCESSOR_GROUP_INFO { @@ -141,20 +127,19 @@ typedef struct _GROUP_RELATIONSHIP { } GROUP_RELATIONSHIP, *PGROUP_RELATIONSHIP; #endif -#ifndef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX -typedef struct _SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX { +/* always use our own structure because we need our own HWLOC_PROCESSOR/CACHE/NUMA_NODE_RELATIONSHIP */ +typedef struct HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX { LOGICAL_PROCESSOR_RELATIONSHIP Relationship; DWORD Size; _ANONYMOUS_UNION union { - PROCESSOR_RELATIONSHIP Processor; - NUMA_NODE_RELATIONSHIP NumaNode; - CACHE_RELATIONSHIP Cache; + HWLOC_PROCESSOR_RELATIONSHIP Processor; + HWLOC_NUMA_NODE_RELATIONSHIP NumaNode; + HWLOC_CACHE_RELATIONSHIP Cache; GROUP_RELATIONSHIP Group; /* Odd: no member to tell the cpu mask of the package... 
*/ } DUMMYUNIONNAME; -} SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, *PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX; -#endif +} HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX; #ifndef HAVE_PSAPI_WORKING_SET_EX_BLOCK typedef union _PSAPI_WORKING_SET_EX_BLOCK { @@ -200,10 +185,7 @@ static PFN_GETCURRENTPROCESSORNUMBER GetCurrentProcessorNumberProc; typedef VOID (WINAPI *PFN_GETCURRENTPROCESSORNUMBEREX)(PPROCESSOR_NUMBER); static PFN_GETCURRENTPROCESSORNUMBEREX GetCurrentProcessorNumberExProc; -typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATION)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION Buffer, PDWORD ReturnLength); -static PFN_GETLOGICALPROCESSORINFORMATION GetLogicalProcessorInformationProc; - -typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATIONEX)(LOGICAL_PROCESSOR_RELATIONSHIP relationship, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX Buffer, PDWORD ReturnLength); +typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATIONEX)(LOGICAL_PROCESSOR_RELATIONSHIP relationship, HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *Buffer, PDWORD ReturnLength); static PFN_GETLOGICALPROCESSORINFORMATIONEX GetLogicalProcessorInformationExProc; typedef BOOL (WINAPI *PFN_SETTHREADGROUPAFFINITY)(HANDLE hThread, const GROUP_AFFINITY *GroupAffinity, PGROUP_AFFINITY PreviousGroupAffinity); @@ -244,8 +226,6 @@ static void hwloc_win_get_function_ptrs(void) (PFN_GETACTIVEPROCESSORGROUPCOUNT) GetProcAddress(kernel32, "GetActiveProcessorGroupCount"); GetActiveProcessorCountProc = (PFN_GETACTIVEPROCESSORCOUNT) GetProcAddress(kernel32, "GetActiveProcessorCount"); - GetLogicalProcessorInformationProc = - (PFN_GETLOGICALPROCESSORINFORMATION) GetProcAddress(kernel32, "GetLogicalProcessorInformation"); GetCurrentProcessorNumberProc = (PFN_GETCURRENTPROCESSORNUMBER) GetProcAddress(kernel32, "GetCurrentProcessorNumber"); GetCurrentProcessorNumberExProc = @@ -370,13 +350,13 @@ static hwloc_cpuset_t * processor_group_cpusets = NULL; static void hwloc_win_get_processor_groups(void) { - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX procInfoTotal, tmpprocInfoTotal, procInfo; + HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *procInfoTotal, *tmpprocInfoTotal, *procInfo; DWORD length; unsigned i; hwloc_debug("querying windows processor groups\n"); - if (!GetActiveProcessorGroupCountProc || !GetLogicalProcessorInformationExProc) + if (!GetLogicalProcessorInformationExProc) goto error; nr_processor_groups = GetActiveProcessorGroupCountProc(); @@ -415,6 +395,8 @@ hwloc_win_get_processor_groups(void) assert(procInfo->Relationship == RelationGroup); + hwloc_debug("Found %u active windows processor groups\n", + (unsigned) procInfo->Group.ActiveGroupCount); for (id = 0; id < procInfo->Group.ActiveGroupCount; id++) { KAFFINITY mask; hwloc_bitmap_t set; @@ -424,8 +406,8 @@ hwloc_win_get_processor_groups(void) goto error_with_cpusets; mask = procInfo->Group.GroupInfo[id].ActiveProcessorMask; - hwloc_debug("group %u %d cpus mask %lx\n", id, - procInfo->Group.GroupInfo[id].ActiveProcessorCount, mask); + hwloc_debug("group %u with %u cpus mask 0x%llx\n", id, + (unsigned) procInfo->Group.GroupInfo[id].ActiveProcessorCount, (unsigned long long) mask); /* KAFFINITY is ULONG_PTR */ hwloc_bitmap_set_ith_ULONG_PTR(set, id, mask); /* FIXME: what if running 32bits on a 64bits windows with 64-processor groups? 
@@ -1008,6 +990,8 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta unsigned hostname_size = sizeof(hostname); int has_efficiencyclass = 0; struct hwloc_win_efficiency_classes eclasses; + char *env = getenv("HWLOC_WINDOWS_PROCESSOR_GROUP_OBJS"); + int keep_pgroup_objs = (env && atoi(env)); assert(dstatus->phase == HWLOC_DISC_PHASE_CPU); @@ -1038,137 +1022,8 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta GetSystemInfo(&SystemInfo); - if (!GetLogicalProcessorInformationExProc && GetLogicalProcessorInformationProc) { - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION procInfo, tmpprocInfo; - unsigned id; - unsigned i; - struct hwloc_obj *obj; - hwloc_obj_type_t type; - - length = 0; - procInfo = NULL; - - while (1) { - if (GetLogicalProcessorInformationProc(procInfo, &length)) - break; - if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) - return -1; - tmpprocInfo = realloc(procInfo, length); - if (!tmpprocInfo) { - free(procInfo); - goto out; - } - procInfo = tmpprocInfo; - } - - assert(!length || procInfo); - - for (i = 0; i < length / sizeof(*procInfo); i++) { - - /* Ignore unknown caches */ - if (procInfo->Relationship == RelationCache - && procInfo->Cache.Type != CacheUnified - && procInfo->Cache.Type != CacheData - && procInfo->Cache.Type != CacheInstruction) - continue; - - id = HWLOC_UNKNOWN_INDEX; - switch (procInfo[i].Relationship) { - case RelationNumaNode: - type = HWLOC_OBJ_NUMANODE; - id = procInfo[i].NumaNode.NodeNumber; - gotnuma++; - if (id > max_numanode_index) - max_numanode_index = id; - break; - case RelationProcessorPackage: - type = HWLOC_OBJ_PACKAGE; - break; - case RelationCache: - type = (procInfo[i].Cache.Type == CacheInstruction ? HWLOC_OBJ_L1ICACHE : HWLOC_OBJ_L1CACHE) + procInfo[i].Cache.Level - 1; - break; - case RelationProcessorCore: - type = HWLOC_OBJ_CORE; - break; - case RelationGroup: - default: - type = HWLOC_OBJ_GROUP; - break; - } - - if (!hwloc_filter_check_keep_object_type(topology, type)) - continue; - - obj = hwloc_alloc_setup_object(topology, type, id); - obj->cpuset = hwloc_bitmap_alloc(); - hwloc_debug("%s#%u mask %llx\n", hwloc_obj_type_string(type), id, (unsigned long long) procInfo[i].ProcessorMask); - /* ProcessorMask is a ULONG_PTR */ - hwloc_bitmap_set_ith_ULONG_PTR(obj->cpuset, 0, procInfo[i].ProcessorMask); - hwloc_debug_2args_bitmap("%s#%u bitmap %s\n", hwloc_obj_type_string(type), id, obj->cpuset); - - switch (type) { - case HWLOC_OBJ_NUMANODE: - { - ULONGLONG avail; - obj->nodeset = hwloc_bitmap_alloc(); - hwloc_bitmap_set(obj->nodeset, id); - if ((GetNumaAvailableMemoryNodeExProc && GetNumaAvailableMemoryNodeExProc(id, &avail)) - || (GetNumaAvailableMemoryNodeProc && GetNumaAvailableMemoryNodeProc(id, &avail))) { - obj->attr->numanode.local_memory = avail; - gotnumamemory++; - } - obj->attr->numanode.page_types_len = 2; - obj->attr->numanode.page_types = malloc(2 * sizeof(*obj->attr->numanode.page_types)); - memset(obj->attr->numanode.page_types, 0, 2 * sizeof(*obj->attr->numanode.page_types)); - obj->attr->numanode.page_types_len = 1; - obj->attr->numanode.page_types[0].size = SystemInfo.dwPageSize; -#if HAVE_DECL__SC_LARGE_PAGESIZE - obj->attr->numanode.page_types_len++; - obj->attr->numanode.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE); -#endif - break; - } - case HWLOC_OBJ_L1CACHE: - case HWLOC_OBJ_L2CACHE: - case HWLOC_OBJ_L3CACHE: - case HWLOC_OBJ_L4CACHE: - case HWLOC_OBJ_L5CACHE: - case HWLOC_OBJ_L1ICACHE: - case HWLOC_OBJ_L2ICACHE: - case HWLOC_OBJ_L3ICACHE: - 
obj->attr->cache.size = procInfo[i].Cache.Size; - obj->attr->cache.associativity = procInfo[i].Cache.Associativity == CACHE_FULLY_ASSOCIATIVE ? -1 : procInfo[i].Cache.Associativity ; - obj->attr->cache.linesize = procInfo[i].Cache.LineSize; - obj->attr->cache.depth = procInfo[i].Cache.Level; - switch (procInfo->Cache.Type) { - case CacheUnified: - obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED; - break; - case CacheData: - obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA; - break; - case CacheInstruction: - obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION; - break; - default: - hwloc_free_unlinked_object(obj); - continue; - } - break; - case HWLOC_OBJ_GROUP: - obj->attr->group.kind = procInfo[i].Relationship == RelationGroup ? HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP : HWLOC_GROUP_KIND_WINDOWS_RELATIONSHIP_UNKNOWN; - break; - default: - break; - } - hwloc__insert_object_by_cpuset(topology, NULL, obj, "windows:GetLogicalProcessorInformation"); - } - - free(procInfo); - } - if (GetLogicalProcessorInformationExProc) { - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX procInfoTotal, tmpprocInfoTotal, procInfo; + HWLOC_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *procInfoTotal, *tmpprocInfoTotal, *procInfo; unsigned id; struct hwloc_obj *obj; hwloc_obj_type_t type; @@ -1207,8 +1062,16 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta switch (procInfo->Relationship) { case RelationNumaNode: type = HWLOC_OBJ_NUMANODE; - num = 1; - GroupMask = &procInfo->NumaNode.GroupMask; + /* Starting with Windows 11 and Server 2022, the GroupCount field is valid and >=1 + * and we may read GroupMasks[]. Older releases have GroupCount==0 and we must read GroupMask. + */ + if (procInfo->NumaNode.GroupCount) { + num = procInfo->NumaNode.GroupCount; + GroupMask = procInfo->NumaNode.GroupMasks; + } else { + num = 1; + GroupMask = &procInfo->NumaNode.GroupMask; + } id = procInfo->NumaNode.NodeNumber; gotnuma++; if (id > max_numanode_index) @@ -1221,18 +1084,20 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta break; case RelationCache: type = (procInfo->Cache.Type == CacheInstruction ? HWLOC_OBJ_L1ICACHE : HWLOC_OBJ_L1CACHE) + procInfo->Cache.Level - 1; - num = 1; - GroupMask = &procInfo->Cache.GroupMask; + /* GroupCount added approximately with NumaNode.GroupCount above */ + if (procInfo->Cache.GroupCount) { + num = procInfo->Cache.GroupCount; + GroupMask = procInfo->Cache.GroupMasks; + } else { + num = 1; + GroupMask = &procInfo->Cache.GroupMask; + } break; case RelationProcessorCore: type = HWLOC_OBJ_CORE; num = procInfo->Processor.GroupCount; GroupMask = procInfo->Processor.GroupMask; - if (has_efficiencyclass) - /* the EfficiencyClass field didn't exist before Windows10 and recent MSVC headers, - * so just access it manually instead of trying to detect it. - */ - efficiency_class = * ((&procInfo->Processor.Flags) + 1); + efficiency_class = procInfo->Processor.EfficiencyClass; break; case RelationGroup: /* So strange an interface... 
*/ @@ -1257,11 +1122,12 @@ hwloc_look_windows(struct hwloc_backend *backend, struct hwloc_disc_status *dsta groups_pu_set = hwloc_bitmap_alloc(); hwloc_bitmap_or(groups_pu_set, groups_pu_set, set); - if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) { + /* Ignore processor groups unless requested and filtered-in */ + if (keep_pgroup_objs && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) { obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, id); obj->cpuset = set; obj->attr->group.kind = HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP; - hwloc__insert_object_by_cpuset(topology, NULL, obj, "windows:GetLogicalProcessorInformation:ProcessorGroup"); + hwloc__insert_object_by_cpuset(topology, NULL, obj, "windows:GetLogicalProcessorInformationEx:ProcessorGroup"); } else hwloc_bitmap_free(set); } diff --git a/src/3rdparty/hwloc/src/topology-x86.c b/src/3rdparty/hwloc/src/topology-x86.c index 94f9d453..42172eca 100644 --- a/src/3rdparty/hwloc/src/topology-x86.c +++ b/src/3rdparty/hwloc/src/topology-x86.c @@ -500,7 +500,8 @@ static void read_amd_cores_topoext(struct procinfo *infos, unsigned long flags, nodes_per_proc = ((ecx >> 8) & 7) + 1; } if ((infos->cpufamilynumber == 0x15 && nodes_per_proc > 2) - || ((infos->cpufamilynumber == 0x17 || infos->cpufamilynumber == 0x18) && nodes_per_proc > 4)) { + || ((infos->cpufamilynumber == 0x17 || infos->cpufamilynumber == 0x18) && nodes_per_proc > 4) + || (infos->cpufamilynumber == 0x19 && nodes_per_proc > 1)) { hwloc_debug("warning: undefined nodes_per_proc value %u, assuming it means %u\n", nodes_per_proc, nodes_per_proc); } } @@ -775,13 +776,19 @@ static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, uns } else if (cpuid_type == amd) { /* AMD quirks */ - if (infos->cpufamilynumber == 0x17 - && cache->level == 3 && cache->nbthreads_sharing == 6) { - /* AMD family 0x17 always shares L3 between 8 APIC ids, - * even when only 6 APIC ids are enabled and reported in nbthreads_sharing - * (on 24-core CPUs). + if (infos->cpufamilynumber >= 0x17 && cache->level == 3) { + /* AMD family 0x19 always shares L3 between 16 APIC ids (8 HT cores). + * while Family 0x17 shares between 8 APIC ids (4 HT cores). + * But many models have less APIC ids enabled and reported in nbthreads_sharing. + * It means we must round-up nbthreads_sharing to the nearest power of 2 + * before computing cacheid. */ - cache->cacheid = infos->apicid / 8; + unsigned nbapics_sharing = cache->nbthreads_sharing; + if (nbapics_sharing & (nbapics_sharing-1)) + /* not a power of two, round-up */ + nbapics_sharing = 1U<<(1+hwloc_ffsl(nbapics_sharing)); + + cache->cacheid = infos->apicid / nbapics_sharing; } else if (infos->cpufamilynumber== 0x10 && infos->cpumodelnumber == 0x9 && cache->level == 3 @@ -807,7 +814,7 @@ static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, uns } else if (infos->cpufamilynumber == 0x15 && (infos->cpumodelnumber == 0x1 /* Bulldozer */ || infos->cpumodelnumber == 0x2 /* Piledriver */) && cache->level == 3 && cache->nbthreads_sharing == 6) { - /* AMD Bulldozer and Piledriver 12-core processors have same APIC ids as Magny-Cours below, + /* AMD Bulldozer and Piledriver 12-core processors have same APIC ids as Magny-Cours above, * but we can't merge the checks because the original nbthreads_sharing must be exactly 6 here. 
*/ cache->cacheid = (infos->apicid % legacy_max_log_proc) / cache->nbthreads_sharing /* cacheid within the package */ @@ -1231,6 +1238,18 @@ static void summarize(struct hwloc_backend *backend, struct procinfo *infos, uns } } cache = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX); + /* We don't specify the os_index of caches because we want to be + * 100% sure they are identical to what the Linux kernel reports + * (so that things like resctrl work). + * However, vendor/model-specific quirks in the x86 code above + * make this difficult. + * + * Caveat: if the x86 backend is used on Linux to avoid kernel bugs, + * IDs won't be available to resctrl users. But resctrl heavily + * relies on the kernel x86 discovery being non-buggy anyway. + * + * TODO: make this optional? or only disable it on Linux? + */ cache->attr->cache.depth = level; cache->attr->cache.size = infos[i].cache[l].size; cache->attr->cache.linesize = infos[i].cache[l].linesize; @@ -1688,7 +1707,7 @@ hwloc_x86_check_cpuiddump_input(const char *src_cpuiddump_path, hwloc_bitmap_t s char line [32]; dir = opendir(src_cpuiddump_path); - if (!dir) + if (!dir) return -1; path = malloc(strlen(src_cpuiddump_path) + strlen("/hwloc-cpuid-info") + 1); diff --git a/src/3rdparty/hwloc/src/topology-xml.c b/src/3rdparty/hwloc/src/topology-xml.c index 87e91010..2075d6fa 100644 --- a/src/3rdparty/hwloc/src/topology-xml.c +++ b/src/3rdparty/hwloc/src/topology-xml.c @@ -243,7 +243,7 @@ hwloc__xml_import_object_attr(struct hwloc_topology *topology, else if (!strcmp(name, "dont_merge")) { unsigned long lvalue = strtoul(value, NULL, 10); if (obj->type == HWLOC_OBJ_GROUP) - obj->attr->group.dont_merge = lvalue; + obj->attr->group.dont_merge = (unsigned char) lvalue; else if (hwloc__xml_verbose()) fprintf(stderr, "%s: ignoring dont_merge attribute for non-group object type\n", state->global->msgprefix); @@ -2825,6 +2825,7 @@ hwloc__xml_v1export_object_with_memory(hwloc__xml_export_state_t parentstate, hw /* child has sibling, we must add a Group around those memory children */ hwloc_obj_t group = parentstate->global->v1_memory_group; parentstate->new_child(parentstate, &gstate, "object"); + group->parent = obj->parent; group->cpuset = obj->cpuset; group->complete_cpuset = obj->complete_cpuset; group->nodeset = obj->nodeset; diff --git a/src/3rdparty/hwloc/src/topology.c b/src/3rdparty/hwloc/src/topology.c index 01e5a863..c0f39c77 100644 --- a/src/3rdparty/hwloc/src/topology.c +++ b/src/3rdparty/hwloc/src/topology.c @@ -69,7 +69,7 @@ * it will break in cygwin, we'll have to use both putenv() and SetEnvironmentVariable(). * Hopefully L0 will be provide a way to enable Sysman without env vars before it happens. 
*/ -#ifdef HWLOC_HAVE_ATTRIBUTE_CONSTRUCTOR +#if HWLOC_HAVE_ATTRIBUTE_CONSTRUCTOR static void hwloc_constructor(void) __attribute__((constructor)); static void hwloc_constructor(void) { @@ -1901,6 +1901,9 @@ hwloc_topology_alloc_group_object(struct hwloc_topology *topology) static void hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root); static void propagate_total_memory(hwloc_obj_t obj); static void hwloc_set_group_depth(hwloc_topology_t topology); +static void hwloc_connect_children(hwloc_obj_t parent); +static int hwloc_connect_levels(hwloc_topology_t topology); +static int hwloc_connect_special_levels(hwloc_topology_t topology); hwloc_obj_t hwloc_topology_insert_group_object(struct hwloc_topology *topology, hwloc_obj_t obj) @@ -2474,13 +2477,26 @@ hwloc_compare_levels_structure(hwloc_topology_t topology, unsigned i) return 0; } -/* return > 0 if any level was removed, which means reconnect is needed */ -static void +/* return > 0 if any level was removed. + * performs its own reconnect internally if needed + */ +static int hwloc_filter_levels_keep_structure(hwloc_topology_t topology) { unsigned i, j; int res = 0; + if (topology->modified) { + /* WARNING: hwloc_topology_reconnect() is duplicated partially here + * and at the end of this function: + * - we need normal levels before merging. + * - and we'll need to update special levels after merging. + */ + hwloc_connect_children(topology->levels[0][0]); + if (hwloc_connect_levels(topology) < 0) + return -1; + } + /* start from the bottom since we'll remove intermediate levels */ for(i=topology->nb_levels-1; i>0; i--) { int replacechild = 0, replaceparent = 0; @@ -2646,6 +2662,22 @@ hwloc_filter_levels_keep_structure(hwloc_topology_t topology) topology->type_depth[type] = HWLOC_TYPE_DEPTH_MULTIPLE; } } + + + if (res > 0 || topology-> modified) { + /* WARNING: hwloc_topology_reconnect() is duplicated partially here + * and at the beginning of this function. + * If we merged some levels, some child+parent special children lisst + * may have been merged, hence specials level might need reordering, + * So reconnect special levels only here at the end + * (it's not needed at the beginning of this function). + */ + if (hwloc_connect_special_levels(topology) < 0) + return -1; + topology->modified = 0; + } + + return 0; } static void @@ -2963,9 +2995,9 @@ hwloc_list_special_objects(hwloc_topology_t topology, hwloc_obj_t obj) } } -/* Build I/O levels */ +/* Build Memory, I/O and Misc levels */ static int -hwloc_connect_io_misc_levels(hwloc_topology_t topology) +hwloc_connect_special_levels(hwloc_topology_t topology) { unsigned i; @@ -3176,6 +3208,10 @@ hwloc_connect_levels(hwloc_topology_t topology) int hwloc_topology_reconnect(struct hwloc_topology *topology, unsigned long flags) { + /* WARNING: when updating this function, the replicated code must + * also be updated inside hwloc_filter_levels_keep_structure() + */ + if (flags) { errno = EINVAL; return -1; @@ -3188,7 +3224,7 @@ hwloc_topology_reconnect(struct hwloc_topology *topology, unsigned long flags) if (hwloc_connect_levels(topology) < 0) return -1; - if (hwloc_connect_io_misc_levels(topology) < 0) + if (hwloc_connect_special_levels(topology) < 0) return -1; topology->modified = 0; @@ -3529,15 +3565,12 @@ hwloc_discover(struct hwloc_topology *topology, } hwloc_debug_print_objects(0, topology->levels[0][0]); - /* Reconnect things after all these changes. - * Often needed because of Groups inserted for I/Os. - * And required for KEEP_STRUCTURE below. 
- */ - if (hwloc_topology_reconnect(topology, 0) < 0) - return -1; - hwloc_debug("%s", "\nRemoving levels with HWLOC_TYPE_FILTER_KEEP_STRUCTURE\n"); - hwloc_filter_levels_keep_structure(topology); + if (hwloc_filter_levels_keep_structure(topology) < 0) + return -1; + /* takes care of reconnecting children/levels internally, + * because it needs normal levels. + * and it's often needed below because of Groups inserted for I/Os anyway */ hwloc_debug_print_objects(0, topology->levels[0][0]); /* accumulate children memory in total_memory fields (only once parent is set) */ @@ -4360,14 +4393,13 @@ hwloc_topology_restrict(struct hwloc_topology *topology, hwloc_const_bitmap_t se hwloc_bitmap_free(droppedcpuset); hwloc_bitmap_free(droppednodeset); - if (hwloc_topology_reconnect(topology, 0) < 0) + if (hwloc_filter_levels_keep_structure(topology) < 0) /* takes care of reconnecting internally */ goto out; /* some objects may have disappeared, we need to update distances objs arrays */ hwloc_internal_distances_invalidate_cached_objs(topology); hwloc_internal_memattrs_need_refresh(topology); - hwloc_filter_levels_keep_structure(topology); hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]); propagate_total_memory(topology->levels[0][0]); hwloc_internal_cpukinds_restrict(topology);
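Several entries in this update (the Apple M1 cpukinds, ranking by the Linux sysfs "capacity", the Intel hybrid max-frequency tweak, and the rewritten cpukinds.h documentation) rely on the same ordering rule: kind index 0 is the kind with the lowest efficiency value (energy-efficient cores), and higher indexes are the higher-performance kinds. A minimal sketch of how an application may enumerate those kinds with the public API, assuming an already-loaded topology and omitting error handling:

#include <hwloc.h>
#include <hwloc/cpukinds.h>
#include <stdio.h>
#include <stdlib.h>

static void print_cpukinds(hwloc_topology_t topology)
{
  int nr = hwloc_cpukinds_get_nr(topology, 0);
  hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
  int kind;

  for (kind = 0; kind < nr; kind++) {
    int efficiency;
    unsigned nr_infos, i;
    struct hwloc_info_s *infos;
    char *str;

    /* kind 0 = lowest performance but most energy-efficient cores,
     * the last kind = power-hungry high-performance cores */
    hwloc_cpukinds_get_info(topology, kind, cpuset, &efficiency,
                            &nr_infos, &infos, 0);
    hwloc_bitmap_asprintf(&str, cpuset);
    printf("kind %d (efficiency %d): PUs %s\n", kind, efficiency, str);
    free(str);
    for (i = 0; i < nr_infos; i++)
      printf("  %s = %s\n", infos[i].name, infos[i].value); /* e.g. CoreType, FrequencyMaxMHz */
  }
  hwloc_bitmap_free(cpuset);
}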