From 2b29b81b898e7d547f8b80d0bd38aa8632d09dfd Mon Sep 17 00:00:00 2001
From: XMRig <support@xmrig.com>
Date: Sun, 28 Jul 2019 09:24:53 +0700
Subject: [PATCH] Use internal hwloc for MSVC.

---
 CMakeLists.txt                                |    2 +-
 src/3rdparty/hwloc/AUTHORS                    |   44 +
 src/3rdparty/hwloc/CMakeLists.txt             |   38 +
 src/3rdparty/hwloc/COPYING                    |   39 +
 src/3rdparty/hwloc/NEWS                       | 1599 ++++++
 src/3rdparty/hwloc/README                     |   85 +
 src/3rdparty/hwloc/VERSION                    |   47 +
 src/3rdparty/hwloc/include/hwloc.h            | 2270 +++++++++
 .../hwloc/include/hwloc/autogen/config.h      |   59 +
 src/3rdparty/hwloc/include/hwloc/bitmap.h     |  467 ++
 src/3rdparty/hwloc/include/hwloc/cuda.h       |  220 +
 src/3rdparty/hwloc/include/hwloc/cudart.h     |  177 +
 src/3rdparty/hwloc/include/hwloc/deprecated.h |  206 +
 src/3rdparty/hwloc/include/hwloc/diff.h       |  289 ++
 src/3rdparty/hwloc/include/hwloc/distances.h  |  271 +
 src/3rdparty/hwloc/include/hwloc/export.h     |  278 +
 src/3rdparty/hwloc/include/hwloc/gl.h         |  135 +
 .../hwloc/include/hwloc/glibc-sched.h         |  125 +
 src/3rdparty/hwloc/include/hwloc/helper.h     | 1160 +++++
 src/3rdparty/hwloc/include/hwloc/inlines.h    |  146 +
 src/3rdparty/hwloc/include/hwloc/intel-mic.h  |  134 +
 .../hwloc/include/hwloc/linux-libnuma.h       |  273 +
 src/3rdparty/hwloc/include/hwloc/linux.h      |   79 +
 src/3rdparty/hwloc/include/hwloc/nvml.h       |  181 +
 src/3rdparty/hwloc/include/hwloc/opencl.h     |  206 +
 .../hwloc/include/hwloc/openfabrics-verbs.h   |  150 +
 src/3rdparty/hwloc/include/hwloc/plugins.h    |  542 ++
 src/3rdparty/hwloc/include/hwloc/rename.h     |  765 +++
 src/3rdparty/hwloc/include/hwloc/shmem.h      |  137 +
 .../hwloc/include/private/autogen/config.h    |  672 +++
 .../hwloc/include/private/components.h        |   43 +
 .../hwloc/include/private/cpuid-x86.h         |   86 +
 src/3rdparty/hwloc/include/private/debug.h    |   83 +
 .../include/private/internal-components.h     |   41 +
 src/3rdparty/hwloc/include/private/misc.h     |  583 +++
 src/3rdparty/hwloc/include/private/netloc.h   |  578 +++
 src/3rdparty/hwloc/include/private/private.h  |  417 ++
 .../hwloc/include/private/solaris-chiptype.h  |   43 +
 src/3rdparty/hwloc/include/private/xml.h      |  108 +
 src/3rdparty/hwloc/src/base64.c               |  309 ++
 src/3rdparty/hwloc/src/bind.c                 |  922 ++++
 src/3rdparty/hwloc/src/bitmap.c               | 1676 ++++++
 src/3rdparty/hwloc/src/components.c           |  785 +++
 src/3rdparty/hwloc/src/diff.c                 |  492 ++
 src/3rdparty/hwloc/src/distances.c            |  920 ++++
 src/3rdparty/hwloc/src/misc.c                 |  166 +
 src/3rdparty/hwloc/src/pci-common.c           |  941 ++++
 src/3rdparty/hwloc/src/shmem.c                |  287 ++
 src/3rdparty/hwloc/src/static-components.h    |   15 +
 src/3rdparty/hwloc/src/topology-noos.c        |   65 +
 src/3rdparty/hwloc/src/topology-synthetic.c   | 1521 ++++++
 src/3rdparty/hwloc/src/topology-windows.c     | 1189 +++++
 src/3rdparty/hwloc/src/topology-x86.c         | 1583 ++++++
 .../hwloc/src/topology-xml-nolibxml.c         |  919 ++++
 src/3rdparty/hwloc/src/topology-xml.c         | 2886 +++++++++++
 src/3rdparty/hwloc/src/topology.c             | 4484 +++++++++++++++++
 src/3rdparty/hwloc/src/traversal.c            |  616 +++
 src/backend/cpu/cpu.cmake                     |   13 +-
 58 files changed, 32562 insertions(+), 5 deletions(-)
 create mode 100644 src/3rdparty/hwloc/AUTHORS
 create mode 100644 src/3rdparty/hwloc/CMakeLists.txt
 create mode 100644 src/3rdparty/hwloc/COPYING
 create mode 100644 src/3rdparty/hwloc/NEWS
 create mode 100644 src/3rdparty/hwloc/README
 create mode 100644 src/3rdparty/hwloc/VERSION
 create mode 100644 src/3rdparty/hwloc/include/hwloc.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/autogen/config.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/bitmap.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/cuda.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/cudart.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/deprecated.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/diff.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/distances.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/export.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/gl.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/glibc-sched.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/helper.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/inlines.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/intel-mic.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/linux-libnuma.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/linux.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/nvml.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/opencl.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/openfabrics-verbs.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/plugins.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/rename.h
 create mode 100644 src/3rdparty/hwloc/include/hwloc/shmem.h
 create mode 100644 src/3rdparty/hwloc/include/private/autogen/config.h
 create mode 100644 src/3rdparty/hwloc/include/private/components.h
 create mode 100644 src/3rdparty/hwloc/include/private/cpuid-x86.h
 create mode 100644 src/3rdparty/hwloc/include/private/debug.h
 create mode 100644 src/3rdparty/hwloc/include/private/internal-components.h
 create mode 100644 src/3rdparty/hwloc/include/private/misc.h
 create mode 100644 src/3rdparty/hwloc/include/private/netloc.h
 create mode 100644 src/3rdparty/hwloc/include/private/private.h
 create mode 100644 src/3rdparty/hwloc/include/private/solaris-chiptype.h
 create mode 100644 src/3rdparty/hwloc/include/private/xml.h
 create mode 100644 src/3rdparty/hwloc/src/base64.c
 create mode 100644 src/3rdparty/hwloc/src/bind.c
 create mode 100644 src/3rdparty/hwloc/src/bitmap.c
 create mode 100644 src/3rdparty/hwloc/src/components.c
 create mode 100644 src/3rdparty/hwloc/src/diff.c
 create mode 100644 src/3rdparty/hwloc/src/distances.c
 create mode 100644 src/3rdparty/hwloc/src/misc.c
 create mode 100644 src/3rdparty/hwloc/src/pci-common.c
 create mode 100644 src/3rdparty/hwloc/src/shmem.c
 create mode 100644 src/3rdparty/hwloc/src/static-components.h
 create mode 100644 src/3rdparty/hwloc/src/topology-noos.c
 create mode 100644 src/3rdparty/hwloc/src/topology-synthetic.c
 create mode 100644 src/3rdparty/hwloc/src/topology-windows.c
 create mode 100644 src/3rdparty/hwloc/src/topology-x86.c
 create mode 100644 src/3rdparty/hwloc/src/topology-xml-nolibxml.c
 create mode 100644 src/3rdparty/hwloc/src/topology-xml.c
 create mode 100644 src/3rdparty/hwloc/src/topology.c
 create mode 100644 src/3rdparty/hwloc/src/traversal.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f9dd6fd54..a1779f53f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -261,4 +261,4 @@ if (WITH_DEBUG_LOG)
 endif()
 
 add_executable(${CMAKE_PROJECT_NAME} ${HEADERS} ${SOURCES} ${SOURCES_OS} ${SOURCES_CPUID} ${HEADERS_CRYPTO} ${SOURCES_CRYPTO} ${SOURCES_SYSLOG} ${HTTP_SOURCES} ${TLS_SOURCES} ${XMRIG_ASM_SOURCES} ${CN_GPU_SOURCES})
-target_link_libraries(${CMAKE_PROJECT_NAME} ${XMRIG_ASM_LIBRARY} ${OPENSSL_LIBRARIES} ${UV_LIBRARIES} ${EXTRA_LIBS} ${CPUID_LIB} ${HWLOC_LIBRARY})
+target_link_libraries(${CMAKE_PROJECT_NAME} ${XMRIG_ASM_LIBRARY} ${OPENSSL_LIBRARIES} ${UV_LIBRARIES} ${EXTRA_LIBS} ${CPUID_LIB})
diff --git a/src/3rdparty/hwloc/AUTHORS b/src/3rdparty/hwloc/AUTHORS
new file mode 100644
index 000000000..7187a723d
--- /dev/null
+++ b/src/3rdparty/hwloc/AUTHORS
@@ -0,0 +1,44 @@
+hwloc Authors
+=============
+
+The following cumulative list contains the names of most individuals
+who have committed code to the hwloc repository
+(either directly or through a third party).
+
+Name                         Affiliation(s)
+---------------------------  --------------------
+Grzegorz Andrejczuk          Intel
+Cédric Augonnet              University of Bordeaux
+Guillaume Beauchamp          Inria
+Ahmad Boissetri Binzagr      Inria
+Cyril Bordage                Inria
+Nicholas Buroker             UWL
+Christopher M. Cantalupo     Intel
+Jérôme Clet-Ortega           University of Bordeaux
+Ludovic Courtès              Inria
+Clément Foyer                Inria
+Nathalie Furmento            CNRS
+Bryon Gloden
+Brice Goglin                 Inria
+Gilles Gouaillardet          RIST
+Joshua Hursey                UWL
+Alexey Kardashevskiy         IBM
+Rob Latham                   ANL
+Douglas MacFarland           UWL
+Marc Marí                    BSC
+Jonathan L Peyton            Intel
+Piotr Luc                    Intel
+Antoine Rougier              intern from University of Bordeaux
+Jeff Squyres                 Cisco
+Samuel Thibault              University of Bordeaux
+Jean-Yves VET                DDN
+Benjamin Worpitz
+Jeff Zhao                    Zhaoxin
+
+Affiliation abbreviations:
+-------------------------
+ANL = Argonne National Lab
+BSC = Barcelona Supercomputing Center
+Cisco = Cisco Systems, Inc.
+CNRS = Centre national de la recherche scientifique (France)
+UWL = University of Wisconsin-La Crosse
diff --git a/src/3rdparty/hwloc/CMakeLists.txt b/src/3rdparty/hwloc/CMakeLists.txt
new file mode 100644
index 000000000..431c11eb3
--- /dev/null
+++ b/src/3rdparty/hwloc/CMakeLists.txt
@@ -0,0 +1,38 @@
+cmake_minimum_required (VERSION 2.8)
+project (hwloc C)
+
+include_directories(include)
+include_directories(src)
+
+add_definitions(/D_CRT_SECURE_NO_WARNINGS)
+set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /MT")
+
+set(HEADERS
+    include/hwloc.h
+    src/static-components.h
+    )
+
+set(SOURCES
+    src/base64.c
+    src/bind.c
+    src/bitmap.c
+    src/components.c
+    src/diff.c
+    src/distances.c
+    src/misc.c
+    src/pci-common.c
+    src/shmem.c
+    src/topology.c
+    src/topology-noos.c
+    src/topology-synthetic.c
+    src/topology-windows.c
+    src/topology-x86.c
+    src/topology-xml.c
+    src/topology-xml-nolibxml.c
+    src/traversal.c
+    )
+
+add_library(hwloc STATIC
+    ${HEADERS}
+    ${SOURCES}
+    )
diff --git a/src/3rdparty/hwloc/COPYING b/src/3rdparty/hwloc/COPYING
new file mode 100644
index 000000000..e77516e18
--- /dev/null
+++ b/src/3rdparty/hwloc/COPYING
@@ -0,0 +1,39 @@
+Copyright © 2004-2006 The Trustees of Indiana University and Indiana University Research and Technology Corporation.  All rights reserved.
+Copyright © 2004-2005 The University of Tennessee and The University of Tennessee Research Foundation.  All rights reserved.
+Copyright © 2004-2005 High Performance Computing Center Stuttgart, University of Stuttgart.  All rights reserved.
+Copyright © 2004-2005 The Regents of the University of California. All rights reserved.
+Copyright © 2009      CNRS
+Copyright © 2009-2016 Inria.  All rights reserved.
+Copyright © 2009-2015 Université Bordeaux
+Copyright © 2009-2015 Cisco Systems, Inc.  All rights reserved.
+Copyright © 2009-2012 Oracle and/or its affiliates.  All rights reserved.
+Copyright © 2010      IBM
+Copyright © 2010      Jirka Hladky
+Copyright © 2012      Aleksej Saushev, The NetBSD Foundation
+Copyright © 2012      Blue Brain Project, EPFL. All rights reserved.
+Copyright © 2013-2014 University of Wisconsin-La Crosse. All rights reserved.
+Copyright © 2015      Research Organization for Information Science and Technology (RIST). All rights reserved.
+Copyright © 2015-2016 Intel, Inc.  All rights reserved.
+See COPYING in top-level directory.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/3rdparty/hwloc/NEWS b/src/3rdparty/hwloc/NEWS
new file mode 100644
index 000000000..664c8d55c
--- /dev/null
+++ b/src/3rdparty/hwloc/NEWS
@@ -0,0 +1,1599 @@
+Copyright © 2009 CNRS
+Copyright © 2009-2019 Inria.  All rights reserved.
+Copyright © 2009-2013 Université Bordeaux
+Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+
+$COPYRIGHT$
+
+Additional copyrights may follow
+
+$HEADER$
+
+===========================================================================
+
+This file contains the main features as well as overviews of specific
+bug fixes (and other actions) for each version of hwloc since version
+0.9 (as initially released as "libtopology", then re-branded to "hwloc"
+in v0.9.1).
+
+
+Version 2.0.4 (also included in 1.11.13 when appropriate)
+-------------
+* Add support for Linux 5.3 new sysfs cpu topology files with Die information.
+* Add support for Intel v2 Extended Topology Enumeration in the x86 backend.
+* Tiles, Modules and Dies are exposed as Groups for now.
+  + HWLOC_DONT_MERGE_DIE_GROUPS=1 may be set in the environment to prevent
+    Die groups from being automatically merged with identical parent or children.
+* Ignore NUMA node information from AMD topoext in the x86 backend,
+  unless HWLOC_X86_TOPOEXT_NUMANODES=1 is set in the environment.
+* Group objects have a new "dont_merge" attribute to prevent them from
+  being automatically merged with identical parent or children.
+
+
+Version 2.0.3 (also included in 1.11.12 when appropriate)
+-------------
+* Fix build on Cygwin, thanks to Marco Atzeri for the patches.
+* Fix a corner case of hwloc_topology_restrict() where children would
+  become out-of-order.
+* Fix the return length of export_xmlbuffer() functions to always
+  include the ending \0.
+* Fix lstopo --children-order argument parsing.
+
+
+Version 2.0.2 (also included in 1.11.11 when appropriate)
+-------------
+* Add support for Hygon Dhyana processors in the x86 backend,
+  thanks to Pu Wen for the patch.
+* Fix symbol renaming to also rename internal components,
+  thanks to Evan Ramos for the patch.
+* Fix build on HP-UX, thanks to Richard Lloyd for reporting the issues.
+* Detect PCI link speed without being root on Linux >= 4.13.
+* Add HWLOC_VERSION* macros to the public headers,
+  thanks to Gilles Gouaillardet for the suggestion.
+
+
+Version 2.0.1 (also included in 1.11.10 when relevant)
+-------------
+* Bump the library soname to 15:0:0 to avoid conflicts with hwloc 1.11.x
+  releases. The hwloc 2.0.0 soname was buggy (12:0:0), applications will
+  have to be recompiled.
+* Serialize pciaccess discovery to fix concurrent topology loads in
+  multiple threads.
+* Fix hwloc-dump-hwdata to only process SMBIOS information that correspond
+  to the KNL and KNM configuration.
+* Add a heuristic for guessing KNL/KNM memory and cluster modes when
+  hwloc-dump-hwdata could not run as root earlier.
+* Add --no-text lstopo option to remove text from some boxes in the
+  graphical output. Mostly useful for removing Group labels.
+* Some minor fixes to memory binding.
+
+
+Version 2.0.0
+-------------
+*** The ABI of the library has changed. ***
+  For instance some hwloc_obj fields were reordered, added or removed, see below.
+  + HWLOC_API_VERSION and hwloc_get_api_version() now give 0x00020000.
+  + See "How do I handle ABI breaks and API upgrades ?" in the FAQ
+    and "Upgrading to hwloc 2.0 API" in the documentation.
+* Major API changes
+  + Memory, I/O and Misc objects are now stored in dedicated children lists,
+    not in the usual children list that is now only used for CPU-side objects.
+    - hwloc_get_next_child() may still be used to iterate over these 4 lists
+      of children at once.
+    - hwloc_obj_type_is_normal(), _memory() and _io() may be used to check
+      the kind of a given object type.
+  + Topologies always have at least one NUMA object. On non-NUMA machines,
+    a single NUMA object is added to describe the entire machine memory.
+    The NUMA level cannot be ignored anymore.
+  + The NUMA level is special since NUMA nodes are not in the main hierarchy
+    of objects anymore. Its depth is a fake negative depth that should not be
+    compared with normal levels.
+    - If all memory objects are attached to parents at the same depth,
+      it may be retrieved with hwloc_get_memory_parents_depth().
+  + The HWLOC_OBJ_CACHE type is replaced with 8 types HWLOC_OBJ_L[1-5]CACHE
+    and HWLOC_OBJ_L[1-3]ICACHE that remove the need to disambiguate levels
+    when looking for caches with _by_type() functions.
+    - New hwloc_obj_type_is_{,d,i}cache() functions may be used to check whether
+      a given type is a cache.
+  + Reworked ignoring/filtering API
+    - Replace hwloc_topology_ignore*() functions with hwloc_topology_set_type_filter()
+      and hwloc_topology_set_all_types_filter().
+      . Contrary to hwloc_topology_ignore_{type,all}_keep_structure() which
+        removed individual objects, HWLOC_TYPE_FILTER_KEEP_STRUCTURE only removes
+        entire levels (so that topology do not become too asymmetric).
+    - Remove HWLOC_TOPOLOGY_FLAG_ICACHES in favor of hwloc_topology_set_icache_types_filter()
+      with HWLOC_TYPE_FILTER_KEEP_ALL.
+    - Remove HWLOC_TOPOLOGY_FLAG_IO_DEVICES, _IO_BRIDGES and _WHOLE_IO in favor of
+      hwloc_topology_set_io_types_filter() with HWLOC_TYPE_FILTER_KEEP_ALL or
+      HWLOC_TYPE_FILTER_KEEP_IMPORTANT.
+  + The distance API has been completely reworked. It is now described
+    in hwloc/distances.h.
+  + Return values
+    - Most functions in hwloc/bitmap.h now return an int that may be negative
+      in case of failure to realloc/extend the internal storage of a bitmap.
+    - hwloc_obj_add_info() also returns an int in case allocations fail.
+* Minor API changes
+  + Object attributes
+    - obj->memory is removed.
+      . local_memory and page_types attributes are now in obj->attr->numanode
+      . total_memory moves obj->total_memory.
+    - Objects do not have allowed_cpuset and allowed_nodeset anymore.
+      They are only available for the entire topology using
+      hwloc_topology_get_allowed_cpuset() and hwloc_topology_get_allowed_nodeset().
+    - Objects now have a "subtype" field that supersedes former "Type" and
+      "CoProcType" info attributes.
+  + Object and level depths are now signed ints.
+  + Object string printing and parsing
+    - hwloc_type_sscanf() deprecates the old hwloc_obj_type_sscanf().
+    - hwloc_type_sscanf_as_depth() is added to convert a type name into
+      a level depth.
+    - hwloc_obj_cpuset_snprintf() is deprecated in favor of hwloc_bitmap_snprintf().
+  + Misc objects
+    - Replace hwloc_topology_insert_misc_object_by_cpuset() with
+      hwloc_topology_insert_group_object() to precisely specify the location
+      of an additional hierarchy level in the topology.
+    - Misc objects have their own level and depth to iterate over all of them.
+    - Misc objects may now only be inserted as a leaf object with
+      hwloc_topology_insert_misc_object() which deprecates
+      hwloc_topology_insert_misc_object_by_parent().
+  + hwloc_topology_restrict() doesn't remove objects that contain memory
+    by default anymore.
+    - The list of existing restrict flags was modified.
+  + The discovery support array now contains some NUMA specific bits.
+  + XML export functions take an additional flags argument,
+    for instance for exporting XMLs that are compatible with hwloc 1.x.
+  + Functions diff_load_xml*(), diff_export_xml*() and diff_destroy() in
+    hwloc/diff.h do not need a topology as first parameter anymore.
+  + hwloc_parse_cpumap_file () superseded by hwloc_linux_read_path_as_cpumask()
+    in hwloc/linux.h.
+  + HWLOC_MEMBIND_DEFAULT and HWLOC_MEMBIND_FIRSTTOUCH were clarified.
+* New APIs and Features
+  + Add hwloc/shmem.h for sharing topologies between processes running on
+    the same machine (for reducing the memory footprint).
+  + Add the experimental netloc subproject. It is disabled by default
+    and can be enabled with --enable-netloc.
+    It currently brings command-line tools to gather and visualize the
+    topology of InfiniBand fabrics, and an API to convert such topologies
+    into Scotch architectures for process mapping.
+    See the documentation for details.
+* Removed APIs and features
+  + Remove the online_cpuset from struct hwloc_obj. Offline PUs get unknown
+    topologies on Linux nowadays, and wrong topology on Solaris. Other OS
+    do not support them. And one cannot do much about them anyway. Just keep
+    them in complete_cpuset.
+  + Remove the now-unused "System" object type HWLOC_OBJ_SYSTEM,
+    defined to MACHINE for backward compatibility.
+  + The almost-unused "os_level" attribute has been removed from the
+    hwloc_obj structure.
+  + Remove the custom interface for assembling the topologies of different
+    nodes as well as the hwloc-assembler tools.
+  + hwloc_topology_set_fsroot() is removed, the environment variable
+    HWLOC_FSROOT may be used for the same remote testing/debugging purpose.
+  + Remove the deprecated hwloc_obj_snprintf(), hwloc_obj_type_of_string(),
+    hwloc_distribute[v]().
+  * Remove Myrinet Express interoperability (hwloc/myriexpress.h).
+  + Remove Kerrighed support from the Linux backend.
+  + Remove Tru64 (OSF/1) support.
+    - Remove HWLOC_MEMBIND_REPLICATE which wasn't available anywhere else.
+* Backend improvements
+  + Linux
+    - OS devices do not have to be attached through PCI anymore,
+      for instance enabling the discovery of NVDIMM block devices.
+    - Remove the dependency on libnuma.
+    - Add a SectorSize attribute to block OS devices.
+  + Mac OS X
+    - Fix detection of cores and hyperthreads.
+    - Add CPUVendor, Model, ... attributes.
+  + Windows
+    - Add get_area_memlocation().
+* Tools
+  + lstopo and hwloc-info have a new --filter option matching the new filtering API.
+  + lstopo can be given --children-order=plain to force a basic displaying
+    of memory and normal children together below their parent.
+  + hwloc-distances was removed and replaced with lstopo --distances.
+* Misc
+  + Exports
+    - Exporting to synthetic now ignores I/O and Misc objects.
+  + PCI discovery
+    - Separate OS device discovery from PCI discovery. Only the latter is disabled
+      with --disable-pci at configure time. Both may be disabled with --disable-io.
+    - The `linuxpci' component is now renamed into `linuxio'.
+    - The old `libpci' component name from hwloc 1.6 is not supported anymore,
+      only the `pci' name from hwloc 1.7 is now recognized.
+    - The HWLOC_PCI_<domain>_<bus>_LOCALCPUS environment variables are superseded
+      with a single HWLOC_PCI_LOCALITY where bus ranges may be specified.
+    - Do not set PCI devices and bridges name automatically. Vendor and device
+      names are already in info attributes.
+  + Components and discovery
+    - Add HWLOC_SYNTHETIC environment variable to enforce a synthetic topology
+      as if hwloc_topology_set_synthetic() had been called.
+    - HWLOC_COMPONENTS doesn't support xml or synthetic component attributes
+      anymore, they should be passed in HWLOC_XMLFILE or HWLOC_SYNTHETIC instead.
+    - HWLOC_COMPONENTS takes precedence over other environment variables
+      for selecting components.
+  + hwloc now requires a C99 compliant compiler.
+
+
+Version 1.11.9
+--------------
+* Add support for Zhaoxin ZX-C and ZX-D processors in the x86 backend,
+  thanks to Jeff Zhao for the patch.
+* Fix AMD Epyc 24-core L3 cache locality in the x86 backend.
+* Don't crash in the x86 backend when the CPUID vendor string is unknown.
+* Fix the missing pu discovery support bit on some OS.
+* Fix the management of the lstopoStyle info attribute for custom colors.
+* Add verbose warnings when failing to load hwloc v2.0+ XMLs.
+
+
+Version 1.11.8
+--------------
+* Multiple Solaris improvements, thanks to Maureen Chew for the help:
+  + Detect caches on Sparc.
+  + Properly detect allowed/disallowed PUs and NUMA nodes with processor sets.
+  + Add hwloc_get_last_cpu_location() support for the current thread.
+* Add support for CUDA compute capability 7.0 and fix support for 6.[12].
+* Tools improvements
+  + Fix search for objects by physical index in command-line tools.
+  + Add missing "cpubind:get_thisthread_last_cpu_location" in the output
+    of hwloc-info --support.
+  + Add --pid and --name to specify target processes in hwloc-ps.
+  + Display thread names in lstopo and hwloc-ps on Linux.
+* Doc improvements
+  + Add a FAQ entry about building on Windows.
+  + Install missing sub-manpage for hwloc_obj_add_info() and
+    hwloc_obj_get_info_by_name().
+
+
+Version 1.11.7
+--------------
+* Fix hwloc-bind --membind for CPU-less NUMA nodes (again).
+  Thanks to Gilles Gouaillardet for reporting the issue.
+* Fix a memory leak on IBM S/390 platforms running Linux.
+* Fix a memory leak when forcing the x86 backend first on amd64/topoext
+  platforms running Linux.
+* Command-line tools now support "hbm" instead of "numanode" for filtering
+  only high-bandwidth memory nodes when selecting locations.
+  + hwloc-bind also support --hbm and --no-hbm for filtering only or
+    no HBM nodes.
+  Thanks to Nicolas Denoyelle for the suggestion.
+* Add --children and --descendants to hwloc-info for listing object
+  children or object descendants of a specific type.
+* Add --no-index, --index, --no-attrs, --attrs to disable/enable display
+  of index numbers or attributes in the graphical lstopo output.
+* Try to gather hwloc-dump-hwdata output from all possible locations
+  in hwloc-gather-topology.
+* Updates to the documentation of locations in hwloc(7) and
+  command-line tools manpages.
+
+
+Version 1.11.6
+--------------
+* Make the Linux discovery about twice faster, especially on the CPU side,
+  by trying to avoid sysfs file accesses as much as possible.
+* Add support for AMD Family 17h processors (Zen) SMT cores in the Linux
+  and x86 backends.
+* Add the HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES flag (and the
+  HWLOC_THISSYSTEM_ALLOWED_RESOURCES environment variable) for reading the
+  set of allowed resources from the local operating system even if the
+  topology was loaded from XML or synthetic.
+* Fix hwloc_bitmap_set/clr_range() for infinite ranges that do not
+  overlap currently defined ranges in the bitmap.
+* Don't reset the lstopo zoom scale when moving the X11 window.
+* lstopo now has --flags for manually setting topology flags.
+* hwloc_get_depth_type() returns HWLOC_TYPE_DEPTH_UNKNOWN for Misc objects.
+
+
+Version 1.11.5
+--------------
+* Add support for Knights Mill Xeon Phi, thanks to Piotr Luc for the patch.
+* Reenable distance gathering on Solaris, disabled by mistake since v1.0.
+  Thanks to TU Wien for the help.
+* Fix hwloc_get_*obj*_inside_cpuset() functions to ignore objects with
+  empty CPU sets, for instance, CPU-less NUMA nodes such as KNL MCDRAM.
+  Thanks to Nicolas Denoyelle for the report.
+* Fix XML import of multiple distance matrices.
+* Add a FAQ entry about "hwloc is only a structural model, it ignores
+  performance models, memory bandwidth, etc.?"
+
+
+Version 1.11.4
+--------------
+* Add MemoryMode and ClusterMode attributes in the Machine object on KNL.
+  Add doc/examples/get-knl-modes.c for an example of retrieving them.
+  Thanks to Grzegorz Andrejczuk.
+* Fix Linux build with -m32 with respect to libudev.
+  Thanks to Paul Hargrove for reporting the issue.
+* Fix build with Visual Studio 2015, thanks to Eloi Gaudry for reporting
+  the issue and providing the patch.
+* Don't forget to display OS device children in the graphical lstopo.
+* Fix a memory leak on Solaris, thanks to Bryon Gloden for the patch.
+* Properly handle realloc() failures, thanks to Bryon Gloden for reporting
+  the issue.
+* Fix lstopo crash in ascii/fig/windows outputs when some objects have a
+  lstopoStyle info attribute.
+
+
+Version 1.11.3
+--------------
+* Bug fixes
+  + Fix a memory leak on Linux S/390 hosts with books.
+  + Fix /proc/mounts parsing on Linux by using mntent.h.
+    Thanks to Nathan Hjelm for reporting the issue.
+  + Fix a x86 infinite loop on VMware due to the x2APIC feature being
+    advertised without actually being fully supported.
+    Thanks to Jianjun Wen for reporting the problem and testing the patch.
+  + Fix the return value of hwloc_alloc() on mmap() failure.
+    Thanks to Hugo Brunie for reporting the issue.
+  + Fix the return value of command-line tools in some error cases.
+  + Do not break individual thread bindings during x86 backend discovery in a
+    multithreaded process. Thanks to Farouk Mansouri for the report.
+  + Fix hwloc-bind --membind for CPU-less NUMA nodes.
+  + Fix some corner cases in the XML export/import of application userdata.
+* API Improvements
+  + Add HWLOC_MEMBIND_BYNODESET flag so that membind() functions accept
+    either cpusets or nodesets.
+  + Add hwloc_get_area_memlocation() to check where pages are actually
+    allocated. Only implemented on Linux for now.
+    - There's no _nodeset() variant, but the new flag HWLOC_MEMBIND_BYNODESET
+      is supported.
+  + Make hwloc_obj_type_sscanf() parse back everything that may be outputted
+    by hwloc_obj_type_snprintf().
+* Detection Improvements
+  + Allow the x86 backend to add missing cache levels, so that it completes
+    what the Solaris backend lacks.
+    Thanks to Ryan Zezeski for reporting the issue.
+  + Do not filter-out FibreChannel PCI adapters by default anymore.
+    Thanks to Matt Muggeridge for the report.
+  + Add support for CUDA compute capability 6.x.
+* Tools
+  + Add --support to hwloc-info to list supported features, just like with
+    hwloc_topology_get_support().
+    - Also add --objects and --topology to explicitly switch between the
+      default modes.
+  + Add --tid to let hwloc-bind operate on individual threads on Linux.
+  + Add --nodeset to let hwloc-bind report memory binding as NUMA node sets.
+  + hwloc-annotate and lstopo don't drop application userdata from XMLs anymore.
+    - Add --cu to hwloc-annotate to drop these application userdata.
+  + Make the hwloc-dump-hwdata dump directory configurable through configure
+    options such as --runstatedir or --localstatedir.
+* Misc Improvements
+  + Add systemd service template contrib/systemd/hwloc-dump-hwdata.service
+    for launching hwloc-dump-hwdata at boot on Linux.
+    Thanks to Grzegorz Andrejczuk.
+  + Add HWLOC_PLUGINS_BLACKLIST environment variable to prevent some plugins
+    from being loaded. Thanks to Alexandre Denis for the suggestion.
+  + Small improvements for various Windows build systems,
+    thanks to Jonathan L Peyton and Marco Atzeri.
+
+
+Version 1.11.2
+--------------
+* Improve support for Intel Knights Landing Xeon Phi on Linux:
+  + Group local NUMA nodes of normal memory (DDR) and high-bandwidth memory
+    (MCDRAM) together through "Cluster" groups so that the local MCDRAM is
+    easy to find.
+    - See "How do I find the local MCDRAM NUMA node on Intel Knights
+      Landing Xeon Phi?" in the documentation.
+    - For uniformity across all KNL configurations, always have a NUMA node
+      object even if the host is UMA.
+  + Fix the detection of the memory-side cache:
+    - Add the hwloc-dump-hwdata superuser utility to dump SMBIOS information
+      into /var/run/hwloc/ as root during boot, and load this dumped
+      information from the hwloc library at runtime.
+    - See "Why do I need hwloc-dump-hwdata for caches on Intel Knights
+      Landing Xeon Phi?" in the documentation.
+  Thanks to Grzegorz Andrejczuk for the patches and for the help.
+* The x86 and linux backends may now be combined for discovering CPUs
+  through x86 CPUID and memory from the Linux kernel.
+  This is useful for working around buggy CPU information reported by Linux
+  (for instance the AMD Bulldozer/Piledriver bug below).
+  Combination is enabled by passing HWLOC_COMPONENTS=x86 in the environment.
+* Fix L3 cache sharing on AMD Opteron 63xx (Piledriver) and 62xx (Bulldozer)
+  in the x86 backend. Thanks to many users who helped.
+* Fix the overzealous L3 cache sharing fix added to the x86 backend in 1.11.1
+  for AMD Opteron 61xx (Magny-Cours) processors.
+* The x86 backend may now add the info attribute Inclusive=0 or 1 to caches
+  it discovers, or to caches discovered by other backends earlier.
+  Thanks to Guillaume Beauchamp for the patch.
+* Fix the management on alloc_membind() allocation failures on AIX, HP-UX
+  and OSF/Tru64.
+* Fix spurious failures to load with ENOMEM on AIX in case of Misc objects
+  below PUs.
+* lstopo improvements in X11 and Windows graphical mode:
+  + Add + - f 1 shortcuts to manually zoom-in, zoom-out, reset the scale,
+    or fit the entire window.
+  + Display all keyboard shortcuts in the console.
+* Debug messages may be disabled at runtime by passing HWLOC_DEBUG_VERBOSE=0
+  in the environment when --enable-debug was passed to configure.
+* Add a FAQ entry "What are these Group objects in my topology?".
+
+
+Version 1.11.1
+--------------
+* Detection fixes
+  + Hardwire the topology of Fujitsu K-computer, FX10, FX100 servers to
+    workaround buggy Linux kernels.
+    Thanks to Takahiro Kawashima and Gilles Gouaillardet.
+  + Fix L3 cache information on AMD Opteron 61xx Magny-Cours processors
+    in the x86 backend. Thanks to Guillaume Beauchamp for the patch.
+  + Detect block devices directly attached to PCI without a controller,
+    for instance NVMe disks. Thanks to Barry M. Tannenbaum.
+  + Add the PCISlot attribute to all PCI functions instead of only the
+    first one.
+* Miscellaneous internal fixes
+  + Ignore PCI bridges that could fail assertions by reporting buggy
+    secondary-subordinate bus numbers.
+    Thanks to George Bosilca for reporting the issue.
+  + Fix an overzealous assertion when inserting an intermediate Group object
+    while Groups are totally ignored.
+  + Fix a memory leak on Linux on AMD processors with dual-core compute units.
+    Thanks to Bob Benner.
+  + Fix a memory leak on failure to load a xml diff file.
+  + Fix some segfaults when inputting an invalid synthetic description.
+  + Fix a segfault when plugins fail to find core symbols.
+    Thanks to Guy Streeter.
+* Many fixes and improvements in the Windows backend:
+  + Fix the discovery of more than 32 processors and multiple processor
+    groups. Thanks to Barry M. Tannenbaum for the help.
+  + Add thread binding set support in case of multiple process groups.
+  + Add thread binding get support.
+  + Add get_last_cpu_location() support for the current thread.
+  + Disable the unsupported process binding in case of multiple processor
+    groups.
+  + Fix/update the Visual Studio support under contrib/windows.
+    Thanks to Eloi Gaudry for the help.
+* Tools fixes
+  + Fix a segfault when displaying logical indexes in the graphical lstopo.
+    Thanks to Guillaume Mercier for reporting the issue.
+  + Fix lstopo linking with X11 libraries, for instance on Mac OS X.
+    Thanks to Scott Atchley and Pierre Ramet for reporting the issue.
+  + hwloc-annotate, hwloc-diff and hwloc-patch do not drop unavailable
+    resources from the output anymore and those may be annotated as well.
+  + Command-line tools may now import XML from the standard input with -i -.xml
+  + Add missing documentation for the hwloc-info --no-icaches option.
+
+
+Version 1.11.0
+--------------
+* API
+  + Socket objects are renamed into Package to align with the terminology
+    used by processor vendors. The old HWLOC_OBJ_SOCKET type and "Socket"
+    name are still supported for backward compatibility.
+  + HWLOC_OBJ_NODE is replaced with HWLOC_OBJ_NUMANODE for clarification.
+    HWLOC_OBJ_NODE is still supported for backward compatibility.
+    "Node" and "NUMANode" strings are supported as in earlier releases.
+* Detection improvements
+  + Add support for Intel Knights Landing Xeon Phi.
+    Thanks to Grzegorz Andrejczuk and Lukasz Anaczkowski.
+  + Add Vendor, Model, Revision, SerialNumber, Type and LinuxDeviceID
+    info attributes to Block OS devices on Linux. Thanks to Vineet Pedaballe
+    for the help.
+    - Add --disable-libudev to avoid dependency on the libudev library.
+  + Add "MemoryModule" Misc objects with information about DIMMs, on Linux
+    when privileged and when I/O is enabled.
+    Thanks to Vineet Pedaballe for the help.
+  + Add a PCISlot attribute to PCI devices on Linux when supported to
+    identify the physical PCI slot where the board is plugged.
+  + Add CPUStepping info attribute on x86 processors,
+    thanks to Thomas Röhl for the suggestion.
+  + Ignore the device-tree on non-Power architectures to avoid buggy
+    detection on ARM. Thanks to Orion Poplawski for reporting the issue.
+  + Work-around buggy Xeon E5v3 BIOS reporting invalid PCI-NUMA affinity
+    for the PCI links on the second processor.
+  + Add support for CUDA compute capability 5.x, thanks Benjamin Worpitz.
+  + Many fixes to the x86 backend
+    - Add L1i and fix L2/L3 type on old AMD processors without topoext support.
+    - Fix Intel CPU family and model numbers when basic family isn't 6 or 15.
+    - Fix package IDs on recent AMD processors.
+    - Fix misc issues due to incomplete APIC IDs on x2APIC processors.
+    - Avoid buggy discovery on old SGI Altix UVs with non-unique APIC IDs.
+  + Gather total machine memory on NetBSD.
+* Tools
+  + lstopo
+    - Collapse identical PCI devices unless --no-collapse is given.
+      This avoids gigantic outputs when a PCI device contains dozens of
+      identical virtual functions.
+    - The ASCII art output is now called "ascii", for instance in
+      "lstopo -.ascii".
+      The former "txt" extension is retained for backward compatibility.
+    - Automatically scales graphical box width to the inner text in Cairo,
+      ASCII and Windows outputs.
+    - Add --rect to lstopo to force rectangular layout even for NUMA nodes.
+    - Add --restrict-flags to configure the behavior of --restrict.
+    - Objects may have a "Type" info attribute to specify a better type name
+      and display it in lstopo.
+    - Really export all verbose information to the given output file.
+  + hwloc-annotate
+    - May now operate on all types of objects, including I/O.
+    - May now insert Misc objects in the topology.
+    - Do not drop instruction caches and I/O devices from the output anymore.
+  + Fix lstopo path in hwloc-gather-topology after install.
+* Misc
+  + Fix hwloc/cudart.h for machines with multiple PCI domains,
+    thanks to Imre Kerr for reporting the problem.
+  + Fix PCI Bridge-specific depth attribute.
+  + Fix hwloc_bitmap_intersect() for two infinite bitmaps.
+  + Fix some corner cases in the building of levels on large NUMA machines
+    with non-uniform NUMA groups and I/Os.
+  + Improve the performance of object insertion by cpuset for large
+    topologies.
+  + Prefix verbose XML import errors with the source name.
+  + Improve pkg-config checks and error messages.
+  + Fix excluding after a component with an argument in the HWLOC_COMPONENTS
+    environment variable.
+* Documentation
+  + Fix the recommended way in documentation and examples to allocate memory
+    on some node, it should use HWLOC_MEMBIND_BIND.
+    Thanks to Nicolas Bouzat for reporting the issue.
+  + Add a "Miscellaneous objects" section in the documentation.
+  + Add a FAQ entry "What happens to my topology if I disable symmetric
+    multithreading, hyper-threading, etc. ?" to the documentation.
+
+
+Version 1.10.1
+--------------
+* Actually remove disallowed NUMA nodes from nodesets when the whole-system
+  flag isn't enabled.
+* Fix the gathering of PCI domains. Thanks to James Custer for reporting
+  the issue and providing a patch.
+* Fix the merging of identical parent and child in presence of Misc objects.
+  Thanks to Dave Love for reporting the issue.
+* Fix some misordering of children when merging with ignore_keep_structure()
+  in partially allowed topologies.
+* Fix an overzealous assertion in the debug code when running on a single-PU
+  host with I/O. Thanks to Thomas Van Doren for reporting the issue.
+* Don't forget to setup NUMA node object nodesets in x86 backend (for BSDs)
+  and OSF/Tru64 backend.
+* Fix cpuid-x86 build error with gcc -O3 on x86-32. Thanks to Thomas Van Doren
+  for reporting the issue.
+* Fix support for future very large caches in the x86 backend.
+* Fix vendor/device names for SR-IOV PCI devices on Linux.
+* Fix an unlikely crash in case of buggy hierarchical distance matrix.
+* Fix PU os_index on some AIX releases. Thanks to Hendryk Bockelmann and
+  Erik Schnetter for helping debugging.
+* Fix hwloc_bitmap_isincluded() in case of infinite sets.
+* Change hwloc-ls.desktop into a lstopo.desktop and only install it if
+  lstopo is built with Cairo/X11 support. It cannot work with a non-graphical
+  lstopo or hwloc-ls.
+* Add support for the renaming of Socket into Package in future releases.
+* Add support for the replacement of HWLOC_OBJ_NODE with HWLOC_OBJ_NUMANODE
+  in future releases.
+* Clarify the documentation of distance matrices in hwloc.h and in the manpage
+  of the hwloc-distances. Thanks to Dave Love for the suggestion.
+* Improve some error messages by displaying more information about the
+  hwloc library in use.
+* Document how to deal with the ABI break when upgrading to the upcoming 2.0.
+  See "How do I handle ABI breaks and API upgrades ?" in the FAQ.
+
+
+Version 1.10.0
+--------------
+* API
+  + Add hwloc_topology_export_synthetic() to export a topology to a
+    synthetic string without using lstopo. See the Synthetic topologies
+    section in the documentation.
+  + Add hwloc_topology_set/get_userdata() to let the application save
+    a private pointer in the topology whenever it needs a way to find
+    its own object corresponding to a topology.
+  + Add hwloc_get_numanode_obj_by_os_index() and document that this function
+    as well as hwloc_get_pu_obj_by_os_index() are good at converting
+    nodesets and cpusets into objects.
+  + hwloc_distrib() does not ignore any objects anymore when there are
+    too many of them. They get merged with others instead.
+    Thanks to Tim Creech for reporting the issue.
+* Tools
+  + hwloc-bind --get <command-line> now executes the command after displaying
+    the binding instead of ignoring the command entirely.
+    Thanks to John Donners for the suggestion.
+  + Clarify that memory sizes shown in lstopo are local by default
+    unless specified (total memory added in the root object).
+* Synthetic topologies
+  + Synthetic topology descriptions may now specify attributes such as
+    memory sizes and OS indexes. See the Synthetic topologies section
+    in the documentation.
+  + lstopo now exports in this fully-detailed format by default.
+    The new option --export-synthetic-flags may be used to revert
+    back the old format.
+* Documentation
+  + Add the doc/examples/ subdirectory with several real-life examples,
+    including the already existing hwloc-hello.C for basics.
+    Thanks to Rob Aulwes for the suggestion.
+  + Improve the documentation of CPU and memory binding in the API.
+  + Add a FAQ entry about operating system errors, especially on AMD
+    platforms with buggy cache information.
+  + Add a FAQ entry about loading many topologies in a single program.
+* Misc
+  + Work around buggy Linux kernels reporting 2 sockets instead of
+    1 socket with 2 NUMA nodes for each Xeon E5 v3 (Haswell) processor.
+  + pciutils/libpci support is now removed since libpciaccess works
+    well and there's also a Linux-specific PCI backend. For the record,
+    pciutils was GPL and therefore disabled by default since v1.6.2.
+  + Add --disable-cpuid configure flag to work around buggy processor
+    simulators reporting invalid CPUID information.
+    Thanks to Andrew Friedley for reporting the issue.
+  + Fix a racy use of libltdl when manipulating multiple topologies in
+    different threads.
+    Thanks to Andra Hugo for reporting the issue and testing patches.
+  + Fix some build failures in private/misc.h.
+    Thanks to Pavan Balaji and Ralph Castain for the reports.
+  + Fix failures to detect X11/Xutil.h on some Solaris platforms.
+    Thanks to Siegmar Gross for reporting the failure.
+  + The plugin ABI has changed, this release will not load plugins
+    built against previous hwloc releases.
+
+
+Version 1.9.1
+-------------
+* Fix a crash when the PCI locality is invalid. Attach to the root object
+  instead. Thanks to Nicolas Denoyelle for reporting the issue.
+* Fix -f in lstopo manpage. Thanks to Jirka Hladky for reporting the issue.
+* Fix hwloc_obj_type_sscanf() and others when strncasecmp() is not properly
+  available. Thanks to Nick Papior Andersen for reporting the problem.
+* Mark Linux file descriptors as close-on-exec to avoid leaks on exec.
+* Fix some minor memory leaks.
+
+
+Version 1.9.0
+-------------
+* API
+  + Add hwloc_obj_type_sscanf() to extend hwloc_obj_type_of_string() with
+    type-specific attributes such as Cache/Group depth and Cache type.
+    hwloc_obj_type_of_string() is moved to hwloc/deprecated.h.
+  + Add hwloc_linux_get_tid_last_cpu_location() for retrieving the
+    last CPU where a Linux thread given by TID ran.
+  + Add hwloc_distrib() to extend the old hwloc_distribute[v]() functions.
+    hwloc_distribute[v]() is moved to hwloc/deprecated.h.
+  + Don't mix total and local memory when displaying verbose object attributes
+    with hwloc_obj_attr_snprintf() or in lstopo.
+* Backends
+  + Add CPUVendor, CPUModelNumber and CPUFamilyNumber info attributes for
+    x86, ia64 and Xeon Phi sockets on Linux, to extend the x86-specific
+    support added in v1.8.1. Requested by Ralph Castain.
+  + Add many CPU- and Platform-related info attributes on ARM and POWER
+    platforms, in the Machine and Socket objects.
+  + Add CUDA info attributes describing the number of multiprocessors and
+    cores and the size of the global, shared and L2 cache memories in CUDA
+    OS devices.
+  + Add OpenCL info attributes describing the number of compute units and
+    the global memory size in OpenCL OS devices.
+  + The synthetic backend now accepts extended types such as L2Cache, L1i or
+    Group3. lstopo also exports synthetic strings using these extended types.
+* Tools
+  + lstopo
+    - Do not overwrite output files by default anymore.
+      Pass -f or --force to enforce it.
+    - Display OpenCL, CUDA and Xeon Phi numbers of cores and memory sizes
+      in the graphical output.
+    - Fix export to stdout when specifying a Cairo-based output type
+      with --of.
+  + hwloc-ps
+    - Add -e or --get-last-cpu-location to report where processes/threads
+      run instead of where they are bound.
+    - Report locations as likely-more-useful objects such as Cores or Sockets
+      instead of Caches when possible.
+  + hwloc-bind
+    - Fix failure on Windows when not using --pid.
+    - Add -e as a synonym to --get-last-cpu-location.
+  + hwloc-distrib
+    - Add --reverse to distribute using last objects first and singlify
+      into last bits first. Thanks to Jirka Hladky for the suggestion.
+  + hwloc-info
+    - Report unified caches when looking for data or instruction cache
+      ancestor objects.
+* Misc
+  + Add experimental Visual Studio support under contrib/windows.
+    Thanks to Eloi Gaudry for his help and for providing the first draft.
+  + Fix some overzealous assertions and warnings about the ordering of
+    objects on a level with respect to cpusets. The ordering is only
+    guaranteed for complete cpusets (based on the first bit in sets).
+  + Fix some memory leaks when importing xml diffs and when exporting a
+    "too complex" entry.
+
+
+Version 1.8.1
+-------------
+* Fix the cpuid code on Windows 64bits so that the x86 backend gets
+  enabled as expected and can populate CPU information.
+  Thanks to Robin Scher for reporting the problem.
+* Add CPUVendor/CPUModelNumber/CPUFamilyNumber attributes when running
+  on x86 architecture. Thanks to Ralph Castain for the suggestion.
+* Work around buggy BIOS reporting duplicate NUMA nodes on Linux.
+  Thanks to Jeff Becker for reporting the problem and testing the patch.
+* Add a name to the lstopo graphical window. Thanks to Michael Prokop
+  for reporting the issue.
+
+
+Version 1.8.0
+-------------
+* New components
+  + Add the "linuxpci" component that always works on Linux even when
+    libpciaccess and libpci aren't available (and even with a modified
+    file-system root). By default the old "pci" component runs first
+    because "linuxpci" lacks device names (obj->name is always NULL).
+* API
+  + Add the topology difference API in hwloc/diff.h for manipulating
+    many similar topologies.
+  + Add hwloc_topology_dup() for duplicating an entire topology.
+  + hwloc.h and hwloc/helper.h have been reorganized to clarify the
+    documentation sections. The actual inline code has moved out of hwloc.h
+    into the new hwloc/inlines.h.
+  + Deprecated functions are now in hwloc/deprecated.h, and not in the
+    official documentation anymore.
+* Tools
+  + Add hwloc-diff and hwloc-patch tools together with the new diff API.
+  + Add hwloc-compress-dir to (de)compress an entire directory of XML files
+    using hwloc-diff and hwloc-patch.
+  + Object colors in the graphical output of lstopo may be changed by adding
+    a "lstopoStyle" info attribute. See CUSTOM COLORS in the lstopo(1) manpage
+    for details. Thanks to Jirka Hladky for discussing the idea.
+  + hwloc-gather-topology may now gather I/O-related files on Linux when
+    --io is given. Only the linuxpci component supports discovering I/O
+    objects from these extended tarballs.
+  + hwloc-annotate now supports --ri to remove/replace info attributes with
+    a given name.
+  + hwloc-info supports "root" and "all" special locations for dumping
+    information about the root object.
+  + lstopo now supports --append-legend to append custom lines of text
+    to the legend in the graphical output. Thanks to Jirka Hladky for
+    discussing the idea.
+  + hwloc-calc and friends have a more robust parsing of locations given
+    on the command-line and they report useful error messages about it.
+  + Add --whole-system to hwloc-bind, hwloc-calc, hwloc-distances and
+    hwloc-distrib, and add --restrict to hwloc-bind for uniformity among
+    tools.
+* Misc
+  + Calling hwloc_topology_load() or hwloc_topology_set_*() on an already
+    loaded topology now returns an error (deprecated since release 1.6.1).
+  + Fix the initialisation of cpusets and nodesets in Group objects added
+    when inserting PCI hostbridges.
+  + Never merge Group objects that were added explicitly by the user with
+    hwloc_custom_insert_group_object_by_parent().
+  + Add a sanity check during dynamic plugin loading to prevent some
+    crashes when hwloc is dynamically loaded by another plugin mechanisms.
+  + Add --with-hwloc-plugins-path to specify the install/load directories
+    of plugins.
+  + Add the MICSerialNumber info attribute to the root object when running
+    hwloc inside a Xeon Phi to match the same attribute in the MIC OS device
+    when running in the host.
+
+
+Version 1.7.2
+-------------
+* Do not create invalid block OS devices on very old Linux kernel such
+  as RHEL4 2.6.9.
+* Fix PCI subvendor/device IDs.
+* Fix the management of Misc objects inserted by parent.
+  Thanks to Jirka Hladky for reporting the problem.
+* Add a Port<n>State info attribute to OpenFabrics OS devices.
+* Add a MICSerialNumber info attribute to Xeon PHI/MIC OS devices.
+* Improve verbose error messages when failing to load from XML.
+
+
+Version 1.7.1
+-------------
+* Fix a failed assertion in the distance grouping code when loading a XML
+  file that already contains some groups.
+  Thanks to Laercio Lima Pilla for reporting the problem.
+* Remove unexpected Group objects when loading XML topologies with I/O
+  objects and NUMA distances.
+  Thanks to Elena Elkina for reporting the problem and testing patches.
+* Fix PCI link speed discovery when using libpciaccess.
+* Fix invalid libpciaccess virtual function device/vendor IDs when using
+  SR-IOV PCI devices on Linux.
+* Fix GL component build with old NVCtrl releases.
+  Thanks to Jirka Hladky for reporting the problem.
+* Fix embedding breakage caused by libltdl.
+  Thanks to Pavan Balaji for reporting the problem.
+* Always use the system-wide libltdl instead of shipping one inside hwloc.
+* Document issues when enabling plugins while embedding hwloc in another
+  project, in the documentation section Embedding hwloc in Other Software.
+* Add a FAQ entry "How to get useful topology information on NetBSD?"
+  in the documentation.
+* Some fixes in the renaming code for embedding.
+* Miscellaneous minor build fixes.
+
+
+Version 1.7.0
+-------------
+* New operating system backends
+  + Add BlueGene/Q compute node kernel (CNK) support. See the FAQ in the
+    documentation for details. Thanks to Jeff Hammond, Christopher Samuel
+    and Erik Schnetter for their help.
+  + Add NetBSD support, thanks to Aleksej Saushev.
+* New I/O device discovery
+  + Add co-processor OS devices such as "mic0" for Intel Xeon Phi (MIC)
+    on Linux. Thanks to Jerome Vienne for helping.
+  + Add co-processor OS devices such as "cuda0" for NVIDIA CUDA-capable GPUs.
+  + Add co-processor OS devices such as "opencl0d0" for OpenCL GPU devices
+    on the AMD OpenCL implementation.
+  + Add GPU OS devices such as ":0.0" for NVIDIA X11 displays.
+  + Add GPU OS devices such as "nvml0" for NVIDIA GPUs.
+    Thanks to Marwan Abdellah and Stefan Eilemann for helping.
+  These new OS devices have some string info attributes such as CoProcType,
+  GPUModel, etc. to better identify them.
+  See the I/O Devices and Attributes documentation sections for details.
+* New components
+  + Add the "opencl", "cuda", "nvml" and "gl" components for I/O device
+    discovery.
+  + "nvml" also improves the discovery of NVIDIA GPU PCIe link speed.
+  All of these new components may be built as plugins. They may also be
+  disabled entirely by passing --disable-opencl/cuda/nvml/gl to configure.
+  See the I/O Devices, Components and Plugins, and FAQ documentation
+  sections for details.
+* API
+  + Add hwloc_topology_get_flags().
+  + Add hwloc/plugins.h for building external plugins.
+    See the Adding new discovery components and plugins section.
+* Interoperability
+  + Add hwloc/opencl.h, hwloc/nvml.h, hwloc/gl.h and hwloc/intel-mic.h
+    to retrieve the locality of OS devices that correspond to AMD OpenCL
+    GPU devices or indexes, to NVML devices or indexes, to NVIDIA X11
+    displays, or to Intel Xeon Phi (MIC) device indexes.
+  + Add new helpers in hwloc/cuda.h and hwloc/cudart.h to convert
+    between CUDA devices or indexes and hwloc OS devices.
+  + Add hwloc_ibv_get_device_osdev() and clarify the requirements
+    of the OpenFabrics Verbs helpers in hwloc/openfabrics-verbs.h.
+* Tools
+  + hwloc-info is not only a synonym of lstopo -s anymore, it also
+    dumps information about objects given on the command-line.
+* Documentation
+  + Add a section "Existing components and plugins".
+  + Add a list of common OS devices in section "Software devices".
+  + Add a new FAQ entry "Why is lstopo slow?" about lstopo slowness
+    issues because of GPUs.
+  + Clarify the documentation of inline helpers in hwloc/myriexpress.h
+    and hwloc/openfabrics-verbs.h.
+* Misc
+  + Improve cache detection on AIX.
+  + The HWLOC_COMPONENTS variable now excludes the components whose
+    names are prefixed with '-'.
+  + lstopo --ignore PU now works when displaying the topology in
+    graphical and textual mode (not when exporting to XML).
+  + Make sure I/O options always appear in lstopo usage, not only when
+    using pciutils/libpci.
+  + Remove some unneeded Linux specific includes from some interoperability
+    headers.
+  + Fix some inconsistencies in hwloc-distrib and hwloc-assembler-remote
+    manpages. Thanks to Guy Streeter for the report.
+  + Fix a memory leak on AIX when getting memory binding.
+  + Fix many small memory leaks on Linux.
+  + The `libpci' component is now called `pci' but the old name is still
+    accepted in the HWLOC_COMPONENTS variable for backward compatibility.
+
+
+Version 1.6.2
+-------------
+* Use libpciaccess instead of pciutils/libpci by default for I/O discovery.
+  pciutils/libpci is only used if --enable-libpci is given to configure
+  because its GPL license may taint hwloc. See the Installation section
+  in the documentation for details.
+* Fix get_cpubind on Solaris when bound to a single PU with
+  processor_bind(). Thanks to Eugene Loh for reporting the problem
+  and providing a patch.
+
+
+Version 1.6.1
+-------------
+* Fix some crash or buggy detection in the x86 backend when Linux
+  cgroups/cpusets restrict the available CPUs.
+* Fix the pkg-config output with --libs --static.
+  Thanks to Erik Schnetter for reporting one of the problems.
+* Fix the output of hwloc-calc -H --hierarchical when using logical
+  indexes in the output.
+* Calling hwloc_topology_load() multiple times on the same topology
+  is officially deprecated. hwloc will warn in such cases.
+* Add some documentation about existing plugins/components, package
+  dependencies, and I/O devices specification on the command-line.
+
+
+Version 1.6.0
+-------------
+* Major changes
+  + Reorganize the backend infrastructure to support dynamic selection
+    of components and dynamic loading of plugins. For details, see the
+    new documentation section Components and plugins.
+    - The HWLOC_COMPONENTS variable lets one replace the default discovery
+      components.
+    - Dynamic loading of plugins may be enabled with --enable-plugins
+      (except on AIX and Windows). It will build libxml2 and libpci
+      support as separated modules. This helps reducing the dependencies
+      of the core hwloc library when distributed as a binary package.
+* Backends
+  + Add CPUModel detection on Darwin and x86/FreeBSD.
+    Thanks to Robin Scher for providing ways to implement this.
+  + The x86 backend now adds CPUModel info attributes to socket objects
+    created by other backends that do not natively support this attribute.
+  + Fix detection on FreeBSD in case of cpuset restriction. Thanks to
+    Sebastian Kuzminsky for reporting the problem.
+* XML
+  + Add hwloc_topology_set_userdata_import/export_callback(),
+    hwloc_export_obj_userdata() and _userdata_base64() to let
+    applications specify how to save/restore the custom data they placed
+    in the userdata private pointer field of hwloc objects.
+* Tools
+  + Add hwloc-annotate program to add string info attributes to XML
+    topologies.
+  + Add --pid-cmd to hwloc-ps to append the output of a command to each
+    PID line. May be used for showing Open MPI process ranks, see the
+    hwloc-ps(1) manpage for details.
+  + hwloc-bind now exits with an error if binding fails; the executable
+    is not launched unless binding succeeded or --force was given.
+  + Add --quiet to hwloc-calc and hwloc-bind to hide non-fatal error
+    messages.
+  + Fix command-line pid support in windows tools.
+  + All programs accept --verbose as a synonym to -v.
+* Misc
+  + Fix some DIR descriptor leaks on Linux.
+  + Fix I/O device lists when some were filtered out after a XML import.
+  + Fix the removal of I/O objects when importing a I/O-enabled XML topology
+    without any I/O topology flag.
+  + When merging objects with HWLOC_IGNORE_TYPE_KEEP_STRUCTURE or
+    lstopo --merge, compare object types before deciding which one of two
+    identical object to remove (e.g. keep sockets in favor of caches).
+  + Add some GUID- and LID-related info attributes to OpenFabrics
+    OS devices.
+  + Only add CPUType socket attributes on Solaris/Sparc. Other cases
+    don't report reliable information (Solaris/x86), and a replacement
+    is available as the Architecture string info in the Machine object.
+  + Add missing Backend string info on Solaris in most cases.
+  + Document object attributes and string infos in a new Attributes
+    section in the documentation.
+  + Add a section about Synthetic topologies in the documentation.
+
+
+Version 1.5.2 (some of these changes are in v1.6.2 but not in v1.6)
+-------------
+* Use libpciaccess instead of pciutils/libpci by default for I/O discovery.
+  pciutils/libpci is only used if --enable-libpci is given to configure
+  because its GPL license may taint hwloc. See the Installation section
+  in the documentation for details.
+* Fix get_cpubind on Solaris when bound to a single PU with
+  processor_bind(). Thanks to Eugene Loh for reporting the problem
+  and providing a patch.
+* Fix some DIR descriptor leaks on Linux.
+* Fix I/O device lists when some were filtered out after a XML import.
+* Add missing Backend string info on Solaris in most cases.
+* Fix the removal of I/O objects when importing a I/O-enabled XML topology
+  without any I/O topology flag.
+* Fix the output of hwloc-calc -H --hierarchical when using logical
+  indexes in the output.
+* Fix the pkg-config output with --libs --static.
+  Thanks to Erik Schnetter for reporting one of the problems.
+
+
+Version 1.5.1
+-------------
+* Fix block OS device detection on Linux kernel 3.3 and later.
+  Thanks to Guy Streeter for reporting the problem and testing the fix.
+* Fix the cpuid code in the x86 backend (for FreeBSD). Thanks to
+  Sebastian Kuzminsky for reporting problems and testing patches.
+* Fix 64bit detection on FreeBSD.
+* Fix some corner cases in the management of the thissystem flag with
+  respect to topology flags and environment variables.
+* Fix some corner cases in command-line parsing checks in hwloc-distrib
+  and hwloc-distances.
+* Make sure we do not miss some block OS devices on old Linux kernels
+  when a single PCI device has multiple IDE hosts/devices behind it.
+* Do not disable I/O devices or instruction caches in hwloc-assembler output.
+
+
+Version 1.5.0
+-------------
+* Backends
+  + Do not limit the number of processors to 1024 on Solaris anymore.
+  + Gather total machine memory on FreeBSD. Thanks to Cyril Roelandt.
+  + XML topology files do not depend on the locale anymore. Float numbers
+    such as NUMA distances or PCI link speeds now always use a dot as a
+    decimal separator.
+  + Add instruction caches detection on Linux, AIX, Windows and Darwin.
+  + Add get_last_cpu_location() support for the current thread on AIX.
+  + Support binding on AIX when threads or processes were bound with
+    bindprocessor(). Thanks to Hendryk Bockelmann for reporting the issue
+    and testing patches, and to Farid Parpia for explaining the binding
+    interfaces.
+  + Improve AMD topology detection in the x86 backend (for FreeBSD) using
+    the topoext feature.
+* API
+  + Increase HWLOC_API_VERSION to 0x00010500 so that API changes may be
+    detected at build-time.
+  + Add a cache type attribute describing Data, Instruction and Unified
+    caches. Caches with different types but same depth (for instance L1d
+    and L1i) are placed on different levels.
+  + Add hwloc_get_cache_type_depth() to retrieve the hwloc level depth of
+    the given cache depth and type, for instance L1i or L2.
+    It helps disambiguate the case where hwloc_get_type_depth() returns
+    HWLOC_TYPE_DEPTH_MULTIPLE.
+  + Instruction caches are ignored unless HWLOC_TOPOLOGY_FLAG_ICACHES is
+    passed to hwloc_topology_set_flags() before load.
+  + Add hwloc_ibv_get_device_osdev_by_name() OpenFabrics helper in
+    openfabrics-verbs.h to find the hwloc OS device object corresponding to
+    an OpenFabrics device.
+* Tools
+  + Add lstopo-no-graphics, a lstopo built without graphical support to
+    avoid dependencies on external libraries such as Cairo and X11. When
+    supported, graphical outputs are only available in the original lstopo
+    program.
+    - Packagers splitting lstopo and lstopo-no-graphics into different
+      packages are advised to use the alternatives system so that lstopo
+      points to the best available binary.
+  + Instruction caches are enabled in lstopo by default. Use --no-icaches
+    to disable them.
+  + Add -t/--threads to show threads in hwloc-ps.
+* Removal of obsolete components
+  + Remove the old cpuset interface (hwloc/cpuset.h) which is deprecated and
+    superseded by the bitmap API (hwloc/bitmap.h) since v1.1.
+    hwloc_cpuset and nodeset types are still defined, but all hwloc_cpuset_*
+    compatibility wrappers are now gone.
+  + Remove Linux libnuma conversion helpers for the deprecated and
+    broken nodemask_t interface.
+  + Remove support for "Proc" type name, it was superseded by "PU" in v1.0.
+  + Remove hwloc-mask symlinks, it was replaced by hwloc-calc in v1.0.
+* Misc
+  + Fix PCIe 3.0 link speed computation.
+  + Non-printable characters are dropped from strings during XML export.
+  + Fix importing of escaped characters with the minimalistic XML backend.
+  + Assert hwloc_is_thissystem() in several I/O related helpers.
+  + Fix some memory leaks in the x86 backend for FreeBSD.
+  + Minor fixes to ease native builds on Windows.
+  + Limit the number of retries when operating on all threads within a
+    process on Linux if the list of threads is heavily getting modified.
+
+
+Version 1.4.3
+-------------
+* This release is only meant to fix the pciutils license issue when upgrading
+  to hwloc v1.5 or later is not possible. It contains several other minor
+  fixes but ignores many of them that are only in v1.5 or later.
+* Use libpciaccess instead of pciutils/libpci by default for I/O discovery.
+  pciutils/libpci is only used if --enable-libpci is given to configure
+  because its GPL license may taint hwloc. See the Installation section
+  in the documentation for details.
+* Fix PCIe 3.0 link speed computation.
+* Fix importing of escaped characters with the minimalistic XML backend.
+* Fix a memory leak in the x86 backend.
+
+
+Version 1.4.2
+-------------
+* Fix build on Solaris 9 and earlier when fabsf() is not a compiler
+  built-in. Thanks to Igor Galić for reporting the problem.
+* Fix support for more than 32 processors on Windows. Thanks to Hartmut
+  Kaiser for reporting the problem.
+* Fix process-wide binding and cpulocation routines on Linux when some
+  threads disappear in the meantime. Thanks to Vlad Roubtsov for reporting
+  the issue.
+* Make installed scripts executable. Thanks to Jirka Hladky for reporting
+  the problem.
+* Fix libtool revision management when building for Windows. This fix was
+  also released as hwloc v1.4.1.1 Windows builds. Thanks to Hartmut Kaiser
+  for reporting the problem.
+* Fix the __hwloc_inline keyword in public headers when compiling with a
+  C++ compiler.
+* Add Port info attribute to network OS devices inside OpenFabrics PCI
+  devices so as to identify which interface corresponds to which port.
+* Document requirements for interoperability helpers: I/O devices discovery
+  is required for some of them; the topology must match the current host
+  for most of them.
+
+
+Version 1.4.1
+-------------
+* This release contains all changes from v1.3.2.
+* Fix hwloc_alloc_membind, thanks Karl Napf for reporting the issue.
+* Fix memory leaks in some get_membind() functions.
+* Fix helpers converting from Linux libnuma to hwloc (hwloc/linux-libnuma.h)
+  in case of out-of-order NUMA node ids.
+* Fix some overzealous assertions in the distance grouping code.
+* Workaround BIOS reporting empty I/O locality in CUDA and OpenFabrics
+  helpers on Linux. Thanks to Albert Solernou for reporting the problem.
+* Install a valgrind suppressions file hwloc-valgrind.supp (see the FAQ).
+* Fix memory binding documentation. Thanks to Karl Napf for reporting the
+  issues.
+
+
+Version 1.4.0 (does not contain all v1.3.2 changes)
+-------------
+* Major features
+  + Add "custom" interface and "assembler" tools to build multi-node
+    topology. See the Multi-node Topologies section in the documentation
+    for details.
+* Interface improvements
+  + Add symmetric_subtree object attribute to ease assumptions when consulting
+    regular symmetric topologies.
+  + Add a CPUModel and CPUType info attribute to Socket objects on Linux
+    and Solaris.
+  + Add hwloc_get_obj_index_inside_cpuset() to retrieve the "logical" index
+    of an object within a subtree of the topology.
+  + Add more NVIDIA CUDA helpers in cuda.h and cudart.h to find hwloc objects
+    corresponding to CUDA devices.
+* Discovery improvements
+  + Add a group object above partial distance matrices to make sure
+    the matrices are available in the final topology, except when this
+    new object would contradict the existing hierarchy.
+  + Grouping by distances now also works when loading from XML.
+  + Fix some corner cases in object insertion, for instance when dealing
+    with NUMA nodes without any CPU.
+* Backends
+  + Implement hwloc_get_area_membind() on Linux.
+  + Honor I/O topology flags when importing from XML.
+  + Further improve XML-related error checking and reporting.
+  + Hide synthetic topology error messages unless HWLOC_SYNTHETIC_VERBOSE=1.
+* Tools
+  + Add synthetic exporting of symmetric topologies to lstopo.
+  + lstopo --horiz and --vert can now be applied to some specific object types.
+  + lstopo -v -p now displays distance matrices with physical indexes.
+  + Add hwloc-distances utility to list distances.
+* Documentation
+  + Fix and/or document the behavior of most inline functions in hwloc/helper.h
+    when the topology contains some I/O or Misc objects.
+  + Backend documentation enhancements.
+* Bug fixes
+  + Fix missing last bit in hwloc_linux_get_thread_cpubind().
+    Thanks to Carolina Gómez-Tostón Gutiérrez for reporting the issue.
+  + Fix FreeBSD build without cpuid support.
+  + Fix several Windows build issues.
+  + Fix inline keyword definition in public headers.
+  + Fix dependencies in the embedded library.
+  + Improve visibility support detection. Thanks to Dave Love for providing
+    the patch.
+  + Remove references to internal symbols in the tools.
+
+
+Version 1.3.3
+-------------
+* This release is only meant to fix the pciutils license issue when upgrading
+  to hwloc v1.4 or later is not possible. It contains several other minor
+  fixes but ignores many of them that are only in v1.4 or later.
+* Use libpciaccess instead of pciutils/libpci by default for I/O discovery.
+  pciutils/libpci is only used if --enable-libpci is given to configure
+  because its GPL license may taint hwloc. See the Installation section
+  in the documentation for details.
+
+
+Version 1.3.2
+-------------
+* Fix missing last bit in hwloc_linux_get_thread_cpubind().
+  Thanks to Carolina Gómez-Tostón Gutiérrez for reporting the issue.
+* Fix build with -mcmodel=medium. Thanks to Devendar Bureddy for reporting
+  the issue.
+* Fix build with Solaris Studio 12 compiler when XML is disabled.
+  Thanks to Paul H. Hargrove for reporting the problem.
+* Fix installation with old GNU sed, for instance on Red Hat 8.
+  Thanks to Paul H. Hargrove for reporting the problem.
+* Fix PCI locality when Linux cgroups restrict the available CPUs.
+* Fix floating point issue when grouping by distance on mips64 architecture.
+  Thanks to Paul H. Hargrove for reporting the problem.
+* Fix conversion from/to Linux libnuma when some NUMA nodes have no memory.
+* Fix support for gccfss compilers with broken ffs() support. Thanks to
+  Paul H. Hargrove for reporting the problem and providing a patch.
+* Fix FreeBSD build without cpuid support.
+* Fix several Windows build issues.
+* Fix inline keyword definition in public headers.
+* Fix dependencies in the embedded library.
+* Detect when a compiler such as xlc may not report compile errors
+  properly, causing some configure checks to be wrong. Thanks to
+  Paul H. Hargrove for reporting the problem and providing a patch.
+* Improve visibility support detection. Thanks to Dave Love for providing
+  the patch.
+* Remove references to internal symbols in the tools.
+* Fix installation on systems with limited command-line size.
+  Thanks to Paul H. Hargrove for reporting the problem.
+* Further improve XML-related error checking and reporting.
+
+
+Version 1.3.1
+-------------
+* Fix pciutils detection with pkg-config when not installed in standard
+  directories.
+* Fix visibility options detection with the Solaris Studio compiler.
+  Thanks to Igor Galić and Terry Dontje for reporting the problems.
+* Fix support for old Linux sched.h headers such as those found
+  on Red Hat 8. Thanks to Paul H. Hargrove for reporting the problems.
+* Fix inline and attribute support for Solaris compilers. Thanks to
+  Dave Love for reporting the problems.
+* Print a short summary at the end of the configure output. Thanks to
+  Stefan Eilemann for the suggestion.
+* Add --disable-libnuma configure option to disable libnuma-based
+  memory binding support on Linux.  Thanks to Rayson Ho for the
+  suggestion.
+* Make hwloc's configure script properly obey $PKG_CONFIG.  Thanks to
+  Nathan Phillip Brink for raising the issue.
+* Silence some harmless pciutils warnings, thanks to Paul H. Hargrove
+  for reporting the problem.
+* Fix the documentation with respect to hwloc_pid_t and hwloc_thread_t
+  being either pid_t and pthread_t on Unix, or HANDLE on Windows.
+
+
+Version 1.3.0
+-------------
+* Major features
+  + Add I/O devices and bridges to the topology using the pciutils
+    library. Only enabled after setting the relevant flag with
+    hwloc_topology_set_flags() before hwloc_topology_load(). See the
+    I/O Devices section in the documentation for details.
+* Discovery improvements
+  + Add associativity to the cache attributes.
+  + Add support for s390/z11 "books" on Linux.
+  + Add the HWLOC_GROUPING_ACCURACY environment variable to relax
+    distance-based grouping constraints. See the Environment Variables
+    section in the documentation for details about grouping behavior
+    and configuration.
+  + Allow user-given distance matrices to remove or replace those
+    discovered by the OS backend.
+* XML improvements
+  + XML is now always supported: a minimalistic custom import/export
+    code is used when libxml2 is not available. It is only guaranteed
+    to read XML files generated by hwloc.
+  + hwloc_topology_export_xml() and export_xmlbuffer() now return an
+    integer.
+  + Add hwloc_free_xmlbuffer() to free the buffer allocated by
+    hwloc_topology_export_xmlbuffer().
+  + Hide XML topology error messages unless HWLOC_XML_VERBOSE=1.
+* Minor API updates
+  + Add hwloc_obj_add_info to customize object info attributes.
+* Tools
+  + lstopo now displays I/O devices by default. Several options are
+    added to configure the I/O discovery.
+  + hwloc-calc and hwloc-bind now accept I/O devices as input.
+  + Add --restrict option to hwloc-calc and hwloc-distribute.
+  + Add --sep option to change the output field separator in hwloc-calc.
+  + Add --whole-system option to hwloc-ps.
+
+
+Version 1.2.2
+-------------
+* Fix build on AIX 5.2, thanks Utpal Kumar Ray for the report.
+* Fix XML import of very large page sizes or counts on 32bits platform,
+  thanks to Karsten Hopp for the RedHat ticket.
+* Fix crash when administrator limitations such as Linux cgroup require
+  to restrict distance matrices. Thanks to Ake Sandgren for reporting the
+  problem.
+* Fix the removal of objects such as AMD Magny-Cours dual-node sockets
+  in case of administrator restrictions.
+* Improve error reporting and messages in case of wrong synthetic topology
+  description.
+* Several other minor internal fixes and documentation improvements.
+
+
+Version 1.2.1
+-------------
+* Improve support of AMD Bulldozer "Compute-Unit" modules by detecting
+  logical processors with different core IDs on Linux.
+* Fix hwloc-ps crash when listing processes from another Linux cpuset.
+  Thanks to Carl Smith for reporting the problem.
+* Fix build on AIX and Solaris. Thanks to Carl Smith and Andreas Kupries
+  for reporting the problems.
+* Fix cache size detection on Darwin. Thanks to Erkcan Özcan for reporting
+  the problem.
+* Make configure fail if --enable-xml or --enable-cairo is given and
+  proper support cannot be found. Thanks to Andreas Kupries for reporting
+  the XML problem.
+* Fix spurious L1 cache detection on AIX. Thanks to Hendryk Bockelmann
+  for reporting the problem.
+* Fix hwloc_get_last_cpu_location(THREAD) on Linux. Thanks to Gabriele
+  Fatigati for reporting the problem.
+* Fix object distance detection on Solaris.
+* Add pthread_self weak symbol to ease static linking.
+* Minor documentation fixes.
+
+
+Version 1.2.0
+-------------
+* Major features
+  + Expose latency matrices in the API as an array of distance structures
+    within objects. Add several helpers to find distances.
+  + Add hwloc_topology_set_distance_matrix() and environment variables
+    to provide a matrix of distances between a given set of objects.
+  + Add hwloc_get_last_cpu_location() and hwloc_get_proc_last_cpu_location()
+    to retrieve the processors where a process or thread recently ran.
+    - Add the corresponding --get-last-cpu-location option to hwloc-bind.
+  + Add hwloc_topology_restrict() to restrict an existing topology to a
+    given cpuset.
+    - Add the corresponding --restrict option to lstopo.
+* Minor API updates
+  + Add hwloc_bitmap_list_sscanf/snprintf/asprintf to convert between bitmaps
+    and strings such as 4-5,7-9,12,15-
+  + hwloc_bitmap_set/clr_range() now support infinite ranges.
+  + Clarify the difference between inserting Misc objects by cpuset or by
+    parent.
+  + hwloc_insert_misc_object_by_cpuset() now returns NULL in case of error.
+* Discovery improvements
+  + x86 backend (for freebsd): add x2APIC support
+  + Support standard device-tree phandle, to get better support on e.g. ARM
+    systems providing it.
+  + Detect cache size on AIX. Thanks Christopher and IBM.
+  + Improve grouping to support asymmetric topologies.
+* Tools
+  + Command-line tools now support "all" and "root" special locations
+    consisting in the entire topology, as well as type names with depth
+    attributes such as L2 or Group4.
+  + hwloc-calc improvements:
+    - Add --number-of/-N option to report the number of objects of a given
+      type or depth.
+    - -I is now equivalent to --intersect for listing the indexes of
+      objects of a given type or depth that intersects the input.
+    - Add -H to report the output as a hierarchical combination of types
+      and depths.
+  + Add --thissystem to lstopo.
+  + Add lstopo-win, a console-less lstopo variant on Windows.
+* Miscellaneous
+  + Remove C99 usage from code base.
+  + Rename hwloc-gather-topology.sh into hwloc-gather-topology
+  + Fix AMD cache discovery on freebsd when there is no L3 cache, thanks
+    Andriy Gapon for the fix.
+
+
+Version 1.1.2
+-------------
+* Fix a segfault in the distance-based grouping code when some objects
+  are not placed in any group. Thanks to Bernd Kallies for reporting
+  the problem and providing a patch.
+* Fix the command-line parsing of hwloc-bind --mempolicy interleave.
+  Thanks to Guy Streeter for reporting the problem.
+* Stop truncating the output in hwloc_obj_attr_snprintf() and in the
+  corresponding lstopo output. Thanks to Guy Streeter for reporting the
+  problem.
+* Fix object levels ordering in synthetic topologies.
+* Fix potential incoherency between device tree and kernel information,
+  when SMT is disabled on Power machines.
+* Fix and document the behavior of hwloc_topology_set_synthetic() in case
+  of invalid argument. Thanks to Guy Streeter for reporting the problem.
+* Add some verbose error message reporting when it looks like the OS
+  gives erroneous information.
+* Do not include unistd.h and stdint.h in public headers on Windows.
+* Move config.h files into their own subdirectories to avoid name
+  conflicts when AC_CONFIG_HEADERS adds -I's for them.
+* Remove the use of declaring variables inside "for" loops.
+* Some other minor fixes.
+* Many minor documentation fixes.
+
+
+Version 1.1.1
+-------------
+* Add hwloc_get_api_version() which returns the version of hwloc used
+  at runtime. Thanks to Guy Streeter for the suggestion.
+* Fix the number of hugepages reported for NUMA nodes on Linux.
+* Fix hwloc_bitmap_to_ulong() right after allocating the bitmap.
+  Thanks to Bernd Kallies for reporting the problem.
+* Fix hwloc_bitmap_from_ith_ulong() to properly zero the first ulong.
+  Thanks to Guy Streeter for reporting the problem.
+* Fix hwloc_get_membind_nodeset() on Linux.
+  Thanks to Bernd Kallies for reporting the problem and providing a patch.
+* Fix some file descriptor leaks in the Linux discovery.
+* Fix the minimum width of NUMA nodes, caches and the legend in the graphical
+  lstopo output. Thanks to Jirka Hladky for reporting the problem.
+* Various fixes to bitmap conversion from/to taskset-strings.
+* Fix and document snprintf functions behavior when the buffer size is too
+  small or zero. Thanks to Guy Streeter for reporting the problem.
+* Fix configure to avoid spurious enabling of the cpuid backend.
+  Thanks to Tim Anderson for reporting the problem.
+* Cleanup error management in hwloc-gather-topology.sh.
+  Thanks to Jirka Hladky for reporting the problem and providing a patch.
+* Add a manpage and usage for hwloc-gather-topology.sh on Linux.
+  Thanks to Jirka Hladky for providing a patch.
+* Memory binding documentation enhancements.
+
+
+Version 1.1.0
+-------------
+
+* API
+  + Increase HWLOC_API_VERSION to 0x00010100 so that API changes may be
+    detected at build-time.
+  + Add a memory binding interface.
+  + The cpuset API (hwloc/cpuset.h) is now deprecated. It is replaced by
+    the bitmap API (hwloc/bitmap.h) which offers the same features with more
+    generic names since it applies to CPU sets, node sets and more.
+    Backward compatibility with the cpuset API and ABI is still provided but
+    it will be removed in a future release.
+    Old types (hwloc_cpuset_t, ...) are still available as a way to clarify
+    what kind of hwloc_bitmap_t each API function manipulates.
+    Upgrading to the new API only requires to replace hwloc_cpuset_ function
+    calls with the corresponding hwloc_bitmap_ calls, with the following
+    renaming exceptions:
+    - hwloc_cpuset_cpu -> hwloc_bitmap_only
+    - hwloc_cpuset_all_but_cpu -> hwloc_bitmap_allbut
+    - hwloc_cpuset_from_string -> hwloc_bitmap_sscanf
+  + Add an `infos' array in each object to store couples of info names and
+    values. It enables generic storage of things like the old dmi board infos
+    that were previously stored in machine specific attributes.
+  + Add linesize cache attribute.
+* Features
+  + Bitmaps (and thus CPU sets and node sets) are dynamically (re-)allocated,
+    the maximal number of CPUs (HWLOC_NBMAXCPUS) has been removed.
+  + Improve the distance-based grouping code to better support irregular
+    distance matrices.
+  + Add support for device-tree to get cache information (useful on Power
+    architectures).
+* Helpers
+  + Add NVIDIA CUDA helpers in cuda.h and cudart.h to ease interoperability
+    with CUDA Runtime and Driver APIs.
+  + Add Myrinet Express helper in myriexpress.h to ease interoperability.
+* Tools
+  + lstopo now displays physical/OS indexes by default in graphical mode
+    (use -l to switch back to logical indexes). The textual output still uses
+    logical by default (use -p to switch to physical indexes).
+  + lstopo prefixes logical indexes with `L#' and physical indexes with `P#'.
+    Physical indexes are also printed as `P#N' instead of `phys=N' within
+    object attributes (in parentheses).
+  + Add a legend at the bottom of the lstopo graphical output, use --no-legend
+    to remove it.
+  + Add hwloc-ps to list process' bindings.
+  + Add --membind and --mempolicy options to hwloc-bind.
+  + Improve tools command-line options by adding a generic --input option
+    (and more) which replaces the old --xml, --synthetic and --fsys-root.
+  + Cleanup lstopo output configuration by adding --output-format.
+  + Add --intersect in hwloc-calc, and replace --objects with --largest.
+  + Add the ability to work on standard input in hwloc-calc.
+  + Add --from, --to and --at in hwloc-distrib.
+  + Add taskset-specific functions and command-line tools options to
+    manipulate CPU set strings in the format of the taskset program.
+  + Install hwloc-gather-topology.sh on Linux.
+
+
+Version 1.0.3
+-------------
+
+* Fix support for Linux cpuset when emulated by a cgroup mount point.
+* Remove unneeded runtime dependency on libibverbs.so in the library and
+  all utils programs.
+* Fix hwloc_cpuset_to_linux_libnuma_ulongs in case of non-linear OS-indexes
+  for NUMA nodes.
+* lstopo now displays physical/OS indexes by default in graphical mode
+  (use -l to switch back to logical indexes). The textual output still uses
+  logical by default (use -p to switch to physical indexes).
+
+
+Version 1.0.2
+-------------
+
+* Public headers can now be included directly from C++ programs.
+* Solaris fix for non-contiguous cpu numbers.  Thanks to Rolf vandeVaart for
+  reporting the issue.
+* Darwin 10.4 fix.  Thanks to Olivier Cessenat for reporting the issue.
+* Revert 1.0.1 patch that ignored sockets with unknown ID values since it
+  only slightly helped POWER7 machines with old Linux kernels while it
+  prevents recent kernels from getting the complete POWER7 topology.
+* Fix hwloc_get_common_ancestor_obj().
+* Remove arch-specific bits in public headers.
+* Some fixes in the lstopo graphical output.
+* Various man page clarifications and minor updates.
+
+
+Version 1.0.1
+-------------
+
+* Various Solaris fixes.  Thanks to Yannick Martin for reporting the issue.
+* Fix "non-native" builds on x86 platforms (e.g., when building 32
+  bit executables with compilers that natively build 64 bit).
+* Ignore sockets with unknown ID values (which fixes issues on POWER7
+  machines).  Thanks to Greg Bauer for reporting the issue.
+* Various man page clarifications and minor updates.
+* Fixed memory leaks in hwloc_setup_group_from_min_distance_clique().
+* Fix cache type filtering on MS Windows 7.  Thanks to Αλέξανδρος
+  Παπαδογιαννάκ for reporting the issue.
+* Fixed warnings when compiling with -DNDEBUG.
+
+
+Version 1.0.0
+-------------
+
+* The ABI of the library has changed.
+* Backend updates
+  + Add FreeBSD support.
+  + Add x86 cpuid based backend.
+  + Add Linux cgroup support to the Linux cpuset code.
+  + Support binding of entire multithreaded process on Linux.
+  + Fix and enable Group support in Windows.
+  + Cleanup XML export/import.
+* Objects
+  + HWLOC_OBJ_PROC is renamed into HWLOC_OBJ_PU for "Processing Unit",
+    its stringified type name is now "PU".
+  + Use new HWLOC_OBJ_GROUP objects instead of MISC when grouping
+    objects according to NUMA distances or arbitrary OS aggregation.
+  + Rework memory attributes.
+  + Add different cpusets in each object to specify processors that
+    are offline, unavailable, ...
+  + Cleanup the storage of object names and DMI infos.
+* Features
+  + Add support for looking up specific PID topology information.
+  + Add hwloc_topology_export_xml() to export the topology in a XML file.
+  + Add hwloc_topology_get_support() to retrieve the supported features
+    for the current topology context.
+  + Support non-SYSTEM object as the root of the tree, use MACHINE in
+    most common cases.
+  + Add hwloc_get_*cpubind() routines to retrieve the current binding
+    of processes and threads.
+* API
+  + Add HWLOC_API_VERSION to help detect the currently used API version.
+  + Add missing ending "e" to *compare* functions.
+  + Add several routines to emulate PLPA functions.
+  + Rename and rework the cpuset and/or/xor/not/clear operators to output
+    their result in a dedicated argument instead of modifying one input.
+  + Deprecate hwloc_obj_snprintf() in favor of hwloc_obj_type/attr_snprintf().
+  + Clarify the use of parent and ancestor in the API, do not use father.
+  + Replace hwloc_get_system_obj() with hwloc_get_root_obj().
+  + Return -1 instead of HWLOC_OBJ_TYPE_MAX in the API since the latter
+    isn't public.
+  + Relax constraints in hwloc_obj_type_of_string().
+  + Improve displaying of memory sizes.
+  + Add 0x prefix to cpuset strings.
+* Tools
+  + lstopo now displays logical indexes by default, use --physical to
+    revert back to OS/physical indexes.
+  + Add colors in the lstopo graphical outputs to distinguish between online,
+    offline, reserved, ... objects.
+  + Extend lstopo to show cpusets, filter objects by type, ...
+  + Renamed hwloc-mask into hwloc-calc which supports many new options.
+* Documentation
+  + Add a hwloc(7) manpage containing general information.
+  + Add documentation about how to switch from PLPA to hwloc.
+  + Cleanup the distributed documentation files.
+* Miscellaneous
+  + Many compilers warning fixes.
+  + Cleanup the ABI by using the visibility attribute.
+  + Add project embedding support.
+
+
+Version 0.9.4 (unreleased)
+--------------------------
+
+* Fix resetting colors to normal in lstopo -.txt output.
+* Fix Linux pthread_t binding error report.
+
+
+Version 0.9.3
+-------------
+
+* Fix autogen.sh to work with Autoconf 2.63.
+* Fix various crashes in particular conditions:
+  - xml files with root attributes
+  - offline CPUs
+  - partial sysfs support
+  - unparseable /proc/cpuinfo
+  - ignoring NUMA level while Misc level have been generated
+* Tweak documentation a bit
+* Do not require the pthread library for binding the current thread on Linux
+* Do not erroneously consider the sched_setaffinity prototype is the old version
+  when there is actually none.
+* Fix _syscall3 compilation on archs for which we do not have the
+  sched_setaffinity system call number.
+* Fix AIX binding.
+* Fix libraries dependencies: now only lstopo depends on libtermcap, fix
+  binutils-gold link
+* Have make check always build and run hwloc-hello.c
+* Do not limit size of a cpuset.
+
+
+Version 0.9.2
+-------------
+
+* Trivial documentation changes.
+
+
+Version 0.9.1
+-------------
+
+* Re-branded to "hwloc" and moved to the Open MPI project, relicensed under the
+  BSD license.
+* The prefix of all functions and tools is now hwloc, and some public
+  functions were also renamed for real.
+* Group NUMA nodes into Misc objects according to their physical distance
+  that may be reported by the OS/BIOS.
+  May be ignored by setting HWLOC_IGNORE_DISTANCES=1 in the environment.
+* Ignore offline CPUs on Solaris.
+* Improved binding support on AIX.
+* Add HP-UX support.
+* CPU sets are now allocated/freed dynamically.
+* Add command line options to tune the lstopo graphical output, add
+  semi-graphical textual output
+* Extend topobind to support multiple cpusets or objects on the command
+  line as topomask does.
+* Add an Infiniband-specific helper hwloc/openfabrics-verbs.h to retrieve
+  the physical location of IB devices.
+
+
+Version 0.9 (libtopology)
+-------------------------
+
+* First release.
diff --git a/src/3rdparty/hwloc/README b/src/3rdparty/hwloc/README
new file mode 100644
index 000000000..5567b4d14
--- /dev/null
+++ b/src/3rdparty/hwloc/README
@@ -0,0 +1,85 @@
+Introduction
+
+The Hardware Locality (hwloc) software project aims at easing the process of
+discovering hardware resources in parallel architectures. It offers
+command-line tools and a C API for consulting these resources, their locality,
+attributes, and interconnection. hwloc primarily aims at helping
+high-performance computing (HPC) applications, but is also applicable to any
+project seeking to exploit code and/or data locality on modern computing
+platforms.
+
+hwloc is actually made of two subprojects distributed together:
+
+  * The original hwloc project for describing the internals of computing nodes.
+ It is described in detail starting at section Hardware Locality (hwloc)
+ Introduction.
+  * The network-oriented companion called netloc (Network Locality), described
+ in detail starting with section Network Locality (netloc).
+
+See also the Related pages tab above for links to other sections.
+
+Netloc may be disabled, but the original hwloc cannot. Both hwloc and netloc
+APIs are documented after these sections.
+
+Installation
+
+hwloc (http://www.open-mpi.org/projects/hwloc/) is available under the BSD
+license. It is hosted as a sub-project of the overall Open MPI project (http://
+www.open-mpi.org/). Note that hwloc does not require any functionality from
+Open MPI -- it is a wholly separate (and much smaller!) project and code base.
+It just happens to be hosted as part of the overall Open MPI project.
+
+Basic Installation
+
+Installation is the fairly common GNU-based process:
+
+shell$ ./configure --prefix=...
+shell$ make
+shell$ make install
+
+hwloc- and netloc-specific configure options and requirements are documented in
+sections hwloc Installation and Netloc Installation respectively.
+
+Also note that if you install supplemental libraries in non-standard locations,
+hwloc's configure script may not be able to find them without some help. You
+may need to specify additional CPPFLAGS, LDFLAGS, or PKG_CONFIG_PATH values on
+the configure command line.
+
+For example, if libpciaccess was installed into /opt/pciaccess, hwloc's
+configure script may not find it by default. Try adding PKG_CONFIG_PATH to the
+./configure command line, like this:
+
+./configure PKG_CONFIG_PATH=/opt/pciaccess/lib/pkgconfig ...
+
+Running the "lstopo" tool is a good way to check as a graphical output whether
+hwloc properly detected the architecture of your node. Netloc command-line
+tools can be used to display the network topology interconnecting your nodes.
+
+Installing from a Git clone
+
+Additionally, the code can be directly cloned from Git:
+
+shell$ git clone https://github.com/open-mpi/hwloc.git
+shell$ cd hwloc
+shell$ ./autogen.sh
+
+Note that GNU Autoconf >=2.63, Automake >=1.11 and Libtool >=2.2.6 are required
+when building from a Git clone.
+
+Nightly development snapshots are available on the web site, they can be
+configured and built without any need for Git or GNU Autotools.
+
+Questions and Bugs
+
+Bugs should be reported in the tracker (https://github.com/open-mpi/hwloc/
+issues). Opening a new issue automatically displays lots of hints about how to
+debug and report issues.
+
+Questions may be sent to the users or developers mailing lists (http://
+www.open-mpi.org/community/lists/hwloc.php).
+
+There is also a #hwloc IRC channel on Freenode (irc.freenode.net).
+
+
+
+See https://www.open-mpi.org/projects/hwloc/doc/ for more hwloc documentation.
diff --git a/src/3rdparty/hwloc/VERSION b/src/3rdparty/hwloc/VERSION
new file mode 100644
index 000000000..5ebc6bb47
--- /dev/null
+++ b/src/3rdparty/hwloc/VERSION
@@ -0,0 +1,47 @@
+# This is the VERSION file for hwloc, describing the precise version
+# of hwloc in this distribution.  The various components of the version
+# number below are combined to form a single version number string.
+
+# major, minor, and release are generally combined in the form
+# <major>.<minor>.<release>.  If release is zero, then it is omitted.
+
+# Please update HWLOC_VERSION* in contrib/windows/hwloc_config.h too.
+
+major=2
+minor=0
+release=4
+
+# greek is used for alpha or beta release tags.  If it is non-empty,
+# it will be appended to the version number.  It does not have to be
+# numeric.  Common examples include a1 (alpha release 1), b1 (beta
+# release 1), sc2005 (Super Computing 2005 release).  The only
+# requirement is that it must be entirely printable ASCII characters
+# and have no white space.
+
+greek=
+
+# The date when this release was created
+
+date="Jun 03, 2019"
+
+# If snapshot=1, then use the value from snapshot_version as the
+# entire hwloc version (i.e., ignore major, minor, release, and
+# greek).  This is only set to 1 when making snapshot tarballs.
+snapshot=0
+snapshot_version=${major}.${minor}.${release}${greek}-git
+
+# The shared library version of hwloc's public library.  This version
+# is maintained in accordance with the "Library Interface Versions"
+# chapter from the GNU Libtool documentation.  Notes:
+
+# 1. Since version numbers are associated with *releases*, the version
+# number maintained on the hwloc git master (and developer branches)
+# is always 0:0:0.
+
+# 2. Version numbers are described in the Libtool current:revision:age
+# format.
+
+libhwloc_so_version=15:3:0
+libnetloc_so_version=0:0:0
+
+# Please also update the <TargetName> lines in contrib/windows/libhwloc.vcxproj
diff --git a/src/3rdparty/hwloc/include/hwloc.h b/src/3rdparty/hwloc/include/hwloc.h
new file mode 100644
index 000000000..ee6da6fd1
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc.h
@@ -0,0 +1,2270 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2019 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/*=====================================================================
+ *                 PLEASE GO READ THE DOCUMENTATION!
+ *         ------------------------------------------------
+ *               $tarball_directory/doc/doxygen-doc/
+ *                                or
+ *           http://www.open-mpi.org/projects/hwloc/doc/
+ *=====================================================================
+ *
+ * FAIR WARNING: Do NOT expect to be able to figure out all the
+ * subtleties of hwloc by simply reading function prototypes and
+ * constant descriptions here in this file.
+ *
+ * Hwloc has wonderful documentation in both PDF and HTML formats for
+ * your reading pleasure.  The formal documentation explains a LOT of
+ * hwloc-specific concepts, provides definitions, and discusses the
+ * "big picture" for many of the things that you'll find here in this
+ * header file.
+ *
+ * The PDF/HTML documentation was generated via Doxygen; much of what
+ * you'll see in there is also here in this file.  BUT THERE IS A LOT
+ * THAT IS IN THE PDF/HTML THAT IS ***NOT*** IN hwloc.h!
+ *
+ * There are entire paragraph-length descriptions, discussions, and
+ * pretty pictures to explain subtle corner cases, provide concrete
+ * examples, etc.
+ *
+ * Please, go read the documentation.  :-)
+ *
+ * Moreover there are several examples of hwloc use under doc/examples
+ * in the source tree.
+ *
+ *=====================================================================*/
+
+/** \file
+ * \brief The hwloc API.
+ *
+ * See hwloc/bitmap.h for bitmap specific macros.
+ * See hwloc/helper.h for high-level topology traversal helpers.
+ * See hwloc/inlines.h for the actual inline code of some functions below.
+ * See hwloc/export.h for exporting topologies to XML or to synthetic descriptions.
+ * See hwloc/distances.h for querying and modifying distances between objects.
+ * See hwloc/diff.h for manipulating differences between similar topologies.
+ */
+
+#ifndef HWLOC_H
+#define HWLOC_H
+
+#include <hwloc/autogen/config.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+
+/*
+ * Symbol transforms
+ */
+#include <hwloc/rename.h>
+
+/*
+ * Bitmap definitions
+ */
+
+#include <hwloc/bitmap.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_api_version API version
+ * @{
+ */
+
+/** \brief Indicate at build time which hwloc API version is being used.
+ *
+ * This number is updated to (X<<16)+(Y<<8)+Z when a new release X.Y.Z
+ * actually modifies the API.
+ *
+ * Users may check for available features at build time using this number
+ * (see \ref faq_upgrade).
+ *
+ * \note This should not be confused with HWLOC_VERSION, the library version.
+ * Two stable releases of the same series usually have the same ::HWLOC_API_VERSION
+ * even if their HWLOC_VERSION are different.
+ */
+#define HWLOC_API_VERSION 0x00020000
+
+/** \brief Indicate at runtime which hwloc API version was used at build time.
+ *
+ * Should be ::HWLOC_API_VERSION if running on the same version.
+ */
+HWLOC_DECLSPEC unsigned hwloc_get_api_version(void);
+
+/** \brief Current component and plugin ABI version (see hwloc/plugins.h) */
+#define HWLOC_COMPONENT_ABI 5
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_sets Object Sets (hwloc_cpuset_t and hwloc_nodeset_t)
+ *
+ * Hwloc uses bitmaps to represent two distinct kinds of object sets:
+ * CPU sets (::hwloc_cpuset_t) and NUMA node sets (::hwloc_nodeset_t).
+ * These types are both typedefs to a common back end type
+ * (::hwloc_bitmap_t), and therefore all the hwloc bitmap functions
+ * are applicable to both ::hwloc_cpuset_t and ::hwloc_nodeset_t (see
+ * \ref hwlocality_bitmap).
+ *
+ * The rationale for having two different types is that even though
+ * the actions one wants to perform on these types are the same (e.g.,
+ * enable and disable individual items in the set/mask), they're used
+ * in very different contexts: one for specifying which processors to
+ * use and one for specifying which NUMA nodes to use.  Hence, the
+ * name difference is really just to reflect the intent of where the
+ * type is used.
+ *
+ * @{
+ */
+
+/** \brief A CPU set is a bitmap whose bits are set according to CPU
+ * physical OS indexes.
+ *
+ * It may be consulted and modified with the bitmap API as any
+ * ::hwloc_bitmap_t (see hwloc/bitmap.h).
+ *
+ * Each bit may be converted into a PU object using
+ * hwloc_get_pu_obj_by_os_index().
+ */
+typedef hwloc_bitmap_t hwloc_cpuset_t;
+/** \brief A non-modifiable ::hwloc_cpuset_t. */
+typedef hwloc_const_bitmap_t hwloc_const_cpuset_t;
+
+/** \brief A node set is a bitmap whose bits are set according to NUMA
+ * memory node physical OS indexes.
+ *
+ * It may be consulted and modified with the bitmap API as any
+ * ::hwloc_bitmap_t (see hwloc/bitmap.h).
+ * Each bit may be converted into a NUMA node object using
+ * hwloc_get_numanode_obj_by_os_index().
+ *
+ * When binding memory on a system without any NUMA node,
+ * the single main memory bank is considered as NUMA node #0.
+ *
+ * See also \ref hwlocality_helper_nodeset_convert.
+ */
+typedef hwloc_bitmap_t hwloc_nodeset_t;
+/** \brief A non-modifiable ::hwloc_nodeset_t.
+ */
+typedef hwloc_const_bitmap_t hwloc_const_nodeset_t;
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_types Object Types
+ * @{
+ */
+
+/** \brief Type of topology object.
+ *
+ * \note Do not rely on the ordering or completeness of the values as new ones
+ * may be defined in the future!  If you need to compare types, use
+ * hwloc_compare_types() instead.
+ */
+#define HWLOC_OBJ_TYPE_MIN HWLOC_OBJ_MACHINE /**< \private Sentinel value */
+typedef enum {
+  HWLOC_OBJ_MACHINE,	/**< \brief Machine.
+			  * A set of processors and memory with cache
+			  * coherency.
+			  *
+			  * This type is always used for the root object of a topology,
+			  * and never used anywhere else.
+			  * Hence its parent is always \c NULL.
+			  */
+
+  HWLOC_OBJ_PACKAGE,	/**< \brief Physical package.
+			  * The physical package that usually gets inserted
+			  * into a socket on the motherboard.
+			  * A processor package usually contains multiple cores.
+			  */
+  HWLOC_OBJ_CORE,	/**< \brief Core.
+			  * A computation unit (may be shared by several
+			  * logical processors).
+			  */
+  HWLOC_OBJ_PU,		/**< \brief Processing Unit, or (Logical) Processor.
+			  * An execution unit (may share a core with some
+			  * other logical processors, e.g. in the case of
+			  * an SMT core).
+			  *
+			  * This is the smallest object representing CPU resources,
+			  * it cannot have any child except Misc objects.
+			  *
+			  * Objects of this kind are always reported and can
+			  * thus be used as fallback when others are not.
+			  */
+
+  HWLOC_OBJ_L1CACHE,	/**< \brief Level 1 Data (or Unified) Cache. */
+  HWLOC_OBJ_L2CACHE,	/**< \brief Level 2 Data (or Unified) Cache. */
+  HWLOC_OBJ_L3CACHE,	/**< \brief Level 3 Data (or Unified) Cache. */
+  HWLOC_OBJ_L4CACHE,	/**< \brief Level 4 Data (or Unified) Cache. */
+  HWLOC_OBJ_L5CACHE,	/**< \brief Level 5 Data (or Unified) Cache. */
+
+  HWLOC_OBJ_L1ICACHE,	/**< \brief Level 1 instruction Cache (filtered out by default). */
+  HWLOC_OBJ_L2ICACHE,	/**< \brief Level 2 instruction Cache (filtered out by default). */
+  HWLOC_OBJ_L3ICACHE,	/**< \brief Level 3 instruction Cache (filtered out by default). */
+
+  HWLOC_OBJ_GROUP,	/**< \brief Group objects.
+			  * Objects which do not fit in the above but are
+			  * detected by hwloc and are useful to take into
+			  * account for affinity. For instance, some operating systems
+			  * expose their arbitrary processors aggregation this
+			  * way.  And hwloc may insert such objects to group
+			  * NUMA nodes according to their distances.
+			  * See also \ref faq_groups.
+			  *
+			  * These objects are removed when they do not bring
+			  * any structure (see ::HWLOC_TYPE_FILTER_KEEP_STRUCTURE).
+			  */
+
+  HWLOC_OBJ_NUMANODE,	/**< \brief NUMA node.
+			  * An object that contains memory that is directly
+			  * and byte-accessible to the host processors.
+			  * It is usually close to some cores (the corresponding objects
+			  * are descendants of the NUMA node object in the hwloc tree).
+			  *
+			  * There is always at least one such object in the topology
+			  * even if the machine is not NUMA.
+			  *
+			  * Memory objects are not listed in the main children list,
+			  * but rather in the dedicated Memory children list.
+			  *
+			  * NUMA nodes have a special depth ::HWLOC_TYPE_DEPTH_NUMANODE
+			  * instead of a normal depth just like other objects in the
+			  * main tree.
+			  */
+
+  HWLOC_OBJ_BRIDGE,	/**< \brief Bridge (filtered out by default).
+			  * Any bridge that connects the host or an I/O bus,
+			  * to another I/O bus.
+			  * They are not added to the topology unless I/O discovery
+			  * is enabled with hwloc_topology_set_flags().
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+  HWLOC_OBJ_PCI_DEVICE,	/**< \brief PCI device (filtered out by default).
+			  * They are not added to the topology unless I/O discovery
+			  * is enabled with hwloc_topology_set_flags().
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+  HWLOC_OBJ_OS_DEVICE,	/**< \brief Operating system device (filtered out by default).
+			  * They are not added to the topology unless I/O discovery
+			  * is enabled with hwloc_topology_set_flags().
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+
+  HWLOC_OBJ_MISC,	/**< \brief Miscellaneous objects (filtered out by default).
+			  * Objects without particular meaning, that can e.g. be
+			  * added by the application for its own use, or by hwloc
+			  * for miscellaneous objects such as MemoryModule (DIMMs).
+			  * These objects are not listed in the main children list,
+			  * but rather in the dedicated misc children list.
+			  * Misc objects may only have Misc objects as children,
+			  * and those are in the dedicated misc children list as well.
+			  * Misc objects have NULL CPU and node sets.
+			  */
+
+  HWLOC_OBJ_TYPE_MAX    /**< \private Sentinel value */
+} hwloc_obj_type_t;
+
+/** \brief Cache type. */
+typedef enum hwloc_obj_cache_type_e {
+  HWLOC_OBJ_CACHE_UNIFIED,      /**< \brief Unified cache. */
+  HWLOC_OBJ_CACHE_DATA,         /**< \brief Data cache. */
+  HWLOC_OBJ_CACHE_INSTRUCTION   /**< \brief Instruction cache (filtered out by default). */
+} hwloc_obj_cache_type_t;
+
+/** \brief Type of one side (upstream or downstream) of an I/O bridge. */
+typedef enum hwloc_obj_bridge_type_e {
+  HWLOC_OBJ_BRIDGE_HOST,	/**< \brief Host-side of a bridge, only possible upstream. */
+  HWLOC_OBJ_BRIDGE_PCI		/**< \brief PCI-side of a bridge. */
+} hwloc_obj_bridge_type_t;
+
+/** \brief Type of an OS device. */
+typedef enum hwloc_obj_osdev_type_e {
+  HWLOC_OBJ_OSDEV_BLOCK,	/**< \brief Operating system block device.
+				  * For instance "sda" on Linux. */
+  HWLOC_OBJ_OSDEV_GPU,		/**< \brief Operating system GPU device.
+				  * For instance ":0.0" for a GL display,
+				  * "card0" for a Linux DRM device. */
+  HWLOC_OBJ_OSDEV_NETWORK,	/**< \brief Operating system network device.
+				  * For instance the "eth0" interface on Linux. */
+  HWLOC_OBJ_OSDEV_OPENFABRICS,	/**< \brief Operating system openfabrics device.
+				  * For instance the "mlx4_0" InfiniBand HCA,
+				  * or "hfi1_0" Omni-Path interface on Linux. */
+  HWLOC_OBJ_OSDEV_DMA,		/**< \brief Operating system dma engine device.
+				  * For instance the "dma0chan0" DMA channel on Linux. */
+  HWLOC_OBJ_OSDEV_COPROC	/**< \brief Operating system co-processor device.
+				  * For instance "mic0" for a Xeon Phi (MIC) on Linux,
+				  * "opencl0d0" for an OpenCL device,
+				  * "cuda0" for a CUDA device. */
+} hwloc_obj_osdev_type_t;
+
+/** \brief Compare the depth of two object types
+ *
+ * Types shouldn't be compared as they are, since newer ones may be added in
+ * the future.  This function returns less than, equal to, or greater than zero
+ * respectively if \p type1 objects usually include \p type2 objects, are the
+ * same as \p type2 objects, or are included in \p type2 objects. If the types
+ * can not be compared (because neither is usually contained in the other),
+ * ::HWLOC_TYPE_UNORDERED is returned.  Object types containing CPUs can always
+ * be compared (usually, a system contains machines which contain nodes which
+ * contain packages which contain caches, which contain cores, which contain
+ * processors).
+ *
+ * \note ::HWLOC_OBJ_PU will always be the deepest,
+ * while ::HWLOC_OBJ_MACHINE is always the highest.
+ *
+ * \note This does not mean that the actual topology will respect that order:
+ * e.g. as of today cores may also contain caches, and packages may also contain
+ * nodes. This is thus just to be seen as a fallback comparison method.
+ */
+HWLOC_DECLSPEC int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2) __hwloc_attribute_const;
+
+enum hwloc_compare_types_e {
+    HWLOC_TYPE_UNORDERED = INT_MAX	/**< \brief Value returned by hwloc_compare_types() when types can not be compared. \hideinitializer */
+};
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_objects Object Structure and Attributes
+ * @{
+ */
+
+union hwloc_obj_attr_u;
+
+/** \brief Structure of a topology object
+ *
+ * Applications must not modify any field except \p hwloc_obj.userdata.
+ */
+struct hwloc_obj {
+  /* physical information */
+  hwloc_obj_type_t type;		/**< \brief Type of object */
+  char *subtype;			/**< \brief Subtype string to better describe the type field. */
+
+  unsigned os_index;			/**< \brief OS-provided physical index number.
+					 * It is not guaranteed unique across the entire machine,
+					 * except for PUs and NUMA nodes.
+					 * Set to HWLOC_UNKNOWN_INDEX if unknown or irrelevant for this object.
+					 */
+#define HWLOC_UNKNOWN_INDEX (unsigned)-1
+
+  char *name;				/**< \brief Object-specific name if any.
+					 * Mostly used for identifying OS devices and Misc objects where
+					 * a name string is more useful than numerical indexes.
+					 */
+
+  hwloc_uint64_t total_memory; /**< \brief Total memory (in bytes) in NUMA nodes below this object. */
+
+  union hwloc_obj_attr_u *attr;		/**< \brief Object type-specific Attributes,
+					 * may be \c NULL if no attribute value was found */
+
+  /* global position */
+  int depth;				/**< \brief Vertical index in the hierarchy.
+					 *
+					 * For normal objects, this is the depth of the horizontal level
+					 * that contains this object and its cousins of the same type.
+					 * If the topology is symmetric, this is equal to the parent depth
+					 * plus one, and also equal to the number of parent/child links
+					 * from the root object to here.
+					 *
+					 * For special objects (NUMA nodes, I/O and Misc) that are not
+					 * in the main tree, this is a special negative value that
+					 * corresponds to their dedicated level,
+					 * see hwloc_get_type_depth() and ::hwloc_get_type_depth_e.
+					 * Those special values can be passed to hwloc functions such
+					 * hwloc_get_nbobjs_by_depth() as usual.
+					 */
+  unsigned logical_index;		/**< \brief Horizontal index in the whole list of similar objects,
+					 * hence guaranteed unique across the entire machine.
+					 * Could be a "cousin_rank" since it's the rank within the "cousin" list below
+					 * Note that this index may change when restricting the topology
+					 * or when inserting a group.
+					 */
+
+  /* cousins are all objects of the same type (and depth) across the entire topology */
+  struct hwloc_obj *next_cousin;	/**< \brief Next object of same type and depth */
+  struct hwloc_obj *prev_cousin;	/**< \brief Previous object of same type and depth */
+
+  /* children of the same parent are siblings, even if they may have different type and depth */
+  struct hwloc_obj *parent;		/**< \brief Parent, \c NULL if root (Machine object) */
+  unsigned sibling_rank;		/**< \brief Index in parent's \c children[] array. Or the index in parent's Memory, I/O or Misc children list. */
+  struct hwloc_obj *next_sibling;	/**< \brief Next object below the same parent (inside the same list of children). */
+  struct hwloc_obj *prev_sibling;	/**< \brief Previous object below the same parent (inside the same list of children). */
+  /** @name List and array of normal children below this object (except Memory, I/O and Misc children). */
+  /**@{*/
+  unsigned arity;			/**< \brief Number of normal children.
+					 * Memory, Misc and I/O children are not listed here
+					 * but rather in their dedicated children list.
+					 */
+  struct hwloc_obj **children;		/**< \brief Normal children, \c children[0 .. arity -1] */
+  struct hwloc_obj *first_child;	/**< \brief First normal child */
+  struct hwloc_obj *last_child;		/**< \brief Last normal child */
+  /**@}*/
+
+  int symmetric_subtree;		/**< \brief Set if the subtree of normal objects below this object is symmetric,
+					  * which means all normal children and their children have identical subtrees.
+					  *
+					  * Memory, I/O and Misc children are ignored.
+					  *
+					  * If set in the topology root object, lstopo may export the topology
+					  * as a synthetic string.
+					  */
+
+  /** @name List of Memory children below this object. */
+  /**@{*/
+  unsigned memory_arity;		/**< \brief Number of Memory children.
+					 * These children are listed in \p memory_first_child.
+					 */
+  struct hwloc_obj *memory_first_child;	/**< \brief First Memory child.
+					 * NUMA nodes are listed here (\p memory_arity and \p memory_first_child)
+					 * instead of in the normal children list.
+					 * See also hwloc_obj_type_is_memory().
+					 */
+  /**@}*/
+
+  /** @name List of I/O children below this object. */
+  /**@{*/
+  unsigned io_arity;			/**< \brief Number of I/O children.
+					 * These children are listed in \p io_first_child.
+					 */
+  struct hwloc_obj *io_first_child;	/**< \brief First I/O child.
+					 * Bridges, PCI and OS devices are listed here (\p io_arity and \p io_first_child)
+					 * instead of in the normal children list.
+					 * See also hwloc_obj_type_is_io().
+					 */
+  /**@}*/
+
+  /** @name List of Misc children below this object. */
+  /**@{*/
+  unsigned misc_arity;			/**< \brief Number of Misc children.
+					 * These children are listed in \p misc_first_child.
+					 */
+  struct hwloc_obj *misc_first_child;	/**< \brief First Misc child.
+					 * Misc objects are listed here (\p misc_arity and \p misc_first_child)
+					 * instead of in the normal children list.
+					 */
+  /**@}*/
+
+  /* cpusets and nodesets */
+  hwloc_cpuset_t cpuset;		/**< \brief CPUs covered by this object
+                                          *
+                                          * This is the set of CPUs for which there are PU objects in the topology
+                                          * under this object, i.e. which are known to be physically contained in this
+                                          * object and known how (the children path between this object and the PU
+                                          * objects).
+                                          *
+                                          * If the ::HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+                                          * some of these CPUs may not be allowed for binding,
+                                          * see hwloc_topology_get_allowed_cpuset().
+                                          *
+					  * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
+					  *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
+                                          */
+  hwloc_cpuset_t complete_cpuset;       /**< \brief The complete CPU set of logical processors of this object,
+                                          *
+                                          * This may include not only the same as the cpuset field, but also some CPUs for
+                                          * which topology information is unknown or incomplete, some offline CPUs, and
+                                          * the CPUs that are ignored when the ::HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM flag
+                                          * is not set.
+                                          * Thus no corresponding PU object may be found in the topology, because the
+                                          * precise position is undefined. It is however known that it would be somewhere
+                                          * under this object.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
+                                          */
+
+  hwloc_nodeset_t nodeset;              /**< \brief NUMA nodes covered by this object or containing this object
+                                          *
+                                          * This is the set of NUMA nodes for which there are NUMA node objects in the
+                                          * topology under or above this object, i.e. which are known to be physically
+                                          * contained in this object or containing it and known how (the children path
+                                          * between this object and the NUMA node objects).
+                                          *
+                                          * In the end, these nodes are those that are close to the current object.
+                                          *
+                                          * If the ::HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+                                          * some of these nodes may not be allowed for allocation,
+                                          * see hwloc_topology_get_allowed_nodeset().
+                                          *
+                                          * If there are no NUMA nodes in the machine, all the memory is close to this
+                                          * object, so only the first bit may be set in \p nodeset.
+                                          *
+					  * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
+					  *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
+                                          */
+  hwloc_nodeset_t complete_nodeset;     /**< \brief The complete NUMA node set of this object,
+                                          *
+                                          * This may include not only the same as the nodeset field, but also some NUMA
+                                          * nodes for which topology information is unknown or incomplete, some offline
+                                          * nodes, and the nodes that are ignored when the ::HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM
+                                          * flag is not set.
+                                          * Thus no corresponding NUMA node object may be found in the topology, because the
+                                          * precise position is undefined. It is however known that it would be
+                                          * somewhere under this object.
+                                          *
+                                          * If there are no NUMA nodes in the machine, all the memory is close to this
+                                          * object, so only the first bit is set in \p complete_nodeset.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
+                                          */
+
+  struct hwloc_info_s *infos;		/**< \brief Array of stringified info type=name. */
+  unsigned infos_count;			/**< \brief Size of infos array. */
+
+  /* misc */
+  void *userdata;			/**< \brief Application-given private data pointer,
+					 * initialized to \c NULL, use it as you wish.
+					 * See hwloc_topology_set_userdata_export_callback() in hwloc/export.h
+					 * if you wish to export this field to XML. */
+
+  hwloc_uint64_t gp_index;			/**< \brief Global persistent index.
+					 * Generated by hwloc, unique across the topology (contrary to os_index)
+					 * and persistent across topology changes (contrary to logical_index).
+					 * Mostly used internally, but could also be used by application to identify objects.
+					 */
+};
+/**
+ * \brief Convenience typedef; a pointer to a struct hwloc_obj.
+ */
+typedef struct hwloc_obj * hwloc_obj_t;
+
+/** \brief Object type-specific Attributes */
+union hwloc_obj_attr_u {
+  /** \brief NUMA node-specific Object Attributes */
+  struct hwloc_numanode_attr_s {
+    hwloc_uint64_t local_memory; /**< \brief Local memory (in bytes) */
+    unsigned page_types_len; /**< \brief Size of array \p page_types */
+    /** \brief Array of local memory page types, \c NULL if no local memory and \p page_types is 0.
+     *
+     * The array is sorted by increasing \p size fields.
+     * It contains \p page_types_len slots.
+     */
+    struct hwloc_memory_page_type_s {
+      hwloc_uint64_t size;	/**< \brief Size of pages */
+      hwloc_uint64_t count;	/**< \brief Number of pages of this size */
+    } * page_types;
+  } numanode;
+
+  /** \brief Cache-specific Object Attributes */
+  struct hwloc_cache_attr_s {
+    hwloc_uint64_t size;		  /**< \brief Size of cache in bytes */
+    unsigned depth;			  /**< \brief Depth of cache (e.g., L1, L2, ...etc.) */
+    unsigned linesize;			  /**< \brief Cache-line size in bytes. 0 if unknown */
+    int associativity;			  /**< \brief Ways of associativity,
+    					    *  -1 if fully associative, 0 if unknown */
+    hwloc_obj_cache_type_t type;          /**< \brief Cache type */
+  } cache;
+  /** \brief Group-specific Object Attributes */
+  struct hwloc_group_attr_s {
+    unsigned depth;			  /**< \brief Depth of group object.
+					   *   It may change if intermediate Group objects are added. */
+    unsigned kind;			  /**< \brief Internally-used kind of group. */
+    unsigned subkind;			  /**< \brief Internally-used subkind to distinguish different levels of groups with same kind */
+    unsigned char dont_merge;		  /**< \brief Flag preventing groups from being automatically merged with identical parent or children. */
+  } group;
+  /** \brief PCI Device specific Object Attributes */
+  struct hwloc_pcidev_attr_s {
+    unsigned short domain;
+    unsigned char bus, dev, func;
+    unsigned short class_id;
+    unsigned short vendor_id, device_id, subvendor_id, subdevice_id;
+    unsigned char revision;
+    float linkspeed; /* in GB/s */
+  } pcidev;
+  /** \brief Bridge specific Object Attributes */
+  struct hwloc_bridge_attr_s {
+    union {
+      struct hwloc_pcidev_attr_s pci;
+    } upstream;
+    hwloc_obj_bridge_type_t upstream_type;
+    union {
+      struct {
+	unsigned short domain;
+	unsigned char secondary_bus, subordinate_bus;
+      } pci;
+    } downstream;
+    hwloc_obj_bridge_type_t downstream_type;
+    unsigned depth;
+  } bridge;
+  /** \brief OS Device specific Object Attributes */
+  struct hwloc_osdev_attr_s {
+    hwloc_obj_osdev_type_t type;
+  } osdev;
+};
+
+/** \brief Object info
+ *
+ * \sa hwlocality_info_attr
+ */
+struct hwloc_info_s {
+  char *name;	/**< \brief Info name */
+  char *value;	/**< \brief Info value */
+};
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_creation Topology Creation and Destruction
+ * @{
+ */
+
+struct hwloc_topology;
+/** \brief Topology context
+ *
+ * To be initialized with hwloc_topology_init() and built with hwloc_topology_load().
+ */
+typedef struct hwloc_topology * hwloc_topology_t;
+
+/** \brief Allocate a topology context.
+ *
+ * \param[out] topologyp is assigned a pointer to the new allocated context.
+ *
+ * \return 0 on success, -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_topology_init (hwloc_topology_t *topologyp);
+
+/** \brief Build the actual topology
+ *
+ * Build the actual topology once initialized with hwloc_topology_init() and
+ * tuned with \ref hwlocality_configuration and \ref hwlocality_setsource routines.
+ * No other routine may be called earlier using this topology context.
+ *
+ * \param topology is the topology to be loaded with objects.
+ *
+ * \return 0 on success, -1 on error.
+ *
+ * \note On failure, the topology is reinitialized. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
+ *
+ * \note This function may be called only once per topology.
+ *
+ * \note The binding of the current thread or process may temporarily change
+ * during this call but it will be restored before it returns.
+ *
+ * \sa hwlocality_configuration and hwlocality_setsource
+ */
+HWLOC_DECLSPEC int hwloc_topology_load(hwloc_topology_t topology);
+
+/** \brief Terminate and free a topology context
+ *
+ * \param topology is the topology to be freed
+ */
+HWLOC_DECLSPEC void hwloc_topology_destroy (hwloc_topology_t topology);
+
+/** \brief Duplicate a topology.
+ *
+ * The entire topology structure as well as its objects
+ * are duplicated into a new one.
+ *
+ * This is useful for keeping a backup while modifying a topology.
+ *
+ * \note Object userdata is not duplicated since hwloc does not know what it points to.
+ * The objects of both old and new topologies will point to the same userdata.
+ */
+HWLOC_DECLSPEC int hwloc_topology_dup(hwloc_topology_t *newtopology, hwloc_topology_t oldtopology);
+
+/** \brief Verify that the topology is compatible with the current hwloc library.
+ *
+ * This is useful when using the same topology structure (in memory)
+ * in different libraries that may use different hwloc installations
+ * (for instance if one library embeds a specific version of hwloc,
+ * while another library uses a default system-wide hwloc installation).
+ *
+ * If all libraries/programs use the same hwloc installation, this function
+ * always returns success.
+ *
+ * \return \c 0 on success.
+ *
+ * \return \c -1 with \p errno set to \c EINVAL if incompatible.
+ *
+ * \note If sharing between processes with hwloc_shmem_topology_write(),
+ * the relevant check is already performed inside hwloc_shmem_topology_adopt().
+ */
+HWLOC_DECLSPEC int hwloc_topology_abi_check(hwloc_topology_t topology);
+
+/** \brief Run internal checks on a topology structure
+ *
+ * The program aborts if an inconsistency is detected in the given topology.
+ *
+ * \param topology is the topology to be checked
+ *
+ * \note This routine is only useful to developers.
+ *
+ * \note The input topology should have been previously loaded with
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC void hwloc_topology_check(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_levels Object levels, depths and types
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Get the depth of the hierarchical tree of objects.
+ *
+ * This is the depth of ::HWLOC_OBJ_PU objects plus one.
+ *
+ * \note NUMA nodes, I/O and Misc objects are ignored when computing
+ * the depth of the tree (they are placed on special levels).
+ */
+HWLOC_DECLSPEC int hwloc_topology_get_depth(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type.
+ *
+ * If no object of this type is present on the underlying architecture, or if
+ * the OS doesn't provide this kind of information, the function returns
+ * ::HWLOC_TYPE_DEPTH_UNKNOWN.
+ *
+ * If type is absent but a similar type is acceptable, see also
+ * hwloc_get_type_or_below_depth() and hwloc_get_type_or_above_depth().
+ *
+ * If ::HWLOC_OBJ_GROUP is given, the function may return ::HWLOC_TYPE_DEPTH_MULTIPLE
+ * if multiple levels of Groups exist.
+ *
+ * If a NUMA node, I/O or Misc object type is given, the function returns a virtual
+ * value because these objects are stored in special levels that are not CPU-related.
+ * This virtual depth may be passed to other hwloc functions such as
+ * hwloc_get_obj_by_depth() but it should not be considered as an actual
+ * depth by the application. In particular, it should not be compared with
+ * any other object depth or with the entire topology depth.
+ * \sa hwloc_get_memory_parents_depth().
+ *
+ * \sa hwloc_type_sscanf_as_depth() for returning the depth of objects
+ * whose type is given as a string.
+ */
+HWLOC_DECLSPEC int hwloc_get_type_depth (hwloc_topology_t topology, hwloc_obj_type_t type);
+
+/** \brief Special depth values that may be returned by hwloc_get_type_depth(). */
+enum hwloc_get_type_depth_e {
+    HWLOC_TYPE_DEPTH_UNKNOWN = -1,    /**< \brief No object of given type exists in the topology. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MULTIPLE = -2,   /**< \brief Objects of given type exist at different depth in the topology (only for Groups). \hideinitializer */
+    HWLOC_TYPE_DEPTH_NUMANODE = -3,   /**< \brief Virtual depth for NUMA nodes. \hideinitializer */
+    HWLOC_TYPE_DEPTH_BRIDGE = -4,     /**< \brief Virtual depth for bridge object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_PCI_DEVICE = -5, /**< \brief Virtual depth for PCI device object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_OS_DEVICE = -6,  /**< \brief Virtual depth for software device object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MISC = -7        /**< \brief Virtual depth for Misc object. \hideinitializer */
+};
+
+/** \brief Return the depth of parents where memory objects are attached.
+ *
+ * Memory objects have virtual negative depths because they are not part of
+ * the main CPU-side hierarchy of objects. This depth should not be compared
+ * with other level depths.
+ *
+ * If all Memory objects are attached to Normal parents at the same depth,
+ * this parent depth may be compared to other as usual, for instance
+ * for knowing whether NUMA nodes are attached above or below Packages.
+ *
+ * \return The depth of Normal parents of all memory children
+ * if all these parents have the same depth. For instance the depth of
+ * the Package level if all NUMA nodes are attached to Package objects.
+ *
+ * \return ::HWLOC_TYPE_DEPTH_MULTIPLE if Normal parents of all
+ * memory children do not have the same depth. For instance if some
+ * NUMA nodes are attached to Packages while others are attached to
+ * Groups.
+ */
+HWLOC_DECLSPEC int hwloc_get_memory_parents_depth (hwloc_topology_t topology);
+
+/** \brief Returns the depth of objects of type \p type or below
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically found
+ * inside \p type.
+ *
+ * This function is only meaningful for normal object types.
+ * If a memory, I/O or Misc object type is given, the corresponding virtual
+ * depth is always returned (see hwloc_get_type_depth()).
+ *
+ * May return ::HWLOC_TYPE_DEPTH_MULTIPLE for ::HWLOC_OBJ_GROUP just like
+ * hwloc_get_type_depth().
+ */
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type or above
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically
+ * containing \p type.
+ *
+ * This function is only meaningful for normal object types.
+ * If a memory, I/O or Misc object type is given, the corresponding virtual
+ * depth is always returned (see hwloc_get_type_depth()).
+ *
+ * May return ::HWLOC_TYPE_DEPTH_MULTIPLE for ::HWLOC_OBJ_GROUP just like
+ * hwloc_get_type_depth().
+ */
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the type of objects at depth \p depth.
+ *
+ * \p depth should be between 0 and hwloc_topology_get_depth()-1.
+ *
+ * \return (hwloc_obj_type_t)-1 if depth \p depth does not exist.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_get_depth_type (hwloc_topology_t topology, int depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level at depth \p depth.
+ */
+HWLOC_DECLSPEC unsigned hwloc_get_nbobjs_by_depth (hwloc_topology_t topology, int depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level type \p type
+ *
+ * If no object for that type exists, 0 is returned.
+ * If there are several levels with objects of that type, -1 is returned.
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the top-object of the topology-tree.
+ *
+ * Its type is ::HWLOC_OBJ_MACHINE.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_root_obj (hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx from depth \p depth */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_get_obj_by_depth (hwloc_topology_t topology, int depth, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx with type \p type
+ *
+ * If no object for that type exists, \c NULL is returned.
+ * If there are several levels with objects of that type (::HWLOC_OBJ_GROUP),
+ * \c NULL is returned and the caller may fallback to hwloc_get_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the next object at depth \p depth.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, int depth, hwloc_obj_t prev);
+
+/** \brief Returns the next object of type \p type.
+ *
+ * If \p prev is \c NULL, return the first object of type \p type.  If
+ * there are multiple or no depth for given type, return \c NULL and
+ * let the caller fallback to hwloc_get_next_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
+			    hwloc_obj_t prev);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_strings Converting between Object Types and Attributes, and Strings
+ * @{
+ */
+
+/** \brief Return a constant stringified object type.
+ *
+ * This function is the basic way to convert a generic type into a string.
+ * The output string may be parsed back by hwloc_type_sscanf().
+ *
+ * hwloc_obj_type_snprintf() may return a more precise output for a specific
+ * object, but it requires the caller to provide the output buffer.
+ */
+HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwloc_attribute_const;
+
+/** \brief Stringify the type of a given topology object into a human-readable form.
+ *
+ * Contrary to hwloc_obj_type_string(), this function includes object-specific
+ * attributes (such as the Group depth, the Bridge type, or OS device type)
+ * in the output, and it requires the caller to provide the output buffer.
+ *
+ * The output is guaranteed to be the same for all objects of a same topology level.
+ *
+ * If \p verbose is 1, longer type names are used, e.g. L1Cache instead of L1.
+ *
+ * The output string may be parsed back by hwloc_type_sscanf().
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size,
+					   hwloc_obj_t obj,
+					   int verbose);
+
+/** \brief Stringify the attributes of a given topology object into a human-readable form.
+ *
+ * Attribute values are separated by \p separator.
+ *
+ * Only the major attributes are printed in non-verbose mode.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size,
+					   hwloc_obj_t obj, const char * __hwloc_restrict separator,
+					   int verbose);
+
+/** \brief Return an object type and attributes from a type string.
+ *
+ * Convert strings such as "Package" or "L1iCache" into the corresponding types.
+ * Matching is case-insensitive, and only the first letters are actually
+ * required to match.
+ *
+ * The matched object type is set in \p typep (which cannot be \c NULL).
+ *
+ * Type-specific attributes, for instance Cache type, Cache depth, Group depth,
+ * Bridge type or OS Device type may be returned in \p attrp.
+ * Attributes that are not specified in the string (for instance "Group"
+ * without a depth, or "L2Cache" without a cache type) are set to -1.
+ *
+ * \p attrp is only filled if not \c NULL and if its size specified in \p attrsize
+ * is large enough. It should be at least as large as union hwloc_obj_attr_u.
+ *
+ * \return 0 if a type was correctly identified, otherwise -1.
+ *
+ * \note This function is guaranteed to match any string returned by
+ * hwloc_obj_type_string() or hwloc_obj_type_snprintf().
+ *
+ * \note This is an extended version of the now deprecated hwloc_obj_type_sscanf().
+ */
+HWLOC_DECLSPEC int hwloc_type_sscanf(const char *string,
+				     hwloc_obj_type_t *typep,
+				     union hwloc_obj_attr_u *attrp, size_t attrsize);
+
+/** \brief Return an object type and its level depth from a type string.
+ *
+ * Convert strings such as "Package" or "L1iCache" into the corresponding types
+ * and return in \p depthp the depth of the corresponding level in the
+ * topology \p topology.
+ *
+ * If no object of this type is present on the underlying architecture,
+ * ::HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ *
+ * If multiple such levels exist (for instance if giving Group without any depth),
+ * the function may return ::HWLOC_TYPE_DEPTH_MULTIPLE instead.
+ *
+ * The matched object type is set in \p typep if \p typep is non \c NULL.
+ *
+ * \note This function is similar to hwloc_type_sscanf() followed
+ * by hwloc_get_type_depth() but it also automatically disambiguates
+ * multiple group levels etc.
+ *
+ * \note This function is guaranteed to match any string returned by
+ * hwloc_obj_type_string() or hwloc_obj_type_snprintf().
+ */
+HWLOC_DECLSPEC int hwloc_type_sscanf_as_depth(const char *string,
+					      hwloc_obj_type_t *typep,
+					      hwloc_topology_t topology, int *depthp);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_info_attr Consulting and Adding Key-Value Info Attributes
+ *
+ * @{
+ */
+
+/** \brief Search the given key name in object infos and return the corresponding value.
+ *
+ * If multiple keys match the given name, only the first one is returned.
+ *
+ * \return \c NULL if no such key exists.
+ */
+static __hwloc_inline const char *
+hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) __hwloc_attribute_pure;
+
+/** \brief Add the given info name and value pair to the given object.
+ *
+ * The info is appended to the existing info array even if another key
+ * with the same name already exists.
+ *
+ * The input strings are copied before being added in the object infos.
+ *
+ * \return \c 0 on success, \c -1 on error.
+ *
+ * \note This function may be used to enforce object colors in the lstopo
+ * graphical output by using "lstopoStyle" as a name and "Background=#rrggbb"
+ * as a value. See CUSTOM COLORS in the lstopo(1) manpage for details.
+ *
+ * \note If \p value contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_cpubinding CPU binding
+ *
+ * Some operating systems only support binding threads or processes to a single PU.
+ * Others allow binding to larger sets such as entire Cores or Packages or
+ * even random sets of individual PUs. In such operating systems, the scheduler
+ * is free to run the task on one of these PU, then migrate it to another PU, etc.
+ * It is often useful to call hwloc_bitmap_singlify() on the target CPU set before
+ * passing it to the binding function to avoid these expensive migrations.
+ * See the documentation of hwloc_bitmap_singlify() for details.
+ *
+ * Some operating systems do not provide all hwloc-supported
+ * mechanisms to bind processes, threads, etc.
+ * hwloc_topology_get_support() may be used to query about the actual CPU
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_CPUBIND_STRICT flag was passed, the function returns -1.
+ * \p errno is set to \c ENOSYS when it is not possible to bind the requested kind of object
+ * processes/threads. errno is set to \c EXDEV when the requested cpuset
+ * can not be enforced (e.g. some systems only allow one CPU, and some
+ * other systems only allow one NUMA node).
+ *
+ * If ::HWLOC_CPUBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable version that should be preferred over the others,
+ * whenever possible, is the following one which just binds the current program,
+ * assuming it is single-threaded:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, 0),
+ * \endcode
+ *
+ * If the program may be multithreaded, the following one should be preferred
+ * to only bind the current thread:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD),
+ * \endcode
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note To unbind, just call the binding function with either a full cpuset or
+ * a cpuset equal to the system cpuset.
+ *
+ * \note On some operating systems, CPU binding may have effects on memory binding, see
+ * ::HWLOC_CPUBIND_NOMEMBIND
+ *
+ * \note Running lstopo \--top or hwloc-ps can be a very convenient tool to check
+ * how binding actually happened.
+ * @{
+ */
+
+/** \brief Process/Thread binding flags.
+ *
+ * These bit flags can be used to refine the binding policy.
+ *
+ * The default (0) is to bind the current process, assumed to be
+ * single-threaded, in a non-strict way.  This is the most portable
+ * way to bind as all operating systems usually provide it.
+ *
+ * \note Not all systems support all kinds of binding.  See the
+ * "Detailed Description" section of \ref hwlocality_cpubinding for a
+ * description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Bind all threads of the current (possibly) multithreaded process.
+   * \hideinitializer */
+  HWLOC_CPUBIND_PROCESS = (1<<0),
+
+  /** \brief Bind current thread of current process.
+   * \hideinitializer */
+  HWLOC_CPUBIND_THREAD = (1<<1),
+
+  /** \brief Request for strict binding from the OS.
+   *
+   * By default, when the designated CPUs are all busy while other
+   * CPUs are idle, operating systems may execute the thread/process
+   * on those other CPUs instead of the designated CPUs, to let them
+   * progress anyway.  Strict binding means that the thread/process
+   * will _never_ execute on other cpus than the designated CPUs, even
+   * when those are busy with other tasks and other CPUs are idle.
+   *
+   * \note Depending on the operating system, strict binding may not
+   * be possible (e.g., the OS does not implement it) or not allowed
+   * (e.g., for administrative reasons), and the function will fail
+   * in that case.
+   *
+   * When retrieving the binding of a process, this flag checks
+   * whether all its threads actually have the same binding. If the
+   * flag is not given, the binding of each thread will be
+   * accumulated.
+   *
+   * \note This flag is meaningless when retrieving the binding of a
+   * thread.
+   * \hideinitializer
+   */
+  HWLOC_CPUBIND_STRICT = (1<<2),
+
+  /** \brief Avoid any effect on memory binding
+   *
+   * On some operating systems, some CPU binding function would also
+   * bind the memory on the corresponding NUMA node.  It is often not
+   * a problem for the application, but if it is, setting this flag
+   * will make hwloc avoid using OS functions that would also bind
+   * memory.  This will however reduce the support of CPU bindings,
+   * i.e. potentially return -1 with errno set to ENOSYS in some
+   * cases.
+   *
+   * This flag is only meaningful when used with functions that set
+   * the CPU binding.  It is ignored when used with functions that get
+   * CPU binding information.
+   * \hideinitializer
+   */
+  HWLOC_CPUBIND_NOMEMBIND = (1<<3)
+} hwloc_cpubind_flags_t;
+
+/** \brief Bind current process or thread on cpus given in physical bitmap \p set.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get current process or thread binding.
+ *
+ * Writes into \p set the physical cpuset which the process or thread (according to \e
+ * flags) was last bound to.
+ */
+HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Bind a process \p pid on cpus given in physical bitmap \p set.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and ::HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding is applied to that specific thread.
+ *
+ * \note On non-Linux systems, ::HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get the current physical binding of process \p pid.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding for that specific thread is returned.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+#ifdef hwloc_thread_t
+/** \brief Bind a thread \p thread on cpus given in physical bitmap \p set.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note ::HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_cpuset_t set, int flags);
+#endif
+
+#ifdef hwloc_thread_t
+/** \brief Get the current physical binding of thread \p tid.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note ::HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_cpuset_t set, int flags);
+#endif
+
+/** \brief Get the last physical CPU where the current process or thread ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \p flags can include either ::HWLOC_CPUBIND_PROCESS or ::HWLOC_CPUBIND_THREAD to
+ * specify whether the query should be for the whole process (union of all CPUs
+ * on which all threads are running), or only the current thread. If the
+ * process is single-threaded, flags can be set to zero to let hwloc use
+ * whichever method is available on the underlying OS.
+ */
+HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Get the last physical CPU where a process ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and ::HWLOC_CPUBIND_THREAD is passed in flags,
+ * the last CPU location of that specific thread is returned.
+ *
+ * \note On non-Linux systems, ::HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_membinding Memory binding
+ *
+ * Memory binding can be done three ways:
+ *
+ * - explicit memory allocation thanks to hwloc_alloc_membind() and friends:
+ *   the binding will have effect on the memory allocated by these functions.
+ * - implicit memory binding through binding policy: hwloc_set_membind() and
+ *   friends only define the current policy of the process, which will be
+ *   applied to the subsequent calls to malloc() and friends.
+ * - migration of existing memory ranges, thanks to hwloc_set_area_membind()
+ *   and friends, which move already-allocated data.
+ *
+ * Not all operating systems support all three ways.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_MEMBIND_STRICT flag was passed, the function returns -1.
+ * \p errno will be set to \c ENOSYS when the system does not support
+ * the specified action or policy
+ * (e.g., some systems only allow binding memory on a per-thread
+ * basis, whereas other systems only allow binding memory for all
+ * threads in a process).
+ * \p errno will be set to EXDEV when the requested set can not be enforced
+ * (e.g., some systems only allow binding memory to a single NUMA node).
+ *
+ * If ::HWLOC_MEMBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable form that should be preferred over the others
+ * whenever possible is as follows.
+ * It allocates some memory hopefully bound to the specified set.
+ * To do so, hwloc will possibly have to change the current memory
+ * binding policy in order to actually get the memory bound, if the OS
+ * does not provide any other way to simply allocate bound memory
+ * without changing the policy for all allocations. That is the
+ * difference with hwloc_alloc_membind(), which will never change the
+ * current memory binding policy.
+ *
+ * \code
+ * hwloc_alloc_membind_policy(topology, size, set,
+ *                            HWLOC_MEMBIND_BIND, 0);
+ * \endcode
+ *
+ * Each hwloc memory binding function takes a bitmap argument that
+ * is a CPU set by default, or a NUMA memory node set if the flag
+ * ::HWLOC_MEMBIND_BYNODESET is specified.
+ * See \ref hwlocality_object_sets and \ref hwlocality_bitmap for a
+ * discussion of CPU sets and NUMA memory node sets.
+ * It is also possible to convert between CPU set and node set using
+ * hwloc_cpuset_to_nodeset() or hwloc_cpuset_from_nodeset().
+ *
+ * Memory binding by CPU set cannot work for CPU-less NUMA memory nodes.
+ * Binding by nodeset should therefore be preferred whenever possible.
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note On some operating systems, memory binding affects the CPU
+ * binding; see ::HWLOC_MEMBIND_NOCPUBIND
+ * @{
+ */
+
+/** \brief Memory binding policy.
+ *
+ * These constants can be used to choose the binding policy.  Only one policy can
+ * be used at a time (i.e., the values cannot be OR'ed together).
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding policy support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ *
+ * \sa hwloc_membind_flags_t for flags that may be combined with these policies.
+ */
+typedef enum {
+  /** \brief Reset the memory allocation policy to the system default.
+   * Depending on the operating system, this may correspond to
+   * ::HWLOC_MEMBIND_FIRSTTOUCH (Linux),
+   * or ::HWLOC_MEMBIND_BIND (AIX, HP-UX, Solaris, Windows).
+   * This policy is never returned by get membind functions.
+   * The nodeset argument is ignored.
+   * \hideinitializer */
+  HWLOC_MEMBIND_DEFAULT =	0,
+
+  /** \brief Allocate each memory page individually on the local NUMA
+   * node of the thread that touches it.
+   *
+   * The given nodeset should usually be hwloc_topology_get_topology_nodeset()
+   * so that the touching thread may run and allocate on any node in the system.
+   *
+   * On AIX, if the nodeset is smaller, pages are allocated locally (if the local
+   * node is in the nodeset) or from a random non-local node (otherwise).
+   * \hideinitializer */
+  HWLOC_MEMBIND_FIRSTTOUCH =	1,
+
+  /** \brief Allocate memory on the specified nodes.
+   * \hideinitializer */
+  HWLOC_MEMBIND_BIND =		2,
+
+  /** \brief Allocate memory on the given nodes in an interleaved
+   * / round-robin manner.  The precise layout of the memory across
+   * multiple NUMA nodes is OS/system specific. Interleaving can be
+   * useful when threads distributed across the specified NUMA nodes
+   * will all be accessing the whole memory range concurrently, since
+   * the interleave will then balance the memory references.
+   * \hideinitializer */
+  HWLOC_MEMBIND_INTERLEAVE =	3,
+
+  /** \brief For each page bound with this policy, by next time
+   * it is touched (and next time only), it is moved from its current
+   * location to the local NUMA node of the thread where the memory
+   * reference occurred (if it needs to be moved at all).
+   * \hideinitializer */
+  HWLOC_MEMBIND_NEXTTOUCH =	4,
+
+  /** \brief Returned by get_membind() functions when multiple
+   * threads or parts of a memory area have differing memory binding
+   * policies.
+   * Also returned when binding is unknown because binding hooks are empty
+   * when the topology is loaded from XML without HWLOC_THISSYSTEM=1, etc.
+   * \hideinitializer */
+  HWLOC_MEMBIND_MIXED = -1
+} hwloc_membind_policy_t;
+
+/** \brief Memory binding flags.
+ *
+ * These flags can be used to refine the binding policy.
+ * All flags can be logically OR'ed together with the exception of
+ * ::HWLOC_MEMBIND_PROCESS and ::HWLOC_MEMBIND_THREAD;
+ * these two flags are mutually exclusive.
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Set policy for all threads of the specified (possibly
+   * multithreaded) process.  This flag is mutually exclusive with
+   * ::HWLOC_MEMBIND_THREAD.
+   * \hideinitializer */
+  HWLOC_MEMBIND_PROCESS =       (1<<0),
+
+  /** \brief Set policy for a specific thread of the current process.
+   * This flag is mutually exclusive with ::HWLOC_MEMBIND_PROCESS.
+   * \hideinitializer */
+  HWLOC_MEMBIND_THREAD =        (1<<1),
+
+  /** \brief Request strict binding from the OS.  The function will fail if
+   * the binding can not be guaranteed / completely enforced.
+   *
+   * This flag has slightly different meanings depending on which
+   * function it is used with.
+   * \hideinitializer */
+  HWLOC_MEMBIND_STRICT =        (1<<2),
+
+  /** \brief Migrate existing allocated memory.  If the memory cannot
+   * be migrated and the ::HWLOC_MEMBIND_STRICT flag is passed, an error
+   * will be returned.
+   * \hideinitializer */
+  HWLOC_MEMBIND_MIGRATE =       (1<<3),
+
+  /** \brief Avoid any effect on CPU binding.
+   *
+   * On some operating systems, some underlying memory binding
+   * functions also bind the application to the corresponding CPU(s).
+   * Using this flag will cause hwloc to avoid using OS functions that
+   * could potentially affect CPU bindings.  Note, however, that using
+   * NOCPUBIND may reduce hwloc's overall memory binding
+   * support. Specifically: some of hwloc's memory binding functions
+   * may fail with errno set to ENOSYS when used with NOCPUBIND.
+   * \hideinitializer
+   */
+  HWLOC_MEMBIND_NOCPUBIND =     (1<<4),
+
+  /** \brief Consider the bitmap argument as a nodeset.
+   *
+   * The bitmap argument is considered a nodeset if this flag is given,
+   * or a cpuset otherwise by default.
+   *
+   * Memory binding by CPU set cannot work for CPU-less NUMA memory nodes.
+   * Binding by nodeset should therefore be preferred whenever possible.
+   * \hideinitializer
+   */
+  HWLOC_MEMBIND_BYNODESET =     (1<<5)
+} hwloc_membind_flags_t;
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) specified by \p set
+ *
+ * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
+ * specified, the current process is assumed to be single-threaded.
+ * This is the most portable form as it permits hwloc to use either
+ * process-based OS functions or thread-based OS functions, depending
+ * on which are available.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread.
+ *
+ * This function has two output parameters: \p set and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the current process.  Passing ::HWLOC_MEMBIND_THREAD specifies that
+ * the query target is the current policy and nodeset for only the
+ * thread invoking this function.
+ *
+ * If neither of these flags are passed (which is the most portable
+ * method), the process is assumed to be single threaded.  This allows
+ * hwloc to use either process-based OS functions or thread-based OS
+ * functions, depending on which are available.
+ *
+ * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS
+ * is also specified.  In this case, hwloc will check the default
+ * memory policies and nodesets for all threads in the process.  If
+ * they are not identical, -1 is returned and errno is set to EXDEV.
+ * If they are identical, the values are returned in \p set and \p
+ * policy.
+ *
+ * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
+ * ::HWLOC_MEMBIND_STRICT is \em not specified), the default set
+ * from each thread is logically OR'ed together.
+ * If all threads' default policies are the same, \p policy is set to
+ * that policy.  If they are different, \p policy is set to
+ * ::HWLOC_MEMBIND_MIXED.
+ *
+ * In the ::HWLOC_MEMBIND_THREAD case (or when neither
+ * ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is specified), there
+ * is only one set and policy; they are returned in \p set and
+ * \p policy, respectively.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_membind(hwloc_topology_t topology, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) specified by \p set
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process.
+ *
+ * This function has two output parameters: \p set and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the specified process.  If ::HWLOC_MEMBIND_PROCESS is not specified
+ * (which is the most portable method), the process is assumed to be
+ * single threaded.  This allows hwloc to use either process-based OS
+ * functions or thread-based OS functions, depending on which are
+ * available.
+ *
+ * Note that it does not make sense to pass ::HWLOC_MEMBIND_THREAD to
+ * this function.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default
+ * memory policies and nodesets for all threads in the specified
+ * process.  If they are not identical, -1 is returned and errno is
+ * set to EXDEV.  If they are identical, the values are returned in \p
+ * set and \p policy.
+ *
+ * Otherwise, \p set is set to the logical OR of all threads'
+ * default set.  If all threads' default policies
+ * are the same, \p policy is set to that policy.  If they are
+ * different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Bind the already-allocated memory identified by (addr, len)
+ * to the NUMA node(s) specified by \p set.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \return 0 if \p len is 0.
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the CPUs near the physical NUMA node(s) and binding policy of
+ * the memory identified by (\p addr, \p len ).
+ *
+ * This function has two output parameters: \p set and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the memory binding policies and nodesets of the pages
+ * in the address range.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
+ * checked to see if they all have the same memory binding policy and
+ * nodeset.  If they do not, -1 is returned and errno is set to EXDEV.
+ * If they are identical across all pages, the set and policy are
+ * returned in \p set and \p policy, respectively.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is not specified, the union of all NUMA
+ * node(s) containing pages in the address range is calculated.
+ * If all pages in the target have the same policy, it is returned in
+ * \p policy.  Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * If \p len is 0, -1 is returned and errno is set to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Get the NUMA nodes where memory identified by (\p addr, \p len ) is physically allocated.
+ *
+ * Fills \p set according to the NUMA nodes where the memory area pages
+ * are physically allocated. If no page is actually allocated yet,
+ * \p set may be empty.
+ *
+ * If pages spread to multiple nodes, it is not specified whether they spread
+ * equitably, or whether most of them are on a single node, etc.
+ *
+ * The operating system may move memory pages from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified in \p flags, set is
+ * considered a nodeset. Otherwise it's a cpuset.
+ *
+ * If \p len is 0, \p set is emptied.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_memlocation(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, int flags);
+
+/** \brief Allocate some memory
+ *
+ * This is equivalent to malloc(), except that it tries to allocate
+ * page-aligned memory from the OS.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc(hwloc_topology_t topology, size_t len);
+
+/** \brief Allocate some memory on NUMA memory nodes specified by \p set
+ *
+ * \return NULL with errno set to ENOSYS if the action is not supported
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to EXDEV if the binding cannot be enforced
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to ENOMEM if the memory allocation failed
+ * even before trying to bind.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on NUMA memory nodes specified by \p set
+ *
+ * This is similar to hwloc_alloc_membind() except that it is allowed to change
+ * the current memory binding policy, thus providing more binding support, at
+ * the expense of changing the current state.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Free memory that was previously allocated by hwloc_alloc()
+ * or hwloc_alloc_membind().
+ */
+HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_setsource Changing the Source of Topology Discovery
+ *
+ * If none of the functions below is called, the default is to detect all the objects
+ * of the machine that the caller is allowed to access.
+ *
+ * This default behavior may also be modified through environment variables
+ * if the application did not modify it already.
+ * Setting HWLOC_XMLFILE in the environment enforces the discovery from a XML
+ * file as if hwloc_topology_set_xml() had been called.
+ * Setting HWLOC_SYNTHETIC enforces a synthetic topology as if
+ * hwloc_topology_set_synthetic() had been called.
+ *
+ * Finally, HWLOC_THISSYSTEM enforces the return value of
+ * hwloc_topology_is_thissystem().
+ *
+ * @{
+ */
+
+/** \brief Change which process the topology is viewed from.
+ *
+ * On some systems, processes may have different views of the machine, for
+ * instance the set of allowed CPUs. By default, hwloc exposes the view from
+ * the current process. Calling hwloc_topology_set_pid() permits to make it
+ * expose the topology of the machine from the point of view of another
+ * process.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note -1 is returned and errno is set to ENOSYS on platforms that do not
+ * support this feature.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_pid(hwloc_topology_t __hwloc_restrict topology, hwloc_pid_t pid);
+
+/** \brief Enable synthetic topology.
+ *
+ * Gather topology information from the given \p description,
+ * a space-separated string of <type:number> describing
+ * the object type and arity at each level.
+ * All types may be omitted (space-separated string of numbers) so that
+ * hwloc chooses all types according to usual topologies.
+ * See also the \ref synthetic.
+ *
+ * Setting the environment variable HWLOC_SYNTHETIC
+ * may also result in this behavior.
+ *
+ * If \p description was properly parsed and describes a valid topology
+ * configuration, this function returns 0.
+ * Otherwise -1 is returned and errno is set to EINVAL.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.
+ *
+ * \note On success, the synthetic component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_synthetic(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict description);
+
+/** \brief Enable XML-file based topology.
+ *
+ * Gather topology information from the XML file given at \p xmlpath.
+ * Setting the environment variable HWLOC_XMLFILE may also result in this behavior.
+ * This file may have been generated earlier with hwloc_topology_export_xml() in hwloc/export.h,
+ * or lstopo file.xml.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML file.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.  To have hwloc still actually call OS-specific hooks, the
+ * ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict xmlpath);
+
+/** \brief Enable XML based topology using a memory buffer (instead of
+ * a file, as with hwloc_topology_set_xml()).
+ *
+ * Gather topology information from the XML memory buffer given at \p
+ * buffer and of length \p size.  This buffer may have been filled
+ * earlier with hwloc_topology_export_xmlbuffer() in hwloc/export.h.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML buffer.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.  To have hwloc still actually call OS-specific hooks, the
+ * ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xmlbuffer(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict buffer, int size);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_configuration Topology Detection Configuration and Query
+ *
+ * Several functions can optionally be called between hwloc_topology_init() and
+ * hwloc_topology_load() to configure how the detection should be performed,
+ * e.g. to ignore some objects types, define a synthetic topology, etc.
+ *
+ * @{
+ */
+
+/** \brief Flags to be set onto a topology context before load.
+ *
+ * Flags should be given to hwloc_topology_set_flags().
+ * They may also be returned by hwloc_topology_get_flags().
+ */
+enum hwloc_topology_flags_e {
+  /** \brief Detect the whole system, ignore reservations.
+   *
+   * Gather all resources, even if some were disabled by the administrator.
+   * For instance, ignore Linux Cgroup/Cpusets and gather all processors and memory nodes.
+   *
+   * When this flag is not set, PUs and NUMA nodes that are disallowed are not added to the topology.
+   * Parent objects (package, core, cache, etc.) are added only if some of their children are allowed.
+   *
+   * When this flag is set, the actual sets of allowed PUs and NUMA nodes are given
+   * by hwloc_topology_get_allowed_cpuset() and hwloc_topology_get_allowed_nodeset().
+   * They may be smaller than the root object cpuset and nodeset.
+   *
+   * When this flag is not set, all existing PUs and NUMA nodes in the topology
+   * are allowed. hwloc_topology_get_allowed_cpuset() and hwloc_topology_get_allowed_nodeset()
+   * are equal to the root object cpuset and nodeset.
+   *
+   * If the current topology is exported to XML and reimported later, this flag
+   * should be set again in the reimported topology so that disallowed resources
+   * are reimported as well.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM = (1UL<<0),
+
+  /** \brief Assume that the selected backend provides the topology for the
+   * system on which we are running.
+   *
+   * This forces hwloc_topology_is_thissystem() to return 1, i.e. makes hwloc assume that
+   * the selected backend provides the topology for the system on which we are running,
+   * even if it is not the OS-specific backend but the XML backend for instance.
+   * This means making the binding functions actually call the OS-specific
+   * system calls and really do binding, while the XML backend would otherwise
+   * provide empty hooks just returning success.
+   *
+   * Setting the environment variable HWLOC_THISSYSTEM may also result in the
+   * same behavior.
+   *
+   * This can be used for efficiency reasons to first detect the topology once,
+   * save it to an XML file, and quickly reload it later through the XML
+   * backend, but still having binding functions actually do bind.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM = (1UL<<1),
+
+  /** \brief Get the set of allowed resources from the local operating system even if the topology was loaded from XML or synthetic description.
+   *
+   * If the topology was loaded from XML or from a synthetic string,
+   * restrict it by applying the current process restrictions such as
+   * Linux Cgroup/Cpuset.
+   *
+   * This is useful when the topology is not loaded directly from
+   * the local machine (e.g. for performance reason) and it comes
+   * with all resources, while the running process is restricted
+   * to only parts of the machine.
+   *
+   * This flag is ignored unless ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM is
+   * also set since the loaded topology must match the underlying machine
+   * where restrictions will be gathered from.
+   *
+   * Setting the environment variable HWLOC_THISSYSTEM_ALLOWED_RESOURCES
+   * would result in the same behavior.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES = (1UL<<2)
+};
+
+/** \brief Set OR'ed flags to non-yet-loaded topology.
+ *
+ * Set a OR'ed set of ::hwloc_topology_flags_e onto a topology that was not yet loaded.
+ *
+ * If this function is called multiple times, the last invocation will erase
+ * and replace the set of flags that was previously set.
+ *
+ * The flags set in a topology may be retrieved with hwloc_topology_get_flags()
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_flags (hwloc_topology_t topology, unsigned long flags);
+
+/** \brief Get OR'ed flags of a topology.
+ *
+ * Get the OR'ed set of ::hwloc_topology_flags_e of a topology.
+ *
+ * \return the flags previously set with hwloc_topology_set_flags().
+ */
+HWLOC_DECLSPEC unsigned long hwloc_topology_get_flags (hwloc_topology_t topology);
+
+/** \brief Does the topology context come from this system?
+ *
+ * \return 1 if this topology context was built using the system
+ * running this program.
+ * \return 0 instead (for instance if using another file-system root,
+ * a XML topology file, or a synthetic topology).
+ */
+HWLOC_DECLSPEC int hwloc_topology_is_thissystem(hwloc_topology_t  __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Flags describing actual discovery support for this topology. */
+struct hwloc_topology_discovery_support {
+  /** \brief Detecting the number of PU objects is supported. */
+  unsigned char pu;
+  /** \brief Detecting the number of NUMA nodes is supported. */
+  unsigned char numa;
+  /** \brief Detecting the amount of memory in NUMA nodes is supported. */
+  unsigned char numa_memory;
+};
+
+/** \brief Flags describing actual PU binding support for this topology.
+ *
+ * A flag may be set even if the feature isn't supported in all cases
+ * (e.g. binding to random sets of non-contiguous objects).
+ */
+struct hwloc_topology_cpubind_support {
+  /** Binding the whole current process is supported.  */
+  unsigned char set_thisproc_cpubind;
+  /** Getting the binding of the whole current process is supported.  */
+  unsigned char get_thisproc_cpubind;
+  /** Binding a whole given process is supported.  */
+  unsigned char set_proc_cpubind;
+  /** Getting the binding of a whole given process is supported.  */
+  unsigned char get_proc_cpubind;
+  /** Binding the current thread only is supported.  */
+  unsigned char set_thisthread_cpubind;
+  /** Getting the binding of the current thread only is supported.  */
+  unsigned char get_thisthread_cpubind;
+  /** Binding a given thread only is supported.  */
+  unsigned char set_thread_cpubind;
+  /** Getting the binding of a given thread only is supported.  */
+  unsigned char get_thread_cpubind;
+  /** Getting the last processors where the whole current process ran is supported.  */
+  unsigned char get_thisproc_last_cpu_location;
+  /** Getting the last processors where a whole process ran is supported.  */
+  unsigned char get_proc_last_cpu_location;
+  /** Getting the last processors where the current thread ran is supported.  */
+  unsigned char get_thisthread_last_cpu_location;
+};
+
+/** \brief Flags describing actual memory binding support for this topology.
+ *
+ * A flag may be set even if the feature isn't supported in all cases
+ * (e.g. binding to random sets of non-contiguous objects).
+ */
+struct hwloc_topology_membind_support {
+  /** Binding the whole current process is supported.  */
+  unsigned char set_thisproc_membind;
+  /** Getting the binding of the whole current process is supported.  */
+  unsigned char get_thisproc_membind;
+  /** Binding a whole given process is supported.  */
+  unsigned char set_proc_membind;
+  /** Getting the binding of a whole given process is supported.  */
+  unsigned char get_proc_membind;
+  /** Binding the current thread only is supported.  */
+  unsigned char set_thisthread_membind;
+  /** Getting the binding of the current thread only is supported.  */
+  unsigned char get_thisthread_membind;
+  /** Binding a given memory area is supported. */
+  unsigned char set_area_membind;
+  /** Getting the binding of a given memory area is supported.  */
+  unsigned char get_area_membind;
+  /** Allocating a bound memory area is supported. */
+  unsigned char alloc_membind;
+  /** First-touch policy is supported. */
+  unsigned char firsttouch_membind;
+  /** Bind policy is supported. */
+  unsigned char bind_membind;
+  /** Interleave policy is supported. */
+  unsigned char interleave_membind;
+  /** Next-touch migration policy is supported. */
+  unsigned char nexttouch_membind;
+  /** Migration flags are supported. */
+  unsigned char migrate_membind;
+  /** Getting the last NUMA nodes where a memory area was allocated is supported.  */
+  unsigned char get_area_memlocation;
+};
+
+/** \brief Set of flags describing actual support for this topology.
+ *
+ * This is retrieved with hwloc_topology_get_support() and will be valid until
+ * the topology object is destroyed.  Note: the values are correct only after
+ * discovery.
+ */
+struct hwloc_topology_support {
+  /** Discovery support flags, see ::hwloc_topology_discovery_support. */
+  struct hwloc_topology_discovery_support *discovery;
+  /** CPU binding support flags, see ::hwloc_topology_cpubind_support. */
+  struct hwloc_topology_cpubind_support *cpubind;
+  /** Memory binding support flags, see ::hwloc_topology_membind_support. */
+  struct hwloc_topology_membind_support *membind;
+};
+
+/** \brief Retrieve the topology support.
+ *
+ * Each flag indicates whether a feature is supported.
+ * If set to 0, the feature is not supported.
+ * If set to 1, the feature is supported, but the corresponding
+ * call may still fail in some corner cases.
+ *
+ * These features are also listed by hwloc-info \--support
+ */
+HWLOC_DECLSPEC const struct hwloc_topology_support *hwloc_topology_get_support(hwloc_topology_t __hwloc_restrict topology);
+
+/** \brief Type filtering flags.
+ *
+ * By default, most objects are kept (::HWLOC_TYPE_FILTER_KEEP_ALL).
+ * Instruction caches, I/O and Misc objects are ignored by default (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ * Group levels are ignored unless they bring structure (::HWLOC_TYPE_FILTER_KEEP_STRUCTURE).
+ *
+ * Note that group objects are also ignored individually (without the entire level)
+ * when they do not bring structure.
+ */
+enum hwloc_type_filter_e {
+  /** \brief Keep all objects of this type.
+   *
+   * Cannot be set for ::HWLOC_OBJ_GROUP (groups are designed only to add more structure to the topology).
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_ALL = 0,
+
+  /** \brief Ignore all objects of this type.
+   *
+   * The bottom-level type ::HWLOC_OBJ_PU, the ::HWLOC_OBJ_NUMANODE type, and
+   * the top-level type ::HWLOC_OBJ_MACHINE may not be ignored.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_NONE = 1,
+
+  /** \brief Only ignore objects if their entire level does not bring any structure.
+   *
+   * Keep the entire level of objects if at least one of these objects adds
+   * structure to the topology. An object brings structure when it has multiple
+   * children and it is not the only child of its parent.
+   *
+   * If all objects in the level are the only child of their parent, and if none
+   * of them has multiple children, the entire level is removed.
+   *
+   * Cannot be set for I/O and Misc objects since the topology structure does not matter there.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_STRUCTURE = 2,
+
+  /** \brief Only keep likely-important objects of the given type.
+   *
+   * It is only useful for I/O object types.
+   * For ::HWLOC_OBJ_PCI_DEVICE and ::HWLOC_OBJ_OS_DEVICE, it means that only objects
+   * of major/common kinds are kept (storage, network, OpenFabrics, Intel MICs, CUDA,
+   * OpenCL, NVML, and displays).
+   * Also, only OS devices directly attached on PCI (e.g. no USB) are reported.
+   * For ::HWLOC_OBJ_BRIDGE, it means that bridges are kept only if they have children.
+   *
+   * This flag is equivalent to ::HWLOC_TYPE_FILTER_KEEP_ALL for Normal, Memory and Misc types
+   * since they are likely important.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_IMPORTANT = 3
+};
+
+/** \brief Set the filtering for the given object type.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_type_filter(hwloc_topology_t topology, hwloc_obj_type_t type, enum hwloc_type_filter_e filter);
+
+/** \brief Get the current filtering for the given object type.
+ */
+HWLOC_DECLSPEC int hwloc_topology_get_type_filter(hwloc_topology_t topology, hwloc_obj_type_t type, enum hwloc_type_filter_e *filter);
+
+/** \brief Set the filtering for all object types.
+ *
+ * If some types do not support this filtering, they are silently ignored.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_all_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all cache object types.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_cache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all instruction cache object types.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_icache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all I/O object types.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_io_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the topology-specific userdata pointer.
+ *
+ * Each topology may store one application-given private data pointer.
+ * It is initialized to \c NULL.
+ * hwloc will never modify it.
+ *
+ * Use it as you wish, after hwloc_topology_init() and until hwloc_topology_destroy().
+ *
+ * This pointer is not exported to XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata(hwloc_topology_t topology, const void *userdata);
+
+/** \brief Retrieve the topology-specific userdata pointer.
+ *
+ * Retrieve the application-given private data pointer that was
+ * previously set with hwloc_topology_set_userdata().
+ */
+HWLOC_DECLSPEC void * hwloc_topology_get_userdata(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_tinker Modifying a loaded Topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_topology_restrict(). */
+enum hwloc_restrict_flags_e {
+  /** \brief Remove all objects that became CPU-less.
+   * By default, only objects that contain no PU and no memory are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_REMOVE_CPULESS = (1UL<<0),
+
+  /** \brief Move Misc objects to ancestors if their parents are removed during restriction.
+   * If this flag is not set, Misc objects are removed when their parents are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_MISC = (1UL<<1),
+
+  /** \brief Move I/O objects to ancestors if their parents are removed during restriction.
+   * If this flag is not set, I/O devices and bridges are removed when their parents are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_IO = (1UL<<2)
+};
+
+/** \brief Restrict the topology to the given CPU set.
+ *
+ * Topology \p topology is modified so as to remove all objects that
+ * are not included (or partially included) in the CPU set \p cpuset.
+ * All objects CPU and node sets are restricted accordingly.
+ *
+ * \p flags is a OR'ed set of ::hwloc_restrict_flags_e.
+ *
+ * \note This call may not be reverted by restricting back to a larger
+ * cpuset. Once dropped during restriction, objects may not be brought
+ * back, except by loading another topology with hwloc_topology_load().
+ *
+ * \return 0 on success.
+ *
+ * \return -1 with errno set to EINVAL if the input cpuset is invalid.
+ * The topology is not modified in this case.
+ *
+ * \return -1 with errno set to ENOMEM on failure to allocate internal data.
+ * The topology is reinitialized in this case. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
+ */
+HWLOC_DECLSPEC int hwloc_topology_restrict(hwloc_topology_t __hwloc_restrict topology, hwloc_const_cpuset_t cpuset, unsigned long flags);
+
+/** \brief Add a MISC object as a leaf of the topology
+ *
+ * A new MISC object will be created and inserted into the topology at the
+ * position given by parent. It is appended to the list of existing Misc children,
+ * without ever adding any intermediate hierarchy level. This is useful for
+ * annotating the topology without actually changing the hierarchy.
+ *
+ * \p name is supposed to be unique across all Misc objects in the topology.
+ * It will be duplicated to setup the new object attributes.
+ *
+ * The new leaf object will not have any \p cpuset.
+ *
+ * \return the newly-created object
+ *
+ * \return \c NULL on error.
+ *
+ * \return \c NULL if Misc objects are filtered-out of the topology (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ *
+ * \note If \p name contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_misc_object(hwloc_topology_t topology, hwloc_obj_t parent, const char *name);
+
+/** \brief Allocate a Group object to insert later with hwloc_topology_insert_group_object().
+ *
+ * This function returns a new Group object.
+ * The caller should (at least) initialize its sets before inserting the object.
+ * See hwloc_topology_insert_group_object().
+ *
+ * The \p subtype object attribute may be set to display something else
+ * than "Group" as the type name for this object in lstopo.
+ * Custom name/value info pairs may be added with hwloc_obj_add_info() after
+ * insertion.
+ *
+ * The \p kind group attribute should be 0. The \p subkind group attribute may
+ * be set to identify multiple Groups of the same level.
+ *
+ * It is recommended not to set any other object attribute before insertion,
+ * since the Group may get discarded during insertion.
+ *
+ * The object will be destroyed if passed to hwloc_topology_insert_group_object()
+ * without any set defined.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t topology);
+
+/** \brief Add more structure to the topology by adding an intermediate Group
+ *
+ * The caller should first allocate a new Group object with hwloc_topology_alloc_group_object().
+ * Then it must setup at least one of its CPU or node sets to specify
+ * the final location of the Group in the topology.
+ * Then the object can be passed to this function for actual insertion in the topology.
+ *
+ * The group \p dont_merge attribute may be set to prevent the core from
+ * ever merging this object with another object hierarchically-identical.
+ *
+ * Either the cpuset or nodeset field (or both, if compatible) must be set
+ * to a non-empty bitmap. The complete_cpuset or complete_nodeset may be set
+ * instead if inserting with respect to the complete topology
+ * (including disallowed, offline or unknown objects).
+ *
+ * If grouping several objects, hwloc_obj_add_other_obj_sets() is an easy way
+ * to build the Group sets iteratively.
+ *
+ * These sets cannot be larger than the current topology, or they would get
+ * restricted silently.
+ *
+ * The core will setup the other sets after actual insertion.
+ *
+ * \return The inserted object if it was properly inserted.
+ *
+ * \return An existing object if the Group was discarded because the topology already
+ * contained an object at the same location (the Group did not add any locality information).
+ * Any name/info key pair set before inserting is appended to the existing object.
+ *
+ * \return \c NULL if the insertion failed because of conflicting sets in topology tree.
+ *
+ * \return \c NULL if Group objects are filtered-out of the topology (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ *
+ * \return \c NULL if the object was discarded because no set was initialized in the Group
+ * before insert, or all of them were empty.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_group_object(hwloc_topology_t topology, hwloc_obj_t group);
+
+/** \brief Setup object cpusets/nodesets by OR'ing another object's sets.
+ *
+ * For each defined cpuset or nodeset in \p src, allocate the corresponding set
+ * in \p dst and add \p src to it by OR'ing sets.
+ *
+ * This function is convenient between hwloc_topology_alloc_group_object()
+ * and hwloc_topology_insert_group_object(). It builds the sets of the new Group
+ * that will be inserted as a new intermediate parent of several objects.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src);
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+/* high-level helpers */
+#include <hwloc/helper.h>
+
+/* inline code of some functions above */
+#include <hwloc/inlines.h>
+
+/* exporting to XML or synthetic */
+#include <hwloc/export.h>
+
+/* distances */
+#include <hwloc/distances.h>
+
+/* topology diffs */
+#include <hwloc/diff.h>
+
+/* deprecated headers */
+#include <hwloc/deprecated.h>
+
+#endif /* HWLOC_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/autogen/config.h b/src/3rdparty/hwloc/include/hwloc/autogen/config.h
new file mode 100644
index 000000000..14d4481d2
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/autogen/config.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_CONFIG_H
+#define HWLOC_CONFIG_H
+
+#define HWLOC_VERSION "2.0.4"
+#define HWLOC_VERSION_MAJOR 2
+#define HWLOC_VERSION_MINOR 0
+#define HWLOC_VERSION_RELEASE 4
+#define HWLOC_VERSION_GREEK ""
+
+#define __hwloc_restrict
+#define __hwloc_inline __inline
+
+#define __hwloc_attribute_unused
+#define __hwloc_attribute_malloc
+#define __hwloc_attribute_const
+#define __hwloc_attribute_pure
+#define __hwloc_attribute_deprecated
+#define __hwloc_attribute_may_alias
+#define __hwloc_attribute_warn_unused_result
+
+/* Defined to 1 if you have the `windows.h' header. */
+#define HWLOC_HAVE_WINDOWS_H 1
+#define hwloc_pid_t HANDLE
+#define hwloc_thread_t HANDLE
+
+#include <windows.h>
+#include <BaseTsd.h>
+typedef DWORDLONG hwloc_uint64_t;
+
+#if defined( _USRDLL ) /* dynamic linkage */
+#if defined( DECLSPEC_EXPORTS )
+#define HWLOC_DECLSPEC __declspec(dllexport)
+#else
+#define HWLOC_DECLSPEC __declspec(dllimport)
+#endif
+#else /* static linkage */
+#define HWLOC_DECLSPEC
+#endif
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#define HWLOC_SYM_TRANSFORM 0
+
+/* The hwloc symbol prefix */
+#define HWLOC_SYM_PREFIX hwloc_
+
+/* The hwloc symbol prefix in all caps */
+#define HWLOC_SYM_PREFIX_CAPS HWLOC_
+
+#endif /* HWLOC_CONFIG_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/bitmap.h b/src/3rdparty/hwloc/include/hwloc/bitmap.h
new file mode 100644
index 000000000..bae623c8c
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/bitmap.h
@@ -0,0 +1,467 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief The bitmap API, for use in hwloc itself.
+ */
+
+#ifndef HWLOC_BITMAP_H
+#define HWLOC_BITMAP_H
+
+#include <hwloc/autogen/config.h>
+#include <assert.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_bitmap The bitmap API
+ *
+ * The ::hwloc_bitmap_t type represents a set of integers (positive or null).
+ * A bitmap may be of infinite size (all bits are set after some point).
+ * A bitmap may even be full if all bits are set.
+ *
+ * Bitmaps are used by hwloc for sets of OS processors
+ * (which may actually be hardware threads) as by ::hwloc_cpuset_t
+ * (a typedef for ::hwloc_bitmap_t), or sets of NUMA memory nodes
+ * as ::hwloc_nodeset_t (also a typedef for ::hwloc_bitmap_t).
+ * Those are used for cpuset and nodeset fields in the ::hwloc_obj structure,
+ * see \ref hwlocality_object_sets.
+ *
+ * <em>Both CPU and node sets are always indexed by OS physical number.</em>
+ * However users should usually not build CPU and node sets manually
+ * (e.g. with hwloc_bitmap_set()).
+ * One should rather use existing object sets and combine them with
+ * hwloc_bitmap_or(), etc.
+ * For instance, binding the current thread on a pair of cores may be performed with:
+ * \code
+ * hwloc_obj_t core1 = ... , core2 = ... ;
+ * hwloc_bitmap_t set = hwloc_bitmap_alloc();
+ * hwloc_bitmap_or(set, core1->cpuset, core2->cpuset);
+ * hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD);
+ * hwloc_bitmap_free(set);
+ * \endcode
+ *
+ * \note Most functions below return an int that may be negative in case of
+ * error. The usual error case would be an internal failure to realloc/extend
+ * the storage of the bitmap (\p errno would be set to \c ENOMEM).
+ *
+ * \note Several examples of using the bitmap API are available under the
+ * doc/examples/ directory in the source tree.
+ * Regression tests such as tests/hwloc/hwloc_bitmap*.c also make intensive use
+ * of this API.
+ * @{
+ */
+
+
+/** \brief
+ * Set of bits represented as an opaque pointer to an internal bitmap.
+ */
+typedef struct hwloc_bitmap_s * hwloc_bitmap_t;
+/** \brief a non-modifiable ::hwloc_bitmap_t */
+typedef const struct hwloc_bitmap_s * hwloc_const_bitmap_t;
+
+
+/*
+ * Bitmap allocation, freeing and copying.
+ */
+
+/** \brief Allocate a new empty bitmap.
+ *
+ * \returns A valid bitmap or \c NULL.
+ *
+ * The bitmap should be freed by a corresponding call to
+ * hwloc_bitmap_free().
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc(void) __hwloc_attribute_malloc;
+
+/** \brief Allocate a new full bitmap. */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc_full(void) __hwloc_attribute_malloc;
+
+/** \brief Free bitmap \p bitmap.
+ *
+ * If \p bitmap is \c NULL, no operation is performed.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_free(hwloc_bitmap_t bitmap);
+
+/** \brief Duplicate bitmap \p bitmap by allocating a new bitmap and copying \p bitmap contents.
+ *
+ * If \p bitmap is \c NULL, \c NULL is returned.
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_dup(hwloc_const_bitmap_t bitmap) __hwloc_attribute_malloc;
+
+/** \brief Copy the contents of bitmap \p src into the already allocated bitmap \p dst */
+HWLOC_DECLSPEC int hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t src);
+
+
+/*
+ * Bitmap/String Conversion
+ */
+
+/** \brief Stringify a bitmap.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated string.
+ *
+ * \return -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a bitmap string and stores it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the list format.
+ *
+ * Lists are comma-separated indexes or ranges.
+ * Ranges are dash separated indexes.
+ * The last range may not have an ending index if the bitmap is infinitely set.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated list string.
+ *
+ * \return -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a list string and stores it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the taskset-specific format.
+ *
+ * The taskset command manipulates bitmap strings that contain a single
+ * (possibly very long) hexadecimal number starting with 0x.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated taskset-specific string.
+ *
+ * \return -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a taskset-specific bitmap string and stores it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+
+/*
+ * Building bitmaps.
+ */
+
+/** \brief Empty the bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_zero(hwloc_bitmap_t bitmap);
+
+/** \brief Fill bitmap \p bitmap with all possible indexes (even if those objects don't exist or are otherwise unavailable) */
+HWLOC_DECLSPEC void hwloc_bitmap_fill(hwloc_bitmap_t bitmap);
+
+/** \brief Empty the bitmap \p bitmap and add bit \p id */
+HWLOC_DECLSPEC int hwloc_bitmap_only(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Fill the bitmap \p bitmap and clear the index \p id */
+HWLOC_DECLSPEC int hwloc_bitmap_allbut(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask */
+HWLOC_DECLSPEC int hwloc_bitmap_from_ulong(hwloc_bitmap_t bitmap, unsigned long mask);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask used as \p i -th subset */
+HWLOC_DECLSPEC int hwloc_bitmap_from_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+
+/*
+ * Modifying bitmaps.
+ */
+
+/** \brief Add index \p id in bitmap \p bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_set(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Add indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_set_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Replace \p i -th subset of bitmap \p bitmap with unsigned long \p mask */
+HWLOC_DECLSPEC int hwloc_bitmap_set_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+/** \brief Remove index \p id from bitmap \p bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_clr(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Remove indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_clr_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Keep a single index among those set in bitmap \p bitmap
+ *
+ * May be useful before binding so that the process does not
+ * have a chance of migrating between multiple logical CPUs
+ * in the original mask.
+ * Instead of running the task on any PU inside the given CPU set,
+ * the operating system scheduler will be forced to run it on a single
+ * of these PUs.
+ * It avoids a migration overhead and cache-line ping-pongs between PUs.
+ *
+ * \note This function is NOT meant to distribute multiple processes
+ * within a single CPU set. It always returns the same single bit when
+ * called multiple times on the same input set. hwloc_distrib() may
+ * be used for generating CPU sets to distribute multiple tasks below
+ * a single multi-PU object.
+ *
+ * \note This function cannot be applied to an object set directly. It
+ * should be applied to a copy (which may be obtained with hwloc_bitmap_dup()).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_singlify(hwloc_bitmap_t bitmap);
+
+
+/*
+ * Consulting bitmaps.
+ */
+
+/** \brief Convert the beginning part of bitmap \p bitmap into unsigned long \p mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ulong(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Convert the \p i -th subset of bitmap \p bitmap into unsigned long mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ith_ulong(hwloc_const_bitmap_t bitmap, unsigned i) __hwloc_attribute_pure;
+
+/** \brief Test whether index \p id is part of bitmap \p bitmap.
+ *
+ * \return 1 if the bit at index \p id is set in bitmap \p bitmap, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isset(hwloc_const_bitmap_t bitmap, unsigned id) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is empty
+ *
+ * \return 1 if bitmap is empty, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_iszero(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is completely full
+ *
+ * \return 1 if bitmap is full, 0 otherwise.
+ *
+ * \note A full bitmap is always infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isfull(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the first index (least significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_first(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the next index in bitmap \p bitmap which is after index \p prev
+ *
+ * If \p prev is -1, the first index is returned.
+ *
+ * \return -1 if no index with higher index is set in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_next(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
+
+/** \brief Compute the last index (most significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set in \p bitmap, or if \p bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_last(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the "weight" of bitmap \p bitmap (i.e., number of
+ * indexes that are in the bitmap).
+ *
+ * \return the number of indexes that are in the bitmap.
+ *
+ * \return -1 if \p bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_weight(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the first unset index (least significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is unset in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_first_unset(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the next unset index in bitmap \p bitmap which is after index \p prev
+ *
+ * If \p prev is -1, the first unset index is returned.
+ *
+ * \return -1 if no index with higher index is unset in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_next_unset(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
+
+/** \brief Compute the last unset index (most significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is unset in \p bitmap, or if \p bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_last_unset(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Loop macro iterating on bitmap \p bitmap
+ *
+ * The loop must start with hwloc_bitmap_foreach_begin() and end
+ * with hwloc_bitmap_foreach_end() followed by a terminating ';'.
+ *
+ * \p index is the loop variable; it should be an unsigned int.  The
+ * first iteration will set \p index to the lowest index in the bitmap.
+ * Successive iterations will iterate through, in order, all remaining
+ * indexes set in the bitmap.  To be specific: each iteration will return a
+ * value for \p index such that hwloc_bitmap_isset(bitmap, index) is true.
+ *
+ * The assert prevents the loop from being infinite if the bitmap is infinitely set.
+ *
+ * \hideinitializer
+ */
+#define hwloc_bitmap_foreach_begin(id, bitmap) \
+do { \
+        assert(hwloc_bitmap_weight(bitmap) != -1); \
+        for (id = hwloc_bitmap_first(bitmap); \
+             (unsigned) id != (unsigned) -1; \
+             id = hwloc_bitmap_next(bitmap, id)) {
+
+/** \brief End of loop macro iterating on a bitmap.
+ *
+ * Needs a terminating ';'.
+ *
+ * \sa hwloc_bitmap_foreach_begin()
+ * \hideinitializer
+ */
+#define hwloc_bitmap_foreach_end()		\
+        } \
+} while (0)
+
+
+/*
+ * Combining bitmaps.
+ */
+
+/** \brief Or bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_or (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_and (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmap \p bitmap1 and the negation of \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_andnot (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Xor bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_xor (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Negate bitmap \p bitmap and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_not (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap);
+
+
+/*
+ * Comparing bitmaps.
+ */
+
+/** \brief Test whether bitmaps \p bitmap1 and \p bitmap2 intersect.
+ *
+ * \return 1 if bitmaps intersect, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_intersects (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p sub_bitmap is part of bitmap \p super_bitmap.
+ *
+ * \return 1 if \p sub_bitmap is included in \p super_bitmap, 0 otherwise.
+ *
+ * \note The empty bitmap is considered included in any other bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isincluded (hwloc_const_bitmap_t sub_bitmap, hwloc_const_bitmap_t super_bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap1 is equal to bitmap \p bitmap2.
+ *
+ * \return 1 if bitmaps are equal, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isequal (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 using their lowest index.
+ *
+ * A bitmap is considered smaller if its least significant bit is smaller.
+ * The empty bitmap is considered higher than anything (because its least significant bit does not exist).
+ *
+ * \return -1 if \p bitmap1 is considered smaller than \p bitmap2.
+ * \return 1 if \p bitmap1 is considered larger than \p bitmap2.
+ *
+ * For instance comparing binary bitmaps 0011 and 0110 returns -1
+ * (hence 0011 is considered smaller than 0110)
+ * because least significant bit of 0011 (0001) is smaller than least significant bit of 0110 (0010).
+ * Comparing 01001 and 00110 would also return -1 for the same reason.
+ *
+ * \return 0 if bitmaps are considered equal, even if they are not strictly equal.
+ * They just need to have the same least significant bit.
+ * For instance, comparing binary bitmaps 0010 and 0110 returns 0 because they have the same least significant bit.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_first(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 in lexicographic order.
+ *
+ * Lexicographic comparison of bitmaps, starting for their highest indexes.
+ * Compare last indexes first, then second, etc.
+ * The empty bitmap is considered lower than anything.
+ *
+ * \return -1 if \p bitmap1 is considered smaller than \p bitmap2.
+ * \return 1 if \p bitmap1 is considered larger than \p bitmap2.
+ * \return 0 if bitmaps are equal (contrary to hwloc_bitmap_compare_first()).
+ *
+ * For instance comparing binary bitmaps 0011 and 0110 returns -1
+ * (hence 0011 is considered smaller than 0110).
+ * Comparing 00101 and 01010 returns -1 too.
+ *
+ * \note This is different from the non-existing hwloc_bitmap_compare_last()
+ * which would only compare the highest index of each bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_BITMAP_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/cuda.h b/src/3rdparty/hwloc/include/hwloc/cuda.h
new file mode 100644
index 000000000..77c8473e6
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/cuda.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright © 2010-2017 Inria.  All rights reserved.
+ * Copyright © 2010-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the CUDA Driver API.
+ *
+ * Applications that use both hwloc and the CUDA Driver API may want to
+ * include this file so as to get topology information for CUDA devices.
+ *
+ */
+
+#ifndef HWLOC_CUDA_H
+#define HWLOC_CUDA_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <cuda.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_cuda Interoperability with the CUDA Driver API
+ *
+ * This interface offers ways to retrieve topology information about
+ * CUDA devices when using the CUDA Driver API.
+ *
+ * @{
+ */
+
+/** \brief Return the domain, bus and device IDs of the CUDA device \p cudevice.
+ *
+ * Device \p cudevice must match the local machine.
+ */
+static __hwloc_inline int
+hwloc_cuda_get_device_pci_ids(hwloc_topology_t topology __hwloc_attribute_unused,
+			      CUdevice cudevice, int *domain, int *bus, int *dev)
+{
+  CUresult cres;
+
+#if CUDA_VERSION >= 4000
+  cres = cuDeviceGetAttribute(domain, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cudevice);
+  if (cres != CUDA_SUCCESS) {
+    errno = ENOSYS;
+    return -1;
+  }
+#else
+  *domain = 0;
+#endif
+  cres = cuDeviceGetAttribute(bus, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cudevice);
+  if (cres != CUDA_SUCCESS) {
+    errno = ENOSYS;
+    return -1;
+  }
+  cres = cuDeviceGetAttribute(dev, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cudevice);
+  if (cres != CUDA_SUCCESS) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return 0;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p cudevice.
+ *
+ * Return the CPU set describing the locality of the CUDA device \p cudevice.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection and the CUDA component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_cuda_get_device_osdev()
+ * and hwloc_cuda_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_cuda_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			     CUdevice cudevice, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_CUDA_DEVICE_SYSFS_PATH_MAX 128
+  char path[HWLOC_CUDA_DEVICE_SYSFS_PATH_MAX];
+  int domainid, busid, deviceid;
+
+  if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domainid, &busid, &deviceid))
+    return -1;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", domainid, busid, deviceid);
+  if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+      || hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc PCI device object corresponding to the
+ * CUDA device \p cudevice.
+ *
+ * Return the PCI device object describing the CUDA device \p cudevice.
+ * Return NULL if there is none.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection must be enabled in topology \p topology.
+ * The CUDA component is not needed in the topology.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_pcidev(hwloc_topology_t topology, CUdevice cudevice)
+{
+  int domain, bus, dev;
+
+  if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domain, &bus, &dev))
+    return NULL;
+
+  return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, 0);
+}
+
+/** \brief Get the hwloc OS device object corresponding to CUDA device \p cudevice.
+ *
+ * Return the hwloc OS device object that describes the given
+ * CUDA device \p cudevice. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_cuda_get_device_cpuset().
+ *
+ * \note This function cannot work if PCI devices are filtered out.
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer (unless PCI devices are filtered out).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_osdev(hwloc_topology_t topology, CUdevice cudevice)
+{
+	hwloc_obj_t osdev = NULL;
+	int domain, bus, dev;
+
+	if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domain, &bus, &dev))
+		return NULL;
+
+	osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		hwloc_obj_t pcidev = osdev->parent;
+		if (strncmp(osdev->name, "cuda", 4))
+			continue;
+		if (pcidev
+		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+		    && (int) pcidev->attr->pcidev.domain == domain
+		    && (int) pcidev->attr->pcidev.bus == bus
+		    && (int) pcidev->attr->pcidev.dev == dev
+		    && pcidev->attr->pcidev.func == 0)
+			return osdev;
+		/* if PCI are filtered out, we need a info attr to match on */
+	}
+
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the OS device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object (unless PCI devices are filtered out).
+ *
+ * \note This function is identical to hwloc_cudart_get_device_osdev_by_index().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+	hwloc_obj_t osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type
+		    && osdev->name
+		    && !strncmp("cuda", osdev->name, 4)
+		    && atoi(osdev->name + 4) == (int) idx)
+			return osdev;
+	}
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_CUDA_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/cudart.h b/src/3rdparty/hwloc/include/hwloc/cudart.h
new file mode 100644
index 000000000..63c7f59c6
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/cudart.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright © 2010-2017 Inria.  All rights reserved.
+ * Copyright © 2010-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the CUDA Runtime API.
+ *
+ * Applications that use both hwloc and the CUDA Runtime API may want to
+ * include this file so as to get topology information for CUDA devices.
+ *
+ */
+
+#ifndef HWLOC_CUDART_H
+#define HWLOC_CUDART_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <cuda.h> /* for CUDA_VERSION */
+#include <cuda_runtime_api.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_cudart Interoperability with the CUDA Runtime API
+ *
+ * This interface offers ways to retrieve topology information about
+ * CUDA devices when using the CUDA Runtime API.
+ *
+ * @{
+ */
+
+/** \brief Return the domain, bus and device IDs of the CUDA device whose index is \p idx.
+ *
+ * Device index \p idx must match the local machine.
+ */
+static __hwloc_inline int
+hwloc_cudart_get_device_pci_ids(hwloc_topology_t topology __hwloc_attribute_unused,
+				int idx, int *domain, int *bus, int *dev)
+{
+  cudaError_t cerr;
+  struct cudaDeviceProp prop;
+
+  cerr = cudaGetDeviceProperties(&prop, idx);
+  if (cerr) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+#if CUDA_VERSION >= 4000
+  *domain = prop.pciDomainID;
+#else
+  *domain = 0;
+#endif
+
+  *bus = prop.pciBusID;
+  *dev = prop.pciDeviceID;
+
+  return 0;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p idx.
+ *
+ * Return the CPU set describing the locality of the CUDA device
+ * whose index is \p idx.
+ *
+ * Topology \p topology and device \p idx must match the local machine.
+ * I/O devices detection and the CUDA component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_cudart_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_cudart_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			       int idx, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_CUDART_DEVICE_SYSFS_PATH_MAX 128
+  char path[HWLOC_CUDART_DEVICE_SYSFS_PATH_MAX];
+  int domain, bus, dev;
+
+  if (hwloc_cudart_get_device_pci_ids(topology, idx, &domain, &bus, &dev))
+    return -1;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", (unsigned) domain, (unsigned) bus, (unsigned) dev);
+  if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+      || hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc PCI device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the PCI device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p idx must match the local machine.
+ * I/O devices detection must be enabled in topology \p topology.
+ * The CUDA component is not needed in the topology.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cudart_get_device_pcidev(hwloc_topology_t topology, int idx)
+{
+  int domain, bus, dev;
+
+  if (hwloc_cudart_get_device_pci_ids(topology, idx, &domain, &bus, &dev))
+    return NULL;
+
+  return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, 0);
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the OS device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_cudart_get_device_cpuset().
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object (unless PCI devices are filtered out).
+ *
+ * \note This function is identical to hwloc_cuda_get_device_osdev_by_index().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cudart_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+	hwloc_obj_t osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type
+		    && osdev->name
+		    && !strncmp("cuda", osdev->name, 4)
+		    && atoi(osdev->name + 4) == (int) idx)
+			return osdev;
+	}
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_CUDART_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/deprecated.h b/src/3rdparty/hwloc/include/hwloc/deprecated.h
new file mode 100644
index 000000000..8f3b1459a
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/deprecated.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2017 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/**
+ * This file contains the inline code of functions declared in hwloc.h
+ */
+
+#ifndef HWLOC_DEPRECATED_H
+#define HWLOC_DEPRECATED_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* backward compat with v1.11 before System removal */
+#define HWLOC_OBJ_SYSTEM HWLOC_OBJ_MACHINE
+/* backward compat with v1.10 before Socket->Package renaming */
+#define HWLOC_OBJ_SOCKET HWLOC_OBJ_PACKAGE
+/* backward compat with v1.10 before Node->NUMANode clarification */
+#define HWLOC_OBJ_NODE HWLOC_OBJ_NUMANODE
+
+/** \brief Insert a misc object by parent.
+ *
+ * Identical to hwloc_topology_insert_misc_object().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name) __hwloc_attribute_deprecated;
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name)
+{
+  return hwloc_topology_insert_misc_object(topology, parent, name);
+}
+
+/** \brief Stringify the cpuset containing a set of objects.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of character that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+static __hwloc_inline int
+hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs)
+{
+  hwloc_bitmap_t set = hwloc_bitmap_alloc();
+  int res;
+  unsigned i;
+
+  hwloc_bitmap_zero(set);
+  for(i=0; i<nobj; i++)
+    if (objs[i]->cpuset)
+      hwloc_bitmap_or(set, set, objs[i]->cpuset);
+
+  res = hwloc_bitmap_snprintf(str, size, set);
+  hwloc_bitmap_free(set);
+  return res;
+}
+
+/** \brief Convert a type string into a type and some attributes.
+ *
+ * Deprecated by hwloc_type_sscanf()
+ */
+static __hwloc_inline int
+hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize)
+{
+  union hwloc_obj_attr_u attr;
+  int err = hwloc_type_sscanf(string, typep, &attr, sizeof(attr));
+  if (err < 0)
+    return err;
+  if (hwloc_obj_type_is_cache(*typep)) {
+    if (depthattrp)
+      *depthattrp = (int) attr.cache.depth;
+    if (typeattrp && typeattrsize >= sizeof(hwloc_obj_cache_type_t))
+      memcpy(typeattrp, &attr.cache.type, sizeof(hwloc_obj_cache_type_t));
+  } else if (*typep == HWLOC_OBJ_GROUP) {
+    if (depthattrp)
+      *depthattrp = (int) attr.group.depth;
+  }
+  return 0;
+}
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) specified by physical \p nodeset
+ */
+static __hwloc_inline int
+hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_set_membind(topology, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread.
+ */
+static __hwloc_inline int
+hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  return hwloc_get_membind(topology, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) specified by physical \p nodeset
+ */
+static __hwloc_inline int
+hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_set_proc_membind(topology, pid, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process.
+ */
+static __hwloc_inline int
+hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  return hwloc_get_proc_membind(topology, pid, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Bind the already-allocated memory identified by (addr, len)
+ * to the NUMA node(s) in physical \p nodeset.
+ */
+static __hwloc_inline int
+hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_set_area_membind(topology, addr, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Query the physical NUMA node(s) and binding policy of the memory
+ * identified by (\p addr, \p len ).
+ */
+static __hwloc_inline int
+hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  return hwloc_get_area_membind(topology, addr, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Allocate some memory on the given physical nodeset \p nodeset
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc __hwloc_attribute_deprecated;
+static __hwloc_inline void *
+hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_alloc_membind(topology, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Allocate some memory on the given nodeset \p nodeset.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc __hwloc_attribute_deprecated;
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_alloc_membind_policy(topology, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Convert a CPU set into a NUMA node set and handle non-NUMA cases
+ */
+static __hwloc_inline void
+hwloc_cpuset_to_nodeset_strict(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_cpuset_to_nodeset_strict(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
+{
+  hwloc_cpuset_to_nodeset(topology, _cpuset, nodeset);
+}
+
+/** \brief Convert a NUMA node set into a CPU set and handle non-NUMA cases
+ */
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset_strict(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset_strict(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
+{
+  hwloc_cpuset_from_nodeset(topology, _cpuset, nodeset);
+}
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DEPRECATED_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/diff.h b/src/3rdparty/hwloc/include/hwloc/diff.h
new file mode 100644
index 000000000..79f2df3de
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/diff.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright © 2013-2018 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Topology differences.
+ */
+
+#ifndef HWLOC_DIFF_H
+#define HWLOC_DIFF_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_diff Topology differences
+ *
+ * Applications that manipulate many similar topologies, for instance
+ * one for each node of a homogeneous cluster, may want to compress
+ * topologies to reduce the memory footprint.
+ *
+ * This file offers a way to manipulate the difference between topologies
+ * and export/import it to/from XML.
+ * Compression may therefore be achieved by storing one topology
+ * entirely while the others are only described by their differences
+ * with the former.
+ * The actual topology can be reconstructed when actually needed by
+ * applying the precomputed difference to the reference topology.
+ *
+ * This interface targets very similar nodes.
+ * Only very simple differences between topologies are actually
+ * supported, for instance a change in the memory size, the name
+ * of the object, or some info attribute.
+ * More complex differences such as adding or removing objects cannot
+ * be represented in the difference structures and therefore return
+ * errors.
+ * Differences between object sets or topology-wide allowed sets,
+ * cannot be represented either.
+ *
+ * It means that there is no need to apply the difference when
+ * looking at the tree organization (how many levels, how many
+ * objects per level, what kind of objects, CPU and node sets, etc)
+ * and when binding to objects.
+ * However the difference must be applied when looking at object
+ * attributes such as the name, the memory size or info attributes.
+ *
+ * @{
+ */
+
+
+/** \brief Type of one object attribute difference.
+ */
+typedef enum hwloc_topology_diff_obj_attr_type_e {
+  /** \brief The object local memory is modified.
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_uint64_s
+   * (and the index field is ignored).
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
+
+  /** \brief The object name is modified.
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_string_s
+   * (and the name field is ignored).
+   */
+
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME,
+  /** \brief the value of an info attribute is modified.
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_string_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO
+} hwloc_topology_diff_obj_attr_type_t;
+
+/** \brief One object attribute difference.
+ */
+union hwloc_topology_diff_obj_attr_u {
+  struct hwloc_topology_diff_obj_attr_generic_s {
+    /* each part of the union must start with these */
+    hwloc_topology_diff_obj_attr_type_t type;
+  } generic;
+
+  /** \brief Integer attribute modification with an optional index. */
+  struct hwloc_topology_diff_obj_attr_uint64_s {
+    /* used for storing integer attributes */
+    hwloc_topology_diff_obj_attr_type_t type;
+    hwloc_uint64_t index; /* not used for SIZE */
+    hwloc_uint64_t oldvalue;
+    hwloc_uint64_t newvalue;
+  } uint64;
+
+  /** \brief String attribute modification with an optional name */
+  struct hwloc_topology_diff_obj_attr_string_s {
+    /* used for storing name and info pairs */
+    hwloc_topology_diff_obj_attr_type_t type;
+    char *name; /* not used for NAME */
+    char *oldvalue;
+    char *newvalue;
+  } string;
+};
+
+
+/** \brief Type of one element of a difference list.
+ */
+typedef enum hwloc_topology_diff_type_e {
+  /** \brief An object attribute was changed.
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR,
+
+  /** \brief The difference is too complex,
+   * it cannot be represented. The difference below
+   * this object has not been checked.
+   * hwloc_topology_diff_build() will return 1.
+   *
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_too_complex_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+} hwloc_topology_diff_type_t;
+
+/** \brief One element of a difference list between two topologies.
+ */
+typedef union hwloc_topology_diff_u {
+  struct hwloc_topology_diff_generic_s {
+    /* each part of the union must start with these */
+    hwloc_topology_diff_type_t type;
+    union hwloc_topology_diff_u * next; /* pointer to the next element of the list, or NULL */
+  } generic;
+
+  /* A difference in an object attribute. */
+  struct hwloc_topology_diff_obj_attr_s {
+    hwloc_topology_diff_type_t type; /* must be ::HWLOC_TOPOLOGY_DIFF_OBJ_ATTR */
+    union hwloc_topology_diff_u * next;
+    /* List of attribute differences for a single object */
+    int obj_depth;
+    unsigned obj_index;
+    union hwloc_topology_diff_obj_attr_u diff;
+  } obj_attr;
+
+  /* A difference that is too complex. */
+  struct hwloc_topology_diff_too_complex_s {
+    hwloc_topology_diff_type_t type; /* must be ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX */
+    union hwloc_topology_diff_u * next;
+    /* Where we had to stop computing the diff in the first topology */
+    int obj_depth;
+    unsigned obj_index;
+  } too_complex;
+} * hwloc_topology_diff_t;
+
+
+/** \brief Compute the difference between 2 topologies.
+ *
+ * The difference is stored as a list of ::hwloc_topology_diff_t entries
+ * starting at \p diff.
+ * It is computed by doing a depth-first traversal of both topology trees
+ * simultaneously.
+ *
+ * If the difference between 2 objects is too complex to be represented
+ * (for instance if some objects have different types, or different numbers
+ * of children), a special diff entry of type ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+ * is queued.
+ * The computation of the diff does not continue below these objects.
+ * So each such diff entry means that the difference between two subtrees
+ * could not be computed.
+ *
+ * \return 0 if the difference can be represented properly.
+ *
+ * \return 0 with \p diff pointing to NULL if there is no difference
+ * between the topologies.
+ *
+ * \return 1 if the difference is too complex (see above). Some entries in
+ * the list will be of type ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX.
+ *
+ * \return -1 on any other error.
+ *
+ * \note \p flags is currently not used. It should be 0.
+ *
+ * \note The output diff has to be freed with hwloc_topology_diff_destroy().
+ *
+ * \note The output diff can only be exported to XML or passed to
+ * hwloc_topology_diff_apply() if 0 was returned, i.e. if no entry of type
+ * ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX is listed.
+ *
+ * \note The output diff may be modified by removing some entries from
+ * the list. The removed entries should be freed by passing them to
+ * hwloc_topology_diff_destroy() (possibly as another list).
+*/
+HWLOC_DECLSPEC int hwloc_topology_diff_build(hwloc_topology_t topology, hwloc_topology_t newtopology, unsigned long flags, hwloc_topology_diff_t *diff);
+
+/** \brief Flags to be given to hwloc_topology_diff_apply().
+ */
+enum hwloc_topology_diff_apply_flags_e {
+  /** \brief Apply topology diff in reverse direction.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE = (1UL<<0)
+};
+
+/** \brief Apply a topology diff to an existing topology.
+ *
+ * \p flags is an OR'ed set of ::hwloc_topology_diff_apply_flags_e.
+ *
+ * The new topology is modified in place. hwloc_topology_dup()
+ * may be used to duplicate it before patching.
+ *
+ * If the difference cannot be applied entirely, all previous applied
+ * elements are unapplied before returning.
+ *
+ * \return 0 on success.
+ *
+ * \return -N if applying the difference failed while trying
+ * to apply the N-th part of the difference. For instance -1
+ * is returned if the very first difference element could not
+ * be applied.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_apply(hwloc_topology_t topology, hwloc_topology_diff_t diff, unsigned long flags);
+
+/** \brief Destroy a list of topology differences.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_diff_t diff);
+
+/** \brief Load a list of topology differences from an XML file.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note the pointer returned in refname should later be freed
+ * by the caller.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(const char *xmlpath, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to an XML file.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_diff_t diff, const char *refname, const char *xmlpath);
+
+/** \brief Load a list of topology differences from an XML buffer.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note the pointer returned in refname should later be freed
+ * by the caller.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(const char *xmlbuffer, int buflen, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to an XML buffer.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ *
+ * The returned buffer ends with a \0 that is included in the returned
+ * length.
+ *
+ * \note The XML buffer should later be freed with hwloc_free_xmlbuffer().
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xmlbuffer(hwloc_topology_diff_t diff, const char *refname, char **xmlbuffer, int *buflen);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DIFF_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/distances.h b/src/3rdparty/hwloc/include/hwloc/distances.h
new file mode 100644
index 000000000..d523f29fc
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/distances.h
@@ -0,0 +1,271 @@
+/*
+ * Copyright © 2010-2019 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Object distances.
+ */
+
+#ifndef HWLOC_DISTANCES_H
+#define HWLOC_DISTANCES_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_distances_get Retrieve distances between objects
+ * @{
+ */
+
+/** \brief Matrix of distances between a set of objects.
+ *
+ * This matrix often contains latencies between NUMA nodes
+ * (as reported in the System Locality Distance Information Table (SLIT)
+ * in the ACPI specification), which may or may not be physically accurate.
+ * It corresponds to the latency for accessing the memory of one node
+ * from a core in another node.
+ * The corresponding kind is ::HWLOC_DISTANCES_KIND_FROM_OS | ::HWLOC_DISTANCES_KIND_FROM_USER.
+ *
+ * The matrix may also contain bandwidths between random sets of objects,
+ * possibly provided by the user, as specified in the \p kind attribute.
+ */
+struct hwloc_distances_s {
+  unsigned nbobjs;		/**< \brief Number of objects described by the distance matrix. */
+  hwloc_obj_t *objs;		/**< \brief Array of objects described by the distance matrix.
+				 * These objects are not in any particular order,
+				 * see hwloc_distances_obj_index() and hwloc_distances_obj_pair_values()
+				 * for easy ways to find objects in this array and their corresponding values.
+				 */
+  unsigned long kind;		/**< \brief OR'ed set of ::hwloc_distances_kind_e. */
+  hwloc_uint64_t *values;	/**< \brief Matrix of distances between objects, stored as a one-dimension array.
+				 *
+				 * Distance from i-th to j-th object is stored in slot i*nbobjs+j.
+				 * The meaning of the value depends on the \p kind attribute.
+				 */
+};
+
+/** \brief Kinds of distance matrices.
+ *
+ * The \p kind attribute of struct hwloc_distances_s is an OR'ed set
+ * of kinds.
+ *
+ * A kind of format HWLOC_DISTANCES_KIND_FROM_* specifies where the
+ * distance information comes from, if known.
+ *
+ * A kind of format HWLOC_DISTANCES_KIND_MEANS_* specifies whether
+ * values are latencies or bandwidths, if applicable.
+ */
+enum hwloc_distances_kind_e {
+  /** \brief These distances were obtained from the operating system or hardware.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_FROM_OS = (1UL<<0),
+  /** \brief These distances were provided by the user.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_FROM_USER = (1UL<<1),
+
+  /** \brief Distance values are similar to latencies between objects.
+   * Values are smaller for closer objects, hence minimal on the diagonal
+   * of the matrix (distance between an object and itself).
+   * It could also be the number of network hops between objects, etc.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_MEANS_LATENCY = (1UL<<2),
+  /** \brief Distance values are similar to bandwidths between objects.
+   * Values are higher for closer objects, hence maximal on the diagonal
+   * of the matrix (distance between an object and itself).
+   * Such values are currently ignored for distance-based grouping.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH = (1UL<<3)
+};
+
+/** \brief Retrieve distance matrices.
+ *
+ * Retrieve distance matrices from the topology into the \p distances array.
+ *
+ * \p flags is currently unused, should be \c 0.
+ *
+ * \p kind serves as a filter. If \c 0, all distance matrices are returned.
+ * If it contains some HWLOC_DISTANCES_KIND_FROM_*, only distance matrices
+ * whose kind matches one of these are returned.
+ * If it contains some HWLOC_DISTANCES_KIND_MEANS_*, only distance matrices
+ * whose kind matches one of these are returned.
+ *
+ * On input, \p nr points to the number of distance matrices that may be stored
+ * in \p distances.
+ * On output, \p nr points to the number of distance matrices that were actually
+ * found, even if some of them couldn't be stored in \p distances.
+ * Distance matrices that couldn't be stored are ignored, but the function still
+ * returns success (\c 0). The caller may find out by comparing the value pointed
+ * by \p nr before and after the function call.
+ *
+ * Each distance matrix returned in the \p distances array should be released
+ * by the caller using hwloc_distances_release().
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get(hwloc_topology_t topology,
+		    unsigned *nr, struct hwloc_distances_s **distances,
+		    unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve distance matrices for object at a specific depth in the topology.
+ *
+ * Identical to hwloc_distances_get() with the additional \p depth filter.
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get_by_depth(hwloc_topology_t topology, int depth,
+			     unsigned *nr, struct hwloc_distances_s **distances,
+			     unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve distance matrices for object of a specific type.
+ *
+ * Identical to hwloc_distances_get() with the additional \p type filter.
+ */
+static __hwloc_inline int
+hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type,
+			    unsigned *nr, struct hwloc_distances_s **distances,
+			    unsigned long kind, unsigned long flags)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE) {
+    *nr = 0;
+    return 0;
+  }
+  return hwloc_distances_get_by_depth(topology, depth, nr, distances, kind, flags);
+}
+
+/** \brief Release a distance matrix structure previously returned by hwloc_distances_get(). */
+HWLOC_DECLSPEC void
+hwloc_distances_release(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances_consult Helpers for consulting distance matrices
+ * @{
+ */
+
+/** \brief Find the index of an object in a distances structure.
+ *
+ * \return -1 if object \p obj is not involved in structure \p distances.
+ */
+static __hwloc_inline int
+hwloc_distances_obj_index(struct hwloc_distances_s *distances, hwloc_obj_t obj)
+{
+  unsigned i;
+  for(i=0; i<distances->nbobjs; i++)
+    if (distances->objs[i] == obj)
+      return (int)i;
+  return -1;
+}
+
+/** \brief Find the values between two objects in a distance matrices.
+ *
+ * The distance from \p obj1 to \p obj2 is stored in the value pointed by
+ * \p value1to2 and reciprocally.
+ *
+ * \return -1 if object \p obj1 or \p obj2 is not involved in structure \p distances.
+ */
+static __hwloc_inline int
+hwloc_distances_obj_pair_values(struct hwloc_distances_s *distances,
+				hwloc_obj_t obj1, hwloc_obj_t obj2,
+				hwloc_uint64_t *value1to2, hwloc_uint64_t *value2to1)
+{
+  int i1 = hwloc_distances_obj_index(distances, obj1);
+  int i2 = hwloc_distances_obj_index(distances, obj2);
+  if (i1 < 0 || i2 < 0)
+    return -1;
+  *value1to2 = distances->values[i1 * distances->nbobjs + i2];
+  *value2to1 = distances->values[i2 * distances->nbobjs + i1];
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances_add Add or remove distances between objects
+ * @{
+ */
+
+/** \brief Flags for adding a new distances to a topology. */
+enum hwloc_distances_add_flag_e {
+  /** \brief Try to group objects based on the newly provided distance information.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_ADD_FLAG_GROUP = (1UL<<0),
+  /** \brief If grouping, consider the distance values as inaccurate and relax the
+   * comparisons during the grouping algorithms. The actual accuracy may be modified
+   * through the HWLOC_GROUPING_ACCURACY environment variable (see \ref envvar).
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE = (1UL<<1)
+};
+
+/** \brief Provide a new distance matrix.
+ *
+ * Provide the matrix of distances between a set of objects given by \p nbobjs
+ * and the \p objs array. \p nbobjs must be at least 2.
+ * The distances are stored as a one-dimension array in \p values.
+ * The distance from object i to object j is in slot i*nbobjs+j.
+ *
+ * \p kind specifies the kind of distance as an OR'ed set of ::hwloc_distances_kind_e.
+ *
+ * \p flags configures the behavior of the function using an optional OR'ed set of
+ * ::hwloc_distances_add_flag_e.
+ *
+ * Objects must be of the same type. They cannot be of type Group.
+ */
+HWLOC_DECLSPEC int hwloc_distances_add(hwloc_topology_t topology,
+				       unsigned nbobjs, hwloc_obj_t *objs, hwloc_uint64_t *values,
+				       unsigned long kind, unsigned long flags);
+
+/** \brief Remove all distance matrices from a topology.
+ *
+ * Remove all distance matrices, either provided by the user or
+ * gathered through the OS.
+ *
+ * If these distances were used to group objects, these additional
+ * Group objects are not removed from the topology.
+ */
+HWLOC_DECLSPEC int hwloc_distances_remove(hwloc_topology_t topology);
+
+/** \brief Remove distance matrices for objects at a specific depth in the topology.
+ *
+ * Identical to hwloc_distances_remove() but only applies to one level of the topology.
+ */
+HWLOC_DECLSPEC int hwloc_distances_remove_by_depth(hwloc_topology_t topology, int depth);
+
+/** \brief Remove distance matrices for objects of a specific type in the topology.
+ *
+ * Identical to hwloc_distances_remove() but only applies to one level of the topology.
+ */
+static __hwloc_inline int
+hwloc_distances_remove_by_type(hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return 0;
+  return hwloc_distances_remove_by_depth(topology, depth);
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DISTANCES_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/export.h b/src/3rdparty/hwloc/include/hwloc/export.h
new file mode 100644
index 000000000..b178b77e5
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/export.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Exporting Topologies to XML or to Synthetic strings.
+ */
+
+#ifndef HWLOC_EXPORT_H
+#define HWLOC_EXPORT_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_xmlexport Exporting Topologies to XML
+ * @{
+ */
+
+/** \brief Flags for exporting XML topologies.
+ *
+ * Flags to be given as a OR'ed set to hwloc_topology_export_xml().
+ */
+enum hwloc_topology_export_xml_flags_e {
+ /** \brief Export XML that is loadable by hwloc v1.x.
+  * However, the export may miss some details about the topology.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 = (1UL<<0)
+};
+
+/** \brief Export the topology into an XML file.
+ *
+ * This file may be loaded later through hwloc_topology_set_xml().
+ *
+ * By default, the latest export format is used, which means older hwloc
+ * releases (e.g. v1.x) will not be able to import it.
+ * Exporting to v1.x specific XML format is possible using flag
+ * ::HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 but it may miss some details
+ * about the topology.
+ * If there is any chance that the exported file may ever be imported
+ * back by a process using hwloc 1.x, one should consider detecting
+ * it at runtime and using the corresponding export format.
+ *
+ * \p flags is a OR'ed set of ::hwloc_topology_export_xml_flags_e.
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ *
+ * \note If \p xmlpath is "-", the XML output is sent to the standard output.
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const char *xmlpath, unsigned long flags);
+
+/** \brief Export the topology into a newly-allocated XML memory buffer.
+ *
+ * \p xmlbuffer is allocated by the callee and should be freed with
+ * hwloc_free_xmlbuffer() later in the caller.
+ *
+ * This memory buffer may be loaded later through hwloc_topology_set_xmlbuffer().
+ *
+ * By default, the latest export format is used, which means older hwloc
+ * releases (e.g. v1.x) will not be able to import it.
+ * Exporting to v1.x specific XML format is possible using flag
+ * ::HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 but it may miss some details
+ * about the topology.
+ * If there is any chance that the exported buffer may ever be imported
+ * back by a process using hwloc 1.x, one should consider detecting
+ * it at runtime and using the corresponding export format.
+ *
+ * The returned buffer ends with a \0 that is included in the returned
+ * length.
+ *
+ * \p flags is a OR'ed set of ::hwloc_topology_export_xml_flags_e.
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlbuffer, int *buflen, unsigned long flags);
+
+/** \brief Free a buffer allocated by hwloc_topology_export_xmlbuffer() */
+HWLOC_DECLSPEC void hwloc_free_xmlbuffer(hwloc_topology_t topology, char *xmlbuffer);
+
+/** \brief Set the application-specific callback for exporting object userdata
+ *
+ * The object userdata pointer is not exported to XML by default because hwloc
+ * does not know what it contains.
+ *
+ * This function lets applications set \p export_cb to a callback function
+ * that converts this opaque userdata into an exportable string.
+ *
+ * \p export_cb is invoked during XML export for each object whose
+ * \p userdata pointer is not \c NULL.
+ * The callback should use hwloc_export_obj_userdata() or
+ * hwloc_export_obj_userdata_base64() to actually export
+ * something to XML (possibly multiple times per object).
+ *
+ * \p export_cb may be set to \c NULL if userdata should not be exported to XML.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_export_callback(hwloc_topology_t topology,
+								void (*export_cb)(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj));
+
+/** \brief Export some object userdata to XML
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ * It may be invoked one or multiple times to export some userdata to XML.
+ * The \p buffer content of length \p length is stored with optional name
+ * \p name.
+ *
+ * When importing this XML file, the import() callback (if set) will be
+ * called exactly as many times as hwloc_export_obj_userdata() was called
+ * during export(). It will receive the corresponding \p name, \p buffer
+ * and \p length arguments.
+ *
+ * \p reserved, \p topology and \p obj must be the first three parameters
+ * that were given to the export callback.
+ *
+ * Only printable characters may be exported to XML string attributes.
+ * If a non-printable character is passed in \p name or \p buffer,
+ * the function returns -1 with errno set to EINVAL.
+ *
+ * If exporting binary data, the application should first encode into
+ * printable characters only (or use hwloc_export_obj_userdata_base64()).
+ * It should also take care of portability issues if the export may
+ * be reimported on a different architecture.
+ */
+HWLOC_DECLSPEC int hwloc_export_obj_userdata(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length);
+
+/** \brief Encode and export some object userdata to XML
+ *
+ * This function is similar to hwloc_export_obj_userdata() but it encodes
+ * the input buffer into printable characters before exporting.
+ * On import, decoding is automatically performed before the data is given
+ * to the import() callback if any.
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ *
+ * The function does not take care of portability issues if the export
+ * may be reimported on a different architecture.
+ */
+HWLOC_DECLSPEC int hwloc_export_obj_userdata_base64(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length);
+
+/** \brief Set the application-specific callback for importing userdata
+ *
+ * On XML import, userdata is ignored by default because hwloc does not know
+ * how to store it in memory.
+ *
+ * This function lets applications set \p import_cb to a callback function
+ * that will get the XML-stored userdata and store it in the object as expected
+ * by the application.
+ *
+ * \p import_cb is called during hwloc_topology_load() as many times as
+ * hwloc_export_obj_userdata() was called during export. The topology
+ * is not entirely setup yet. Object attributes are ready to consult,
+ * but links between objects are not.
+ *
+ * \p import_cb may be \c NULL if userdata should be ignored during import.
+ *
+ * \note \p buffer contains \p length characters followed by a null byte ('\0').
+ *
+ * \note This function should be called before hwloc_topology_load().
+ *
+ * \note The topology-specific userdata pointer is ignored when importing from XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_import_callback(hwloc_topology_t topology,
+								void (*import_cb)(hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length));
+
+/** @} */
+
+
+/** \defgroup hwlocality_syntheticexport Exporting Topologies to Synthetic
+ * @{
+ */
+
+/** \brief Flags for exporting synthetic topologies.
+ *
+ * Flags to be given as a OR'ed set to hwloc_topology_export_synthetic().
+ */
+enum hwloc_topology_export_synthetic_flags_e {
+ /** \brief Export extended types such as L2dcache as basic types such as Cache.
+  *
+  * This is required if loading the synthetic description with hwloc < 1.9.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES = (1UL<<0),
+
+ /** \brief Do not export level attributes.
+  *
+  * Ignore level attributes such as memory/cache sizes or PU indexes.
+  * This is required if loading the synthetic description with hwloc < 1.10.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS = (1UL<<1),
+
+ /** \brief Export the memory hierarchy as expected in hwloc 1.x.
+  *
+  * Instead of attaching memory children to levels, export single NUMA node child
+  * as normal intermediate levels, when possible.
+  * This is required if loading the synthetic description with hwloc 1.x.
+  * However this may fail if some objects have multiple local NUMA nodes.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1 = (1UL<<2),
+
+ /** \brief Do not export memory information.
+  *
+  * Only export the actual hierarchy of normal CPU-side objects and ignore
+  * where memory is attached.
+  * This is useful for when the hierarchy of CPUs is what really matters,
+  * but it behaves as if there was a single machine-wide NUMA node.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY = (1UL<<3)
+};
+
+/** \brief Export the topology as a synthetic string.
+ *
+ * At most \p buflen characters will be written in \p buffer,
+ * including the terminating \0.
+ *
+ * This exported string may be given back to hwloc_topology_set_synthetic().
+ *
+ * \p flags is a OR'ed set of ::hwloc_topology_export_synthetic_flags_e.
+ *
+ * \return The number of characters that were written,
+ * not including the terminating \0.
+ *
+ * \return -1 if the topology could not be exported,
+ * for instance if it is not symmetric.
+ *
+ * \note I/O and Misc children are ignored, the synthetic string only
+ * describes normal children.
+ *
+ * \note A 1024-byte buffer should be large enough for exporting
+ * topologies in the vast majority of cases.
+ */
+  HWLOC_DECLSPEC int hwloc_topology_export_synthetic(hwloc_topology_t topology, char *buffer, size_t buflen, unsigned long flags);
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_EXPORT_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/gl.h b/src/3rdparty/hwloc/include/hwloc/gl.h
new file mode 100644
index 000000000..3e643fa9a
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/gl.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright © 2012 Blue Brain Project, EPFL. All rights reserved.
+ * Copyright © 2012-2013 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and OpenGL displays.
+ *
+ * Applications that use both hwloc and OpenGL may want to include
+ * this file so as to get topology information for OpenGL displays.
+ */
+
+#ifndef HWLOC_GL_H
+#define HWLOC_GL_H
+
+#include <hwloc.h>
+
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_gl Interoperability with OpenGL displays
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenGL displays.
+ *
+ * Only the NVIDIA display locality information is currently available,
+ * using the NV-CONTROL X11 extension and the NVCtrl library.
+ *
+ * @{
+ */
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenGL display given by port and device index.
+ *
+ * Return the OS device object describing the OpenGL display
+ * whose port (server) is \p port and device (screen) is \p device.
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object (unless PCI devices are filtered out).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_gl_get_display_osdev_by_port_device(hwloc_topology_t topology,
+					  unsigned port, unsigned device)
+{
+        unsigned x = (unsigned) -1, y = (unsigned) -1;
+        hwloc_obj_t osdev = NULL;
+        while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+                if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
+                    && osdev->name
+                    && sscanf(osdev->name, ":%u.%u", &x, &y) == 2
+                    && port == x && device == y)
+                        return osdev;
+        }
+	errno = EINVAL;
+        return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenGL display given by name.
+ *
+ * Return the OS device object describing the OpenGL display
+ * whose name is \p name, built as ":port.device" such as ":0.0" .
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object (unless PCI devices are filtered out).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_gl_get_display_osdev_by_name(hwloc_topology_t topology,
+				   const char *name)
+{
+        hwloc_obj_t osdev = NULL;
+        while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+                if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
+                    && osdev->name
+                    && !strcmp(name, osdev->name))
+                        return osdev;
+        }
+	errno = EINVAL;
+        return NULL;
+}
+
+/** \brief Get the OpenGL display port and device corresponding
+ * to the given hwloc OS object.
+ *
+ * Return the OpenGL display port (server) in \p port and device (screen)
+ * in \p screen that correspond to the given hwloc OS device object.
+ * Return \c -1 if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ */
+static __hwloc_inline int
+hwloc_gl_get_display_by_osdev(hwloc_topology_t topology __hwloc_attribute_unused,
+			      hwloc_obj_t osdev,
+			      unsigned *port, unsigned *device)
+{
+	unsigned x = -1, y = -1;
+	if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
+	    && sscanf(osdev->name, ":%u.%u", &x, &y) == 2) {
+		*port = x;
+		*device = y;
+		return 0;
+	}
+	errno = EINVAL;
+	return -1;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_GL_H */
+
diff --git a/src/3rdparty/hwloc/include/hwloc/glibc-sched.h b/src/3rdparty/hwloc/include/hwloc/glibc-sched.h
new file mode 100644
index 000000000..1f9ba7cdd
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/glibc-sched.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and glibc scheduling routines.
+ *
+ * Applications that use both hwloc and glibc scheduling routines such as
+ * sched_getaffinity() or pthread_attr_setaffinity_np() may want to include
+ * this file so as to ease conversion between their respective types.
+ */
+
+#ifndef HWLOC_GLIBC_SCHED_H
+#define HWLOC_GLIBC_SCHED_H
+
+#include <hwloc.h>
+#include <hwloc/helper.h>
+#include <assert.h>
+
+#if !defined _GNU_SOURCE || !defined _SCHED_H || (!defined CPU_SETSIZE && !defined sched_priority)
+#error Please make sure to include sched.h before including glibc-sched.h, and define _GNU_SOURCE before any inclusion of sched.h
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef HWLOC_HAVE_CPU_SET
+
+
+/** \defgroup hwlocality_glibc_sched Interoperability with glibc sched affinity
+ *
+ * This interface offers ways to convert between hwloc cpusets and glibc cpusets
+ * such as those manipulated by sched_getaffinity() or pthread_attr_setaffinity_np().
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p toposet into glibc sched affinity CPU set \p schedset
+ *
+ * This function may be used before calling sched_setaffinity or any other function
+ * that takes a cpu_set_t as input parameter.
+ *
+ * \p schedsetsize should be sizeof(cpu_set_t) unless \p schedset was dynamically allocated with CPU_ALLOC
+ */
+static __hwloc_inline int
+hwloc_cpuset_to_glibc_sched_affinity(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t hwlocset,
+				    cpu_set_t *schedset, size_t schedsetsize)
+{
+#ifdef CPU_ZERO_S
+  unsigned cpu;
+  CPU_ZERO_S(schedsetsize, schedset);
+  hwloc_bitmap_foreach_begin(cpu, hwlocset)
+    CPU_SET_S(cpu, schedsetsize, schedset);
+  hwloc_bitmap_foreach_end();
+#else /* !CPU_ZERO_S */
+  unsigned cpu;
+  CPU_ZERO(schedset);
+  assert(schedsetsize == sizeof(cpu_set_t));
+  hwloc_bitmap_foreach_begin(cpu, hwlocset)
+    CPU_SET(cpu, schedset);
+  hwloc_bitmap_foreach_end();
+#endif /* !CPU_ZERO_S */
+  return 0;
+}
+
+/** \brief Convert glibc sched affinity CPU set \p schedset into hwloc CPU set
+ *
+ * This function may be used before calling sched_setaffinity  or any other function
+ * that takes a cpu_set_t  as input parameter.
+ *
+ * \p schedsetsize should be sizeof(cpu_set_t) unless \p schedset was dynamically allocated with CPU_ALLOC
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_glibc_sched_affinity(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_cpuset_t hwlocset,
+                                       const cpu_set_t *schedset, size_t schedsetsize)
+{
+  int cpu;
+#ifdef CPU_ZERO_S
+  int count;
+#endif
+  hwloc_bitmap_zero(hwlocset);
+#ifdef CPU_ZERO_S
+  count = CPU_COUNT_S(schedsetsize, schedset);
+  cpu = 0;
+  while (count) {
+    if (CPU_ISSET_S(cpu, schedsetsize, schedset)) {
+      hwloc_bitmap_set(hwlocset, cpu);
+      count--;
+    }
+    cpu++;
+  }
+#else /* !CPU_ZERO_S */
+  /* sched.h does not support dynamic cpu_set_t (introduced in glibc 2.7),
+   * assume we have a very old interface without CPU_COUNT (added in 2.6)
+   */
+  assert(schedsetsize == sizeof(cpu_set_t));
+  for(cpu=0; cpu<CPU_SETSIZE; cpu++)
+    if (CPU_ISSET(cpu, schedset))
+      hwloc_bitmap_set(hwlocset, cpu);
+#endif /* !CPU_ZERO_S */
+  return 0;
+}
+
+/** @} */
+
+
+#endif /* CPU_SET */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_GLIBC_SCHED_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/helper.h b/src/3rdparty/hwloc/include/hwloc/helper.h
new file mode 100644
index 000000000..d48df15f3
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/helper.h
@@ -0,0 +1,1160 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2019 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief High-level hwloc traversal helpers.
+ */
+
+#ifndef HWLOC_HELPER_H
+#define HWLOC_HELPER_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_helper_find_inside Finding Objects inside a CPU set
+ * @{
+ */
+
+/** \brief Get the first largest object included in the given cpuset \p set.
+ *
+ * \return the first object that is included in \p set and whose parent is not.
+ *
+ * This is convenient for iterating over all largest objects within a CPU set
+ * by doing a loop getting the first largest object and clearing its CPU set
+ * from the remaining CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_first_largest_obj_inside_cpuset(hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  hwloc_obj_t obj = hwloc_get_root_obj(topology);
+  if (!hwloc_bitmap_intersects(obj->cpuset, set))
+    return NULL;
+  while (!hwloc_bitmap_isincluded(obj->cpuset, set)) {
+    /* while the object intersects without being included, look at its children */
+    hwloc_obj_t child = obj->first_child;
+    while (child) {
+      if (hwloc_bitmap_intersects(child->cpuset, set))
+	break;
+      child = child->next_sibling;
+    }
+    if (!child)
+      /* no child intersects, return their father */
+      return obj;
+    /* found one intersecting child, look at its children */
+    obj = child;
+  }
+  /* obj is included, return it */
+  return obj;
+}
+
+/** \brief Get the set of largest objects covering exactly a given cpuset \p set
+ *
+ * \return the number of objects returned in \p objs.
+ */
+HWLOC_DECLSPEC int hwloc_get_largest_objs_inside_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+						 hwloc_obj_t * __hwloc_restrict objs, int max);
+
+/** \brief Return the next object at depth \p depth included in CPU set \p set.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth
+ * included in \p set.  The next invocation should pass the previous
+ * return value in \p prev so as to obtain the next object in \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					   int depth, hwloc_obj_t prev)
+{
+  hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+  if (!next)
+    return NULL;
+  while (next && (hwloc_bitmap_iszero(next->cpuset) || !hwloc_bitmap_isincluded(next->cpuset, set)))
+    next = next->next_cousin;
+  return next;
+}
+
+/** \brief Return the next object of type \p type included in CPU set \p set.
+ *
+ * If there are multiple or no depth for given type, return \c NULL
+ * and let the caller fallback to
+ * hwloc_get_next_obj_inside_cpuset_by_depth().
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					  hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_next_obj_inside_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** \brief Return the (logically) \p idx -th object at depth \p depth included in CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				      int depth, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				      int depth, unsigned idx)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+  unsigned count = 0;
+  if (!obj)
+    return NULL;
+  while (obj) {
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set)) {
+      if (count == idx)
+	return obj;
+      count++;
+    }
+    obj = obj->next_cousin;
+  }
+  return NULL;
+}
+
+/** \brief Return the \p idx -th object of type \p type included in CPU set \p set.
+ *
+ * If there are multiple or no depth for given type, return \c NULL
+ * and let the caller fallback to
+ * hwloc_get_obj_inside_cpuset_by_depth().
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				     hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				     hwloc_obj_type_t type, unsigned idx)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_obj_inside_cpuset_by_depth(topology, set, depth, idx);
+}
+
+/** \brief Return the number of objects at depth \p depth included in CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					 int depth) __hwloc_attribute_pure;
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					 int depth)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+  unsigned count = 0;
+  if (!obj)
+    return 0;
+  while (obj) {
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set))
+      count++;
+    obj = obj->next_cousin;
+  }
+  return count;
+}
+
+/** \brief Return the number of objects of type \p type included in CPU set \p set.
+ *
+ * If no object for that type exists inside CPU set \p set, 0 is
+ * returned.  If there are several levels with objects of that type
+ * inside CPU set \p set, -1 is returned.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					hwloc_obj_type_t type) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return 0;
+  if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return -1; /* FIXME: aggregate nbobjs from different levels? */
+  return (int) hwloc_get_nbobjs_inside_cpuset_by_depth(topology, set, depth);
+}
+
+/** \brief Return the logical index among the objects included in CPU set \p set.
+ *
+ * Consult all objects in the same level as \p obj and inside CPU set \p set
+ * in the logical order, and return the index of \p obj within them.
+ * If \p set covers the entire topology, this is the logical index of \p obj.
+ * Otherwise, this is similar to a logical index within the part of the topology
+ * defined by CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if obj does not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				   hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				   hwloc_obj_t obj)
+{
+  int idx = 0;
+  if (!hwloc_bitmap_isincluded(obj->cpuset, set))
+    return -1;
+  /* count how many objects are inside the cpuset on the way from us to the beginning of the level */
+  while ((obj = obj->prev_cousin) != NULL)
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set))
+      idx++;
+  return idx;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_covering Finding Objects covering at least CPU set
+ * @{
+ */
+
+/** \brief Get the child covering at least CPU set \p set.
+ *
+ * \return \c NULL if no child matches or if \p set is empty.
+ *
+ * \note This function cannot work if parent does not have a CPU set (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				hwloc_obj_t parent) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				hwloc_obj_t parent)
+{
+  hwloc_obj_t child;
+  if (hwloc_bitmap_iszero(set))
+    return NULL;
+  child = parent->first_child;
+  while (child) {
+    if (child->cpuset && hwloc_bitmap_isincluded(set, child->cpuset))
+      return child;
+    child = child->next_sibling;
+  }
+  return NULL;
+}
+
+/** \brief Get the lowest object covering at least CPU set \p set
+ *
+ * \return \c NULL if no object matches or if \p set is empty.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  struct hwloc_obj *current = hwloc_get_root_obj(topology);
+  if (hwloc_bitmap_iszero(set) || !hwloc_bitmap_isincluded(set, current->cpuset))
+    return NULL;
+  while (1) {
+    hwloc_obj_t child = hwloc_get_child_covering_cpuset(topology, set, current);
+    if (!child)
+      return current;
+    current = child;
+  }
+}
+
+/** \brief Iterate through same-depth objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object at depth \p
+ * depth covering at least part of CPU set \p set.  The next
+ * invocation should pass the previous return value in \p prev so as
+ * to obtain the next object covering at least another part of \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_depth(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					    int depth, hwloc_obj_t prev)
+{
+  hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+  if (!next)
+    return NULL;
+  while (next && !hwloc_bitmap_intersects(set, next->cpuset))
+    next = next->next_cousin;
+  return next;
+}
+
+/** \brief Iterate through same-type objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object of type \p
+ * type covering at least part of CPU set \p set.  The next invocation
+ * should pass the previous return value in \p prev so as to obtain
+ * the next object of type \p type covering at least another part of
+ * \p set.
+ *
+ * If there are no or multiple depths for type \p type, \c NULL is returned.
+ * The caller may fallback to hwloc_get_next_obj_covering_cpuset_by_depth()
+ * for each depth.
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_type(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					   hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  /* resolve the type to a single depth; bail out when unknown or ambiguous */
+  int depth = hwloc_get_type_depth(topology, type);
+  switch (depth) {
+  case HWLOC_TYPE_DEPTH_UNKNOWN:
+  case HWLOC_TYPE_DEPTH_MULTIPLE:
+    return NULL;
+  default:
+    return hwloc_get_next_obj_covering_cpuset_by_depth(topology, set, depth, prev);
+  }
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_ancestors Looking at Ancestor and Child Objects
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Returns the ancestor object of \p obj at depth \p depth.
+ *
+ * \note \p depth should not be the depth of PU or NUMA objects
+ * since they are ancestors of no objects (except Misc or I/O).
+ * This function rather expects an intermediate level depth,
+ * such as the depth of Packages, Cores, or Caches.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, int depth, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, int depth, hwloc_obj_t obj)
+{
+  hwloc_obj_t walker;
+  /* an ancestor cannot be deeper in the tree than \p obj itself */
+  if (obj->depth < depth)
+    return NULL;
+  /* climb towards the root until the requested depth is reached or passed */
+  for (walker = obj; walker && walker->depth > depth; walker = walker->parent)
+    ;
+  return walker;
+}
+
+/** \brief Returns the ancestor object of \p obj with type \p type.
+ *
+ * \note \p type should not be ::HWLOC_OBJ_PU or ::HWLOC_OBJ_NUMANODE
+ * since these objects are ancestors of no objects (except Misc or I/O).
+ * This function rather expects an intermediate object type,
+ * such as ::HWLOC_OBJ_PACKAGE, ::HWLOC_OBJ_CORE, etc.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj)
+{
+  /* climb the parent chain until an object of the requested type is found */
+  hwloc_obj_t walker;
+  for (walker = obj->parent; walker; walker = walker->parent)
+    if (walker->type == type)
+      return walker;
+  return NULL;
+}
+
+/** \brief Returns the common parent object to objects \p obj1 and \p obj2 */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  /* the loop isn't so easy since intermediate ancestors may have
+   * different depth, causing us to alternate between using obj1->parent
+   * and obj2->parent. Also, even if at some point we find ancestors
+   * of the same depth, their ancestors may have different depth again.
+   */
+  while (obj1 != obj2) {
+    /* bring the deeper of the two objects up to the other's depth */
+    while (obj1->depth > obj2->depth)
+      obj1 = obj1->parent;
+    while (obj2->depth > obj1->depth)
+      obj2 = obj2->parent;
+    /* same depth but still distinct: step both up one level and retry */
+    if (obj1 != obj2 && obj1->depth == obj2->depth) {
+      obj1 = obj1->parent;
+      obj2 = obj2->parent;
+    }
+  }
+  return obj1;
+}
+
+/** \brief Returns true if \p obj is inside the subtree beginning with ancestor object \p subtree_root.
+ *
+ * \note This function cannot work if \p obj and \p subtree_root objects do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root)
+{
+  /* membership is tested via cpuset inclusion; objects without a cpuset never match */
+  if (!obj->cpuset || !subtree_root->cpuset)
+    return 0;
+  return hwloc_bitmap_isincluded(obj->cpuset, subtree_root->cpuset);
+}
+
+/** \brief Return the next child.
+ *
+ * Return the next child among the normal children list,
+ * then among the memory children list, then among the I/O
+ * children list, then among the Misc children list.
+ *
+ * If \p prev is \c NULL, return the first child.
+ *
+ * Return \c NULL when there is no next child.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t parent, hwloc_obj_t prev)
+{
+  hwloc_obj_t obj;
+  /* state tracks which children list we are currently walking:
+   * 0 = normal, 1 = memory, 2 = I/O, 3 = Misc */
+  int state = 0;
+  if (prev) {
+    /* resume in the list that contained prev, inferred from prev's type */
+    if (prev->type == HWLOC_OBJ_MISC)
+      state = 3;
+    else if (prev->type == HWLOC_OBJ_BRIDGE || prev->type == HWLOC_OBJ_PCI_DEVICE || prev->type == HWLOC_OBJ_OS_DEVICE)
+      state = 2;
+    else if (prev->type == HWLOC_OBJ_NUMANODE)
+      state = 1;
+    obj = prev->next_sibling;
+  } else {
+    obj = parent->first_child;
+  }
+  /* when the current list is exhausted, fall through to the next list in order */
+  if (!obj && state == 0) {
+    obj = parent->memory_first_child;
+    state = 1;
+  }
+  if (!obj && state == 1) {
+    obj = parent->io_first_child;
+    state = 2;
+  }
+  if (!obj && state == 2) {
+    obj = parent->misc_first_child;
+    state = 3;
+  }
+  return obj;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_types Kinds of object Type
+ * @{
+ *
+ * Each object type is
+ * either Normal (i.e. hwloc_obj_type_is_normal() returns 1),
+ * or Memory (i.e. hwloc_obj_type_is_memory() returns 1)
+ * or I/O (i.e. hwloc_obj_type_is_io() returns 1)
+ * or Misc (i.e. equal to ::HWLOC_OBJ_MISC).
+ * It cannot be of more than one of these kinds.
+ */
+
+/** \brief Check whether an object type is Normal.
+ *
+ * Normal objects are objects of the main CPU hierarchy
+ * (Machine, Package, Core, PU, CPU caches, etc.),
+ * but they are not NUMA nodes, I/O devices or Misc objects.
+ *
+ * They are attached to parent as Normal children,
+ * not as Memory, I/O or Misc children.
+ *
+ * \return 1 if an object of type \p type is a Normal object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_normal(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is I/O.
+ *
+ * I/O objects are objects attached to their parents
+ * in the I/O children list.
+ * This currently includes Bridges, PCI and OS devices.
+ *
+ * \return 1 if an object of type \p type is a I/O object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_io(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is Memory.
+ *
+ * Memory objects are objects attached to their parents
+ * in the Memory children list.
+ * This currently only includes NUMA nodes.
+ *
+ * \return 1 if an object of type \p type is a Memory object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_memory(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a Cache (Data, Unified or Instruction).
+ *
+ * \return 1 if an object of type \p type is a Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_cache(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a Data or Unified Cache.
+ *
+ * \return 1 if an object of type \p type is a Data or Unified Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_dcache(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is an Instruction Cache.
+ *
+ * \return 1 if an object of type \p type is an Instruction Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_icache(hwloc_obj_type_t type);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_cache Looking at Cache Objects
+ * @{
+ */
+
+/** \brief Find the depth of cache objects matching cache level and type.
+ *
+ * Return the depth of the topology level that contains cache objects
+ * whose attributes match \p cachelevel and \p cachetype.
+ *
+ * This function is identical to calling hwloc_get_type_depth() with the
+ * corresponding type such as ::HWLOC_OBJ_L1ICACHE, except that it may
+ * also return a Unified cache when looking for an instruction cache.
+ *
+ * If no cache level matches, ::HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ *
+ * If \p cachetype is ::HWLOC_OBJ_CACHE_UNIFIED, the depth of the
+ * unique matching unified cache level is returned.
+ *
+ * If \p cachetype is ::HWLOC_OBJ_CACHE_DATA or ::HWLOC_OBJ_CACHE_INSTRUCTION,
+ * either a matching cache, or a unified cache is returned.
+ *
+ * If \p cachetype is \c -1, it is ignored and multiple levels may
+ * match. The function returns either the depth of a uniquely matching
+ * level or ::HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_cache_type_depth (hwloc_topology_t topology,
+			    unsigned cachelevel, hwloc_obj_cache_type_t cachetype)
+{
+  int depth;
+  int found = HWLOC_TYPE_DEPTH_UNKNOWN;  /* best match so far, only used when cachetype == -1 */
+  /* scan every level from the root down; stop at the first depth with no object */
+  for (depth=0; ; depth++) {
+    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, 0);
+    if (!obj)
+      break;
+    if (!hwloc_obj_type_is_dcache(obj->type) || obj->attr->cache.depth != cachelevel)
+      /* doesn't match, try next depth */
+      continue;
+    if (cachetype == (hwloc_obj_cache_type_t) -1) {
+      if (found != HWLOC_TYPE_DEPTH_UNKNOWN) {
+	/* second match, return MULTIPLE */
+        return HWLOC_TYPE_DEPTH_MULTIPLE;
+      }
+      /* first match, mark it as found */
+      found = depth;
+      continue;
+    }
+    if (obj->attr->cache.type == cachetype || obj->attr->cache.type == HWLOC_OBJ_CACHE_UNIFIED)
+      /* exact match (either unified is alone, or we match instruction or data), return immediately */
+      return depth;
+  }
+  /* went to the bottom, return what we found */
+  return found;
+}
+
+/** \brief Get the first data (or unified) cache covering a cpuset \p set
+ *
+ * \return \c NULL if no cache matches.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  /* start at the lowest object covering \p set, then climb until a data/unified cache is found */
+  hwloc_obj_t obj;
+  for (obj = hwloc_get_obj_covering_cpuset(topology, set); obj; obj = obj->parent)
+    if (hwloc_obj_type_is_dcache(obj->type))
+      return obj;
+  return NULL;
+}
+
+/** \brief Get the first data (or unified) cache shared between an object and somebody else.
+ *
+ * \return \c NULL if no cache matches or if an invalid object is given.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  hwloc_obj_t current = obj->parent;
+  /* objects without a cpuset (I/O, Misc) cannot be compared against caches */
+  if (!obj->cpuset)
+    return NULL;
+  while (current) {
+    /* a cache whose cpuset equals obj's covers obj alone, hence is not shared */
+    if (!hwloc_bitmap_isequal(current->cpuset, obj->cpuset)
+        && hwloc_obj_type_is_dcache(current->type))
+      return current;
+    current = current->parent;
+  }
+  return NULL;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_misc Finding objects, miscellaneous helpers
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Returns the object of type ::HWLOC_OBJ_PU with \p os_index.
+ *
+ * This function is useful for converting a CPU set into the PU
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_cpubind()),
+ * one may iterate over the bits of the resulting CPU set with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding PUs
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+  /* linear scan of the PU level for a matching OS index */
+  hwloc_obj_t pu;
+  for (pu = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, NULL);
+       pu;
+       pu = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, pu))
+    if (pu->os_index == os_index)
+      return pu;
+  return NULL;
+}
+
+/** \brief Returns the object of type ::HWLOC_OBJ_NUMANODE with \p os_index.
+ *
+ * This function is useful for converting a nodeset into the NUMA node
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_membind() with HWLOC_MEMBIND_BYNODESET),
+ * one may iterate over the bits of the resulting nodeset with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding NUMA nodes
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+  /* linear scan of the NUMA node level for a matching OS index */
+  hwloc_obj_t node;
+  for (node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, NULL);
+       node;
+       node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, node))
+    if (node->os_index == os_index)
+      return node;
+  return NULL;
+}
+
+/** \brief Do a depth-first traversal of the topology to find and sort
+ *
+ * all objects that are at the same depth as \p src.
+ * Report in \p objs up to \p max physically closest ones to \p src.
+ *
+ * \return the number of objects returned in \p objs.
+ *
+ * \return 0 if \p src is an I/O object.
+ *
+ * \note This function requires the \p src object to have a CPU set.
+ */
+/* TODO: rather provide an iterator? Provide a way to know how much should be allocated? By returning the total number of objects instead? */
+HWLOC_DECLSPEC unsigned hwloc_get_closest_objs (hwloc_topology_t topology, hwloc_obj_t src, hwloc_obj_t * __hwloc_restrict objs, unsigned max);
+
+/** \brief Find an object below another object, both specified by types and indexes.
+ *
+ * Start from the top system object and find object of type \p type1
+ * and logical index \p idx1.  Then look below this object and find another
+ * object of type \p type2 and logical index \p idx2.  Indexes are specified
+ * within the parent, not within the entire system.
+ *
+ * For instance, if type1 is PACKAGE, idx1 is 2, type2 is CORE and idx2
+ * is 3, return the fourth core object below the third package.
+ *
+ * \note This function requires these objects to have a CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_by_type (hwloc_topology_t topology,
+			     hwloc_obj_type_t type1, unsigned idx1,
+			     hwloc_obj_type_t type2, unsigned idx2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_by_type (hwloc_topology_t topology,
+			     hwloc_obj_type_t type1, unsigned idx1,
+			     hwloc_obj_type_t type2, unsigned idx2)
+{
+  /* locate the first-level object, then search the second type within its cpuset */
+  hwloc_obj_t outer = hwloc_get_obj_by_type (topology, type1, idx1);
+  if (!outer)
+    return NULL;
+  return hwloc_get_obj_inside_cpuset_by_type(topology, outer->cpuset, type2, idx2);
+}
+
+/** \brief Find an object below a chain of objects specified by types and indexes.
+ *
+ * This is a generalized version of hwloc_get_obj_below_by_type().
+ *
+ * Arrays \p typev and \p idxv must contain \p nr types and indexes.
+ *
+ * Start from the top system object and walk the arrays \p typev and \p idxv.
+ * For each type and logical index couple in the arrays, look under the previously found
+ * object to find the index-th object of the given type.
+ * Indexes are specified within the parent, not within the entire system.
+ *
+ * For instance, if nr is 3, typev contains NODE, PACKAGE and CORE,
+ * and idxv contains 0, 1 and 2, return the third core object below
+ * the second package below the first NUMA node.
+ *
+ * \note This function requires all these objects and the root object
+ * to have a CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv)
+{
+  /* walk down the chain of (type, index) couples, narrowing into each found object's cpuset;
+   * stop early (returning NULL) as soon as one lookup fails */
+  hwloc_obj_t obj = hwloc_get_root_obj(topology);
+  int i;
+  for (i = 0; i < nr && obj; i++)
+    obj = hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, typev[i], idxv[i]);
+  return obj;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_distribute Distributing items over a topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_distrib().
+ */
+enum hwloc_distrib_flags_e {
+  /** \brief Distrib in reverse order, starting from the last objects.
+   * \hideinitializer
+   */
+  HWLOC_DISTRIB_FLAG_REVERSE = (1UL<<0)
+  /* flags are single bits so they can be OR'ed into hwloc_distrib()'s flags argument */
+};
+
+/** \brief Distribute \p n items over the topology under \p roots
+ *
+ * Array \p set will be filled with \p n cpusets recursively distributed
+ * linearly over the topology under objects \p roots, down to depth \p until
+ * (which can be INT_MAX to distribute down to the finest level).
+ *
+ * \p n_roots is usually 1 and \p roots only contains the topology root object
+ * so as to distribute over the entire topology.
+ *
+ * This is typically useful when an application wants to distribute \p n
+ * threads over a machine, giving each of them as much private cache as
+ * possible and keeping them locally in number order.
+ *
+ * The caller may typically want to also call hwloc_bitmap_singlify()
+ * before binding a thread so that it does not move at all.
+ *
+ * \p flags should be 0 or a OR'ed set of ::hwloc_distrib_flags_e.
+ *
+ * \note This function requires the \p roots objects to have a CPU set.
+ *
+ * \note This function replaces the now deprecated hwloc_distribute()
+ * and hwloc_distributev() functions.
+ */
+static __hwloc_inline int
+hwloc_distrib(hwloc_topology_t topology,
+	      hwloc_obj_t *roots, unsigned n_roots,
+	      hwloc_cpuset_t *set,
+	      unsigned n,
+	      int until, unsigned long flags)
+{
+  unsigned i;
+  unsigned tot_weight;
+  unsigned given, givenweight;
+  hwloc_cpuset_t *cpusetp = set;
+
+  /* reject any flag other than REVERSE */
+  if (flags & ~HWLOC_DISTRIB_FLAG_REVERSE) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* total bitmap weight below all roots, used to size each root's proportional share */
+  tot_weight = 0;
+  for (i = 0; i < n_roots; i++)
+    tot_weight += (unsigned) hwloc_bitmap_weight(roots[i]->cpuset);
+
+  for (i = 0, given = 0, givenweight = 0; i < n_roots; i++) {
+    unsigned chunk, weight;
+    hwloc_obj_t root = roots[flags & HWLOC_DISTRIB_FLAG_REVERSE ? n_roots-1-i : i];
+    hwloc_cpuset_t cpuset = root->cpuset;
+    if (root->type == HWLOC_OBJ_NUMANODE)
+      /* NUMANodes have same cpuset as their parent, but we need normal objects below */
+      root = root->parent;
+    weight = (unsigned) hwloc_bitmap_weight(cpuset);
+    if (!weight)
+      continue;
+    /* Give to root a chunk proportional to its weight.
+     * If previous chunks got rounded-up, we may get a bit less. */
+    chunk = (( (givenweight+weight) * n  + tot_weight-1) / tot_weight)
+          - ((  givenweight         * n  + tot_weight-1) / tot_weight);
+    if (!root->arity || chunk <= 1 || root->depth >= until) {
+      /* We can't split any more, put everything there.  */
+      if (chunk) {
+	/* Fill cpusets with ours */
+	unsigned j;
+	for (j=0; j < chunk; j++)
+	  cpusetp[j] = hwloc_bitmap_dup(cpuset);
+      } else {
+	/* We got no chunk, just merge our cpuset to a previous one
+	 * (the first chunk cannot be empty)
+	 * so that this root doesn't get ignored.
+	 */
+	assert(given);
+	hwloc_bitmap_or(cpusetp[-1], cpusetp[-1], cpuset);
+      }
+    } else {
+      /* Still more to distribute, recurse into children */
+      hwloc_distrib(topology, root->children, root->arity, cpusetp, chunk, until, flags);
+    }
+    /* advance the output window past what this root consumed */
+    cpusetp += chunk;
+    given += chunk;
+    givenweight += weight;
+  }
+
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_topology_sets CPU and node sets of entire topologies
+ * @{
+ */
+
+/** \brief Get complete CPU set
+ *
+ * \return the complete CPU set of logical processors of the system.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object complete CPU-set.
+ */
+HWLOC_DECLSPEC hwloc_const_cpuset_t
+hwloc_topology_get_complete_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Get topology CPU set
+ *
+ * \return the CPU set of logical processors of the system for which hwloc
+ * provides topology information. This is equivalent to the cpuset of the
+ * system object.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object CPU-set.
+ */
+HWLOC_DECLSPEC hwloc_const_cpuset_t
+hwloc_topology_get_topology_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Get allowed CPU set
+ *
+ * \return the CPU set of allowed logical processors of the system.
+ *
+ * \note If the topology flag ::HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM was not set,
+ * this is identical to hwloc_topology_get_topology_cpuset(), which means
+ * all PUs are allowed.
+ *
+ * \note If ::HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM was set, applying
+ * hwloc_bitmap_intersects() on the result of this function and on an object
+ * cpuset checks whether there are allowed PUs inside that object.
+ * Applying hwloc_bitmap_and() returns the list of these allowed PUs.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed, hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+HWLOC_DECLSPEC hwloc_const_cpuset_t
+hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Get complete node set
+ *
+ * \return the complete node set of memory of the system.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object complete node-set.
+ */
+HWLOC_DECLSPEC hwloc_const_nodeset_t
+hwloc_topology_get_complete_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Get topology node set
+ *
+ * \return the node set of memory of the system for which hwloc
+ * provides topology information. This is equivalent to the nodeset of the
+ * system object.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object node-set.
+ */
+HWLOC_DECLSPEC hwloc_const_nodeset_t
+hwloc_topology_get_topology_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Get allowed node set
+ *
+ * \return the node set of allowed memory of the system.
+ *
+ * \note If the topology flag ::HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM was not set,
+ * this is identical to hwloc_topology_get_topology_nodeset(), which means
+ * all NUMA nodes are allowed.
+ *
+ * \note If ::HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM was set, applying
+ * hwloc_bitmap_intersects() on the result of this function and on an object
+ * nodeset checks whether there are allowed NUMA nodes inside that object.
+ * Applying hwloc_bitmap_and() returns the list of these allowed NUMA nodes.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed, hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+HWLOC_DECLSPEC hwloc_const_nodeset_t
+hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_nodeset_convert Converting between CPU sets and node sets
+ *
+ * @{
+ */
+
+/** \brief Convert a CPU set into a NUMA node set and handle non-NUMA cases
+ *
+ * If some NUMA nodes have no CPUs at all, this function never sets their
+ * indexes in the output node set, even if a full CPU set is given in input.
+ *
+ * If the topology contains no NUMA nodes, the machine is considered
+ * as a single memory node, and the following behavior is used:
+ * If \p cpuset is empty, \p nodeset will be emptied as well.
+ * Otherwise \p nodeset will be entirely filled.
+ */
+static __hwloc_inline int
+hwloc_cpuset_to_nodeset(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
+{
+	/* NUMA nodes all live at a single depth in the topology */
+	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t node;
+	assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+	hwloc_bitmap_zero(nodeset);
+	/* set the index of every NUMA node whose cpuset intersects \p _cpuset */
+	for (node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, depth, NULL);
+	     node;
+	     node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, depth, node))
+		if (hwloc_bitmap_set(nodeset, node->os_index) < 0)
+			return -1;
+	return 0;
+}
+
+/** \brief Convert a NUMA node set into a CPU set and handle non-NUMA cases
+ *
+ * If the topology contains no NUMA nodes, the machine is considered
+ * as a single memory node, and the following behavior is used:
+ * If \p nodeset is empty, \p cpuset will be emptied as well.
+ * Otherwise \p cpuset will be entirely filled.
+ * This is useful for manipulating memory binding sets.
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_nodeset(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
+{
+	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t node = NULL;
+	assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+	hwloc_bitmap_zero(_cpuset);
+	/* accumulate the cpusets of all NUMA nodes whose index is set in \p nodeset */
+	while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL) {
+		if (!hwloc_bitmap_isset(nodeset, node->os_index))
+			continue;
+		/* no need to check node->cpuset because objects in levels always have a cpuset */
+		if (hwloc_bitmap_or(_cpuset, _cpuset, node->cpuset) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_advanced_io Finding I/O objects
+ * @{
+ */
+
+/** \brief Get the first non-I/O ancestor object.
+ *
+ * Given the I/O object \p ioobj, find the smallest non-I/O ancestor
+ * object. This object (normal or memory) may then be used for binding
+ * because it has non-NULL CPU and node sets
+ * and because its locality is the same as \p ioobj.
+ *
+ * \note The resulting object is usually a normal object but it could also
+ * be a memory object (e.g. NUMA node) in future platforms if I/O objects
+ * ever get attached to memory instead of CPUs.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_non_io_ancestor_obj(hwloc_topology_t topology __hwloc_attribute_unused,
+			      hwloc_obj_t ioobj)
+{
+  /* climb the parent chain until an object carrying a cpuset is reached */
+  hwloc_obj_t walker;
+  for (walker = ioobj; walker != NULL && walker->cpuset == NULL; walker = walker->parent)
+    ;
+  return walker;
+}
+
+/** \brief Get the next PCI device in the system.
+ *
+ * \return the first PCI device if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  /* thin wrapper over the generic by-type iterator */
+  return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PCI_DEVICE, prev);
+}
+
+/** \brief Find the PCI device object matching the PCI bus id
+ * given domain, bus device and function PCI bus id.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pcidev_by_busid(hwloc_topology_t topology,
+			  unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+  /* linear scan of all PCI devices for an exact bus-id match */
+  hwloc_obj_t pci = NULL;
+  while ((pci = hwloc_get_next_pcidev(topology, pci)) != NULL) {
+    if (pci->attr->pcidev.domain != domain || pci->attr->pcidev.bus != bus)
+      continue;
+    if (pci->attr->pcidev.dev == dev && pci->attr->pcidev.func == func)
+      return pci;
+  }
+  return NULL;
+}
+
+/** \brief Find the PCI device object matching the PCI bus id
+ * given as a string xxxx:yy:zz.t or yy:zz.t.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pcidev_by_busidstring(hwloc_topology_t topology, const char *busid)
+{
+  unsigned domain = 0; /* default when the domain-less form is used */
+  unsigned bus, dev, func;
+
+  /* try the domain-less "yy:zz.t" form first, then the full "xxxx:yy:zz.t" form */
+  if (sscanf(busid, "%x:%x.%x", &bus, &dev, &func) != 3
+      && sscanf(busid, "%x:%x:%x.%x", &domain, &bus, &dev, &func) != 4) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, func);
+}
+
+/** \brief Get the next OS device in the system.
+ *
+ * \return the first OS device if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_osdev(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  /* thin wrapper over the generic by-type iterator */
+  return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_OS_DEVICE, prev);
+}
+
+/** \brief Get the next bridge in the system.
+ *
+ * \return the first bridge if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_bridge(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  /* thin wrapper over the generic by-type iterator */
+  return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_BRIDGE, prev);
+}
+
+/** \brief Checks whether a given bridge covers a given PCI bus.
+ */
+static __hwloc_inline int
+hwloc_bridge_covers_pcibus(hwloc_obj_t bridge,
+			   unsigned domain, unsigned bus)
+{
+  /* only bridges whose downstream side is PCI can cover a bus */
+  if (bridge->type != HWLOC_OBJ_BRIDGE
+      || bridge->attr->bridge.downstream_type != HWLOC_OBJ_BRIDGE_PCI)
+    return 0;
+  /* the bus must fall in the [secondary, subordinate] window of the right domain */
+  return bridge->attr->bridge.downstream.pci.domain == domain
+    && bridge->attr->bridge.downstream.pci.secondary_bus <= bus
+    && bus <= bridge->attr->bridge.downstream.pci.subordinate_bus;
+}
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_HELPER_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/inlines.h b/src/3rdparty/hwloc/include/hwloc/inlines.h
new file mode 100644
index 000000000..494209ea6
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/inlines.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/**
+ * This file contains the inline code of functions declared in hwloc.h
+ */
+
+#ifndef HWLOC_INLINES_H
+#define HWLOC_INLINES_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+
+  if (depth != HWLOC_TYPE_DEPTH_UNKNOWN)
+    return depth;
+
+  /* find the highest existing level with type order >= */
+  for(depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); ; depth--)
+    if (hwloc_compare_types(hwloc_get_depth_type(topology, depth), type) < 0)
+      return depth+1;
+
+  /* Shouldn't ever happen, as there is always a Machine level with lower order and known depth.  */
+  /* abort(); */
+}
+
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+
+  if (depth != HWLOC_TYPE_DEPTH_UNKNOWN)
+    return depth;
+
+  /* find the lowest existing level with type order <= */
+  for(depth = 0; ; depth++)
+    if (hwloc_compare_types(hwloc_get_depth_type(topology, depth), type) > 0)
+      return depth-1;
+
+  /* Shouldn't ever happen, as there is always a PU level with higher order and known depth.  */
+  /* abort(); */
+}
+
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return 0;
+  if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return -1; /* FIXME: aggregate nbobjs from different levels? */
+  return (int) hwloc_get_nbobjs_by_depth(topology, depth);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return NULL;
+  if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_obj_by_depth(topology, depth, idx);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, int depth, hwloc_obj_t prev)
+{
+  if (!prev)
+    return hwloc_get_obj_by_depth (topology, depth, 0);
+  if (prev->depth != depth)
+    return NULL;
+  return prev->next_cousin;
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
+			    hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_next_obj_by_depth (topology, depth, prev);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_root_obj (hwloc_topology_t topology)
+{
+  return hwloc_get_obj_by_depth (topology, 0, 0);
+}
+
+static __hwloc_inline const char *
+hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name)
+{
+  unsigned i;
+  for(i=0; i<obj->infos_count; i++) {
+    struct hwloc_info_s *info = &obj->infos[i];
+    if (!strcmp(info->name, name))
+      return info->value;
+  }
+  return NULL;
+}
+
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  void *p = hwloc_alloc_membind(topology, len, set, policy, flags);
+  if (p)
+    return p;
+
+  if (hwloc_set_membind(topology, set, policy, flags) < 0)
+    /* hwloc_set_membind() takes care of ignoring errors if non-STRICT */
+    return NULL;
+
+  p = hwloc_alloc(topology, len);
+  if (p && policy != HWLOC_MEMBIND_FIRSTTOUCH)
+    /* Enforce the binding by touching the data */
+    memset(p, 0, len);
+  return p;
+}
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_INLINES_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/intel-mic.h b/src/3rdparty/hwloc/include/hwloc/intel-mic.h
new file mode 100644
index 000000000..6f6f9d1b3
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/intel-mic.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright © 2013-2016 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Intel Xeon Phi (MIC).
+ *
+ * Applications that use both hwloc and Intel Xeon Phi (MIC) may want to
+ * include this file so as to get topology information for MIC devices.
+ */
+
+#ifndef HWLOC_INTEL_MIC_H
+#define HWLOC_INTEL_MIC_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#include <dirent.h>
+#include <string.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_intel_mic Interoperability with Intel Xeon Phi (MIC)
+ *
+ * This interface offers ways to retrieve topology information about
+ * Intel Xeon Phi (MIC) devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to MIC device whose index is \p idx.
+ *
+ * Return the CPU set describing the locality of the MIC device whose index is \p idx.
+ *
+ * Topology \p topology and device index \p idx must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_intel_mic_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_intel_mic_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+				  int idx __hwloc_attribute_unused,
+				  hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+	/* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_INTEL_MIC_DEVICE_SYSFS_PATH_MAX 128
+	char path[HWLOC_INTEL_MIC_DEVICE_SYSFS_PATH_MAX];
+	DIR *sysdir = NULL;
+	struct dirent *dirent;
+	unsigned pcibus, pcidev, pcifunc;
+
+	if (!hwloc_topology_is_thissystem(topology)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	sprintf(path, "/sys/class/mic/mic%d", idx);
+	sysdir = opendir(path);
+	if (!sysdir)
+		return -1;
+
+	while ((dirent = readdir(sysdir)) != NULL) {
+		if (sscanf(dirent->d_name, "pci_%02x:%02x.%02x", &pcibus, &pcidev, &pcifunc) == 3) {
+			sprintf(path, "/sys/class/mic/mic%d/pci_%02x:%02x.%02x/local_cpus", idx, pcibus, pcidev, pcifunc);
+			if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+			    || hwloc_bitmap_iszero(set))
+				hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+			break;
+		}
+	}
+
+	closedir(sysdir);
+#else
+	/* Non-Linux systems simply get a full cpuset */
+	hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+	return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * MIC device for the given index.
+ *
+ * Return the OS device object describing the MIC device whose index is \p idx.
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_intel_mic_get_device_osdev_by_index(hwloc_topology_t topology,
+					  unsigned idx)
+{
+	hwloc_obj_t osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type
+                    && osdev->name
+		    && !strncmp("mic", osdev->name, 3)
+		    && atoi(osdev->name + 3) == (int) idx)
+                        return osdev;
+        }
+        return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_INTEL_MIC_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/linux-libnuma.h b/src/3rdparty/hwloc/include/hwloc/linux-libnuma.h
new file mode 100644
index 000000000..7cea4166b
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/linux-libnuma.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2017 Inria.  All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Linux libnuma.
+ *
+ * Applications that use both Linux libnuma and hwloc may want to
+ * include this file so as to ease conversion between their respective types.
+*/
+
+#ifndef HWLOC_LINUX_LIBNUMA_H
+#define HWLOC_LINUX_LIBNUMA_H
+
+#include <hwloc.h>
+#include <numa.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_linux_libnuma_ulongs Interoperability with Linux libnuma unsigned long masks
+ *
+ * This interface helps converting between Linux libnuma unsigned long masks
+ * and hwloc cpusets and nodesets.
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * \note The behavior of libnuma is undefined if the kernel is not NUMA-aware.
+ * (when CONFIG_NUMA is not set in the kernel configuration).
+ * This helper and libnuma may thus not be strictly compatible in this case,
+ * which may be detected by checking whether numa_available() returns -1.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p cpuset into the array of unsigned long \p mask
+ *
+ * \p mask is the array of unsigned long that will be filled.
+ * \p maxnode contains the maximal node number that may be stored in \p mask.
+ * \p maxnode will be set to the maximal node number that was found, plus one.
+ *
+ * This function may be used before calling set_mempolicy, mbind, migrate_pages
+ * or any other function that takes an array of unsigned long and a maximal
+ * node number as input parameter.
+ */
+static __hwloc_inline int
+hwloc_cpuset_to_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset,
+				    unsigned long *mask, unsigned long *maxnode)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  unsigned long outmaxnode = -1;
+  hwloc_obj_t node = NULL;
+
+  /* round-up to the next ulong and clear all bytes */
+  *maxnode = (*maxnode + 8*sizeof(*mask) - 1) & ~(8*sizeof(*mask) - 1);
+  memset(mask, 0, *maxnode/8);
+
+  while ((node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, cpuset, depth, node)) != NULL) {
+    if (node->os_index >= *maxnode)
+      continue;
+    mask[node->os_index/sizeof(*mask)/8] |= 1UL << (node->os_index % (sizeof(*mask)*8));
+    if (outmaxnode == (unsigned long) -1 || outmaxnode < node->os_index)
+      outmaxnode = node->os_index;
+  }
+
+  *maxnode = outmaxnode+1;
+  return 0;
+}
+
+/** \brief Convert hwloc NUMA node set \p nodeset into the array of unsigned long \p mask
+ *
+ * \p mask is the array of unsigned long that will be filled.
+ * \p maxnode contains the maximal node number that may be stored in \p mask.
+ * \p maxnode will be set to the maximal node number that was found, plus one.
+ *
+ * This function may be used before calling set_mempolicy, mbind, migrate_pages
+ * or any other function that takes an array of unsigned long and a maximal
+ * node number as input parameter.
+ */
+static __hwloc_inline int
+hwloc_nodeset_to_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset,
+				      unsigned long *mask, unsigned long *maxnode)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  unsigned long outmaxnode = -1;
+  hwloc_obj_t node = NULL;
+
+  /* round-up to the next ulong and clear all bytes */
+  *maxnode = (*maxnode + 8*sizeof(*mask) - 1) & ~(8*sizeof(*mask) - 1);
+  memset(mask, 0, *maxnode/8);
+
+  while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL) {
+    if (node->os_index >= *maxnode)
+      continue;
+    if (!hwloc_bitmap_isset(nodeset, node->os_index))
+      continue;
+    mask[node->os_index/sizeof(*mask)/8] |= 1UL << (node->os_index % (sizeof(*mask)*8));
+    if (outmaxnode == (unsigned long) -1 || outmaxnode < node->os_index)
+      outmaxnode = node->os_index;
+  }
+
+  *maxnode = outmaxnode+1;
+  return 0;
+}
+
+/** \brief Convert the array of unsigned long \p mask into hwloc CPU set
+ *
+ * \p mask is an array of unsigned long that will be read.
+ * \p maxnode contains the maximal node number that may be read in \p mask.
+ *
+ * This function may be used after calling get_mempolicy or any other function
+ * that takes an array of unsigned long as output parameter (and possibly
+ * a maximal node number as input parameter).
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_cpuset_t cpuset,
+				      const unsigned long *mask, unsigned long maxnode)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  hwloc_bitmap_zero(cpuset);
+  while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+    if (node->os_index < maxnode
+	&& (mask[node->os_index/sizeof(*mask)/8] & (1UL << (node->os_index % (sizeof(*mask)*8)))))
+      hwloc_bitmap_or(cpuset, cpuset, node->cpuset);
+  return 0;
+}
+
+/** \brief Convert the array of unsigned long \p mask into hwloc NUMA node set
+ *
+ * \p mask is an array of unsigned long that will be read.
+ * \p maxnode contains the maximal node number that may be read in \p mask.
+ *
+ * This function may be used after calling get_mempolicy or any other function
+ * that takes an array of unsigned long as output parameter (and possibly
+ * a maximal node number as input parameter).
+ */
+static __hwloc_inline int
+hwloc_nodeset_from_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_nodeset_t nodeset,
+					const unsigned long *mask, unsigned long maxnode)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  hwloc_bitmap_zero(nodeset);
+  while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+    if (node->os_index < maxnode
+	&& (mask[node->os_index/sizeof(*mask)/8] & (1UL << (node->os_index % (sizeof(*mask)*8)))))
+      hwloc_bitmap_set(nodeset, node->os_index);
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_linux_libnuma_bitmask Interoperability with Linux libnuma bitmask
+ *
+ * This interface helps converting between Linux libnuma bitmasks
+ * and hwloc cpusets and nodesets.
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * \note The behavior of libnuma is undefined if the kernel is not NUMA-aware.
+ * (when CONFIG_NUMA is not set in the kernel configuration).
+ * This helper and libnuma may thus not be strictly compatible in this case,
+ * which may be detected by checking whether numa_available() returns -1.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p cpuset into the returned libnuma bitmask
+ *
+ * The returned bitmask should later be freed with numa_bitmask_free.
+ *
+ * This function may be used before calling many numa_ functions
+ * that use a struct bitmask as an input parameter.
+ *
+ * \return newly allocated struct bitmask.
+ */
+static __hwloc_inline struct bitmask *
+hwloc_cpuset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset) __hwloc_attribute_malloc;
+static __hwloc_inline struct bitmask *
+hwloc_cpuset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  struct bitmask *bitmask = numa_allocate_cpumask();
+  if (!bitmask)
+    return NULL;
+  while ((node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, cpuset, depth, node)) != NULL)
+    if (node->attr->numanode.local_memory)
+      numa_bitmask_setbit(bitmask, node->os_index);
+  return bitmask;
+}
+
+/** \brief Convert hwloc NUMA node set \p nodeset into the returned libnuma bitmask
+ *
+ * The returned bitmask should later be freed with numa_bitmask_free.
+ *
+ * This function may be used before calling many numa_ functions
+ * that use a struct bitmask as an input parameter.
+ *
+ * \return newly allocated struct bitmask.
+ */
+static __hwloc_inline struct bitmask *
+hwloc_nodeset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset) __hwloc_attribute_malloc;
+static __hwloc_inline struct bitmask *
+hwloc_nodeset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  struct bitmask *bitmask = numa_allocate_cpumask();
+  if (!bitmask)
+    return NULL;
+  while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+    if (hwloc_bitmap_isset(nodeset, node->os_index) && node->attr->numanode.local_memory)
+      numa_bitmask_setbit(bitmask, node->os_index);
+  return bitmask;
+}
+
+/** \brief Convert libnuma bitmask \p bitmask into hwloc CPU set \p cpuset
+ *
+ * This function may be used after calling many numa_ functions
+ * that use a struct bitmask as an output parameter.
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_cpuset_t cpuset,
+					const struct bitmask *bitmask)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  hwloc_bitmap_zero(cpuset);
+  while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+    if (numa_bitmask_isbitset(bitmask, node->os_index))
+      hwloc_bitmap_or(cpuset, cpuset, node->cpuset);
+  return 0;
+}
+
+/** \brief Convert libnuma bitmask \p bitmask into hwloc NUMA node set \p nodeset
+ *
+ * This function may be used after calling many numa_ functions
+ * that use a struct bitmask as an output parameter.
+ */
+static __hwloc_inline int
+hwloc_nodeset_from_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_nodeset_t nodeset,
+					 const struct bitmask *bitmask)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  hwloc_bitmap_zero(nodeset);
+  while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+    if (numa_bitmask_isbitset(bitmask, node->os_index))
+      hwloc_bitmap_set(nodeset, node->os_index);
+  return 0;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_LINUX_LIBNUMA_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/linux.h b/src/3rdparty/hwloc/include/hwloc/linux.h
new file mode 100644
index 000000000..c409e1c2a
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/linux.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2016 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Linux.
+ *
+ * Applications that use hwloc on Linux may want to include this file
+ * if using some low-level Linux features.
+ */
+
+#ifndef HWLOC_LINUX_H
+#define HWLOC_LINUX_H
+
+#include <hwloc.h>
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_linux Linux-specific helpers
+ *
+ * This includes helpers for manipulating Linux kernel cpumap files, and hwloc
+ * equivalents of the Linux sched_setaffinity and sched_getaffinity system calls.
+ *
+ * @{
+ */
+
+/** \brief Bind a thread \p tid on cpus given in cpuset \p set
+ *
+ * The behavior is exactly the same as the Linux sched_setaffinity system call,
+ * but uses a hwloc cpuset.
+ *
+ * \note This is equivalent to calling hwloc_set_proc_cpubind() with
+ * HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_const_cpuset_t set);
+
+/** \brief Get the current binding of thread \p tid
+ *
+ * The behavior is exactly the same as the Linux sched_getaffinity system call,
+ * but uses a hwloc cpuset.
+ *
+ * \note This is equivalent to calling hwloc_get_proc_cpubind() with
+ * ::HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_get_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_cpuset_t set);
+
+/** \brief Get the last physical CPU where thread \p tid ran.
+ *
+ * \note This is equivalent to calling hwloc_get_proc_last_cpu_location() with
+ * ::HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology, pid_t tid, hwloc_bitmap_t set);
+
+/** \brief Convert a linux kernel cpumask file \p path into a hwloc bitmap \p set.
+ *
+ * Might be used when reading CPU set from sysfs attributes such as topology
+ * and caches for processors, or local_cpus for devices.
+ *
+ * \note This function ignores the HWLOC_FSROOT environment variable.
+ */
+HWLOC_DECLSPEC int hwloc_linux_read_path_as_cpumask(const char *path, hwloc_bitmap_t set);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_LINUX_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/nvml.h b/src/3rdparty/hwloc/include/hwloc/nvml.h
new file mode 100644
index 000000000..197108660
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/nvml.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright © 2012-2016 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the NVIDIA Management Library.
+ *
+ * Applications that use both hwloc and the NVIDIA Management Library may want to
+ * include this file so as to get topology information for NVML devices.
+ */
+
+#ifndef HWLOC_NVML_H
+#define HWLOC_NVML_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <nvml.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_nvml Interoperability with the NVIDIA Management Library
+ *
+ * This interface offers ways to retrieve topology information about
+ * devices managed by the NVIDIA Management Library (NVML).
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to NVML device \p device.
+ *
+ * Return the CPU set describing the locality of the NVML device \p device.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the NVML component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_nvml_get_device_osdev()
+ * and hwloc_nvml_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_nvml_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			     nvmlDevice_t device, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_NVML_DEVICE_SYSFS_PATH_MAX 128
+  char path[HWLOC_NVML_DEVICE_SYSFS_PATH_MAX];
+  nvmlReturn_t nvres;
+  nvmlPciInfo_t pci;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  nvres = nvmlDeviceGetPciInfo(device, &pci);
+  if (NVML_SUCCESS != nvres) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", pci.domain, pci.bus, pci.device);
+  if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+      || hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * NVML device whose index is \p idx.
+ *
+ * Return the OS device object describing the NVML device whose
+ * index is \p idx. Returns NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the NVML component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object (unless PCI devices are filtered out).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_nvml_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+	hwloc_obj_t osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+                if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
+                    && osdev->name
+		    && !strncmp("nvml", osdev->name, 4)
+		    && atoi(osdev->name + 4) == (int) idx)
+                        return osdev;
+        }
+        return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to NVML device \p device.
+ *
+ * Return the hwloc OS device object that describes the given
+ * NVML device \p device. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the NVML component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_nvml_get_device_cpuset().
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer (unless PCI devices are filtered out).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_nvml_get_device_osdev(hwloc_topology_t topology, nvmlDevice_t device)
+{
+	hwloc_obj_t osdev;
+	nvmlReturn_t nvres;
+	nvmlPciInfo_t pci;
+	char uuid[64];
+
+	if (!hwloc_topology_is_thissystem(topology)) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	nvres = nvmlDeviceGetPciInfo(device, &pci);
+	if (NVML_SUCCESS != nvres)
+		return NULL;
+
+	nvres = nvmlDeviceGetUUID(device, uuid, sizeof(uuid));
+	if (NVML_SUCCESS != nvres)
+		uuid[0] = '\0';
+
+	osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		hwloc_obj_t pcidev = osdev->parent;
+		const char *info;
+
+		if (strncmp(osdev->name, "nvml", 4))
+			continue;
+
+		if (pcidev
+		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+		    && pcidev->attr->pcidev.domain == pci.domain
+		    && pcidev->attr->pcidev.bus == pci.bus
+		    && pcidev->attr->pcidev.dev == pci.device
+		    && pcidev->attr->pcidev.func == 0)
+			return osdev;
+
+		info = hwloc_obj_get_info_by_name(osdev, "NVIDIAUUID");
+		if (info && !strcmp(info, uuid))
+			return osdev;
+	}
+
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_NVML_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/opencl.h b/src/3rdparty/hwloc/include/hwloc/opencl.h
new file mode 100644
index 000000000..058968d74
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/opencl.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright © 2012-2018 Inria.  All rights reserved.
+ * Copyright © 2013, 2018 Université Bordeaux.  All right reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the OpenCL interface.
+ *
+ * Applications that use both hwloc and OpenCL may want to
+ * include this file so as to get topology information for OpenCL devices.
+ */
+
+#ifndef HWLOC_OPENCL_H
+#define HWLOC_OPENCL_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_ext.h>
+#else
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#endif
+
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_opencl Interoperability with OpenCL
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenCL devices.
+ *
+ * Only the AMD OpenCL interface currently offers useful locality information
+ * about its devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to OpenCL device \p device.
+ *
+ * Return the CPU set describing the locality of the OpenCL device \p device.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the OpenCL component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_opencl_get_device_osdev()
+ * and hwloc_opencl_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux with the AMD OpenCL implementation; other systems will simply
+ * get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_opencl_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			       cl_device_id device __hwloc_attribute_unused,
+			       hwloc_cpuset_t set)
+{
+#if (defined HWLOC_LINUX_SYS) && (defined CL_DEVICE_TOPOLOGY_AMD)
+	/* If we're on Linux + AMD OpenCL, use the AMD extension + the sysfs mechanism to get the local cpus */
+#define HWLOC_OPENCL_DEVICE_SYSFS_PATH_MAX 128
+	char path[HWLOC_OPENCL_DEVICE_SYSFS_PATH_MAX];
+	cl_device_topology_amd amdtopo;
+	cl_int clret;
+
+	if (!hwloc_topology_is_thissystem(topology)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	clret = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+	if (CL_SUCCESS != clret) {
+		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+		return 0;
+	}
+	if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+		return 0;
+	}
+
+	sprintf(path, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/local_cpus",
+		(unsigned) amdtopo.pcie.bus, (unsigned) amdtopo.pcie.device, (unsigned) amdtopo.pcie.function);
+	if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+	    || hwloc_bitmap_iszero(set))
+		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#else
+	/* Non-Linux + AMD OpenCL systems simply get a full cpuset */
+	hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenCL device for the given indexes.
+ *
+ * Return the OS device object describing the OpenCL device
+ * whose platform index is \p platform_index,
+ * and whose device index within this platform if \p device_index.
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the OpenCL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object (unless PCI devices are filtered out).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_opencl_get_device_osdev_by_index(hwloc_topology_t topology,
+				       unsigned platform_index, unsigned device_index)
+{
+	unsigned x = (unsigned) -1, y = (unsigned) -1;
+	hwloc_obj_t osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type
+                    && osdev->name
+		    && sscanf(osdev->name, "opencl%ud%u", &x, &y) == 2
+		    && platform_index == x && device_index == y)
+                        return osdev;
+        }
+        return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to OpenCL device \p deviceX.
+ *
+ * Use OpenCL device attributes to find the corresponding hwloc OS device object.
+ * Return NULL if there is none or if useful attributes are not available.
+ *
+ * This function currently only works on AMD OpenCL devices that support
+ * the CL_DEVICE_TOPOLOGY_AMD extension. hwloc_opencl_get_device_osdev_by_index()
+ * should be preferred whenever possible, i.e. when platform and device index
+ * are known.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the OpenCL component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_opencl_get_device_cpuset().
+ *
+ * \note This function cannot work if PCI devices are filtered out.
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer (unless PCI devices are filtered out).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_opencl_get_device_osdev(hwloc_topology_t topology __hwloc_attribute_unused,
+			      cl_device_id device __hwloc_attribute_unused)
+{
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+	hwloc_obj_t osdev;
+	cl_device_topology_amd amdtopo;
+	cl_int clret;
+
+	clret = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+	if (CL_SUCCESS != clret) {
+		errno = EINVAL;
+		return NULL;
+	}
+	if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		hwloc_obj_t pcidev = osdev->parent;
+		if (strncmp(osdev->name, "opencl", 6))
+			continue;
+		if (pcidev
+		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+		    && pcidev->attr->pcidev.domain == 0
+		    && pcidev->attr->pcidev.bus == amdtopo.pcie.bus
+		    && pcidev->attr->pcidev.dev == amdtopo.pcie.device
+		    && pcidev->attr->pcidev.func == amdtopo.pcie.function)
+			return osdev;
+		/* if PCI are filtered out, we need a info attr to match on */
+	}
+
+	return NULL;
+#else
+	return NULL;
+#endif
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_OPENCL_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/openfabrics-verbs.h b/src/3rdparty/hwloc/include/hwloc/openfabrics-verbs.h
new file mode 100644
index 000000000..174ab4a57
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/openfabrics-verbs.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2016 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and OpenFabrics
+ * verbs.
+ *
+ * Applications that use both hwloc and OpenFabrics verbs may want to
+ * include this file so as to get topology information for OpenFabrics
+ * hardware (InfiniBand, etc).
+ *
+ */
+
+#ifndef HWLOC_OPENFABRICS_VERBS_H
+#define HWLOC_OPENFABRICS_VERBS_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <infiniband/verbs.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_openfabrics Interoperability with OpenFabrics
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenFabrics devices (InfiniBand, Omni-Path, usNIC, etc).
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p ibdev.
+ *
+ * Return the CPU set describing the locality of the OpenFabrics
+ * device \p ibdev (InfiniBand, etc).
+ *
+ * Topology \p topology and device \p ibdev must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_ibv_get_device_osdev()
+ * and hwloc_ibv_get_device_osdev_by_name().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_ibv_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			    struct ibv_device *ibdev, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the verbs-provided sysfs mechanism to
+     get the local cpus */
+#define HWLOC_OPENFABRICS_VERBS_SYSFS_PATH_MAX 128
+  char path[HWLOC_OPENFABRICS_VERBS_SYSFS_PATH_MAX];
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  sprintf(path, "/sys/class/infiniband/%s/device/local_cpus",
+	  ibv_get_device_name(ibdev));
+  if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+      || hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the OpenFabrics
+ * device named \p ibname.
+ *
+ * Return the OS device object describing the OpenFabrics device
+ * (InfiniBand, Omni-Path, usNIC, etc) whose name is \p ibname
+ * (mlx5_0, hfi1_0, usnic_0, qib0, etc).
+ * Returns NULL if there is none.
+ * The name \p ibname is usually obtained from ibv_get_device_name().
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_ibv_get_device_osdev_by_name(hwloc_topology_t topology,
+				   const char *ibname)
+{
+	hwloc_obj_t osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		if (HWLOC_OBJ_OSDEV_OPENFABRICS == osdev->attr->osdev.type
+		    && osdev->name && !strcmp(ibname, osdev->name))
+			return osdev;
+	}
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the OpenFabrics
+ * device \p ibdev.
+ *
+ * Return the OS device object describing the OpenFabrics device \p ibdev
+ * (InfiniBand, etc). Returns NULL if there is none.
+ *
+ * Topology \p topology and device \p ibdev must match the local machine.
+ * I/O devices detection must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_ibv_get_device_cpuset().
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_ibv_get_device_osdev(hwloc_topology_t topology,
+			   struct ibv_device *ibdev)
+{
+	if (!hwloc_topology_is_thissystem(topology)) {
+		errno = EINVAL;
+		return NULL;
+	}
+	return hwloc_ibv_get_device_osdev_by_name(topology, ibv_get_device_name(ibdev));
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_OPENFABRICS_VERBS_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/plugins.h b/src/3rdparty/hwloc/include/hwloc/plugins.h
new file mode 100644
index 000000000..cb22000d4
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/plugins.h
@@ -0,0 +1,542 @@
+/*
+ * Copyright © 2013-2017 Inria.  All rights reserved.
+ * Copyright © 2016 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_PLUGINS_H
+#define HWLOC_PLUGINS_H
+
+/** \file
+ * \brief Public interface for building hwloc plugins.
+ */
+
+struct hwloc_backend;
+
+#include <hwloc.h>
+#ifdef HWLOC_INSIDE_PLUGIN
+/* needed for hwloc_plugin_check_namespace() */
+#include <ltdl.h>
+#endif
+
+
+
+/** \defgroup hwlocality_disc_components Components and Plugins: Discovery components
+ * @{
+ */
+
+/** \brief Discovery component type */
+typedef enum hwloc_disc_component_type_e {
+  /** \brief CPU-only discovery through the OS, or generic no-OS support.
+   * \hideinitializer */
+  HWLOC_DISC_COMPONENT_TYPE_CPU = (1<<0),
+
+  /** \brief xml or synthetic,
+   * platform-specific components such as bgq.
+   * Anything that discovers CPU and everything else.
+   * No misc backend is expected to complement a global component.
+   * \hideinitializer */
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL = (1<<1),
+
+  /** \brief OpenCL, Cuda, etc.
+   * \hideinitializer */
+  HWLOC_DISC_COMPONENT_TYPE_MISC = (1<<2)
+} hwloc_disc_component_type_t;
+
+/** \brief Discovery component structure
+ *
+ * This is the major kind of components, taking care of the discovery.
+ * They are registered by generic components, either statically-built or as plugins.
+ */
+struct hwloc_disc_component {
+  /** \brief Discovery component type */
+  hwloc_disc_component_type_t type;
+
+  /** \brief Name.
+   * If this component is built as a plugin, this name does not have to match the plugin filename.
+   */
+  const char *name;
+
+  /** \brief Component types to exclude, as an OR'ed set of ::hwloc_disc_component_type_e.
+   *
+   * For a GLOBAL component, this usually includes all other types (~0).
+   *
+   * Other components only exclude types that may bring conflicting
+   * topology information. MISC components should likely not be excluded
+   * since they usually bring non-primary additional information.
+   */
+  unsigned excludes;
+
+  /** \brief Instantiate callback to create a backend from the component.
+   * Parameters data1, data2, data3 are NULL except for components
+   * that have special enabling routines such as hwloc_topology_set_xml(). */
+  struct hwloc_backend * (*instantiate)(struct hwloc_disc_component *component, const void *data1, const void *data2, const void *data3);
+
+  /** \brief Component priority.
+   * Used to sort topology->components, higher priority first.
+   * Also used to decide between two components with the same name.
+   *
+   * Usual values are
+   * 50 for native OS (or platform) components,
+   * 45 for x86,
+   * 40 for no-OS fallback,
+   * 30 for global components (xml, synthetic),
+   * 20 for pci,
+   * 10 for other misc components (opencl etc.).
+   */
+  unsigned priority;
+
+  /** \brief Enabled by default.
+   * If unset, it will be disabled unless explicitly requested.
+   */
+  unsigned enabled_by_default;
+
+  /** \private Used internally to list components by priority on topology->components
+   * (the component structure is usually read-only,
+   *  the core copies it before using this field for queueing)
+   */
+  struct hwloc_disc_component * next;
+};
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_disc_backends Components and Plugins: Discovery backends
+ * @{
+ */
+
+/** \brief Discovery backend structure
+ *
+ * A backend is the instantiation of a discovery component.
+ * When a component gets enabled for a topology,
+ * its instantiate() callback creates a backend.
+ *
+ * hwloc_backend_alloc() initializes all fields to default values
+ * that the component may change (except "component" and "next")
+ * before enabling the backend with hwloc_backend_enable().
+ */
+struct hwloc_backend {
+  /** \private Reserved for the core, set by hwloc_backend_alloc() */
+  struct hwloc_disc_component * component;
+  /** \private Reserved for the core, set by hwloc_backend_enable() */
+  struct hwloc_topology * topology;
+  /** \private Reserved for the core. Set to 1 if forced through envvar, 0 otherwise. */
+  int envvar_forced;
+  /** \private Reserved for the core. Used internally to list backends in topology->backends. */
+  struct hwloc_backend * next;
+
+  /** \brief Backend flags, currently always 0. */
+  unsigned long flags;
+
+  /** \brief Backend-specific 'is_thissystem' property.
+   * Set to 0 or 1 if the backend should enforce the thissystem flag when it gets enabled.
+   * Set to -1 if the backend doesn't care (default). */
+  int is_thissystem;
+
+  /** \brief Backend private data, or NULL if none. */
+  void * private_data;
+  /** \brief Callback for freeing the private_data.
+   * May be NULL.
+   */
+  void (*disable)(struct hwloc_backend *backend);
+
+  /** \brief Main discovery callback.
+   * returns -1 on error, either because it couldn't add its objects to the existing topology,
+   * or because of an actual discovery/gathering failure.
+   * May be NULL.
+   */
+  int (*discover)(struct hwloc_backend *backend);
+
+  /** \brief Callback used by the PCI backend to retrieve the locality of a PCI object from the OS/cpu backend.
+   * May be NULL. */
+  int (*get_pci_busid_cpuset)(struct hwloc_backend *backend, struct hwloc_pcidev_attr_s *busid, hwloc_bitmap_t cpuset);
+};
+
+/** \brief Allocate a backend structure, set good default values, initialize backend->component and topology, etc.
+ * The caller will then modify whatever needed, and call hwloc_backend_enable().
+ */
+HWLOC_DECLSPEC struct hwloc_backend * hwloc_backend_alloc(struct hwloc_disc_component *component);
+
+/** \brief Enable a previously allocated and setup backend. */
+HWLOC_DECLSPEC int hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *backend);
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_generic_components Components and Plugins: Generic components
+ * @{
+ */
+
+/** \brief Generic component type */
+typedef enum hwloc_component_type_e {
+  /** \brief The data field must point to a struct hwloc_disc_component. */
+  HWLOC_COMPONENT_TYPE_DISC,
+
+  /** \brief The data field must point to a struct hwloc_xml_component. */
+  HWLOC_COMPONENT_TYPE_XML
+} hwloc_component_type_t;
+
+/** \brief Generic component structure
+ *
+ * Generic components structure, either statically listed by configure in static-components.h
+ * or dynamically loaded as a plugin.
+ */
+struct hwloc_component {
+  /** \brief Component ABI version, set to ::HWLOC_COMPONENT_ABI */
+  unsigned abi;
+
+  /** \brief Process-wide component initialization callback.
+   *
+   * This optional callback is called when the component is registered
+   * to the hwloc core (after loading the plugin).
+   *
+   * When the component is built as a plugin, this callback
+   * should call hwloc_plugin_check_namespace()
+   * and return a negative error code on error.
+   *
+   * \p flags is always 0 for now.
+   *
+   * \return 0 on success, or a negative code on error.
+   *
+   * \note If the component uses ltdl for loading its own plugins,
+   * it should load/unload them only in init() and finalize(),
+   * to avoid race conditions with hwloc's use of ltdl.
+   */
+  int (*init)(unsigned long flags);
+
+  /** \brief Process-wide component termination callback.
+   *
+   * This optional callback is called after unregistering the component
+   * from the hwloc core (before unloading the plugin).
+   *
+   * \p flags is always 0 for now.
+   *
+   * \note If the component uses ltdl for loading its own plugins,
+   * it should load/unload them only in init() and finalize(),
+   * to avoid race conditions with hwloc's use of ltdl.
+   */
+  void (*finalize)(unsigned long flags);
+
+  /** \brief Component type */
+  hwloc_component_type_t type;
+
+  /** \brief Component flags, unused for now */
+  unsigned long flags;
+
+  /** \brief Component data, pointing to a struct hwloc_disc_component or struct hwloc_xml_component. */
+  void * data;
+};
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_core_funcs Components and Plugins: Core functions to be used by components
+ * @{
+ */
+
+/** \brief Add an object to the topology.
+ *
+ * It is sorted along the tree of other objects according to the inclusion of
+ * cpusets, to eventually be added as a child of the smallest object including
+ * this object.
+ *
+ * If the cpuset is empty, the type of the object (and maybe some attributes)
+ * must be enough to find where to insert the object. This is especially true
+ * for NUMA nodes with memory and no CPUs.
+ *
+ * The given object should not have children.
+ *
+ * This shall only be called before levels are built.
+ *
+ * In case of error, hwloc_report_os_error() is called.
+ *
+ * The caller should check whether the object type is filtered-out before calling this function.
+ *
+ * The topology cpuset/nodesets will be enlarged to include the object sets.
+ *
+ * Returns the object on success.
+ * Returns NULL and frees obj on error.
+ * Returns another object and frees obj if it was merged with an identical pre-existing object.
+ */
+HWLOC_DECLSPEC struct hwloc_obj *hwloc_insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj);
+
+/** \brief Type of error callbacks during object insertion */
+typedef void (*hwloc_report_error_t)(const char * msg, int line);
+/** \brief Report an insertion error from a backend */
+HWLOC_DECLSPEC void hwloc_report_os_error(const char * msg, int line);
+/** \brief Check whether insertion errors are hidden */
+HWLOC_DECLSPEC int hwloc_hide_errors(void);
+
+/** \brief Add an object to the topology and specify which error callback to use.
+ *
+ * This function is similar to hwloc_insert_object_by_cpuset() but it allows specifying
+ * where to start insertion from (if \p root is NULL, the topology root object is used),
+ * and specifying the error callback.
+ */
+HWLOC_DECLSPEC struct hwloc_obj *hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t root, hwloc_obj_t obj, hwloc_report_error_t report_error);
+
+/** \brief Insert an object somewhere in the topology.
+ *
+ * It is added as the last child of the given parent.
+ * The cpuset is completely ignored, so strange objects such as I/O devices should
+ * preferably be inserted with this.
+ *
+ * When used for "normal" children with cpusets (when importing from XML
+ * when duplicating a topology), the caller should make sure that:
+ * - children are inserted in order,
+ * - children cpusets do not intersect.
+ *
+ * The given object may have normal, I/O or Misc children, as long as they are in order as well.
+ * These children must have valid parent and next_sibling pointers.
+ *
+ * The caller should check whether the object type is filtered-out before calling this function.
+ */
+HWLOC_DECLSPEC void hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_obj_t obj);
+
+/** \brief Allocate and initialize an object of the given type and physical index.
+ *
+ * If \p os_index is unknown or irrelevant, use \c HWLOC_UNKNOWN_INDEX.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_alloc_setup_object(hwloc_topology_t topology, hwloc_obj_type_t type, unsigned os_index);
+
+/** \brief Setup object cpusets/nodesets by OR'ing its children.
+ *
+ * Used when adding an object late in the topology.
+ * Will update the new object by OR'ing all its new children sets.
+ *
+ * Used when PCI backend adds a hostbridge parent, when distances
+ * add a new Group, etc.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_children_sets(hwloc_obj_t obj);
+
+/** \brief Request a reconnection of children and levels in the topology.
+ *
+ * May be used by backends during discovery if they need arrays or lists
+ * of object within levels or children to be fully connected.
+ *
+ * \p flags is currently unused, must be 0.
+ */
+HWLOC_DECLSPEC int hwloc_topology_reconnect(hwloc_topology_t topology, unsigned long flags __hwloc_attribute_unused);
+
+/** \brief Make sure that plugins can lookup core symbols.
+ *
+ * This is a sanity check to avoid lazy-lookup failures when libhwloc
+ * is loaded within a plugin, and later tries to load its own plugins.
+ * This may fail (and abort the program) if libhwloc symbols are in a
+ * private namespace.
+ *
+ * \return 0 on success.
+ * \return -1 if the plugin cannot be successfully loaded. The caller
+ * plugin init() callback should return a negative error code as well.
+ *
+ * Plugins should call this function in their init() callback to avoid
+ * later crashes if lazy symbol resolution is used by the upper layer that
+ * loaded hwloc (e.g. OpenCL implementations using dlopen with RTLD_LAZY).
+ *
+ * \note The build system must define HWLOC_INSIDE_PLUGIN if and only if
+ * building the caller as a plugin.
+ *
+ * \note This function should remain inline so plugins can call it even
+ * when they cannot find libhwloc symbols.
+ */
+static __hwloc_inline int
+hwloc_plugin_check_namespace(const char *pluginname __hwloc_attribute_unused, const char *symbol __hwloc_attribute_unused)
+{
+#ifdef HWLOC_INSIDE_PLUGIN
+  lt_dlhandle handle;
+  void *sym;
+  handle = lt_dlopen(NULL);
+  if (!handle)
+    /* cannot check, assume things will work */
+    return 0;
+  sym = lt_dlsym(handle, symbol);
+  lt_dlclose(handle);
+  if (!sym) {
+    static int verboseenv_checked = 0;
+    static int verboseenv_value = 0;
+    if (!verboseenv_checked) {
+      const char *verboseenv = getenv("HWLOC_PLUGINS_VERBOSE");
+      verboseenv_value = verboseenv ? atoi(verboseenv) : 0;
+      verboseenv_checked = 1;
+    }
+    if (verboseenv_value)
+      fprintf(stderr, "Plugin `%s' disabling itself because it cannot find the `%s' core symbol.\n",
+	      pluginname, symbol);
+    return -1;
+  }
+#endif /* HWLOC_INSIDE_PLUGIN */
+  return 0;
+}
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_filtering Components and Plugins: Filtering objects
+ * @{
+ */
+
+/** \brief Check whether the given PCI device classid is important.
+ *
+ * \return 1 if important, 0 otherwise.
+ */
+static __hwloc_inline int
+hwloc_filter_check_pcidev_subtype_important(unsigned classid)
+{
+  unsigned baseclass = classid >> 8;
+  return (baseclass == 0x03 /* PCI_BASE_CLASS_DISPLAY */
+	  || baseclass == 0x02 /* PCI_BASE_CLASS_NETWORK */
+	  || baseclass == 0x01 /* PCI_BASE_CLASS_STORAGE */
+	  || baseclass == 0x0b /* PCI_BASE_CLASS_PROCESSOR */
+	  || classid == 0x0c04 /* PCI_CLASS_SERIAL_FIBER */
+	  || classid == 0x0c06 /* PCI_CLASS_SERIAL_INFINIBAND */
+	  || baseclass == 0x12 /* Processing Accelerators */);
+}
+
+/** \brief Check whether the given OS device subtype is important.
+ *
+ * \return 1 if important, 0 otherwise.
+ */
+static __hwloc_inline int
+hwloc_filter_check_osdev_subtype_important(hwloc_obj_osdev_type_t subtype)
+{
+  return (subtype != HWLOC_OBJ_OSDEV_DMA);
+}
+
+/** \brief Check whether a non-I/O object type should be filtered-out.
+ *
+ * Cannot be used for I/O objects.
+ *
+ * \return 1 if the object type should be kept, 0 otherwise.
+ */
+static __hwloc_inline int
+hwloc_filter_check_keep_object_type(hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  enum hwloc_type_filter_e filter = HWLOC_TYPE_FILTER_KEEP_NONE;
+  hwloc_topology_get_type_filter(topology, type, &filter);
+  assert(filter != HWLOC_TYPE_FILTER_KEEP_IMPORTANT); /* IMPORTANT only used for I/O */
+  return filter == HWLOC_TYPE_FILTER_KEEP_NONE ? 0 : 1;
+}
+
+/** \brief Check whether the given object should be filtered-out.
+ *
+ * \return 1 if the object type should be kept, 0 otherwise.
+ */
+static __hwloc_inline int
+hwloc_filter_check_keep_object(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  hwloc_obj_type_t type = obj->type;
+  enum hwloc_type_filter_e filter = HWLOC_TYPE_FILTER_KEEP_NONE;
+  hwloc_topology_get_type_filter(topology, type, &filter);
+  if (filter == HWLOC_TYPE_FILTER_KEEP_NONE)
+    return 0;
+  if (filter == HWLOC_TYPE_FILTER_KEEP_IMPORTANT) {
+    if (type == HWLOC_OBJ_PCI_DEVICE)
+      return hwloc_filter_check_pcidev_subtype_important(obj->attr->pcidev.class_id);
+    if (type == HWLOC_OBJ_OS_DEVICE)
+      return hwloc_filter_check_osdev_subtype_important(obj->attr->osdev.type);
+  }
+  return 1;
+}
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_pcidisc Components and Plugins: helpers for PCI discovery
+ * @{
+ */
+
+/** \brief Return the offset of the given capability in the PCI config space buffer
+ *
+ * This function requires a 256-bytes config space. Unknown/unavailable bytes should be set to 0xff.
+ */
+HWLOC_DECLSPEC unsigned hwloc_pcidisc_find_cap(const unsigned char *config, unsigned cap);
+
+/** \brief Fill linkspeed by reading the PCI config space where PCI_CAP_ID_EXP is at position offset.
+ *
+ * Needs 20 bytes of EXP capability block starting at offset in the config space
+ * for registers up to link status.
+ */
+HWLOC_DECLSPEC int hwloc_pcidisc_find_linkspeed(const unsigned char *config, unsigned offset, float *linkspeed);
+
+/** \brief Return the hwloc object type (PCI device or Bridge) for the given class and configuration space.
+ *
+ * This function requires 16 bytes of common configuration header at the beginning of config.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_pcidisc_check_bridge_type(unsigned device_class, const unsigned char *config);
+
+/** \brief Fills the attributes of the given PCI bridge using the given PCI config space.
+ *
+ * This function requires 32 bytes of common configuration header at the beginning of config.
+ *
+ * Returns -1 and destroys \p obj if bridge fields are invalid.
+ */
+HWLOC_DECLSPEC int hwloc_pcidisc_setup_bridge_attr(hwloc_obj_t obj, const unsigned char *config);
+
+/** \brief Insert a PCI object in the given PCI tree by looking at PCI bus IDs.
+ *
+ * If \p treep points to \c NULL, the new object is inserted there.
+ */
+HWLOC_DECLSPEC void hwloc_pcidisc_tree_insert_by_busid(struct hwloc_obj **treep, struct hwloc_obj *obj);
+
+/** \brief Add some hostbridges on top of the given tree of PCI objects and attach them to the topology.
+ *
+ * For now, they will be attached to the root object. The core will move them to their actual PCI
+ * locality using hwloc_pci_belowroot_apply_locality() at the end of the discovery.
+ *
+ * In the meantime, other backends lookup PCI objects or localities (for instance to attach OS devices)
+ * by using hwloc_pcidisc_find_by_busid() or hwloc_pcidisc_find_busid_parent().
+ */
+HWLOC_DECLSPEC int hwloc_pcidisc_tree_attach(struct hwloc_topology *topology, struct hwloc_obj *tree);
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_pcifind Components and Plugins: finding PCI objects during other discoveries
+ * @{
+ */
+
+/** \brief Find the PCI object that matches the bus ID.
+ *
+ * To be used after a PCI backend added PCI devices with hwloc_pcidisc_tree_attach()
+ * and before the core moves them to their actual location with hwloc_pci_belowroot_apply_locality().
+ *
+ * If no exactly matching object is found, return the container bridge if any, or NULL.
+ *
+ * On failure, it may be possible to find the PCI locality (instead of the PCI device)
+ * by calling hwloc_pcidisc_find_busid_parent().
+ *
+ * \note This is semantically identical to hwloc_get_pcidev_by_busid() which only works
+ * after the topology is fully loaded.
+ */
+HWLOC_DECLSPEC struct hwloc_obj * hwloc_pcidisc_find_by_busid(struct hwloc_topology *topology, unsigned domain, unsigned bus, unsigned dev, unsigned func);
+
+/** \brief Find the normal parent of a PCI bus ID.
+ *
+ * Look at PCI affinity to find out where the given PCI bus ID should be attached.
+ *
+ * This function should be used to attach an I/O device directly under a normal
+ * (non-I/O) object, instead of below a PCI object.
+ * It is usually used by backends when hwloc_pcidisc_find_by_busid() failed
+ * to find the hwloc object corresponding to this bus ID, for instance because
+ * PCI discovery is not supported on this platform.
+ */
+HWLOC_DECLSPEC struct hwloc_obj * hwloc_pcidisc_find_busid_parent(struct hwloc_topology *topology, unsigned domain, unsigned bus, unsigned dev, unsigned func);
+
+/** @} */
+
+
+
+
+#endif /* HWLOC_PLUGINS_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/rename.h b/src/3rdparty/hwloc/include/hwloc/rename.h
new file mode 100644
index 000000000..7cef1b2e8
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/rename.h
@@ -0,0 +1,765 @@
+/*
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2010-2018 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_RENAME_H
+#define HWLOC_RENAME_H
+
+#include <hwloc/autogen/config.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Only enact these defines if we're actually renaming the symbols
+   (i.e., avoid trying to have no-op defines if we're *not*
+   renaming). */
+
+#if HWLOC_SYM_TRANSFORM
+
+/* Use a preprocessor two-step in order to get the prefixing right.
+   Make 2 macros: HWLOC_NAME and HWLOC_NAME_CAPS for renaming
+   things. */
+
+#define HWLOC_MUNGE_NAME(a, b) HWLOC_MUNGE_NAME2(a, b)
+#define HWLOC_MUNGE_NAME2(a, b) a ## b
+#define HWLOC_NAME(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX, hwloc_ ## name)
+#define HWLOC_NAME_CAPS(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX_CAPS, hwloc_ ## name)
+
+/* Now define all the "real" names to be the prefixed names.  This
+   allows us to use the real names throughout the code base (i.e.,
+   "hwloc_<foo>"); the preprocessor will adjust to have the prefixed
+   name under the covers. */
+
+/* Names from hwloc.h */
+
+#define hwloc_get_api_version HWLOC_NAME(get_api_version)
+
+#define hwloc_topology HWLOC_NAME(topology)
+#define hwloc_topology_t HWLOC_NAME(topology_t)
+
+#define hwloc_cpuset_t HWLOC_NAME(cpuset_t)
+#define hwloc_const_cpuset_t HWLOC_NAME(const_cpuset_t)
+#define hwloc_nodeset_t HWLOC_NAME(nodeset_t)
+#define hwloc_const_nodeset_t HWLOC_NAME(const_nodeset_t)
+
+#define HWLOC_OBJ_MACHINE HWLOC_NAME_CAPS(OBJ_MACHINE)
+#define HWLOC_OBJ_NUMANODE HWLOC_NAME_CAPS(OBJ_NUMANODE)
+#define HWLOC_OBJ_PACKAGE HWLOC_NAME_CAPS(OBJ_PACKAGE)
+#define HWLOC_OBJ_CORE HWLOC_NAME_CAPS(OBJ_CORE)
+#define HWLOC_OBJ_PU HWLOC_NAME_CAPS(OBJ_PU)
+#define HWLOC_OBJ_L1CACHE HWLOC_NAME_CAPS(OBJ_L1CACHE)
+#define HWLOC_OBJ_L2CACHE HWLOC_NAME_CAPS(OBJ_L2CACHE)
+#define HWLOC_OBJ_L3CACHE HWLOC_NAME_CAPS(OBJ_L3CACHE)
+#define HWLOC_OBJ_L4CACHE HWLOC_NAME_CAPS(OBJ_L4CACHE)
+#define HWLOC_OBJ_L5CACHE HWLOC_NAME_CAPS(OBJ_L5CACHE)
+#define HWLOC_OBJ_L1ICACHE HWLOC_NAME_CAPS(OBJ_L1ICACHE)
+#define HWLOC_OBJ_L2ICACHE HWLOC_NAME_CAPS(OBJ_L2ICACHE)
+#define HWLOC_OBJ_L3ICACHE HWLOC_NAME_CAPS(OBJ_L3ICACHE)
+#define HWLOC_OBJ_MISC HWLOC_NAME_CAPS(OBJ_MISC)
+#define HWLOC_OBJ_GROUP HWLOC_NAME_CAPS(OBJ_GROUP)
+#define HWLOC_OBJ_BRIDGE HWLOC_NAME_CAPS(OBJ_BRIDGE)
+#define HWLOC_OBJ_PCI_DEVICE HWLOC_NAME_CAPS(OBJ_PCI_DEVICE)
+#define HWLOC_OBJ_OS_DEVICE HWLOC_NAME_CAPS(OBJ_OS_DEVICE)
+#define HWLOC_OBJ_TYPE_MAX HWLOC_NAME_CAPS(OBJ_TYPE_MAX)
+#define hwloc_obj_type_t HWLOC_NAME(obj_type_t)
+
+#define hwloc_obj_cache_type_e HWLOC_NAME(obj_cache_type_e)
+#define hwloc_obj_cache_type_t HWLOC_NAME(obj_cache_type_t)
+#define HWLOC_OBJ_CACHE_UNIFIED HWLOC_NAME_CAPS(OBJ_CACHE_UNIFIED)
+#define HWLOC_OBJ_CACHE_DATA HWLOC_NAME_CAPS(OBJ_CACHE_DATA)
+#define HWLOC_OBJ_CACHE_INSTRUCTION HWLOC_NAME_CAPS(OBJ_CACHE_INSTRUCTION)
+
+#define hwloc_obj_bridge_type_e HWLOC_NAME(obj_bridge_type_e)
+#define hwloc_obj_bridge_type_t HWLOC_NAME(obj_bridge_type_t)
+#define HWLOC_OBJ_BRIDGE_HOST HWLOC_NAME_CAPS(OBJ_BRIDGE_HOST)
+#define HWLOC_OBJ_BRIDGE_PCI HWLOC_NAME_CAPS(OBJ_BRIDGE_PCI)
+
+#define hwloc_obj_osdev_type_e HWLOC_NAME(obj_osdev_type_e)
+#define hwloc_obj_osdev_type_t HWLOC_NAME(obj_osdev_type_t)
+#define HWLOC_OBJ_OSDEV_BLOCK HWLOC_NAME_CAPS(OBJ_OSDEV_BLOCK)
+#define HWLOC_OBJ_OSDEV_GPU HWLOC_NAME_CAPS(OBJ_OSDEV_GPU)
+#define HWLOC_OBJ_OSDEV_NETWORK HWLOC_NAME_CAPS(OBJ_OSDEV_NETWORK)
+#define HWLOC_OBJ_OSDEV_OPENFABRICS HWLOC_NAME_CAPS(OBJ_OSDEV_OPENFABRICS)
+#define HWLOC_OBJ_OSDEV_DMA HWLOC_NAME_CAPS(OBJ_OSDEV_DMA)
+#define HWLOC_OBJ_OSDEV_COPROC HWLOC_NAME_CAPS(OBJ_OSDEV_COPROC)
+
+#define hwloc_compare_types HWLOC_NAME(compare_types)
+
+#define hwloc_compare_types_e HWLOC_NAME(compare_types_e)
+#define HWLOC_TYPE_UNORDERED HWLOC_NAME_CAPS(TYPE_UNORDERED)
+
+#define hwloc_obj HWLOC_NAME(obj)
+#define hwloc_obj_t HWLOC_NAME(obj_t)
+
+#define hwloc_info_s HWLOC_NAME(info_s)
+
+#define hwloc_obj_attr_u HWLOC_NAME(obj_attr_u)
+#define hwloc_numanode_attr_s HWLOC_NAME(numanode_attr_s)
+#define hwloc_memory_page_type_s HWLOC_NAME(memory_page_type_s)
+#define hwloc_cache_attr_s HWLOC_NAME(cache_attr_s)
+#define hwloc_group_attr_s HWLOC_NAME(group_attr_s)
+#define hwloc_pcidev_attr_s HWLOC_NAME(pcidev_attr_s)
+#define hwloc_bridge_attr_s HWLOC_NAME(bridge_attr_s)
+#define hwloc_osdev_attr_s HWLOC_NAME(osdev_attr_s)
+
+#define hwloc_topology_init HWLOC_NAME(topology_init)
+#define hwloc_topology_load HWLOC_NAME(topology_load)
+#define hwloc_topology_destroy HWLOC_NAME(topology_destroy)
+#define hwloc_topology_dup HWLOC_NAME(topology_dup)
+#define hwloc_topology_abi_check HWLOC_NAME(topology_abi_check)
+#define hwloc_topology_check HWLOC_NAME(topology_check)
+
+#define hwloc_topology_flags_e HWLOC_NAME(topology_flags_e)
+
+#define HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WHOLE_SYSTEM)
+#define HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IS_THISSYSTEM)
+#define HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES)
+
+#define hwloc_topology_set_pid HWLOC_NAME(topology_set_pid)
+#define hwloc_topology_set_synthetic HWLOC_NAME(topology_set_synthetic)
+#define hwloc_topology_set_xml HWLOC_NAME(topology_set_xml)
+#define hwloc_topology_set_xmlbuffer HWLOC_NAME(topology_set_xmlbuffer)
+
+#define hwloc_topology_set_flags HWLOC_NAME(topology_set_flags)
+#define hwloc_topology_is_thissystem HWLOC_NAME(topology_is_thissystem)
+#define hwloc_topology_get_flags HWLOC_NAME(topology_get_flags)
+#define hwloc_topology_discovery_support HWLOC_NAME(topology_discovery_support)
+#define hwloc_topology_cpubind_support HWLOC_NAME(topology_cpubind_support)
+#define hwloc_topology_membind_support HWLOC_NAME(topology_membind_support)
+#define hwloc_topology_support HWLOC_NAME(topology_support)
+#define hwloc_topology_get_support HWLOC_NAME(topology_get_support)
+
+#define hwloc_type_filter_e HWLOC_NAME(type_filter_e)
+#define HWLOC_TYPE_FILTER_KEEP_ALL HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_ALL)
+#define HWLOC_TYPE_FILTER_KEEP_NONE HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_NONE)
+#define HWLOC_TYPE_FILTER_KEEP_STRUCTURE HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_STRUCTURE)
+#define HWLOC_TYPE_FILTER_KEEP_IMPORTANT HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_IMPORTANT)
+#define hwloc_topology_set_type_filter HWLOC_NAME(topology_set_type_filter)
+#define hwloc_topology_get_type_filter HWLOC_NAME(topology_get_type_filter)
+#define hwloc_topology_set_all_types_filter HWLOC_NAME(topology_set_all_types_filter)
+#define hwloc_topology_set_cache_types_filter HWLOC_NAME(topology_set_cache_types_filter)
+#define hwloc_topology_set_icache_types_filter HWLOC_NAME(topology_set_icache_types_filter)
+#define hwloc_topology_set_io_types_filter HWLOC_NAME(topology_set_io_types_filter)
+
+#define hwloc_topology_set_userdata HWLOC_NAME(topology_set_userdata)
+#define hwloc_topology_get_userdata HWLOC_NAME(topology_get_userdata)
+
+#define hwloc_restrict_flags_e HWLOC_NAME(restrict_flags_e)
+#define HWLOC_RESTRICT_FLAG_REMOVE_CPULESS HWLOC_NAME_CAPS(RESTRICT_FLAG_REMOVE_CPULESS)
+#define HWLOC_RESTRICT_FLAG_ADAPT_MISC HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_MISC)
+#define HWLOC_RESTRICT_FLAG_ADAPT_IO HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_IO)
+#define hwloc_topology_restrict HWLOC_NAME(topology_restrict)
+
+#define hwloc_topology_insert_misc_object HWLOC_NAME(topology_insert_misc_object)
+#define hwloc_topology_alloc_group_object HWLOC_NAME(topology_alloc_group_object)
+#define hwloc_topology_insert_group_object HWLOC_NAME(topology_insert_group_object)
+#define hwloc_obj_add_other_obj_sets HWLOC_NAME(obj_add_other_obj_sets)
+
+#define hwloc_topology_get_depth HWLOC_NAME(topology_get_depth)
+#define hwloc_get_type_depth HWLOC_NAME(get_type_depth)
+#define hwloc_get_memory_parents_depth HWLOC_NAME(get_memory_parents_depth)
+
+#define hwloc_get_type_depth_e HWLOC_NAME(get_type_depth_e)
+#define HWLOC_TYPE_DEPTH_UNKNOWN HWLOC_NAME_CAPS(TYPE_DEPTH_UNKNOWN)
+#define HWLOC_TYPE_DEPTH_MULTIPLE HWLOC_NAME_CAPS(TYPE_DEPTH_MULTIPLE)
+#define HWLOC_TYPE_DEPTH_BRIDGE HWLOC_NAME_CAPS(TYPE_DEPTH_BRIDGE)
+#define HWLOC_TYPE_DEPTH_PCI_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_PCI_DEVICE)
+#define HWLOC_TYPE_DEPTH_OS_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_OS_DEVICE)
+#define HWLOC_TYPE_DEPTH_MISC HWLOC_NAME_CAPS(TYPE_DEPTH_MISC)
+#define HWLOC_TYPE_DEPTH_NUMANODE HWLOC_NAME_CAPS(TYPE_DEPTH_NUMANODE)
+
+#define hwloc_get_depth_type HWLOC_NAME(get_depth_type)
+#define hwloc_get_nbobjs_by_depth HWLOC_NAME(get_nbobjs_by_depth)
+#define hwloc_get_nbobjs_by_type HWLOC_NAME(get_nbobjs_by_type)
+
+#define hwloc_get_obj_by_depth HWLOC_NAME(get_obj_by_depth )
+#define hwloc_get_obj_by_type HWLOC_NAME(get_obj_by_type )
+
+#define hwloc_obj_type_string HWLOC_NAME(obj_type_string )
+#define hwloc_obj_type_snprintf HWLOC_NAME(obj_type_snprintf )
+#define hwloc_obj_attr_snprintf HWLOC_NAME(obj_attr_snprintf )
+#define hwloc_type_sscanf HWLOC_NAME(type_sscanf)
+#define hwloc_type_sscanf_as_depth HWLOC_NAME(type_sscanf_as_depth)
+
+#define hwloc_obj_get_info_by_name HWLOC_NAME(obj_get_info_by_name)
+#define hwloc_obj_add_info HWLOC_NAME(obj_add_info)
+
+#define HWLOC_CPUBIND_PROCESS HWLOC_NAME_CAPS(CPUBIND_PROCESS)
+#define HWLOC_CPUBIND_THREAD HWLOC_NAME_CAPS(CPUBIND_THREAD)
+#define HWLOC_CPUBIND_STRICT HWLOC_NAME_CAPS(CPUBIND_STRICT)
+#define HWLOC_CPUBIND_NOMEMBIND HWLOC_NAME_CAPS(CPUBIND_NOMEMBIND)
+
+#define hwloc_cpubind_flags_t HWLOC_NAME(cpubind_flags_t)
+
+#define hwloc_set_cpubind HWLOC_NAME(set_cpubind)
+#define hwloc_get_cpubind HWLOC_NAME(get_cpubind)
+#define hwloc_set_proc_cpubind HWLOC_NAME(set_proc_cpubind)
+#define hwloc_get_proc_cpubind HWLOC_NAME(get_proc_cpubind)
+#define hwloc_set_thread_cpubind HWLOC_NAME(set_thread_cpubind)
+#define hwloc_get_thread_cpubind HWLOC_NAME(get_thread_cpubind)
+
+#define hwloc_get_last_cpu_location HWLOC_NAME(get_last_cpu_location)
+#define hwloc_get_proc_last_cpu_location HWLOC_NAME(get_proc_last_cpu_location)
+
+#define HWLOC_MEMBIND_DEFAULT HWLOC_NAME_CAPS(MEMBIND_DEFAULT)
+#define HWLOC_MEMBIND_FIRSTTOUCH HWLOC_NAME_CAPS(MEMBIND_FIRSTTOUCH)
+#define HWLOC_MEMBIND_BIND HWLOC_NAME_CAPS(MEMBIND_BIND)
+#define HWLOC_MEMBIND_INTERLEAVE HWLOC_NAME_CAPS(MEMBIND_INTERLEAVE)
+#define HWLOC_MEMBIND_NEXTTOUCH HWLOC_NAME_CAPS(MEMBIND_NEXTTOUCH)
+#define HWLOC_MEMBIND_MIXED HWLOC_NAME_CAPS(MEMBIND_MIXED)
+
+#define hwloc_membind_policy_t HWLOC_NAME(membind_policy_t)
+
+#define HWLOC_MEMBIND_PROCESS HWLOC_NAME_CAPS(MEMBIND_PROCESS)
+#define HWLOC_MEMBIND_THREAD HWLOC_NAME_CAPS(MEMBIND_THREAD)
+#define HWLOC_MEMBIND_STRICT HWLOC_NAME_CAPS(MEMBIND_STRICT)
+#define HWLOC_MEMBIND_MIGRATE HWLOC_NAME_CAPS(MEMBIND_MIGRATE)
+#define HWLOC_MEMBIND_NOCPUBIND HWLOC_NAME_CAPS(MEMBIND_NOCPUBIND)
+#define HWLOC_MEMBIND_BYNODESET HWLOC_NAME_CAPS(MEMBIND_BYNODESET)
+
+#define hwloc_membind_flags_t HWLOC_NAME(membind_flags_t)
+
+#define hwloc_set_membind HWLOC_NAME(set_membind)
+#define hwloc_get_membind HWLOC_NAME(get_membind)
+#define hwloc_set_proc_membind HWLOC_NAME(set_proc_membind)
+#define hwloc_get_proc_membind HWLOC_NAME(get_proc_membind)
+#define hwloc_set_area_membind HWLOC_NAME(set_area_membind)
+#define hwloc_get_area_membind HWLOC_NAME(get_area_membind)
+#define hwloc_get_area_memlocation HWLOC_NAME(get_area_memlocation)
+#define hwloc_alloc_membind HWLOC_NAME(alloc_membind)
+#define hwloc_alloc HWLOC_NAME(alloc)
+#define hwloc_free HWLOC_NAME(free)
+
+#define hwloc_get_non_io_ancestor_obj HWLOC_NAME(get_non_io_ancestor_obj)
+#define hwloc_get_next_pcidev HWLOC_NAME(get_next_pcidev)
+#define hwloc_get_pcidev_by_busid HWLOC_NAME(get_pcidev_by_busid)
+#define hwloc_get_pcidev_by_busidstring HWLOC_NAME(get_pcidev_by_busidstring)
+#define hwloc_get_next_osdev HWLOC_NAME(get_next_osdev)
+#define hwloc_get_next_bridge HWLOC_NAME(get_next_bridge)
+#define hwloc_bridge_covers_pcibus HWLOC_NAME(bridge_covers_pcibus)
+
+/* hwloc/bitmap.h */
+
+#define hwloc_bitmap_s HWLOC_NAME(bitmap_s)
+#define hwloc_bitmap_t HWLOC_NAME(bitmap_t)
+#define hwloc_const_bitmap_t HWLOC_NAME(const_bitmap_t)
+
+#define hwloc_bitmap_alloc HWLOC_NAME(bitmap_alloc)
+#define hwloc_bitmap_alloc_full HWLOC_NAME(bitmap_alloc_full)
+#define hwloc_bitmap_free HWLOC_NAME(bitmap_free)
+#define hwloc_bitmap_dup HWLOC_NAME(bitmap_dup)
+#define hwloc_bitmap_copy HWLOC_NAME(bitmap_copy)
+#define hwloc_bitmap_snprintf HWLOC_NAME(bitmap_snprintf)
+#define hwloc_bitmap_asprintf HWLOC_NAME(bitmap_asprintf)
+#define hwloc_bitmap_sscanf HWLOC_NAME(bitmap_sscanf)
+#define hwloc_bitmap_list_snprintf HWLOC_NAME(bitmap_list_snprintf)
+#define hwloc_bitmap_list_asprintf HWLOC_NAME(bitmap_list_asprintf)
+#define hwloc_bitmap_list_sscanf HWLOC_NAME(bitmap_list_sscanf)
+#define hwloc_bitmap_taskset_snprintf HWLOC_NAME(bitmap_taskset_snprintf)
+#define hwloc_bitmap_taskset_asprintf HWLOC_NAME(bitmap_taskset_asprintf)
+#define hwloc_bitmap_taskset_sscanf HWLOC_NAME(bitmap_taskset_sscanf)
+#define hwloc_bitmap_zero HWLOC_NAME(bitmap_zero)
+#define hwloc_bitmap_fill HWLOC_NAME(bitmap_fill)
+#define hwloc_bitmap_from_ulong HWLOC_NAME(bitmap_from_ulong)
+
+#define hwloc_bitmap_from_ith_ulong HWLOC_NAME(bitmap_from_ith_ulong)
+#define hwloc_bitmap_to_ulong HWLOC_NAME(bitmap_to_ulong)
+#define hwloc_bitmap_to_ith_ulong HWLOC_NAME(bitmap_to_ith_ulong)
+#define hwloc_bitmap_only HWLOC_NAME(bitmap_only)
+#define hwloc_bitmap_allbut HWLOC_NAME(bitmap_allbut)
+#define hwloc_bitmap_set HWLOC_NAME(bitmap_set)
+#define hwloc_bitmap_set_range HWLOC_NAME(bitmap_set_range)
+#define hwloc_bitmap_set_ith_ulong HWLOC_NAME(bitmap_set_ith_ulong)
+#define hwloc_bitmap_clr HWLOC_NAME(bitmap_clr)
+#define hwloc_bitmap_clr_range HWLOC_NAME(bitmap_clr_range)
+#define hwloc_bitmap_isset HWLOC_NAME(bitmap_isset)
+#define hwloc_bitmap_iszero HWLOC_NAME(bitmap_iszero)
+#define hwloc_bitmap_isfull HWLOC_NAME(bitmap_isfull)
+#define hwloc_bitmap_isequal HWLOC_NAME(bitmap_isequal)
+#define hwloc_bitmap_intersects HWLOC_NAME(bitmap_intersects)
+#define hwloc_bitmap_isincluded HWLOC_NAME(bitmap_isincluded)
+#define hwloc_bitmap_or HWLOC_NAME(bitmap_or)
+#define hwloc_bitmap_and HWLOC_NAME(bitmap_and)
+#define hwloc_bitmap_andnot HWLOC_NAME(bitmap_andnot)
+#define hwloc_bitmap_xor HWLOC_NAME(bitmap_xor)
+#define hwloc_bitmap_not HWLOC_NAME(bitmap_not)
+#define hwloc_bitmap_first HWLOC_NAME(bitmap_first)
+#define hwloc_bitmap_last HWLOC_NAME(bitmap_last)
+#define hwloc_bitmap_next HWLOC_NAME(bitmap_next)
+#define hwloc_bitmap_first_unset HWLOC_NAME(bitmap_first_unset)
+#define hwloc_bitmap_last_unset HWLOC_NAME(bitmap_last_unset)
+#define hwloc_bitmap_next_unset HWLOC_NAME(bitmap_next_unset)
+#define hwloc_bitmap_singlify HWLOC_NAME(bitmap_singlify)
+#define hwloc_bitmap_compare_first HWLOC_NAME(bitmap_compare_first)
+#define hwloc_bitmap_compare HWLOC_NAME(bitmap_compare)
+#define hwloc_bitmap_weight HWLOC_NAME(bitmap_weight)
+
+/* hwloc/helper.h */
+
+#define hwloc_get_type_or_below_depth HWLOC_NAME(get_type_or_below_depth)
+#define hwloc_get_type_or_above_depth HWLOC_NAME(get_type_or_above_depth)
+#define hwloc_get_root_obj HWLOC_NAME(get_root_obj)
+#define hwloc_get_ancestor_obj_by_depth HWLOC_NAME(get_ancestor_obj_by_depth)
+#define hwloc_get_ancestor_obj_by_type HWLOC_NAME(get_ancestor_obj_by_type)
+#define hwloc_get_next_obj_by_depth HWLOC_NAME(get_next_obj_by_depth)
+#define hwloc_get_next_obj_by_type HWLOC_NAME(get_next_obj_by_type)
+#define hwloc_get_pu_obj_by_os_index HWLOC_NAME(get_pu_obj_by_os_index)
+#define hwloc_get_numanode_obj_by_os_index HWLOC_NAME(get_numanode_obj_by_os_index)
+#define hwloc_get_next_child HWLOC_NAME(get_next_child)
+#define hwloc_get_common_ancestor_obj HWLOC_NAME(get_common_ancestor_obj)
+#define hwloc_obj_is_in_subtree HWLOC_NAME(obj_is_in_subtree)
+#define hwloc_get_first_largest_obj_inside_cpuset HWLOC_NAME(get_first_largest_obj_inside_cpuset)
+#define hwloc_get_largest_objs_inside_cpuset HWLOC_NAME(get_largest_objs_inside_cpuset)
+#define hwloc_get_next_obj_inside_cpuset_by_depth HWLOC_NAME(get_next_obj_inside_cpuset_by_depth)
+#define hwloc_get_next_obj_inside_cpuset_by_type HWLOC_NAME(get_next_obj_inside_cpuset_by_type)
+#define hwloc_get_obj_inside_cpuset_by_depth HWLOC_NAME(get_obj_inside_cpuset_by_depth)
+#define hwloc_get_obj_inside_cpuset_by_type HWLOC_NAME(get_obj_inside_cpuset_by_type)
+#define hwloc_get_nbobjs_inside_cpuset_by_depth HWLOC_NAME(get_nbobjs_inside_cpuset_by_depth)
+#define hwloc_get_nbobjs_inside_cpuset_by_type HWLOC_NAME(get_nbobjs_inside_cpuset_by_type)
+#define hwloc_get_obj_index_inside_cpuset HWLOC_NAME(get_obj_index_inside_cpuset)
+#define hwloc_get_child_covering_cpuset HWLOC_NAME(get_child_covering_cpuset)
+#define hwloc_get_obj_covering_cpuset HWLOC_NAME(get_obj_covering_cpuset)
+#define hwloc_get_next_obj_covering_cpuset_by_depth HWLOC_NAME(get_next_obj_covering_cpuset_by_depth)
+#define hwloc_get_next_obj_covering_cpuset_by_type HWLOC_NAME(get_next_obj_covering_cpuset_by_type)
+#define hwloc_obj_type_is_normal HWLOC_NAME(obj_type_is_normal)
+#define hwloc_obj_type_is_memory HWLOC_NAME(obj_type_is_memory)
+#define hwloc_obj_type_is_io HWLOC_NAME(obj_type_is_io)
+#define hwloc_obj_type_is_cache HWLOC_NAME(obj_type_is_cache)
+#define hwloc_obj_type_is_dcache HWLOC_NAME(obj_type_is_dcache)
+#define hwloc_obj_type_is_icache HWLOC_NAME(obj_type_is_icache)
+#define hwloc_get_cache_type_depth HWLOC_NAME(get_cache_type_depth)
+#define hwloc_get_cache_covering_cpuset HWLOC_NAME(get_cache_covering_cpuset)
+#define hwloc_get_shared_cache_covering_obj HWLOC_NAME(get_shared_cache_covering_obj)
+#define hwloc_get_closest_objs HWLOC_NAME(get_closest_objs)
+#define hwloc_get_obj_below_by_type HWLOC_NAME(get_obj_below_by_type)
+#define hwloc_get_obj_below_array_by_type HWLOC_NAME(get_obj_below_array_by_type)
+#define hwloc_distrib_flags_e HWLOC_NAME(distrib_flags_e)
+#define HWLOC_DISTRIB_FLAG_REVERSE HWLOC_NAME_CAPS(DISTRIB_FLAG_REVERSE)
+#define hwloc_distrib HWLOC_NAME(distrib)
+#define hwloc_alloc_membind_policy HWLOC_NAME(alloc_membind_policy)
+#define hwloc_alloc_membind_policy_nodeset HWLOC_NAME(alloc_membind_policy_nodeset)
+#define hwloc_topology_get_complete_cpuset HWLOC_NAME(topology_get_complete_cpuset)
+#define hwloc_topology_get_topology_cpuset HWLOC_NAME(topology_get_topology_cpuset)
+#define hwloc_topology_get_allowed_cpuset HWLOC_NAME(topology_get_allowed_cpuset)
+#define hwloc_topology_get_complete_nodeset HWLOC_NAME(topology_get_complete_nodeset)
+#define hwloc_topology_get_topology_nodeset HWLOC_NAME(topology_get_topology_nodeset)
+#define hwloc_topology_get_allowed_nodeset HWLOC_NAME(topology_get_allowed_nodeset)
+#define hwloc_cpuset_to_nodeset HWLOC_NAME(cpuset_to_nodeset)
+#define hwloc_cpuset_from_nodeset HWLOC_NAME(cpuset_from_nodeset)
+
+/* export.h */
+
+#define hwloc_topology_export_xml_flags_e HWLOC_NAME(topology_export_xml_flags_e)
+#define HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_XML_FLAG_V1)
+#define hwloc_topology_export_xml HWLOC_NAME(topology_export_xml)
+#define hwloc_topology_export_xmlbuffer HWLOC_NAME(topology_export_xmlbuffer)
+#define hwloc_free_xmlbuffer HWLOC_NAME(free_xmlbuffer)
+#define hwloc_topology_set_userdata_export_callback HWLOC_NAME(topology_set_userdata_export_callback)
+#define hwloc_export_obj_userdata HWLOC_NAME(export_obj_userdata)
+#define hwloc_export_obj_userdata_base64 HWLOC_NAME(export_obj_userdata_base64)
+#define hwloc_topology_set_userdata_import_callback HWLOC_NAME(topology_set_userdata_import_callback)
+
+#define hwloc_topology_export_synthetic_flags_e HWLOC_NAME(topology_export_synthetic_flags_e)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1 HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)
+#define hwloc_topology_export_synthetic HWLOC_NAME(topology_export_synthetic)
+
+/* distances.h */
+
+#define hwloc_distances_s HWLOC_NAME(distances_s)
+
+#define hwloc_distances_kind_e HWLOC_NAME(distances_kind_e)
+#define HWLOC_DISTANCES_KIND_FROM_OS HWLOC_NAME_CAPS(DISTANCES_KIND_FROM_OS)
+#define HWLOC_DISTANCES_KIND_FROM_USER HWLOC_NAME_CAPS(DISTANCES_KIND_FROM_USER)
+#define HWLOC_DISTANCES_KIND_MEANS_LATENCY HWLOC_NAME_CAPS(DISTANCES_KIND_MEANS_LATENCY)
+#define HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH HWLOC_NAME_CAPS(DISTANCES_KIND_MEANS_BANDWIDTH)
+
+#define hwloc_distances_get HWLOC_NAME(distances_get)
+#define hwloc_distances_get_by_depth HWLOC_NAME(distances_get_by_depth)
+#define hwloc_distances_get_by_type HWLOC_NAME(distances_get_by_type)
+#define hwloc_distances_release HWLOC_NAME(distances_release)
+#define hwloc_distances_obj_index HWLOC_NAME(distances_obj_index)
+#define hwloc_distances_obj_pair_values HWLOC_NAME(distances_pair_values)
+
+#define hwloc_distances_add_flag_e HWLOC_NAME(distances_add_flag_e)
+#define HWLOC_DISTANCES_ADD_FLAG_GROUP HWLOC_NAME_CAPS(DISTANCES_ADD_FLAG_GROUP)
+#define HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE HWLOC_NAME_CAPS(DISTANCES_ADD_FLAG_GROUP_INACCURATE)
+
+#define hwloc_distances_add HWLOC_NAME(distances_add)
+#define hwloc_distances_remove HWLOC_NAME(distances_remove)
+#define hwloc_distances_remove_by_depth HWLOC_NAME(distances_remove_by_depth)
+#define hwloc_distances_remove_by_type HWLOC_NAME(distances_remove_by_type)
+
+/* diff.h */
+
+#define hwloc_topology_diff_obj_attr_type_e HWLOC_NAME(topology_diff_obj_attr_type_e)
+#define hwloc_topology_diff_obj_attr_type_t HWLOC_NAME(topology_diff_obj_attr_type_t)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_SIZE)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_NAME)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_INFO)
+#define hwloc_topology_diff_obj_attr_u HWLOC_NAME(topology_diff_obj_attr_u)
+#define hwloc_topology_diff_obj_attr_generic_s HWLOC_NAME(topology_diff_obj_attr_generic_s)
+#define hwloc_topology_diff_obj_attr_uint64_s HWLOC_NAME(topology_diff_obj_attr_uint64_s)
+#define hwloc_topology_diff_obj_attr_string_s HWLOC_NAME(topology_diff_obj_attr_string_s)
+#define hwloc_topology_diff_type_e HWLOC_NAME(topology_diff_type_e)
+#define hwloc_topology_diff_type_t HWLOC_NAME(topology_diff_type_t)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR)
+#define HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX HWLOC_NAME_CAPS(TOPOLOGY_DIFF_TOO_COMPLEX)
+#define hwloc_topology_diff_u HWLOC_NAME(topology_diff_u)
+#define hwloc_topology_diff_t HWLOC_NAME(topology_diff_t)
+#define hwloc_topology_diff_generic_s HWLOC_NAME(topology_diff_generic_s)
+#define hwloc_topology_diff_obj_attr_s HWLOC_NAME(topology_diff_obj_attr_s)
+#define hwloc_topology_diff_too_complex_s HWLOC_NAME(topology_diff_too_complex_s)
+#define hwloc_topology_diff_build HWLOC_NAME(topology_diff_build)
+#define hwloc_topology_diff_apply_flags_e HWLOC_NAME(topology_diff_apply_flags_e)
+#define HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_APPLY_REVERSE)
+#define hwloc_topology_diff_apply HWLOC_NAME(topology_diff_apply)
+#define hwloc_topology_diff_destroy HWLOC_NAME(topology_diff_destroy)
+#define hwloc_topology_diff_load_xml HWLOC_NAME(topology_diff_load_xml)
+#define hwloc_topology_diff_export_xml HWLOC_NAME(topology_diff_export_xml)
+#define hwloc_topology_diff_load_xmlbuffer HWLOC_NAME(topology_diff_load_xmlbuffer)
+#define hwloc_topology_diff_export_xmlbuffer HWLOC_NAME(topology_diff_export_xmlbuffer)
+
+/* shmem.h */
+
+#define hwloc_shmem_topology_get_length HWLOC_NAME(shmem_topology_get_length)
+#define hwloc_shmem_topology_write HWLOC_NAME(shmem_topology_write)
+#define hwloc_shmem_topology_adopt HWLOC_NAME(shmem_topology_adopt)
+
+/* glibc-sched.h */
+
+#define hwloc_cpuset_to_glibc_sched_affinity HWLOC_NAME(cpuset_to_glibc_sched_affinity)
+#define hwloc_cpuset_from_glibc_sched_affinity HWLOC_NAME(cpuset_from_glibc_sched_affinity)
+
+/* linux-libnuma.h */
+
+#define hwloc_cpuset_to_linux_libnuma_ulongs HWLOC_NAME(cpuset_to_linux_libnuma_ulongs)
+#define hwloc_nodeset_to_linux_libnuma_ulongs HWLOC_NAME(nodeset_to_linux_libnuma_ulongs)
+#define hwloc_cpuset_from_linux_libnuma_ulongs HWLOC_NAME(cpuset_from_linux_libnuma_ulongs)
+#define hwloc_nodeset_from_linux_libnuma_ulongs HWLOC_NAME(nodeset_from_linux_libnuma_ulongs)
+#define hwloc_cpuset_to_linux_libnuma_bitmask HWLOC_NAME(cpuset_to_linux_libnuma_bitmask)
+#define hwloc_nodeset_to_linux_libnuma_bitmask HWLOC_NAME(nodeset_to_linux_libnuma_bitmask)
+#define hwloc_cpuset_from_linux_libnuma_bitmask HWLOC_NAME(cpuset_from_linux_libnuma_bitmask)
+#define hwloc_nodeset_from_linux_libnuma_bitmask HWLOC_NAME(nodeset_from_linux_libnuma_bitmask)
+
+/* linux.h */
+
+#define hwloc_linux_set_tid_cpubind HWLOC_NAME(linux_set_tid_cpubind)
+#define hwloc_linux_get_tid_cpubind HWLOC_NAME(linux_get_tid_cpubind)
+#define hwloc_linux_get_tid_last_cpu_location HWLOC_NAME(linux_get_tid_last_cpu_location)
+#define hwloc_linux_read_path_as_cpumask HWLOC_NAME(linux_read_file_cpumask)
+
+/* openfabrics-verbs.h */
+
+#define hwloc_ibv_get_device_cpuset HWLOC_NAME(ibv_get_device_cpuset)
+#define hwloc_ibv_get_device_osdev HWLOC_NAME(ibv_get_device_osdev)
+#define hwloc_ibv_get_device_osdev_by_name HWLOC_NAME(ibv_get_device_osdev_by_name)
+
+/* intel-mic.h */
+
+#define hwloc_intel_mic_get_device_cpuset HWLOC_NAME(intel_mic_get_device_cpuset)
+#define hwloc_intel_mic_get_device_osdev_by_index HWLOC_NAME(intel_mic_get_device_osdev_by_index)
+
+/* opencl.h */
+
+#define hwloc_opencl_get_device_cpuset HWLOC_NAME(opencl_get_device_cpuset)
+#define hwloc_opencl_get_device_osdev HWLOC_NAME(opencl_get_device_osdev)
+#define hwloc_opencl_get_device_osdev_by_index HWLOC_NAME(opencl_get_device_osdev_by_index)
+
+/* cuda.h */
+
+#define hwloc_cuda_get_device_pci_ids HWLOC_NAME(cuda_get_device_pci_ids)
+#define hwloc_cuda_get_device_cpuset HWLOC_NAME(cuda_get_device_cpuset)
+#define hwloc_cuda_get_device_pcidev HWLOC_NAME(cuda_get_device_pcidev)
+#define hwloc_cuda_get_device_osdev HWLOC_NAME(cuda_get_device_osdev)
+#define hwloc_cuda_get_device_osdev_by_index HWLOC_NAME(cuda_get_device_osdev_by_index)
+
+/* cudart.h */
+
+#define hwloc_cudart_get_device_pci_ids HWLOC_NAME(cudart_get_device_pci_ids)
+#define hwloc_cudart_get_device_cpuset HWLOC_NAME(cudart_get_device_cpuset)
+#define hwloc_cudart_get_device_pcidev HWLOC_NAME(cudart_get_device_pcidev)
+#define hwloc_cudart_get_device_osdev_by_index HWLOC_NAME(cudart_get_device_osdev_by_index)
+
+/* nvml.h */
+
+#define hwloc_nvml_get_device_cpuset HWLOC_NAME(nvml_get_device_cpuset)
+#define hwloc_nvml_get_device_osdev HWLOC_NAME(nvml_get_device_osdev)
+#define hwloc_nvml_get_device_osdev_by_index HWLOC_NAME(nvml_get_device_osdev_by_index)
+
+/* gl.h */
+
+#define hwloc_gl_get_display_osdev_by_port_device HWLOC_NAME(gl_get_display_osdev_by_port_device)
+#define hwloc_gl_get_display_osdev_by_name HWLOC_NAME(gl_get_display_osdev_by_name)
+#define hwloc_gl_get_display_by_osdev HWLOC_NAME(gl_get_display_by_osdev)
+
+/* hwloc/plugins.h */
+
+#define hwloc_disc_component_type_e HWLOC_NAME(disc_component_type_e)
+#define HWLOC_DISC_COMPONENT_TYPE_CPU HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_CPU)
+#define HWLOC_DISC_COMPONENT_TYPE_GLOBAL HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_GLOBAL)
+#define HWLOC_DISC_COMPONENT_TYPE_MISC HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_MISC)
+#define hwloc_disc_component_type_t HWLOC_NAME(disc_component_type_t)
+#define hwloc_disc_component HWLOC_NAME(disc_component)
+
+#define hwloc_backend HWLOC_NAME(backend)
+
+#define hwloc_backend_alloc HWLOC_NAME(backend_alloc)
+#define hwloc_backend_enable HWLOC_NAME(backend_enable)
+
+#define hwloc_component_type_e HWLOC_NAME(component_type_e)
+#define HWLOC_COMPONENT_TYPE_DISC HWLOC_NAME_CAPS(COMPONENT_TYPE_DISC)
+#define HWLOC_COMPONENT_TYPE_XML HWLOC_NAME_CAPS(COMPONENT_TYPE_XML)
+#define hwloc_component_type_t HWLOC_NAME(component_type_t)
+#define hwloc_component HWLOC_NAME(component)
+
+#define hwloc_plugin_check_namespace HWLOC_NAME(plugin_check_namespace)
+
+#define hwloc_insert_object_by_cpuset HWLOC_NAME(insert_object_by_cpuset)
+#define hwloc_report_error_t HWLOC_NAME(report_error_t)
+#define hwloc_report_os_error HWLOC_NAME(report_os_error)
+#define hwloc_hide_errors HWLOC_NAME(hide_errors)
+#define hwloc__insert_object_by_cpuset HWLOC_NAME(_insert_object_by_cpuset)
+#define hwloc_insert_object_by_parent HWLOC_NAME(insert_object_by_parent)
+#define hwloc_alloc_setup_object HWLOC_NAME(alloc_setup_object)
+#define hwloc_obj_add_children_sets HWLOC_NAME(add_children_sets)
+#define hwloc_topology_reconnect HWLOC_NAME(topology_reconnect)
+
+#define hwloc_filter_check_pcidev_subtype_important HWLOC_NAME(filter_check_pcidev_subtype_important)
+#define hwloc_filter_check_osdev_subtype_important HWLOC_NAME(filter_check_osdev_subtype_important)
+#define hwloc_filter_check_keep_object_type HWLOC_NAME(filter_check_keep_object_type)
+#define hwloc_filter_check_keep_object HWLOC_NAME(filter_check_keep_object)
+
+#define hwloc_pcidisc_find_cap HWLOC_NAME(pcidisc_find_cap)
+#define hwloc_pcidisc_find_linkspeed HWLOC_NAME(pcidisc_find_linkspeed)
+#define hwloc_pcidisc_check_bridge_type HWLOC_NAME(pcidisc_check_bridge_type)
+#define hwloc_pcidisc_setup_bridge_attr HWLOC_NAME(pcidisc_setup_bridge_attr)
+#define hwloc_pcidisc_tree_insert_by_busid HWLOC_NAME(pcidisc_tree_insert_by_busid)
+#define hwloc_pcidisc_tree_attach HWLOC_NAME(pcidisc_tree_attach)
+
+#define hwloc_pcidisc_find_by_busid HWLOC_NAME(pcidisc_find_by_busid)
+#define hwloc_pcidisc_find_busid_parent HWLOC_NAME(pcidisc_find_busid_parent)
+
+/* hwloc/deprecated.h */
+
+#define hwloc_topology_insert_misc_object_by_parent HWLOC_NAME(topology_insert_misc_object_by_parent)
+#define hwloc_obj_cpuset_snprintf HWLOC_NAME(obj_cpuset_snprintf)
+#define hwloc_obj_type_sscanf HWLOC_NAME(obj_type_sscanf)
+
+#define hwloc_set_membind_nodeset HWLOC_NAME(set_membind_nodeset)
+#define hwloc_get_membind_nodeset HWLOC_NAME(get_membind_nodeset)
+#define hwloc_set_proc_membind_nodeset HWLOC_NAME(set_proc_membind_nodeset)
+#define hwloc_get_proc_membind_nodeset HWLOC_NAME(get_proc_membind_nodeset)
+#define hwloc_set_area_membind_nodeset HWLOC_NAME(set_area_membind_nodeset)
+#define hwloc_get_area_membind_nodeset HWLOC_NAME(get_area_membind_nodeset)
+#define hwloc_alloc_membind_nodeset HWLOC_NAME(alloc_membind_nodeset)
+
+#define hwloc_cpuset_to_nodeset_strict HWLOC_NAME(cpuset_to_nodeset_strict)
+#define hwloc_cpuset_from_nodeset_strict HWLOC_NAME(cpuset_from_nodeset_strict)
+
+/* private/debug.h */
+
+#define hwloc_debug_enabled HWLOC_NAME(debug_enabled)
+#define hwloc_debug HWLOC_NAME(debug)
+
+/* private/misc.h */
+
+#define hwloc_snprintf HWLOC_NAME(snprintf)
+#define hwloc_namecoloncmp HWLOC_NAME(namecoloncmp)
+#define hwloc_ffsl_manual HWLOC_NAME(ffsl_manual)
+#define hwloc_ffs32 HWLOC_NAME(ffs32)
+#define hwloc_ffsl_from_ffs32 HWLOC_NAME(ffsl_from_ffs32)
+#define hwloc_flsl_manual HWLOC_NAME(flsl_manual)
+#define hwloc_fls32 HWLOC_NAME(fls32)
+#define hwloc_flsl_from_fls32 HWLOC_NAME(flsl_from_fls32)
+#define hwloc_weight_long HWLOC_NAME(weight_long)
+#define hwloc_strncasecmp HWLOC_NAME(strncasecmp)
+
+#define hwloc_bitmap_compare_inclusion HWLOC_NAME(bitmap_compare_inclusion)
+
+#define hwloc_pci_class_string HWLOC_NAME(pci_class_string)
+#define hwloc_linux_pci_link_speed_from_string HWLOC_NAME(linux_pci_link_speed_from_string)
+
+#define hwloc_cache_type_by_depth_type HWLOC_NAME(cache_type_by_depth_type)
+#define hwloc__obj_type_is_normal HWLOC_NAME(_obj_type_is_normal)
+#define hwloc__obj_type_is_memory HWLOC_NAME(_obj_type_is_memory)
+#define hwloc__obj_type_is_io HWLOC_NAME(_obj_type_is_io)
+#define hwloc__obj_type_is_special HWLOC_NAME(_obj_type_is_special)
+
+#define hwloc__obj_type_is_cache HWLOC_NAME(_obj_type_is_cache)
+#define hwloc__obj_type_is_dcache HWLOC_NAME(_obj_type_is_dcache)
+#define hwloc__obj_type_is_icache HWLOC_NAME(_obj_type_is_icache)
+
+/* private/cpuid-x86.h */
+
+#define hwloc_have_x86_cpuid HWLOC_NAME(have_x86_cpuid)
+#define hwloc_x86_cpuid HWLOC_NAME(x86_cpuid)
+
+/* private/xml.h */
+
+#define hwloc__xml_verbose HWLOC_NAME(_xml_verbose)
+
+#define hwloc__xml_import_state_s HWLOC_NAME(_xml_import_state_s)
+#define hwloc__xml_import_state_t HWLOC_NAME(_xml_import_state_t)
+#define hwloc__xml_import_diff HWLOC_NAME(_xml_import_diff)
+#define hwloc_xml_backend_data_s HWLOC_NAME(xml_backend_data_s)
+#define hwloc__xml_export_state_s HWLOC_NAME(_xml_export_state_s)
+#define hwloc__xml_export_state_t HWLOC_NAME(_xml_export_state_t)
+#define hwloc__xml_export_data_s HWLOC_NAME(_xml_export_data_s)
+#define hwloc__xml_export_topology HWLOC_NAME(_xml_export_topology)
+#define hwloc__xml_export_diff HWLOC_NAME(_xml_export_diff)
+
+#define hwloc_xml_callbacks HWLOC_NAME(xml_callbacks)
+#define hwloc_xml_component HWLOC_NAME(xml_component)
+#define hwloc_xml_callbacks_register HWLOC_NAME(xml_callbacks_register)
+#define hwloc_xml_callbacks_reset HWLOC_NAME(xml_callbacks_reset)
+
+#define hwloc__xml_imported_v1distances_s HWLOC_NAME(_xml_imported_v1distances_s)
+
+/* private/components.h */
+
+#define hwloc_disc_component_force_enable HWLOC_NAME(disc_component_force_enable)
+#define hwloc_disc_components_enable_others HWLOC_NAME(disc_components_instantiate_others)
+
+#define hwloc_backends_is_thissystem HWLOC_NAME(backends_is_thissystem)
+#define hwloc_backends_find_callbacks HWLOC_NAME(backends_find_callbacks)
+
+#define hwloc_backends_init HWLOC_NAME(backends_init)
+#define hwloc_backends_disable_all HWLOC_NAME(backends_disable_all)
+
+#define hwloc_components_init HWLOC_NAME(components_init)
+#define hwloc_components_fini HWLOC_NAME(components_fini)
+
+/* private/internal-private.h */
+
+#define hwloc_xml_component HWLOC_NAME(xml_component)
+#define hwloc_synthetic_component HWLOC_NAME(synthetic_component)
+
+#define hwloc_aix_component HWLOC_NAME(aix_component)
+#define hwloc_bgq_component HWLOC_NAME(bgq_component)
+#define hwloc_darwin_component HWLOC_NAME(darwin_component)
+#define hwloc_freebsd_component HWLOC_NAME(freebsd_component)
+#define hwloc_hpux_component HWLOC_NAME(hpux_component)
+#define hwloc_linux_component HWLOC_NAME(linux_component)
+#define hwloc_netbsd_component HWLOC_NAME(netbsd_component)
+#define hwloc_noos_component HWLOC_NAME(noos_component)
+#define hwloc_solaris_component HWLOC_NAME(solaris_component)
+#define hwloc_windows_component HWLOC_NAME(windows_component)
+#define hwloc_x86_component HWLOC_NAME(x86_component)
+
+#define hwloc_cuda_component HWLOC_NAME(cuda_component)
+#define hwloc_gl_component HWLOC_NAME(gl_component)
+#define hwloc_linuxio_component HWLOC_NAME(linuxio_component)
+#define hwloc_nvml_component HWLOC_NAME(nvml_component)
+#define hwloc_opencl_component HWLOC_NAME(opencl_component)
+#define hwloc_pci_component HWLOC_NAME(pci_component)
+
+#define hwloc_xml_libxml_component HWLOC_NAME(xml_libxml_component)
+#define hwloc_xml_nolibxml_component HWLOC_NAME(xml_nolibxml_component)
+
+/* private/private.h */
+
+#define hwloc_special_level_s HWLOC_NAME(special_level_s)
+
+#define hwloc_pci_forced_locality_s HWLOC_NAME(pci_forced_locality_s)
+
+#define hwloc_alloc_root_sets HWLOC_NAME(alloc_root_sets)
+#define hwloc_setup_pu_level HWLOC_NAME(setup_pu_level)
+#define hwloc_get_sysctlbyname HWLOC_NAME(get_sysctlbyname)
+#define hwloc_get_sysctl HWLOC_NAME(get_sysctl)
+#define hwloc_fallback_nbprocessors HWLOC_NAME(fallback_nbprocessors)
+
+#define hwloc__object_cpusets_compare_first HWLOC_NAME(_object_cpusets_compare_first)
+#define hwloc__reorder_children HWLOC_NAME(_reorder_children)
+
+#define hwloc_topology_setup_defaults HWLOC_NAME(topology_setup_defaults)
+#define hwloc_topology_clear HWLOC_NAME(topology_clear)
+
+#define hwloc__attach_memory_object HWLOC_NAME(insert_memory_object)
+
+#define hwloc_pci_discovery_init HWLOC_NAME(pci_discovery_init)
+#define hwloc_pci_discovery_prepare HWLOC_NAME(pci_discovery_prepare)
+#define hwloc_pci_discovery_exit HWLOC_NAME(pci_discovery_exit)
+#define hwloc_find_insert_io_parent_by_complete_cpuset HWLOC_NAME(hwloc_find_insert_io_parent_by_complete_cpuset)
+#define hwloc_pci_belowroot_apply_locality HWLOC_NAME(pci_belowroot_apply_locality)
+
+#define hwloc__add_info HWLOC_NAME(_add_info)
+#define hwloc__add_info_nodup HWLOC_NAME(_add_info_nodup)
+#define hwloc__move_infos HWLOC_NAME(_move_infos)
+#define hwloc__free_infos HWLOC_NAME(_free_infos)
+
+#define hwloc_binding_hooks HWLOC_NAME(binding_hooks)
+#define hwloc_set_native_binding_hooks HWLOC_NAME(set_native_binding_hooks)
+#define hwloc_set_binding_hooks HWLOC_NAME(set_binding_hooks)
+
+#define hwloc_set_linuxfs_hooks HWLOC_NAME(set_linuxfs_hooks)
+#define hwloc_set_bgq_hooks HWLOC_NAME(set_bgq_hooks)
+#define hwloc_set_solaris_hooks HWLOC_NAME(set_solaris_hooks)
+#define hwloc_set_aix_hooks HWLOC_NAME(set_aix_hooks)
+#define hwloc_set_windows_hooks HWLOC_NAME(set_windows_hooks)
+#define hwloc_set_darwin_hooks HWLOC_NAME(set_darwin_hooks)
+#define hwloc_set_freebsd_hooks HWLOC_NAME(set_freebsd_hooks)
+#define hwloc_set_netbsd_hooks HWLOC_NAME(set_netbsd_hooks)
+#define hwloc_set_hpux_hooks HWLOC_NAME(set_hpux_hooks)
+
+#define hwloc_look_hardwired_fujitsu_k HWLOC_NAME(look_hardwired_fujitsu_k)
+#define hwloc_look_hardwired_fujitsu_fx10 HWLOC_NAME(look_hardwired_fujitsu_fx10)
+#define hwloc_look_hardwired_fujitsu_fx100 HWLOC_NAME(look_hardwired_fujitsu_fx100)
+
+#define hwloc_add_uname_info HWLOC_NAME(add_uname_info)
+#define hwloc_free_unlinked_object HWLOC_NAME(free_unlinked_object)
+#define hwloc_free_object_and_children HWLOC_NAME(free_object_and_children)
+#define hwloc_free_object_siblings_and_children HWLOC_NAME(free_object_siblings_and_children)
+
+#define hwloc_alloc_heap HWLOC_NAME(alloc_heap)
+#define hwloc_alloc_mmap HWLOC_NAME(alloc_mmap)
+#define hwloc_free_heap HWLOC_NAME(free_heap)
+#define hwloc_free_mmap HWLOC_NAME(free_mmap)
+#define hwloc_alloc_or_fail HWLOC_NAME(alloc_or_fail)
+
+#define hwloc_internal_distances_s HWLOC_NAME(internal_distances_s)
+#define hwloc_internal_distances_init HWLOC_NAME(internal_distances_init)
+#define hwloc_internal_distances_prepare HWLOC_NAME(internal_distances_prepare)
+#define hwloc_internal_distances_dup HWLOC_NAME(internal_distances_dup)
+#define hwloc_internal_distances_refresh HWLOC_NAME(internal_distances_refresh)
+#define hwloc_internal_distances_destroy HWLOC_NAME(internal_distances_destroy)
+
+#define hwloc_internal_distances_add HWLOC_NAME(internal_distances_add)
+#define hwloc_internal_distances_add_by_index HWLOC_NAME(internal_distances_add_by_index)
+#define hwloc_internal_distances_invalidate_cached_objs HWLOC_NAME(hwloc_internal_distances_invalidate_cached_objs)
+
+#define hwloc_encode_to_base64 HWLOC_NAME(encode_to_base64)
+#define hwloc_decode_from_base64 HWLOC_NAME(decode_from_base64)
+
+#define hwloc_progname HWLOC_NAME(progname)
+
+#define hwloc__topology_disadopt HWLOC_NAME(_topology_disadopt)
+#define hwloc__topology_dup HWLOC_NAME(_topology_dup)
+
+#define hwloc_tma HWLOC_NAME(tma)
+#define hwloc_tma_malloc HWLOC_NAME(tma_malloc)
+#define hwloc_tma_calloc HWLOC_NAME(tma_calloc)
+#define hwloc_tma_strdup HWLOC_NAME(tma_strdup)
+#define hwloc_bitmap_tma_dup HWLOC_NAME(bitmap_tma_dup)
+
+/* private/solaris-chiptype.h */
+
+#define hwloc_solaris_chip_info_s HWLOC_NAME(solaris_chip_info_s)
+#define hwloc_solaris_get_chip_info HWLOC_NAME(solaris_get_chip_info)
+
+#endif /* HWLOC_SYM_TRANSFORM */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_RENAME_H */
diff --git a/src/3rdparty/hwloc/include/hwloc/shmem.h b/src/3rdparty/hwloc/include/hwloc/shmem.h
new file mode 100644
index 000000000..222494630
--- /dev/null
+++ b/src/3rdparty/hwloc/include/hwloc/shmem.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright © 2013-2018 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Sharing topologies between processes
+ */
+
+#ifndef HWLOC_SHMEM_H
+#define HWLOC_SHMEM_H
+
+#include <hwloc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_shmem Sharing topologies between processes
+ *
+ * These functions are used to share a topology between processes by
+ * duplicating it into a file-backed shared-memory buffer.
+ *
+ * The master process must first get the required shared-memory size
+ * for storing this topology with hwloc_shmem_topology_get_length().
+ *
+ * Then it must find a virtual memory area of that size that is available
+ * in all processes (identical virtual addresses in all processes).
+ * On Linux, this can be done by comparing holes found in /proc/\<pid\>/maps
+ * for each process.
+ *
+ * Once found, it must open a destination file for storing the buffer,
+ * and pass it to hwloc_shmem_topology_write() together with
+ * virtual memory address and length obtained above.
+ *
+ * Other processes may then adopt this shared topology by opening the
+ * same file and passing it to hwloc_shmem_topology_adopt() with the
+ * exact same virtual memory address and length.
+ *
+ * @{
+ */
+
+/** \brief Get the required shared memory length for storing a topology.
+ *
+ * This length (in bytes) must be used in hwloc_shmem_topology_write()
+ * and hwloc_shmem_topology_adopt() later.
+ *
+ * \note Flags \p flags are currently unused, must be 0.
+ */
+HWLOC_DECLSPEC int hwloc_shmem_topology_get_length(hwloc_topology_t topology,
+						   size_t *lengthp,
+						   unsigned long flags);
+
+/** \brief Duplicate a topology to a shared memory file.
+ *
+ * Temporarily map a file in virtual memory and duplicate the
+ * topology \p topology by allocating duplicates in there.
+ *
+ * The segment of the file pointed by descriptor \p fd,
+ * starting at offset \p fileoffset, and of length \p length (in bytes),
+ * will be temporarily mapped at virtual address \p mmap_address
+ * during the duplication.
+ *
+ * The mapping length \p length must have been previously obtained with
+ * hwloc_shmem_topology_get_length()
+ * and the topology must not have been modified in the meantime.
+ *
+ * \note Flags \p flags are currently unused, must be 0.
+ *
+ * \note The object userdata pointer is duplicated but the pointed buffer
+ * is not. However the caller may also allocate it manually in shared memory
+ * to share it as well.
+ *
+ * \return -1 with errno set to EBUSY if the virtual memory mapping defined
+ * by \p mmap_address and \p length isn't available in the process.
+ * \return -1 with errno set to EINVAL if \p fileoffset, \p mmap_address
+ * or \p length aren't page-aligned.
+ */
+HWLOC_DECLSPEC int hwloc_shmem_topology_write(hwloc_topology_t topology,
+					      int fd, hwloc_uint64_t fileoffset,
+					      void *mmap_address, size_t length,
+					      unsigned long flags);
+
+/** \brief Adopt a shared memory topology stored in a file.
+ *
+ * Map a file in virtual memory and adopt the topology that was previously
+ * stored there with hwloc_shmem_topology_write().
+ *
+ * The returned adopted topology in \p topologyp can be used just like any
+ * topology. And it must be destroyed with hwloc_topology_destroy() as usual.
+ *
+ * However the topology is read-only.
+ * For instance, it cannot be modified with hwloc_topology_restrict()
+ * and object userdata pointers cannot be changed.
+ *
+ * The segment of the file pointed by descriptor \p fd,
+ * starting at offset \p fileoffset, and of length \p length (in bytes),
+ * will be mapped at virtual address \p mmap_address.
+ *
+ * The file pointed by descriptor \p fd, the offset \p fileoffset,
+ * the requested mapping virtual address \p mmap_address and the length \p length
+ * must be identical to what was given to hwloc_shmem_topology_write() earlier.
+ *
+ * \note Flags \p flags are currently unused, must be 0.
+ *
+ * \note The object userdata pointer should not be used unless the process
+ * that created the shared topology also placed userdata-pointed buffers
+ * in shared memory.
+ *
+ * \note This function takes care of calling hwloc_topology_abi_check().
+ *
+ * \return -1 with errno set to EBUSY if the virtual memory mapping defined
+ * by \p mmap_address and \p length isn't available in the process.
+ *
+ * \return -1 with errno set to EINVAL if \p fileoffset, \p mmap_address
+ * or \p length aren't page-aligned, or do not match what was given to
+ * hwloc_shmem_topology_write() earlier.
+ *
+ * \return -1 with errno set to EINVAL if the layout of the topology structure
+ * is different between the writer process and the adopter process.
+ */
+HWLOC_DECLSPEC int hwloc_shmem_topology_adopt(hwloc_topology_t *topologyp,
+					      int fd, hwloc_uint64_t fileoffset,
+					      void *mmap_address, size_t length,
+					      unsigned long flags);
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_SHMEM_H */
diff --git a/src/3rdparty/hwloc/include/private/autogen/config.h b/src/3rdparty/hwloc/include/private/autogen/config.h
new file mode 100644
index 000000000..a97bdfea2
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/autogen/config.h
@@ -0,0 +1,672 @@
+/*
+ * Copyright © 2009, 2011, 2012 CNRS.  All rights reserved.
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009, 2011, 2012, 2015 Université Bordeaux.  All rights reserved.
+ * Copyright © 2009 Cisco Systems, Inc.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef HWLOC_CONFIGURE_H
+#define HWLOC_CONFIGURE_H
+
+#define DECLSPEC_EXPORTS
+
+#define HWLOC_HAVE_MSVC_CPUIDEX 1
+
+/* Define to 1 if the system has the type `CACHE_DESCRIPTOR'. */
+#define HAVE_CACHE_DESCRIPTOR 0
+
+/* Define to 1 if the system has the type `CACHE_RELATIONSHIP'. */
+#define HAVE_CACHE_RELATIONSHIP 0
+
+/* Define to 1 if you have the `clz' function. */
+/* #undef HAVE_CLZ */
+
+/* Define to 1 if you have the `clzl' function. */
+/* #undef HAVE_CLZL */
+
+/* Define to 1 if you have the <CL/cl_ext.h> header file. */
+/* #undef HAVE_CL_CL_EXT_H */
+
+/* Define to 1 if you have the `cpuset_setaffinity' function. */
+/* #undef HAVE_CPUSET_SETAFFINITY */
+
+/* Define to 1 if you have the `cpuset_setid' function. */
+/* #undef HAVE_CPUSET_SETID */
+
+/* Define to 1 if we have -lcuda */
+/* #undef HAVE_CUDA */
+
+/* Define to 1 if you have the <cuda.h> header file. */
+/* #undef HAVE_CUDA_H */
+
+/* Define to 1 if you have the <cuda_runtime_api.h> header file. */
+/* #undef HAVE_CUDA_RUNTIME_API_H */
+
+/* Define to 1 if you have the declaration of `CL_DEVICE_TOPOLOGY_AMD', and to
+   0 if you don't. */
+/* #undef HAVE_DECL_CL_DEVICE_TOPOLOGY_AMD */
+
+/* Define to 1 if you have the declaration of `CTL_HW', and to 0 if you don't.
+   */
+/* #undef HAVE_DECL_CTL_HW */
+
+/* Define to 1 if you have the declaration of `fabsf', and to 0 if you don't.
+   */
+#define HAVE_DECL_FABSF 1
+
+/* Define to 1 if you have the declaration of `modff', and to 0 if you don't.
+   */
+#define HAVE_DECL_MODFF 1
+
+/* Define to 1 if you have the declaration of `HW_NCPU', and to 0 if you
+   don't. */
+/* #undef HAVE_DECL_HW_NCPU */
+
+/* Define to 1 if you have the declaration of
+   `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */
+/* #undef HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION */
+
+/* Define to 1 if you have the declaration of `pthread_getaffinity_np', and to
+   0 if you don't. */
+#define HAVE_DECL_PTHREAD_GETAFFINITY_NP 0
+
+/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
+   0 if you don't. */
+#define HAVE_DECL_PTHREAD_SETAFFINITY_NP 0
+
+/* Define to 1 if you have the declaration of `strtoull', and to 0 if you
+   don't. */
+#define HAVE_DECL_STRTOULL 0
+
+/* Define to 1 if you have the declaration of `strcasecmp', and to 0 if you
+   don't. */
+/* #undef HWLOC_HAVE_DECL_STRCASECMP */
+
+/* Define to 1 if you have the declaration of `snprintf', and to 0 if you
+   don't. */
+#define HAVE_DECL_SNPRINTF 0
+
+/* Define to 1 if you have the declaration of `_strdup', and to 0 if you
+   don't. */
+#define HAVE_DECL__STRDUP 1
+
+/* Define to 1 if you have the declaration of `_putenv', and to 0 if you
+   don't. */
+#define HAVE_DECL__PUTENV 1
+
+/* Define to 1 if you have the declaration of `_SC_LARGE_PAGESIZE', and to 0
+   if you don't. */
+#define HAVE_DECL__SC_LARGE_PAGESIZE 0
+
+/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_CONF', and to 0
+   if you don't. */
+#define HAVE_DECL__SC_NPROCESSORS_CONF 0
+
+/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_ONLN', and to 0
+   if you don't. */
+#define HAVE_DECL__SC_NPROCESSORS_ONLN 0
+
+/* Define to 1 if you have the declaration of `_SC_NPROC_CONF', and to 0 if
+   you don't. */
+#define HAVE_DECL__SC_NPROC_CONF 0
+
+/* Define to 1 if you have the declaration of `_SC_NPROC_ONLN', and to 0 if
+   you don't. */
+#define HAVE_DECL__SC_NPROC_ONLN 0
+
+/* Define to 1 if you have the declaration of `_SC_PAGESIZE', and to 0 if you
+   don't. */
+#define HAVE_DECL__SC_PAGESIZE 0
+
+/* Define to 1 if you have the declaration of `_SC_PAGE_SIZE', and to 0 if you
+   don't. */
+#define HAVE_DECL__SC_PAGE_SIZE 0
+
+/* Define to 1 if you have the <dirent.h> header file. */
+/* #define HAVE_DIRENT_H 1 */
+#undef HAVE_DIRENT_H
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+/* #undef HAVE_DLFCN_H */
+
+/* Define to 1 if you have the `ffs' function. */
+/* #undef HAVE_FFS */
+
+/* Define to 1 if you have the `ffsl' function. */
+/* #undef HAVE_FFSL */
+
+/* Define to 1 if you have the `fls' function. */
+/* #undef HAVE_FLS */
+
+/* Define to 1 if you have the `flsl' function. */
+/* #undef HAVE_FLSL */
+
+/* Define to 1 if you have the `getpagesize' function. */
+#define HAVE_GETPAGESIZE 1
+
+/* Define to 1 if the system has the type `GROUP_AFFINITY'. */
+#define HAVE_GROUP_AFFINITY 1
+
+/* Define to 1 if the system has the type `GROUP_RELATIONSHIP'. */
+#define HAVE_GROUP_RELATIONSHIP 1
+
+/* Define to 1 if you have the `host_info' function. */
+/* #undef HAVE_HOST_INFO */
+
+/* Define to 1 if you have the <infiniband/verbs.h> header file. */
+/* #undef HAVE_INFINIBAND_VERBS_H */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if the system has the type `KAFFINITY'. */
+#define HAVE_KAFFINITY 1
+
+/* Define to 1 if you have the <kstat.h> header file. */
+/* #undef HAVE_KSTAT_H */
+
+/* Define to 1 if you have the <langinfo.h> header file. */
+/* #undef HAVE_LANGINFO_H */
+
+/* Define to 1 if we have -lgdi32 */
+#define HAVE_LIBGDI32 1
+
+/* Define to 1 if we have -libverbs */
+/* #undef HAVE_LIBIBVERBS */
+
+/* Define to 1 if we have -lkstat */
+/* #undef HAVE_LIBKSTAT */
+
+/* Define to 1 if we have -llgrp */
+/* #undef HAVE_LIBLGRP */
+
+/* Define to 1 if you have the <locale.h> header file. */
+#define HAVE_LOCALE_H 1
+
+/* Define to 1 if the system has the type `LOGICAL_PROCESSOR_RELATIONSHIP'. */
+#define HAVE_LOGICAL_PROCESSOR_RELATIONSHIP 1
+
+/* Define to 1 if you have the <mach/mach_host.h> header file. */
+/* #undef HAVE_MACH_MACH_HOST_H */
+
+/* Define to 1 if you have the <mach/mach_init.h> header file. */
+/* #undef HAVE_MACH_MACH_INIT_H */
+
+/* Define to 1 if you have the <malloc.h> header file. */
+#define HAVE_MALLOC_H 1
+
+/* Define to 1 if you have the `memalign' function. */
+/* #undef HAVE_MEMALIGN */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `nl_langinfo' function. */
+/* #undef HAVE_NL_LANGINFO */
+
+/* Define to 1 if you have the <numaif.h> header file. */
+/* #undef HAVE_NUMAIF_H */
+
+/* Define to 1 if the system has the type `NUMA_NODE_RELATIONSHIP'. */
+#define HAVE_NUMA_NODE_RELATIONSHIP 1
+
+/* Define to 1 if you have the <NVCtrl/NVCtrl.h> header file. */
+/* #undef HAVE_NVCTRL_NVCTRL_H */
+
+/* Define to 1 if you have the <nvml.h> header file. */
+/* #undef HAVE_NVML_H */
+
+/* Define to 1 if you have the `openat' function. */
+/* #undef HAVE_OPENAT */
+
+/* Define to 1 if you have the <picl.h> header file. */
+/* #undef HAVE_PICL_H */
+
+/* Define to 1 if you have the `posix_memalign' function. */
+/* #undef HAVE_POSIX_MEMALIGN */
+
+/* Define to 1 if the system has the type `PROCESSOR_CACHE_TYPE'. */
+#define HAVE_PROCESSOR_CACHE_TYPE 1
+
+/* Define to 1 if the system has the type `PROCESSOR_GROUP_INFO'. */
+#define HAVE_PROCESSOR_GROUP_INFO 1
+
+/* Define to 1 if the system has the type `PROCESSOR_RELATIONSHIP'. */
+#define HAVE_PROCESSOR_RELATIONSHIP 1
+
+/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_BLOCK'. */
+/* #undef HAVE_PSAPI_WORKING_SET_EX_BLOCK */
+
+/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_INFORMATION'.
+   */
+/* #undef HAVE_PSAPI_WORKING_SET_EX_INFORMATION */
+
+/* Define to 1 if the system has the type `PROCESSOR_NUMBER'. */
+#define HAVE_PROCESSOR_NUMBER 1
+
+/* Define to 1 if you have the <pthread_np.h> header file. */
+/* #undef HAVE_PTHREAD_NP_H */
+
+/* Define to 1 if the system has the type `pthread_t'. */
+/* #undef HAVE_PTHREAD_T */
+#undef HAVE_PTHREAD_T
+
+/* Define to 1 if you have the `putwc' function. */
+#define HAVE_PUTWC 1
+
+/* Define to 1 if the system has the type `RelationProcessorPackage'. */
+/* #undef HAVE_RELATIONPROCESSORPACKAGE */
+
+/* Define to 1 if you have the `setlocale' function. */
+#define HAVE_SETLOCALE 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the `strftime' function. */
+#define HAVE_STRFTIME 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+/* #define HAVE_STRINGS_H 1*/
+#undef HAVE_STRINGS_H
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strncasecmp' function. */
+#define HAVE_STRNCASECMP 1
+
+/* Define to '1' if sysctl is present and usable */
+/* #undef HAVE_SYSCTL */
+
+/* Define to '1' if sysctlbyname is present and usable */
+/* #undef HAVE_SYSCTLBYNAME */
+
+/* Define to 1 if the system has the type
+   `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */
+#define HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION 1
+
+/* Define to 1 if the system has the type
+   `SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */
+#define HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX 1
+
+/* Define to 1 if you have the <sys/cpuset.h> header file. */
+/* #undef HAVE_SYS_CPUSET_H */
+
+/* Define to 1 if you have the <sys/lgrp_user.h> header file. */
+/* #undef HAVE_SYS_LGRP_USER_H */
+
+/* Define to 1 if you have the <sys/mman.h> header file. */
+/* #undef HAVE_SYS_MMAN_H */
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+/* #define HAVE_SYS_PARAM_H 1 */
+#undef HAVE_SYS_PARAM_H
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+/* #undef HAVE_SYS_SYSCTL_H */
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <sys/utsname.h> header file. */
+/* #undef HAVE_SYS_UTSNAME_H */
+
+/* Define to 1 if you have the `uname' function. */
+/* #undef HAVE_UNAME */
+
+/* Define to 1 if you have the <unistd.h> header file. */
+/* #define HAVE_UNISTD_H 1 */
+#undef HAVE_UNISTD_H
+
+/* Define to 1 if you have the `uselocale' function. */
+/* #undef HAVE_USELOCALE */
+
+/* Define to 1 if the system has the type `wchar_t'. */
+#define HAVE_WCHAR_T 1
+
+/* Define to 1 if you have the <X11/keysym.h> header file. */
+/* #undef HAVE_X11_KEYSYM_H */
+
+/* Define to 1 if you have the <X11/Xlib.h> header file. */
+/* #undef HAVE_X11_XLIB_H */
+
+/* Define to 1 if you have the <X11/Xutil.h> header file. */
+/* #undef HAVE_X11_XUTIL_H */
+
+/* Define to 1 if you have the <xlocale.h> header file. */
+/* #undef HAVE_XLOCALE_H */
+
+/* Define to 1 on AIX */
+/* #undef HWLOC_AIX_SYS */
+
+/* Define to 1 on BlueGene/Q */
+/* #undef HWLOC_BGQ_SYS */
+
+/* Whether C compiler supports symbol visibility or not */
+#define HWLOC_C_HAVE_VISIBILITY 0
+
+/* Define to 1 on Darwin */
+/* #undef HWLOC_DARWIN_SYS */
+
+/* Whether we are in debugging mode or not */
+/* #undef HWLOC_DEBUG */
+
+/* Define to 1 on *FREEBSD */
+/* #undef HWLOC_FREEBSD_SYS */
+
+/* Whether your compiler has __attribute__ or not */
+/* #define HWLOC_HAVE_ATTRIBUTE 1 */
+#undef HWLOC_HAVE_ATTRIBUTE
+
+/* Whether your compiler has __attribute__ aligned or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_ALIGNED 1 */
+
+/* Whether your compiler has __attribute__ always_inline or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE 1 */
+
+/* Whether your compiler has __attribute__ cold or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_COLD 1 */
+
+/* Whether your compiler has __attribute__ const or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_CONST 1 */
+
+/* Whether your compiler has __attribute__ deprecated or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_DEPRECATED 1 */
+
+/* Whether your compiler has __attribute__ format or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_FORMAT 1 */
+
+/* Whether your compiler has __attribute__ hot or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_HOT 1 */
+
+/* Whether your compiler has __attribute__ malloc or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_MALLOC 1 */
+
+/* Whether your compiler has __attribute__ may_alias or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 1 */
+
+/* Whether your compiler has __attribute__ nonnull or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_NONNULL 1 */
+
+/* Whether your compiler has __attribute__ noreturn or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_NORETURN 1 */
+
+/* Whether your compiler has __attribute__ no_instrument_function or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1 */
+
+/* Whether your compiler has __attribute__ packed or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_PACKED 1 */
+
+/* Whether your compiler has __attribute__ pure or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_PURE 1 */
+
+/* Whether your compiler has __attribute__ sentinel or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_SENTINEL 1 */
+
+/* Whether your compiler has __attribute__ unused or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_UNUSED 1 */
+
+/* Whether your compiler has __attribute__ warn unused result or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1 */
+
+/* Whether your compiler has __attribute__ weak alias or not */
+/* #define HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS 1 */
+
+/* Define to 1 if your `ffs' function is known to be broken. */
+/* #undef HWLOC_HAVE_BROKEN_FFS */
+
+/* Define to 1 if you have the `cairo' library. */
+/* #undef HWLOC_HAVE_CAIRO */
+
+/* Define to 1 if you have the `clz' function. */
+/* #undef HWLOC_HAVE_CLZ */
+
+/* Define to 1 if you have the `clzl' function. */
+/* #undef HWLOC_HAVE_CLZL */
+
+/* Define to 1 if you have cpuid */
+/* #undef HWLOC_HAVE_CPUID */
+
+/* Define to 1 if the CPU_SET macro works */
+/* #undef HWLOC_HAVE_CPU_SET */
+
+/* Define to 1 if the CPU_SET_S macro works */
+/* #undef HWLOC_HAVE_CPU_SET_S */
+
+/* Define to 1 if you have the `cudart' SDK. */
+/* #undef HWLOC_HAVE_CUDART */
+
+/* Define to 1 if function `clz' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_CLZ */
+
+/* Define to 1 if function `clzl' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_CLZL */
+
+/* Define to 1 if function `ffs' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_FFS */
+
+/* Define to 1 if function `ffsl' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_FFSL */
+
+/* Define to 1 if function `fls' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_FLS */
+
+/* Define to 1 if function `flsl' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_FLSL */
+
+/* Define to 1 if you have the `ffs' function. */
+/* #undef HWLOC_HAVE_FFS */
+
+/* Define to 1 if you have the `ffsl' function. */
+/* #undef HWLOC_HAVE_FFSL */
+
+/* Define to 1 if you have the `fls' function. */
+/* #undef HWLOC_HAVE_FLS */
+
+/* Define to 1 if you have the `flsl' function. */
+/* #undef HWLOC_HAVE_FLSL */
+
+/* Define to 1 if you have the GL module components. */
+/* #undef HWLOC_HAVE_GL */
+
+/* Define to 1 if you have a library providing the termcap interface */
+/* #undef HWLOC_HAVE_LIBTERMCAP */
+
+/* Define to 1 if you have the `libxml2' library. */
+/* #undef HWLOC_HAVE_LIBXML2 */
+
+/* Define to 1 if building the Linux PCI component */
+/* #undef HWLOC_HAVE_LINUXPCI */
+
+/* Define to 1 if you have the `NVML' library. */
+/* #undef HWLOC_HAVE_NVML */
+
+/* Define to 1 if glibc provides the old prototype (without length) of
+   sched_setaffinity() */
+/* #undef HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+
+/* Define to 1 if you have the `OpenCL' library. */
+/* #undef HWLOC_HAVE_OPENCL */
+
+/* Define to 1 if the hwloc library should support dynamically-loaded plugins
+   */
+/* #undef HWLOC_HAVE_PLUGINS */
+
+/* `Define to 1 if you have pthread_getthrds_np' */
+/* #undef HWLOC_HAVE_PTHREAD_GETTHRDS_NP */
+
+/* Define to 1 if pthread mutexes are available */
+/* #undef HWLOC_HAVE_PTHREAD_MUTEX */
+
+/* Define to 1 if glibc provides a prototype of sched_setaffinity() */
+#define HWLOC_HAVE_SCHED_SETAFFINITY 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HWLOC_HAVE_STDINT_H 1
+
+/* Define to 1 if you have the `windows.h' header. */
+#define HWLOC_HAVE_WINDOWS_H 1
+
+/* Define to 1 if X11 headers including Xutil.h and keysym.h are available. */
+/* #undef HWLOC_HAVE_X11_KEYSYM */
+
+/* Define to 1 if function `syscall' is available */
+/* #undef HWLOC_HAVE_SYSCALL */
+
+/* Define to 1 on HP-UX */
+/* #undef HWLOC_HPUX_SYS */
+
+/* Define to 1 on Linux */
+/* #undef HWLOC_LINUX_SYS */
+
+/* Define to 1 on *NETBSD */
+/* #undef HWLOC_NETBSD_SYS */
+
+/* The size of `unsigned int', as computed by sizeof */
+#define HWLOC_SIZEOF_UNSIGNED_INT 4
+
+/* The size of `unsigned long', as computed by sizeof */
+#define HWLOC_SIZEOF_UNSIGNED_LONG 4
+
+/* Define to 1 on Solaris */
+/* #undef HWLOC_SOLARIS_SYS */
+
+/* The hwloc symbol prefix */
+#define HWLOC_SYM_PREFIX hwloc_
+
+/* The hwloc symbol prefix in all caps */
+#define HWLOC_SYM_PREFIX_CAPS HWLOC_
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#define HWLOC_SYM_TRANSFORM 0
+
+/* Define to 1 on unsupported systems */
+/* #undef HWLOC_UNSUPPORTED_SYS */
+
+/* Define to 1 if ncurses works, preferred over curses */
+/* #undef HWLOC_USE_NCURSES */
+
+/* Define to 1 on WINDOWS */
+#define HWLOC_WIN_SYS 1
+
+/* Define to 1 on x86_32 */
+/* #undef HWLOC_X86_32_ARCH */
+
+/* Define to 1 on x86_64 */
+#define HWLOC_X86_64_ARCH 1
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#define LT_OBJDIR ".libs/"
+
+/* Name of package */
+#define PACKAGE "hwloc"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "http://www.open-mpi.org/projects/hwloc/"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "hwloc"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "hwloc"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "hwloc"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION HWLOC_VERSION
+
+/* The size of `unsigned int', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_INT 4
+
+/* The size of `unsigned long', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_LONG 4
+
+/* The size of `void *', as computed by sizeof. */
+#define SIZEOF_VOID_P 8
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Enable extensions on HP-UX. */
+#ifndef _HPUX_SOURCE
+# define _HPUX_SOURCE 1
+#endif
+
+
+/* Enable extensions on AIX 3, Interix.  */
+/*
+#ifndef _ALL_SOURCE
+# define _ALL_SOURCE 1
+#endif
+*/
+
+/* Enable GNU extensions on systems that have them.  */
+/*
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE 1
+#endif
+*/
+/* Enable threading extensions on Solaris.  */
+/*
+#ifndef _POSIX_PTHREAD_SEMANTICS
+# define _POSIX_PTHREAD_SEMANTICS 1
+#endif
+*/
+/* Enable extensions on HP NonStop.  */
+/*
+#ifndef _TANDEM_SOURCE
+# define _TANDEM_SOURCE 1
+#endif
+*/
+/* Enable general extensions on Solaris.  */
+/*
+#ifndef __EXTENSIONS__
+# define __EXTENSIONS__ 1
+#endif
+*/
+
+
+/* Version number of package */
+#define VERSION HWLOC_VERSION
+
+/* Define to 1 if the X Window System is missing or not being used. */
+#define X_DISPLAY_MISSING 1
+
+/* Define to 1 if on MINIX. */
+/* #undef _MINIX */
+
+/* Define to 2 if the system does not provide POSIX.1 features except with
+   this defined. */
+/* #undef _POSIX_1_SOURCE */
+
+/* Define to 1 if you need to in order for `stat' and other things to work. */
+/* #undef _POSIX_SOURCE */
+
+/* Define this to the process ID type */
+#define hwloc_pid_t HANDLE
+
+/* Define this to either strncasecmp or strncmp */
+#define hwloc_strncasecmp strncasecmp
+
+/* Define this to the thread ID type */
+#define hwloc_thread_t HANDLE
+
+
+#endif /* HWLOC_CONFIGURE_H */
diff --git a/src/3rdparty/hwloc/include/private/components.h b/src/3rdparty/hwloc/include/private/components.h
new file mode 100644
index 000000000..8525bbe46
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/components.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2012-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (many functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef PRIVATE_COMPONENTS_H
+#define PRIVATE_COMPONENTS_H 1
+
+#include <hwloc/plugins.h>
+
+struct hwloc_topology;
+
+extern int hwloc_disc_component_force_enable(struct hwloc_topology *topology,
+					     int envvar_forced, /* 1 if forced through envvar, 0 if forced through API */
+					     int type, const char *name,
+					     const void *data1, const void *data2, const void *data3);
+extern void hwloc_disc_components_enable_others(struct hwloc_topology *topology);
+
+/* Compute the topology is_thissystem flag and find some callbacks based on enabled backends */
+extern void hwloc_backends_is_thissystem(struct hwloc_topology *topology);
+extern void hwloc_backends_find_callbacks(struct hwloc_topology *topology);
+
+/* Initialize the list of backends used by a topology */
+extern void hwloc_backends_init(struct hwloc_topology *topology);
+/* Disable and destroy all backends used by a topology */
+extern void hwloc_backends_disable_all(struct hwloc_topology *topology);
+
+/* Used by the core to setup/destroy the list of components */
+extern void hwloc_components_init(void); /* increases components refcount, should be called exactly once per topology (during init) */
+extern void hwloc_components_fini(void); /* decreases components refcount, should be called exactly once per topology (during destroy) */
+
+#endif /* PRIVATE_COMPONENTS_H */
+
diff --git a/src/3rdparty/hwloc/include/private/cpuid-x86.h b/src/3rdparty/hwloc/include/private/cpuid-x86.h
new file mode 100644
index 000000000..2758afe04
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/cpuid-x86.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright © 2010-2012, 2014 Université Bordeaux
+ * Copyright © 2010 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2014 Inria.  All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internals for x86's cpuid.  */
+
+#ifndef HWLOC_PRIVATE_CPUID_X86_H
+#define HWLOC_PRIVATE_CPUID_X86_H
+
+#if (defined HWLOC_X86_32_ARCH) && (!defined HWLOC_HAVE_MSVC_CPUIDEX)
+static __hwloc_inline int hwloc_have_x86_cpuid(void)
+{
+  int ret;
+  unsigned tmp, tmp2;
+  __asm__(
+      "mov $0,%0\n\t"   /* Not supported a priori */
+
+      "pushfl   \n\t"   /* Save flags */
+
+      "pushfl   \n\t"                                           \
+      "pop %1   \n\t"   /* Get flags */                         \
+
+#define TRY_TOGGLE                                              \
+      "xor $0x00200000,%1\n\t"        /* Try to toggle ID */    \
+      "mov %1,%2\n\t"   /* Save expected value */               \
+      "push %1  \n\t"                                           \
+      "popfl    \n\t"   /* Try to toggle */                     \
+      "pushfl   \n\t"                                           \
+      "pop %1   \n\t"                                           \
+      "cmp %1,%2\n\t"   /* Compare with expected value */       \
+      "jnz 0f\n\t"   /* Unexpected, failure */               \
+
+      TRY_TOGGLE        /* Try to set/clear */
+      TRY_TOGGLE        /* Try to clear/set */
+
+      "mov $1,%0\n\t"   /* Passed the test! */
+
+      "0: \n\t"
+      "popfl    \n\t"   /* Restore flags */
+
+      : "=r" (ret), "=&r" (tmp), "=&r" (tmp2));
+  return ret;
+}
+#endif /* defined HWLOC_X86_32_ARCH && !defined HWLOC_HAVE_MSVC_CPUIDEX */
+#if (defined HWLOC_X86_64_ARCH) || (defined HWLOC_HAVE_MSVC_CPUIDEX)
+static __hwloc_inline int hwloc_have_x86_cpuid(void) { return 1; }
+#endif /* HWLOC_X86_64_ARCH || HWLOC_HAVE_MSVC_CPUIDEX */
+
+static __hwloc_inline void hwloc_x86_cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{
+#ifdef HWLOC_HAVE_MSVC_CPUIDEX
+  int regs[4];
+  __cpuidex(regs, *eax, *ecx);
+  *eax = regs[0];
+  *ebx = regs[1];
+  *ecx = regs[2];
+  *edx = regs[3];
+#else /* HWLOC_HAVE_MSVC_CPUIDEX */
+  /* Note: gcc might want to use bx or the stack for %1 addressing, so we can't
+   * use them :/ */
+#ifdef HWLOC_X86_64_ARCH
+  hwloc_uint64_t sav_rbx;
+  __asm__(
+  "mov %%rbx,%2\n\t"
+  "cpuid\n\t"
+  "xchg %2,%%rbx\n\t"
+  "movl %k2,%1\n\t"
+  : "+a" (*eax), "=m" (*ebx), "=&r"(sav_rbx),
+    "+c" (*ecx), "=&d" (*edx));
+#elif defined(HWLOC_X86_32_ARCH)
+  __asm__(
+  "mov %%ebx,%1\n\t"
+  "cpuid\n\t"
+  "xchg %%ebx,%1\n\t"
+  : "+a" (*eax), "=&SD" (*ebx), "+c" (*ecx), "=&d" (*edx));
+#else
+#error unknown architecture
+#endif
+#endif /* HWLOC_HAVE_MSVC_CPUIDEX */
+}
+
+#endif /* HWLOC_PRIVATE_CPUID_X86_H */
diff --git a/src/3rdparty/hwloc/include/private/debug.h b/src/3rdparty/hwloc/include/private/debug.h
new file mode 100644
index 000000000..74b697db4
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/debug.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2017 Inria.  All rights reserved.
+ * Copyright © 2009, 2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_DEBUG_H
+#define HWLOC_DEBUG_H
+
+#include <private/autogen/config.h>
+#include <private/misc.h>
+
+#ifdef HWLOC_DEBUG
+#include <stdarg.h>
+#include <stdio.h>
+#endif
+
+/* Compile-time assertion */
+#define HWLOC_BUILD_ASSERT(condition) ((void)sizeof(char[1 - 2*!(condition)]))
+
+#ifdef HWLOC_DEBUG
+static __hwloc_inline int hwloc_debug_enabled(void)
+{
+  static int checked = 0;
+  static int enabled = 1;
+  if (!checked) {
+    const char *env = getenv("HWLOC_DEBUG_VERBOSE");
+    if (env)
+      enabled = atoi(env);
+    if (enabled)
+      fprintf(stderr, "hwloc verbose debug enabled, may be disabled with HWLOC_DEBUG_VERBOSE=0 in the environment.\n");
+    checked = 1;
+  }
+  return enabled;
+}
+#endif
+
+static __hwloc_inline void hwloc_debug(const char *s __hwloc_attribute_unused, ...) __hwloc_attribute_format(printf, 1, 2);
+static __hwloc_inline void hwloc_debug(const char *s __hwloc_attribute_unused, ...)
+{
+#ifdef HWLOC_DEBUG
+  if (hwloc_debug_enabled()) {
+    va_list ap;
+    va_start(ap, s);
+    vfprintf(stderr, s, ap);
+    va_end(ap);
+  }
+#endif
+}
+
+#ifdef HWLOC_DEBUG
+#define hwloc_debug_bitmap(fmt, bitmap) do { \
+if (hwloc_debug_enabled()) { \
+  char *s; \
+  hwloc_bitmap_asprintf(&s, bitmap); \
+  fprintf(stderr, fmt, s); \
+  free(s); \
+} } while (0)
+#define hwloc_debug_1arg_bitmap(fmt, arg1, bitmap) do { \
+if (hwloc_debug_enabled()) { \
+  char *s; \
+  hwloc_bitmap_asprintf(&s, bitmap); \
+  fprintf(stderr, fmt, arg1, s); \
+  free(s); \
+} } while (0)
+#define hwloc_debug_2args_bitmap(fmt, arg1, arg2, bitmap) do { \
+if (hwloc_debug_enabled()) { \
+  char *s; \
+  hwloc_bitmap_asprintf(&s, bitmap); \
+  fprintf(stderr, fmt, arg1, arg2, s); \
+  free(s); \
+} } while (0)
+#else
+#define hwloc_debug_bitmap(s, bitmap) do { } while(0)
+#define hwloc_debug_1arg_bitmap(s, arg1, bitmap) do { } while(0)
+#define hwloc_debug_2args_bitmap(s, arg1, arg2, bitmap) do { } while(0)
+#endif
+
+#endif /* HWLOC_DEBUG_H */
diff --git a/src/3rdparty/hwloc/include/private/internal-components.h b/src/3rdparty/hwloc/include/private/internal-components.h
new file mode 100644
index 000000000..b138a0eb9
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/internal-components.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2018 Inria.  All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* List of components defined inside hwloc */
+
+#ifndef PRIVATE_INTERNAL_COMPONENTS_H
+#define PRIVATE_INTERNAL_COMPONENTS_H
+
+/* global discovery */
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_synthetic_component;
+
+/* CPU discovery */
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_aix_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_bgq_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_darwin_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_freebsd_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_hpux_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_linux_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_netbsd_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_noos_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_solaris_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_windows_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_x86_component;
+
+/* I/O discovery */
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_cuda_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_gl_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_linuxio_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_nvml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_opencl_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_pci_component;
+
+/* XML backend */
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_nolibxml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_libxml_component;
+
+#endif /* PRIVATE_INTERNAL_COMPONENTS_H */
diff --git a/src/3rdparty/hwloc/include/private/misc.h b/src/3rdparty/hwloc/include/private/misc.h
new file mode 100644
index 000000000..66608bc79
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/misc.h
@@ -0,0 +1,583 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* Misc macros and inlines.  */
+
+#ifndef HWLOC_PRIVATE_MISC_H
+#define HWLOC_PRIVATE_MISC_H
+
+#include <hwloc/autogen/config.h>
+#include <private/autogen/config.h>
+#include <hwloc.h>
+
+#ifdef HWLOC_HAVE_DECL_STRNCASECMP
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#else
+#ifdef HAVE_CTYPE_H
+#include <ctype.h>
+#endif
+#endif
+
+#define HWLOC_BITS_PER_LONG (HWLOC_SIZEOF_UNSIGNED_LONG * 8)
+#define HWLOC_BITS_PER_INT (HWLOC_SIZEOF_UNSIGNED_INT * 8)
+
+#if (HWLOC_BITS_PER_LONG != 32) && (HWLOC_BITS_PER_LONG != 64)
+#error "unknown size for unsigned long."
+#endif
+
+#if (HWLOC_BITS_PER_INT != 16) && (HWLOC_BITS_PER_INT != 32) && (HWLOC_BITS_PER_INT != 64)
+#error "unknown size for unsigned int."
+#endif
+
+/* internal-use-only value for when we don't know the type or don't have any value */
+#define HWLOC_OBJ_TYPE_NONE ((hwloc_obj_type_t) -1)
+
+/**
+ * ffsl helpers.
+ */
+
+#if defined(HWLOC_HAVE_BROKEN_FFS)
+
+/* System has a broken ffs().
+ * We must check this before __GNUC__ or HWLOC_HAVE_FFSL.
+ */
+#    define HWLOC_NO_FFS
+
+#elif defined(__GNUC__)
+
+#  if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+     /* Starting from 3.4, gcc has a long variant.  */
+#    define hwloc_ffsl(x) __builtin_ffsl(x)
+#  else
+#    define hwloc_ffs(x) __builtin_ffs(x)
+#    define HWLOC_NEED_FFSL
+#  endif
+
+#elif defined(HWLOC_HAVE_FFSL)
+
+#  ifndef HWLOC_HAVE_DECL_FFSL
+extern int ffsl(long) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_ffsl(x) ffsl(x)
+
+#elif defined(HWLOC_HAVE_FFS)
+
+#  ifndef HWLOC_HAVE_DECL_FFS
+extern int ffs(int) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_ffs(x) ffs(x)
+#  define HWLOC_NEED_FFSL
+
+#else /* no ffs implementation */
+
+#    define HWLOC_NO_FFS
+
+#endif
+
+#ifdef HWLOC_NO_FFS
+
+/* no ffs or it is known to be broken */
+static __hwloc_inline int
+hwloc_ffsl_manual(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffsl_manual(unsigned long x)
+{
+	int i;
+
+	if (!x)
+		return 0;
+
+	i = 1;
+#if HWLOC_BITS_PER_LONG >= 64
+	if (!(x & 0xfffffffful)) {
+		x >>= 32;
+		i += 32;
+	}
+#endif
+	if (!(x & 0xffffu)) {
+		x >>= 16;
+		i += 16;
+	}
+	if (!(x & 0xff)) {
+		x >>= 8;
+		i += 8;
+	}
+	if (!(x & 0xf)) {
+		x >>= 4;
+		i += 4;
+	}
+	if (!(x & 0x3)) {
+		x >>= 2;
+		i += 2;
+	}
+	if (!(x & 0x1)) {
+		x >>= 1;
+		i += 1;
+	}
+
+	return i;
+}
+/* always define hwloc_ffsl as a macro, to avoid renaming breakage */
+#define hwloc_ffsl hwloc_ffsl_manual
+
+#elif defined(HWLOC_NEED_FFSL)
+
+/* We only have an int ffs(int) implementation, build a long one.  */
+
+/* First make it 32 bits if it was only 16.  */
+static __hwloc_inline int
+hwloc_ffs32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffs32(unsigned long x)
+{
+#if HWLOC_BITS_PER_INT == 16
+	int low_ffs, hi_ffs;
+
+	low_ffs = hwloc_ffs(x & 0xfffful);
+	if (low_ffs)
+		return low_ffs;
+
+	hi_ffs = hwloc_ffs(x >> 16);
+	if (hi_ffs)
+		return hi_ffs + 16;
+
+	return 0;
+#else
+	return hwloc_ffs(x);
+#endif
+}
+
+/* Then make it 64 bit if longs are.  */
+static __hwloc_inline int
+hwloc_ffsl_from_ffs32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffsl_from_ffs32(unsigned long x)
+{
+#if HWLOC_BITS_PER_LONG == 64
+	int low_ffs, hi_ffs;
+
+	low_ffs = hwloc_ffs32(x & 0xfffffffful);
+	if (low_ffs)
+		return low_ffs;
+
+	hi_ffs = hwloc_ffs32(x >> 32);
+	if (hi_ffs)
+		return hi_ffs + 32;
+
+	return 0;
+#else
+	return hwloc_ffs32(x);
+#endif
+}
+/* always define hwloc_ffsl as a macro, to avoid renaming breakage */
+#define hwloc_ffsl hwloc_ffsl_from_ffs32
+
+#endif
+
+/**
+ * flsl helpers.
+ */
+#ifdef __GNUC__
+
+#  if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+#    define hwloc_flsl(x) ((x) ? (8*sizeof(long) - __builtin_clzl(x)) : 0)
+#  else
+#    define hwloc_fls(x) ((x) ? (8*sizeof(int) - __builtin_clz(x)) : 0)
+#    define HWLOC_NEED_FLSL
+#  endif
+
+#elif defined(HWLOC_HAVE_FLSL)
+
+#  ifndef HWLOC_HAVE_DECL_FLSL
+extern int flsl(long) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_flsl(x) flsl(x)
+
+#elif defined(HWLOC_HAVE_CLZL)
+
+#  ifndef HWLOC_HAVE_DECL_CLZL
+extern int clzl(long) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_flsl(x) ((x) ? (8*sizeof(long) - clzl(x)) : 0)
+
+#elif defined(HWLOC_HAVE_FLS)
+
+#  ifndef HWLOC_HAVE_DECL_FLS
+extern int fls(int) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_fls(x) fls(x)
+#  define HWLOC_NEED_FLSL
+
+#elif defined(HWLOC_HAVE_CLZ)
+
+#  ifndef HWLOC_HAVE_DECL_CLZ
+extern int clz(int) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_fls(x) ((x) ? (8*sizeof(int) - clz(x)) : 0)
+#  define HWLOC_NEED_FLSL
+
+#else /* no fls implementation */
+
+static __hwloc_inline int
+hwloc_flsl_manual(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_flsl_manual(unsigned long x)
+{
+	int i = 0;
+
+	if (!x)
+		return 0;
+
+	i = 1;
+#if HWLOC_BITS_PER_LONG >= 64
+	if ((x & 0xffffffff00000000ul)) {
+		x >>= 32;
+		i += 32;
+	}
+#endif
+	if ((x & 0xffff0000u)) {
+		x >>= 16;
+		i += 16;
+	}
+	if ((x & 0xff00)) {
+		x >>= 8;
+		i += 8;
+	}
+	if ((x & 0xf0)) {
+		x >>= 4;
+		i += 4;
+	}
+	if ((x & 0xc)) {
+		x >>= 2;
+		i += 2;
+	}
+	if ((x & 0x2)) {
+		x >>= 1;
+		i += 1;
+	}
+
+	return i;
+}
+/* always define hwloc_flsl as a macro, to avoid renaming breakage */
+#define hwloc_flsl hwloc_flsl_manual
+
+#endif
+
+#ifdef HWLOC_NEED_FLSL
+
+/* We only have an int fls(int) implementation, build a long one.  */
+
+/* First make it 32 bits if it was only 16.  */
+static __hwloc_inline int
+hwloc_fls32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_fls32(unsigned long x)
+{
+#if HWLOC_BITS_PER_INT == 16
+	int low_fls, hi_fls;
+
+	hi_fls = hwloc_fls(x >> 16);
+	if (hi_fls)
+		return hi_fls + 16;
+
+	low_fls = hwloc_fls(x & 0xfffful);
+	if (low_fls)
+		return low_fls;
+
+	return 0;
+#else
+	return hwloc_fls(x);
+#endif
+}
+
+/* Then make it 64 bit if longs are.  */
+static __hwloc_inline int
+hwloc_flsl_from_fls32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_flsl_from_fls32(unsigned long x)
+{
+#if HWLOC_BITS_PER_LONG == 64
+	int low_fls, hi_fls;
+
+	hi_fls = hwloc_fls32(x >> 32);
+	if (hi_fls)
+		return hi_fls + 32;
+
+	low_fls = hwloc_fls32(x & 0xfffffffful);
+	if (low_fls)
+		return low_fls;
+
+	return 0;
+#else
+	return hwloc_fls32(x);
+#endif
+}
+/* always define hwloc_flsl as a macro, to avoid renaming breakage */
+#define hwloc_flsl hwloc_flsl_from_fls32
+
+#endif
+
+static __hwloc_inline int
+hwloc_weight_long(unsigned long w) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_weight_long(unsigned long w)
+{
+#if HWLOC_BITS_PER_LONG == 32
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
+	return __builtin_popcount(w);
+#else
+	unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
+	res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
+	res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
+	res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
+	return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
+#endif
+#else /* HWLOC_BITS_PER_LONG == 32 */
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
+	return __builtin_popcountll(w);
+#else
+	unsigned long res;
+	res = (w & 0x5555555555555555ul) + ((w >> 1) & 0x5555555555555555ul);
+	res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
+	res = (res & 0x0F0F0F0F0F0F0F0Ful) + ((res >> 4) & 0x0F0F0F0F0F0F0F0Ful);
+	res = (res & 0x00FF00FF00FF00FFul) + ((res >> 8) & 0x00FF00FF00FF00FFul);
+	res = (res & 0x0000FFFF0000FFFFul) + ((res >> 16) & 0x0000FFFF0000FFFFul);
+	return (res & 0x00000000FFFFFFFFul) + ((res >> 32) & 0x00000000FFFFFFFFul);
+#endif
+#endif /* HWLOC_BITS_PER_LONG == 64 */
+}
+
+#if !HAVE_DECL_STRTOULL && defined(HAVE_STRTOULL)
+unsigned long long int strtoull(const char *nptr, char **endptr, int base);
+#endif
+
+static __hwloc_inline int hwloc_strncasecmp(const char *s1, const char *s2, size_t n)
+{
+#ifdef HWLOC_HAVE_DECL_STRNCASECMP
+  return strncasecmp(s1, s2, n);
+#else
+  while (n) {
+    char c1 = tolower(*s1), c2 = tolower(*s2);
+    if (!c1 || !c2 || c1 != c2)
+      return c1-c2;
+    n--; s1++; s2++;
+  }
+  return 0;
+#endif
+}
+
+static __hwloc_inline hwloc_obj_type_t hwloc_cache_type_by_depth_type(unsigned depth, hwloc_obj_cache_type_t type)
+{
+  if (type == HWLOC_OBJ_CACHE_INSTRUCTION) {
+    if (depth >= 1 && depth <= 3)
+      return HWLOC_OBJ_L1ICACHE + depth-1;
+    else
+      return HWLOC_OBJ_TYPE_NONE;
+  } else {
+    if (depth >= 1 && depth <= 5)
+      return HWLOC_OBJ_L1CACHE + depth-1;
+    else
+      return HWLOC_OBJ_TYPE_NONE;
+  }
+}
+
+#define HWLOC_BITMAP_EQUAL 0       /* Bitmaps are equal */
+#define HWLOC_BITMAP_INCLUDED 1    /* First bitmap included in second */
+#define HWLOC_BITMAP_CONTAINS 2    /* First bitmap contains second */
+#define HWLOC_BITMAP_INTERSECTS 3  /* Bitmaps intersect without any inclusion */
+#define HWLOC_BITMAP_DIFFERENT  4  /* Bitmaps do not intersect */
+
+/* Compare bitmaps \p bitmap1 and \p bitmap2 from an inclusion point of view. */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_inclusion(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/* Return a stringified PCI class. */
+HWLOC_DECLSPEC extern const char * hwloc_pci_class_string(unsigned short class_id);
+
+/* Parse a PCI link speed (GT/s) string from Linux sysfs */
+#ifdef HWLOC_LINUX_SYS
+#include <stdlib.h> /* for atof() */
+static __hwloc_inline float
+hwloc_linux_pci_link_speed_from_string(const char *string)
+{
+  /* don't parse Gen1 with atof() since it expects a localized string
+   * while the kernel sysfs files aren't.
+   */
+  if (!strncmp(string, "2.5 ", 4))
+    /* "2.5 GT/s" is Gen1 with 8/10 encoding */
+    return 2.5 * .8;
+
+  /* also hardwire Gen2 since it also has a specific encoding */
+  if (!strncmp(string, "5 ", 2))
+    /* "5 GT/s" is Gen2 with 8/10 encoding */
+    return 5 * .8;
+
+  /* handle Gen3+ in a generic way */
+  return atof(string) * 128./130; /* Gen3+ encoding is 128/130 */
+}
+#endif
+
+/* Traverse children of a parent */
+#define for_each_child(child, parent) for(child = parent->first_child; child; child = child->next_sibling)
+#define for_each_memory_child(child, parent) for(child = parent->memory_first_child; child; child = child->next_sibling)
+#define for_each_io_child(child, parent) for(child = parent->io_first_child; child; child = child->next_sibling)
+#define for_each_misc_child(child, parent) for(child = parent->misc_first_child; child; child = child->next_sibling)
+
+/* Any object attached to normal children */
+static __hwloc_inline int hwloc__obj_type_is_normal (hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return type <= HWLOC_OBJ_GROUP;
+}
+
+/* Any object attached to memory children, currently only NUMA nodes */
+static __hwloc_inline int hwloc__obj_type_is_memory (hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return type == HWLOC_OBJ_NUMANODE;
+}
+
+/* I/O or Misc object, without cpusets or nodesets. */
+static __hwloc_inline int hwloc__obj_type_is_special (hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return type >= HWLOC_OBJ_BRIDGE && type <= HWLOC_OBJ_MISC;
+}
+
+/* Any object attached to io children */
+static __hwloc_inline int hwloc__obj_type_is_io (hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return type >= HWLOC_OBJ_BRIDGE && type <= HWLOC_OBJ_OS_DEVICE;
+}
+
+static __hwloc_inline int
+hwloc__obj_type_is_cache(hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return (type >= HWLOC_OBJ_L1CACHE && type <= HWLOC_OBJ_L3ICACHE);
+}
+
+static __hwloc_inline int
+hwloc__obj_type_is_dcache(hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return (type >= HWLOC_OBJ_L1CACHE && type <= HWLOC_OBJ_L5CACHE);
+}
+
+/** \brief Check whether an object is a Instruction Cache. */
+static __hwloc_inline int
+hwloc__obj_type_is_icache(hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return (type >= HWLOC_OBJ_L1ICACHE && type <= HWLOC_OBJ_L3ICACHE);
+}
+
+#ifdef HAVE_USELOCALE
+#include "locale.h"
+#ifdef HAVE_XLOCALE_H
+#include "xlocale.h"
+#endif
+#define hwloc_localeswitch_declare locale_t __old_locale = (locale_t)0, __new_locale
+#define hwloc_localeswitch_init() do {                     \
+  __new_locale = newlocale(LC_ALL_MASK, "C", (locale_t)0); \
+  if (__new_locale != (locale_t)0)                         \
+    __old_locale = uselocale(__new_locale);                \
+} while (0)
+#define hwloc_localeswitch_fini() do { \
+  if (__new_locale != (locale_t)0) {   \
+    uselocale(__old_locale);           \
+    freelocale(__new_locale);          \
+  }                                    \
+} while(0)
+#else /* HAVE_USELOCALE */
+#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+#define hwloc_localeswitch_declare int __dummy_nolocale __hwloc_attribute_unused
+#define hwloc_localeswitch_init()
+#else
+#define hwloc_localeswitch_declare int __dummy_nolocale
+#define hwloc_localeswitch_init() (void)__dummy_nolocale
+#endif
+#define hwloc_localeswitch_fini()
+#endif /* HAVE_USELOCALE */
+
+#if !HAVE_DECL_FABSF
+#define fabsf(f) fabs((double)(f))
+#endif
+
+#if !HAVE_DECL_MODFF
+#define modff(x,iptr) (float)modf((double)x,(double *)iptr)
+#endif
+
+#if HAVE_DECL__SC_PAGE_SIZE
+#define hwloc_getpagesize() sysconf(_SC_PAGE_SIZE)
+#elif HAVE_DECL__SC_PAGESIZE
+#define hwloc_getpagesize() sysconf(_SC_PAGESIZE)
+#elif defined HAVE_GETPAGESIZE
+#define hwloc_getpagesize() getpagesize()
+#else
+#undef hwloc_getpagesize
+#endif
+
+#if HWLOC_HAVE_ATTRIBUTE_FORMAT
+#  define __hwloc_attribute_format(type, str, arg)  __attribute__((__format__(type, str, arg)))
+#else
+#  define __hwloc_attribute_format(type, str, arg)
+#endif
+
+#define hwloc_memory_size_printf_value(_size, _verbose) \
+  ((_size) < (10ULL<<20) || (_verbose) ? (((_size)>>9)+1)>>1 : (_size) < (10ULL<<30) ? (((_size)>>19)+1)>>1 : (_size) < (10ULL<<40) ? (((_size)>>29)+1)>>1 : (((_size)>>39)+1)>>1)
+#define hwloc_memory_size_printf_unit(_size, _verbose) \
+  ((_size) < (10ULL<<20) || (_verbose) ? "KB" : (_size) < (10ULL<<30) ? "MB" : (_size) < (10ULL<<40) ? "GB" : "TB")
+
+#ifdef HWLOC_WIN_SYS
+#  ifndef HAVE_SSIZE_T
+typedef SSIZE_T ssize_t;
+#  endif
+#  if !HAVE_DECL_STRTOULL && !defined(HAVE_STRTOULL)
+#    define strtoull _strtoui64
+#  endif
+#  ifndef S_ISREG
+#    define S_ISREG(m) ((m) & S_IFREG)
+#  endif
+#  ifndef S_ISDIR
+#    define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR)
+#  endif
+#  ifndef S_IRWXU
+#    define S_IRWXU 00700
+#  endif
+#  ifndef HWLOC_HAVE_DECL_STRCASECMP
+#    define strcasecmp _stricmp
+#  endif
+#  if !HAVE_DECL_SNPRINTF
+#    define snprintf _snprintf
+#  endif
+#  if HAVE_DECL__STRDUP
+#    define strdup _strdup
+#  endif
+#  if HAVE_DECL__PUTENV
+#    define putenv _putenv
+#  endif
+#endif
+
+#if defined HWLOC_WIN_SYS && !defined __MINGW32__ && !defined(__CYGWIN__)
+/* MSVC doesn't support C99 variable-length array */
+#include <malloc.h>
+#define HWLOC_VLA(_type, _name, _nb) _type *_name = (_type*) _alloca((_nb)*sizeof(_type))
+#else
+#define HWLOC_VLA(_type, _name, _nb) _type _name[_nb]
+#endif
+
+#endif /* HWLOC_PRIVATE_MISC_H */
diff --git a/src/3rdparty/hwloc/include/private/netloc.h b/src/3rdparty/hwloc/include/private/netloc.h
new file mode 100644
index 000000000..c070c54cc
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/netloc.h
@@ -0,0 +1,578 @@
+/*
+ * Copyright © 2014 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2013-2014 University of Wisconsin-La Crosse.
+ *                         All rights reserved.
+ * Copyright © 2015-2017 Inria.  All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ * See COPYING in top-level directory.
+ *
+ * $HEADER$
+ */
+
+#ifndef _NETLOC_PRIVATE_H_
+#define _NETLOC_PRIVATE_H_
+
+#include <hwloc.h>
+#include <netloc.h>
+#include <netloc/uthash.h>
+#include <netloc/utarray.h>
+#include <private/autogen/config.h>
+
+#define NETLOCFILE_VERSION 1
+
+#ifdef NETLOC_SCOTCH
+#include <stdint.h>
+#include <scotch.h>
+#define NETLOC_int SCOTCH_Num
+#else
+#define NETLOC_int int
+#endif
+
+/*
+ * "Import" a few things from hwloc
+ */
+#define __netloc_attribute_unused __hwloc_attribute_unused
+#define __netloc_attribute_malloc __hwloc_attribute_malloc
+#define __netloc_attribute_const __hwloc_attribute_const
+#define __netloc_attribute_pure __hwloc_attribute_pure
+#define __netloc_attribute_deprecated __hwloc_attribute_deprecated
+#define __netloc_attribute_may_alias __hwloc_attribute_may_alias
+#define NETLOC_DECLSPEC HWLOC_DECLSPEC
+
+
+/**********************************************************************
+ * Types
+ **********************************************************************/
+
+/**
+ * Definitions for Comparators
+ * \sa These are the return values from the following functions:
+ *     netloc_network_compare, netloc_dt_edge_t_compare, netloc_dt_node_t_compare
+ */
+typedef enum {
+    NETLOC_CMP_SAME    =  0,  /**< Compared as the Same */
+    NETLOC_CMP_SIMILAR = -1,  /**< Compared as Similar, but not the Same */
+    NETLOC_CMP_DIFF    = -2   /**< Compared as Different */
+} netloc_compare_type_t;
+
+/**
+ * Enumerated type for the various types of supported networks
+ */
+typedef enum {
+    NETLOC_NETWORK_TYPE_ETHERNET    = 1, /**< Ethernet network */
+    NETLOC_NETWORK_TYPE_INFINIBAND  = 2, /**< InfiniBand network */
+    NETLOC_NETWORK_TYPE_INVALID     = 3  /**< Invalid network */
+} netloc_network_type_t;
+
+/**
+ * Enumerated type for the various types of supported topologies
+ */
+typedef enum {
+    NETLOC_TOPOLOGY_TYPE_INVALID = -1, /**< Invalid */
+    NETLOC_TOPOLOGY_TYPE_TREE    = 1,  /**< Tree */
+} netloc_topology_type_t;
+
+/**
+ * Enumerated type for the various types of nodes
+ */
+typedef enum {
+    NETLOC_NODE_TYPE_HOST    = 0, /**< Host (a.k.a., network addressable endpoint - e.g., MAC Address) node */
+    NETLOC_NODE_TYPE_SWITCH  = 1, /**< Switch node */
+    NETLOC_NODE_TYPE_INVALID = 2  /**< Invalid node */
+} netloc_node_type_t;
+
+typedef enum {
+    NETLOC_ARCH_TREE    =  0,  /* Fat tree */
+} netloc_arch_type_t;
+
+
+/* Pre declarations to avoid inter dependency problems */
+/** \cond IGNORE */
+struct netloc_topology_t;
+typedef struct netloc_topology_t netloc_topology_t;
+struct netloc_node_t;
+typedef struct netloc_node_t netloc_node_t;
+struct netloc_edge_t;
+typedef struct netloc_edge_t netloc_edge_t;
+struct netloc_physical_link_t;
+typedef struct netloc_physical_link_t netloc_physical_link_t;
+struct netloc_path_t;
+typedef struct netloc_path_t netloc_path_t;
+
+struct netloc_arch_tree_t;
+typedef struct netloc_arch_tree_t netloc_arch_tree_t;
+struct netloc_arch_node_t;
+typedef struct netloc_arch_node_t netloc_arch_node_t;
+struct netloc_arch_node_slot_t;
+typedef struct netloc_arch_node_slot_t netloc_arch_node_slot_t;
+struct netloc_arch_t;
+typedef struct netloc_arch_t netloc_arch_t;
+/** \endcond */
+
+/**
+ * \struct netloc_topology_t
+ * \brief Netloc Topology Context
+ *
+ * An opaque data structure used to reference a network topology.
+ *
+ * \note Must be initialized with \ref netloc_topology_construct()
+ */
+struct netloc_topology_t {
+    /** Topology path */
+    char *topopath;
+    /** Subnet ID */
+    char *subnet_id;
+
+    /** Node List */
+    netloc_node_t *nodes; /* Hash table of nodes by physical_id */
+    netloc_node_t *nodesByHostname; /* Hash table of nodes by hostname */
+
+    netloc_physical_link_t *physical_links; /* Hash table with physcial links */
+
+    /** Partition List */
+    UT_array *partitions;
+
+    /** Hwloc topology List */
+    char *hwlocpath;
+    UT_array *topos;
+    hwloc_topology_t *hwloc_topos;
+
+    /** Type of the graph */
+    netloc_topology_type_t type;
+};
+
+/**
+ * \brief Netloc Node Type
+ *
+ * Represents the concept of a node (a.k.a., vertex, endpoint) within a network
+ * graph. This could be a server or a network switch. The \ref node_type parameter
+ * will distinguish the exact type of node this represents in the graph.
+ */
+struct netloc_node_t {
+    UT_hash_handle hh;       /* makes this structure hashable with physical_id */
+    UT_hash_handle hh2;      /* makes this structure hashable with hostname */
+
+    /** Physical ID of the node */
+    char physical_id[20];
+
+    /** Logical ID of the node (if any) */
+    int logical_id;
+
+    /** Type of the node */
+    netloc_node_type_t type;
+
+    /* Pointer to physical_links */
+    UT_array *physical_links;
+
+    /** Description information from discovery (if any) */
+    char *description;
+
+    /**
+     * Application-given private data pointer.
+     * Initialized to NULL, and not used by the netloc library.
+     */
+    void * userdata;
+
+    /** Outgoing edges from this node */
+    netloc_edge_t *edges;
+
+    UT_array *subnodes; /* the group of nodes for the virtual nodes */
+
+    netloc_path_t *paths;
+
+    char *hostname;
+
+    UT_array *partitions; /* index in the list from the topology */
+
+    hwloc_topology_t hwlocTopo;
+    int hwlocTopoIdx;
+};
+
+/**
+ * \brief Netloc Edge Type
+ *
+ * Represents the concept of a directed edge within a network graph.
+ *
+ * \note We do not point to the netloc_node_t structure directly to
+ * simplify the representation, and allow the information to more easily
+ * be entered into the data store without circular references.
+ * \todo JJH Is the note above still true?
+ */
+struct netloc_edge_t {
+    UT_hash_handle hh;       /* makes this structure hashable */
+
+    netloc_node_t *dest;
+
+    int id;
+
+    /** Pointers to the parent node */
+    netloc_node_t *node;
+
+    /* Pointer to physical_links */
+    UT_array *physical_links;
+
+    /** total gbits of the links */
+    float total_gbits;
+
+    UT_array *partitions; /* index in the list from the topology */
+
+    UT_array *subnode_edges; /* for edges going to virtual nodes */
+
+    struct netloc_edge_t *other_way;
+
+    /**
+     * Application-given private data pointer.
+     * Initialized to NULL, and not used by the netloc library.
+     */
+    void * userdata;
+};
+
+
+struct netloc_physical_link_t {
+    UT_hash_handle hh;       /* makes this structure hashable */
+
+    int id; // TODO long long
+    netloc_node_t *src;
+    netloc_node_t *dest;
+    int ports[2];
+    char *width;
+    char *speed;
+
+    netloc_edge_t *edge;
+
+    int other_way_id;
+    struct netloc_physical_link_t *other_way;
+
+    UT_array *partitions; /* index in the list from the topology */
+
+    /** gbits of the link from speed and width */
+    float gbits;
+
+    /** Description information from discovery (if any) */
+    char *description;
+};
+
+struct netloc_path_t {
+    UT_hash_handle hh;       /* makes this structure hashable */
+    char dest_id[20];
+    UT_array *links;
+};
+
+
+/**********************************************************************
+ *        Architecture structures
+ **********************************************************************/
+struct netloc_arch_tree_t {
+    NETLOC_int num_levels;
+    NETLOC_int *degrees;
+    NETLOC_int *cost;
+};
+
+struct netloc_arch_node_t {
+    UT_hash_handle hh;       /* makes this structure hashable */
+    char *name; /* Hash key */
+    netloc_node_t *node; /* Corresponding node */
+    int idx_in_topo; /* idx with ghost hosts to have complete topo */
+    int num_slots; /* it is not the real number of slots but the maximum slot idx */
+    int *slot_idx; /* corresponding idx in slot_tree */
+    int *slot_os_idx; /* corresponding os index for each leaf in tree */
+    netloc_arch_tree_t *slot_tree; /* Tree built from hwloc */
+    int num_current_slots; /* Number of PUs */
+    NETLOC_int *current_slots; /* indices in the complete tree */
+    int *slot_ranks; /* corresponding MPI rank for each leaf in tree */
+};
+
+struct netloc_arch_node_slot_t {
+    netloc_arch_node_t *node;
+    int slot;
+};
+
+struct netloc_arch_t {
+    netloc_topology_t *topology;
+    int has_slots; /* if slots are included in the architecture */
+    netloc_arch_type_t type;
+    union {
+        netloc_arch_tree_t *node_tree;
+        netloc_arch_tree_t *global_tree;
+    } arch;
+    netloc_arch_node_t *nodes_by_name;
+    netloc_arch_node_slot_t *node_slot_by_idx; /* node_slot by index in complete topo */
+    NETLOC_int num_current_hosts; /* if has_slots, host is a slot, else host is a node */
+    NETLOC_int *current_hosts; /* indices in the complete topology */
+};
+
+/**********************************************************************
+ * Topology Functions
+ **********************************************************************/
+/**
+ * Allocate a topology handle.
+ *
+ * User is responsible for calling \ref netloc_topology_destruct on the topology handle.
+ * The network parameter information is deep copied into the topology handle, so the
+ * user may destruct the network handle after calling this function and/or reuse
+ * the network handle.
+ *
+ * \returns A newly allocated topology handle on success
+ * \returns NULL upon an error.
+ */
+netloc_topology_t *netloc_topology_construct(char *path);
+
+/**
+ * Destruct a topology handle
+ *
+ * \param topology A valid pointer to a \ref netloc_topology_t handle created
+ * from a prior call to \ref netloc_topology_construct.
+ *
+ * \returns NETLOC_SUCCESS on success
+ * \returns NETLOC_ERROR upon an error.
+ */
+int netloc_topology_destruct(netloc_topology_t *topology);
+
+int netloc_topology_find_partition_idx(netloc_topology_t *topology, char *partition_name);
+
+int netloc_topology_read_hwloc(netloc_topology_t *topology, int num_nodes,
+        netloc_node_t **node_list);
+
+#define netloc_topology_iter_partitions(topology,partition) \
+    for ((partition) = (char **)utarray_front(topology->partitions); \
+            (partition) != NULL; \
+            (partition) = (char **)utarray_next(topology->partitions, partition))
+
+#define netloc_topology_iter_hwloctopos(topology,hwloctopo) \
+    for ((hwloctopo) = (char **)utarray_front(topology->topos); \
+            (hwloctopo) != NULL; \
+            (hwloctopo) = (char **)utarray_next(topology->topos, hwloctopo))
+
+#define netloc_topology_find_node(topology,node_id,node) \
+    HASH_FIND_STR(topology->nodes, node_id, node)
+
+#define netloc_topology_iter_nodes(topology,node,_tmp) \
+    HASH_ITER(hh, topology->nodes, node, _tmp)
+
+#define netloc_topology_num_nodes(topology) \
+    HASH_COUNT(topology->nodes)
+
+/*************************************************/
+
+
+/**
+ * Constructor for netloc_node_t
+ *
+ * User is responsible for calling the destructor on the handle.
+ *
+ * Returns
+ *   A newly allocated pointer to the network information.
+ */
+netloc_node_t *netloc_node_construct(void);
+
+/**
+ * Destructor for netloc_node_t
+ *
+ * \param node A valid node handle
+ *
+ * Returns
+ *   NETLOC_SUCCESS on success
+ *   NETLOC_ERROR on error
+ */
+int netloc_node_destruct(netloc_node_t *node);
+
+char *netloc_node_pretty_print(netloc_node_t* node);
+
+#define netloc_node_get_num_subnodes(node) \
+    utarray_len((node)->subnodes)
+
+#define netloc_node_get_subnode(node,i) \
+    (*(netloc_node_t **)utarray_eltptr((node)->subnodes, (i)))
+
+#define netloc_node_get_num_edges(node) \
+    utarray_len((node)->edges)
+
+#define netloc_node_get_edge(node,i) \
+    (*(netloc_edge_t **)utarray_eltptr((node)->edges, (i)))
+
+#define netloc_node_iter_edges(node,edge,_tmp) \
+    HASH_ITER(hh, node->edges, edge, _tmp)
+
+#define netloc_node_iter_paths(node,path,_tmp) \
+    HASH_ITER(hh, node->paths, path, _tmp)
+
+#define netloc_node_is_host(node) \
+    (node->type == NETLOC_NODE_TYPE_HOST)
+
+#define netloc_node_is_switch(node) \
+    (node->type == NETLOC_NODE_TYPE_SWITCH)
+
+#define netloc_node_iter_paths(node, path,_tmp) \
+    HASH_ITER(hh, node->paths, path, _tmp)
+
+int netloc_node_is_in_partition(netloc_node_t *node, int partition);
+
+/*************************************************/
+
+
+/**
+ * Constructor for netloc_edge_t
+ *
+ * User is responsible for calling the destructor on the handle.
+ *
+ * Returns
+ *   A newly allocated pointer to the edge information.
+ */
+netloc_edge_t *netloc_edge_construct(void);
+
+/**
+ * Destructor for netloc_edge_t
+ *
+ * \param edge A valid edge handle
+ *
+ * Returns
+ *   NETLOC_SUCCESS on success
+ *   NETLOC_ERROR on error
+ */
+int netloc_edge_destruct(netloc_edge_t *edge);
+
+char * netloc_edge_pretty_print(netloc_edge_t* edge);
+
+void netloc_edge_reset_uid(void);
+
+int netloc_edge_is_in_partition(netloc_edge_t *edge, int partition);
+
+#define netloc_edge_get_num_links(edge) \
+    utarray_len((edge)->physical_links)
+
+#define netloc_edge_get_link(edge,i) \
+    (*(netloc_physical_link_t **)utarray_eltptr((edge)->physical_links, (i)))
+
+#define netloc_edge_get_num_subedges(edge) \
+    utarray_len((edge)->subnode_edges)
+
+#define netloc_edge_get_subedge(edge,i) \
+    (*(netloc_edge_t **)utarray_eltptr((edge)->subnode_edges, (i)))
+
+/*************************************************/
+
+
+/**
+ * Constructor for netloc_physical_link_t
+ *
+ * User is responsible for calling the destructor on the handle.
+ *
+ * Returns
+ *   A newly allocated pointer to the physical link information.
+ */
+netloc_physical_link_t * netloc_physical_link_construct(void);
+
+/**
+ * Destructor for netloc_physical_link_t
+ *
+ * Returns
+ *   NETLOC_SUCCESS on success
+ *   NETLOC_ERROR on error
+ */
+int netloc_physical_link_destruct(netloc_physical_link_t *link);
+
+char * netloc_link_pretty_print(netloc_physical_link_t* link);
+
+/*************************************************/
+
+
+netloc_path_t *netloc_path_construct(void);
+int netloc_path_destruct(netloc_path_t *path);
+
+
+/**********************************************************************
+ *        Architecture functions
+ **********************************************************************/
+
+netloc_arch_t * netloc_arch_construct(void);
+
+int netloc_arch_destruct(netloc_arch_t *arch);
+
+int netloc_arch_build(netloc_arch_t *arch, int add_slots);
+
+int netloc_arch_set_current_resources(netloc_arch_t *arch);
+
+int netloc_arch_set_global_resources(netloc_arch_t *arch);
+
+int netloc_arch_node_get_hwloc_info(netloc_arch_node_t *arch);
+
+void netloc_arch_tree_complete(netloc_arch_tree_t *tree, UT_array **down_degrees_by_level,
+        int num_hosts, int **parch_idx);
+
+NETLOC_int netloc_arch_tree_num_leaves(netloc_arch_tree_t *tree);
+
+
+/**********************************************************************
+ *        Access functions of various elements of the topology
+ **********************************************************************/
+
+#define netloc_get_num_partitions(object) \
+    utarray_len((object)->partitions)
+
+#define netloc_get_partition(object,i) \
+    (*(int *)utarray_eltptr((object)->partitions, (i)))
+
+
+#define netloc_path_iter_links(path,link) \
+    for ((link) = (netloc_physical_link_t **)utarray_front(path->links); \
+            (link) != NULL; \
+            (link) = (netloc_physical_link_t **)utarray_next(path->links, link))
+
+/**********************************************************************
+ *        Misc functions
+ **********************************************************************/
+
+/**
+ * Decode the network type
+ *
+ * \param net_type A valid member of the \ref netloc_network_type_t type
+ *
+ * \returns NULL if the type is invalid
+ * \returns A string for that \ref netloc_network_type_t type
+ */
+static inline const char * netloc_network_type_decode(netloc_network_type_t net_type) {
+    if( NETLOC_NETWORK_TYPE_ETHERNET == net_type ) {
+        return "ETH";
+    }
+    else if( NETLOC_NETWORK_TYPE_INFINIBAND == net_type ) {
+        return "IB";
+    }
+    else {
+        return NULL;
+    }
+}
+
+/**
+ * Decode the node type
+ *
+ * \param node_type A valid member of the \ref netloc_node_type_t type
+ *
+ * \returns NULL if the type is invalid
+ * \returns A string for that \ref netloc_node_type_t type
+ */
+static inline const char * netloc_node_type_decode(netloc_node_type_t node_type) {
+    if( NETLOC_NODE_TYPE_SWITCH == node_type ) {
+        return "SW";
+    }
+    else if( NETLOC_NODE_TYPE_HOST == node_type ) {
+        return "CA";
+    }
+    else {
+        return NULL;
+    }
+}
+
+ssize_t netloc_line_get(char **lineptr, size_t *n, FILE *stream);
+
+char *netloc_line_get_next_token(char **string, char c);
+
+int netloc_build_comm_mat(char *filename, int *pn, double ***pmat);
+
+#define STRDUP_IF_NOT_NULL(str) (NULL == str ? NULL : strdup(str))
+#define STR_EMPTY_IF_NULL(str) (NULL == str ? "" : str)
+
+
+#endif // _NETLOC_PRIVATE_H_
diff --git a/src/3rdparty/hwloc/include/private/private.h b/src/3rdparty/hwloc/include/private/private.h
new file mode 100644
index 000000000..8e3964ab2
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/private.h
@@ -0,0 +1,417 @@
+/*
+ * Copyright © 2009      CNRS
+ * Copyright © 2009-2019 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internal types and helpers. */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (many functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef HWLOC_PRIVATE_H
+#define HWLOC_PRIVATE_H
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/bitmap.h>
+#include <private/components.h>
+#include <private/misc.h>
+#include <sys/types.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#ifdef HAVE_SYS_UTSNAME_H
+#include <sys/utsname.h>
+#endif
+#include <string.h>
+
+#define HWLOC_TOPOLOGY_ABI 0x20000 /* version of the layout of struct topology */
+
+/*****************************************************
+ * WARNING:
+ * changes below in this structure (and its children)
+ * should cause a bump of HWLOC_TOPOLOGY_ABI.
+ *****************************************************/
+
+struct hwloc_topology {
+  unsigned topology_abi;
+
+  unsigned nb_levels;					/* Number of horizontal levels */
+  unsigned nb_levels_allocated;				/* Number of levels allocated and zeroed in level_nbobjects and levels below */
+  unsigned *level_nbobjects; 				/* Number of objects on each horizontal level */
+  struct hwloc_obj ***levels;				/* Direct access to levels, levels[l = 0 .. nblevels-1][0..level_nbobjects[l]] */
+  unsigned long flags;
+  int type_depth[HWLOC_OBJ_TYPE_MAX];
+  enum hwloc_type_filter_e type_filter[HWLOC_OBJ_TYPE_MAX];
+  int is_thissystem;
+  int is_loaded;
+  int modified;                                         /* >0 if objects were added/removed recently, which means a reconnect is needed */
+  hwloc_pid_t pid;                                      /* Process ID the topology is view from, 0 for self */
+  void *userdata;
+  uint64_t next_gp_index;
+
+  void *adopted_shmem_addr;
+  size_t adopted_shmem_length;
+
+#define HWLOC_NR_SLEVELS 5
+#define HWLOC_SLEVEL_NUMANODE 0
+#define HWLOC_SLEVEL_BRIDGE 1
+#define HWLOC_SLEVEL_PCIDEV 2
+#define HWLOC_SLEVEL_OSDEV 3
+#define HWLOC_SLEVEL_MISC 4
+  /* order must match negative depth, it's asserted in setup_defaults() */
+#define HWLOC_SLEVEL_FROM_DEPTH(x) (HWLOC_TYPE_DEPTH_NUMANODE-(x))
+#define HWLOC_SLEVEL_TO_DEPTH(x) (HWLOC_TYPE_DEPTH_NUMANODE-(x))
+  struct hwloc_special_level_s {
+    unsigned nbobjs;
+    struct hwloc_obj **objs;
+    struct hwloc_obj *first, *last; /* Temporarily used while listing object before building the objs array */
+  } slevels[HWLOC_NR_SLEVELS];
+
+  hwloc_bitmap_t allowed_cpuset;
+  hwloc_bitmap_t allowed_nodeset;
+
+  struct hwloc_binding_hooks {
+    int (*set_thisproc_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+    int (*get_thisproc_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*set_thisthread_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+    int (*get_thisthread_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*set_proc_cpubind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
+    int (*get_proc_cpubind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+#ifdef hwloc_thread_t
+    int (*set_thread_cpubind)(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_const_cpuset_t set, int flags);
+    int (*get_thread_cpubind)(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_cpuset_t set, int flags);
+#endif
+
+    int (*get_thisproc_last_cpu_location)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*get_thisthread_last_cpu_location)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*get_proc_last_cpu_location)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+    int (*set_thisproc_membind)(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_thisproc_membind)(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*set_thisthread_membind)(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_thisthread_membind)(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*set_proc_membind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_proc_membind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*set_area_membind)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_area_membind)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*get_area_memlocation)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags);
+    /* This has to return the same kind of pointer as alloc_membind, so that free_membind can be used on it */
+    void *(*alloc)(hwloc_topology_t topology, size_t len);
+    /* alloc_membind has to always succeed if !(flags & HWLOC_MEMBIND_STRICT).
+     * see hwloc_alloc_or_fail which is convenient for that.  */
+    void *(*alloc_membind)(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*free_membind)(hwloc_topology_t topology, void *addr, size_t len);
+
+    int (*get_allowed_resources)(hwloc_topology_t topology);
+  } binding_hooks;
+
+  struct hwloc_topology_support support;
+
+  void (*userdata_export_cb)(void *reserved, struct hwloc_topology *topology, struct hwloc_obj *obj);
+  void (*userdata_import_cb)(struct hwloc_topology *topology, struct hwloc_obj *obj, const char *name, const void *buffer, size_t length);
+  int userdata_not_decoded;
+
+  struct hwloc_internal_distances_s {
+    hwloc_obj_type_t type;
+    /* add union hwloc_obj_attr_u if we ever support groups */
+    unsigned nbobjs;
+    uint64_t *indexes; /* array of OS or GP indexes before we can convert them into objs. */
+    uint64_t *values; /* distance matrices, ordered according to the above indexes/objs array.
+		       * distance from i to j is stored in slot i*nbnodes+j.
+		       */
+    unsigned long kind;
+
+    /* objects are currently stored in physical_index order */
+    hwloc_obj_t *objs; /* array of objects */
+    int objs_are_valid; /* set to 1 if the array objs is still valid, 0 if needs refresh */
+
+    unsigned id; /* to match the container id field of public distances structure */
+    struct hwloc_internal_distances_s *prev, *next;
+  } *first_dist, *last_dist;
+  unsigned next_dist_id;
+
+  int grouping;
+  int grouping_verbose;
+  unsigned grouping_nbaccuracies;
+  float grouping_accuracies[5];
+  unsigned grouping_next_subkind;
+
+  /* list of enabled backends. */
+  struct hwloc_backend * backends;
+  struct hwloc_backend * get_pci_busid_cpuset_backend;
+  unsigned backend_excludes;
+
+  /* memory allocator for topology objects */
+  struct hwloc_tma * tma;
+
+/*****************************************************
+ * WARNING:
+ * changes above in this structure (and its children)
+ * should cause a bump of HWLOC_TOPOLOGY_ABI.
+ *****************************************************/
+
+  /*
+   * temporary variables during discovery
+   */
+
+  /* machine-wide memory.
+   * temporarily stored there by OSes that only provide this without NUMA information,
+   * and actually used later by the core.
+   */
+  struct hwloc_numanode_attr_s machine_memory;
+
+  /* pci stuff */
+  int need_pci_belowroot_apply_locality;
+  int pci_has_forced_locality;
+  unsigned pci_forced_locality_nr;
+  struct hwloc_pci_forced_locality_s {
+    unsigned domain;
+    unsigned bus_first, bus_last;
+    hwloc_bitmap_t cpuset;
+  } * pci_forced_locality;
+
+};
+
+extern void hwloc_alloc_root_sets(hwloc_obj_t root);
+extern void hwloc_setup_pu_level(struct hwloc_topology *topology, unsigned nb_pus);
+extern int hwloc_get_sysctlbyname(const char *name, int64_t *n);
+extern int hwloc_get_sysctl(int name[], unsigned namelen, int *n);
+extern int hwloc_fallback_nbprocessors(struct hwloc_topology *topology);
+
+extern int hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2);
+extern void hwloc__reorder_children(hwloc_obj_t parent);
+
+extern void hwloc_topology_setup_defaults(struct hwloc_topology *topology);
+extern void hwloc_topology_clear(struct hwloc_topology *topology);
+
+/* insert memory object as memory child of normal parent */
+extern struct hwloc_obj * hwloc__attach_memory_object(struct hwloc_topology *topology, hwloc_obj_t parent,
+						      hwloc_obj_t obj,
+						      hwloc_report_error_t report_error);
+
+extern void hwloc_pci_discovery_init(struct hwloc_topology *topology);
+extern void hwloc_pci_discovery_prepare(struct hwloc_topology *topology);
+extern void hwloc_pci_discovery_exit(struct hwloc_topology *topology);
+
+/* Look for an object matching complete cpuset exactly, or insert one.
+ * Return NULL on failure.
+ * Return a good fallback (object above) on failure to insert.
+ */
+extern hwloc_obj_t hwloc_find_insert_io_parent_by_complete_cpuset(struct hwloc_topology *topology, hwloc_cpuset_t cpuset);
+
+/* Move PCI objects currently attached to the root object to their actual location.
+ * Called by the core at the end of hwloc_topology_load().
+ * Prior to this call, all PCI objects may be found below the root object.
+ * After this call and a reconnect of levels, all PCI objects are available through levels.
+ */
+extern int hwloc_pci_belowroot_apply_locality(struct hwloc_topology *topology);
+
+extern int hwloc__add_info(struct hwloc_info_s **infosp, unsigned *countp, const char *name, const char *value);
+extern int hwloc__add_info_nodup(struct hwloc_info_s **infosp, unsigned *countp, const char *name, const char *value, int replace);
+extern int hwloc__move_infos(struct hwloc_info_s **dst_infosp, unsigned *dst_countp, struct hwloc_info_s **src_infosp, unsigned *src_countp);
+extern void hwloc__free_infos(struct hwloc_info_s *infos, unsigned count);
+
+/* set native OS binding hooks */
+extern void hwloc_set_native_binding_hooks(struct hwloc_binding_hooks *hooks, struct hwloc_topology_support *support);
+/* set either native OS binding hooks (if thissystem), or dummy ones */
+extern void hwloc_set_binding_hooks(struct hwloc_topology *topology);
+
+#if defined(HWLOC_LINUX_SYS)
+extern void hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_LINUX_SYS */
+
+#if defined(HWLOC_BGQ_SYS)
+extern void hwloc_set_bgq_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_BGQ_SYS */
+
+#ifdef HWLOC_SOLARIS_SYS
+extern void hwloc_set_solaris_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_SOLARIS_SYS */
+
+#ifdef HWLOC_AIX_SYS
+extern void hwloc_set_aix_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_AIX_SYS */
+
+#ifdef HWLOC_WIN_SYS
+extern void hwloc_set_windows_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_WIN_SYS */
+
+#ifdef HWLOC_DARWIN_SYS
+extern void hwloc_set_darwin_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_DARWIN_SYS */
+
+#ifdef HWLOC_FREEBSD_SYS
+extern void hwloc_set_freebsd_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_FREEBSD_SYS */
+
+#ifdef HWLOC_NETBSD_SYS
+extern void hwloc_set_netbsd_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_NETBSD_SYS */
+
+#ifdef HWLOC_HPUX_SYS
+extern void hwloc_set_hpux_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_HPUX_SYS */
+
+extern int hwloc_look_hardwired_fujitsu_k(struct hwloc_topology *topology);
+extern int hwloc_look_hardwired_fujitsu_fx10(struct hwloc_topology *topology);
+extern int hwloc_look_hardwired_fujitsu_fx100(struct hwloc_topology *topology);
+
+/* Insert uname-specific names/values in the object infos array.
+ * If cached_uname isn't NULL, it is used as a struct utsname instead of recalling uname.
+ * Any field that starts with \0 is ignored.
+ */
+extern void hwloc_add_uname_info(struct hwloc_topology *topology, void *cached_uname);
+
+/* Free obj and its attributes assuming it's not linked to a parent and doesn't have any child */
+extern void hwloc_free_unlinked_object(hwloc_obj_t obj);
+
+/* Free obj and its children, assuming it's not linked to a parent */
+extern void hwloc_free_object_and_children(hwloc_obj_t obj);
+
+/* Free obj, its next siblings, and their children, assuming they're not linked to a parent */
+extern void hwloc_free_object_siblings_and_children(hwloc_obj_t obj);
+
+/* This can be used for the alloc field to get allocated data that can be freed by free() */
+void *hwloc_alloc_heap(hwloc_topology_t topology, size_t len);
+
+/* This can be used for the alloc field to get allocated data that can be freed by munmap() */
+void *hwloc_alloc_mmap(hwloc_topology_t topology, size_t len);
+
+/* This can be used for the free_membind field to free data using free() */
+int hwloc_free_heap(hwloc_topology_t topology, void *addr, size_t len);
+
+/* This can be used for the free_membind field to free data using munmap() */
+int hwloc_free_mmap(hwloc_topology_t topology, void *addr, size_t len);
+
+/* Allocates unbound memory or fail, depending on whether STRICT is requested
+ * or not */
+static __hwloc_inline void *
+hwloc_alloc_or_fail(hwloc_topology_t topology, size_t len, int flags)
+{
+  if (flags & HWLOC_MEMBIND_STRICT)
+    return NULL;
+  return hwloc_alloc(topology, len);
+}
+
+extern void hwloc_internal_distances_init(hwloc_topology_t topology);
+extern void hwloc_internal_distances_prepare(hwloc_topology_t topology);
+extern void hwloc_internal_distances_destroy(hwloc_topology_t topology);
+extern int hwloc_internal_distances_dup(hwloc_topology_t new, hwloc_topology_t old);
+extern void hwloc_internal_distances_refresh(hwloc_topology_t topology);
+extern int hwloc_internal_distances_add(hwloc_topology_t topology, unsigned nbobjs, hwloc_obj_t *objs, uint64_t *values, unsigned long kind, unsigned long flags);
+extern int hwloc_internal_distances_add_by_index(hwloc_topology_t topology, hwloc_obj_type_t type, unsigned nbobjs, uint64_t *indexes, uint64_t *values, unsigned long kind, unsigned long flags);
+extern void hwloc_internal_distances_invalidate_cached_objs(hwloc_topology_t topology);
+
+/* encode src buffer into target buffer.
+ * targsize must be at least 4*((srclength+2)/3)+1.
+ * target will be 0-terminated.
+ */
+extern int hwloc_encode_to_base64(const char *src, size_t srclength, char *target, size_t targsize);
+/* decode src buffer into target buffer.
+ * src is 0-terminated.
+ * targsize must be at least srclength*3/4+1 (srclength not including \0)
+ * but only srclength*3/4 characters will be meaningful
+ * (the next one may be partially written during decoding, but it should be ignored).
+ */
+extern int hwloc_decode_from_base64(char const *src, char *target, size_t targsize);
+
+/* Check whether needle matches the beginning of haystack, at least n, and up
+ * to a colon or \0 */
+extern int hwloc_namecoloncmp(const char *haystack, const char *needle, size_t n);
+
+/* On some systems, snprintf returns the size of written data, not the actually
+ * required size.  hwloc_snprintf always reports the actually required size. */
+extern int hwloc_snprintf(char *str, size_t size, const char *format, ...) __hwloc_attribute_format(printf, 3, 4);
+
+/* Return the name of the currently running program, if supported.
+ * If not NULL, must be freed by the caller.
+ */
+extern char * hwloc_progname(struct hwloc_topology *topology);
+
+/* obj->attr->group.kind internal values.
+ * the core will keep the smallest ones when merging two groups,
+ * that's why user-given kinds are first.
+ */
+/* first, user-given groups, should remain as long as possible */
+#define HWLOC_GROUP_KIND_USER				0	/* user-given, user may use subkind too */
+#define HWLOC_GROUP_KIND_SYNTHETIC			10	/* subkind is group depth within synthetic description */
+/* then, hardware-specific groups */
+#define HWLOC_GROUP_KIND_INTEL_KNL_SUBNUMA_CLUSTER	100	/* no subkind */
+#define HWLOC_GROUP_KIND_INTEL_EXTTOPOENUM_UNKNOWN	101	/* subkind is unknown level */
+#define HWLOC_GROUP_KIND_INTEL_MODULE			102	/* no subkind */
+#define HWLOC_GROUP_KIND_INTEL_TILE			103	/* no subkind */
+#define HWLOC_GROUP_KIND_INTEL_DIE			104	/* no subkind */
+#define HWLOC_GROUP_KIND_S390_BOOK			110	/* no subkind */
+#define HWLOC_GROUP_KIND_AMD_COMPUTE_UNIT		120	/* no subkind */
+/* then, OS-specific groups */
+#define HWLOC_GROUP_KIND_SOLARIS_PG_HW_PERF		200	/* subkind is group width */
+#define HWLOC_GROUP_KIND_AIX_SDL_UNKNOWN		210	/* subkind is SDL level */
+#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP	220	/* no subkind */
+#define HWLOC_GROUP_KIND_WINDOWS_RELATIONSHIP_UNKNOWN	221	/* no subkind */
+/* distance groups */
+#define HWLOC_GROUP_KIND_DISTANCE			900	/* subkind is round of adding these groups during distance based grouping */
+/* finally, hwloc-specific groups required to insert something else, should disappear as soon as possible */
+#define HWLOC_GROUP_KIND_IO				1000	/* no subkind */
+#define HWLOC_GROUP_KIND_MEMORY				1001	/* no subkind */
+
+/* memory allocator for topology objects */
+struct hwloc_tma {
+  void * (*malloc)(struct hwloc_tma *, size_t);
+  void *data;
+  int dontfree; /* when set, free() or realloc() cannot be used, and tma->malloc() cannot fail */
+};
+
+static __hwloc_inline void *
+hwloc_tma_malloc(struct hwloc_tma *tma,
+		 size_t size)
+{
+  if (tma) {
+    return tma->malloc(tma, size);
+  } else {
+    return malloc(size);
+  }
+}
+
+static __hwloc_inline void *
+hwloc_tma_calloc(struct hwloc_tma *tma,
+		 size_t size)
+{
+  char *ptr = hwloc_tma_malloc(tma, size);
+  if (ptr)
+    memset(ptr, 0, size);
+  return ptr;
+}
+
+static __hwloc_inline char *
+hwloc_tma_strdup(struct hwloc_tma *tma,
+		 const char *src)
+{
+  size_t len = strlen(src);
+  char *ptr = hwloc_tma_malloc(tma, len+1);
+  if (ptr)
+    memcpy(ptr, src, len+1);
+  return ptr;
+}
+
+/* bitmap allocator to be used inside hwloc */
+extern hwloc_bitmap_t hwloc_bitmap_tma_dup(struct hwloc_tma *tma, hwloc_const_bitmap_t old);
+
+extern int hwloc__topology_dup(hwloc_topology_t *newp, hwloc_topology_t old, struct hwloc_tma *tma);
+extern void hwloc__topology_disadopt(hwloc_topology_t  topology);
+
+#endif /* HWLOC_PRIVATE_H */
diff --git a/src/3rdparty/hwloc/include/private/solaris-chiptype.h b/src/3rdparty/hwloc/include/private/solaris-chiptype.h
new file mode 100644
index 000000000..4ad2130a0
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/solaris-chiptype.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2009-2010 Oracle and/or its affiliates.  All rights reserved.
+ *
+ * Copyright © 2017 Inria.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H
+#define HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H
+
+struct hwloc_solaris_chip_info_s {
+  char *model;
+  char *type;
+  /* L1i, L1d, L2, L3 */
+#define HWLOC_SOLARIS_CHIP_INFO_L1I 0
+#define HWLOC_SOLARIS_CHIP_INFO_L1D 1
+#define HWLOC_SOLARIS_CHIP_INFO_L2I 2
+#define HWLOC_SOLARIS_CHIP_INFO_L2D 3
+#define HWLOC_SOLARIS_CHIP_INFO_L3  4
+  long cache_size[5]; /* cleared to -1 if that cache is not wanted */
+  unsigned cache_linesize[5];
+  unsigned cache_associativity[5];
+  int l2_unified;
+};
+
+/* fills the structure with 0 on error */
+extern void hwloc_solaris_get_chip_info(struct hwloc_solaris_chip_info_s *info);
+
+#endif /* HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H */
diff --git a/src/3rdparty/hwloc/include/private/xml.h b/src/3rdparty/hwloc/include/private/xml.h
new file mode 100644
index 000000000..7c73384d9
--- /dev/null
+++ b/src/3rdparty/hwloc/include/private/xml.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2009-2019 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef PRIVATE_XML_H
+#define PRIVATE_XML_H 1
+
+#include <hwloc.h>
+
+#include <sys/types.h>
+
+HWLOC_DECLSPEC int hwloc__xml_verbose(void);
+
+/**************
+ * XML import *
+ **************/
+
+typedef struct hwloc__xml_import_state_s {
+  struct hwloc__xml_import_state_s *parent;
+
+  /* globals shared because the entire stack of states during import */
+  struct hwloc_xml_backend_data_s *global;
+
+  /* opaque data used to store backend-specific data.
+   * statically allocated to allow stack-allocation by the common code without knowing actual backend needs.
+   */
+  char data[32];
+} * hwloc__xml_import_state_t;
+
+struct hwloc__xml_imported_v1distances_s {
+  unsigned long kind;
+  unsigned nbobjs;
+  float *floats;
+  struct hwloc__xml_imported_v1distances_s *prev, *next;
+};
+
+HWLOC_DECLSPEC int hwloc__xml_import_diff(hwloc__xml_import_state_t state, hwloc_topology_diff_t *firstdiffp);
+
+struct hwloc_xml_backend_data_s {
+  /* xml backend parameters */
+  int (*look_init)(struct hwloc_xml_backend_data_s *bdata, struct hwloc__xml_import_state_s *state);
+  void (*look_done)(struct hwloc_xml_backend_data_s *bdata, int result);
+  void (*backend_exit)(struct hwloc_xml_backend_data_s *bdata);
+  int (*next_attr)(struct hwloc__xml_import_state_s * state, char **namep, char **valuep);
+  int (*find_child)(struct hwloc__xml_import_state_s * state, struct hwloc__xml_import_state_s * childstate, char **tagp);
+  int (*close_tag)(struct hwloc__xml_import_state_s * state); /* look for an explicit closing tag </name> */
+  void (*close_child)(struct hwloc__xml_import_state_s * state);
+  int (*get_content)(struct hwloc__xml_import_state_s * state, char **beginp, size_t expected_length); /* return 0 on empty content (and sets beginp to empty string), 1 on actual content, -1 on error or unexpected content length */
+  void (*close_content)(struct hwloc__xml_import_state_s * state);
+  char * msgprefix;
+  void *data; /* libxml2 doc, or nolibxml buffer */
+  unsigned version_major, version_minor;
+  unsigned nbnumanodes;
+  hwloc_obj_t first_numanode, last_numanode; /* temporary cousin-list for handling v1distances */
+  struct hwloc__xml_imported_v1distances_s *first_v1dist, *last_v1dist;
+  int dont_merge_die_groups;
+};
+
+/**************
+ * XML export *
+ **************/
+
+typedef struct hwloc__xml_export_state_s {
+  struct hwloc__xml_export_state_s *parent;
+
+  void (*new_child)(struct hwloc__xml_export_state_s *parentstate, struct hwloc__xml_export_state_s *state, const char *name);
+  void (*new_prop)(struct hwloc__xml_export_state_s *state, const char *name, const char *value);
+  void (*add_content)(struct hwloc__xml_export_state_s *state, const char *buffer, size_t length);
+  void (*end_object)(struct hwloc__xml_export_state_s *state, const char *name);
+
+  struct hwloc__xml_export_data_s {
+    hwloc_obj_t v1_memory_group; /* if we need to insert intermediate group above memory children when exporting to v1 */
+  } *global;
+
+  /* opaque data used to store backend-specific data.
+   * statically allocated to allow stack-allocation by the common code without knowing actual backend needs.
+   */
+  char data[40];
+} * hwloc__xml_export_state_t;
+
+HWLOC_DECLSPEC void hwloc__xml_export_topology(hwloc__xml_export_state_t parentstate, hwloc_topology_t topology, unsigned long flags);
+
+HWLOC_DECLSPEC void hwloc__xml_export_diff(hwloc__xml_export_state_t parentstate, hwloc_topology_diff_t diff);
+
+/******************
+ * XML components *
+ ******************/
+
+struct hwloc_xml_callbacks {
+  int (*backend_init)(struct hwloc_xml_backend_data_s *bdata, const char *xmlpath, const char *xmlbuffer, int xmlbuflen);
+  int (*export_file)(struct hwloc_topology *topology, struct hwloc__xml_export_data_s *edata, const char *filename, unsigned long flags);
+  int (*export_buffer)(struct hwloc_topology *topology, struct hwloc__xml_export_data_s *edata, char **xmlbuffer, int *buflen, unsigned long flags);
+  void (*free_buffer)(void *xmlbuffer);
+  int (*import_diff)(struct hwloc__xml_import_state_s *state, const char *xmlpath, const char *xmlbuffer, int xmlbuflen, hwloc_topology_diff_t *diff, char **refnamep);
+  int (*export_diff_file)(union hwloc_topology_diff_u *diff, const char *refname, const char *filename);
+  int (*export_diff_buffer)(union hwloc_topology_diff_u *diff, const char *refname, char **xmlbuffer, int *buflen);
+};
+
+struct hwloc_xml_component {
+  struct hwloc_xml_callbacks *nolibxml_callbacks;
+  struct hwloc_xml_callbacks *libxml_callbacks;
+};
+
+HWLOC_DECLSPEC void hwloc_xml_callbacks_register(struct hwloc_xml_component *component);
+HWLOC_DECLSPEC void hwloc_xml_callbacks_reset(void);
+
+#endif /* PRIVATE_XML_H */
diff --git a/src/3rdparty/hwloc/src/base64.c b/src/3rdparty/hwloc/src/base64.c
new file mode 100644
index 000000000..7b3e12101
--- /dev/null
+++ b/src/3rdparty/hwloc/src/base64.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright © 2012-2018 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ *
+ * Modifications after import:
+ * - removed all #if
+ * - updated prototypes
+ * - updated #include
+ */
+
+/* include hwloc's config before anything else
+ * so that extensions and features are properly enabled
+ */
+#include <private/private.h>
+
+/*	$OpenBSD: base64.c,v 1.5 2006/10/21 09:55:03 otto Exp $	*/
+
+/*
+ * Copyright (c) 1996 by Internet Software Consortium.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
+ * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
+ * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+ * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+/*
+ * Portions Copyright (c) 1995 by International Business Machines, Inc.
+ *
+ * International Business Machines, Inc. (hereinafter called IBM) grants
+ * permission under its copyrights to use, copy, modify, and distribute this
+ * Software with or without fee, provided that the above copyright notice and
+ * all paragraphs of this notice appear in all copies, and that the name of IBM
+ * not be used in connection with the marketing of any product incorporating
+ * the Software or modifications thereof, without specific, written prior
+ * permission.
+ *
+ * To the extent it has a right to do so, IBM grants an immunity from suit
+ * under its patents, if any, for the use, sale or manufacture of products to
+ * the extent that such products are used for performing Domain Name System
+ * dynamic updates in TCP/IP networks by means of the Software.  No immunity is
+ * granted for any product per se or for any other function of any product.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", AND IBM DISCLAIMS ALL WARRANTIES,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE.  IN NO EVENT SHALL IBM BE LIABLE FOR ANY SPECIAL,
+ * DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE, EVEN
+ * IF IBM IS APPRISED OF THE POSSIBILITY OF SUCH DAMAGES.
+ */
+
+/* OPENBSD ORIGINAL: lib/libc/net/base64.c */
+
+static const char Base64[] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static const char Pad64 = '=';
+
+/* (From RFC1521 and draft-ietf-dnssec-secext-03.txt)
+   The following encoding technique is taken from RFC 1521 by Borenstein
+   and Freed.  It is reproduced here in a slightly edited form for
+   convenience.
+
+   A 65-character subset of US-ASCII is used, enabling 6 bits to be
+   represented per printable character. (The extra 65th character, "=",
+   is used to signify a special processing function.)
+
+   The encoding process represents 24-bit groups of input bits as output
+   strings of 4 encoded characters. Proceeding from left to right, a
+   24-bit input group is formed by concatenating 3 8-bit input groups.
+   These 24 bits are then treated as 4 concatenated 6-bit groups, each
+   of which is translated into a single digit in the base64 alphabet.
+
+   Each 6-bit group is used as an index into an array of 64 printable
+   characters. The character referenced by the index is placed in the
+   output string.
+
+                         Table 1: The Base64 Alphabet
+
+      Value Encoding  Value Encoding  Value Encoding  Value Encoding
+          0 A            17 R            34 i            51 z
+          1 B            18 S            35 j            52 0
+          2 C            19 T            36 k            53 1
+          3 D            20 U            37 l            54 2
+          4 E            21 V            38 m            55 3
+          5 F            22 W            39 n            56 4
+          6 G            23 X            40 o            57 5
+          7 H            24 Y            41 p            58 6
+          8 I            25 Z            42 q            59 7
+          9 J            26 a            43 r            60 8
+         10 K            27 b            44 s            61 9
+         11 L            28 c            45 t            62 +
+         12 M            29 d            46 u            63 /
+         13 N            30 e            47 v
+         14 O            31 f            48 w         (pad) =
+         15 P            32 g            49 x
+         16 Q            33 h            50 y
+
+   Special processing is performed if fewer than 24 bits are available
+   at the end of the data being encoded.  A full encoding quantum is
+   always completed at the end of a quantity.  When fewer than 24 input
+   bits are available in an input group, zero bits are added (on the
+   right) to form an integral number of 6-bit groups.  Padding at the
+   end of the data is performed using the '=' character.
+
+   Since all base64 input is an integral number of octets, only the
+         -------------------------------------------------
+   following cases can arise:
+
+       (1) the final quantum of encoding input is an integral
+           multiple of 24 bits; here, the final unit of encoded
+	   output will be an integral multiple of 4 characters
+	   with no "=" padding,
+       (2) the final quantum of encoding input is exactly 8 bits;
+           here, the final unit of encoded output will be two
+	   characters followed by two "=" padding characters, or
+       (3) the final quantum of encoding input is exactly 16 bits;
+           here, the final unit of encoded output will be three
+	   characters followed by one "=" padding character.
+   */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+int
+hwloc_encode_to_base64(const char *src, size_t srclength, char *target, size_t targsize)
+{
+	size_t datalength = 0;
+	unsigned char input[3];
+	unsigned char output[4];
+	unsigned int i;
+
+	while (2 < srclength) {
+		input[0] = *src++;
+		input[1] = *src++;
+		input[2] = *src++;
+		srclength -= 3;
+
+		output[0] = input[0] >> 2;
+		output[1] = ((input[0] & 0x03) << 4) + (input[1] >> 4);
+		output[2] = ((input[1] & 0x0f) << 2) + (input[2] >> 6);
+		output[3] = input[2] & 0x3f;
+
+		if (datalength + 4 > targsize)
+			return (-1);
+		target[datalength++] = Base64[output[0]];
+		target[datalength++] = Base64[output[1]];
+		target[datalength++] = Base64[output[2]];
+		target[datalength++] = Base64[output[3]];
+	}
+
+	/* Now we worry about padding. */
+	if (0 != srclength) {
+		/* Get what's left. */
+		input[0] = input[1] = input[2] = '\0';
+		for (i = 0; i < srclength; i++)
+			input[i] = *src++;
+
+		output[0] = input[0] >> 2;
+		output[1] = ((input[0] & 0x03) << 4) + (input[1] >> 4);
+		output[2] = ((input[1] & 0x0f) << 2) + (input[2] >> 6);
+
+		if (datalength + 4 > targsize)
+			return (-1);
+		target[datalength++] = Base64[output[0]];
+		target[datalength++] = Base64[output[1]];
+		if (srclength == 1)
+			target[datalength++] = Pad64;
+		else
+			target[datalength++] = Base64[output[2]];
+		target[datalength++] = Pad64;
+	}
+	if (datalength >= targsize)
+		return (-1);
+	target[datalength] = '\0';	/* Returned value doesn't count \0. */
+	return (int)(datalength);
+}
+
+/* skips all whitespace anywhere.
+   converts characters, four at a time, starting at (or after)
+   src from base - 64 numbers into three 8 bit bytes in the target area.
+   it returns the number of data bytes stored at the target, or -1 on error.
+ */
+
+int
+hwloc_decode_from_base64(char const *src, char *target, size_t targsize)
+{
+	unsigned int tarindex, state;
+	int ch;
+	char *pos;
+
+	state = 0;
+	tarindex = 0;
+
+	while ((ch = *src++) != '\0') {
+		if (isspace(ch))	/* Skip whitespace anywhere. */
+			continue;
+
+		if (ch == Pad64)
+			break;
+
+		pos = strchr(Base64, ch);
+		if (pos == 0) 		/* A non-base64 character. */
+			return (-1);
+
+		switch (state) {
+		case 0:
+			if (target) {
+				if (tarindex >= targsize)
+					return (-1);
+				target[tarindex] = (char)(pos - Base64) << 2;
+			}
+			state = 1;
+			break;
+		case 1:
+			if (target) {
+				if (tarindex + 1 >= targsize)
+					return (-1);
+				target[tarindex]   |=  (pos - Base64) >> 4;
+				target[tarindex+1]  = ((pos - Base64) & 0x0f)
+							<< 4 ;
+			}
+			tarindex++;
+			state = 2;
+			break;
+		case 2:
+			if (target) {
+				if (tarindex + 1 >= targsize)
+					return (-1);
+				target[tarindex]   |=  (pos - Base64) >> 2;
+				target[tarindex+1]  = ((pos - Base64) & 0x03)
+							<< 6;
+			}
+			tarindex++;
+			state = 3;
+			break;
+		case 3:
+			if (target) {
+				if (tarindex >= targsize)
+					return (-1);
+				target[tarindex] |= (pos - Base64);
+			}
+			tarindex++;
+			state = 0;
+			break;
+		}
+	}
+
+	/*
+	 * We are done decoding Base-64 chars.  Let's see if we ended
+	 * on a byte boundary, and/or with erroneous trailing characters.
+	 */
+
+	if (ch == Pad64) {		/* We got a pad char. */
+		ch = *src++;		/* Skip it, get next. */
+		switch (state) {
+		case 0:		/* Invalid = in first position */
+		case 1:		/* Invalid = in second position */
+			return (-1);
+
+		case 2:		/* Valid, means one byte of info */
+			/* Skip any number of spaces. */
+			for (; ch != '\0'; ch = *src++)
+				if (!isspace(ch))
+					break;
+			/* Make sure there is another trailing = sign. */
+			if (ch != Pad64)
+				return (-1);
+			ch = *src++;		/* Skip the = */
+			/* Fall through to "single trailing =" case. */
+			/* FALLTHROUGH */
+
+		case 3:		/* Valid, means two bytes of info */
+			/*
+			 * We know this char is an =.  Is there anything but
+			 * whitespace after it?
+			 */
+			for (; ch != '\0'; ch = *src++)
+				if (!isspace(ch))
+					return (-1);
+
+			/*
+			 * Now make sure for cases 2 and 3 that the "extra"
+			 * bits that slopped past the last full byte were
+			 * zeros.  If we don't check them, they become a
+			 * subliminal channel.
+			 */
+			if (target && target[tarindex] != 0)
+				return (-1);
+		}
+	} else {
+		/*
+		 * We ended by seeing the end of the string.  Make sure we
+		 * have no partial bytes lying around.
+		 */
+		if (state != 0)
+			return (-1);
+	}
+
+	return (tarindex);
+}
diff --git a/src/3rdparty/hwloc/src/bind.c b/src/3rdparty/hwloc/src/bind.c
new file mode 100644
index 000000000..b3457bc76
--- /dev/null
+++ b/src/3rdparty/hwloc/src/bind.c
@@ -0,0 +1,922 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * Copyright © 2011-2015 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <hwloc/helper.h>
+#ifdef HAVE_SYS_MMAN_H
+#  include <sys/mman.h>
+#endif
+/* <malloc.h> is only needed if we don't have posix_memalign() */
+#if defined(hwloc_getpagesize) && !defined(HAVE_POSIX_MEMALIGN) && defined(HAVE_MEMALIGN) && defined(HAVE_MALLOC_H)
+#include <malloc.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <stdlib.h>
+#include <errno.h>
+
+/* TODO: HWLOC_GNU_SYS,
+ *
+ * We could use glibc's sched_setaffinity generically when it is available
+ *
+ * Darwin and OpenBSD don't seem to have binding facilities.
+ */
+
+#define HWLOC_CPUBIND_ALLFLAGS (HWLOC_CPUBIND_PROCESS|HWLOC_CPUBIND_THREAD|HWLOC_CPUBIND_STRICT|HWLOC_CPUBIND_NOMEMBIND)
+
+static hwloc_const_bitmap_t
+hwloc_fix_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set)
+{
+  hwloc_const_bitmap_t topology_set = hwloc_topology_get_topology_cpuset(topology);
+  hwloc_const_bitmap_t complete_set = hwloc_topology_get_complete_cpuset(topology);
+
+  if (hwloc_bitmap_iszero(set)) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  if (!hwloc_bitmap_isincluded(set, complete_set)) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  if (hwloc_bitmap_isincluded(topology_set, set))
+    set = complete_set;
+
+  return set;
+}
+
+int
+hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set, int flags)
+{
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  set = hwloc_fix_cpubind(topology, set);
+  if (!set)
+    return -1;
+
+  if (flags & HWLOC_CPUBIND_PROCESS) {
+    if (topology->binding_hooks.set_thisproc_cpubind)
+      return topology->binding_hooks.set_thisproc_cpubind(topology, set, flags);
+  } else if (flags & HWLOC_CPUBIND_THREAD) {
+    if (topology->binding_hooks.set_thisthread_cpubind)
+      return topology->binding_hooks.set_thisthread_cpubind(topology, set, flags);
+  } else {
+    if (topology->binding_hooks.set_thisproc_cpubind) {
+      int err = topology->binding_hooks.set_thisproc_cpubind(topology, set, flags);
+      if (err >= 0 || errno != ENOSYS)
+        return err;
+      /* ENOSYS, fallback */
+    }
+    if (topology->binding_hooks.set_thisthread_cpubind)
+      return topology->binding_hooks.set_thisthread_cpubind(topology, set, flags);
+  }
+
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_get_cpubind(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
+{
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (flags & HWLOC_CPUBIND_PROCESS) {
+    if (topology->binding_hooks.get_thisproc_cpubind)
+      return topology->binding_hooks.get_thisproc_cpubind(topology, set, flags);
+  } else if (flags & HWLOC_CPUBIND_THREAD) {
+    if (topology->binding_hooks.get_thisthread_cpubind)
+      return topology->binding_hooks.get_thisthread_cpubind(topology, set, flags);
+  } else {
+    if (topology->binding_hooks.get_thisproc_cpubind) {
+      int err = topology->binding_hooks.get_thisproc_cpubind(topology, set, flags);
+      if (err >= 0 || errno != ENOSYS)
+        return err;
+      /* ENOSYS, fallback */
+    }
+    if (topology->binding_hooks.get_thisthread_cpubind)
+      return topology->binding_hooks.get_thisthread_cpubind(topology, set, flags);
+  }
+
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, int flags)
+{
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  set = hwloc_fix_cpubind(topology, set);
+  if (!set)
+    return -1;
+
+  if (topology->binding_hooks.set_proc_cpubind)
+    return topology->binding_hooks.set_proc_cpubind(topology, pid, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, int flags)
+{
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (topology->binding_hooks.get_proc_cpubind)
+    return topology->binding_hooks.get_proc_cpubind(topology, pid, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+#ifdef hwloc_thread_t
+int
+hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_const_bitmap_t set, int flags)
+{
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  set = hwloc_fix_cpubind(topology, set);
+  if (!set)
+    return -1;
+
+  if (topology->binding_hooks.set_thread_cpubind)
+    return topology->binding_hooks.set_thread_cpubind(topology, tid, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_bitmap_t set, int flags)
+{
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (topology->binding_hooks.get_thread_cpubind)
+    return topology->binding_hooks.get_thread_cpubind(topology, tid, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+#endif
+
+int
+hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
+{
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (flags & HWLOC_CPUBIND_PROCESS) {
+    if (topology->binding_hooks.get_thisproc_last_cpu_location)
+      return topology->binding_hooks.get_thisproc_last_cpu_location(topology, set, flags);
+  } else if (flags & HWLOC_CPUBIND_THREAD) {
+    if (topology->binding_hooks.get_thisthread_last_cpu_location)
+      return topology->binding_hooks.get_thisthread_last_cpu_location(topology, set, flags);
+  } else {
+    if (topology->binding_hooks.get_thisproc_last_cpu_location) {
+      int err = topology->binding_hooks.get_thisproc_last_cpu_location(topology, set, flags);
+      if (err >= 0 || errno != ENOSYS)
+        return err;
+      /* ENOSYS, fallback */
+    }
+    if (topology->binding_hooks.get_thisthread_last_cpu_location)
+      return topology->binding_hooks.get_thisthread_last_cpu_location(topology, set, flags);
+  }
+
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, int flags)
+{
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (topology->binding_hooks.get_proc_last_cpu_location)
+    return topology->binding_hooks.get_proc_last_cpu_location(topology, pid, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+#define HWLOC_MEMBIND_ALLFLAGS (HWLOC_MEMBIND_PROCESS|HWLOC_MEMBIND_THREAD|HWLOC_MEMBIND_STRICT|HWLOC_MEMBIND_MIGRATE|HWLOC_MEMBIND_NOCPUBIND|HWLOC_MEMBIND_BYNODESET)
+
+static hwloc_const_nodeset_t
+hwloc_fix_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset)
+{
+  hwloc_const_bitmap_t topology_nodeset = hwloc_topology_get_topology_nodeset(topology);
+  hwloc_const_bitmap_t complete_nodeset = hwloc_topology_get_complete_nodeset(topology);
+
+  if (hwloc_bitmap_iszero(nodeset)) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  if (!hwloc_bitmap_isincluded(nodeset, complete_nodeset)) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  if (hwloc_bitmap_isincluded(topology_nodeset, nodeset))
+    return complete_nodeset;
+
+  return nodeset;
+}
+
+static int
+hwloc_fix_membind_cpuset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_const_cpuset_t cpuset)
+{
+  hwloc_const_bitmap_t topology_set = hwloc_topology_get_topology_cpuset(topology);
+  hwloc_const_bitmap_t complete_set = hwloc_topology_get_complete_cpuset(topology);
+  hwloc_const_bitmap_t complete_nodeset = hwloc_topology_get_complete_nodeset(topology);
+
+  if (hwloc_bitmap_iszero(cpuset)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (!hwloc_bitmap_isincluded(cpuset, complete_set)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (hwloc_bitmap_isincluded(topology_set, cpuset)) {
+    hwloc_bitmap_copy(nodeset, complete_nodeset);
+    return 0;
+  }
+
+  hwloc_cpuset_to_nodeset(topology, cpuset, nodeset);
+  return 0;
+}
+
+static __hwloc_inline int hwloc__check_membind_policy(hwloc_membind_policy_t policy)
+{
+  if (policy == HWLOC_MEMBIND_DEFAULT
+      || policy == HWLOC_MEMBIND_FIRSTTOUCH
+      || policy == HWLOC_MEMBIND_BIND
+      || policy == HWLOC_MEMBIND_INTERLEAVE
+      || policy == HWLOC_MEMBIND_NEXTTOUCH)
+    return 0;
+  return -1;
+}
+
+static int
+hwloc_set_membind_by_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  if ((flags & ~HWLOC_MEMBIND_ALLFLAGS) || hwloc__check_membind_policy(policy) < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    return -1;
+
+  if (flags & HWLOC_MEMBIND_PROCESS) {
+    if (topology->binding_hooks.set_thisproc_membind)
+      return topology->binding_hooks.set_thisproc_membind(topology, nodeset, policy, flags);
+  } else if (flags & HWLOC_MEMBIND_THREAD) {
+    if (topology->binding_hooks.set_thisthread_membind)
+      return topology->binding_hooks.set_thisthread_membind(topology, nodeset, policy, flags);
+  } else {
+    if (topology->binding_hooks.set_thisproc_membind) {
+      int err = topology->binding_hooks.set_thisproc_membind(topology, nodeset, policy, flags);
+      if (err >= 0 || errno != ENOSYS)
+        return err;
+      /* ENOSYS, fallback */
+    }
+    if (topology->binding_hooks.set_thisthread_membind)
+      return topology->binding_hooks.set_thisthread_membind(topology, nodeset, policy, flags);
+  }
+
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags)
+{
+  int ret;
+
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_set_membind_by_nodeset(topology, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+      ret = -1;
+    else
+      ret = hwloc_set_membind_by_nodeset(topology, nodeset, policy, flags);
+    hwloc_bitmap_free(nodeset);
+  }
+  return ret;
+}
+
+static int
+hwloc_get_membind_by_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  if (flags & ~HWLOC_MEMBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (flags & HWLOC_MEMBIND_PROCESS) {
+    if (topology->binding_hooks.get_thisproc_membind)
+      return topology->binding_hooks.get_thisproc_membind(topology, nodeset, policy, flags);
+  } else if (flags & HWLOC_MEMBIND_THREAD) {
+    if (topology->binding_hooks.get_thisthread_membind)
+      return topology->binding_hooks.get_thisthread_membind(topology, nodeset, policy, flags);
+  } else {
+    if (topology->binding_hooks.get_thisproc_membind) {
+      int err = topology->binding_hooks.get_thisproc_membind(topology, nodeset, policy, flags);
+      if (err >= 0 || errno != ENOSYS)
+        return err;
+      /* ENOSYS, fallback */
+    }
+    if (topology->binding_hooks.get_thisthread_membind)
+      return topology->binding_hooks.get_thisthread_membind(topology, nodeset, policy, flags);
+  }
+
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_get_membind(hwloc_topology_t topology, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags)
+{
+  int ret;
+
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_get_membind_by_nodeset(topology, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    ret = hwloc_get_membind_by_nodeset(topology, nodeset, policy, flags);
+    if (!ret)
+      hwloc_cpuset_from_nodeset(topology, set, nodeset);
+    hwloc_bitmap_free(nodeset);
+  }
+
+  return ret;
+}
+
+static int
+hwloc_set_proc_membind_by_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  if ((flags & ~HWLOC_MEMBIND_ALLFLAGS) || hwloc__check_membind_policy(policy) < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    return -1;
+
+  if (topology->binding_hooks.set_proc_membind)
+    return topology->binding_hooks.set_proc_membind(topology, pid, nodeset, policy, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+
+int
+hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags)
+{
+  int ret;
+
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_set_proc_membind_by_nodeset(topology, pid, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+      ret = -1;
+    else
+      ret = hwloc_set_proc_membind_by_nodeset(topology, pid, nodeset, policy, flags);
+    hwloc_bitmap_free(nodeset);
+  }
+
+  return ret;
+}
+
+static int
+hwloc_get_proc_membind_by_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  if (flags & ~HWLOC_MEMBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (topology->binding_hooks.get_proc_membind)
+    return topology->binding_hooks.get_proc_membind(topology, pid, nodeset, policy, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags)
+{
+  int ret;
+
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_get_proc_membind_by_nodeset(topology, pid, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    ret = hwloc_get_proc_membind_by_nodeset(topology, pid, nodeset, policy, flags);
+    if (!ret)
+      hwloc_cpuset_from_nodeset(topology, set, nodeset);
+    hwloc_bitmap_free(nodeset);
+  }
+
+  return ret;
+}
+
+static int
+hwloc_set_area_membind_by_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  if ((flags & ~HWLOC_MEMBIND_ALLFLAGS) || hwloc__check_membind_policy(policy) < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (!len)
+    /* nothing to do */
+    return 0;
+
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    return -1;
+
+  if (topology->binding_hooks.set_area_membind)
+    return topology->binding_hooks.set_area_membind(topology, addr, len, nodeset, policy, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags)
+{
+  int ret;
+
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_set_area_membind_by_nodeset(topology, addr, len, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+      ret = -1;
+    else
+      ret = hwloc_set_area_membind_by_nodeset(topology, addr, len, nodeset, policy, flags);
+    hwloc_bitmap_free(nodeset);
+  }
+
+  return ret;
+}
+
+static int
+hwloc_get_area_membind_by_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  if (flags & ~HWLOC_MEMBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (!len) {
+    /* nothing to query */
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (topology->binding_hooks.get_area_membind)
+    return topology->binding_hooks.get_area_membind(topology, addr, len, nodeset, policy, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/* Query the binding of [addr, addr+len).  With HWLOC_MEMBIND_BYNODESET
+ * the result is stored directly as a nodeset; otherwise it is converted
+ * into a cpuset.  Returns 0 on success, -1 with errno set on error.
+ */
+int
+hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags)
+{
+  int ret;
+
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_get_area_membind_by_nodeset(topology, addr, len, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (!nodeset) {
+      /* out of memory while building the conversion nodeset */
+      errno = ENOMEM;
+      return -1;
+    }
+    ret = hwloc_get_area_membind_by_nodeset(topology, addr, len, nodeset, policy, flags);
+    if (!ret)
+      hwloc_cpuset_from_nodeset(topology, set, nodeset);
+    hwloc_bitmap_free(nodeset);
+  }
+
+  return ret;
+}
+
+/* Core of hwloc_get_area_memlocation(): ask the OS-specific hook on which
+ * NUMA nodes the pages of [addr, addr+len) are currently allocated.
+ */
+static int
+hwloc_get_area_memlocation_by_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags)
+{
+  /* reject unknown flag bits */
+  if (flags & ~HWLOC_MEMBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (!len)
+    /* nothing to do */
+    return 0;
+
+  if (topology->binding_hooks.get_area_memlocation)
+    return topology->binding_hooks.get_area_memlocation(topology, addr, len, nodeset, flags);
+
+  /* no OS backend provides this feature */
+  errno = ENOSYS;
+  return -1;
+}
+
+/* Report where the pages of [addr, addr+len) are physically allocated.
+ * With HWLOC_MEMBIND_BYNODESET the result is stored as a nodeset;
+ * otherwise it is converted into a cpuset.
+ * Returns 0 on success, -1 with errno set on error.
+ */
+int
+hwloc_get_area_memlocation(hwloc_topology_t topology, const void *addr, size_t len, hwloc_cpuset_t set, int flags)
+{
+  int ret;
+
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_get_area_memlocation_by_nodeset(topology, addr, len, set, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (!nodeset) {
+      /* out of memory while building the conversion nodeset */
+      errno = ENOMEM;
+      return -1;
+    }
+    ret = hwloc_get_area_memlocation_by_nodeset(topology, addr, len, nodeset, flags);
+    if (!ret)
+      hwloc_cpuset_from_nodeset(topology, set, nodeset);
+    hwloc_bitmap_free(nodeset);
+  }
+
+  return ret;
+}
+
+/* Allocate len bytes from the heap, page-aligned when the page size and
+ * an aligned allocator are available, so that the buffer can later be
+ * bound at page granularity.  Returns NULL on failure.
+ */
+void *
+hwloc_alloc_heap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
+{
+  void *p = NULL;
+#if defined(hwloc_getpagesize) && defined(HAVE_POSIX_MEMALIGN)
+  /* posix_memalign() returns the error code instead of setting errno */
+  errno = posix_memalign(&p, hwloc_getpagesize(), len);
+  if (errno)
+    p = NULL;
+#elif defined(hwloc_getpagesize) && defined(HAVE_MEMALIGN)
+  p = memalign(hwloc_getpagesize(), len);
+#else
+  /* fallback: plain, possibly unaligned, malloc */
+  p = malloc(len);
+#endif
+  return p;
+}
+
+#ifdef MAP_ANONYMOUS
+/* Allocate len bytes as an anonymous private mapping (page-aligned by
+ * construction); returns NULL instead of MAP_FAILED on error.
+ */
+void *
+hwloc_alloc_mmap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
+{
+  void * buffer = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  return buffer == MAP_FAILED ? NULL : buffer;
+}
+#endif
+
+/* Release a buffer obtained from hwloc_alloc_heap().  Always succeeds. */
+int
+hwloc_free_heap(hwloc_topology_t topology __hwloc_attribute_unused, void *addr, size_t len __hwloc_attribute_unused)
+{
+  free(addr);
+  return 0;
+}
+
+#ifdef MAP_ANONYMOUS
+/* Release a buffer obtained from hwloc_alloc_mmap(); NULL is a no-op.
+ * Returns munmap()'s result (0 on success, -1 on error).
+ */
+int
+hwloc_free_mmap(hwloc_topology_t topology __hwloc_attribute_unused, void *addr, size_t len)
+{
+  if (!addr)
+    return 0;
+  return munmap(addr, len);
+}
+#endif
+
+/* Allocate len bytes, using the OS-specific allocation hook when one is
+ * installed, otherwise plain heap allocation.
+ */
+void *
+hwloc_alloc(hwloc_topology_t topology, size_t len)
+{
+  if (topology->binding_hooks.alloc)
+    return topology->binding_hooks.alloc(topology, len);
+  return hwloc_alloc_heap(topology, len);
+}
+
+/* Core of hwloc_alloc_membind(): allocate len bytes bound to nodeset
+ * according to policy.  Prefers the backend's combined alloc_membind
+ * hook; otherwise allocates and then binds with set_area_membind.
+ * Without HWLOC_MEMBIND_STRICT, failure to bind falls back to a plain
+ * unbound allocation instead of returning an error.
+ */
+static void *
+hwloc_alloc_membind_by_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  void *p;
+
+  if ((flags & ~HWLOC_MEMBIND_ALLFLAGS) || hwloc__check_membind_policy(policy) < 0) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    goto fallback;
+  /* MIGRATE makes no sense for a freshly allocated buffer */
+  if (flags & HWLOC_MEMBIND_MIGRATE) {
+    errno = EINVAL;
+    goto fallback;
+  }
+
+  if (topology->binding_hooks.alloc_membind)
+    /* backend can allocate already-bound memory in a single step */
+    return topology->binding_hooks.alloc_membind(topology, len, nodeset, policy, flags);
+  else if (topology->binding_hooks.set_area_membind) {
+    /* allocate first, then bind the area afterwards */
+    p = hwloc_alloc(topology, len);
+    if (!p)
+      return NULL;
+    if (topology->binding_hooks.set_area_membind(topology, p, len, nodeset, policy, flags) && flags & HWLOC_MEMBIND_STRICT) {
+      /* binding failed and the caller insisted on it: undo the allocation,
+       * preserving the binding error for the caller */
+      int error = errno;
+      free(p);
+      errno = error;
+      return NULL;
+    }
+    return p;
+  } else {
+    errno = ENOSYS;
+  }
+
+fallback:
+  if (flags & HWLOC_MEMBIND_STRICT)
+    /* Report error */
+    return NULL;
+  /* Never mind, allocate anyway */
+  return hwloc_alloc(topology, len);
+}
+
+/* Allocate len bytes bound according to policy.  `set` is a nodeset when
+ * HWLOC_MEMBIND_BYNODESET is given, otherwise a cpuset converted into a
+ * nodeset first.  Without HWLOC_MEMBIND_STRICT, binding failures degrade
+ * to a plain unbound allocation.  Returns NULL on error.
+ */
+void *
+hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags)
+{
+  void *ret;
+
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_alloc_membind_by_nodeset(topology, len, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    /* if the nodeset cannot be allocated or converted, honor STRICT by
+     * failing, otherwise fall back to an unbound allocation */
+    if (!nodeset || hwloc_fix_membind_cpuset(topology, nodeset, set)) {
+      if (flags & HWLOC_MEMBIND_STRICT)
+	ret = NULL;
+      else
+	ret = hwloc_alloc(topology, len);
+    } else
+      ret = hwloc_alloc_membind_by_nodeset(topology, len, nodeset, policy, flags);
+    hwloc_bitmap_free(nodeset);
+  }
+
+  return ret;
+}
+
+/* Free a buffer obtained from hwloc_alloc() or hwloc_alloc_membind(),
+ * using the matching backend hook when one is installed.
+ */
+int
+hwloc_free(hwloc_topology_t topology, void *addr, size_t len)
+{
+  if (topology->binding_hooks.free_membind)
+    return topology->binding_hooks.free_membind(topology, addr, len);
+  return hwloc_free_heap(topology, addr, len);
+}
+
+/*
+ * Empty binding hooks always returning success
+ */
+
+/* dummy "get" hooks report the whole machine: copy the topology's
+ * complete cpuset into the caller's set */
+static int dontset_return_complete_cpuset(hwloc_topology_t topology, hwloc_cpuset_t set)
+{
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+  return 0;
+}
+
+/* dummy "set" hooks succeed without binding anything */
+static int dontset_thisthread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+static int dontget_thisthread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, set);
+}
+static int dontset_thisproc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+static int dontget_thisproc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, set);
+}
+static int dontset_proc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+static int dontget_proc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_bitmap_t cpuset, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, cpuset);
+}
+/* thread-level dummies only exist when the platform defines a thread id type */
+#ifdef hwloc_thread_t
+static int dontset_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+static int dontget_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid __hwloc_attribute_unused, hwloc_bitmap_t cpuset, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, cpuset);
+}
+#endif
+
+/* dummy membind "get" hooks report the whole machine and a MIXED policy
+ * (no single policy can describe an unknown binding) */
+static int dontset_return_complete_nodeset(hwloc_topology_t topology, hwloc_nodeset_t set, hwloc_membind_policy_t *policy)
+{
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_nodeset(topology));
+  *policy = HWLOC_MEMBIND_MIXED;
+  return 0;
+}
+
+static int dontset_thisproc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+static int dontget_thisproc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+static int dontset_thisthread_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+static int dontget_thisthread_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+static int dontset_proc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+static int dontget_proc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+static int dontset_area_membind(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+static int dontget_area_membind(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy);
+}
+static int dontget_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_bitmap_t set, int flags __hwloc_attribute_unused)
+{
+  /* the reported policy is discarded, only the nodeset matters here */
+  hwloc_membind_policy_t policy;
+  return dontset_return_complete_nodeset(topology, set, &policy);
+}
+
+/* dummy allocator: plain malloc/free, no binding at all */
+static void * dontalloc_membind(hwloc_topology_t topology __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return malloc(size);
+}
+static int dontfree_membind(hwloc_topology_t topology __hwloc_attribute_unused, void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused)
+{
+  free(addr);
+  return 0;
+}
+
+/* Install the dummy hooks above into `hooks`.  Used when the topology
+ * does not describe the calling system, so every binding request
+ * succeeds without doing (or reporting) anything real.
+ */
+static void hwloc_set_dummy_hooks(struct hwloc_binding_hooks *hooks,
+				  struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+  hooks->set_thisproc_cpubind = dontset_thisproc_cpubind;
+  hooks->get_thisproc_cpubind = dontget_thisproc_cpubind;
+  hooks->set_thisthread_cpubind = dontset_thisthread_cpubind;
+  hooks->get_thisthread_cpubind = dontget_thisthread_cpubind;
+  hooks->set_proc_cpubind = dontset_proc_cpubind;
+  hooks->get_proc_cpubind = dontget_proc_cpubind;
+#ifdef hwloc_thread_t
+  hooks->set_thread_cpubind = dontset_thread_cpubind;
+  hooks->get_thread_cpubind = dontget_thread_cpubind;
+#endif
+  hooks->get_thisproc_last_cpu_location = dontget_thisproc_cpubind; /* cpubind instead of last_cpu_location is ok */
+  hooks->get_thisthread_last_cpu_location = dontget_thisthread_cpubind; /* cpubind instead of last_cpu_location is ok */
+  hooks->get_proc_last_cpu_location = dontget_proc_cpubind; /* cpubind instead of last_cpu_location is ok */
+  /* TODO: get_thread_last_cpu_location */
+  hooks->set_thisproc_membind = dontset_thisproc_membind;
+  hooks->get_thisproc_membind = dontget_thisproc_membind;
+  hooks->set_thisthread_membind = dontset_thisthread_membind;
+  hooks->get_thisthread_membind = dontget_thisthread_membind;
+  hooks->set_proc_membind = dontset_proc_membind;
+  hooks->get_proc_membind = dontget_proc_membind;
+  hooks->set_area_membind = dontset_area_membind;
+  hooks->get_area_membind = dontget_area_membind;
+  hooks->get_area_memlocation = dontget_area_memlocation;
+  hooks->alloc_membind = dontalloc_membind;
+  hooks->free_membind = dontfree_membind;
+}
+
+/* Install the OS-native binding hooks for whichever backend(s) this
+ * library was configured with; each hwloc_set_*_hooks() fills in the
+ * hooks its OS actually supports and updates `support` accordingly.
+ */
+void
+hwloc_set_native_binding_hooks(struct hwloc_binding_hooks *hooks, struct hwloc_topology_support *support)
+{
+#    ifdef HWLOC_LINUX_SYS
+    hwloc_set_linuxfs_hooks(hooks, support);
+#    endif /* HWLOC_LINUX_SYS */
+
+#    ifdef HWLOC_BGQ_SYS
+    hwloc_set_bgq_hooks(hooks, support);
+#    endif /* HWLOC_BGQ_SYS */
+
+#    ifdef HWLOC_AIX_SYS
+    hwloc_set_aix_hooks(hooks, support);
+#    endif /* HWLOC_AIX_SYS */
+
+#    ifdef HWLOC_SOLARIS_SYS
+    hwloc_set_solaris_hooks(hooks, support);
+#    endif /* HWLOC_SOLARIS_SYS */
+
+#    ifdef HWLOC_WIN_SYS
+    hwloc_set_windows_hooks(hooks, support);
+#    endif /* HWLOC_WIN_SYS */
+
+#    ifdef HWLOC_DARWIN_SYS
+    hwloc_set_darwin_hooks(hooks, support);
+#    endif /* HWLOC_DARWIN_SYS */
+
+#    ifdef HWLOC_FREEBSD_SYS
+    hwloc_set_freebsd_hooks(hooks, support);
+#    endif /* HWLOC_FREEBSD_SYS */
+
+#    ifdef HWLOC_NETBSD_SYS
+    hwloc_set_netbsd_hooks(hooks, support);
+#    endif /* HWLOC_NETBSD_SYS */
+
+#    ifdef HWLOC_HPUX_SYS
+    hwloc_set_hpux_hooks(hooks, support);
+#    endif /* HWLOC_HPUX_SYS */
+}
+
+/* If the represented system is actually not this system, use dummy binding hooks. */
+void
+hwloc_set_binding_hooks(struct hwloc_topology *topology)
+{
+  if (topology->is_thissystem) {
+    hwloc_set_native_binding_hooks(&topology->binding_hooks, &topology->support);
+    /* every hook not set above will return ENOSYS */
+  } else {
+    /* not this system, use dummy binding hooks that do nothing (but don't return ENOSYS) */
+    hwloc_set_dummy_hooks(&topology->binding_hooks, &topology->support);
+  }
+
+  /* if not is_thissystem, set_cpubind is fake
+   * and get_cpubind returns the whole system cpuset,
+   * so don't report that set/get_cpubind as supported
+   */
+  if (topology->is_thissystem) {
+    /* for each hook the native backend actually installed,
+     * advertise the corresponding support bit */
+#define DO(which,kind) \
+    if (topology->binding_hooks.kind) \
+      topology->support.which##bind->kind = 1;
+    DO(cpu,set_thisproc_cpubind);
+    DO(cpu,get_thisproc_cpubind);
+    DO(cpu,set_proc_cpubind);
+    DO(cpu,get_proc_cpubind);
+    DO(cpu,set_thisthread_cpubind);
+    DO(cpu,get_thisthread_cpubind);
+#ifdef hwloc_thread_t
+    DO(cpu,set_thread_cpubind);
+    DO(cpu,get_thread_cpubind);
+#endif
+    DO(cpu,get_thisproc_last_cpu_location);
+    DO(cpu,get_proc_last_cpu_location);
+    DO(cpu,get_thisthread_last_cpu_location);
+    DO(mem,set_thisproc_membind);
+    DO(mem,get_thisproc_membind);
+    DO(mem,set_thisthread_membind);
+    DO(mem,get_thisthread_membind);
+    DO(mem,set_proc_membind);
+    DO(mem,get_proc_membind);
+    DO(mem,set_area_membind);
+    DO(mem,get_area_membind);
+    DO(mem,get_area_memlocation);
+    DO(mem,alloc_membind);
+  }
+}
diff --git a/src/3rdparty/hwloc/src/bitmap.c b/src/3rdparty/hwloc/src/bitmap.c
new file mode 100644
index 000000000..ea1264afc
--- /dev/null
+++ b/src/3rdparty/hwloc/src/bitmap.c
@@ -0,0 +1,1676 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2017 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc.h>
+#include <private/misc.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <hwloc/bitmap.h>
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+#include <ctype.h>
+
+/*
+ * possible improvements:
+ * - have a way to change the initial allocation size:
+ *   add hwloc_bitmap_set_foo() to change a global here,
+ *   and make the hwloc core call based on the early number of PUs
+ * - make HWLOC_BITMAP_PREALLOC_BITS configurable, and detectable
+ *   by parsing /proc/cpuinfo during configure on Linux.
+ * - preallocate inside the bitmap structure (so that the whole structure is a cacheline for instance)
+ *   and allocate a dedicated array only later when reallocating larger
+ * - add a bitmap->ulongs_empty_first which guarantees that some first ulongs are empty,
+ *   making tests much faster for big bitmaps since there's no need to look at first ulongs.
+ *   no need for ulongs_empty_first to be exactly the max number of empty ulongs,
+ *   clearing bits that were set earlier isn't very common.
+ */
+
+/* magic number */
+#define HWLOC_BITMAP_MAGIC 0x20091007
+
+/* preallocated bits in every bitmap */
+#define HWLOC_BITMAP_PREALLOC_BITS 512
+#define HWLOC_BITMAP_PREALLOC_ULONGS (HWLOC_BITMAP_PREALLOC_BITS/HWLOC_BITS_PER_LONG)
+
+/* actual opaque type internals */
+struct hwloc_bitmap_s {
+  unsigned ulongs_count; /* how many ulong bitmasks are valid, >= 1 */
+  unsigned ulongs_allocated; /* how many ulong bitmasks are allocated, >= ulongs_count */
+  unsigned long *ulongs;
+  int infinite; /* set to 1 if all bits beyond ulongs are set */
+#ifdef HWLOC_DEBUG
+  int magic; /* set to HWLOC_BITMAP_MAGIC while alive, cleared on free */
+#endif
+};
+
+/* overzealous check in debug-mode, not as powerful as valgrind but still useful */
+#ifdef HWLOC_DEBUG
+#define HWLOC__BITMAP_CHECK(set) do {				\
+  assert((set)->magic == HWLOC_BITMAP_MAGIC);			\
+  assert((set)->ulongs_count >= 1);				\
+  assert((set)->ulongs_allocated >= (set)->ulongs_count);	\
+} while (0)
+#else
+#define HWLOC__BITMAP_CHECK(set)
+#endif
+
+/* extract a subset from a set using an index or a cpu */
+#define HWLOC_SUBBITMAP_INDEX(cpu)		((cpu)/(HWLOC_BITS_PER_LONG))
+#define HWLOC_SUBBITMAP_CPU_ULBIT(cpu)		((cpu)%(HWLOC_BITS_PER_LONG))
+/* Read from a bitmap ulong without knowing whether x is valid.
+ * Writers should make sure that x is valid and modify set->ulongs[x] directly.
+ */
+#define HWLOC_SUBBITMAP_READULONG(set,x)	((x) < (set)->ulongs_count ? (set)->ulongs[x] : (set)->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO)
+
+/* predefined subset values */
+#define HWLOC_SUBBITMAP_ZERO			0UL
+#define HWLOC_SUBBITMAP_FULL			(~0UL)
+#define HWLOC_SUBBITMAP_ULBIT(bit)		(1UL<<(bit))
+#define HWLOC_SUBBITMAP_CPU(cpu)		HWLOC_SUBBITMAP_ULBIT(HWLOC_SUBBITMAP_CPU_ULBIT(cpu))
+#define HWLOC_SUBBITMAP_ULBIT_TO(bit)		(HWLOC_SUBBITMAP_FULL>>(HWLOC_BITS_PER_LONG-1-(bit)))
+#define HWLOC_SUBBITMAP_ULBIT_FROM(bit)		(HWLOC_SUBBITMAP_FULL<<(bit))
+#define HWLOC_SUBBITMAP_ULBIT_FROMTO(begin,end)	(HWLOC_SUBBITMAP_ULBIT_TO(end) & HWLOC_SUBBITMAP_ULBIT_FROM(begin))
+
+/* Allocate an empty (all-zero, finite) bitmap with
+ * HWLOC_BITMAP_PREALLOC_ULONGS preallocated words.
+ * Returns NULL on allocation failure.
+ */
+struct hwloc_bitmap_s * hwloc_bitmap_alloc(void)
+{
+  struct hwloc_bitmap_s * set;
+
+  set = malloc(sizeof(struct hwloc_bitmap_s));
+  if (!set)
+    return NULL;
+
+  /* only the first ulong is considered valid initially */
+  set->ulongs_count = 1;
+  set->ulongs_allocated = HWLOC_BITMAP_PREALLOC_ULONGS;
+  set->ulongs = malloc(HWLOC_BITMAP_PREALLOC_ULONGS * sizeof(unsigned long));
+  if (!set->ulongs) {
+    free(set);
+    return NULL;
+  }
+
+  set->ulongs[0] = HWLOC_SUBBITMAP_ZERO;
+  set->infinite = 0;
+#ifdef HWLOC_DEBUG
+  set->magic = HWLOC_BITMAP_MAGIC;
+#endif
+  return set;
+}
+
+/* Allocate a full bitmap: every bit set, including the infinite tail.
+ * Returns NULL when the underlying allocation fails.
+ */
+struct hwloc_bitmap_s * hwloc_bitmap_alloc_full(void)
+{
+  struct hwloc_bitmap_s * set = hwloc_bitmap_alloc();
+  if (!set)
+    return NULL;
+  set->ulongs[0] = HWLOC_SUBBITMAP_FULL;
+  set->infinite = 1;
+  return set;
+}
+
+/* Destroy a bitmap; passing NULL is a no-op. */
+void hwloc_bitmap_free(struct hwloc_bitmap_s * set)
+{
+  if (!set)
+    return;
+
+  HWLOC__BITMAP_CHECK(set);
+#ifdef HWLOC_DEBUG
+  /* clear the magic so a use-after-free trips the debug check */
+  set->magic = 0;
+#endif
+
+  free(set->ulongs);
+  free(set);
+}
+
+/* enlarge until it contains at least needed_count ulongs.
+ * Grows the allocation to the next power of two; newly allocated words
+ * are NOT initialized and ulongs_count is NOT updated by this helper.
+ * Returns 0 on success, -1 on allocation failure (set is unchanged).
+ */
+static int
+hwloc_bitmap_enlarge_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count) __hwloc_attribute_warn_unused_result;
+static int
+hwloc_bitmap_enlarge_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+  /* round needed_count up to a power of two */
+  unsigned tmp = 1U << hwloc_flsl((unsigned long) needed_count - 1);
+  if (tmp > set->ulongs_allocated) {
+    unsigned long *tmpulongs;
+    tmpulongs = realloc(set->ulongs, tmp * sizeof(unsigned long));
+    if (!tmpulongs)
+      return -1;
+    set->ulongs = tmpulongs;
+    set->ulongs_allocated = tmp;
+  }
+  return 0;
+}
+
+/* enlarge until it contains at least needed_count ulongs,
+ * and update new ulongs according to the infinite field.
+ * Returns 0 on success, -1 on allocation failure.
+ */
+static int
+hwloc_bitmap_realloc_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count) __hwloc_attribute_warn_unused_result;
+static int
+hwloc_bitmap_realloc_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+  unsigned i;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  if (needed_count <= set->ulongs_count)
+    return 0;
+
+  /* realloc larger if needed */
+  if (hwloc_bitmap_enlarge_by_ulongs(set, needed_count) < 0)
+    return -1;
+
+  /* fill the newly allocated subset depending on the infinite flag */
+  for(i=set->ulongs_count; i<needed_count; i++)
+    set->ulongs[i] = set->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+  set->ulongs_count = needed_count;
+  return 0;
+}
+
+/* realloc until it contains at least cpu+1 bits */
+#define hwloc_bitmap_realloc_by_cpu_index(set, cpu) hwloc_bitmap_realloc_by_ulongs(set, ((cpu)/HWLOC_BITS_PER_LONG)+1)
+
+/* reset a bitmap to exactly the needed size.
+ * the caller must reinitialize all ulongs and the infinite flag later.
+ * Returns 0 on success, -1 on allocation failure.
+ */
+static int
+hwloc_bitmap_reset_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count) __hwloc_attribute_warn_unused_result;
+static int
+hwloc_bitmap_reset_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+  if (hwloc_bitmap_enlarge_by_ulongs(set, needed_count))
+    return -1;
+  set->ulongs_count = needed_count;
+  return 0;
+}
+
+/* reset until it contains exactly cpu+1 bits (roundup to a ulong).
+ * the caller must reinitialize all ulongs and the infinite flag later.
+ */
+#define hwloc_bitmap_reset_by_cpu_index(set, cpu) hwloc_bitmap_reset_by_ulongs(set, ((cpu)/HWLOC_BITS_PER_LONG)+1)
+
+/* Duplicate `old` using the given topology memory allocator (tma);
+ * hwloc_tma_malloc() falls back to plain malloc when tma is NULL.
+ * Returns NULL if old is NULL or on allocation failure.
+ */
+struct hwloc_bitmap_s * hwloc_bitmap_tma_dup(struct hwloc_tma *tma, const struct hwloc_bitmap_s * old)
+{
+  struct hwloc_bitmap_s * new;
+
+  if (!old)
+    return NULL;
+
+  HWLOC__BITMAP_CHECK(old);
+
+  new = hwloc_tma_malloc(tma, sizeof(struct hwloc_bitmap_s));
+  if (!new)
+    return NULL;
+
+  new->ulongs = hwloc_tma_malloc(tma, old->ulongs_allocated * sizeof(unsigned long));
+  if (!new->ulongs) {
+    free(new);
+    return NULL;
+  }
+  new->ulongs_allocated = old->ulongs_allocated;
+  new->ulongs_count = old->ulongs_count;
+  /* only the valid words need copying, the rest is described by `infinite` */
+  memcpy(new->ulongs, old->ulongs, new->ulongs_count * sizeof(unsigned long));
+  new->infinite = old->infinite;
+#ifdef HWLOC_DEBUG
+  new->magic = HWLOC_BITMAP_MAGIC;
+#endif
+  return new;
+}
+
+/* Duplicate `old` with the default allocator (plain malloc). */
+struct hwloc_bitmap_s * hwloc_bitmap_dup(const struct hwloc_bitmap_s * old)
+{
+  return hwloc_bitmap_tma_dup(NULL, old);
+}
+
+/* Copy src into dst (resizing dst as needed).
+ * Returns 0 on success, -1 on allocation failure.
+ */
+int hwloc_bitmap_copy(struct hwloc_bitmap_s * dst, const struct hwloc_bitmap_s * src)
+{
+  HWLOC__BITMAP_CHECK(dst);
+  HWLOC__BITMAP_CHECK(src);
+
+  if (hwloc_bitmap_reset_by_ulongs(dst, src->ulongs_count) < 0)
+    return -1;
+
+  memcpy(dst->ulongs, src->ulongs, src->ulongs_count * sizeof(unsigned long));
+  dst->infinite = src->infinite;
+  return 0;
+}
+
+/* Strings always use 32bit groups */
+#define HWLOC_PRIxSUBBITMAP		"%08lx"
+#define HWLOC_BITMAP_SUBSTRING_SIZE	32
+#define HWLOC_BITMAP_SUBSTRING_LENGTH	(HWLOC_BITMAP_SUBSTRING_SIZE/4)
+#define HWLOC_BITMAP_STRING_PER_LONG	(HWLOC_BITS_PER_LONG/HWLOC_BITMAP_SUBSTRING_SIZE)
+
+/* Print `set` into buf (at most buflen chars, always NUL-terminated when
+ * buflen > 0) as comma-separated 32-bit hex groups, most significant
+ * first, preceded by "0xf...f" when the bitmap is infinite.
+ * Returns the number of characters that would have been written
+ * (snprintf semantics), or -1 on error.
+ */
+int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  ssize_t size = buflen;
+  char *tmp = buf;
+  int res, ret = 0;
+  int needcomma = 0;
+  int i;
+  /* accumulator holding up to one ulong worth of not-yet-printed bits,
+   * consumed 32 bits at a time from the most significant end */
+  unsigned long accum = 0;
+  int accumed = 0;
+#if HWLOC_BITS_PER_LONG == HWLOC_BITMAP_SUBSTRING_SIZE
+  const unsigned long accum_mask = ~0UL;
+#else /* HWLOC_BITS_PER_LONG != HWLOC_BITMAP_SUBSTRING_SIZE */
+  const unsigned long accum_mask = ((1UL << HWLOC_BITMAP_SUBSTRING_SIZE) - 1) << (HWLOC_BITS_PER_LONG - HWLOC_BITMAP_SUBSTRING_SIZE);
+#endif /* HWLOC_BITS_PER_LONG != HWLOC_BITMAP_SUBSTRING_SIZE */
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* mark the end in case we do nothing later */
+  if (buflen > 0)
+    tmp[0] = '\0';
+
+  if (set->infinite) {
+    res = hwloc_snprintf(tmp, size, "0xf...f");
+    needcomma = 1;
+    if (res < 0)
+      return -1;
+    ret += res;
+    /* clamp the advance so tmp never walks past the end of buf */
+    if (res >= size)
+      res = size>0 ? (int)size - 1 : 0;
+    tmp += res;
+    size -= res;
+  }
+
+  i=(int) set->ulongs_count-1;
+
+  if (set->infinite) {
+    /* ignore starting FULL since we have 0xf...f already */
+    while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_FULL)
+      i--;
+  } else {
+    /* ignore starting ZERO except the last one */
+    while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_ZERO)
+      i--;
+  }
+
+  while (i>=0 || accumed) {
+    /* Refill accumulator */
+    if (!accumed) {
+      accum = set->ulongs[i--];
+      accumed = HWLOC_BITS_PER_LONG;
+    }
+
+    if (accum & accum_mask) {
+      /* print the whole subset if not empty */
+        res = hwloc_snprintf(tmp, size, needcomma ? ",0x" HWLOC_PRIxSUBBITMAP : "0x" HWLOC_PRIxSUBBITMAP,
+		     (accum & accum_mask) >> (HWLOC_BITS_PER_LONG - HWLOC_BITMAP_SUBSTRING_SIZE));
+      needcomma = 1;
+    } else if (i == -1 && accumed == HWLOC_BITMAP_SUBSTRING_SIZE) {
+      /* print a single 0 to mark the last subset */
+      res = hwloc_snprintf(tmp, size, needcomma ? ",0x0" : "0x0");
+    } else if (needcomma) {
+      res = hwloc_snprintf(tmp, size, ",");
+    } else {
+      res = 0;
+    }
+    if (res < 0)
+      return -1;
+    ret += res;
+
+#if HWLOC_BITS_PER_LONG == HWLOC_BITMAP_SUBSTRING_SIZE
+    accum = 0;
+    accumed = 0;
+#else
+    /* shift the consumed 32 bits out of the accumulator */
+    accum <<= HWLOC_BITMAP_SUBSTRING_SIZE;
+    accumed -= HWLOC_BITMAP_SUBSTRING_SIZE;
+#endif
+
+    if (res >= size)
+      res = size>0 ? (int)size - 1 : 0;
+
+    tmp += res;
+    size -= res;
+  }
+
+  /* if didn't display anything, display 0x0 */
+  if (!ret) {
+    res = hwloc_snprintf(tmp, size, "0x0");
+    if (res < 0)
+      return -1;
+    ret += res;
+  }
+
+  return ret;
+}
+
+/* Print `set` into a newly malloc'ed string returned through *strp.
+ * Returns the string length, or -1 on formatting or allocation error.
+ */
+int hwloc_bitmap_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  int len;
+  char *buf;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* first pass with a NULL buffer only computes the required length */
+  len = hwloc_bitmap_snprintf(NULL, 0, set);
+  if (len < 0)
+    /* formatting failed, don't malloc a bogus size */
+    return -1;
+  buf = malloc(len+1);
+  if (!buf)
+    return -1;
+  *strp = buf;
+  return hwloc_bitmap_snprintf(buf, len+1, set);
+}
+
+/* Parse hwloc's comma-separated hex-substring format (the output of
+ * hwloc_bitmap_snprintf, optionally starting with "0xf...f") into `set`.
+ * Returns 0 on success, -1 on parse or allocation failure (set is zeroed
+ * on parse failure).
+ */
+int hwloc_bitmap_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{
+  const char * current = string;
+  unsigned long accum = 0;
+  int count=0;
+  int infinite = 0;
+
+  /* count how many substrings there are */
+  count++;
+  while ((current = strchr(current+1, ',')) != NULL)
+    count++;
+
+  current = string;
+  if (!strncmp("0xf...f", current, 7)) {
+    current += 7;
+    if (*current != ',') {
+      /* special case for infinite/full bitmap */
+      hwloc_bitmap_fill(set);
+      return 0;
+    }
+    current++;
+    infinite = 1;
+    count--;
+  }
+
+  /* size the bitmap for the number of 32-bit substrings found */
+  if (hwloc_bitmap_reset_by_ulongs(set, (count + HWLOC_BITMAP_STRING_PER_LONG - 1) / HWLOC_BITMAP_STRING_PER_LONG) < 0)
+    return -1;
+  set->infinite = 0;
+
+  while (*current != '\0') {
+    unsigned long val;
+    char *next;
+    val = strtoul(current, &next, 16);
+
+    assert(count > 0);
+    count--;
+
+    /* pack substrings (most significant first) into whole ulongs */
+    accum |= (val << ((count * HWLOC_BITMAP_SUBSTRING_SIZE) % HWLOC_BITS_PER_LONG));
+    if (!(count % HWLOC_BITMAP_STRING_PER_LONG)) {
+      set->ulongs[count / HWLOC_BITMAP_STRING_PER_LONG] = accum;
+      accum = 0;
+    }
+
+    if (*next != ',') {
+      /* end of string must coincide with the last counted substring */
+      if (*next || count > 0)
+	goto failed;
+      else
+	break;
+    }
+    current = (const char*) next+1;
+  }
+
+  set->infinite = infinite; /* set at the end, to avoid spurious realloc with filled new ulongs */
+
+  return 0;
+
+ failed:
+  /* failure to parse */
+  hwloc_bitmap_zero(set);
+  return -1;
+}
+
+/* Print `set` into buf as a comma-separated list of decimal indexes and
+ * ranges (e.g. "0-3,8,12-"; a trailing "-" denotes an infinite range).
+ * Returns the number of characters that would have been written
+ * (snprintf semantics), or -1 on error.
+ */
+int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  int prev = -1;
+  ssize_t size = buflen;
+  char *tmp = buf;
+  int res, ret = 0;
+  int needcomma = 0;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* mark the end in case we do nothing later */
+  if (buflen > 0)
+    tmp[0] = '\0';
+
+  while (1) {
+    int begin, end;
+
+    /* find the next run of set bits: [begin, end) */
+    begin = hwloc_bitmap_next(set, prev);
+    if (begin == -1)
+      break;
+    end = hwloc_bitmap_next_unset(set, begin);
+
+    if (end == begin+1) {
+      res = hwloc_snprintf(tmp, size, needcomma ? ",%d" : "%d", begin);
+    } else if (end == -1) {
+      /* run never ends: print an open-ended range */
+      res = hwloc_snprintf(tmp, size, needcomma ? ",%d-" : "%d-", begin);
+    } else {
+      res = hwloc_snprintf(tmp, size, needcomma ? ",%d-%d" : "%d-%d", begin, end-1);
+    }
+    if (res < 0)
+      return -1;
+    ret += res;
+
+    /* clamp the advance so tmp never walks past the end of buf */
+    if (res >= size)
+      res = size>0 ? (int)size - 1 : 0;
+
+    tmp += res;
+    size -= res;
+    needcomma = 1;
+
+    if (end == -1)
+      break;
+    else
+      prev = end - 1;
+  }
+
+  return ret;
+}
+
+/* Print `set` as a list of ranges into a newly malloc'ed string returned
+ * through *strp.  Returns the string length, or -1 on error.
+ */
+int hwloc_bitmap_list_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  int len;
+  char *buf;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* first pass with a NULL buffer only computes the required length */
+  len = hwloc_bitmap_list_snprintf(NULL, 0, set);
+  if (len < 0)
+    /* formatting failed, don't malloc a bogus size */
+    return -1;
+  buf = malloc(len+1);
+  if (!buf)
+    return -1;
+  *strp = buf;
+  return hwloc_bitmap_list_snprintf(buf, len+1, set);
+}
+
+/* Parse a comma/space-separated list of decimal values and ranges
+ * ("0-3,8,12-"; a trailing "-" means an infinite range) into `set`.
+ * Returns 0 on success, -1 on parse failure (set is zeroed).
+ */
+int hwloc_bitmap_list_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{
+  const char * current = string;
+  char *next;
+  long begin = -1, val;
+
+  hwloc_bitmap_zero(set);
+
+  while (*current != '\0') {
+
+    /* ignore empty ranges */
+    while (*current == ',' || *current == ' ')
+      current++;
+
+    val = strtoul(current, &next, 0);
+    /* make sure we got at least one digit */
+    if (next == current)
+      goto failed;
+
+    if (begin != -1) {
+      /* finishing a range */
+      hwloc_bitmap_set_range(set, begin, val);
+      begin = -1;
+
+    } else if (*next == '-') {
+      /* starting a new range */
+      if (*(next+1) == '\0') {
+	/* infinite range */
+	hwloc_bitmap_set_range(set, val, -1);
+        break;
+      } else {
+	/* normal range */
+	begin = val;
+      }
+
+    } else if (*next == ',' || *next == ' ' || *next == '\0') {
+      /* single digit */
+      hwloc_bitmap_set(set, val);
+    }
+
+    if (*next == '\0')
+      break;
+    current = next+1;
+  }
+
+  return 0;
+
+ failed:
+  /* failure to parse */
+  hwloc_bitmap_zero(set);
+  return -1;
+}
+
+/* Print `set` into buf as one contiguous hex number, taskset-style
+ * (most significant ulong first, "0xf...f" prefix when infinite).
+ * Returns the number of characters that would have been written
+ * (snprintf semantics), or -1 on error.
+ */
+int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  ssize_t size = buflen;
+  char *tmp = buf;
+  int res, ret = 0;
+  int started = 0;
+  int i;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* mark the end in case we do nothing later */
+  if (buflen > 0)
+    tmp[0] = '\0';
+
+  if (set->infinite) {
+    res = hwloc_snprintf(tmp, size, "0xf...f");
+    started = 1;
+    if (res < 0)
+      return -1;
+    ret += res;
+    /* clamp the advance so tmp never walks past the end of buf */
+    if (res >= size)
+      res = size>0 ? (int)size - 1 : 0;
+    tmp += res;
+    size -= res;
+  }
+
+  i=set->ulongs_count-1;
+
+  if (set->infinite) {
+    /* ignore starting FULL since we have 0xf...f already */
+    while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_FULL)
+      i--;
+  } else {
+    /* ignore starting ZERO except the last one */
+    while (i>=1 && set->ulongs[i] == HWLOC_SUBBITMAP_ZERO)
+      i--;
+  }
+
+  while (i>=0) {
+    unsigned long val = set->ulongs[i--];
+    if (started) {
+      /* print the whole subset, zero-padded to keep digit positions */
+#if HWLOC_BITS_PER_LONG == 64
+      res = hwloc_snprintf(tmp, size, "%016lx", val);
+#else
+      res = hwloc_snprintf(tmp, size, "%08lx", val);
+#endif
+    } else if (val || i == -1) {
+      /* first (most significant) word printed without padding */
+      res = hwloc_snprintf(tmp, size, "0x%lx", val);
+      started = 1;
+    } else {
+      res = 0;
+    }
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= size)
+      res = size>0 ? (int)size - 1 : 0;
+    tmp += res;
+    size -= res;
+  }
+
+  /* if didn't display anything, display 0x0 */
+  if (!ret) {
+    res = hwloc_snprintf(tmp, size, "0x0");
+    if (res < 0)
+      return -1;
+    ret += res;
+  }
+
+  return ret;
+}
+
+/* Print `set` taskset-style into a newly malloc'ed string returned
+ * through *strp.  Returns the string length, or -1 on error.
+ */
+int hwloc_bitmap_taskset_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  int len;
+  char *buf;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* first pass with a NULL buffer only computes the required length */
+  len = hwloc_bitmap_taskset_snprintf(NULL, 0, set);
+  if (len < 0)
+    /* formatting failed, don't malloc a bogus size */
+    return -1;
+  buf = malloc(len+1);
+  if (!buf)
+    return -1;
+  *strp = buf;
+  return hwloc_bitmap_taskset_snprintf(buf, len+1, set);
+}
+
+/* Parse a single taskset-style hex string (optionally prefixed with
+ * "0x" or "0xf...f") into `set`.  Returns 0 on success, -1 on parse or
+ * allocation failure (set is zeroed on parse failure).
+ */
+int hwloc_bitmap_taskset_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{
+  const char * current = string;
+  int chars;
+  int count;
+  int infinite = 0;
+
+  if (!strncmp("0xf...f", current, 7)) {
+    /* infinite bitmap */
+    infinite = 1;
+    current += 7;
+    if (*current == '\0') {
+      /* special case for infinite/full bitmap */
+      hwloc_bitmap_fill(set);
+      return 0;
+    }
+  } else {
+    /* finite bitmap */
+    if (!strncmp("0x", current, 2))
+      current += 2;
+    if (*current == '\0') {
+      /* special case for empty bitmap */
+      hwloc_bitmap_zero(set);
+      return 0;
+    }
+  }
+  /* we know there are other characters now */
+
+  /* 4 bits per hex digit, rounded up to whole ulongs */
+  chars = (int)strlen(current);
+  count = (chars * 4 + HWLOC_BITS_PER_LONG - 1) / HWLOC_BITS_PER_LONG;
+
+  if (hwloc_bitmap_reset_by_ulongs(set, count) < 0)
+    return -1;
+  set->infinite = 0;
+
+  while (*current != '\0') {
+    int tmpchars;
+    char ustr[17];
+    unsigned long val;
+    char *next;
+
+    /* the first chunk may be shorter than a full ulong's worth of digits */
+    tmpchars = chars % (HWLOC_BITS_PER_LONG/4);
+    if (!tmpchars)
+      tmpchars = (HWLOC_BITS_PER_LONG/4);
+
+    memcpy(ustr, current, tmpchars);
+    ustr[tmpchars] = '\0';
+    val = strtoul(ustr, &next, 16);
+    if (*next != '\0')
+      goto failed;
+
+    /* chunks arrive most significant first */
+    set->ulongs[count-1] = val;
+
+    current += tmpchars;
+    chars -= tmpchars;
+    count--;
+  }
+
+  set->infinite = infinite; /* set at the end, to avoid spurious realloc with filled new ulongs */
+
+  return 0;
+
+ failed:
+  /* failure to parse */
+  hwloc_bitmap_zero(set);
+  return -1;
+}
+
+/* Internal helper: clear every allocated ulong and mark the infinite
+ * tail as unset. The number of allocated ulongs is left unchanged. */
+static void hwloc_bitmap__zero(struct hwloc_bitmap_s *set)
+{
+	unsigned idx = 0;
+	while (idx < set->ulongs_count) {
+		set->ulongs[idx] = HWLOC_SUBBITMAP_ZERO;
+		idx++;
+	}
+	set->infinite = 0;
+}
+
+/* Empty the bitmap: no bit set, finite.
+ * Storage is shrunk back to a single ulong. */
+void hwloc_bitmap_zero(struct hwloc_bitmap_s * set)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	HWLOC_BUILD_ASSERT(HWLOC_BITMAP_PREALLOC_ULONGS >= 1);
+	/* reset cannot fail since we preallocate some ulongs;
+	 * if we ever preallocate nothing, we'll reset to 0 ulongs. */
+	(void) hwloc_bitmap_reset_by_ulongs(set, 1);
+	hwloc_bitmap__zero(set);
+}
+
+/* Internal helper: set every allocated ulong to full and mark the
+ * infinite tail as set. The number of allocated ulongs is unchanged. */
+static void hwloc_bitmap__fill(struct hwloc_bitmap_s * set)
+{
+	unsigned idx = 0;
+	while (idx < set->ulongs_count) {
+		set->ulongs[idx] = HWLOC_SUBBITMAP_FULL;
+		idx++;
+	}
+	set->infinite = 1;
+}
+
+/* Fill the bitmap: every bit set, including the infinite part.
+ * Storage is shrunk back to a single ulong. */
+void hwloc_bitmap_fill(struct hwloc_bitmap_s * set)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	HWLOC_BUILD_ASSERT(HWLOC_BITMAP_PREALLOC_ULONGS >= 1);
+	/* reset cannot fail since we pre-allocate some ulongs;
+	 * if we ever pre-allocate nothing, we'll reset to 0 ulongs. */
+	(void) hwloc_bitmap_reset_by_ulongs(set, 1);
+	hwloc_bitmap__fill(set);
+}
+
+/* Reinitialize the bitmap from a single ulong mask; all bits above the
+ * first ulong become unset. Always returns 0 in the current code since
+ * at least one ulong is preallocated. */
+int hwloc_bitmap_from_ulong(struct hwloc_bitmap_s *set, unsigned long mask)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	HWLOC_BUILD_ASSERT(HWLOC_BITMAP_PREALLOC_ULONGS >= 1);
+	/* cannot fail since we pre-allocate some ulongs;
+	 * if we ever pre-allocate nothing, we may have to return a failure. */
+	(void) hwloc_bitmap_reset_by_ulongs(set, 1);
+	set->ulongs[0] = mask; /* there's always at least one ulong allocated */
+	set->infinite = 0;
+	return 0;
+}
+
+/* Reinitialize the bitmap so that ulong #i equals mask and every other
+ * bit is unset. Returns -1 on allocation failure. */
+int hwloc_bitmap_from_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
+{
+	unsigned j;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (hwloc_bitmap_reset_by_ulongs(set, i+1) < 0)
+		return -1;
+
+	/* clear the ulongs below i, then install the mask */
+	for(j=0; j<i; j++)
+		set->ulongs[j] = HWLOC_SUBBITMAP_ZERO;
+	set->ulongs[i] = mask;
+	set->infinite = 0;
+	return 0;
+}
+
+/* Return the first (least-significant) ulong of the bitmap. */
+unsigned long hwloc_bitmap_to_ulong(const struct hwloc_bitmap_s *set)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	/* at least one ulong is always allocated */
+	return set->ulongs[0];
+}
+
+/* Return ulong #i of the bitmap, read through HWLOC_SUBBITMAP_READULONG
+ * (which handles indices beyond the allocated part). */
+unsigned long hwloc_bitmap_to_ith_ulong(const struct hwloc_bitmap_s *set, unsigned i)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	return HWLOC_SUBBITMAP_READULONG(set, i);
+}
+
+/* Empty the bitmap, then set only bit `cpu`.
+ * Returns -1 on allocation failure. */
+int hwloc_bitmap_only(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	if (hwloc_bitmap_reset_by_cpu_index(set, cpu) < 0)
+		return -1;
+
+	hwloc_bitmap__zero(set);
+	set->ulongs[HWLOC_SUBBITMAP_INDEX(cpu)] |= HWLOC_SUBBITMAP_CPU(cpu);
+	return 0;
+}
+
+/* Fill the bitmap (including the infinite part), then clear only bit
+ * `cpu`. Returns -1 on allocation failure. */
+int hwloc_bitmap_allbut(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	if (hwloc_bitmap_reset_by_cpu_index(set, cpu) < 0)
+		return -1;
+
+	hwloc_bitmap__fill(set);
+	set->ulongs[HWLOC_SUBBITMAP_INDEX(cpu)] &= ~HWLOC_SUBBITMAP_CPU(cpu);
+	return 0;
+}
+
+/* Set bit `cpu`, growing storage if needed.
+ * Returns -1 on allocation failure. */
+int hwloc_bitmap_set(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	/* a bit inside the set infinite part is already set */
+	if (set->infinite && cpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		return 0;
+
+	if (hwloc_bitmap_realloc_by_cpu_index(set, cpu) < 0)
+		return -1;
+
+	set->ulongs[HWLOC_SUBBITMAP_INDEX(cpu)] |= HWLOC_SUBBITMAP_CPU(cpu);
+	return 0;
+}
+
+/* Set all bits in [begincpu, _endcpu], both inclusive.
+ * _endcpu == -1 means the range extends to infinity.
+ * Returns 0 on success, -1 on allocation failure.
+ */
+int hwloc_bitmap_set_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
+{
+	unsigned i;
+	unsigned beginset,endset;
+	unsigned endcpu = (unsigned) _endcpu;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* note: _endcpu == -1 wraps to UINT_MAX, so it passes this check */
+	if (endcpu < begincpu)
+		return 0;
+	if (set->infinite && begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		/* setting only in the already-set infinite part, nothing to do */
+		return 0;
+
+	if (_endcpu == -1) {
+		/* infinite range */
+
+		/* make sure we can play with the ulong that contains begincpu */
+		if (hwloc_bitmap_realloc_by_cpu_index(set, begincpu) < 0)
+			return -1;
+
+		/* update the ulong that contains begincpu */
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		/* set ulongs after begincpu if any already allocated */
+		for(i=beginset+1; i<set->ulongs_count; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
+		/* mark the infinity as set */
+		set->infinite = 1;
+	} else {
+		/* finite range */
+
+		/* ignore the part of the range that overlaps with the already-set infinite part */
+		if (set->infinite && endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+			endcpu = set->ulongs_count * HWLOC_BITS_PER_LONG - 1;
+		/* make sure we can play with the ulongs that contain begincpu and endcpu */
+		if (hwloc_bitmap_realloc_by_cpu_index(set, endcpu) < 0)
+			return -1;
+
+		/* update first and last ulongs */
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		endset = HWLOC_SUBBITMAP_INDEX(endcpu);
+		if (beginset == endset) {
+			set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+		} else {
+			set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+			set->ulongs[endset] |= HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+		}
+		/* set ulongs in the middle of the range */
+		for(i=beginset+1; i<endset; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
+	}
+
+	return 0;
+}
+
+/* Overwrite ulong #i with mask, growing storage if needed.
+ * Returns -1 on allocation failure. */
+int hwloc_bitmap_set_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	/* ensure ulong #i exists before writing to it */
+	if (hwloc_bitmap_realloc_by_ulongs(set, i+1) < 0)
+		return -1;
+
+	set->ulongs[i] = mask;
+	return 0;
+}
+
+/* Clear bit `cpu`, growing storage if needed.
+ * Returns -1 on allocation failure. */
+int hwloc_bitmap_clr(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	/* a bit inside the infinitely-unset part is already clear */
+	if (!set->infinite && cpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		return 0;
+
+	if (hwloc_bitmap_realloc_by_cpu_index(set, cpu) < 0)
+		return -1;
+
+	set->ulongs[HWLOC_SUBBITMAP_INDEX(cpu)] &= ~HWLOC_SUBBITMAP_CPU(cpu);
+	return 0;
+}
+
+/* Clear all bits in [begincpu, _endcpu], both inclusive.
+ * _endcpu == -1 means the range extends to infinity.
+ * Returns 0 on success, -1 on allocation failure.
+ */
+int hwloc_bitmap_clr_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
+{
+	unsigned i;
+	unsigned beginset,endset;
+	unsigned endcpu = (unsigned) _endcpu;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* note: _endcpu == -1 wraps to UINT_MAX, so it passes this check */
+	if (endcpu < begincpu)
+		return 0;
+
+	if (!set->infinite && begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		/* clearing only in the already-unset infinite part, nothing to do */
+		return 0;
+
+	if (_endcpu == -1) {
+		/* infinite range */
+
+		/* make sure we can play with the ulong that contains begincpu */
+		if (hwloc_bitmap_realloc_by_cpu_index(set, begincpu) < 0)
+			return -1;
+
+		/* update the ulong that contains begincpu */
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		/* clear ulong after begincpu if any already allocated */
+		for(i=beginset+1; i<set->ulongs_count; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+		/* mark the infinity as unset */
+		set->infinite = 0;
+	} else {
+		/* finite range */
+
+		/* ignore the part of the range that overlaps with the already-unset infinite part */
+		if (!set->infinite && endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+			endcpu = set->ulongs_count * HWLOC_BITS_PER_LONG - 1;
+		/* make sure we can play with the ulongs that contain begincpu and endcpu */
+		if (hwloc_bitmap_realloc_by_cpu_index(set, endcpu) < 0)
+			return -1;
+
+		/* update first and last ulongs */
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		endset = HWLOC_SUBBITMAP_INDEX(endcpu);
+		if (beginset == endset) {
+			set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+		} else {
+			set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+			set->ulongs[endset] &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+		}
+		/* clear ulongs in the middle of the range */
+		for(i=beginset+1; i<endset; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+	}
+
+	return 0;
+}
+
+/* Return 1 if bit `cpu` is set, 0 otherwise. */
+int hwloc_bitmap_isset(const struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	return (HWLOC_SUBBITMAP_READULONG(set, HWLOC_SUBBITMAP_INDEX(cpu)) & HWLOC_SUBBITMAP_CPU(cpu)) != 0;
+}
+
+/* Return 1 if no bit is set at all (finite and empty), 0 otherwise. */
+int hwloc_bitmap_iszero(const struct hwloc_bitmap_s *set)
+{
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* an infinitely-set tail means it cannot be empty */
+	if (set->infinite)
+		return 0;
+	/* scan from the top; direction does not matter for emptiness */
+	i = set->ulongs_count;
+	while (i-- > 0)
+		if (set->ulongs[i] != HWLOC_SUBBITMAP_ZERO)
+			return 0;
+	return 1;
+}
+
+/* Return 1 if every bit is set (infinite and all ulongs full), 0 otherwise. */
+int hwloc_bitmap_isfull(const struct hwloc_bitmap_s *set)
+{
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* a finite bitmap can never be full */
+	if (!set->infinite)
+		return 0;
+	i = set->ulongs_count;
+	while (i-- > 0)
+		if (set->ulongs[i] != HWLOC_SUBBITMAP_FULL)
+			return 0;
+	return 1;
+}
+
+/* Return 1 iff the two bitmaps contain exactly the same bits, taking the
+ * infinite parts into account. Allocated sizes may legitimately differ.
+ */
+int hwloc_bitmap_isequal (const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned min_count = count1 < count2 ? count1 : count2;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	for(i=0; i<min_count; i++)
+		if (set1->ulongs[i] != set2->ulongs[i])
+			return 0;
+
+	if (count1 != count2) {
+		/* compare each set's extra ulongs against the OTHER set's
+		 * virtual word (full if its infinite part is set, zero otherwise) */
+		unsigned long w1 = set1->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+		unsigned long w2 = set2->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+		for(i=min_count; i<count1; i++) {
+			if (set1->ulongs[i] != w2)
+				return 0;
+		}
+		for(i=min_count; i<count2; i++) {
+			if (set2->ulongs[i] != w1)
+				return 0;
+		}
+	}
+
+	if (set1->infinite != set2->infinite)
+		return 0;
+
+	return 1;
+}
+
+/* Return 1 iff the two bitmaps share at least one set bit. */
+int hwloc_bitmap_intersects (const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned min_count = count1 < count2 ? count1 : count2;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	for(i=0; i<min_count; i++)
+		if (set1->ulongs[i] & set2->ulongs[i])
+			return 1;
+
+	if (count1 != count2) {
+		/* one set's extra ulongs can only intersect the other set's
+		 * infinite part, so check them only when that part is set */
+		if (set2->infinite) {
+			for(i=min_count; i<set1->ulongs_count; i++)
+				if (set1->ulongs[i])
+					return 1;
+		}
+		if (set1->infinite) {
+			for(i=min_count; i<set2->ulongs_count; i++)
+				if (set2->ulongs[i])
+					return 1;
+		}
+	}
+
+	/* two set infinite parts always overlap */
+	if (set1->infinite && set2->infinite)
+		return 1;
+
+	return 0;
+}
+
+/* Return 1 iff every bit set in sub_set is also set in super_set
+ * (i.e. sub_set is included in super_set). */
+int hwloc_bitmap_isincluded (const struct hwloc_bitmap_s *sub_set, const struct hwloc_bitmap_s *super_set)
+{
+	unsigned super_count = super_set->ulongs_count;
+	unsigned sub_count = sub_set->ulongs_count;
+	unsigned min_count = super_count < sub_count ? super_count : sub_count;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(sub_set);
+	HWLOC__BITMAP_CHECK(super_set);
+
+	/* inclusion per word: OR-ing the subset in must not change the superset */
+	for(i=0; i<min_count; i++)
+		if (super_set->ulongs[i] != (super_set->ulongs[i] | sub_set->ulongs[i]))
+			return 0;
+
+	if (super_count != sub_count) {
+		/* subset bits beyond the superset's storage must be covered by
+		 * the superset's infinite part */
+		if (!super_set->infinite)
+			for(i=min_count; i<sub_count; i++)
+				if (sub_set->ulongs[i])
+					return 0;
+		/* a set subset infinite part requires the superset's extra ulongs
+		 * to be entirely full */
+		if (sub_set->infinite)
+			for(i=min_count; i<super_count; i++)
+				if (super_set->ulongs[i] != HWLOC_SUBBITMAP_FULL)
+					return 0;
+	}
+
+	if (sub_set->infinite && !super_set->infinite)
+		return 0;
+
+	return 1;
+}
+
+/* res = set1 | set2. res may alias set1 and/or set2.
+ * Returns 0 on success, -1 on allocation failure. */
+int hwloc_bitmap_or (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	/* cache counts so that we can reset res even if it's also set1 or set2 */
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	/* min(count1,count2) computed without an extra branch */
+	unsigned min_count = count1 + count2 - max_count;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	if (hwloc_bitmap_reset_by_ulongs(res, max_count) < 0)
+		return -1;
+
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] | set2->ulongs[i];
+
+	if (count1 != count2) {
+		/* the longer set's extra ulongs survive unless the shorter set's
+		 * infinite part is set, in which case the tail is all-full and
+		 * can be represented by the infinite flag alone (truncate) */
+		if (min_count < count1) {
+			if (set2->infinite) {
+				res->ulongs_count = min_count;
+			} else {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = set1->ulongs[i];
+			}
+		} else {
+			if (set1->infinite) {
+				res->ulongs_count = min_count;
+			} else {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = set2->ulongs[i];
+			}
+		}
+	}
+
+	res->infinite = set1->infinite || set2->infinite;
+	return 0;
+}
+
+/* res = set1 & set2. res may alias set1 and/or set2.
+ * Returns 0 on success, -1 on allocation failure. */
+int hwloc_bitmap_and (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	/* cache counts so that we can reset res even if it's also set1 or set2 */
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	/* min(count1,count2) computed without an extra branch */
+	unsigned min_count = count1 + count2 - max_count;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	if (hwloc_bitmap_reset_by_ulongs(res, max_count) < 0)
+		return -1;
+
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] & set2->ulongs[i];
+
+	if (count1 != count2) {
+		/* the longer set's extra ulongs survive only when AND-ed against
+		 * the shorter set's set infinite part; otherwise the tail is zero
+		 * and storage can be truncated */
+		if (min_count < count1) {
+			if (set2->infinite) {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = set1->ulongs[i];
+			} else {
+				res->ulongs_count = min_count;
+			}
+		} else {
+			if (set1->infinite) {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = set2->ulongs[i];
+			} else {
+				res->ulongs_count = min_count;
+			}
+		}
+	}
+
+	res->infinite = set1->infinite && set2->infinite;
+	return 0;
+}
+
+/* res = set1 & ~set2. res may alias set1 and/or set2.
+ * Returns 0 on success, -1 on allocation failure. */
+int hwloc_bitmap_andnot (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	/* cache counts so that we can reset res even if it's also set1 or set2 */
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	/* min(count1,count2) computed without an extra branch */
+	unsigned min_count = count1 + count2 - max_count;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	if (hwloc_bitmap_reset_by_ulongs(res, max_count) < 0)
+		return -1;
+
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] & ~set2->ulongs[i];
+
+	if (count1 != count2) {
+		/* set1's extra ulongs survive unless set2's infinite part masks
+		 * them out; set2's extra ulongs matter only against set1's
+		 * infinite part (result is their complement) */
+		if (min_count < count1) {
+			if (!set2->infinite) {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = set1->ulongs[i];
+			} else {
+				res->ulongs_count = min_count;
+			}
+		} else {
+			if (set1->infinite) {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = ~set2->ulongs[i];
+			} else {
+				res->ulongs_count = min_count;
+			}
+		}
+	}
+
+	res->infinite = set1->infinite && !set2->infinite;
+	return 0;
+}
+
+/* res = set1 ^ set2. res may alias set1 and/or set2.
+ * Returns 0 on success, -1 on allocation failure. */
+int hwloc_bitmap_xor (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	/* cache counts so that we can reset res even if it's also set1 or set2 */
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	/* min(count1,count2) computed without an extra branch */
+	unsigned min_count = count1 + count2 - max_count;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	if (hwloc_bitmap_reset_by_ulongs(res, max_count) < 0)
+		return -1;
+
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] ^ set2->ulongs[i];
+
+	if (count1 != count2) {
+		/* XOR the longer set's extra ulongs with the shorter set's
+		 * virtual word (full if its infinite part is set, zero otherwise) */
+		if (min_count < count1) {
+			unsigned long w2 = set2->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+			for(i=min_count; i<max_count; i++)
+				res->ulongs[i] = set1->ulongs[i] ^ w2;
+		} else {
+			unsigned long w1 = set1->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+			for(i=min_count; i<max_count; i++)
+				res->ulongs[i] = set2->ulongs[i] ^ w1;
+		}
+	}
+
+	/* logical XOR of the two infinite flags */
+	res->infinite = (!set1->infinite) != (!set2->infinite);
+	return 0;
+}
+
+/* res = ~set (complement every ulong and flip the infinite flag).
+ * res may alias set. Returns 0 on success, -1 on allocation failure. */
+int hwloc_bitmap_not (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set)
+{
+	/* cache the count so that res may alias set */
+	unsigned nr = set->ulongs_count;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set);
+
+	if (hwloc_bitmap_reset_by_ulongs(res, nr) < 0)
+		return -1;
+
+	for(i=0; i<nr; i++)
+		res->ulongs[i] = ~set->ulongs[i];
+
+	res->infinite = !set->infinite;
+	return 0;
+}
+
+/* Return the index of the lowest set bit, or -1 if the bitmap is empty. */
+int hwloc_bitmap_first(const struct hwloc_bitmap_s * set)
+{
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	for(i=0; i<set->ulongs_count; i++) {
+		/* subsets are unsigned longs, use ffsl */
+		unsigned long w = set->ulongs[i];
+		if (w != 0)
+			return HWLOC_BITS_PER_LONG*i + hwloc_ffsl(w) - 1;
+	}
+
+	/* nothing in the allocated part: first infinite bit, if any */
+	return set->infinite ? (int)(set->ulongs_count * HWLOC_BITS_PER_LONG) : -1;
+}
+
+/* Return the index of the lowest unset bit, or -1 if every bit is set. */
+int hwloc_bitmap_first_unset(const struct hwloc_bitmap_s * set)
+{
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	for(i=0; i<set->ulongs_count; i++) {
+		/* complement the word and look for its first set bit */
+		unsigned long w = ~set->ulongs[i];
+		if (w != 0)
+			return HWLOC_BITS_PER_LONG*i + hwloc_ffsl(w) - 1;
+	}
+
+	/* allocated part entirely full: first unset bit is in the tail,
+	 * unless the infinite part is set too */
+	return set->infinite ? -1 : (int)(set->ulongs_count * HWLOC_BITS_PER_LONG);
+}
+
+/* Return the index of the highest set bit, or -1 if the bitmap is empty
+ * or its infinite part is set (no highest bit exists then). */
+int hwloc_bitmap_last(const struct hwloc_bitmap_s * set)
+{
+	int i;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (set->infinite)
+		return -1;
+
+	/* walk downwards from the most-significant ulong */
+	for(i=(int)set->ulongs_count-1; i>=0; i--) {
+		unsigned long w = set->ulongs[i];
+		if (w != 0)
+			return HWLOC_BITS_PER_LONG*i + hwloc_flsl(w) - 1;
+	}
+
+	return -1;
+}
+
+/* Return the index of the highest unset bit, or -1 if the bitmap is full
+ * or its infinite part is unset (no highest unset bit exists then). */
+int hwloc_bitmap_last_unset(const struct hwloc_bitmap_s * set)
+{
+	int i;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (!set->infinite)
+		return -1;
+
+	/* walk downwards, looking for the topmost zero bit */
+	for(i=(int)set->ulongs_count-1; i>=0; i--) {
+		unsigned long w = ~set->ulongs[i];
+		if (w != 0)
+			return HWLOC_BITS_PER_LONG*i + hwloc_flsl(w) - 1;
+	}
+
+	return -1;
+}
+
+/* Return the index of the first set bit strictly after prev_cpu,
+ * or -1 if there is none. Pass prev_cpu == -1 to start from bit 0. */
+int hwloc_bitmap_next(const struct hwloc_bitmap_s * set, int prev_cpu)
+{
+	unsigned i = HWLOC_SUBBITMAP_INDEX(prev_cpu + 1);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* already past the allocated part: everything there follows the
+	 * infinite flag */
+	if (i >= set->ulongs_count) {
+		if (set->infinite)
+			return prev_cpu + 1;
+		else
+			return -1;
+	}
+
+	for(; i<set->ulongs_count; i++) {
+		/* subsets are unsigned longs, use ffsl */
+		unsigned long w = set->ulongs[i];
+
+		/* if the prev cpu is in the same word as the possible next one,
+		   we need to mask out previous cpus */
+		if (prev_cpu >= 0 && HWLOC_SUBBITMAP_INDEX((unsigned) prev_cpu) == i)
+			w &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(prev_cpu));
+
+		if (w)
+			return hwloc_ffsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+	}
+
+	/* nothing found in the allocated part; the infinite tail starts here */
+	if (set->infinite)
+		return set->ulongs_count * HWLOC_BITS_PER_LONG;
+
+	return -1;
+}
+
+/* Return the index of the first unset bit strictly after prev_cpu,
+ * or -1 if there is none. Pass prev_cpu == -1 to start from bit 0. */
+int hwloc_bitmap_next_unset(const struct hwloc_bitmap_s * set, int prev_cpu)
+{
+	unsigned i = HWLOC_SUBBITMAP_INDEX(prev_cpu + 1);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* already past the allocated part: everything there follows the
+	 * (negated) infinite flag */
+	if (i >= set->ulongs_count) {
+		if (!set->infinite)
+			return prev_cpu + 1;
+		else
+			return -1;
+	}
+
+	for(; i<set->ulongs_count; i++) {
+		/* subsets are unsigned longs, use ffsl on the complement */
+		unsigned long w = ~set->ulongs[i];
+
+		/* if the prev cpu is in the same word as the possible next one,
+		   we need to mask out previous cpus */
+		if (prev_cpu >= 0 && HWLOC_SUBBITMAP_INDEX((unsigned) prev_cpu) == i)
+			w &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(prev_cpu));
+
+		if (w)
+			return hwloc_ffsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+	}
+
+	/* nothing found in the allocated part; the unset tail starts here */
+	if (!set->infinite)
+		return set->ulongs_count * HWLOC_BITS_PER_LONG;
+
+	return -1;
+}
+
+/* Keep only the lowest set bit and clear all the others.
+ * If only the infinite part is set, the first bit beyond the allocated
+ * part is kept instead. Returns 0 on success, -1 on allocation failure
+ * (only possible in that infinite-only case). */
+int hwloc_bitmap_singlify(struct hwloc_bitmap_s * set)
+{
+	unsigned i;
+	int found = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	for(i=0; i<set->ulongs_count; i++) {
+		if (found) {
+			/* lowest bit already kept: clear everything above it */
+			set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+			continue;
+		} else {
+			/* subsets are unsigned longs, use ffsl */
+			unsigned long w = set->ulongs[i];
+			if (w) {
+				int _ffs = hwloc_ffsl(w);
+				/* keep only the lowest bit of this word */
+				set->ulongs[i] = HWLOC_SUBBITMAP_CPU(_ffs-1);
+				found = 1;
+			}
+		}
+	}
+
+	if (set->infinite) {
+		if (found) {
+			set->infinite = 0;
+		} else {
+			/* set the first non allocated bit */
+			unsigned first = set->ulongs_count * HWLOC_BITS_PER_LONG;
+			set->infinite = 0; /* do not let realloc fill the newly allocated sets */
+			return hwloc_bitmap_set(set, first);
+		}
+	}
+
+	return 0;
+}
+
+/* Order two bitmaps by the index of their lowest set bit.
+ * Returns <0 if set1's first bit is lower, >0 if set2's is lower, 0 if
+ * equal; an empty bitmap compares higher than a non-empty one. */
+int hwloc_bitmap_compare_first(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	/* min(count1,count2) computed without an extra branch */
+	unsigned min_count = count1 + count2 - max_count;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	for(i=0; i<min_count; i++) {
+		unsigned long w1 = set1->ulongs[i];
+		unsigned long w2 = set2->ulongs[i];
+		if (w1 || w2) {
+			/* ffsl returns 0 for an all-zero word, 1-based index otherwise */
+			int _ffs1 = hwloc_ffsl(w1);
+			int _ffs2 = hwloc_ffsl(w2);
+			/* if both have a bit set, compare for real */
+			if (_ffs1 && _ffs2)
+				return _ffs1-_ffs2;
+			/* one is empty, and it is considered higher, so reverse-compare them */
+			return _ffs2-_ffs1;
+		}
+	}
+
+	if (count1 != count2) {
+		/* only one set still has real ulongs; compare them against the
+		 * other set's infinite flag */
+		if (min_count < count2) {
+			for(i=min_count; i<count2; i++) {
+				unsigned long w2 = set2->ulongs[i];
+				if (set1->infinite)
+					return -!(w2 & 1);
+				else if (w2)
+					return 1;
+			}
+		} else {
+			for(i=min_count; i<count1; i++) {
+				unsigned long w1 = set1->ulongs[i];
+				if (set2->infinite)
+					return !(w1 & 1);
+				else if (w1)
+					return -1;
+			}
+		}
+	}
+
+	return !!set1->infinite - !!set2->infinite;
+}
+
+/* Total ordering of two bitmaps: compare the infinite flags first, then
+ * the ulongs from most-significant to least-significant.
+ * Returns -1, 0 or 1. */
+int hwloc_bitmap_compare(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	/* min(count1,count2) computed without an extra branch */
+	unsigned min_count = count1 + count2 - max_count;
+	int i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	/* differing infinite flags decide immediately */
+	if ((!set1->infinite) != (!set2->infinite))
+		return !!set1->infinite - !!set2->infinite;
+
+	if (count1 != count2) {
+		/* compare the longer set's extra ulongs against the shorter
+		 * set's virtual word (full or zero depending on its infinite flag) */
+		if (min_count < count2) {
+			unsigned long val1 = set1->infinite ? HWLOC_SUBBITMAP_FULL :  HWLOC_SUBBITMAP_ZERO;
+			for(i=(int)max_count-1; i>=(int) min_count; i--) {
+				unsigned long val2 = set2->ulongs[i];
+				if (val1 == val2)
+					continue;
+				return val1 < val2 ? -1 : 1;
+			}
+		} else {
+			unsigned long val2 = set2->infinite ? HWLOC_SUBBITMAP_FULL :  HWLOC_SUBBITMAP_ZERO;
+			for(i=(int)max_count-1; i>=(int) min_count; i--) {
+				unsigned long val1 = set1->ulongs[i];
+				if (val1 == val2)
+					continue;
+				return val1 < val2 ? -1 : 1;
+			}
+		}
+	}
+
+	/* both sides allocated: plain high-to-low comparison */
+	for(i=(int)min_count-1; i>=0; i--) {
+		unsigned long val1 = set1->ulongs[i];
+		unsigned long val2 = set2->ulongs[i];
+		if (val1 == val2)
+			continue;
+		return val1 < val2 ? -1 : 1;
+	}
+
+	return 0;
+}
+
+/* Return the number of set bits, or -1 if the infinite part is set
+ * (the weight would be unbounded). */
+int hwloc_bitmap_weight(const struct hwloc_bitmap_s * set)
+{
+	unsigned i;
+	int total = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (set->infinite)
+		return -1;
+
+	for(i=0; i<set->ulongs_count; i++)
+		total += hwloc_weight_long(set->ulongs[i]);
+	return total;
+}
+
+/* Classify the relation between two bitmaps in one pass.
+ * Returns one of HWLOC_BITMAP_EQUAL / HWLOC_BITMAP_INCLUDED (set1 in set2)
+ * / HWLOC_BITMAP_CONTAINS (set2 in set1) / HWLOC_BITMAP_INTERSECTS /
+ * HWLOC_BITMAP_DIFFERENT. Two empty bitmaps compare EQUAL.
+ * Implemented as a small state machine: `result` holds the relation seen
+ * so far and is refined (or collapses to INTERSECTS) word by word. */
+int hwloc_bitmap_compare_inclusion(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+	unsigned max_count = set1->ulongs_count > set2->ulongs_count ? set1->ulongs_count : set2->ulongs_count;
+	int result = HWLOC_BITMAP_EQUAL; /* means empty sets return equal */
+	int empty1 = 1;
+	int empty2 = 1;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	for(i=0; i<max_count; i++) {
+	  unsigned long val1 = HWLOC_SUBBITMAP_READULONG(set1, (unsigned) i);
+	  unsigned long val2 = HWLOC_SUBBITMAP_READULONG(set2, (unsigned) i);
+
+	  if (!val1) {
+	    if (!val2)
+	      /* both empty, no change */
+	      continue;
+
+	    /* val1 empty, val2 not */
+	    if (result == HWLOC_BITMAP_CONTAINS) {
+	      if (!empty2)
+		return HWLOC_BITMAP_INTERSECTS;
+	      result = HWLOC_BITMAP_DIFFERENT;
+	    } else if (result == HWLOC_BITMAP_EQUAL) {
+	      result = HWLOC_BITMAP_INCLUDED;
+	    }
+	    /* no change otherwise */
+
+	  } else if (!val2) {
+	    /* val2 empty, val1 not */
+	    if (result == HWLOC_BITMAP_INCLUDED) {
+	      if (!empty1)
+		return HWLOC_BITMAP_INTERSECTS;
+	      result = HWLOC_BITMAP_DIFFERENT;
+	    } else if (result == HWLOC_BITMAP_EQUAL) {
+	      result = HWLOC_BITMAP_CONTAINS;
+	    }
+	    /* no change otherwise */
+
+	  } else if (val1 == val2) {
+	    /* equal and not empty */
+	    if (result == HWLOC_BITMAP_DIFFERENT)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* equal/contains/included unchanged */
+
+	  } else if ((val1 & val2) == val1) {
+	    /* included and not empty */
+	    if (result == HWLOC_BITMAP_CONTAINS || result == HWLOC_BITMAP_DIFFERENT)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* equal/included unchanged */
+	    result = HWLOC_BITMAP_INCLUDED;
+
+	  } else if ((val1 & val2) == val2) {
+	    /* contains and not empty */
+	    if (result == HWLOC_BITMAP_INCLUDED || result == HWLOC_BITMAP_DIFFERENT)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* equal/contains unchanged */
+	    result = HWLOC_BITMAP_CONTAINS;
+
+	  } else if ((val1 & val2) != 0) {
+	    /* intersects and not empty */
+	    return HWLOC_BITMAP_INTERSECTS;
+
+	  } else {
+	    /* different and not empty */
+
+	    /* equal/included/contains with non-empty sets means intersects */
+	    if (result == HWLOC_BITMAP_EQUAL && !empty1 /* implies !empty2 */)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    if (result == HWLOC_BITMAP_INCLUDED && !empty1)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    if (result == HWLOC_BITMAP_CONTAINS && !empty2)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* otherwise means different */
+	    result = HWLOC_BITMAP_DIFFERENT;
+	  }
+
+	  empty1 &= !val1;
+	  empty2 &= !val2;
+	}
+
+	/* finally fold the infinite tails in, using the same transitions */
+	if (!set1->infinite) {
+	  if (set2->infinite) {
+	    /* set2 infinite only */
+	    if (result == HWLOC_BITMAP_CONTAINS) {
+	      if (!empty2)
+		return HWLOC_BITMAP_INTERSECTS;
+	      result = HWLOC_BITMAP_DIFFERENT;
+	    } else if (result == HWLOC_BITMAP_EQUAL) {
+	      result = HWLOC_BITMAP_INCLUDED;
+	    }
+	    /* no change otherwise */
+	  }
+	} else if (!set2->infinite) {
+	  /* set1 infinite only */
+	  if (result == HWLOC_BITMAP_INCLUDED) {
+	    if (!empty1)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    result = HWLOC_BITMAP_DIFFERENT;
+	  } else if (result == HWLOC_BITMAP_EQUAL) {
+	    result = HWLOC_BITMAP_CONTAINS;
+	  }
+	  /* no change otherwise */
+	} else {
+	  /* both infinite */
+	  if (result == HWLOC_BITMAP_DIFFERENT)
+	    return HWLOC_BITMAP_INTERSECTS;
+	  /* equal/contains/included unchanged */
+	}
+
+	return result;
+}
diff --git a/src/3rdparty/hwloc/src/components.c b/src/3rdparty/hwloc/src/components.c
new file mode 100644
index 000000000..bd7c00e36
--- /dev/null
+++ b/src/3rdparty/hwloc/src/components.c
@@ -0,0 +1,785 @@
+/*
+ * Copyright © 2009-2017 Inria.  All rights reserved.
+ * Copyright © 2012 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/xml.h>
+#include <private/misc.h>
+
+#define HWLOC_COMPONENT_STOP_NAME "stop"
+#define HWLOC_COMPONENT_EXCLUDE_CHAR '-'
+#define HWLOC_COMPONENT_SEPS ","
+
+/* list of all registered discovery components, sorted by priority, higher priority first.
+ * noos is last because its priority is 0.
+ * others' priority is 10.
+ */
+static struct hwloc_disc_component * hwloc_disc_components = NULL;
+
+static unsigned hwloc_components_users = 0; /* first one initializes, last one destroys */
+
+static int hwloc_components_verbose = 0;
+#ifdef HWLOC_HAVE_PLUGINS
+static int hwloc_plugins_verbose = 0;
+static const char * hwloc_plugins_blacklist = NULL;
+#endif
+
+/* hwloc_components_mutex serializes:
+ * - loading/unloading plugins, and modifications of the hwloc_plugins list
+ * - calls to ltdl, including in hwloc_check_plugin_namespace()
+ * - registration of components with hwloc_disc_component_register()
+ *   and hwloc_xml_callbacks_register()
+ */
+#ifdef HWLOC_WIN_SYS
+/* Basic mutex on top of InterlockedCompareExchange() on windows,
+ * Far from perfect, but easy to maintain, and way enough given that this code will never be needed for real. */
+#include <windows.h>
+static LONG hwloc_components_mutex = 0;
+#define HWLOC_COMPONENTS_LOCK() do {						\
+  while (InterlockedCompareExchange(&hwloc_components_mutex, 1, 0) != 0)	\
+    SwitchToThread();								\
+} while (0)
+#define HWLOC_COMPONENTS_UNLOCK() do {						\
+  assert(hwloc_components_mutex == 1);						\
+  hwloc_components_mutex = 0;							\
+} while (0)
+
+#elif defined HWLOC_HAVE_PTHREAD_MUTEX
+/* pthread mutex if available (except on windows) */
+#include <pthread.h>
+static pthread_mutex_t hwloc_components_mutex = PTHREAD_MUTEX_INITIALIZER;
+#define HWLOC_COMPONENTS_LOCK() pthread_mutex_lock(&hwloc_components_mutex)
+#define HWLOC_COMPONENTS_UNLOCK() pthread_mutex_unlock(&hwloc_components_mutex)
+
+#else /* HWLOC_WIN_SYS || HWLOC_HAVE_PTHREAD_MUTEX */
+#error No mutex implementation available
+#endif
+
+
+#ifdef HWLOC_HAVE_PLUGINS
+
+#include <ltdl.h>
+
+/* array of pointers to dynamically loaded plugins */
+static struct hwloc__plugin_desc {
+  char *name;
+  struct hwloc_component *component;
+  char *filename;
+  lt_dlhandle handle;
+  struct hwloc__plugin_desc *next;
+} *hwloc_plugins = NULL;
+
+/* lt_dlforeachfile() callback: try to load one candidate plugin file.
+ * Looks up the "<basename>_component" symbol, checks its ABI and type/name
+ * consistency, then queues a plugin descriptor on the hwloc_plugins list.
+ * Always returns 0 so that enumeration continues even if this file is rejected. */
+static int
+hwloc__dlforeach_cb(const char *filename, void *_data __hwloc_attribute_unused)
+{
+  const char *basename;
+  lt_dlhandle handle;
+  struct hwloc_component *component;
+  struct hwloc__plugin_desc *desc, **prevdesc;
+
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin dlforeach found `%s'\n", filename);
+
+  /* strip any directory part to get the plugin basename */
+  basename = strrchr(filename, '/');
+  if (!basename)
+    basename = filename;
+  else
+    basename++;
+
+  /* skip plugins listed in HWLOC_PLUGINS_BLACKLIST (substring match) */
+  if (hwloc_plugins_blacklist && strstr(hwloc_plugins_blacklist, basename)) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Plugin `%s' is blacklisted in the environment\n", basename);
+    goto out;
+  }
+
+  /* dlopen and get the component structure */
+  handle = lt_dlopenext(filename);
+  if (!handle) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Failed to load plugin: %s\n", lt_dlerror());
+    goto out;
+  }
+
+/* inner block so the C99 VLA below has minimal scope;
+ * 10+1 = strlen("_component") plus the terminating NUL */
+{
+  char componentsymbolname[strlen(basename)+10+1];
+  sprintf(componentsymbolname, "%s_component", basename);
+  component = lt_dlsym(handle, componentsymbolname);
+  if (!component) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Failed to find component symbol `%s'\n",
+	      componentsymbolname);
+    goto out_with_handle;
+  }
+  if (component->abi != HWLOC_COMPONENT_ABI) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Plugin symbol ABI %u instead of %d\n",
+	      component->abi, HWLOC_COMPONENT_ABI);
+    goto out_with_handle;
+  }
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin contains expected symbol `%s'\n",
+	    componentsymbolname);
+}
+
+  /* the plugin filename prefix must match the declared component type */
+  if (HWLOC_COMPONENT_TYPE_DISC == component->type) {
+    if (strncmp(basename, "hwloc_", 6)) {
+      if (hwloc_plugins_verbose)
+	fprintf(stderr, "Plugin name `%s' doesn't match its type DISCOVERY\n", basename);
+      goto out_with_handle;
+    }
+  } else if (HWLOC_COMPONENT_TYPE_XML == component->type) {
+    if (strncmp(basename, "hwloc_xml_", 10)) {
+      if (hwloc_plugins_verbose)
+	fprintf(stderr, "Plugin name `%s' doesn't match its type XML\n", basename);
+      goto out_with_handle;
+    }
+  } else {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Plugin name `%s' has invalid type %u\n",
+	      basename, (unsigned) component->type);
+    goto out_with_handle;
+  }
+
+  /* allocate a plugin_desc and queue it */
+  desc = malloc(sizeof(*desc));
+  if (!desc)
+    goto out_with_handle;
+  /* NOTE(review): strdup() results are not checked; name/filename may be NULL on OOM */
+  desc->name = strdup(basename);
+  desc->filename = strdup(filename);
+  desc->component = component;
+  desc->handle = handle;
+  desc->next = NULL;
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin descriptor `%s' ready\n", basename);
+
+  /* append to the list */
+  prevdesc = &hwloc_plugins;
+  while (*prevdesc)
+    prevdesc = &((*prevdesc)->next);
+  *prevdesc = desc;
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin descriptor `%s' queued\n", basename);
+  return 0;
+
+ out_with_handle:
+  lt_dlclose(handle);
+ out:
+  return 0;
+}
+
+/* Unload every queued plugin, free its descriptor, and shut down ltdl.
+ * Leaves hwloc_plugins empty. Caller must hold the components lock. */
+static void
+hwloc_plugins_exit(void)
+{
+  struct hwloc__plugin_desc *desc, *next;
+
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Closing all plugins\n");
+
+  desc = hwloc_plugins;
+  while (desc) {
+    next = desc->next;
+    lt_dlclose(desc->handle);
+    free(desc->name);
+    free(desc->filename);
+    free(desc);
+    desc = next;
+  }
+  hwloc_plugins = NULL;
+
+  lt_dlexit();
+}
+
+/* Initialize ltdl and enumerate plugin files from HWLOC_PLUGINS_PATH
+ * (environment override of the configure-time HWLOC_PLUGINS_PATH default).
+ * Also reads HWLOC_PLUGINS_VERBOSE and HWLOC_PLUGINS_BLACKLIST.
+ * Returns 0 on success, -1 on failure (with ltdl torn down again).
+ * Caller must hold the components lock. */
+static int
+hwloc_plugins_init(void)
+{
+  const char *verboseenv;
+  const char *path = HWLOC_PLUGINS_PATH;
+  const char *env;
+  int err;
+
+  verboseenv = getenv("HWLOC_PLUGINS_VERBOSE");
+  hwloc_plugins_verbose = verboseenv ? atoi(verboseenv) : 0;
+
+  hwloc_plugins_blacklist = getenv("HWLOC_PLUGINS_BLACKLIST");
+
+  err = lt_dlinit();
+  if (err)
+    goto out;
+
+  env = getenv("HWLOC_PLUGINS_PATH");
+  if (env)
+    path = env;
+
+  hwloc_plugins = NULL;
+
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Starting plugin dlforeach in %s\n", path);
+  err = lt_dlforeachfile(path, hwloc__dlforeach_cb, NULL);
+  if (err)
+    goto out_with_init;
+
+  return 0;
+
+ out_with_init:
+  hwloc_plugins_exit();
+ out:
+  return -1;
+}
+
+#endif /* HWLOC_HAVE_PLUGINS */
+
+/* Return a human-readable name for a discovery component type,
+ * used only in verbose/error messages. */
+static const char *
+hwloc_disc_component_type_string(hwloc_disc_component_type_t type)
+{
+  switch (type) {
+  case HWLOC_DISC_COMPONENT_TYPE_CPU: return "cpu";
+  case HWLOC_DISC_COMPONENT_TYPE_GLOBAL: return "global";
+  case HWLOC_DISC_COMPONENT_TYPE_MISC: return "misc";
+  default: return "**unknown**";
+  }
+}
+
+/* Register a discovery component into the global hwloc_disc_components list.
+ * Rejects components with the reserved "stop" name, with reserved characters
+ * in their name, or with an unknown type. If another component with the same
+ * name is already registered, only the higher-priority one is kept.
+ * The surviving component is inserted in decreasing priority order.
+ * `filename' is the plugin path, or NULL for statically-built components.
+ * Returns 0 on success, -1 if the component was rejected or superseded.
+ * Caller must hold the components lock. */
+static int
+hwloc_disc_component_register(struct hwloc_disc_component *component,
+			      const char *filename)
+{
+  struct hwloc_disc_component **prev;
+
+  /* check that the component name is valid */
+  if (!strcmp(component->name, HWLOC_COMPONENT_STOP_NAME)) {
+    if (hwloc_components_verbose)
+      fprintf(stderr, "Cannot register discovery component with reserved name `" HWLOC_COMPONENT_STOP_NAME "'\n");
+    return -1;
+  }
+  if (strchr(component->name, HWLOC_COMPONENT_EXCLUDE_CHAR)
+      || strcspn(component->name, HWLOC_COMPONENT_SEPS) != strlen(component->name)) {
+    if (hwloc_components_verbose)
+      fprintf(stderr, "Cannot register discovery component with name `%s' containing reserved characters `%c" HWLOC_COMPONENT_SEPS "'\n",
+	      component->name, HWLOC_COMPONENT_EXCLUDE_CHAR);
+    return -1;
+  }
+  /* check that the component type is valid */
+  switch ((unsigned) component->type) {
+  case HWLOC_DISC_COMPONENT_TYPE_CPU:
+  case HWLOC_DISC_COMPONENT_TYPE_GLOBAL:
+  case HWLOC_DISC_COMPONENT_TYPE_MISC:
+    break;
+  default:
+    fprintf(stderr, "Cannot register discovery component `%s' with unknown type %u\n",
+	    component->name, (unsigned) component->type);
+    return -1;
+  }
+
+  /* resolve name duplicates: keep whichever has the higher priority */
+  prev = &hwloc_disc_components;
+  while (NULL != *prev) {
+    if (!strcmp((*prev)->name, component->name)) {
+      /* if two components have the same name, only keep the highest priority one */
+      if ((*prev)->priority < component->priority) {
+	/* drop the existing component */
+	if (hwloc_components_verbose)
+	  fprintf(stderr, "Dropping previously registered discovery component `%s', priority %u lower than new one %u\n",
+		  (*prev)->name, (*prev)->priority, component->priority);
+	*prev = (*prev)->next;
+      } else {
+	/* drop the new one */
+	if (hwloc_components_verbose)
+	  fprintf(stderr, "Ignoring new discovery component `%s', priority %u lower than previously registered one %u\n",
+		  component->name, component->priority, (*prev)->priority);
+	return -1;
+      }
+    }
+    prev = &((*prev)->next);
+  }
+  if (hwloc_components_verbose)
+    /* fixed message typo: "statically build" -> "statically built" */
+    fprintf(stderr, "Registered %s discovery component `%s' with priority %u (%s%s)\n",
+	    hwloc_disc_component_type_string(component->type), component->name, component->priority,
+	    filename ? "from plugin " : "statically built", filename ? filename : "");
+
+  /* insert in decreasing priority order */
+  prev = &hwloc_disc_components;
+  while (NULL != *prev) {
+    if ((*prev)->priority < component->priority)
+      break;
+    prev = &((*prev)->next);
+  }
+  component->next = *prev;
+  *prev = component;
+  return 0;
+}
+
+#include <static-components.h>
+
+static void (**hwloc_component_finalize_cbs)(unsigned long);
+static unsigned hwloc_component_finalize_cb_count;
+
+/* Global components initialization, refcounted: only the first caller does
+ * real work (plugins init, static + plugin component registration and their
+ * ->init() callbacks); later callers just bump hwloc_components_users.
+ * Also collects all ->finalize() callbacks for hwloc_components_fini(). */
+void
+hwloc_components_init(void)
+{
+#ifdef HWLOC_HAVE_PLUGINS
+  struct hwloc__plugin_desc *desc;
+#endif
+  const char *verboseenv;
+  unsigned i;
+
+  HWLOC_COMPONENTS_LOCK();
+  assert((unsigned) -1 != hwloc_components_users);
+  if (0 != hwloc_components_users++) {
+    /* not the first user, everything was initialized earlier */
+    HWLOC_COMPONENTS_UNLOCK();
+    return;
+  }
+
+  verboseenv = getenv("HWLOC_COMPONENTS_VERBOSE");
+  hwloc_components_verbose = verboseenv ? atoi(verboseenv) : 0;
+
+#ifdef HWLOC_HAVE_PLUGINS
+  hwloc_plugins_init();
+#endif
+
+  hwloc_component_finalize_cbs = NULL;
+  hwloc_component_finalize_cb_count = 0;
+  /* count the max number of finalize callbacks */
+  for(i=0; NULL != hwloc_static_components[i]; i++)
+    hwloc_component_finalize_cb_count++;
+#ifdef HWLOC_HAVE_PLUGINS
+  for(desc = hwloc_plugins; NULL != desc; desc = desc->next)
+    hwloc_component_finalize_cb_count++;
+#endif
+  if (hwloc_component_finalize_cb_count) {
+    hwloc_component_finalize_cbs = calloc(hwloc_component_finalize_cb_count,
+					  sizeof(*hwloc_component_finalize_cbs));
+    assert(hwloc_component_finalize_cbs);
+    /* forget that max number and recompute the real one below */
+    hwloc_component_finalize_cb_count = 0;
+  }
+
+  /* hwloc_static_components is created by configure in static-components.h */
+  for(i=0; NULL != hwloc_static_components[i]; i++) {
+    if (hwloc_static_components[i]->flags) {
+      fprintf(stderr, "Ignoring static component with invalid flags %lx\n",
+	      hwloc_static_components[i]->flags);
+      continue;
+    }
+
+    /* initialize the component */
+    if (hwloc_static_components[i]->init && hwloc_static_components[i]->init(0) < 0) {
+      if (hwloc_components_verbose)
+	fprintf(stderr, "Ignoring static component, failed to initialize\n");
+      continue;
+    }
+    /* queue ->finalize() callback if any */
+    if (hwloc_static_components[i]->finalize)
+      hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count++] = hwloc_static_components[i]->finalize;
+
+    /* register for real now */
+    if (HWLOC_COMPONENT_TYPE_DISC == hwloc_static_components[i]->type)
+      hwloc_disc_component_register(hwloc_static_components[i]->data, NULL);
+    else if (HWLOC_COMPONENT_TYPE_XML == hwloc_static_components[i]->type)
+      hwloc_xml_callbacks_register(hwloc_static_components[i]->data);
+    else
+      assert(0);
+  }
+
+  /* dynamic plugins */
+#ifdef HWLOC_HAVE_PLUGINS
+  for(desc = hwloc_plugins; NULL != desc; desc = desc->next) {
+    if (desc->component->flags) {
+      fprintf(stderr, "Ignoring plugin `%s' component with invalid flags %lx\n",
+	      desc->name, desc->component->flags);
+      continue;
+    }
+
+    /* initialize the component */
+    if (desc->component->init && desc->component->init(0) < 0) {
+      if (hwloc_components_verbose)
+	fprintf(stderr, "Ignoring plugin `%s', failed to initialize\n", desc->name);
+      continue;
+    }
+    /* queue ->finalize() callback if any */
+    if (desc->component->finalize)
+      hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count++] = desc->component->finalize;
+
+    /* register for real now */
+    if (HWLOC_COMPONENT_TYPE_DISC == desc->component->type)
+      hwloc_disc_component_register(desc->component->data, desc->filename);
+    else if (HWLOC_COMPONENT_TYPE_XML == desc->component->type)
+      hwloc_xml_callbacks_register(desc->component->data);
+    else
+      assert(0);
+  }
+#endif
+
+  HWLOC_COMPONENTS_UNLOCK();
+}
+
+/* Reset a topology's backend list to empty (no backends, no exclusions). */
+void
+hwloc_backends_init(struct hwloc_topology *topology)
+{
+  topology->backends = NULL;
+  topology->backend_excludes = 0;
+}
+
+/* Look up a registered discovery component by type and/or name.
+ * Either filter may be a wildcard (-1 type, NULL name).
+ * Returns the first match in priority order, or NULL if none. */
+static struct hwloc_disc_component *
+hwloc_disc_component_find(int type /* hwloc_disc_component_type_t or -1 if any */,
+			       const char *name /* name or NULL if any */)
+{
+  struct hwloc_disc_component *comp = hwloc_disc_components;
+  while (NULL != comp) {
+    if ((-1 == type || type == (int) comp->type)
+       && (NULL == name || !strcmp(name, comp->name)))
+      return comp;
+    comp = comp->next;
+  }
+  return NULL;
+}
+
+/* used by set_xml(), set_synthetic(), ... environment variables, ... to force the first backend */
+/* Instantiate the named component and make it the only enabled backend,
+ * disabling any previously enabled ones. data1..data3 are passed through
+ * to the component's instantiate() hook.
+ * Returns 0 on success; -1 with errno EBUSY if the topology is already
+ * loaded, ENOSYS if no such component exists, or on instantiation failure. */
+int
+hwloc_disc_component_force_enable(struct hwloc_topology *topology,
+				  int envvar_forced,
+				  int type, const char *name,
+				  const void *data1, const void *data2, const void *data3)
+{
+  struct hwloc_disc_component *comp;
+  struct hwloc_backend *backend;
+
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  comp = hwloc_disc_component_find(type, name);
+  if (!comp) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  backend = comp->instantiate(comp, data1, data2, data3);
+  if (backend) {
+    backend->envvar_forced = envvar_forced;
+    if (topology->backends)
+      hwloc_backends_disable_all(topology);
+    return hwloc_backend_enable(topology, backend);
+  } else
+    return -1;
+}
+
+/* Instantiate and enable one discovery component on a topology, unless an
+ * already-enabled backend excludes its type. Returns 0 on success, -1 if
+ * excluded or if instantiation/enabling failed. */
+static int
+hwloc_disc_component_try_enable(struct hwloc_topology *topology,
+				struct hwloc_disc_component *comp,
+				const char *comparg,
+				int envvar_forced)
+{
+  struct hwloc_backend *backend;
+
+  if (topology->backend_excludes & comp->type) {
+    if (hwloc_components_verbose)
+      /* do not warn if envvar_forced since system-wide HWLOC_COMPONENTS must be silently ignored after set_xml() etc.
+       */
+      fprintf(stderr, "Excluding %s discovery component `%s', conflicts with excludes 0x%x\n",
+	      hwloc_disc_component_type_string(comp->type), comp->name, topology->backend_excludes);
+    return -1;
+  }
+
+  backend = comp->instantiate(comp, comparg, NULL, NULL);
+  if (!backend) {
+    if (hwloc_components_verbose || envvar_forced)
+      fprintf(stderr, "Failed to instantiate discovery component `%s'\n", comp->name);
+    return -1;
+  }
+
+  backend->envvar_forced = envvar_forced;
+  return hwloc_backend_enable(topology, backend);
+}
+
+/* Enable discovery components according to the HWLOC_COMPONENTS environment
+ * variable, then (unless the special "stop" name appeared) every remaining
+ * default-enabled component that was not explicitly excluded with a leading
+ * '-'. The env value is parsed twice: first pass enables listed components,
+ * second pass checks exclusions while enabling the rest. */
+void
+hwloc_disc_components_enable_others(struct hwloc_topology *topology)
+{
+  struct hwloc_disc_component *comp;
+  struct hwloc_backend *backend;
+  int tryall = 1;
+  const char *_env;
+  char *env; /* we may modify the env value, so duplicate it */
+
+  _env = getenv("HWLOC_COMPONENTS");
+  env = _env ? strdup(_env) : NULL;
+
+  /* enable explicitly listed components */
+  if (env) {
+    char *curenv = env;
+    size_t s;
+
+    while (*curenv) {
+      /* s = length of the current comma-separated token */
+      s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
+      if (s) {
+	char c;
+
+	/* replace linuxpci with linuxio for backward compatibility with pre-v2.0 */
+	if (!strncmp(curenv, "linuxpci", 8) && s == 8) {
+	  curenv[5] = 'i';
+	  curenv[6] = 'o';
+	  curenv[7] = *HWLOC_COMPONENT_SEPS;
+	} else if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR && !strncmp(curenv+1, "linuxpci", 8) && s == 9) {
+	  curenv[6] = 'i';
+	  curenv[7] = 'o';
+	  curenv[8] = *HWLOC_COMPONENT_SEPS;
+	  /* skip this name, it's a negated one */
+	  goto nextname;
+	}
+
+	if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR)
+	  goto nextname;
+
+	/* "stop" means: do not fall back to the remaining default components */
+	if (!strncmp(curenv, HWLOC_COMPONENT_STOP_NAME, s)) {
+	  tryall = 0;
+	  break;
+	}
+
+	/* save the last char and replace with \0 */
+	c = curenv[s];
+	curenv[s] = '\0';
+
+	comp = hwloc_disc_component_find(-1, curenv);
+	if (comp) {
+	  hwloc_disc_component_try_enable(topology, comp, NULL, 1 /* envvar forced */);
+	} else {
+	  fprintf(stderr, "Cannot find discovery component `%s'\n", curenv);
+	}
+
+	/* restore chars (the second loop below needs env to be unmodified) */
+	curenv[s] = c;
+      }
+
+nextname:
+      curenv += s;
+      if (*curenv)
+	/* Skip comma */
+	curenv++;
+    }
+  }
+
+  /* env is unmodified here, except for the intentional in-place
+   * linuxpci->linuxio renaming above (so the exclusion loop below
+   * also sees the new name) */
+
+  /* now enable remaining components (except the explicitly '-'-listed ones) */
+  if (tryall) {
+    comp = hwloc_disc_components;
+    while (NULL != comp) {
+      if (!comp->enabled_by_default)
+	goto nextcomp;
+      /* check if this component was explicitly excluded in env */
+      if (env) {
+	char *curenv = env;
+	while (*curenv) {
+	  size_t s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
+	  if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR && !strncmp(curenv+1, comp->name, s-1) && strlen(comp->name) == s-1) {
+	    if (hwloc_components_verbose)
+	      fprintf(stderr, "Excluding %s discovery component `%s' because of HWLOC_COMPONENTS environment variable\n",
+	    hwloc_disc_component_type_string(comp->type), comp->name);
+	    goto nextcomp;
+	  }
+	  curenv += s;
+	  if (*curenv)
+	    /* Skip comma */
+	    curenv++;
+	}
+      }
+      hwloc_disc_component_try_enable(topology, comp, NULL, 0 /* defaults, not envvar forced */);
+nextcomp:
+      comp = comp->next;
+    }
+  }
+
+  if (hwloc_components_verbose) {
+    /* print a summary */
+    int first = 1;
+    backend = topology->backends;
+    fprintf(stderr, "Final list of enabled discovery components: ");
+    while (backend != NULL) {
+      fprintf(stderr, "%s%s", first ? "" : ",", backend->component->name);
+      backend = backend->next;
+      first = 0;
+    }
+    fprintf(stderr, "\n");
+  }
+
+  free(env);
+}
+
+/* Global components teardown, refcounted counterpart of hwloc_components_init():
+ * only the last user runs the queued ->finalize() callbacks (in reverse
+ * registration order), resets the component/XML lists, and unloads plugins. */
+void
+hwloc_components_fini(void)
+{
+  unsigned i;
+
+  HWLOC_COMPONENTS_LOCK();
+  assert(0 != hwloc_components_users);
+  if (0 != --hwloc_components_users) {
+    HWLOC_COMPONENTS_UNLOCK();
+    return;
+  }
+
+  /* call finalize callbacks in reverse order of registration */
+  for(i=0; i<hwloc_component_finalize_cb_count; i++)
+    hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count-i-1](0);
+  free(hwloc_component_finalize_cbs);
+  hwloc_component_finalize_cbs = NULL;
+  hwloc_component_finalize_cb_count = 0;
+
+  /* no need to unlink/free the list of components, they'll be unloaded below */
+
+  hwloc_disc_components = NULL;
+  hwloc_xml_callbacks_reset();
+
+#ifdef HWLOC_HAVE_PLUGINS
+  hwloc_plugins_exit();
+#endif
+
+  HWLOC_COMPONENTS_UNLOCK();
+}
+
+/* Allocate a backend structure bound to the given component, with all hooks
+ * cleared and is_thissystem left undecided (-1).
+ * Returns NULL with errno ENOMEM on allocation failure. */
+struct hwloc_backend *
+hwloc_backend_alloc(struct hwloc_disc_component *component)
+{
+  struct hwloc_backend * backend = malloc(sizeof(*backend));
+  if (!backend) {
+    errno = ENOMEM;
+    return NULL;
+  }
+  backend->component = component;
+  backend->flags = 0;
+  backend->discover = NULL;
+  backend->get_pci_busid_cpuset = NULL;
+  backend->disable = NULL;
+  /* -1 means "does not change the topology's is_thissystem default" */
+  backend->is_thissystem = -1;
+  backend->next = NULL;
+  backend->envvar_forced = 0;
+  return backend;
+}
+
+/* Run the backend's disable() hook if set, then free the structure. */
+static void
+hwloc_backend_disable(struct hwloc_backend *backend)
+{
+  if (backend->disable)
+    backend->disable(backend);
+  free(backend);
+}
+
+/* Append a backend to the topology's backend list and record its component's
+ * exclusion mask. Rejects backends with unknown flags, and duplicates of an
+ * already-enabled component (errno EBUSY, backend destroyed).
+ * Takes ownership of `backend' in all cases. Returns 0 on success, -1 on error. */
+int
+hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *backend)
+{
+  struct hwloc_backend **pprev;
+
+  /* check backend flags */
+  if (backend->flags) {
+    fprintf(stderr, "Cannot enable %s discovery component `%s' with unknown flags %lx\n",
+	    hwloc_disc_component_type_string(backend->component->type), backend->component->name, backend->flags);
+    return -1;
+  }
+
+  /* make sure we didn't already enable this backend, we don't want duplicates */
+  pprev = &topology->backends;
+  while (NULL != *pprev) {
+    if ((*pprev)->component == backend->component) {
+      if (hwloc_components_verbose)
+	fprintf(stderr, "Cannot enable %s discovery component `%s' twice\n",
+		hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+      hwloc_backend_disable(backend);
+      errno = EBUSY;
+      return -1;
+    }
+    pprev = &((*pprev)->next);
+  }
+
+  if (hwloc_components_verbose)
+    fprintf(stderr, "Enabling %s discovery component `%s'\n",
+	    hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+
+  /* enqueue at the end */
+  pprev = &topology->backends;
+  while (NULL != *pprev)
+    pprev = &((*pprev)->next);
+  backend->next = *pprev;
+  *pprev = backend;
+
+  backend->topology = topology;
+  topology->backend_excludes |= backend->component->excludes;
+  return 0;
+}
+
+/* Compute topology->is_thissystem from the enabled backends, the
+ * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM topology flag, and the HWLOC_THISSYSTEM
+ * environment variable, applied in that order of increasing precedence. */
+void
+hwloc_backends_is_thissystem(struct hwloc_topology *topology)
+{
+  struct hwloc_backend *backend;
+  const char *local_env;
+
+  /* Apply is_thissystem topology flag before we enforce envvar backends.
+   * If the application changed the backend with set_foo(),
+   * it may use set_flags() update the is_thissystem flag here.
+   * If it changes the backend with environment variables below,
+   * it may use HWLOC_THISSYSTEM envvar below as well.
+   */
+
+  topology->is_thissystem = 1;
+
+  /* apply thissystem from normally-given backends (envvar_forced=0, either set_foo() or defaults) */
+  backend = topology->backends;
+  while (backend != NULL) {
+    if (backend->envvar_forced == 0 && backend->is_thissystem != -1) {
+      /* backends may only force is_thissystem to 0, never to 1 */
+      assert(backend->is_thissystem == 0);
+      topology->is_thissystem = 0;
+    }
+    backend = backend->next;
+  }
+
+  /* override set_foo() with flags */
+  if (topology->flags & HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM)
+    topology->is_thissystem = 1;
+
+  /* now apply envvar-forced backend (envvar_forced=1) */
+  backend = topology->backends;
+  while (backend != NULL) {
+    if (backend->envvar_forced == 1 && backend->is_thissystem != -1) {
+      assert(backend->is_thissystem == 0);
+      topology->is_thissystem = 0;
+    }
+    backend = backend->next;
+  }
+
+  /* override with envvar-given flag */
+  local_env = getenv("HWLOC_THISSYSTEM");
+  if (local_env)
+    topology->is_thissystem = atoi(local_env);
+}
+
+/* Cache, on the topology, the first enabled backend that provides a
+ * get_pci_busid_cpuset callback (NULL if none does). */
+void
+hwloc_backends_find_callbacks(struct hwloc_topology *topology)
+{
+  struct hwloc_backend *backend = topology->backends;
+  /* use the first backend's get_pci_busid_cpuset callback */
+  topology->get_pci_busid_cpuset_backend = NULL;
+  while (backend != NULL) {
+    if (backend->get_pci_busid_cpuset) {
+      topology->get_pci_busid_cpuset_backend = backend;
+      return;
+    }
+    backend = backend->next;
+  }
+  return;
+}
+
+/* Disable and free every enabled backend of the topology,
+ * then reset the backend list and the exclusion mask. */
+void
+hwloc_backends_disable_all(struct hwloc_topology *topology)
+{
+  struct hwloc_backend *backend;
+
+  while (NULL != (backend = topology->backends)) {
+    struct hwloc_backend *next = backend->next;
+    if (hwloc_components_verbose)
+      fprintf(stderr, "Disabling %s discovery component `%s'\n",
+	      hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+    hwloc_backend_disable(backend);
+    topology->backends = next;
+  }
+  topology->backends = NULL;
+  topology->backend_excludes = 0;
+}
diff --git a/src/3rdparty/hwloc/src/diff.c b/src/3rdparty/hwloc/src/diff.c
new file mode 100644
index 000000000..00811a7b5
--- /dev/null
+++ b/src/3rdparty/hwloc/src/diff.c
@@ -0,0 +1,492 @@
+/*
+ * Copyright © 2013-2018 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <private/private.h>
+#include <private/misc.h>
+
+/* Free a whole diff list. Only string-attribute diffs own extra memory
+ * (name/oldvalue/newvalue); all other diff kinds are a single allocation.
+ * Always returns 0. */
+int hwloc_topology_diff_destroy(hwloc_topology_diff_t diff)
+{
+	hwloc_topology_diff_t next;
+	while (diff) {
+		next = diff->generic.next;
+		switch (diff->generic.type) {
+		default:
+			break;
+		case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR:
+			switch (diff->obj_attr.diff.generic.type) {
+			default:
+				break;
+			case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME:
+			case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO:
+				free(diff->obj_attr.diff.string.name);
+				free(diff->obj_attr.diff.string.oldvalue);
+				free(diff->obj_attr.diff.string.newvalue);
+				break;
+			}
+			break;
+		}
+		free(diff);
+		diff = next;
+	}
+	return 0;
+}
+
+/************************
+ * Computing diffs
+ */
+
+/* Append newdiff at the tail of the diff list tracked by
+ * (*firstdiffp, *lastdiffp), updating both head and tail pointers. */
+static void hwloc_append_diff(hwloc_topology_diff_t newdiff,
+			      hwloc_topology_diff_t *firstdiffp,
+			      hwloc_topology_diff_t *lastdiffp)
+{
+	if (*firstdiffp)
+		(*lastdiffp)->generic.next = newdiff;
+	else
+		*firstdiffp = newdiff;
+	*lastdiffp = newdiff;
+	newdiff->generic.next = NULL;
+}
+
+/* Append a TOO_COMPLEX diff entry recording where (depth + logical index)
+ * the comparison gave up. Returns 0 on success, -1 on allocation failure. */
+static int hwloc_append_diff_too_complex(hwloc_obj_t obj1,
+					 hwloc_topology_diff_t *firstdiffp,
+					 hwloc_topology_diff_t *lastdiffp)
+{
+	hwloc_topology_diff_t newdiff;
+	newdiff = malloc(sizeof(*newdiff));
+	if (!newdiff)
+		return -1;
+
+	newdiff->too_complex.type = HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX;
+	newdiff->too_complex.obj_depth = obj1->depth;
+	newdiff->too_complex.obj_index = obj1->logical_index;
+	hwloc_append_diff(newdiff, firstdiffp, lastdiffp);
+	return 0;
+}
+
+/* Append a string-attribute diff (object name or info value change) for obj.
+ * The name/oldvalue/newvalue strings are duplicated; NULL inputs stay NULL.
+ * NOTE(review): strdup() results are not checked, so a field may silently
+ * end up NULL on OOM. Returns 0 on success, -1 on allocation failure. */
+static int hwloc_append_diff_obj_attr_string(hwloc_obj_t obj,
+					     hwloc_topology_diff_obj_attr_type_t type,
+					     const char *name,
+					     const char *oldvalue,
+					     const char *newvalue,
+					     hwloc_topology_diff_t *firstdiffp,
+					     hwloc_topology_diff_t *lastdiffp)
+{
+	hwloc_topology_diff_t newdiff;
+	newdiff = malloc(sizeof(*newdiff));
+	if (!newdiff)
+		return -1;
+
+	newdiff->obj_attr.type = HWLOC_TOPOLOGY_DIFF_OBJ_ATTR;
+	newdiff->obj_attr.obj_depth = obj->depth;
+	newdiff->obj_attr.obj_index = obj->logical_index;
+	newdiff->obj_attr.diff.string.type = type;
+	newdiff->obj_attr.diff.string.name = name ? strdup(name) : NULL;
+	newdiff->obj_attr.diff.string.oldvalue = oldvalue ? strdup(oldvalue) : NULL;
+	newdiff->obj_attr.diff.string.newvalue = newvalue ? strdup(newvalue) : NULL;
+	hwloc_append_diff(newdiff, firstdiffp, lastdiffp);
+	return 0;
+}
+
+/* Append a 64-bit-integer attribute diff (e.g. NUMA node memory size change)
+ * for obj. Returns 0 on success, -1 on allocation failure. */
+static int hwloc_append_diff_obj_attr_uint64(hwloc_obj_t obj,
+					     hwloc_topology_diff_obj_attr_type_t type,
+					     hwloc_uint64_t idx,
+					     hwloc_uint64_t oldvalue,
+					     hwloc_uint64_t newvalue,
+					     hwloc_topology_diff_t *firstdiffp,
+					     hwloc_topology_diff_t *lastdiffp)
+{
+	hwloc_topology_diff_t newdiff;
+	newdiff = malloc(sizeof(*newdiff));
+	if (!newdiff)
+		return -1;
+
+	newdiff->obj_attr.type = HWLOC_TOPOLOGY_DIFF_OBJ_ATTR;
+	newdiff->obj_attr.obj_depth = obj->depth;
+	newdiff->obj_attr.obj_index = obj->logical_index;
+	newdiff->obj_attr.diff.uint64.type = type;
+	newdiff->obj_attr.diff.uint64.index = idx;
+	newdiff->obj_attr.diff.uint64.oldvalue = oldvalue;
+	newdiff->obj_attr.diff.uint64.newvalue = newvalue;
+	hwloc_append_diff(newdiff, firstdiffp, lastdiffp);
+	return 0;
+}
+
+/* Recursively compare two object subtrees and append diff entries.
+ * Structural differences (depth/type/subtype/os_index/sets/attrs/children
+ * count) are recorded as a single TOO_COMPLEX entry for this object; only
+ * name and info-value changes are recorded as expressible diffs.
+ * Returns 0 on success (even when too complex), <0 on allocation failure. */
+static int
+hwloc_diff_trees(hwloc_topology_t topo1, hwloc_obj_t obj1,
+		 hwloc_topology_t topo2, hwloc_obj_t obj2,
+		 unsigned flags,
+		 hwloc_topology_diff_t *firstdiffp, hwloc_topology_diff_t *lastdiffp)
+{
+	unsigned i;
+	int err;
+	hwloc_obj_t child1, child2;
+
+	if (obj1->depth != obj2->depth)
+		goto out_too_complex;
+
+	if (obj1->type != obj2->type)
+		goto out_too_complex;
+	if ((!obj1->subtype) != (!obj2->subtype)
+	    || (obj1->subtype && strcmp(obj1->subtype, obj2->subtype)))
+		goto out_too_complex;
+
+	if (obj1->os_index != obj2->os_index)
+		/* we could allow different os_index for non-PU non-NUMAnode objects
+		 * but it's likely useless anyway */
+		goto out_too_complex;
+
+/* true when exactly one set is NULL, or both exist and differ
+ * (also used by hwloc_topology_diff_build() below) */
+#define _SETS_DIFFERENT(_set1, _set2) \
+ (   ( !(_set1) != !(_set2) ) \
+  || ( (_set1) && !hwloc_bitmap_isequal(_set1, _set2) ) )
+#define SETS_DIFFERENT(_set, _obj1, _obj2) _SETS_DIFFERENT((_obj1)->_set, (_obj2)->_set)
+	if (SETS_DIFFERENT(cpuset, obj1, obj2)
+	    || SETS_DIFFERENT(complete_cpuset, obj1, obj2)
+	    || SETS_DIFFERENT(nodeset, obj1, obj2)
+	    || SETS_DIFFERENT(complete_nodeset, obj1, obj2))
+		goto out_too_complex;
+
+	/* no need to check logical_index, sibling_rank, symmetric_subtree,
+	 * the parents did it */
+
+	/* gp_index don't have to be strictly identical */
+
+	if ((!obj1->name) != (!obj2->name)
+	    || (obj1->name && strcmp(obj1->name, obj2->name))) {
+		err = hwloc_append_diff_obj_attr_string(obj1,
+						       HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME,
+						       NULL,
+						       obj1->name,
+						       obj2->name,
+						       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+
+	/* type-specific attrs */
+	switch (obj1->type) {
+	default:
+		break;
+	case HWLOC_OBJ_NUMANODE:
+		if (obj1->attr->numanode.local_memory != obj2->attr->numanode.local_memory) {
+			err = hwloc_append_diff_obj_attr_uint64(obj1,
+								HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
+								0,
+								obj1->attr->numanode.local_memory,
+								obj2->attr->numanode.local_memory,
+								firstdiffp, lastdiffp);
+			if (err < 0)
+				return err;
+		}
+		/* ignore memory page_types */
+		break;
+	case HWLOC_OBJ_L1CACHE:
+	case HWLOC_OBJ_L2CACHE:
+	case HWLOC_OBJ_L3CACHE:
+	case HWLOC_OBJ_L4CACHE:
+	case HWLOC_OBJ_L5CACHE:
+	case HWLOC_OBJ_L1ICACHE:
+	case HWLOC_OBJ_L2ICACHE:
+	case HWLOC_OBJ_L3ICACHE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->cache)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_GROUP:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->group)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_PCI_DEVICE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->pcidev)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_BRIDGE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->bridge)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_OS_DEVICE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->osdev)))
+			goto out_too_complex;
+		break;
+	}
+
+	/* infos: names must match pairwise in order; value changes are diffable */
+	if (obj1->infos_count != obj2->infos_count)
+		goto out_too_complex;
+	for(i=0; i<obj1->infos_count; i++) {
+		struct hwloc_info_s *info1 = &obj1->infos[i], *info2 = &obj2->infos[i];
+		if (strcmp(info1->name, info2->name))
+			goto out_too_complex;
+		if (strcmp(obj1->infos[i].value, obj2->infos[i].value)) {
+			err = hwloc_append_diff_obj_attr_string(obj1,
+								HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO,
+								info1->name,
+								info1->value,
+								info2->value,
+								firstdiffp, lastdiffp);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	/* ignore userdata */
+
+	/* children */
+	for(child1 = obj1->first_child, child2 = obj2->first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	if (child1 || child2)
+		goto out_too_complex;
+
+	/* memory children */
+	for(child1 = obj1->memory_first_child, child2 = obj2->memory_first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	if (child1 || child2)
+		goto out_too_complex;
+
+	/* I/O children */
+	for(child1 = obj1->io_first_child, child2 = obj2->io_first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	if (child1 || child2)
+		goto out_too_complex;
+
+	/* misc children */
+	for(child1 = obj1->misc_first_child, child2 = obj2->misc_first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	if (child1 || child2)
+		goto out_too_complex;
+
+	return 0;
+
+out_too_complex:
+	hwloc_append_diff_too_complex(obj1, firstdiffp, lastdiffp);
+	return 0;
+}
+
+int hwloc_topology_diff_build(hwloc_topology_t topo1,
+			      hwloc_topology_t topo2,
+			      unsigned long flags,
+			      hwloc_topology_diff_t *diffp)
+{
+	hwloc_topology_diff_t lastdiff, tmpdiff;
+	struct hwloc_internal_distances_s *dist1, *dist2;
+	unsigned i;
+	int err;
+
+	if (!topo1->is_loaded || !topo2->is_loaded) {
+	  errno = EINVAL;
+	  return -1;
+	}
+
+	if (flags != 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	*diffp = NULL;
+	err = hwloc_diff_trees(topo1, hwloc_get_root_obj(topo1),
+			       topo2, hwloc_get_root_obj(topo2),
+			       flags,
+			       diffp, &lastdiff);
+	if (!err) {
+		tmpdiff = *diffp;
+		while (tmpdiff) {
+			if (tmpdiff->generic.type == HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX) {
+				err = 1;
+				break;
+			}
+			tmpdiff = tmpdiff->generic.next;
+		}
+	}
+
+	if (!err) {
+		if (SETS_DIFFERENT(allowed_cpuset, topo1, topo2)
+		    || SETS_DIFFERENT(allowed_nodeset, topo1, topo2)) {
+			hwloc_append_diff_too_complex(hwloc_get_root_obj(topo1), diffp, &lastdiff);
+			err = 1;
+		}
+	}
+
+	if (!err) {
+		/* distances */
+		hwloc_internal_distances_refresh(topo1);
+		hwloc_internal_distances_refresh(topo2);
+		dist1 = topo1->first_dist;
+		dist2 = topo2->first_dist;
+		while (dist1 || dist2) {
+			if (!!dist1 != !!dist2) {
+				hwloc_append_diff_too_complex(hwloc_get_root_obj(topo1), diffp, &lastdiff);
+				err = 1;
+				break;
+			}
+			if (dist1->type != dist2->type
+			    || dist1->nbobjs != dist2->nbobjs
+			    || dist1->kind != dist2->kind
+			    || memcmp(dist1->values, dist2->values, dist1->nbobjs * dist1->nbobjs * sizeof(*dist1->values))) {
+				hwloc_append_diff_too_complex(hwloc_get_root_obj(topo1), diffp, &lastdiff);
+				err = 1;
+				break;
+			}
+			for(i=0; i<dist1->nbobjs; i++)
+				/* gp_index isn't enforced above. so compare logical_index instead, which is enforced. requires distances refresh() above */
+				if (dist1->objs[i]->logical_index != dist2->objs[i]->logical_index) {
+					hwloc_append_diff_too_complex(hwloc_get_root_obj(topo1), diffp, &lastdiff);
+					err = 1;
+					break;
+				}
+			dist1 = dist1->next;
+			dist2 = dist2->next;
+		}
+	}
+
+	return err;
+}
+
+/********************
+ * Applying diffs
+ */
+
+static int
+hwloc_apply_diff_one(hwloc_topology_t topology,
+		     hwloc_topology_diff_t diff,
+		     unsigned long flags)
+{
+	int reverse = !!(flags & HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE);
+
+	switch (diff->generic.type) {
+	case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR: {
+		struct hwloc_topology_diff_obj_attr_s *obj_attr = &diff->obj_attr;
+		hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, obj_attr->obj_depth, obj_attr->obj_index);
+		if (!obj)
+			return -1;
+
+		switch (obj_attr->diff.generic.type) {
+		case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE: {
+			hwloc_obj_t tmpobj;
+			hwloc_uint64_t oldvalue = reverse ? obj_attr->diff.uint64.newvalue : obj_attr->diff.uint64.oldvalue;
+			hwloc_uint64_t newvalue = reverse ? obj_attr->diff.uint64.oldvalue : obj_attr->diff.uint64.newvalue;
+			hwloc_uint64_t valuediff = newvalue - oldvalue;
+			if (obj->type != HWLOC_OBJ_NUMANODE)
+				return -1;
+			if (obj->attr->numanode.local_memory != oldvalue)
+				return -1;
+			obj->attr->numanode.local_memory = newvalue;
+			tmpobj = obj;
+			while (tmpobj) {
+				tmpobj->total_memory += valuediff;
+				tmpobj = tmpobj->parent;
+			}
+			break;
+		}
+		case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME: {
+			const char *oldvalue = reverse ? obj_attr->diff.string.newvalue : obj_attr->diff.string.oldvalue;
+			const char *newvalue = reverse ? obj_attr->diff.string.oldvalue : obj_attr->diff.string.newvalue;
+			if (!obj->name || strcmp(obj->name, oldvalue))
+				return -1;
+			free(obj->name);
+			obj->name = strdup(newvalue);
+			break;
+		}
+		case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO: {
+			const char *name = obj_attr->diff.string.name;
+			const char *oldvalue = reverse ? obj_attr->diff.string.newvalue : obj_attr->diff.string.oldvalue;
+			const char *newvalue = reverse ? obj_attr->diff.string.oldvalue : obj_attr->diff.string.newvalue;
+			unsigned i;
+			int found = 0;
+			for(i=0; i<obj->infos_count; i++) {
+				struct hwloc_info_s *info = &obj->infos[i];
+				if (!strcmp(info->name, name)
+				    && !strcmp(info->value, oldvalue)) {
+					free(info->value);
+					info->value = strdup(newvalue);
+					found = 1;
+					break;
+				}
+			}
+			if (!found)
+				return -1;
+			break;
+		}
+		default:
+			return -1;
+		}
+
+		break;
+	}
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+int hwloc_topology_diff_apply(hwloc_topology_t topology,
+			      hwloc_topology_diff_t diff,
+			      unsigned long flags)
+{
+	hwloc_topology_diff_t tmpdiff, tmpdiff2;
+	int err, nr;
+
+	if (!topology->is_loaded) {
+	  errno = EINVAL;
+	  return -1;
+	}
+
+	if (flags & ~HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	tmpdiff = diff;
+	nr = 0;
+	while (tmpdiff) {
+		nr++;
+		err = hwloc_apply_diff_one(topology, tmpdiff, flags);
+		if (err < 0)
+			goto cancel;
+		tmpdiff = tmpdiff->generic.next;
+	}
+	return 0;
+
+cancel:
+	tmpdiff2 = tmpdiff;
+	tmpdiff = diff;
+	while (tmpdiff != tmpdiff2) {
+		hwloc_apply_diff_one(topology, tmpdiff, flags ^ HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE);
+		tmpdiff = tmpdiff->generic.next;
+	}
+	errno = EINVAL;
+	return -nr; /* return the index (starting at 1) of the first element that couldn't be applied */
+}
diff --git a/src/3rdparty/hwloc/src/distances.c b/src/3rdparty/hwloc/src/distances.c
new file mode 100644
index 000000000..f0b91f019
--- /dev/null
+++ b/src/3rdparty/hwloc/src/distances.c
@@ -0,0 +1,920 @@
+/*
+ * Copyright © 2010-2018 Inria.  All rights reserved.
+ * Copyright © 2011-2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <private/misc.h>
+
+#include <float.h>
+#include <math.h>
+
+/******************************************************
+ * Global init, prepare, destroy, dup
+ */
+
+/* called during topology init() */
+void hwloc_internal_distances_init(struct hwloc_topology *topology)
+{
+  topology->first_dist = topology->last_dist = NULL;
+  topology->next_dist_id = 0;
+}
+
+/* called at the beginning of load() */
+void hwloc_internal_distances_prepare(struct hwloc_topology *topology)
+{
+  char *env;
+  hwloc_localeswitch_declare;
+
+  topology->grouping = 1;
+  if (topology->type_filter[HWLOC_OBJ_GROUP] == HWLOC_TYPE_FILTER_KEEP_NONE)
+    topology->grouping = 0;
+  env = getenv("HWLOC_GROUPING");
+  if (env && !atoi(env))
+    topology->grouping = 0;
+
+  if (topology->grouping) {
+    topology->grouping_next_subkind = 0;
+
+    HWLOC_BUILD_ASSERT(sizeof(topology->grouping_accuracies)/sizeof(*topology->grouping_accuracies) == 5);
+    topology->grouping_accuracies[0] = 0.0f;
+    topology->grouping_accuracies[1] = 0.01f;
+    topology->grouping_accuracies[2] = 0.02f;
+    topology->grouping_accuracies[3] = 0.05f;
+    topology->grouping_accuracies[4] = 0.1f;
+    topology->grouping_nbaccuracies = 5;
+
+    hwloc_localeswitch_init();
+    env = getenv("HWLOC_GROUPING_ACCURACY");
+    if (!env) {
+      /* only use 0.0 */
+      topology->grouping_nbaccuracies = 1;
+    } else if (strcmp(env, "try")) {
+      /* use the given value */
+      topology->grouping_nbaccuracies = 1;
+      topology->grouping_accuracies[0] = (float) atof(env);
+    } /* otherwise try all values */
+    hwloc_localeswitch_fini();
+
+    topology->grouping_verbose = 0;
+    env = getenv("HWLOC_GROUPING_VERBOSE");
+    if (env)
+      topology->grouping_verbose = atoi(env);
+  }
+}
+
+static void hwloc_internal_distances_free(struct hwloc_internal_distances_s *dist)
+{
+  free(dist->indexes);
+  free(dist->objs);
+  free(dist->values);
+  free(dist);
+}
+
+/* called during topology destroy */
+void hwloc_internal_distances_destroy(struct hwloc_topology * topology)
+{
+  struct hwloc_internal_distances_s *dist, *next = topology->first_dist;
+  while ((dist = next) != NULL) {
+    next = dist->next;
+    hwloc_internal_distances_free(dist);
+  }
+  topology->first_dist = topology->last_dist = NULL;
+}
+
+static int hwloc_internal_distances_dup_one(struct hwloc_topology *new, struct hwloc_internal_distances_s *olddist)
+{
+  struct hwloc_tma *tma = new->tma;
+  struct hwloc_internal_distances_s *newdist;
+  unsigned nbobjs = olddist->nbobjs;
+
+  newdist = hwloc_tma_malloc(tma, sizeof(*newdist));
+  if (!newdist)
+    return -1;
+
+  newdist->type = olddist->type;
+  newdist->nbobjs = nbobjs;
+  newdist->kind = olddist->kind;
+  newdist->id = olddist->id;
+
+  newdist->indexes = hwloc_tma_malloc(tma, nbobjs * sizeof(*newdist->indexes));
+  newdist->objs = hwloc_tma_calloc(tma, nbobjs * sizeof(*newdist->objs));
+  newdist->objs_are_valid = 0;
+  newdist->values = hwloc_tma_malloc(tma, nbobjs*nbobjs * sizeof(*newdist->values));
+  if (!newdist->indexes || !newdist->objs || !newdist->values) {
+    assert(!tma || !tma->dontfree); /* this tma cannot fail to allocate */
+    hwloc_internal_distances_free(newdist);
+    return -1;
+  }
+
+  memcpy(newdist->indexes, olddist->indexes, nbobjs * sizeof(*newdist->indexes));
+  memcpy(newdist->values, olddist->values, nbobjs*nbobjs * sizeof(*newdist->values));
+
+  newdist->next = NULL;
+  newdist->prev = new->last_dist;
+  if (new->last_dist)
+    new->last_dist->next = newdist;
+  else
+    new->first_dist = newdist;
+  new->last_dist = newdist;
+
+  return 0;
+}
+
+/* This function may be called with topology->tma set, it cannot free() or realloc() */
+int hwloc_internal_distances_dup(struct hwloc_topology *new, struct hwloc_topology *old)
+{
+  struct hwloc_internal_distances_s *olddist;
+  int err;
+  new->next_dist_id = old->next_dist_id;
+  for(olddist = old->first_dist; olddist; olddist = olddist->next) {
+    err = hwloc_internal_distances_dup_one(new, olddist);
+    if (err < 0)
+      return err;
+  }
+  return 0;
+}
+
+/******************************************************
+ * Remove distances from the topology
+ */
+
+int hwloc_distances_remove(hwloc_topology_t topology)
+{
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+  hwloc_internal_distances_destroy(topology);
+  return 0;
+}
+
+int hwloc_distances_remove_by_depth(hwloc_topology_t topology, int depth)
+{
+  struct hwloc_internal_distances_s *dist, *next;
+  hwloc_obj_type_t type;
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* switch back to types since we don't support groups for now */
+  type = hwloc_get_depth_type(topology, depth);
+  if (type == (hwloc_obj_type_t)-1) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  next = topology->first_dist;
+  while ((dist = next) != NULL) {
+    next = dist->next;
+    if (dist->type == type) {
+      if (next)
+	next->prev = dist->prev;
+      else
+	topology->last_dist = dist->prev;
+      if (dist->prev)
+	dist->prev->next = dist->next;
+      else
+	topology->first_dist = dist->next;
+      hwloc_internal_distances_free(dist);
+    }
+  }
+
+  return 0;
+}
+
+/******************************************************
+ * Add distances to the topology
+ */
+
+static void
+hwloc__groups_by_distances(struct hwloc_topology *topology, unsigned nbobjs, struct hwloc_obj **objs, uint64_t *values, unsigned long kind, unsigned nbaccuracies, float *accuracies, int needcheck);
+
+/* insert a distance matrix in the topology.
+ * the caller gives us the distances and objs pointers, we'll free them later.
+ */
+static int
+hwloc_internal_distances__add(hwloc_topology_t topology,
+			      hwloc_obj_type_t type, unsigned nbobjs, hwloc_obj_t *objs, uint64_t *indexes, uint64_t *values,
+			      unsigned long kind)
+{
+  struct hwloc_internal_distances_s *dist = calloc(1, sizeof(*dist));
+  if (!dist)
+    goto err;
+
+  dist->type = type;
+  dist->nbobjs = nbobjs;
+  dist->kind = kind;
+
+  if (!objs) {
+    assert(indexes);
+    /* we only have indexes, we'll refresh objs from there */
+    dist->indexes = indexes;
+    dist->objs = calloc(nbobjs, sizeof(hwloc_obj_t));
+    if (!dist->objs)
+      goto err_with_dist;
+    dist->objs_are_valid = 0;
+
+  } else {
+    unsigned i;
+    assert(!indexes);
+    /* we only have objs, generate the indexes arrays so that we can refresh objs later */
+    dist->objs = objs;
+    dist->objs_are_valid = 1;
+    dist->indexes = malloc(nbobjs * sizeof(*dist->indexes));
+    if (!dist->indexes)
+      goto err_with_dist;
+    if (dist->type == HWLOC_OBJ_PU || dist->type == HWLOC_OBJ_NUMANODE) {
+      for(i=0; i<nbobjs; i++)
+	dist->indexes[i] = objs[i]->os_index;
+    } else {
+      for(i=0; i<nbobjs; i++)
+	dist->indexes[i] = objs[i]->gp_index;
+    }
+  }
+
+  dist->values = values;
+
+  dist->id = topology->next_dist_id++;
+
+  if (topology->last_dist)
+    topology->last_dist->next = dist;
+  else
+    topology->first_dist = dist;
+  dist->prev = topology->last_dist;
+  dist->next = NULL;
+  topology->last_dist = dist;
+  return 0;
+
+ err_with_dist:
+  free(dist);
+ err:
+  free(objs);
+  free(indexes);
+  free(values);
+  return -1;
+}
+
+int hwloc_internal_distances_add_by_index(hwloc_topology_t topology,
+					  hwloc_obj_type_t type, unsigned nbobjs, uint64_t *indexes, uint64_t *values,
+					  unsigned long kind, unsigned long flags)
+{
+  if (nbobjs < 2) {
+    errno = EINVAL;
+    goto err;
+  }
+
+  /* cannot group without objects,
+   * and we don't group from XML anyway since the hwloc that generated the XML should have grouped already.
+   */
+  if (flags & HWLOC_DISTANCES_ADD_FLAG_GROUP) {
+    errno = EINVAL;
+    goto err;
+  }
+
+  return hwloc_internal_distances__add(topology, type, nbobjs, NULL, indexes, values, kind);
+
+ err:
+  free(indexes);
+  free(values);
+  return -1;
+}
+
+int hwloc_internal_distances_add(hwloc_topology_t topology,
+				 unsigned nbobjs, hwloc_obj_t *objs, uint64_t *values,
+				 unsigned long kind, unsigned long flags)
+{
+  if (nbobjs < 2) {
+    errno = EINVAL;
+    goto err;
+  }
+
+  if (topology->grouping && (flags & HWLOC_DISTANCES_ADD_FLAG_GROUP)) {
+    float full_accuracy = 0.f;
+    float *accuracies;
+    unsigned nbaccuracies;
+
+    if (flags & HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE) {
+      accuracies = topology->grouping_accuracies;
+      nbaccuracies = topology->grouping_nbaccuracies;
+    } else {
+      accuracies = &full_accuracy;
+      nbaccuracies = 1;
+    }
+
+    if (topology->grouping_verbose) {
+      unsigned i, j;
+      int gp = (objs[0]->type != HWLOC_OBJ_NUMANODE && objs[0]->type != HWLOC_OBJ_PU);
+      fprintf(stderr, "Trying to group objects using distance matrix:\n");
+      fprintf(stderr, "%s", gp ? "gp_index" : "os_index");
+      for(j=0; j<nbobjs; j++)
+	fprintf(stderr, " % 5d", (int)(gp ? objs[j]->gp_index : objs[j]->os_index));
+      fprintf(stderr, "\n");
+      for(i=0; i<nbobjs; i++) {
+	fprintf(stderr, "  % 5d", (int)(gp ? objs[i]->gp_index : objs[i]->os_index));
+	for(j=0; j<nbobjs; j++)
+	  fprintf(stderr, " % 5lld", (long long) values[i*nbobjs + j]);
+	fprintf(stderr, "\n");
+      }
+    }
+
+    hwloc__groups_by_distances(topology, nbobjs, objs, values,
+			       kind, nbaccuracies, accuracies, 1 /* check the first matrix */);
+  }
+
+  return hwloc_internal_distances__add(topology, objs[0]->type, nbobjs, objs, NULL, values, kind);
+
+ err:
+  free(objs);
+  free(values);
+  return -1;
+}
+
+#define HWLOC_DISTANCES_KIND_FROM_ALL (HWLOC_DISTANCES_KIND_FROM_OS|HWLOC_DISTANCES_KIND_FROM_USER)
+#define HWLOC_DISTANCES_KIND_MEANS_ALL (HWLOC_DISTANCES_KIND_MEANS_LATENCY|HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH)
+#define HWLOC_DISTANCES_KIND_ALL (HWLOC_DISTANCES_KIND_FROM_ALL|HWLOC_DISTANCES_KIND_MEANS_ALL)
+#define HWLOC_DISTANCES_ADD_FLAG_ALL (HWLOC_DISTANCES_ADD_FLAG_GROUP|HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE)
+
+/* The actual function exported to the user
+ */
+int hwloc_distances_add(hwloc_topology_t topology,
+			unsigned nbobjs, hwloc_obj_t *objs, hwloc_uint64_t *values,
+			unsigned long kind, unsigned long flags)
+{
+  hwloc_obj_type_t type;
+  unsigned i;
+  uint64_t *_values;
+  hwloc_obj_t *_objs;
+  int err;
+
+  if (nbobjs < 2 || !objs || !values || !topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+  if ((kind & ~HWLOC_DISTANCES_KIND_ALL)
+      || hwloc_weight_long(kind & HWLOC_DISTANCES_KIND_FROM_ALL) != 1
+      || hwloc_weight_long(kind & HWLOC_DISTANCES_KIND_MEANS_ALL) != 1
+      || (flags & ~HWLOC_DISTANCES_ADD_FLAG_ALL)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* no strict need to check for duplicates, things shouldn't break */
+
+  type = objs[0]->type;
+  if (type == HWLOC_OBJ_GROUP) {
+    /* not supported yet, would require we save the subkind together with the type. */
+    errno = EINVAL;
+    return -1;
+  }
+
+  for(i=1; i<nbobjs; i++)
+    if (!objs[i] || objs[i]->type != type) {
+      errno = EINVAL;
+      return -1;
+    }
+
+  /* copy the input arrays and give them to the topology */
+  _objs = malloc(nbobjs*sizeof(hwloc_obj_t));
+  _values = malloc(nbobjs*nbobjs*sizeof(*_values));
+  if (!_objs || !_values)
+    goto out_with_arrays;
+
+  memcpy(_objs, objs, nbobjs*sizeof(hwloc_obj_t));
+  memcpy(_values, values, nbobjs*nbobjs*sizeof(*_values));
+  err = hwloc_internal_distances_add(topology, nbobjs, _objs, _values, kind, flags);
+  if (err < 0)
+    goto out; /* _objs and _values freed in hwloc_internal_distances_add() */
+
+  /* in case we added some groups, see if we need to reconnect */
+  hwloc_topology_reconnect(topology, 0);
+
+  return 0;
+
+ out_with_arrays:
+  free(_values);
+  free(_objs);
+ out:
+  return -1;
+}
+
+/******************************************************
+ * Refresh objects in distances
+ */
+
+static hwloc_obj_t hwloc_find_obj_by_type_and_gp_index(hwloc_topology_t topology, hwloc_obj_type_t type, uint64_t gp_index)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_type(topology, type, 0);
+  while (obj) {
+    if (obj->gp_index == gp_index)
+      return obj;
+    obj = obj->next_cousin;
+  }
+  return NULL;
+}
+
+static void
+hwloc_internal_distances_restrict(struct hwloc_internal_distances_s *dist,
+				  hwloc_obj_t *objs,
+				  unsigned disappeared)
+{
+  unsigned nbobjs = dist->nbobjs;
+  unsigned i, newi;
+  unsigned j, newj;
+
+  for(i=0, newi=0; i<nbobjs; i++)
+    if (objs[i]) {
+      for(j=0, newj=0; j<nbobjs; j++)
+	if (objs[j]) {
+	  dist->values[newi*(nbobjs-disappeared)+newj] = dist->values[i*nbobjs+j];
+	  newj++;
+	}
+      newi++;
+    }
+
+  for(i=0, newi=0; i<nbobjs; i++)
+    if (objs[i]) {
+      objs[newi] = objs[i];
+      dist->indexes[newi] = dist->indexes[i];
+      newi++;
+    }
+
+  dist->nbobjs -= disappeared;
+}
+
+static int
+hwloc_internal_distances_refresh_one(hwloc_topology_t topology,
+				     struct hwloc_internal_distances_s *dist)
+{
+  hwloc_obj_type_t type = dist->type;
+  unsigned nbobjs = dist->nbobjs;
+  hwloc_obj_t *objs = dist->objs;
+  uint64_t *indexes = dist->indexes;
+  unsigned disappeared = 0;
+  unsigned i;
+
+  if (dist->objs_are_valid)
+    return 0;
+
+  for(i=0; i<nbobjs; i++) {
+    hwloc_obj_t obj;
+    /* TODO use cpuset/nodeset to find pus/numas from the root?
+     * faster than traversing the entire level?
+     */
+    if (type == HWLOC_OBJ_PU)
+      obj = hwloc_get_pu_obj_by_os_index(topology, (unsigned) indexes[i]);
+    else if (type == HWLOC_OBJ_NUMANODE)
+      obj = hwloc_get_numanode_obj_by_os_index(topology, (unsigned) indexes[i]);
+    else
+      obj = hwloc_find_obj_by_type_and_gp_index(topology, type, indexes[i]);
+    objs[i] = obj;
+    if (!obj)
+      disappeared++;
+  }
+
+  if (nbobjs-disappeared < 2)
+    /* became useless, drop */
+    return -1;
+
+  if (disappeared)
+    hwloc_internal_distances_restrict(dist, objs, disappeared);
+
+  dist->objs_are_valid = 1;
+  return 0;
+}
+
+/* This function may be called with topology->tma set, it cannot free() or realloc() */
+void
+hwloc_internal_distances_refresh(hwloc_topology_t topology)
+{
+  struct hwloc_internal_distances_s *dist, *next;
+
+  for(dist = topology->first_dist; dist; dist = next) {
+    next = dist->next;
+
+    if (hwloc_internal_distances_refresh_one(topology, dist) < 0) {
+      assert(!topology->tma || !topology->tma->dontfree); /* this tma cannot fail to allocate */
+      if (dist->prev)
+	dist->prev->next = next;
+      else
+	topology->first_dist = next;
+      if (next)
+	next->prev = dist->prev;
+      else
+	topology->last_dist = dist->prev;
+      hwloc_internal_distances_free(dist);
+      continue;
+    }
+  }
+}
+
+void
+hwloc_internal_distances_invalidate_cached_objs(hwloc_topology_t topology)
+{
+  struct hwloc_internal_distances_s *dist;
+  for(dist = topology->first_dist; dist; dist = dist->next)
+    dist->objs_are_valid = 0;
+}
+
+/******************************************************
+ * User API for getting distances
+ */
+
+void
+hwloc_distances_release(hwloc_topology_t topology __hwloc_attribute_unused,
+			struct hwloc_distances_s *distances)
+{
+  free(distances->values);
+  free(distances->objs);
+  free(distances);
+}
+
+static struct hwloc_distances_s *
+hwloc_distances_get_one(hwloc_topology_t topology __hwloc_attribute_unused,
+			struct hwloc_internal_distances_s *dist)
+{
+  struct hwloc_distances_s *distances;
+  unsigned nbobjs;
+
+  distances = malloc(sizeof(*distances));
+  if (!distances)
+    return NULL;
+
+  nbobjs = distances->nbobjs = dist->nbobjs;
+
+  distances->objs = malloc(nbobjs * sizeof(hwloc_obj_t));
+  if (!distances->objs)
+    goto out;
+  memcpy(distances->objs, dist->objs, nbobjs * sizeof(hwloc_obj_t));
+
+  distances->values = malloc(nbobjs * nbobjs * sizeof(*distances->values));
+  if (!distances->values)
+    goto out_with_objs;
+  memcpy(distances->values, dist->values, nbobjs*nbobjs*sizeof(*distances->values));
+
+  distances->kind = dist->kind;
+  return distances;
+
+ out_with_objs:
+  free(distances->objs);
+ out:
+  free(distances);
+  return NULL;
+}
+
+static int
+hwloc__distances_get(hwloc_topology_t topology,
+		     hwloc_obj_type_t type,
+		     unsigned *nrp, struct hwloc_distances_s **distancesp,
+		     unsigned long kind, unsigned long flags __hwloc_attribute_unused)
+{
+  struct hwloc_internal_distances_s *dist;
+  unsigned nr = 0, i;
+
+  /* We could return the internal arrays (as const),
+   * but it would require to prevent removing distances between get() and free().
+   * Not performance critical anyway.
+   */
+
+  if (flags) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* we could refresh only the distances that match, but we won't have many distances anyway,
+   * so performance is totally negligible.
+   *
+   * This is also useful in multithreaded apps that modify the topology.
+   * They can call any valid hwloc_distances_get() to force a refresh after
+   * changing the topology, so that future concurrent get() won't cause
+   * concurrent refresh().
+   */
+  hwloc_internal_distances_refresh(topology);
+
+  for(dist = topology->first_dist; dist; dist = dist->next) {
+    unsigned long kind_from = kind & HWLOC_DISTANCES_KIND_FROM_ALL;
+    unsigned long kind_means = kind & HWLOC_DISTANCES_KIND_MEANS_ALL;
+
+    if (type != HWLOC_OBJ_TYPE_NONE && type != dist->type)
+      continue;
+
+    if (kind_from && !(kind_from & dist->kind))
+      continue;
+    if (kind_means && !(kind_means & dist->kind))
+      continue;
+
+    if (nr < *nrp) {
+      struct hwloc_distances_s *distances = hwloc_distances_get_one(topology, dist);
+      if (!distances)
+	goto error;
+      distancesp[nr] = distances;
+    }
+    nr++;
+  }
+
+  for(i=nr; i<*nrp; i++)
+    distancesp[i] = NULL;
+  *nrp = nr;
+  return 0;
+
+ error:
+  for(i=0; i<nr; i++)
+    hwloc_distances_release(topology, distancesp[i]);
+  return -1;
+}
+
+int
+hwloc_distances_get(hwloc_topology_t topology,
+		    unsigned *nrp, struct hwloc_distances_s **distancesp,
+		    unsigned long kind, unsigned long flags)
+{
+  if (flags || !topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  return hwloc__distances_get(topology, HWLOC_OBJ_TYPE_NONE, nrp, distancesp, kind, flags);
+}
+
+int
+hwloc_distances_get_by_depth(hwloc_topology_t topology, int depth,
+			     unsigned *nrp, struct hwloc_distances_s **distancesp,
+			     unsigned long kind, unsigned long flags)
+{
+  hwloc_obj_type_t type;
+
+  if (flags || !topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* switch back to types since we don't support groups for now */
+  type = hwloc_get_depth_type(topology, depth);
+  if (type == (hwloc_obj_type_t)-1) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  return hwloc__distances_get(topology, type, nrp, distancesp, kind, flags);
+}
+
+/******************************************************
+ * Grouping objects according to distances
+ */
+
+static void hwloc_report_user_distance_error(const char *msg, int line)
+{
+  static int reported = 0;
+
+  if (!reported && !hwloc_hide_errors()) {
+    fprintf(stderr, "****************************************************************************\n");
+    fprintf(stderr, "* hwloc %s was given invalid distances by the user.\n", HWLOC_VERSION);
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* %s\n", msg);
+    fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* Please make sure that distances given through the programming API\n");
+    fprintf(stderr, "* do not contradict any other topology information.\n");
+    fprintf(stderr, "* \n");
+    fprintf(stderr, "* hwloc will now ignore this invalid topology information and continue.\n");
+    fprintf(stderr, "****************************************************************************\n");
+    reported = 1;
+  }
+}
+
+static int hwloc_compare_values(uint64_t a, uint64_t b, float accuracy)
+{
+  if (accuracy != 0.0f && fabsf((float)a-(float)b) < (float)a * accuracy)
+    return 0;
+  return a < b ? -1 : a == b ? 0 : 1;
+}
+
+/*
+ * Place objects in groups if they are in a transitive graph of minimal values.
+ * Return how many groups were created, or 0 if some incomplete distance graphs were found.
+ */
+static unsigned
+hwloc__find_groups_by_min_distance(unsigned nbobjs,
+				   uint64_t *_values,
+				   float accuracy,
+				   unsigned *groupids,
+				   int verbose)
+{
+  uint64_t min_distance = UINT64_MAX;
+  unsigned groupid = 1;
+  unsigned i,j,k;
+  unsigned skipped = 0;
+
+#define VALUE(i, j) _values[(i) * nbobjs + (j)]
+
+  memset(groupids, 0, nbobjs*sizeof(*groupids));
+
+  /* find the minimal distance */
+  for(i=0; i<nbobjs; i++)
+    for(j=0; j<nbobjs; j++) /* check the entire matrix, it may not be perfectly symmetric depending on the accuracy */
+      if (i != j && VALUE(i, j) < min_distance) /* no accuracy here, we want the real minimal */
+        min_distance = VALUE(i, j);
+  hwloc_debug("  found minimal distance %llu between objects\n", (unsigned long long) min_distance);
+
+  if (min_distance == UINT64_MAX)
+    return 0;
+
+  /* build groups of objects connected with this distance */
+  for(i=0; i<nbobjs; i++) {
+    unsigned size;
+    unsigned firstfound;
+
+    /* if already grouped, skip */
+    if (groupids[i])
+      continue;
+
+    /* start a new group */
+    groupids[i] = groupid;
+    size = 1;
+    firstfound = i;
+
+    while (firstfound != (unsigned)-1) {
+      /* we added new objects to the group, the first one was firstfound.
+       * rescan all connections from these new objects (starting at first found) to any other objects,
+       * so as to find new objects minimally-connected by transitivity.
+       */
+      unsigned newfirstfound = (unsigned)-1;
+      for(j=firstfound; j<nbobjs; j++)
+	if (groupids[j] == groupid)
+	  for(k=0; k<nbobjs; k++)
+              if (!groupids[k] && !hwloc_compare_values(VALUE(j, k), min_distance, accuracy)) {
+	      groupids[k] = groupid;
+	      size++;
+	      if (newfirstfound == (unsigned)-1)
+		newfirstfound = k;
+	      if (i == j)
+		hwloc_debug("  object %u is minimally connected to %u\n", k, i);
+	      else
+	        hwloc_debug("  object %u is minimally connected to %u through %u\n", k, i, j);
+	    }
+      firstfound = newfirstfound;
+    }
+
+    if (size == 1) {
+      /* cancel this useless group, ignore this object and try from the next one */
+      groupids[i] = 0;
+      skipped++;
+      continue;
+    }
+
+    /* valid this group */
+    groupid++;
+    if (verbose)
+      fprintf(stderr, " Found transitive graph with %u objects with minimal distance %llu accuracy %f\n",
+	      size, (unsigned long long) min_distance, accuracy);
+  }
+
+  if (groupid == 2 && !skipped)
+    /* we created a single group containing all objects, ignore it */
+    return 0;
+
+  /* return the last id, since it's also the number of used group ids */
+  return groupid-1;
+}
+
+/* check that the matrix is ok */
+static int
+hwloc__check_grouping_matrix(unsigned nbobjs, uint64_t *_values, float accuracy, int verbose)
+{
+  unsigned i,j;
+  for(i=0; i<nbobjs; i++) {
+    for(j=i+1; j<nbobjs; j++) {
+      /* should be symmetric */
+      if (hwloc_compare_values(VALUE(i, j), VALUE(j, i), accuracy)) {
+	if (verbose)
+	  fprintf(stderr, " Distance matrix asymmetric ([%u,%u]=%llu != [%u,%u]=%llu), aborting\n",
+		  i, j, (unsigned long long) VALUE(i, j), j, i, (unsigned long long) VALUE(j, i));
+	return -1;
+      }
+      /* diagonal is smaller than everything else */
+      if (hwloc_compare_values(VALUE(i, j), VALUE(i, i), accuracy) <= 0) {
+	if (verbose)
+	  fprintf(stderr, " Distance to self not strictly minimal ([%u,%u]=%llu <= [%u,%u]=%llu), aborting\n",
+		  i, j, (unsigned long long) VALUE(i, j), i, i, (unsigned long long) VALUE(i, i));
+	return -1;
+      }
+    }
+  }
+  return 0;
+}
+
+/*
+ * Look at object physical distances to group them.
+ */
+static void
+hwloc__groups_by_distances(struct hwloc_topology *topology,
+			   unsigned nbobjs,
+			   struct hwloc_obj **objs,
+			   uint64_t *_values,
+			   unsigned long kind,
+			   unsigned nbaccuracies,
+			   float *accuracies,
+			   int needcheck)
+{
+  HWLOC_VLA(unsigned, groupids, nbobjs);
+  unsigned nbgroups = 0;
+  unsigned i,j;
+  int verbose = topology->grouping_verbose;
+
+  if (nbobjs <= 2)
+      return;
+
+  if (!(kind & HWLOC_DISTANCES_KIND_MEANS_LATENCY))
+    /* don't know how to use those for grouping */
+    /* TODO hwloc__find_groups_by_max_distance() for bandwidth */
+    return;
+
+  for(i=0; i<nbaccuracies; i++) {
+    if (verbose)
+      fprintf(stderr, "Trying to group %u %s objects according to physical distances with accuracy %f\n",
+	      nbobjs, hwloc_obj_type_string(objs[0]->type), accuracies[i]);
+    if (needcheck && hwloc__check_grouping_matrix(nbobjs, _values, accuracies[i], verbose) < 0)
+      continue;
+    nbgroups = hwloc__find_groups_by_min_distance(nbobjs, _values, accuracies[i], groupids, verbose);
+    if (nbgroups)
+      break;
+  }
+  if (!nbgroups)
+    return;
+
+  {
+      HWLOC_VLA(hwloc_obj_t, groupobjs, nbgroups);
+      HWLOC_VLA(unsigned, groupsizes, nbgroups);
+      HWLOC_VLA(uint64_t, groupvalues, nbgroups*nbgroups);
+      unsigned failed = 0;
+
+      /* create new Group objects and record their size */
+      memset(&(groupsizes[0]), 0, sizeof(groupsizes[0]) * nbgroups);
+      for(i=0; i<nbgroups; i++) {
+          /* create the Group object */
+          hwloc_obj_t group_obj, res_obj;
+          group_obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+          group_obj->cpuset = hwloc_bitmap_alloc();
+          group_obj->attr->group.kind = HWLOC_GROUP_KIND_DISTANCE;
+          group_obj->attr->group.subkind = topology->grouping_next_subkind;
+          for (j=0; j<nbobjs; j++)
+	    if (groupids[j] == i+1) {
+	      /* assemble the group sets */
+	      hwloc_obj_add_other_obj_sets(group_obj, objs[j]);
+              groupsizes[i]++;
+            }
+          hwloc_debug_1arg_bitmap("adding Group object with %u objects and cpuset %s\n",
+                                  groupsizes[i], group_obj->cpuset);
+          res_obj = hwloc__insert_object_by_cpuset(topology, NULL, group_obj,
+						   (kind & HWLOC_DISTANCES_KIND_FROM_USER) ? hwloc_report_user_distance_error : hwloc_report_os_error);
+	  /* res_obj may be NULL on failure to insert. */
+	  if (!res_obj)
+	    failed++;
+	  /* or it may be different from groupobjs if we got groups from XML import before grouping */
+          groupobjs[i] = res_obj;
+      }
+      topology->grouping_next_subkind++;
+
+      if (failed)
+	/* don't try to group above if we got a NULL group here, just keep this incomplete level */
+	return;
+
+      /* factorize values */
+      memset(&(groupvalues[0]), 0, sizeof(groupvalues[0]) * nbgroups * nbgroups);
+#undef VALUE
+#define VALUE(i, j) _values[(i) * nbobjs + (j)]
+#define GROUP_VALUE(i, j) groupvalues[(i) * nbgroups + (j)]
+      for(i=0; i<nbobjs; i++)
+	if (groupids[i])
+	  for(j=0; j<nbobjs; j++)
+	    if (groupids[j])
+                GROUP_VALUE(groupids[i]-1, groupids[j]-1) += VALUE(i, j);
+      for(i=0; i<nbgroups; i++)
+          for(j=0; j<nbgroups; j++) {
+              unsigned groupsize = groupsizes[i]*groupsizes[j];
+              GROUP_VALUE(i, j) /= groupsize;
+          }
+#ifdef HWLOC_DEBUG
+      hwloc_debug("%s", "generated new distance matrix between groups:\n");
+      hwloc_debug("%s", "  index");
+      for(j=0; j<nbgroups; j++)
+	hwloc_debug(" % 5d", (int) j); /* print index because os_index is -1 for Groups */
+      hwloc_debug("%s", "\n");
+      for(i=0; i<nbgroups; i++) {
+	hwloc_debug("  % 5d", (int) i);
+	for(j=0; j<nbgroups; j++)
+	  hwloc_debug(" %llu", (unsigned long long) GROUP_VALUE(i, j));
+	hwloc_debug("%s", "\n");
+      }
+#endif
+
+      hwloc__groups_by_distances(topology, nbgroups, groupobjs, groupvalues, kind, nbaccuracies, accuracies, 0 /* no need to check generated matrix */);
+  }
+}
diff --git a/src/3rdparty/hwloc/src/misc.c b/src/3rdparty/hwloc/src/misc.c
new file mode 100644
index 000000000..16dacf623
--- /dev/null
+++ b/src/3rdparty/hwloc/src/misc.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2018 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <private/private.h>
+#include <private/misc.h>
+
+#include <stdarg.h>
+#ifdef HAVE_SYS_UTSNAME_H
+#include <sys/utsname.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+
+#ifdef HAVE_PROGRAM_INVOCATION_NAME
+#include <errno.h>
+extern char *program_invocation_name;
+#endif
+#ifdef HAVE___PROGNAME
+extern char *__progname;
+#endif
+
+/* Like snprintf(), but guaranteed to return the number of characters the
+ * output would require (not a truncation indicator), even on systems whose
+ * vsnprintf() returns the written length or -1 on overflow.
+ * Returns -1 only on memory-allocation failure. */
+int hwloc_snprintf(char *str, size_t size, const char *format, ...)
+{
+  int ret;
+  va_list ap;
+  static char bin;
+  size_t fakesize;
+  char *fakestr;
+
+  /* Some systems crash on str == NULL */
+  if (!size) {
+    str = &bin;
+    size = 1;
+  }
+
+  va_start(ap, format);
+  ret = vsnprintf(str, size, format, ap);
+  va_end(ap);
+
+  /* common case: ret is unambiguously the required length */
+  if (ret >= 0 && (size_t) ret != size-1)
+    return ret;
+
+  /* vsnprintf returned size-1 or -1. That could be a system which reports the
+   * written data and not the actually required room. Try increasing buffer
+   * size to get the latter. */
+
+  fakesize = size;
+  fakestr = NULL;
+  do {
+    /* keep doubling a temporary buffer until the whole string fits */
+    fakesize *= 2;
+    free(fakestr);
+    fakestr = malloc(fakesize);
+    if (NULL == fakestr)
+      return -1;
+    va_start(ap, format);
+    errno = 0;
+    ret = vsnprintf(fakestr, fakesize, format, ap);
+    va_end(ap);
+  } while ((size_t) ret == fakesize-1 || (ret < 0 && (!errno || errno == ERANGE)));
+
+  /* copy back as much as fits in the caller's buffer, NUL-terminated */
+  if (ret >= 0 && size) {
+    if (size > (size_t) ret+1)
+      size = ret+1;
+    memcpy(str, fakestr, size-1);
+    str[size-1] = 0;
+  }
+  free(fakestr);
+
+  return ret;
+}
+
+/* Case-insensitively compare needle with the portion of haystack that
+ * precedes the first ':' (or the NUL terminator).
+ * Returns 0 when every compared character matched AND at least n characters
+ * were compared; non-zero otherwise.
+ * NOTE(review): needle is advanced once per haystack character, so it is
+ * assumed to be at least as long as the haystack prefix — confirm callers. */
+int hwloc_namecoloncmp(const char *haystack, const char *needle, size_t n)
+{
+  size_t i = 0;
+  while (*haystack && *haystack != ':') {
+    int ha = *haystack++;
+    int low_h = tolower(ha);
+    int ne = *needle++;
+    int low_n = tolower(ne);
+    if (low_h != low_n)
+      return 1;
+    i++;
+  }
+  /* mismatch if the matched prefix was shorter than the required length */
+  return i < n;
+}
+
+/* Annotate the topology root object with uname(2) fields as info pairs
+ * (OSName, OSRelease, OSVersion, HostName, Architecture).
+ * cached_uname may point to a struct utsname already filled by the caller;
+ * otherwise uname() is called here.  No-op when uname() is unavailable,
+ * when it fails, or when the root was already annotated. */
+void hwloc_add_uname_info(struct hwloc_topology *topology __hwloc_attribute_unused,
+			  void *cached_uname __hwloc_attribute_unused)
+{
+#ifdef HAVE_UNAME
+  struct utsname _utsname, *utsname;
+
+  if (hwloc_obj_get_info_by_name(topology->levels[0][0], "OSName"))
+    /* don't annotate twice */
+    return;
+
+  if (cached_uname)
+    utsname = (struct utsname *) cached_uname;
+  else {
+    utsname = &_utsname;
+    if (uname(utsname) < 0)
+      return;
+  }
+
+  /* only record fields that are non-empty */
+  if (*utsname->sysname)
+    hwloc_obj_add_info(topology->levels[0][0], "OSName", utsname->sysname);
+  if (*utsname->release)
+    hwloc_obj_add_info(topology->levels[0][0], "OSRelease", utsname->release);
+  if (*utsname->version)
+    hwloc_obj_add_info(topology->levels[0][0], "OSVersion", utsname->version);
+  if (*utsname->nodename)
+    hwloc_obj_add_info(topology->levels[0][0], "HostName", utsname->nodename);
+  if (*utsname->machine)
+    hwloc_obj_add_info(topology->levels[0][0], "Architecture", utsname->machine);
+#endif /* HAVE_UNAME */
+}
+
+/* Return a malloc'ed copy of the current program's basename, or NULL when
+ * it cannot be determined on this platform.  Windows uses
+ * GetModuleFileName(); elsewhere the first available of getprogname(),
+ * getexecname(), program_invocation_name or __progname is used. */
+char *
+hwloc_progname(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+#if HAVE_DECL_GETMODULEFILENAME
+  char name[256], *local_basename;
+  unsigned res = GetModuleFileName(NULL, name, sizeof(name));
+  /* res == sizeof(name) means the path was truncated; give up in that case too */
+  if (res == sizeof(name) || !res)
+    return NULL;
+  local_basename = strrchr(name, '\\');
+  if (!local_basename)
+    local_basename = name;
+  else
+    local_basename++;
+  return strdup(local_basename);
+#else /* !HAVE_GETMODULEFILENAME */
+  const char *name, *local_basename;
+#if HAVE_DECL_GETPROGNAME
+  name = getprogname(); /* FreeBSD, NetBSD, some Solaris */
+#elif HAVE_DECL_GETEXECNAME
+  name = getexecname(); /* Solaris */
+#elif defined HAVE_PROGRAM_INVOCATION_NAME
+  name = program_invocation_name; /* Glibc. BGQ CNK. */
+  /* could use program_invocation_short_name directly, but we have the code to remove the path below anyway */
+#elif defined HAVE___PROGNAME
+  name = __progname; /* fallback for most unix, used for OpenBSD */
+#else
+  /* TODO: _NSGetExecutablePath(path, &size) on Darwin */
+  /* TODO: AIX, HPUX */
+  name = NULL;
+#endif
+  if (!name)
+    return NULL;
+  /* strip the directory part, keep everything after the last '/' */
+  local_basename = strrchr(name, '/');
+  if (!local_basename)
+    local_basename = name;
+  else
+    local_basename++;
+  return strdup(local_basename);
+#endif /* !HAVE_GETMODULEFILENAME */
+}
diff --git a/src/3rdparty/hwloc/src/pci-common.c b/src/3rdparty/hwloc/src/pci-common.c
new file mode 100644
index 000000000..00f08a9e7
--- /dev/null
+++ b/src/3rdparty/hwloc/src/pci-common.c
@@ -0,0 +1,941 @@
+/*
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/plugins.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <private/misc.h>
+
+#include <fcntl.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <sys/stat.h>
+
+#if defined(HWLOC_WIN_SYS) && !defined(__CYGWIN__)
+#include <io.h>
+#define open _open
+#define read _read
+#define close _close
+#endif
+
+/* Parse one HWLOC_PCI_LOCALITY entry of the form
+ *   "domain[:bus[-bus]] cpuset"
+ * and append it to topology->pci_forced_locality[].
+ * string must contain a ' ' separating the bus range from the cpuset.
+ * *allocated tracks the array capacity (doubled on demand).
+ * Entries are silently dropped on parse or allocation failure. */
+static void
+hwloc_pci_forced_locality_parse_one(struct hwloc_topology *topology,
+				    const char *string /* must contain a ' ' */,
+				    unsigned *allocated)
+{
+  unsigned nr = topology->pci_forced_locality_nr;
+  unsigned domain, bus_first, bus_last, dummy;
+  hwloc_bitmap_t set;
+  char *tmp;
+
+  /* accept "dom:first-last cpuset", "dom:bus cpuset" or "dom cpuset" */
+  if (sscanf(string, "%x:%x-%x %x", &domain, &bus_first, &bus_last, &dummy) == 4) {
+    /* fine */
+  } else if (sscanf(string, "%x:%x %x", &domain, &bus_first, &dummy) == 3) {
+    bus_last = bus_first;
+  } else if (sscanf(string, "%x %x", &domain, &dummy) == 2) {
+    bus_first = 0;
+    bus_last = 255;
+  } else
+    return;
+
+  tmp = strchr(string, ' ');
+  if (!tmp)
+    return;
+  tmp++;
+
+  /* NOTE(review): hwloc_bitmap_alloc() result is not checked before use —
+   * confirm it cannot return NULL here, or add a check upstream. */
+  set = hwloc_bitmap_alloc();
+  hwloc_bitmap_sscanf(set, tmp);
+
+  if (!*allocated) {
+    topology->pci_forced_locality = malloc(sizeof(*topology->pci_forced_locality));
+    if (!topology->pci_forced_locality)
+      goto out_with_set; /* failed to allocate, ignore this forced locality */
+    *allocated = 1;
+  } else if (nr >= *allocated) {
+    struct hwloc_pci_forced_locality_s *tmplocs;
+    tmplocs = realloc(topology->pci_forced_locality,
+		      2 * *allocated * sizeof(*topology->pci_forced_locality));
+    if (!tmplocs)
+      goto out_with_set; /* failed to allocate, ignore this forced locality */
+    topology->pci_forced_locality = tmplocs;
+    *allocated *= 2;
+  }
+
+  topology->pci_forced_locality[nr].domain = domain;
+  topology->pci_forced_locality[nr].bus_first = bus_first;
+  topology->pci_forced_locality[nr].bus_last = bus_last;
+  topology->pci_forced_locality[nr].cpuset = set;
+  topology->pci_forced_locality_nr++;
+  return;
+
+ out_with_set:
+  hwloc_bitmap_free(set);
+  return;
+}
+
+/* Split the HWLOC_PCI_LOCALITY string _env on ';', '\r' or '\n' and feed
+ * each non-empty piece to hwloc_pci_forced_locality_parse_one().
+ * NOTE(review): the strdup() result is not NULL-checked before use —
+ * confirm or harden upstream. */
+static void
+hwloc_pci_forced_locality_parse(struct hwloc_topology *topology, const char *_env)
+{
+  char *env = strdup(_env);
+  unsigned allocated = 0;
+  char *tmp = env;
+
+  while (1) {
+    size_t len = strcspn(tmp, ";\r\n");
+    char *next = NULL;
+
+    /* terminate the current piece; remember the next one if any */
+    if (tmp[len] != '\0') {
+      tmp[len] = '\0';
+      if (tmp[len+1] != '\0')
+	next = &tmp[len]+1;
+    }
+
+    hwloc_pci_forced_locality_parse_one(topology, tmp, &allocated);
+
+    if (next)
+      tmp = next;
+    else
+      break;
+  }
+
+  free(env);
+}
+
+/* Reset the topology's PCI-discovery state to its pristine (empty) values.
+ * Also used by hwloc_pci_discovery_exit() after freeing the arrays. */
+void
+hwloc_pci_discovery_init(struct hwloc_topology *topology)
+{
+  topology->need_pci_belowroot_apply_locality = 0;
+
+  topology->pci_has_forced_locality = 0;
+  topology->pci_forced_locality_nr = 0;
+  topology->pci_forced_locality = NULL;
+}
+
+/* Read forced PCI localities from the HWLOC_PCI_LOCALITY environment
+ * variable.  If its value names a readable file, the file contents are
+ * parsed (size-capped at 64kB); otherwise the value itself is parsed. */
+void
+hwloc_pci_discovery_prepare(struct hwloc_topology *topology)
+{
+  char *env;
+
+  env = getenv("HWLOC_PCI_LOCALITY");
+  if (env) {
+    int fd;
+
+    topology->pci_has_forced_locality = 1;
+
+    fd = open(env, O_RDONLY);
+    if (fd >= 0) {
+      struct stat st;
+      char *buffer;
+      int err = fstat(fd, &st);
+      if (!err) {
+	if (st.st_size <= 64*1024) { /* random limit large enough to store multiple cpusets for thousands of PUs */
+	  buffer = malloc(st.st_size+1);
+	  /* check the allocation before read(): passing a NULL buffer to
+	   * _read() aborts the process under the MSVC CRT */
+	  if (buffer) {
+	    if (read(fd, buffer, st.st_size) == st.st_size) {
+	      buffer[st.st_size] = '\0';
+	      hwloc_pci_forced_locality_parse(topology, buffer);
+	    }
+	    free(buffer);
+	  }
+	} else {
+	  fprintf(stderr, "Ignoring HWLOC_PCI_LOCALITY file `%s' too large (%lu bytes)\n",
+		  env, (unsigned long) st.st_size);
+	}
+      }
+      close(fd);
+    } else
+      hwloc_pci_forced_locality_parse(topology, env);
+  }
+}
+
+/* Free all forced-locality cpusets and the array holding them, then reset
+ * the PCI-discovery state via hwloc_pci_discovery_init(). */
+void
+hwloc_pci_discovery_exit(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+  unsigned i;
+  for(i=0; i<topology->pci_forced_locality_nr; i++)
+    hwloc_bitmap_free(topology->pci_forced_locality[i].cpuset);
+  free(topology->pci_forced_locality);
+
+  hwloc_pci_discovery_init(topology);
+}
+
+#ifdef HWLOC_DEBUG
+/* Debug callback: print one PCI object (bridge or device), indented by its
+ * depth in the PCI tree. */
+static void
+hwloc_pci_traverse_print_cb(void * cbdata __hwloc_attribute_unused,
+			    struct hwloc_obj *pcidev)
+{
+  char busid[14];
+  hwloc_obj_t parent;
+
+  /* indent */
+  parent = pcidev->parent;
+  while (parent) {
+    hwloc_debug("%s", "  ");
+    parent = parent->parent;
+  }
+
+  snprintf(busid, sizeof(busid), "%04x:%02x:%02x.%01x",
+           pcidev->attr->pcidev.domain, pcidev->attr->pcidev.bus, pcidev->attr->pcidev.dev, pcidev->attr->pcidev.func);
+
+  if (pcidev->type == HWLOC_OBJ_BRIDGE) {
+    if (pcidev->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+      hwloc_debug("HostBridge");
+    else
+      hwloc_debug("%s Bridge [%04x:%04x]", busid,
+		  pcidev->attr->pcidev.vendor_id, pcidev->attr->pcidev.device_id);
+    hwloc_debug(" to %04x:[%02x:%02x]\n",
+		pcidev->attr->bridge.downstream.pci.domain, pcidev->attr->bridge.downstream.pci.secondary_bus, pcidev->attr->bridge.downstream.pci.subordinate_bus);
+  } else
+    hwloc_debug("%s Device [%04x:%04x (%04x:%04x) rev=%02x class=%04x]\n", busid,
+		pcidev->attr->pcidev.vendor_id, pcidev->attr->pcidev.device_id,
+		pcidev->attr->pcidev.subvendor_id, pcidev->attr->pcidev.subdevice_id,
+		pcidev->attr->pcidev.revision, pcidev->attr->pcidev.class_id);
+}
+
+/* Apply cb to tree and, recursively, to every I/O child that is a bridge
+ * (devices are visited by cb but not descended into). */
+static void
+hwloc_pci_traverse(void * cbdata, struct hwloc_obj *tree,
+		   void (*cb)(void * cbdata, struct hwloc_obj *))
+{
+  hwloc_obj_t child;
+  cb(cbdata, tree);
+  for_each_io_child(child, tree) {
+    if (child->type == HWLOC_OBJ_BRIDGE)
+      hwloc_pci_traverse(cbdata, child, cb);
+  }
+}
+#endif /* HWLOC_DEBUG */
+
+/* Result of comparing two PCI objects by bus id:
+ *  LOWER/HIGHER  - a is entirely before/after b,
+ *  INCLUDED      - a falls inside bridge b's secondary..subordinate bus range,
+ *  SUPERSET      - b falls inside bridge a's secondary..subordinate bus range. */
+enum hwloc_pci_busid_comparison_e {
+  HWLOC_PCI_BUSID_LOWER,
+  HWLOC_PCI_BUSID_HIGHER,
+  HWLOC_PCI_BUSID_INCLUDED,
+  HWLOC_PCI_BUSID_SUPERSET
+};
+
+/* Compare a and b by (domain, bus, dev, func), with bridges owning a range
+ * of buses.  a and b must have distinct bus ids: reaching the end aborts. */
+static enum hwloc_pci_busid_comparison_e
+hwloc_pci_compare_busids(struct hwloc_obj *a, struct hwloc_obj *b)
+{
+#ifdef HWLOC_DEBUG
+  if (a->type == HWLOC_OBJ_BRIDGE)
+    assert(a->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
+  if (b->type == HWLOC_OBJ_BRIDGE)
+    assert(b->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
+#endif
+
+  if (a->attr->pcidev.domain < b->attr->pcidev.domain)
+    return HWLOC_PCI_BUSID_LOWER;
+  if (a->attr->pcidev.domain > b->attr->pcidev.domain)
+    return HWLOC_PCI_BUSID_HIGHER;
+
+  /* same domain: check bridge bus-range containment before plain ordering */
+  if (a->type == HWLOC_OBJ_BRIDGE
+      && b->attr->pcidev.bus >= a->attr->bridge.downstream.pci.secondary_bus
+      && b->attr->pcidev.bus <= a->attr->bridge.downstream.pci.subordinate_bus)
+    return HWLOC_PCI_BUSID_SUPERSET;
+  if (b->type == HWLOC_OBJ_BRIDGE
+      && a->attr->pcidev.bus >= b->attr->bridge.downstream.pci.secondary_bus
+      && a->attr->pcidev.bus <= b->attr->bridge.downstream.pci.subordinate_bus)
+    return HWLOC_PCI_BUSID_INCLUDED;
+
+  if (a->attr->pcidev.bus < b->attr->pcidev.bus)
+    return HWLOC_PCI_BUSID_LOWER;
+  if (a->attr->pcidev.bus > b->attr->pcidev.bus)
+    return HWLOC_PCI_BUSID_HIGHER;
+
+  if (a->attr->pcidev.dev < b->attr->pcidev.dev)
+    return HWLOC_PCI_BUSID_LOWER;
+  if (a->attr->pcidev.dev > b->attr->pcidev.dev)
+    return HWLOC_PCI_BUSID_HIGHER;
+
+  if (a->attr->pcidev.func < b->attr->pcidev.func)
+    return HWLOC_PCI_BUSID_LOWER;
+  if (a->attr->pcidev.func > b->attr->pcidev.func)
+    return HWLOC_PCI_BUSID_HIGHER;
+
+  /* Should never reach here.  Abort on both debug builds and
+     non-debug builds */
+  assert(0);
+  fprintf(stderr, "Bad assertion in hwloc %s:%d (aborting)\n", __FILE__, __LINE__);
+  exit(1);
+}
+
+/* Insert new into the bus-id-sorted sibling list *parent_io_first_child_p
+ * under parent.  If new falls inside an existing bridge's bus range, recurse
+ * below that bridge.  If new is itself a bridge, any following siblings that
+ * fall inside its bus range are re-parented under it. */
+static void
+hwloc_pci_add_object(struct hwloc_obj *parent, struct hwloc_obj **parent_io_first_child_p, struct hwloc_obj *new)
+{
+  struct hwloc_obj **curp, **childp;
+
+  curp = parent_io_first_child_p;
+  while (*curp) {
+    enum hwloc_pci_busid_comparison_e comp = hwloc_pci_compare_busids(new, *curp);
+    switch (comp) {
+    case HWLOC_PCI_BUSID_HIGHER:
+      /* go further */
+      curp = &(*curp)->next_sibling;
+      continue;
+    case HWLOC_PCI_BUSID_INCLUDED:
+      /* insert new below current bridge */
+      hwloc_pci_add_object(*curp, &(*curp)->io_first_child, new);
+      return;
+    case HWLOC_PCI_BUSID_LOWER:
+    case HWLOC_PCI_BUSID_SUPERSET: {
+      /* insert new before current */
+      new->next_sibling = *curp;
+      *curp = new;
+      new->parent = parent;
+      if (new->type == HWLOC_OBJ_BRIDGE) {
+	/* look at remaining siblings and move some below new */
+	childp = &new->io_first_child;
+	curp = &new->next_sibling;
+	while (*curp) {
+	  hwloc_obj_t cur = *curp;
+	  if (hwloc_pci_compare_busids(new, cur) == HWLOC_PCI_BUSID_LOWER) {
+	    /* this sibling remains under root, after new. */
+	    if (cur->attr->pcidev.domain > new->attr->pcidev.domain
+		|| cur->attr->pcidev.bus > new->attr->bridge.downstream.pci.subordinate_bus)
+	      /* this sibling is even above new's subordinate bus, no other sibling could go below new */
+	      return;
+	    curp = &cur->next_sibling;
+	  } else {
+	    /* this sibling goes under new: unlink it and append it to new's children */
+	    *childp = cur;
+	    *curp = cur->next_sibling;
+	    (*childp)->parent = new;
+	    (*childp)->next_sibling = NULL;
+	    childp = &(*childp)->next_sibling;
+	  }
+	}
+      }
+      return;
+    }
+    }
+  }
+  /* add to the end of the list if higher than everybody */
+  new->parent = parent;
+  new->next_sibling = NULL;
+  *curp = new;
+}
+
+/* Public wrapper: insert obj into the top-level PCI tree *treep, keeping
+ * the tree sorted/nested by bus id. */
+void
+hwloc_pcidisc_tree_insert_by_busid(struct hwloc_obj **treep,
+				   struct hwloc_obj *obj)
+{
+  hwloc_pci_add_object(NULL /* no parent on top of tree */, treep, obj);
+}
+
+/* Attach the discovered PCI tree old_tree below the topology root.
+ * Unless bridges are filtered out, one synthetic hostbridge object is
+ * created per upstream bus and consecutive same-domain/bus objects are
+ * grouped under it.  Always returns 0. */
+int
+hwloc_pcidisc_tree_attach(struct hwloc_topology *topology, struct hwloc_obj *old_tree)
+{
+  struct hwloc_obj **next_hb_p;
+  enum hwloc_type_filter_e bfilter;
+
+  if (!old_tree)
+    /* found nothing, exit */
+    return 0;
+
+#ifdef HWLOC_DEBUG
+  hwloc_debug("%s", "\nPCI hierarchy:\n");
+  hwloc_pci_traverse(NULL, old_tree, hwloc_pci_traverse_print_cb);
+  hwloc_debug("%s", "\n");
+#endif
+
+  /* find the tail of the root's existing I/O children list */
+  next_hb_p = &hwloc_get_root_obj(topology)->io_first_child;
+  while (*next_hb_p)
+    next_hb_p = &((*next_hb_p)->next_sibling);
+
+  bfilter = topology->type_filter[HWLOC_OBJ_BRIDGE];
+  if (bfilter == HWLOC_TYPE_FILTER_KEEP_NONE) {
+    /* bridges filtered out: attach the raw tree without hostbridges */
+    *next_hb_p = old_tree;
+    topology->modified = 1;
+    goto done;
+  }
+
+  /*
+   * tree points to all objects connected to any upstream bus in the machine.
+   * We now create one real hostbridge object per upstream bus.
+   * It's not actually a PCI device so we have to create it.
+   */
+  while (old_tree) {
+    /* start a new host bridge */
+    struct hwloc_obj *hostbridge = hwloc_alloc_setup_object(topology, HWLOC_OBJ_BRIDGE, HWLOC_UNKNOWN_INDEX);
+    struct hwloc_obj **dstnextp = &hostbridge->io_first_child;
+    struct hwloc_obj **srcnextp = &old_tree;
+    struct hwloc_obj *child = *srcnextp;
+    unsigned short current_domain = child->attr->pcidev.domain;
+    unsigned char current_bus = child->attr->pcidev.bus;
+    unsigned char current_subordinate = current_bus;
+
+    hwloc_debug("Starting new PCI hostbridge %04x:%02x\n", current_domain, current_bus);
+
+  next_child:
+    /* remove next child from tree */
+    *srcnextp = child->next_sibling;
+    /* append it to hostbridge */
+    *dstnextp = child;
+    child->parent = hostbridge;
+    child->next_sibling = NULL;
+    dstnextp = &child->next_sibling;
+
+    /* compute hostbridge secondary/subordinate buses */
+    if (child->type == HWLOC_OBJ_BRIDGE
+	&& child->attr->bridge.downstream.pci.subordinate_bus > current_subordinate)
+      current_subordinate = child->attr->bridge.downstream.pci.subordinate_bus;
+
+    /* use next child if it has the same domains/bus */
+    child = *srcnextp;
+    if (child
+	&& child->attr->pcidev.domain == current_domain
+	&& child->attr->pcidev.bus == current_bus)
+      goto next_child;
+
+    /* finish setting up this hostbridge */
+    hostbridge->attr->bridge.upstream_type = HWLOC_OBJ_BRIDGE_HOST;
+    hostbridge->attr->bridge.downstream_type = HWLOC_OBJ_BRIDGE_PCI;
+    hostbridge->attr->bridge.downstream.pci.domain = current_domain;
+    hostbridge->attr->bridge.downstream.pci.secondary_bus = current_bus;
+    hostbridge->attr->bridge.downstream.pci.subordinate_bus = current_subordinate;
+    hwloc_debug("New PCI hostbridge %04x:[%02x-%02x]\n",
+		current_domain, current_bus, current_subordinate);
+
+    *next_hb_p = hostbridge;
+    next_hb_p = &hostbridge->next_sibling;
+    topology->modified = 1; /* needed in case somebody reconnects levels before the core calls hwloc_pci_belowroot_apply_locality()
+			     * or if hwloc_pci_belowroot_apply_locality() keeps hostbridges below root.
+			     */
+  }
+
+ done:
+  topology->need_pci_belowroot_apply_locality = 1;
+  return 0;
+}
+
+/* Quirk for Xeon E5v3 cluster-on-die systems: when the OS claims a PCI bus
+ * is attached to the 2nd NUMA node of the 1st package (impossible on this
+ * architecture), move it to the 1st NUMA node of the 2nd package instead.
+ * Returns the (possibly replaced) parent.
+ * NOTE(review): the returned parent->parent->next_sibling->first_child
+ * chain assumes the matched two-package/two-NUMA shape guarantees those
+ * pointers are non-NULL — confirm against the level-building code. */
+static struct hwloc_obj *
+hwloc_pci_fixup_busid_parent(struct hwloc_topology *topology __hwloc_attribute_unused,
+			     struct hwloc_pcidev_attr_s *busid,
+			     struct hwloc_obj *parent)
+{
+  /* Xeon E5v3 in cluster-on-die mode only have PCI on the first NUMA node of each package.
+   * but many dual-processor hosts report the second PCI hierarchy on 2nd NUMA of first package.
+   */
+  if (parent->depth >= 2
+      && parent->type == HWLOC_OBJ_NUMANODE
+      && parent->sibling_rank == 1 && parent->parent->arity == 2
+      && parent->parent->type == HWLOC_OBJ_PACKAGE
+      && parent->parent->sibling_rank == 0 && parent->parent->parent->arity == 2) {
+    const char *cpumodel = hwloc_obj_get_info_by_name(parent->parent, "CPUModel");
+    if (cpumodel && strstr(cpumodel, "Xeon")) {
+      if (!hwloc_hide_errors()) {
+	fprintf(stderr, "****************************************************************************\n");
+	fprintf(stderr, "* hwloc %s has encountered an incorrect PCI locality information.\n", HWLOC_VERSION);
+	fprintf(stderr, "* PCI bus %04x:%02x is supposedly close to 2nd NUMA node of 1st package,\n",
+		busid->domain, busid->bus);
+	fprintf(stderr, "* however hwloc believes this is impossible on this architecture.\n");
+	fprintf(stderr, "* Therefore the PCI bus will be moved to 1st NUMA node of 2nd package.\n");
+	fprintf(stderr, "*\n");
+	fprintf(stderr, "* If you feel this fixup is wrong, disable it by setting in your environment\n");
+	fprintf(stderr, "* HWLOC_PCI_%04x_%02x_LOCALCPUS= (empty value), and report the problem\n",
+		busid->domain, busid->bus);
+	fprintf(stderr, "* to the hwloc's user mailing list together with the XML output of lstopo.\n");
+	fprintf(stderr, "*\n");
+	fprintf(stderr, "* You may silence this message by setting HWLOC_HIDE_ERRORS=1 in your environment.\n");
+	fprintf(stderr, "****************************************************************************\n");
+      }
+      return parent->parent->next_sibling->first_child;
+    }
+  }
+
+  return parent;
+}
+
+/* Find the topology object a PCI bus should be attached to.
+ * Locality sources, in priority order: HWLOC_PCI_LOCALITY forced entries,
+ * the deprecated HWLOC_PCI_xxxx_yy_LOCALCPUS variables, then the OS backend;
+ * falls back to the whole-machine cpuset, and to the root object when no
+ * matching insertion parent is found. */
+static struct hwloc_obj *
+hwloc__pci_find_busid_parent(struct hwloc_topology *topology, struct hwloc_pcidev_attr_s *busid)
+{
+  hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
+  hwloc_obj_t parent;
+  int forced = 0;
+  int noquirks = 0;
+  unsigned i;
+  int err;
+
+  /* try to match a forced locality */
+  if (topology->pci_has_forced_locality) {
+    for(i=0; i<topology->pci_forced_locality_nr; i++) {
+      if (busid->domain == topology->pci_forced_locality[i].domain
+	  && busid->bus >= topology->pci_forced_locality[i].bus_first
+	  && busid->bus <= topology->pci_forced_locality[i].bus_last) {
+	hwloc_bitmap_copy(cpuset, topology->pci_forced_locality[i].cpuset);
+	forced = 1;
+	break;
+      }
+    }
+    /* if pci locality was forced, even empty, don't let quirks change what the OS reports */
+    noquirks = 1;
+  }
+
+  /* deprecated force locality variables */
+  if (!forced) {
+    const char *env;
+    char envname[256];
+    /* override the cpuset with the environment if given */
+    snprintf(envname, sizeof(envname), "HWLOC_PCI_%04x_%02x_LOCALCPUS",
+	     busid->domain, busid->bus);
+    env = getenv(envname);
+    if (env) {
+      static int reported = 0;
+      if (!topology->pci_has_forced_locality && !reported) {
+	/* warn with the variable's *name* (envname), not its value */
+	fprintf(stderr, "Environment variable %s is deprecated, please use HWLOC_PCI_LOCALITY instead.\n", envname);
+	reported = 1;
+      }
+      if (*env) {
+	/* force the cpuset */
+	hwloc_debug("Overriding localcpus using %s in the environment\n", envname);
+	hwloc_bitmap_sscanf(cpuset, env);
+	forced = 1;
+      }
+      /* if env exists, even empty, don't let quirks change what the OS reports */
+      noquirks = 1;
+    }
+  }
+
+  if (!forced) {
+    /* get the cpuset by asking the OS backend. */
+    struct hwloc_backend *backend = topology->get_pci_busid_cpuset_backend;
+    if (backend)
+      err = backend->get_pci_busid_cpuset(backend, busid, cpuset);
+    else
+      err = -1;
+    if (err < 0)
+      /* if we got nothing, assume this PCI bus is attached to the top of hierarchy */
+      hwloc_bitmap_copy(cpuset, hwloc_topology_get_topology_cpuset(topology));
+  }
+
+  hwloc_debug_bitmap("Attaching PCI tree to cpuset %s\n", cpuset);
+
+  parent = hwloc_find_insert_io_parent_by_complete_cpuset(topology, cpuset);
+  if (parent) {
+    if (!noquirks)
+      /* We found a valid parent. Check that the OS didn't report invalid locality */
+      parent = hwloc_pci_fixup_busid_parent(topology, busid, parent);
+  } else {
+    /* Fallback to root */
+    parent = hwloc_get_root_obj(topology);
+  }
+
+  hwloc_bitmap_free(cpuset);
+  return parent;
+}
+
+/* Public wrapper around hwloc__pci_find_busid_parent(): build a pcidev
+ * attr from the four bus-id components and look up the attach point. */
+struct hwloc_obj *
+hwloc_pcidisc_find_busid_parent(struct hwloc_topology *topology,
+				unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+  struct hwloc_pcidev_attr_s busid;
+  busid.domain = domain;
+  busid.bus = bus;
+  busid.dev = dev;
+  busid.func = func;
+  return hwloc__pci_find_busid_parent(topology, &busid);
+}
+
+/* Move PCI trees currently queued below the root to their locality-derived
+ * parents.  No-op unless hwloc_pcidisc_tree_attach() set the
+ * need_pci_belowroot_apply_locality flag.  Always returns 0. */
+int
+hwloc_pci_belowroot_apply_locality(struct hwloc_topology *topology)
+{
+  struct hwloc_obj *root = hwloc_get_root_obj(topology);
+  struct hwloc_obj **listp, *obj;
+
+  if (!topology->need_pci_belowroot_apply_locality)
+    return 0;
+  topology->need_pci_belowroot_apply_locality = 0;
+
+  /* root->io_first_child contains some PCI hierarchies, and maybe some non-PCI things.
+   * insert the PCI trees according to their PCI-locality.
+   */
+  listp = &root->io_first_child;
+  while ((obj = *listp) != NULL) {
+    struct hwloc_pcidev_attr_s *busid;
+    struct hwloc_obj *parent;
+
+    /* skip non-PCI objects */
+    if (obj->type != HWLOC_OBJ_PCI_DEVICE
+	&& !(obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI)
+	&& !(obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI)) {
+      listp = &obj->next_sibling;
+      continue;
+    }
+
+    if (obj->type == HWLOC_OBJ_PCI_DEVICE
+	|| (obj->type == HWLOC_OBJ_BRIDGE
+	    && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI))
+      busid = &obj->attr->pcidev;
+    else {
+      /* hostbridges don't have a PCI busid for looking up locality, use their first child if PCI */
+      hwloc_obj_t child = obj->io_first_child;
+      if (child && (child->type == HWLOC_OBJ_PCI_DEVICE
+		    || (child->type == HWLOC_OBJ_BRIDGE
+			&& child->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI)))
+	busid = &obj->io_first_child->attr->pcidev;
+      else
+	/* NOTE(review): this continue does not advance listp, so a hostbridge
+	 * without a PCI first child would loop forever; appears unreachable
+	 * since hwloc_pcidisc_tree_attach() only builds hostbridges above PCI
+	 * objects — confirm before relying on it. */
+	continue;
+    }
+
+    /* attach the object (and children) where it belongs */
+    parent = hwloc__pci_find_busid_parent(topology, busid);
+    if (parent == root) {
+      /* keep this object here */
+      listp = &obj->next_sibling;
+    } else {
+      /* dequeue this object */
+      *listp = obj->next_sibling;
+      obj->next_sibling = NULL;
+      hwloc_insert_object_by_parent(topology, parent, obj);
+    }
+  }
+
+  return 0;
+}
+
+/* Search parent's I/O children (recursing into bridges whose bus range
+ * contains bus) for the exact PCI object domain:bus:dev.func.
+ * Returns parent itself when no matching child exists — callers compare
+ * the result against the object they passed in. */
+static struct hwloc_obj *
+hwloc__pci_belowroot_find_by_busid(hwloc_obj_t parent,
+				   unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+  hwloc_obj_t child;
+
+  for_each_io_child(child, parent) {
+    if (child->type == HWLOC_OBJ_PCI_DEVICE
+	|| (child->type == HWLOC_OBJ_BRIDGE
+	    && child->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI)) {
+      if (child->attr->pcidev.domain == domain
+	  && child->attr->pcidev.bus == bus
+	  && child->attr->pcidev.dev == dev
+	  && child->attr->pcidev.func == func)
+	/* that's the right bus id */
+	return child;
+      /* children are sorted by bus id, so we can stop early */
+      if (child->attr->pcidev.domain > domain
+	  || (child->attr->pcidev.domain == domain
+	      && child->attr->pcidev.bus > bus))
+	/* bus id too high, won't find anything later, return parent */
+	return parent;
+      if (child->type == HWLOC_OBJ_BRIDGE
+	  && child->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI
+	  && child->attr->bridge.downstream.pci.domain == domain
+	  && child->attr->bridge.downstream.pci.secondary_bus <= bus
+	  && child->attr->bridge.downstream.pci.subordinate_bus >= bus)
+	/* not the right bus id, but it's included in the bus below that bridge */
+	return hwloc__pci_belowroot_find_by_busid(child, domain, bus, dev, func);
+
+    } else if (child->type == HWLOC_OBJ_BRIDGE
+	       && child->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_PCI
+	       && child->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI
+	       /* non-PCI to PCI bridge, just look at the subordinate bus */
+	       && child->attr->bridge.downstream.pci.domain == domain
+	       && child->attr->bridge.downstream.pci.secondary_bus <= bus
+	       && child->attr->bridge.downstream.pci.subordinate_bus >= bus) {
+      /* contains our bus, recurse */
+      return hwloc__pci_belowroot_find_by_busid(child, domain, bus, dev, func);
+    }
+  }
+  /* didn't find anything, return parent */
+  return parent;
+}
+
+/* Public lookup of a PCI object by bus id below the topology root.
+ * Returns NULL when not found (the helper returns the root in that case). */
+struct hwloc_obj *
+hwloc_pcidisc_find_by_busid(struct hwloc_topology *topology,
+			    unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+  hwloc_obj_t root = hwloc_get_root_obj(topology);
+  hwloc_obj_t parent = hwloc__pci_belowroot_find_by_busid(root, domain, bus, dev, func);
+  if (parent == root)
+    return NULL;
+  else
+    return parent;
+}
+
+/* Offsets/bits in the 256-byte PCI config space used for capability walks */
+#define HWLOC_PCI_STATUS 0x06
+#define HWLOC_PCI_STATUS_CAP_LIST 0x10
+#define HWLOC_PCI_CAPABILITY_LIST 0x34
+#define HWLOC_PCI_CAP_LIST_ID 0
+#define HWLOC_PCI_CAP_LIST_NEXT 1
+
+/* Walk the PCI capability linked list in config (a 256-byte config-space
+ * snapshot) and return the offset of capability cap, or 0 when absent.
+ * A seen[] table guards against malformed circular lists. */
+unsigned
+hwloc_pcidisc_find_cap(const unsigned char *config, unsigned cap)
+{
+  unsigned char seen[256] = { 0 };
+  unsigned char ptr; /* unsigned char to make sure we stay within the 256-byte config space */
+
+  if (!(config[HWLOC_PCI_STATUS] & HWLOC_PCI_STATUS_CAP_LIST))
+    return 0;
+
+  /* pointers are DWORD-aligned, hence the & ~3 */
+  for (ptr = config[HWLOC_PCI_CAPABILITY_LIST] & ~3;
+       ptr; /* exit if next is 0 */
+       ptr = config[ptr + HWLOC_PCI_CAP_LIST_NEXT] & ~3) {
+    unsigned char id;
+
+    /* Looped around! */
+    if (seen[ptr])
+      break;
+    seen[ptr] = 1;
+
+    id = config[ptr + HWLOC_PCI_CAP_LIST_ID];
+    if (id == cap)
+      return ptr;
+    if (id == 0xff) /* exit if id is 0 or 0xff */
+      break;
+  }
+  return 0;
+}
+
+/* PCI Express Link Status register: offset within the PCIe capability,
+ * and masks for the current link speed (generation) and width (lanes) */
+#define HWLOC_PCI_EXP_LNKSTA 0x12
+#define HWLOC_PCI_EXP_LNKSTA_SPEED 0x000f
+#define HWLOC_PCI_EXP_LNKSTA_WIDTH 0x03f0
+
+/* Compute the PCIe link data-rate in GB/s from the Link Status register
+ * found at offset (the PCIe capability) inside config.  Always returns 0.
+ * NOTE(review): the raw 4-byte memcpy into an unsigned assumes a
+ * little-endian host matching config-space byte order — confirm. */
+int
+hwloc_pcidisc_find_linkspeed(const unsigned char *config,
+			     unsigned offset, float *linkspeed)
+{
+  unsigned linksta, speed, width;
+  float lanespeed;
+
+  memcpy(&linksta, &config[offset + HWLOC_PCI_EXP_LNKSTA], 4);
+  speed = linksta & HWLOC_PCI_EXP_LNKSTA_SPEED; /* PCIe generation */
+  width = (linksta & HWLOC_PCI_EXP_LNKSTA_WIDTH) >> 4; /* how many lanes */
+  /* PCIe Gen1 = 2.5GT/s signal-rate per lane with 8/10 encoding    = 0.25GB/s data-rate per lane
+   * PCIe Gen2 = 5  GT/s signal-rate per lane with 8/10 encoding    = 0.5 GB/s data-rate per lane
+   * PCIe Gen3 = 8  GT/s signal-rate per lane with 128/130 encoding = 1   GB/s data-rate per lane
+   * PCIe Gen4 = 16 GT/s signal-rate per lane with 128/130 encoding = 2   GB/s data-rate per lane
+   */
+
+  /* lanespeed in Gbit/s */
+  if (speed <= 2)
+    lanespeed = 2.5f * speed * 0.8f;
+  else
+    lanespeed = 8.0f * (1<<(speed-3)) * 128/130; /* assume Gen5 will be 32 GT/s and so on */
+
+  /* linkspeed in GB/s */
+  *linkspeed = lanespeed * width / 8;
+  return 0;
+}
+
+/* Config-space header-type byte and the class code of PCI-to-PCI bridges */
+#define HWLOC_PCI_HEADER_TYPE 0x0e
+#define HWLOC_PCI_HEADER_TYPE_BRIDGE 1
+#define HWLOC_PCI_CLASS_BRIDGE_PCI 0x0604
+
+/* Decide whether a device is a PCI-to-PCI bridge: it must have the bridge
+ * class code AND a type-1 (bridge) config-space header; anything else is
+ * treated as a plain PCI device. */
+hwloc_obj_type_t
+hwloc_pcidisc_check_bridge_type(unsigned device_class, const unsigned char *config)
+{
+  unsigned char headertype;
+
+  if (device_class != HWLOC_PCI_CLASS_BRIDGE_PCI)
+    return HWLOC_OBJ_PCI_DEVICE;
+
+  /* bit 7 of the header type is the multi-function flag, mask it off */
+  headertype = config[HWLOC_PCI_HEADER_TYPE] & 0x7f;
+  return (headertype == HWLOC_PCI_HEADER_TYPE_BRIDGE)
+    ? HWLOC_OBJ_BRIDGE : HWLOC_OBJ_PCI_DEVICE;
+}
+
+/* Bus-number registers in a type-1 (bridge) config-space header */
+#define HWLOC_PCI_PRIMARY_BUS 0x18
+#define HWLOC_PCI_SECONDARY_BUS 0x19
+#define HWLOC_PCI_SUBORDINATE_BUS 0x1a
+
+/* Fill obj's bridge attributes (downstream bus range) from its config-space
+ * snapshot.  obj's upstream pcidev attr must already be set.
+ * On invalid secondary/subordinate bus values the object is freed and -1
+ * is returned; returns 0 on success. */
+int
+hwloc_pcidisc_setup_bridge_attr(hwloc_obj_t obj,
+				const unsigned char *config)
+{
+  struct hwloc_bridge_attr_s *battr = &obj->attr->bridge;
+  struct hwloc_pcidev_attr_s *pattr = &battr->upstream.pci;
+
+  if (config[HWLOC_PCI_PRIMARY_BUS] != pattr->bus) {
+    /* Sometimes the config space contains 00 instead of the actual primary bus number.
+     * Always trust the bus ID because it was built by the system which has more information
+     * to workaround such problems (e.g. ACPI information about PCI parent/children).
+     */
+    hwloc_debug("  %04x:%02x:%02x.%01x bridge with (ignored) invalid PCI_PRIMARY_BUS %02x\n",
+		pattr->domain, pattr->bus, pattr->dev, pattr->func, config[HWLOC_PCI_PRIMARY_BUS]);
+  }
+
+  battr->upstream_type = HWLOC_OBJ_BRIDGE_PCI;
+  battr->downstream_type = HWLOC_OBJ_BRIDGE_PCI;
+  battr->downstream.pci.domain = pattr->domain;
+  battr->downstream.pci.secondary_bus = config[HWLOC_PCI_SECONDARY_BUS];
+  battr->downstream.pci.subordinate_bus = config[HWLOC_PCI_SUBORDINATE_BUS];
+
+  if (battr->downstream.pci.secondary_bus <= pattr->bus
+      || battr->downstream.pci.subordinate_bus <= pattr->bus
+      || battr->downstream.pci.secondary_bus > battr->downstream.pci.subordinate_bus) {
+    /* This should catch most cases of invalid bridge information
+     * (e.g. 00 for secondary and subordinate).
+     * Ideally we would also check that [secondary-subordinate] is included
+     * in the parent bridge [secondary+1:subordinate]. But that's hard to do
+     * because objects may be discovered out of order (especially in the fsroot case).
+     */
+    hwloc_debug("  %04x:%02x:%02x.%01x bridge has invalid secondary-subordinate buses [%02x-%02x]\n",
+		pattr->domain, pattr->bus, pattr->dev, pattr->func,
+		battr->downstream.pci.secondary_bus, battr->downstream.pci.subordinate_bus);
+    hwloc_free_unlinked_object(obj);
+    return -1;
+  }
+
+  return 0;
+}
+
+/*
+ * Return a short human-readable name for a 16-bit PCI class ID
+ * (base class in the upper byte, subclass in the lower byte).
+ * Falls back to the base-class name when the subclass is unknown,
+ * and to "Other" when the base class itself is unknown.
+ */
+const char *
+hwloc_pci_class_string(unsigned short class_id)
+{
+  /* See https://pci-ids.ucw.cz/read/PD/ */
+  switch ((class_id & 0xff00) >> 8) {
+    case 0x00:
+      /* base class 0x00: devices built before class codes were defined */
+      switch (class_id) {
+	case 0x0001: return "VGA";
+      }
+      break;
+    case 0x01:
+      switch (class_id) {
+	case 0x0100: return "SCSI";
+	case 0x0101: return "IDE";
+	case 0x0102: return "Floppy";
+	case 0x0103: return "IPI";
+	case 0x0104: return "RAID";
+	case 0x0105: return "ATA";
+	case 0x0106: return "SATA";
+	case 0x0107: return "SAS";
+	case 0x0108: return "NVMExp";
+      }
+      return "Storage";
+    case 0x02:
+      switch (class_id) {
+	case 0x0200: return "Ethernet";
+	case 0x0201: return "TokenRing";
+	case 0x0202: return "FDDI";
+	case 0x0203: return "ATM";
+	case 0x0204: return "ISDN";
+	case 0x0205: return "WorldFip";
+	case 0x0206: return "PICMG";
+	case 0x0207: return "InfiniBand";
+	case 0x0208: return "Fabric";
+      }
+      return "Network";
+    case 0x03:
+      switch (class_id) {
+	case 0x0300: return "VGA";
+	case 0x0301: return "XGA";
+	case 0x0302: return "3D";
+      }
+      return "Display";
+    case 0x04:
+      switch (class_id) {
+	case 0x0400: return "MultimediaVideo";
+	case 0x0401: return "MultimediaAudio";
+	case 0x0402: return "Telephony";
+	case 0x0403: return "AudioDevice";
+      }
+      return "Multimedia";
+    case 0x05:
+      switch (class_id) {
+	case 0x0500: return "RAM";
+	case 0x0501: return "Flash";
+      }
+      return "Memory";
+    case 0x06:
+      switch (class_id) {
+	case 0x0600: return "HostBridge";
+	case 0x0601: return "ISABridge";
+	case 0x0602: return "EISABridge";
+	case 0x0603: return "MicroChannelBridge";
+	case 0x0604: return "PCIBridge";
+	case 0x0605: return "PCMCIABridge";
+	case 0x0606: return "NubusBridge";
+	case 0x0607: return "CardBusBridge";
+	case 0x0608: return "RACEwayBridge";
+	case 0x0609: return "SemiTransparentPCIBridge";
+	case 0x060a: return "InfiniBandPCIHostBridge";
+      }
+      return "Bridge";
+    case 0x07:
+      switch (class_id) {
+	case 0x0700: return "Serial";
+	case 0x0701: return "Parallel";
+	case 0x0702: return "MultiportSerial";
+	case 0x0703: return "Model"; /* NOTE(review): 0x0703 is the Modem subclass; "Model" matches upstream hwloc, kept as-is */
+	case 0x0704: return "GPIB";
+	case 0x0705: return "SmartCard";
+      }
+      return "Communication";
+    case 0x08:
+      switch (class_id) {
+	case 0x0800: return "PIC";
+	case 0x0801: return "DMA";
+	case 0x0802: return "Timer";
+	case 0x0803: return "RTC";
+	case 0x0804: return "PCIHotPlug";
+	case 0x0805: return "SDHost";
+	case 0x0806: return "IOMMU";
+      }
+      return "SystemPeripheral";
+    case 0x09:
+      switch (class_id) {
+	case 0x0900: return "Keyboard";
+	case 0x0901: return "DigitizerPen";
+	case 0x0902: return "Mouse";
+	case 0x0903: return "Scanern"; /* NOTE(review): upstream typo for "Scanner"; runtime string kept identical to vendored hwloc */
+	case 0x0904: return "Gameport";
+      }
+      return "Input";
+    case 0x0a:
+      return "DockingStation";
+    case 0x0b:
+      switch (class_id) {
+	case 0x0b00: return "386";
+	case 0x0b01: return "486";
+	case 0x0b02: return "Pentium";
+/* 0x0b03 and 0x0b04 might be Pentium and P6 ? */
+	case 0x0b10: return "Alpha";
+	case 0x0b20: return "PowerPC";
+	case 0x0b30: return "MIPS";
+	case 0x0b40: return "Co-Processor";
+      }
+      return "Processor";
+    case 0x0c:
+      switch (class_id) {
+	case 0x0c00: return "FireWire";
+	case 0x0c01: return "ACCESS";
+	case 0x0c02: return "SSA";
+	case 0x0c03: return "USB";
+	case 0x0c04: return "FibreChannel";
+	case 0x0c05: return "SMBus";
+	case 0x0c06: return "InfiniBand";
+	case 0x0c07: return "IPMI-SMIC";
+	case 0x0c08: return "SERCOS";
+	case 0x0c09: return "CANBUS";
+      }
+      return "SerialBus";
+    case 0x0d:
+      switch (class_id) {
+	case 0x0d00: return "IRDA";
+	case 0x0d01: return "ConsumerIR";
+	case 0x0d10: return "RF";
+	case 0x0d11: return "Bluetooth";
+	case 0x0d12: return "Broadband";
+	case 0x0d20: return "802.1a";
+	case 0x0d21: return "802.1b";
+      }
+      return "Wireless";
+    case 0x0e:
+      switch (class_id) {
+	case 0x0e00: return "I2O";
+      }
+      return "Intelligent";
+    case 0x0f:
+      return "Satellite";
+    case 0x10:
+      return "Encryption";
+    case 0x11:
+      return "SignalProcessing";
+    case 0x12:
+      return "ProcessingAccelerator";
+    case 0x13:
+      return "Instrumentation";
+    case 0x40:
+      return "Co-Processor";
+  }
+  return "Other";
+}
diff --git a/src/3rdparty/hwloc/src/shmem.c b/src/3rdparty/hwloc/src/shmem.c
new file mode 100644
index 000000000..6c507f522
--- /dev/null
+++ b/src/3rdparty/hwloc/src/shmem.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright © 2017-2018 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/shmem.h>
+#include <private/private.h>
+
+#ifndef HWLOC_WIN_SYS
+
+#include <sys/mman.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <assert.h>
+
+#define HWLOC_SHMEM_HEADER_VERSION 1
+
+struct hwloc_shmem_header {
+  uint32_t header_version; /* sanity check */
+  uint32_t header_length; /* where the actual topology starts in the file/mapping */
+  uint64_t mmap_address; /* virtual address to pass to mmap */
+  uint64_t mmap_length; /* length to pass to mmap (includes the header) */
+};
+
+#define HWLOC_SHMEM_MALLOC_ALIGN 8UL
+
+/* Bump-pointer allocator used while duplicating a topology straight into an
+ * already-mmap'ed shared-memory region: return the current cursor and advance
+ * tma->data by the length rounded up to HWLOC_SHMEM_MALLOC_ALIGN.
+ * Never fails; nothing is ever freed (the caller sets tma->dontfree). */
+static void *
+tma_shmem_malloc(struct hwloc_tma * tma,
+		 size_t length)
+{
+  void *current = tma->data;
+  tma->data = (char*)tma->data  + ((length + HWLOC_SHMEM_MALLOC_ALIGN - 1) & ~(HWLOC_SHMEM_MALLOC_ALIGN - 1));
+  return current;
+
+}
+
+/* Counting allocator for hwloc_shmem_topology_get_length(): accumulate the
+ * aligned size of every allocation into the size_t pointed to by tma->data,
+ * while still returning real malloc'ed memory so the duplicated topology can
+ * be destroyed normally afterwards. */
+static void *
+tma_get_length_malloc(struct hwloc_tma * tma,
+		      size_t length)
+{
+  size_t *tma_length = tma->data;
+  *tma_length += (length + HWLOC_SHMEM_MALLOC_ALIGN - 1) & ~(HWLOC_SHMEM_MALLOC_ALIGN - 1);
+  return malloc(length);
+
+}
+
+/* Compute the file/mapping length required by hwloc_shmem_topology_write():
+ * duplicate the topology with the counting allocator to measure it, add the
+ * shmem header, and round up to a full page for mmap().
+ * flags must be 0.  Returns 0 on success, -1 with errno set on error. */
+int
+hwloc_shmem_topology_get_length(hwloc_topology_t topology,
+				size_t *lengthp,
+				unsigned long flags)
+{
+  hwloc_topology_t new;
+  struct hwloc_tma tma;
+  size_t length = 0;
+  unsigned long pagesize = hwloc_getpagesize(); /* round-up to full page for mmap() */
+  int err;
+
+  if (flags) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  tma.malloc = tma_get_length_malloc;
+  tma.dontfree = 0;
+  tma.data = &length;
+
+  /* dry-run duplication: we only need the accumulated allocation length */
+  err = hwloc__topology_dup(&new, topology, &tma);
+  if (err < 0)
+    return err;
+  hwloc_topology_destroy(new);
+
+  *lengthp = (sizeof(struct hwloc_shmem_header) + length + pagesize - 1) & ~(pagesize - 1);
+  return 0;
+}
+
+/* Serialize the topology into fd at [fileoffset; fileoffset+length):
+ * write a hwloc_shmem_header, mmap that range at the caller-chosen
+ * mmap_address, and duplicate the topology directly into the mapping with
+ * the bump-pointer allocator, so a later adopt() at the same address can
+ * reuse all internal pointers as-is.  flags must be 0.
+ * Returns 0 on success, -1 with errno set (EBUSY if the address was taken). */
+int
+hwloc_shmem_topology_write(hwloc_topology_t topology,
+			   int fd, hwloc_uint64_t fileoffset,
+			   void *mmap_address, size_t length,
+			   unsigned long flags)
+{
+  hwloc_topology_t new;
+  struct hwloc_tma tma;
+  struct hwloc_shmem_header header;
+  void *mmap_res;
+  int err;
+
+  if (flags) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* refresh old topology distances so that we don't uselessly duplicate invalid distances
+   * without being able to free() them.
+   */
+  hwloc_internal_distances_refresh(topology);
+
+  header.header_version = HWLOC_SHMEM_HEADER_VERSION;
+  header.header_length = sizeof(header);
+  header.mmap_address = (uintptr_t) mmap_address;
+  header.mmap_length = length;
+
+  /* NOTE(review): err is an int but lseek() returns off_t — a fileoffset
+   * beyond INT_MAX would be truncated here; confirm callers keep offsets small */
+  err = lseek(fd, fileoffset, SEEK_SET);
+  if (err < 0)
+    return -1;
+
+  err = write(fd, &header, sizeof(header));
+  if (err != sizeof(header))
+    return -1;
+
+  err = ftruncate(fd, fileoffset + length);
+  if (err < 0)
+    return -1;
+
+  mmap_res = mmap(mmap_address, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, fileoffset);
+  if (mmap_res == MAP_FAILED)
+    return -1;
+  if (mmap_res != mmap_address) {
+    /* kernel placed the mapping elsewhere: caller's chosen address is unusable */
+    munmap(mmap_res, length);
+    errno = EBUSY;
+    return -1;
+  }
+
+  tma.malloc = tma_shmem_malloc;
+  tma.dontfree = 1;
+  tma.data = (char *)mmap_res + sizeof(header);
+  err = hwloc__topology_dup(&new, topology, &tma);
+  if (err < 0)
+    return err;
+  assert((char*)new == (char*)mmap_address + sizeof(header));
+
+  /* NOTE(review): this compares the mapping start (already known equal to
+   * mmap_address), not the allocator cursor tma.data — looks vacuous as an
+   * overflow check; confirm intent against upstream */
+  assert((char *)mmap_res <= (char *)mmap_address + length);
+
+  /* now refresh the new distances so that adopters can use them without refreshing the R/O shmem mapping */
+  hwloc_internal_distances_refresh(new);
+
+  /* topology is saved, release resources now */
+  munmap(mmap_address, length);
+  hwloc_components_fini();
+
+  return 0;
+}
+
+/* Map a topology previously stored by hwloc_shmem_topology_write() at the
+ * exact same virtual address and adopt it: validate the on-file header,
+ * mmap the range read-only, check the stored topology's ABI, then build a
+ * small local writable hwloc_topology wrapper around the shared read-only
+ * data (local binding hooks, duplicated support arrays, cleared userdata
+ * callbacks).  flags must be 0.
+ * Returns 0 on success, -1 with errno set (EBUSY if the address was taken,
+ * EINVAL on header/ABI mismatch). */
+int
+hwloc_shmem_topology_adopt(hwloc_topology_t *topologyp,
+			   int fd, hwloc_uint64_t fileoffset,
+			   void *mmap_address, size_t length,
+			   unsigned long flags)
+{
+  hwloc_topology_t new, old;
+  struct hwloc_shmem_header header;
+  void *mmap_res;
+  int err;
+
+  if (flags) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  err = lseek(fd, fileoffset, SEEK_SET);
+  if (err < 0)
+    return -1;
+
+  err = read(fd, &header, sizeof(header));
+  if (err != sizeof(header))
+    return -1;
+
+  /* the file must have been written for exactly this address/length pair */
+  if (header.header_version != HWLOC_SHMEM_HEADER_VERSION
+      || header.header_length != sizeof(header)
+      || header.mmap_address != (uintptr_t) mmap_address
+      || header.mmap_length != length) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  mmap_res = mmap(mmap_address, length, PROT_READ, MAP_SHARED, fd, fileoffset);
+  if (mmap_res == MAP_FAILED)
+    return -1;
+  if (mmap_res != mmap_address) {
+    errno = EBUSY;
+    goto out_with_mmap;
+  }
+
+  old = (hwloc_topology_t)((char*)mmap_address + sizeof(header));
+  if (hwloc_topology_abi_check(old) < 0) {
+    errno = EINVAL;
+    goto out_with_mmap;
+  }
+
+  /* enforced by dup() inside shmem_topology_write() */
+  assert(old->is_loaded);
+  assert(old->backends == NULL);
+  assert(old->get_pci_busid_cpuset_backend == NULL);
+
+  hwloc_components_init();
+
+  /* duplicate the topology object so that we can use local binding_hooks
+   * (those are likely not mapped at the same location in both processes).
+   */
+  new = malloc(sizeof(struct hwloc_topology));
+  if (!new)
+    goto out_with_components;
+  memcpy(new, old, sizeof(*old));
+  new->tma = NULL;
+  new->adopted_shmem_addr = mmap_address;
+  new->adopted_shmem_length = length;
+  new->topology_abi = HWLOC_TOPOLOGY_ABI;
+  /* setting binding hooks will touch support arrays, so duplicate them too.
+   * could avoid that by requesting a R/W mmap
+   */
+  /* NOTE(review): these three malloc()s are not NULL-checked before the
+   * memcpy()s below — confirm whether OOM here is considered unreachable */
+  new->support.discovery = malloc(sizeof(*new->support.discovery));
+  new->support.cpubind = malloc(sizeof(*new->support.cpubind));
+  new->support.membind = malloc(sizeof(*new->support.membind));
+  memcpy(new->support.discovery, old->support.discovery, sizeof(*new->support.discovery));
+  memcpy(new->support.cpubind, old->support.cpubind, sizeof(*new->support.cpubind));
+  memcpy(new->support.membind, old->support.membind, sizeof(*new->support.membind));
+  hwloc_set_binding_hooks(new);
+  /* clear userdata callbacks pointing to the writer process' functions */
+  new->userdata_export_cb = NULL;
+  new->userdata_import_cb = NULL;
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(new);
+
+  *topologyp = new;
+  return 0;
+
+ out_with_components:
+  hwloc_components_fini();
+ out_with_mmap:
+  munmap(mmap_res, length);
+  return -1;
+}
+
+/* Undo hwloc_shmem_topology_adopt(): unmap the shared region and free the
+ * locally allocated wrapper and its duplicated support arrays. */
+void
+hwloc__topology_disadopt(hwloc_topology_t topology)
+{
+  hwloc_components_fini();
+  munmap(topology->adopted_shmem_addr, topology->adopted_shmem_length);
+  free(topology->support.discovery);
+  free(topology->support.cpubind);
+  free(topology->support.membind);
+  free(topology);
+}
+
+#else /* HWLOC_WIN_SYS */
+
+/* Windows build: shared-memory topologies are unsupported, every entry
+ * point fails with ENOSYS (disadopt is a harmless no-op). */
+int
+hwloc_shmem_topology_get_length(hwloc_topology_t topology __hwloc_attribute_unused,
+				size_t *lengthp __hwloc_attribute_unused,
+				unsigned long flags __hwloc_attribute_unused)
+{
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_shmem_topology_write(hwloc_topology_t topology __hwloc_attribute_unused,
+			   int fd __hwloc_attribute_unused, hwloc_uint64_t fileoffset __hwloc_attribute_unused,
+			   void *mmap_address __hwloc_attribute_unused, size_t length __hwloc_attribute_unused,
+			   unsigned long flags __hwloc_attribute_unused)
+{
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_shmem_topology_adopt(hwloc_topology_t *topologyp __hwloc_attribute_unused,
+			   int fd __hwloc_attribute_unused, hwloc_uint64_t fileoffset __hwloc_attribute_unused,
+			   void *mmap_address __hwloc_attribute_unused, size_t length __hwloc_attribute_unused,
+			   unsigned long flags __hwloc_attribute_unused)
+{
+  errno = ENOSYS;
+  return -1;
+}
+
+void
+hwloc__topology_disadopt(hwloc_topology_t topology __hwloc_attribute_unused)
+{
+}
+
+#endif /* HWLOC_WIN_SYS */
diff --git a/src/3rdparty/hwloc/src/static-components.h b/src/3rdparty/hwloc/src/static-components.h
new file mode 100644
index 000000000..dac227a60
--- /dev/null
+++ b/src/3rdparty/hwloc/src/static-components.h
@@ -0,0 +1,15 @@
+/* NULL-terminated array of the hwloc components linked statically into this
+ * build: no_os/x86 CPU discovery, synthetic, XML (nolibxml) and the Windows
+ * backend. */
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_noos_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_synthetic_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_nolibxml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_windows_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_x86_component;
+static const struct hwloc_component * hwloc_static_components[] = {
+  &hwloc_noos_component,
+  &hwloc_xml_component,
+  &hwloc_synthetic_component,
+  &hwloc_xml_nolibxml_component,
+  &hwloc_windows_component,
+  &hwloc_x86_component,
+  NULL
+};
diff --git a/src/3rdparty/hwloc/src/topology-noos.c b/src/3rdparty/hwloc/src/topology-noos.c
new file mode 100644
index 000000000..77871eb17
--- /dev/null
+++ b/src/3rdparty/hwloc/src/topology-noos.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2017 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+
+/* Fallback discovery when no OS backend applies: create one PU level with
+ * hwloc_fallback_nbprocessors() objects (at least 1) under the root and tag
+ * the topology with uname info.  Returns -1 if another backend already
+ * filled the root cpuset, 0 on success. */
+static int
+hwloc_look_noos(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  int nbprocs;
+
+  if (topology->levels[0][0]->cpuset)
+    /* somebody discovered things */
+    return -1;
+
+  nbprocs = hwloc_fallback_nbprocessors(topology);
+  if (nbprocs >= 1)
+    topology->support.discovery->pu = 1;
+  else
+    nbprocs = 1; /* detection failed: assume a single PU but don't claim PU discovery support */
+
+  hwloc_alloc_root_sets(topology->levels[0][0]);
+  hwloc_setup_pu_level(topology, nbprocs);
+  hwloc_add_uname_info(topology, NULL);
+  return 0;
+}
+
+/* Instantiate the no_os backend; its only hook is the discovery callback. */
+static struct hwloc_backend *
+hwloc_noos_component_instantiate(struct hwloc_disc_component *component,
+				 const void *_data1 __hwloc_attribute_unused,
+				 const void *_data2 __hwloc_attribute_unused,
+				 const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend;
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    return NULL;
+  backend->discover = hwloc_look_noos;
+  return backend;
+}
+
+/* Discovery-component descriptor for the "no_os" fallback backend and its
+ * generic component wrapper, referenced from static-components.h. */
+static struct hwloc_disc_component hwloc_noos_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "no_os",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_noos_component_instantiate,
+  40, /* lower than native OS component, higher than globals */
+  1,
+  NULL
+};
+
+const struct hwloc_component hwloc_noos_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_noos_disc_component
+};
diff --git a/src/3rdparty/hwloc/src/topology-synthetic.c b/src/3rdparty/hwloc/src/topology-synthetic.c
new file mode 100644
index 000000000..1fe334d1c
--- /dev/null
+++ b/src/3rdparty/hwloc/src/topology-synthetic.c
@@ -0,0 +1,1521 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2019 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <limits.h>
+#include <assert.h>
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+
+/* Parsed attributes of one synthetic object/level. */
+struct hwloc_synthetic_attr_s {
+  hwloc_obj_type_t type;
+  unsigned depth; /* For caches/groups */
+  hwloc_obj_cache_type_t cachetype; /* For caches */
+  hwloc_uint64_t memorysize; /* For caches/memory */
+};
+
+/* State for the optional indexes= attribute of a level. */
+struct hwloc_synthetic_indexes_s {
+  /* the indexes= attribute before parsing */
+  const char *string;
+  unsigned long string_length;
+  /* the array of explicit indexes after parsing */
+  unsigned *array;
+
+  /* used while filling the topology */
+  unsigned next; /* id of the next object for that level */
+};
+
+/* One level of the synthetic description. */
+struct hwloc_synthetic_level_data_s {
+  unsigned arity;
+  unsigned long totalwidth;
+
+  struct hwloc_synthetic_attr_s attr;
+  struct hwloc_synthetic_indexes_s indexes;
+
+  /* list of memory objects (NUMA nodes, see the "[...]" syntax in
+   * hwloc_backend_synthetic_init()) attached below each object of this level */
+  struct hwloc_synthetic_attached_s {
+    struct hwloc_synthetic_attr_s attr;
+
+    struct hwloc_synthetic_attached_s *next;
+  } *attached;
+};
+
+struct hwloc_synthetic_backend_data_s {
+  /* synthetic backend parameters */
+  char *string;
+
+  unsigned long numa_attached_nr;
+  struct hwloc_synthetic_indexes_s numa_attached_indexes;
+
+#define HWLOC_SYNTHETIC_MAX_DEPTH 128
+  struct hwloc_synthetic_level_data_s level[HWLOC_SYNTHETIC_MAX_DEPTH];
+};
+
+/* one interleaving loop parsed from the indexes= attribute */
+struct hwloc_synthetic_intlv_loop_s {
+  unsigned step;
+  unsigned nb;
+  unsigned level_depth;
+};
+
+/* Parse the indexes= attribute of a synthetic level and fill indexes->array
+ * with "total" OS indexes.  Two formats are accepted: an explicit
+ * comma-separated list of numbers, or an interleaving description — either
+ * numeric "step*nb:step*nb:..." loops or a colon-separated list of level
+ * types resolved against data->level[].
+ * On any parse or consistency error the array is freed and indexes->array
+ * stays NULL; diagnostics are only printed when verbose. */
+static void
+hwloc_synthetic_process_indexes(struct hwloc_synthetic_backend_data_s *data,
+				struct hwloc_synthetic_indexes_s *indexes,
+				unsigned long total,
+				int verbose)
+{
+  const char *attr = indexes->string;
+  unsigned long length = indexes->string_length;
+  unsigned *array = NULL;
+  size_t i;
+
+  if (!attr)
+    return;
+
+  array = calloc(total, sizeof(*array));
+  if (!array) {
+    if (verbose)
+      fprintf(stderr, "Failed to allocate synthetic index array of size %lu\n", total);
+    goto out;
+  }
+
+  i = strspn(attr, "0123456789,");
+  if (i == length) {
+    /* explicit array of indexes */
+
+    for(i=0; i<total; i++) {
+      const char *next;
+      unsigned idx = strtoul(attr, (char **) &next, 10);
+      if (next == attr) {
+	if (verbose)
+	  fprintf(stderr, "Failed to read synthetic index #%lu at '%s'\n", (unsigned long) i, attr);
+	goto out_with_array;
+      }
+
+      array[i] = idx;
+      if (i != total-1) {
+	if (*next != ',') {
+	  if (verbose)
+	    fprintf(stderr, "Missing comma after synthetic index #%lu at '%s'\n", (unsigned long) i, attr);
+	  goto out_with_array;
+	}
+	attr = next+1;
+      } else {
+	attr = next;
+      }
+    }
+    indexes->array = array;
+
+  } else {
+    /* interleaving */
+    unsigned nr_loops = 1, cur_loop;
+    unsigned minstep = total;
+    unsigned long nbs = 1;
+    unsigned j, mul;
+    const char *tmp;
+
+    /* count colon-separated fields to size the loops array */
+    tmp = attr;
+    while (tmp) {
+      tmp = strchr(tmp, ':');
+      if (!tmp || tmp >= attr+length)
+	break;
+      nr_loops++;
+      tmp++;
+    }
+
+   {
+    /* nr_loops colon-separated fields, but we may need one more at the end */
+    HWLOC_VLA(struct hwloc_synthetic_intlv_loop_s, loops, nr_loops+1);
+
+    if (*attr >= '0' && *attr <= '9') {
+      /* interleaving as x*y:z*t:... */
+      unsigned step, nb;
+
+      tmp = attr;
+      cur_loop = 0;
+      while (tmp) {
+	char *tmp2, *tmp3;
+	step = (unsigned) strtol(tmp, &tmp2, 0);
+	if (tmp2 == tmp || *tmp2 != '*') {
+	  if (verbose)
+	    fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number before '*'\n", tmp);
+	  goto out_with_array;
+	}
+	if (!step) {
+	  if (verbose)
+	    fprintf(stderr, "Invalid interleaving loop with step 0 at '%s'\n", tmp);
+	  goto out_with_array;
+	}
+	tmp2++;
+	nb = (unsigned) strtol(tmp2, &tmp3, 0);
+	if (tmp3 == tmp2 || (*tmp3 && *tmp3 != ':' && *tmp3 != ')' && *tmp3 != ' ')) {
+	  if (verbose)
+	    fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number between '*' and ':'\n", tmp);
+	  goto out_with_array;
+	}
+	if (!nb) {
+	  if (verbose)
+	    fprintf(stderr, "Invalid interleaving loop with number 0 at '%s'\n", tmp2);
+	  goto out_with_array;
+	}
+	loops[cur_loop].step = step;
+	loops[cur_loop].nb = nb;
+	if (step < minstep)
+	  minstep = step;
+	nbs *= nb;
+	cur_loop++;
+	if (*tmp3 == ')' || *tmp3 == ' ')
+	  break;
+	tmp = (const char*) (tmp3+1);
+      }
+
+    } else {
+      /* interleaving as type1:type2:... */
+      hwloc_obj_type_t type;
+      union hwloc_obj_attr_u attrs;
+      int err;
+
+      /* find level depths for each interleaving loop */
+      tmp = attr;
+      cur_loop = 0;
+      while (tmp) {
+	err = hwloc_type_sscanf(tmp, &type, &attrs, sizeof(attrs));
+	if (err < 0) {
+	  if (verbose)
+	    fprintf(stderr, "Failed to read synthetic index interleaving loop type '%s'\n", tmp);
+	  goto out_with_array;
+	}
+	if (type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
+	  if (verbose)
+	    fprintf(stderr, "Misc object type disallowed in synthetic index interleaving loop type '%s'\n", tmp);
+	  goto out_with_array;
+	}
+	for(i=0; ; i++) {
+	  if (!data->level[i].arity) {
+	    loops[cur_loop].level_depth = (unsigned)-1;
+	    break;
+	  }
+	  if (type != data->level[i].attr.type)
+	    continue;
+	  if (type == HWLOC_OBJ_GROUP
+	      && attrs.group.depth != (unsigned) -1
+	      && attrs.group.depth != data->level[i].attr.depth)
+	    continue;
+	  loops[cur_loop].level_depth = (unsigned)i;
+	  break;
+	}
+	if (loops[cur_loop].level_depth == (unsigned)-1) {
+	  if (verbose)
+	    fprintf(stderr, "Failed to find level for synthetic index interleaving loop type '%s'\n",
+		    tmp);
+	  goto out_with_array;
+	}
+	tmp = strchr(tmp, ':');
+	if (!tmp || tmp > attr+length)
+	  break;
+	tmp++;
+	cur_loop++;
+      }
+
+      /* compute actual loop step/nb */
+      for(cur_loop=0; cur_loop<nr_loops; cur_loop++) {
+	unsigned mydepth = loops[cur_loop].level_depth;
+	unsigned prevdepth = 0;
+	unsigned step, nb;
+	for(i=0; i<nr_loops; i++) {
+	  if (loops[i].level_depth == mydepth && i != cur_loop) {
+	    if (verbose)
+	      fprintf(stderr, "Invalid duplicate interleaving loop type in synthetic index '%s'\n", attr);
+	    goto out_with_array;
+	  }
+	  if (loops[i].level_depth < mydepth
+	      && loops[i].level_depth > prevdepth)
+	    prevdepth = loops[i].level_depth;
+	}
+	step = total / data->level[mydepth].totalwidth; /* number of objects below us */
+	nb = data->level[mydepth].totalwidth / data->level[prevdepth].totalwidth; /* number of us within parent */
+
+	loops[cur_loop].step = step;
+	loops[cur_loop].nb = nb;
+	assert(nb);
+	assert(step);
+	if (step < minstep)
+	  minstep = step;
+	nbs *= nb;
+      }
+    }
+    assert(nbs);
+
+    if (nbs != total) {
+      /* one loop of total/nbs steps is missing, add it if it's just the smallest one */
+      if (minstep == total/nbs) {
+	loops[nr_loops].step = 1;
+	loops[nr_loops].nb = total/nbs;
+	nr_loops++;
+      } else {
+	if (verbose)
+	  fprintf(stderr, "Invalid index interleaving total width %lu instead of %lu\n", nbs, total);
+	goto out_with_array;
+      }
+    }
+
+    /* generate the array of indexes */
+    mul = 1;
+    for(i=0; i<nr_loops; i++) {
+      unsigned step = loops[i].step;
+      unsigned nb = loops[i].nb;
+      for(j=0; j<total; j++)
+	array[j] += ((j / step) % nb) * mul;
+      mul *= nb;
+    }
+
+    /* check that we have the right values (cannot pass total, cannot give duplicate 0) */
+    for(j=0; j<total; j++) {
+      if (array[j] >= total) {
+	if (verbose)
+	  fprintf(stderr, "Invalid index interleaving generates out-of-range index %u\n", array[j]);
+	goto out_with_array;
+      }
+      if (!array[j] && j) {
+	if (verbose)
+	  fprintf(stderr, "Invalid index interleaving generates duplicate index values\n");
+	goto out_with_array;
+      }
+    }
+
+    indexes->array = array;
+   }
+  }
+
+  return;
+
+ out_with_array:
+  free(array);
+ out:
+  return;
+}
+
+/* Parse a memory size with an optional case-insensitive kB/MB/GB/TB suffix
+ * (powers of 1024).  *endp is set past the number and suffix.  Returns the
+ * size in bytes. */
+static hwloc_uint64_t
+hwloc_synthetic_parse_memory_attr(const char *attr, const char **endp)
+{
+  const char *endptr;
+  hwloc_uint64_t size;
+  size = strtoull(attr, (char **) &endptr, 0);
+  if (!hwloc_strncasecmp(endptr, "TB", 2)) {
+    size <<= 40;
+    endptr += 2;
+  } else if (!hwloc_strncasecmp(endptr, "GB", 2)) {
+    size <<= 30;
+    endptr += 2;
+  } else if (!hwloc_strncasecmp(endptr, "MB", 2)) {
+    size <<= 20;
+    endptr += 2;
+  } else if (!hwloc_strncasecmp(endptr, "kB", 2)) {
+    size <<= 10;
+    endptr += 2;
+  }
+  *endp = endptr;
+  return size;
+}
+
+/* Parse a space-separated, parenthesized attribute list — "size=" (caches),
+ * "memory=" (non-caches) and "indexes=" — starting just after the opening
+ * '('.  Fills sattr->memorysize and records the raw indexes= substring into
+ * sind for later parsing by hwloc_synthetic_process_indexes().  On success
+ * *next_posp points just after the closing ')'.  Returns 0, or -1 with
+ * errno=EINVAL on malformed input. */
+static int
+hwloc_synthetic_parse_attrs(const char *attrs, const char **next_posp,
+			    struct hwloc_synthetic_attr_s *sattr,
+			    struct hwloc_synthetic_indexes_s *sind,
+			    int verbose)
+{
+  hwloc_obj_type_t type = sattr->type;
+  const char *next_pos;
+  hwloc_uint64_t memorysize = 0;
+  const char *index_string = NULL;
+  size_t index_string_length = 0;
+
+  next_pos = (const char *) strchr(attrs, ')');
+  if (!next_pos) {
+    /* NOTE(review): message text looks copy-pasted from the object-count error;
+     * runtime string kept identical to vendored hwloc */
+    if (verbose)
+      fprintf(stderr, "Missing attribute closing bracket in synthetic string doesn't have a number of objects at '%s'\n", attrs);
+    errno = EINVAL;
+    return -1;
+  }
+
+  while (')' != *attrs) {
+    int iscache = hwloc__obj_type_is_cache(type);
+
+    if (iscache && !strncmp("size=", attrs, 5)) {
+      memorysize = hwloc_synthetic_parse_memory_attr(attrs+5, &attrs);
+
+    } else if (!iscache && !strncmp("memory=", attrs, 7)) {
+      memorysize = hwloc_synthetic_parse_memory_attr(attrs+7, &attrs);
+
+    } else if (!strncmp("indexes=", attrs, 8)) {
+      /* only remember the raw substring here; it is parsed later */
+      index_string = attrs+8;
+      attrs += 8;
+      index_string_length = strcspn(attrs, " )");
+      attrs += index_string_length;
+
+    } else {
+      if (verbose)
+	fprintf(stderr, "Unknown attribute at '%s'\n", attrs);
+      errno = EINVAL;
+      return -1;
+    }
+
+    if (' ' == *attrs)
+      attrs++;
+    else if (')' != *attrs) {
+      if (verbose)
+	fprintf(stderr, "Missing parameter separator at '%s'\n", attrs);
+      errno = EINVAL;
+      return -1;
+    }
+  }
+
+  sattr->memorysize = memorysize;
+
+  if (index_string) {
+    if (sind->string && verbose)
+      fprintf(stderr, "Overwriting duplicate indexes attribute with last occurence\n");
+    sind->string = index_string;
+    sind->string_length = (unsigned long)index_string_length;
+  }
+
+  *next_posp = next_pos+1;
+  return 0;
+}
+
+/* Free the per-level parsing data: walk levels until the first one with
+ * arity 0 (the terminator), freeing each level's attached list and index
+ * array, then the shared NUMA-attached index array. */
+static void
+hwloc_synthetic_free_levels(struct hwloc_synthetic_backend_data_s *data)
+{
+  unsigned i;
+  for(i=0; i<HWLOC_SYNTHETIC_MAX_DEPTH; i++) {
+    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+    struct hwloc_synthetic_attached_s **pprev = &curlevel->attached;
+    while (*pprev) {
+      struct hwloc_synthetic_attached_s *cur = *pprev;
+      *pprev = cur->next;
+      free(cur);
+    }
+    free(curlevel->indexes.array);
+    if (!curlevel->arity)
+      break;
+  }
+  free(data->numa_attached_indexes.array);
+}
+
+/* Read from description a series of integers describing a symmetrical
+   topology and update the hwloc_synthetic_backend_data_s accordingly.  On
+   success, return zero.  */
+static int
+hwloc_backend_synthetic_init(struct hwloc_synthetic_backend_data_s *data,
+			     const char *description)
+{
+  const char *pos, *next_pos;
+  unsigned long item, count;
+  unsigned i;
+  int type_count[HWLOC_OBJ_TYPE_MAX];
+  unsigned unset;
+  int verbose = 0;
+  const char *env = getenv("HWLOC_SYNTHETIC_VERBOSE");
+  int err;
+  unsigned long totalarity = 1;
+
+  if (env)
+    verbose = atoi(env);
+
+  data->numa_attached_nr = 0;
+  data->numa_attached_indexes.array = NULL;
+
+  /* default values before we add root attributes */
+  data->level[0].totalwidth = 1;
+  data->level[0].attr.type = HWLOC_OBJ_MACHINE;
+  data->level[0].indexes.string = NULL;
+  data->level[0].indexes.array = NULL;
+  data->level[0].attr.memorysize = 0;
+  data->level[0].attached = NULL;
+  type_count[HWLOC_OBJ_MACHINE] = 1;
+  if (*description == '(') {
+    err = hwloc_synthetic_parse_attrs(description+1, &description, &data->level[0].attr, &data->level[0].indexes, verbose);
+    if (err < 0)
+      return err;
+  }
+
+  data->numa_attached_indexes.string = NULL;
+  data->numa_attached_indexes.array = NULL;
+
+  for (pos = description, count = 1; *pos; pos = next_pos) {
+    hwloc_obj_type_t type = HWLOC_OBJ_TYPE_NONE;
+    union hwloc_obj_attr_u attrs;
+
+    /* initialize parent arity to 0 so that the levels are not infinite */
+    data->level[count-1].arity = 0;
+
+    while (*pos == ' ')
+      pos++;
+
+    if (!*pos)
+      break;
+
+    if (*pos == '[') {
+      /* attached */
+      struct hwloc_synthetic_attached_s *attached, **pprev;
+      char *attr;
+
+      pos++;
+
+      if (hwloc_type_sscanf(pos, &type, &attrs, sizeof(attrs)) < 0) {
+	if (verbose)
+	  fprintf(stderr, "Synthetic string with unknown attached object type at '%s'\n", pos);
+	errno = EINVAL;
+	goto error;
+      }
+      if (type != HWLOC_OBJ_NUMANODE) {
+	if (verbose)
+	  fprintf(stderr, "Synthetic string with disallowed attached object type at '%s'\n", pos);
+	errno = EINVAL;
+	goto error;
+      }
+      data->numa_attached_nr += data->level[count-1].totalwidth;
+
+      attached = malloc(sizeof(*attached));
+      if (attached) {
+	attached->attr.type = type;
+	attached->attr.memorysize = 0;
+	/* attached->attr.depth and .cachetype unused */
+	attached->next = NULL;
+	pprev = &data->level[count-1].attached;
+	while (*pprev)
+	  pprev = &((*pprev)->next);
+	*pprev = attached;
+      }
+
+      next_pos = strchr(pos, ']');
+      if (!next_pos) {
+	if (verbose)
+	  fprintf(stderr,"Synthetic string doesn't have a closing `]' after attached object type at '%s'\n", pos);
+	errno = EINVAL;
+	goto error;
+      }
+
+      attr = strchr(pos, '(');
+      if (attr && attr < next_pos && attached) {
+	const char *dummy;
+	err = hwloc_synthetic_parse_attrs(attr+1, &dummy, &attached->attr, &data->numa_attached_indexes, verbose);
+	if (err < 0)
+	  goto error;
+      }
+
+      next_pos++;
+      continue;
+    }
+
+    /* normal level */
+
+    /* reset defaults */
+    data->level[count].indexes.string = NULL;
+    data->level[count].indexes.array = NULL;
+    data->level[count].attached = NULL;
+
+    if (*pos < '0' || *pos > '9') {
+      if (hwloc_type_sscanf(pos, &type, &attrs, sizeof(attrs)) < 0) {
+	if (!strncmp(pos, "Die", 3) || !strncmp(pos, "Tile", 4) || !strncmp(pos, "Module", 6)) {
+	  type = HWLOC_OBJ_GROUP;
+	} else {
+	  /* FIXME: allow generic "Cache" string? would require to deal with possibly duplicate cache levels */
+	  if (verbose)
+	    fprintf(stderr, "Synthetic string with unknown object type at '%s'\n", pos);
+	  errno = EINVAL;
+	  goto error;
+	}
+      }
+      if (type == HWLOC_OBJ_MACHINE || type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
+	if (verbose)
+	  fprintf(stderr, "Synthetic string with disallowed object type at '%s'\n", pos);
+	errno = EINVAL;
+	goto error;
+      }
+
+      next_pos = strchr(pos, ':');
+      if (!next_pos) {
+	if (verbose)
+	  fprintf(stderr,"Synthetic string doesn't have a `:' after object type at '%s'\n", pos);
+	errno = EINVAL;
+	goto error;
+      }
+      pos = next_pos + 1;
+    }
+
+    data->level[count].attr.type = type;
+    data->level[count].attr.depth = (unsigned) -1;
+    data->level[count].attr.cachetype = (hwloc_obj_cache_type_t) -1;
+    if (hwloc__obj_type_is_cache(type)) {
+      /* these are always initialized */
+      data->level[count].attr.depth = attrs.cache.depth;
+      data->level[count].attr.cachetype = attrs.cache.type;
+    } else if (type == HWLOC_OBJ_GROUP) {
+      /* could be -1 but will be set below */
+      data->level[count].attr.depth = attrs.group.depth;
+    }
+
+    /* number of normal children */
+    item = strtoul(pos, (char **)&next_pos, 0);
+    if (next_pos == pos) {
+      if (verbose)
+	fprintf(stderr,"Synthetic string doesn't have a number of objects at '%s'\n", pos);
+      errno = EINVAL;
+      goto error;
+    }
+    if (!item) {
+      if (verbose)
+	fprintf(stderr,"Synthetic string with disallow 0 number of objects at '%s'\n", pos);
+      errno = EINVAL;
+      goto error;
+    }
+
+    totalarity *= item;
+    data->level[count].totalwidth = totalarity;
+    data->level[count].indexes.string = NULL;
+    data->level[count].indexes.array = NULL;
+    data->level[count].attr.memorysize = 0;
+    if (*next_pos == '(') {
+      err = hwloc_synthetic_parse_attrs(next_pos+1, &next_pos, &data->level[count].attr, &data->level[count].indexes, verbose);
+      if (err < 0)
+	goto error;
+    }
+
+    if (count + 1 >= HWLOC_SYNTHETIC_MAX_DEPTH) {
+      if (verbose)
+	fprintf(stderr,"Too many synthetic levels, max %d\n", HWLOC_SYNTHETIC_MAX_DEPTH);
+      errno = EINVAL;
+      goto error;
+    }
+    if (item > UINT_MAX) {
+      if (verbose)
+	fprintf(stderr,"Too big arity, max %u\n", UINT_MAX);
+      errno = EINVAL;
+      goto error;
+    }
+
+    data->level[count-1].arity = (unsigned)item;
+    count++;
+  }
+
+  if (data->level[count-1].attr.type != HWLOC_OBJ_TYPE_NONE && data->level[count-1].attr.type != HWLOC_OBJ_PU) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string cannot use non-PU type for last level\n");
+    errno = EINVAL;
+    return -1;
+  }
+  data->level[count-1].attr.type = HWLOC_OBJ_PU;
+
+  for(i=HWLOC_OBJ_TYPE_MIN; i<HWLOC_OBJ_TYPE_MAX; i++) {
+    type_count[i] = 0;
+  }
+  for(i=count-1; i>0; i--) {
+    hwloc_obj_type_t type = data->level[i].attr.type;
+    if (type != HWLOC_OBJ_TYPE_NONE) {
+      type_count[type]++;
+    }
+  }
+
+  /* sanity checks */
+  if (!type_count[HWLOC_OBJ_PU]) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string missing ending number of PUs\n");
+    errno = EINVAL;
+    return -1;
+  } else if (type_count[HWLOC_OBJ_PU] > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string cannot have several PU levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (type_count[HWLOC_OBJ_PACKAGE] > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string cannot have several package levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (type_count[HWLOC_OBJ_NUMANODE] > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string cannot have several NUMA node levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (type_count[HWLOC_OBJ_NUMANODE] && data->numa_attached_nr) {
+    if (verbose)
+      fprintf(stderr,"Synthetic string cannot have NUMA nodes both as a level and attached\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (type_count[HWLOC_OBJ_CORE] > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string cannot have several core levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* deal with missing intermediate levels */
+  unset = 0;
+  for(i=1; i<count-1; i++) {
+    if (data->level[i].attr.type == HWLOC_OBJ_TYPE_NONE)
+      unset++;
+  }
+  if (unset && unset != count-2) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string cannot mix unspecified and specified types for levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (unset) {
+    /* we want in priority: numa, package, core, up to 3 caches, groups */
+    unsigned _count = count;
+    unsigned neednuma = 0;
+    unsigned needpack = 0;
+    unsigned needcore = 0;
+    unsigned needcaches = 0;
+    unsigned needgroups = 0;
+    /* 2 levels for machine and PU */
+    _count -= 2;
+
+    neednuma = (_count >= 1 && !data->numa_attached_nr);
+    _count -= neednuma;
+
+    needpack = (_count >= 1);
+    _count -= needpack;
+
+    needcore = (_count >= 1);
+    _count -= needcore;
+
+    needcaches = (_count > 4 ? 4 : _count);
+    _count -= needcaches;
+
+    needgroups = _count;
+
+    /* we place them in order: groups, package, numa, caches, core */
+    for(i = 0; i < needgroups; i++) {
+      unsigned depth = 1 + i;
+      data->level[depth].attr.type = HWLOC_OBJ_GROUP;
+      type_count[HWLOC_OBJ_GROUP]++;
+    }
+    if (needpack) {
+      unsigned depth = 1 + needgroups;
+      data->level[depth].attr.type = HWLOC_OBJ_PACKAGE;
+      type_count[HWLOC_OBJ_PACKAGE] = 1;
+    }
+    if (neednuma) {
+      unsigned depth = 1 + needgroups + needpack;
+      data->level[depth].attr.type = HWLOC_OBJ_NUMANODE;
+      type_count[HWLOC_OBJ_NUMANODE] = 1;
+    }
+    if (needcaches) {
+      /* priority: l2, l1, l3, l1i */
+      /* order: l3, l2, l1, l1i */
+      unsigned l3depth = 1 + needgroups + needpack + neednuma;
+      unsigned l2depth = l3depth + (needcaches >= 3);
+      unsigned l1depth = l2depth + 1;
+      unsigned l1idepth = l1depth + 1;
+      if (needcaches >= 3) {
+	data->level[l3depth].attr.type = HWLOC_OBJ_L3CACHE;
+	data->level[l3depth].attr.depth = 3;
+	data->level[l3depth].attr.cachetype = HWLOC_OBJ_CACHE_UNIFIED;
+	type_count[HWLOC_OBJ_L3CACHE] = 1;
+      }
+      data->level[l2depth].attr.type = HWLOC_OBJ_L2CACHE;
+      data->level[l2depth].attr.depth = 2;
+      data->level[l2depth].attr.cachetype = HWLOC_OBJ_CACHE_UNIFIED;
+      type_count[HWLOC_OBJ_L2CACHE] = 1;
+      if (needcaches >= 2) {
+	data->level[l1depth].attr.type = HWLOC_OBJ_L1CACHE;
+	data->level[l1depth].attr.depth = 1;
+	data->level[l1depth].attr.cachetype = HWLOC_OBJ_CACHE_DATA;
+	type_count[HWLOC_OBJ_L1CACHE] = 1;
+      }
+      if (needcaches >= 4) {
+	data->level[l1idepth].attr.type = HWLOC_OBJ_L1ICACHE;
+	data->level[l1idepth].attr.depth = 1;
+	data->level[l1idepth].attr.cachetype = HWLOC_OBJ_CACHE_INSTRUCTION;
+	type_count[HWLOC_OBJ_L1ICACHE] = 1;
+      }
+    }
+    if (needcore) {
+      unsigned depth = 1 + needgroups + needpack + neednuma + needcaches;
+      data->level[depth].attr.type = HWLOC_OBJ_CORE;
+      type_count[HWLOC_OBJ_CORE] = 1;
+    }
+  }
+
+  /* enforce a NUMA level */
+  if (!type_count[HWLOC_OBJ_NUMANODE] && !data->numa_attached_nr) {
+    /* insert a NUMA level below the automatic machine root */
+    if (verbose)
+      fprintf(stderr, "Inserting a NUMA level with a single object at depth 1\n");
+    /* move existing levels by one */
+    memmove(&data->level[2], &data->level[1], count*sizeof(struct hwloc_synthetic_level_data_s));
+    data->level[1].attr.type = HWLOC_OBJ_NUMANODE;
+    data->level[1].indexes.string = NULL;
+    data->level[1].indexes.array = NULL;
+    data->level[1].attr.memorysize = 0;
+    data->level[1].totalwidth = data->level[0].totalwidth;
+    /* update arity to insert a single NUMA node per parent */
+    data->level[1].arity = data->level[0].arity;
+    data->level[0].arity = 1;
+    count++;
+  }
+
+  for (i=0; i<count; i++) {
+    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+    hwloc_obj_type_t type = curlevel->attr.type;
+
+    if (type == HWLOC_OBJ_GROUP) {
+      if (curlevel->attr.depth == (unsigned)-1)
+	curlevel->attr.depth = type_count[HWLOC_OBJ_GROUP]--;
+
+    } else if (hwloc__obj_type_is_cache(type)) {
+      if (!curlevel->attr.memorysize) {
+	if (1 == curlevel->attr.depth)
+	  /* 32Kb in L1 */
+	  curlevel->attr.memorysize = 32*1024;
+	else
+	  /* *4 at each level, starting from 1MB for L2, unified */
+	  curlevel->attr.memorysize = 256ULL*1024 << (2*curlevel->attr.depth);
+      }
+
+    } else if (type == HWLOC_OBJ_NUMANODE && !curlevel->attr.memorysize) {
+      /* 1GB in memory nodes. */
+      curlevel->attr.memorysize = 1024*1024*1024;
+    }
+
+    hwloc_synthetic_process_indexes(data, &data->level[i].indexes, data->level[i].totalwidth, verbose);
+  }
+
+  hwloc_synthetic_process_indexes(data, &data->numa_attached_indexes, data->numa_attached_nr, verbose);
+
+  data->string = strdup(description);
+  data->level[count-1].arity = 0;
+  return 0;
+
+ error:
+  hwloc_synthetic_free_levels(data);
+  return -1;
+}
+
+/* Apply the synthetic attributes parsed from the description string to a
+ * freshly allocated object: Group subkind, NUMA memory and page types,
+ * cache geometry. Types not listed here must never reach this function.
+ */
+static void
+hwloc_synthetic_set_attr(struct hwloc_synthetic_attr_s *sattr,
+			 hwloc_obj_t obj)
+{
+  switch (obj->type) {
+  case HWLOC_OBJ_GROUP:
+    obj->attr->group.kind = HWLOC_GROUP_KIND_SYNTHETIC;
+    obj->attr->group.subkind = sattr->depth-1;
+    break;
+  case HWLOC_OBJ_MACHINE:
+    break;
+  case HWLOC_OBJ_NUMANODE:
+    obj->attr->numanode.local_memory = sattr->memorysize;
+    obj->attr->numanode.page_types = malloc(sizeof(*obj->attr->numanode.page_types));
+    if (obj->attr->numanode.page_types) {
+      obj->attr->numanode.page_types_len = 1;
+      memset(obj->attr->numanode.page_types, 0, sizeof(*obj->attr->numanode.page_types));
+      /* report everything as 4kB pages by default */
+      obj->attr->numanode.page_types[0].size = 4096;
+      obj->attr->numanode.page_types[0].count = sattr->memorysize / 4096;
+    } else {
+      /* allocation failed: leave no page-type array rather than crashing below */
+      obj->attr->numanode.page_types_len = 0;
+    }
+    break;
+  case HWLOC_OBJ_PACKAGE:
+    break;
+  case HWLOC_OBJ_L1CACHE:
+  case HWLOC_OBJ_L2CACHE:
+  case HWLOC_OBJ_L3CACHE:
+  case HWLOC_OBJ_L4CACHE:
+  case HWLOC_OBJ_L5CACHE:
+  case HWLOC_OBJ_L1ICACHE:
+  case HWLOC_OBJ_L2ICACHE:
+  case HWLOC_OBJ_L3ICACHE:
+    obj->attr->cache.depth = sattr->depth;
+    obj->attr->cache.linesize = 64; /* synthetic default line size */
+    obj->attr->cache.type = sattr->cachetype;
+    obj->attr->cache.size = sattr->memorysize;
+    break;
+  case HWLOC_OBJ_CORE:
+    break;
+  case HWLOC_OBJ_PU:
+    break;
+  default:
+    /* Should never happen */
+    assert(0);
+    break;
+  }
+}
+
+/* Return the os_index for the next object created on this level.
+ * An explicit index array (from an "indexes=" attribute) takes precedence;
+ * otherwise Caches and Groups get no meaningful os_index at all.
+ */
+static unsigned
+hwloc_synthetic_next_index(struct hwloc_synthetic_indexes_s *indexes, hwloc_obj_type_t type)
+{
+  unsigned idx = indexes->next++;
+
+  if (indexes->array)
+    return indexes->array[idx];
+
+  /* don't enforce useless os_indexes for Caches and Groups */
+  if (hwloc__obj_type_is_cache(type) || type == HWLOC_OBJ_GROUP)
+    return HWLOC_UNKNOWN_INDEX;
+
+  return idx;
+}
+
+/* Insert every NUMA node in the attached list below the current position.
+ * Each attached node covers the same CPUs as the level it is attached to.
+ */
+static void
+hwloc_synthetic_insert_attached(struct hwloc_topology *topology,
+				struct hwloc_synthetic_backend_data_s *data,
+				struct hwloc_synthetic_attached_s *attached,
+				hwloc_bitmap_t set)
+{
+  struct hwloc_synthetic_attached_s *cur;
+
+  for (cur = attached; cur; cur = cur->next) {
+    hwloc_obj_t child;
+    unsigned os_index;
+
+    /* only NUMA nodes may be attached for now */
+    assert(cur->attr.type == HWLOC_OBJ_NUMANODE);
+
+    /* attached NUMA nodes share a single global index sequence */
+    os_index = hwloc_synthetic_next_index(&data->numa_attached_indexes, HWLOC_OBJ_NUMANODE);
+
+    child = hwloc_alloc_setup_object(topology, cur->attr.type, os_index);
+    child->cpuset = hwloc_bitmap_dup(set);
+
+    child->nodeset = hwloc_bitmap_alloc();
+    hwloc_bitmap_set(child->nodeset, os_index);
+
+    hwloc_synthetic_set_attr(&cur->attr, child);
+
+    hwloc_insert_object_by_cpuset(topology, child);
+  }
+}
+
+/*
+ * Recursively build the synthetic objects for one level of the hierarchy.
+ * - level is an index into data->level[]
+ * - a leaf level (arity 0, i.e. the PU level) sets its own os_index bit;
+ *   inner levels recurse once per child and accumulate the children's cpusets
+ * - the resulting cpuset is OR'ed into parent_cpuset
+ */
+static void
+hwloc__look_synthetic(struct hwloc_topology *topology,
+		      struct hwloc_synthetic_backend_data_s *data,
+		      int level,
+		      hwloc_bitmap_t parent_cpuset)
+{
+  hwloc_obj_t obj;
+  unsigned i;
+  struct hwloc_synthetic_level_data_s *curlevel = &data->level[level];
+  hwloc_obj_type_t type = curlevel->attr.type;
+  hwloc_bitmap_t set;
+  unsigned os_index;
+
+  /* only normal objects and NUMA nodes below the root; the root Machine is handled by the caller */
+  assert(hwloc__obj_type_is_normal(type) || type == HWLOC_OBJ_NUMANODE);
+  assert(type != HWLOC_OBJ_MACHINE);
+
+  os_index = hwloc_synthetic_next_index(&curlevel->indexes, type);
+
+  set = hwloc_bitmap_alloc();
+  if (!curlevel->arity) {
+    /* leaf level: our cpuset is exactly our own index */
+    hwloc_bitmap_set(set, os_index);
+  } else {
+    /* inner level: the children fill our cpuset */
+    for (i = 0; i < curlevel->arity; i++)
+      hwloc__look_synthetic(topology, data, level + 1, set);
+  }
+
+  hwloc_bitmap_or(parent_cpuset, parent_cpuset, set);
+
+  /* insert the object itself unless filtered out by the topology's type filters */
+  if (hwloc_filter_check_keep_object_type(topology, type)) {
+    obj = hwloc_alloc_setup_object(topology, type, os_index);
+    obj->cpuset = hwloc_bitmap_dup(set);
+
+    if (type == HWLOC_OBJ_NUMANODE) {
+      obj->nodeset = hwloc_bitmap_alloc();
+      hwloc_bitmap_set(obj->nodeset, os_index);
+    }
+
+    hwloc_synthetic_set_attr(&curlevel->attr, obj);
+
+    hwloc_insert_object_by_cpuset(topology, obj);
+  }
+
+  /* also insert any NUMA nodes explicitly attached to this level */
+  hwloc_synthetic_insert_attached(topology, data, curlevel->attached, set);
+
+  hwloc_bitmap_free(set);
+}
+
+/* Discovery callback: instantiate the parsed synthetic description as a
+ * full topology below the (already existing) root object. Always returns 0.
+ */
+static int
+hwloc_look_synthetic(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_synthetic_backend_data_s *data = backend->private_data;
+  hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
+  unsigned i;
+
+  /* a global component is the only discoverer, the root must be untouched */
+  assert(!topology->levels[0][0]->cpuset);
+
+  hwloc_alloc_root_sets(topology->levels[0][0]);
+
+  topology->support.discovery->pu = 1;
+  topology->support.discovery->numa = 1; /* we add a single NUMA node if none is given */
+  topology->support.discovery->numa_memory = 1; /* specified or default size */
+
+  /* start with os_index 0 for each level */
+  for (i = 0; data->level[i].arity > 0; i++)
+    data->level[i].indexes.next = 0;
+  data->numa_attached_indexes.next = 0;
+  /* ... including the last one */
+  data->level[i].indexes.next = 0;
+
+  /* update first level type according to the synthetic type array */
+  topology->levels[0][0]->type = data->level[0].attr.type;
+  hwloc_synthetic_set_attr(&data->level[0].attr, topology->levels[0][0]);
+
+  /* recursively build each child subtree of the root */
+  for (i = 0; i < data->level[0].arity; i++)
+    hwloc__look_synthetic(topology, data, 1, cpuset);
+
+  /* NUMA nodes attached directly to the root level */
+  hwloc_synthetic_insert_attached(topology, data, data->level[0].attached, cpuset);
+
+  hwloc_bitmap_free(cpuset);
+
+  /* annotate the root so users can see where this topology came from */
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Synthetic");
+  hwloc_obj_add_info(topology->levels[0][0], "SyntheticDescription", data->string);
+  return 0;
+}
+
+/* Backend destructor: release the parsed level data, the saved description
+ * string, then the private data structure itself.
+ */
+static void
+hwloc_synthetic_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_synthetic_backend_data_s *data = backend->private_data;
+  hwloc_synthetic_free_levels(data);
+  free(data->string);
+  free(data);
+}
+
+/* Create the synthetic backend.
+ * _data1 is the synthetic description string; if none was given, fall back
+ * to the HWLOC_SYNTHETIC environment variable. On failure, every partially
+ * allocated resource is released via the goto-cleanup chain and NULL is
+ * returned with errno set (EINVAL for a missing/bad description, ENOMEM...).
+ */
+static struct hwloc_backend *
+hwloc_synthetic_component_instantiate(struct hwloc_disc_component *component,
+				      const void *_data1,
+				      const void *_data2 __hwloc_attribute_unused,
+				      const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend;
+  struct hwloc_synthetic_backend_data_s *data;
+  int err;
+
+  if (!_data1) {
+    const char *env = getenv("HWLOC_SYNTHETIC");
+    if (env) {
+      /* 'synthetic' was given in HWLOC_COMPONENTS without a description */
+      _data1 = env;
+    } else {
+      errno = EINVAL;
+      goto out;
+    }
+  }
+
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    goto out;
+
+  data = malloc(sizeof(*data));
+  if (!data) {
+    errno = ENOMEM;
+    goto out_with_backend;
+  }
+
+  /* parse the description; it reports its own errno on failure */
+  err = hwloc_backend_synthetic_init(data, (const char *) _data1);
+  if (err < 0)
+    goto out_with_data;
+
+  backend->private_data = data;
+  backend->discover = hwloc_look_synthetic;
+  backend->disable = hwloc_synthetic_backend_disable;
+  backend->is_thissystem = 0; /* a synthetic topology never describes the running system */
+
+  return backend;
+
+ out_with_data:
+  free(data);
+ out_with_backend:
+  free(backend);
+ out:
+  return NULL;
+}
+
+/* Discovery-component descriptor for "synthetic".
+ * NOTE(review): positional initializers — field meanings follow the
+ * struct hwloc_disc_component declaration (type, name, excluded component
+ * types, instantiate callback, priority, ...); confirm against the header. */
+static struct hwloc_disc_component hwloc_synthetic_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  "synthetic",
+  ~0, /* excludes all other discovery components */
+  hwloc_synthetic_component_instantiate,
+  30,
+  1,
+  NULL
+};
+
+/* Public component record registered with the hwloc core.
+ * NOTE(review): positional initializers per struct hwloc_component
+ * (ABI, init/finalize hooks, component type, flags, type-specific data). */
+const struct hwloc_component hwloc_synthetic_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL, /* no init/finalize hooks */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_synthetic_disc_component
+};
+
+/* Account for the result of one snprintf-style write into the export buffer.
+ * Adds res to *ret (the would-be total length), then advances *tmp/*tmplen by
+ * the number of characters actually stored (truncated output keeps the
+ * cursor on the terminating NUL). Returns -1 if res signalled an error.
+ */
+static __hwloc_inline int
+hwloc__export_synthetic_update_status(int *ret, char **tmp, ssize_t *tmplen, int res)
+{
+  int advance;
+
+  if (res < 0)
+    return -1;
+
+  *ret += res;
+
+  /* clamp the cursor move to what actually fits in the buffer */
+  advance = res;
+  if (advance >= *tmplen)
+    advance = (*tmplen > 0) ? (int)(*tmplen) - 1 : 0;
+
+  *tmp += advance;
+  *tmplen -= advance;
+  return 0;
+}
+
+/* Append a single character to the export buffer, keeping it NUL-terminated.
+ * The would-be length *ret is always incremented, even when the buffer is
+ * already full (snprintf-style semantics).
+ */
+static __hwloc_inline void
+hwloc__export_synthetic_add_char(int *ret, char **tmp, ssize_t *tmplen, char c)
+{
+  (*ret)++;
+
+  /* need room for the character plus the terminating NUL */
+  if (*tmplen <= 1)
+    return;
+
+  (*tmp)[0] = c;
+  (*tmp)[1] = '\0';
+  ++*tmp;
+  --*tmplen;
+}
+
+/* Export the os_indexes of one level (terminated by a closing ')').
+ * First try to describe the indexes as nested interleaving loops and print
+ * them compactly as "step*nb:step*nb:...". If no such pattern matches (or on
+ * allocation failure), fall back to dumping the full comma-separated list.
+ * Returns the snprintf-style would-be length, or -1 on write error.
+ */
+static int
+hwloc__export_synthetic_indexes(hwloc_obj_t *level, unsigned total,
+				char *buffer, size_t buflen)
+{
+  unsigned step = 1;
+  unsigned nr_loops = 0;
+  struct hwloc_synthetic_intlv_loop_s *loops = NULL, *tmploops;
+  hwloc_obj_t cur;
+  unsigned i, j;
+  ssize_t tmplen = buflen;
+  char *tmp = buffer;
+  int res, ret = 0;
+
+  /* must start with 0 */
+  if (level[0]->os_index)
+    goto exportall;
+
+  /* factor the sequence into loops until the whole level is covered */
+  while (step != total) {
+    /* must be a divider of the total */
+    if (total % step)
+      goto exportall;
+
+    /* look for os_index == step */
+    for(i=1; i<total; i++)
+      if (level[i]->os_index == step)
+	break;
+    if (i == total)
+      goto exportall;
+    /* find how many repetitions j of this stride keep the pattern */
+    for(j=2; j<total/i; j++)
+      if (level[i*j]->os_index != step*j)
+	break;
+
+    /* record one loop (stride i, j repetitions) and grow the step */
+    nr_loops++;
+    tmploops = realloc(loops, nr_loops*sizeof(*loops));
+    if (!tmploops)
+      goto exportall;
+    loops = tmploops;
+    loops[nr_loops-1].step = i;
+    loops[nr_loops-1].nb = j;
+    step *= j;
+  }
+
+  /* check this interleaving: recompute every index from the loops */
+  for(i=0; i<total; i++) {
+    unsigned ind = 0;
+    unsigned mul = 1;
+    for(j=0; j<nr_loops; j++) {
+      ind += (i / loops[j].step) % loops[j].nb * mul;
+      mul *= loops[j].nb;
+    }
+    if (level[i]->os_index != ind)
+      goto exportall;
+  }
+
+  /* success, print it */
+  for(j=0; j<nr_loops; j++) {
+    res = hwloc_snprintf(tmp, tmplen, "%u*%u%s", loops[j].step, loops[j].nb,
+			 j == nr_loops-1 ? ")" : ":");
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0) {
+      free(loops);
+      return -1;
+    }
+  }
+
+  free(loops);
+  return ret;
+
+ exportall:
+  free(loops);
+
+  /* dump all indexes, following the cousin list of the level */
+  cur = level[0];
+  while (cur) {
+    res = hwloc_snprintf(tmp, tmplen, "%u%s", cur->os_index,
+			 cur->next_cousin ? "," : ")");
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+    cur = cur->next_cousin;
+  }
+  return ret;
+}
+
+/* Export one object's "(...)" attribute suffix: cache size, NUMA memory
+ * size, and (for PU/NUMA levels whose os_indexes differ from logical
+ * indexes) the "indexes=" list. Emits nothing when no attribute applies.
+ * Returns the snprintf-style would-be length, or -1 on write error.
+ */
+static int
+hwloc__export_synthetic_obj_attr(struct hwloc_topology * topology,
+				 hwloc_obj_t obj,
+				 char *buffer, size_t buflen)
+{
+  const char * separator = " ";
+  const char * prefix = "(";
+  char cachesize[64] = "";
+  char memsize[64] = "";
+  int needindexes = 0;
+
+  /* each attribute consumes the current prefix ("(" first, then " ") */
+  if (hwloc__obj_type_is_cache(obj->type) && obj->attr->cache.size) {
+    snprintf(cachesize, sizeof(cachesize), "%ssize=%llu",
+	     prefix, (unsigned long long) obj->attr->cache.size);
+    prefix = separator;
+  }
+  if (obj->type == HWLOC_OBJ_NUMANODE && obj->attr->numanode.local_memory) {
+    snprintf(memsize, sizeof(memsize), "%smemory=%llu",
+	     prefix, (unsigned long long) obj->attr->numanode.local_memory);
+    prefix = separator;
+  }
+  if (!obj->logical_index /* only display indexes once per level (not for non-first NUMA children, etc.) */
+      && (obj->type == HWLOC_OBJ_PU || obj->type == HWLOC_OBJ_NUMANODE)) {
+    /* indexes are only needed if some os_index differs from its logical_index */
+    hwloc_obj_t cur = obj;
+    while (cur) {
+      if (cur->os_index != cur->logical_index) {
+	needindexes = 1;
+	break;
+      }
+      cur = cur->next_cousin;
+    }
+  }
+  if (*cachesize || *memsize || needindexes) {
+    ssize_t tmplen = buflen;
+    char *tmp = buffer;
+    int res, ret = 0;
+
+    /* the ')' is only closed here if no indexes= part follows */
+    res = hwloc_snprintf(tmp, tmplen, "%s%s%s", cachesize, memsize, needindexes ? "" : ")");
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+
+    if (needindexes) {
+      unsigned total;
+      hwloc_obj_t *level;
+
+      /* NUMA nodes live in a special level, other objects in the normal ones */
+      if (obj->depth < 0) {
+	assert(obj->depth == HWLOC_TYPE_DEPTH_NUMANODE);
+	total = topology->slevels[HWLOC_SLEVEL_NUMANODE].nbobjs;
+	level = topology->slevels[HWLOC_SLEVEL_NUMANODE].objs;
+      } else {
+	total = topology->level_nbobjects[obj->depth];
+	level = topology->levels[obj->depth];
+      }
+
+      res = hwloc_snprintf(tmp, tmplen, "%sindexes=", prefix);
+      if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+	return -1;
+
+      /* this prints the closing ')' */
+      res = hwloc__export_synthetic_indexes(level, total, tmp, tmplen);
+      if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+	return -1;
+    }
+    return ret;
+  } else {
+    return 0;
+  }
+}
+
+/* Export one object as "<type>:<arity>" plus optional attributes.
+ * arity == (unsigned)-1 means "no arity suffix" (used for the root and for
+ * bracketed memory children). Flags select v1/backward-compatible type
+ * names. Returns the snprintf-style would-be length, or -1 on write error.
+ */
+static int
+hwloc__export_synthetic_obj(struct hwloc_topology * topology, unsigned long flags,
+			    hwloc_obj_t obj, unsigned arity,
+			    char *buffer, size_t buflen)
+{
+  char aritys[12] = "";
+  ssize_t tmplen = buflen;
+  char *tmp = buffer;
+  int res, ret = 0;
+
+  /* <type>:<arity>, except for root */
+  if (arity != (unsigned)-1)
+    snprintf(aritys, sizeof(aritys), ":%u", arity);
+  if (hwloc__obj_type_is_cache(obj->type)
+      && (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES)) {
+    /* v1 uses generic "Cache" for non-extended type name */
+    res = hwloc_snprintf(tmp, tmplen, "Cache%s", aritys);
+
+  } else if (obj->type == HWLOC_OBJ_PACKAGE
+	     && (flags & (HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES
+			  |HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1))) {
+    /* if exporting to v1 or without extended-types, use all-v1-compatible Socket name */
+    res = hwloc_snprintf(tmp, tmplen, "Socket%s", aritys);
+
+  } else if (obj->type == HWLOC_OBJ_GROUP /* don't export group depth */
+      || flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES) {
+    res = hwloc_snprintf(tmp, tmplen, "%s%s", hwloc_obj_type_string(obj->type), aritys);
+  } else {
+    /* default: full extended type name */
+    char types[64];
+    hwloc_obj_type_snprintf(types, sizeof(types), obj, 1);
+    res = hwloc_snprintf(tmp, tmplen, "%s%s", types, aritys);
+  }
+  if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+    return -1;
+
+  if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+    /* obj attributes */
+    res = hwloc__export_synthetic_obj_attr(topology, obj, tmp, tmplen);
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+  }
+
+  return ret;
+}
+
+/* Export the NUMA nodes attached to parent.
+ * v1 format supports exactly one NUMA child and prints it as a plain level;
+ * v2 prints each memory child bracketed as "[NUMANode(...)]".
+ * Returns the would-be length, 0 if there are no memory children, or -1 on
+ * error (including unsupported v1 layouts, with errno = EINVAL).
+ */
+static int
+hwloc__export_synthetic_memory_children(struct hwloc_topology * topology, unsigned long flags,
+					hwloc_obj_t parent,
+					char *buffer, size_t buflen,
+					int needprefix, int verbose)
+{
+  hwloc_obj_t mchild;
+  ssize_t tmplen = buflen;
+  char *tmp = buffer;
+  int res, ret = 0;
+
+  mchild = parent->memory_first_child;
+  if (!mchild)
+    return 0;
+
+  if (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1) {
+    /* v1: export a single NUMA child */
+    if (parent->memory_arity > 1 || mchild->type != HWLOC_OBJ_NUMANODE) {
+      /* not supported */
+      if (verbose)
+	fprintf(stderr, "Cannot export to synthetic v1 if multiple memory children are attached to the same location.\n");
+      errno = EINVAL;
+      return -1;
+    }
+
+    if (needprefix)
+      hwloc__export_synthetic_add_char(&ret, &tmp, &tmplen, ' ');
+
+    /* arity 1: one NUMA node per parent */
+    res = hwloc__export_synthetic_obj(topology, flags, mchild, 1, tmp, tmplen);
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+    return ret;
+  }
+
+  while (mchild) {
+    /* v2: export all NUMA children */
+
+    assert(mchild->type == HWLOC_OBJ_NUMANODE); /* only NUMA node memory children for now */
+
+    if (needprefix)
+      hwloc__export_synthetic_add_char(&ret, &tmp, &tmplen, ' ');
+
+    hwloc__export_synthetic_add_char(&ret, &tmp, &tmplen, '[');
+
+    /* (unsigned)-1 arity: bracketed children carry no arity suffix */
+    res = hwloc__export_synthetic_obj(topology, flags, mchild, (unsigned)-1, tmp, tmplen);
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+
+    hwloc__export_synthetic_add_char(&ret, &tmp, &tmplen, ']');
+
+    needprefix = 1;
+    mchild = mchild->next_sibling;
+  }
+
+  return ret;
+}
+
+/* Check that memory is attached symmetrically: every object on a given
+ * normal level must have the same number of NUMA children.
+ * Returns 0 if symmetric, -1 otherwise (including on allocation failure,
+ * which is conservatively treated as asymmetric).
+ */
+static int
+hwloc_check_memory_symmetric(struct hwloc_topology * topology)
+{
+  hwloc_bitmap_t remaining_nodes;
+
+  /* start from the full nodeset and clear nodes as their parents are verified */
+  remaining_nodes = hwloc_bitmap_dup(hwloc_get_root_obj(topology)->nodeset);
+  if (!remaining_nodes)
+    /* assume asymmetric */
+    return -1;
+
+  while (!hwloc_bitmap_iszero(remaining_nodes)) {
+    unsigned idx;
+    hwloc_obj_t node;
+    hwloc_obj_t first_parent;
+    unsigned i;
+
+    idx = hwloc_bitmap_first(remaining_nodes);
+    node = hwloc_get_numanode_obj_by_os_index(topology, idx);
+    assert(node);
+
+    first_parent = node->parent;
+    assert(hwloc__obj_type_is_normal(first_parent->type)); /* only depth-1 memory children for now */
+
+    /* check whether all object on parent's level have same number of NUMA children */
+    for(i=0; i<hwloc_get_nbobjs_by_depth(topology, first_parent->depth); i++) {
+      hwloc_obj_t parent, mchild;
+
+      parent = hwloc_get_obj_by_depth(topology, first_parent->depth, i);
+      assert(parent);
+
+      /* must have same memory arity */
+      if (parent->memory_arity != first_parent->memory_arity)
+	goto out_with_bitmap;
+
+      /* clear these NUMA children from remaining_nodes */
+      mchild = parent->memory_first_child;
+      while (mchild) {
+	assert(mchild->type == HWLOC_OBJ_NUMANODE); /* only NUMA node memory children for now */
+	hwloc_bitmap_clr(remaining_nodes, mchild->os_index); /* cannot use parent->nodeset, some normal children may have other NUMA nodes */
+	mchild = mchild->next_sibling;
+      }
+    }
+  }
+
+  hwloc_bitmap_free(remaining_nodes);
+  return 0;
+
+ out_with_bitmap:
+  hwloc_bitmap_free(remaining_nodes);
+  return -1;
+}
+
+/* Public API: export a loaded, symmetric topology as a synthetic
+ * description string in buffer. flags select v1/no-extended-types/no-attrs/
+ * ignore-memory variants. Returns the snprintf-style would-be length, or -1
+ * with errno = EINVAL when the topology cannot be expressed synthetically.
+ * Verbose diagnostics are enabled via HWLOC_SYNTHETIC_VERBOSE.
+ */
+int
+hwloc_topology_export_synthetic(struct hwloc_topology * topology,
+				char *buffer, size_t buflen,
+				unsigned long flags)
+{
+  hwloc_obj_t obj = hwloc_get_root_obj(topology);
+  ssize_t tmplen = buflen;
+  char *tmp = buffer;
+  int res, ret = 0;
+  unsigned arity;
+  int needprefix = 0;
+  int verbose = 0;
+  const char *env = getenv("HWLOC_SYNTHETIC_VERBOSE");
+
+  if (env)
+    verbose = atoi(env);
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* reject unknown flags */
+  if (flags & ~(HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES
+		|HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS
+		|HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1
+		|HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* TODO: add a flag to ignore symmetric_subtree and I/Os.
+   * just assume things are symmetric with the left branches of the tree.
+   * but the number of objects per level may be wrong, what to do with OS index array in this case?
+   * only allow ignoring symmetric_subtree if the level width remains OK?
+   */
+
+  /* TODO: add a root object by default, with a prefix such as tree=
+   * so that we can backward-compatibly recognize whether there's a root or not.
+   * and add a flag to disable it.
+   */
+
+  /* TODO: flag to force all indexes, not only for PU and NUMA? */
+
+  if (!obj->symmetric_subtree) {
+    if (verbose)
+      fprintf(stderr, "Cannot export to synthetic unless topology is symmetric (root->symmetric_subtree must be set).\n");
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)
+      && hwloc_check_memory_symmetric(topology) < 0) {
+    if (verbose)
+      fprintf(stderr, "Cannot export to synthetic unless memory is attached symmetrically.\n");
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1) {
+    /* v1 requires all NUMA at the same level */
+    hwloc_obj_t node;
+    signed pdepth;
+
+    node = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, 0);
+    assert(hwloc__obj_type_is_normal(node->parent->type)); /* only depth-1 memory children for now */
+    pdepth = node->parent->depth;
+
+    /* walk the cousin list: every NUMA parent must sit at the same depth */
+    while ((node = node->next_cousin) != NULL) {
+      assert(hwloc__obj_type_is_normal(node->parent->type)); /* only depth-1 memory children for now */
+      if (node->parent->depth != pdepth) {
+	if (verbose)
+	  fprintf(stderr, "Cannot export to synthetic v1 if memory is attached to parents at different depths.\n");
+	errno = EINVAL;
+	return -1;
+      }
+    }
+  }
+
+  /* we're good, start exporting */
+
+  if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+    /* obj attributes */
+    res = hwloc__export_synthetic_obj_attr(topology, obj, tmp, tmplen);
+    if (res > 0)
+      needprefix = 1;
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+  }
+
+  /* memory children attached to the root */
+  if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)) {
+    res = hwloc__export_synthetic_memory_children(topology, flags, obj, tmp, tmplen, needprefix, verbose);
+    if (res > 0)
+      needprefix = 1;
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+  }
+
+  /* symmetric topology: following first_child at each level describes all of it */
+  arity = obj->arity;
+  while (arity) {
+    /* for each level */
+    obj = obj->first_child;
+
+    if (needprefix)
+      hwloc__export_synthetic_add_char(&ret, &tmp, &tmplen, ' ');
+
+    res = hwloc__export_synthetic_obj(topology, flags, obj, arity, tmp, tmplen);
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+
+    if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)) {
+      res = hwloc__export_synthetic_memory_children(topology, flags, obj, tmp, tmplen, 1, verbose);
+      if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+	return -1;
+    }
+
+    /* next level */
+    needprefix = 1;
+    arity = obj->arity;
+  }
+
+  return ret;
+}
diff --git a/src/3rdparty/hwloc/src/topology-windows.c b/src/3rdparty/hwloc/src/topology-windows.c
new file mode 100644
index 000000000..d03645c0f
--- /dev/null
+++ b/src/3rdparty/hwloc/src/topology-windows.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* To try to get all declarations duplicated below.  */
+#define _WIN32_WINNT 0x0601
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#include <windows.h>
+
+/* Compatibility shims: declare Windows processor-topology types that some
+ * toolchains (e.g. MinGW) or older SDKs do not provide.  Each block is
+ * guarded by a configure-time HAVE_* macro so real SDK declarations win. */
+#ifndef HAVE_KAFFINITY
+typedef ULONG_PTR KAFFINITY, *PKAFFINITY;
+#endif
+
+#ifndef HAVE_PROCESSOR_CACHE_TYPE
+typedef enum _PROCESSOR_CACHE_TYPE {
+  CacheUnified,
+  CacheInstruction,
+  CacheData,
+  CacheTrace
+} PROCESSOR_CACHE_TYPE;
+#endif
+
+#ifndef CACHE_FULLY_ASSOCIATIVE
+#define CACHE_FULLY_ASSOCIATIVE 0xFF
+#endif
+
+#ifndef MAXIMUM_PROC_PER_GROUP /* missing in MinGW */
+#define MAXIMUM_PROC_PER_GROUP 64
+#endif
+
+#ifndef HAVE_CACHE_DESCRIPTOR
+typedef struct _CACHE_DESCRIPTOR {
+  BYTE Level;
+  BYTE Associativity;
+  WORD LineSize;
+  DWORD Size; /* in bytes */
+  PROCESSOR_CACHE_TYPE Type;
+} CACHE_DESCRIPTOR, *PCACHE_DESCRIPTOR;
+#endif
+
+#ifndef HAVE_LOGICAL_PROCESSOR_RELATIONSHIP
+typedef enum _LOGICAL_PROCESSOR_RELATIONSHIP {
+  RelationProcessorCore,
+  RelationNumaNode,
+  RelationCache,
+  RelationProcessorPackage,
+  RelationGroup,
+  RelationAll = 0xffff
+} LOGICAL_PROCESSOR_RELATIONSHIP;
+#else /* HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */
+#  ifndef HAVE_RELATIONPROCESSORPACKAGE
+#    define RelationProcessorPackage 3
+#    define RelationGroup 4
+#    define RelationAll 0xffff
+#  endif /* HAVE_RELATIONPROCESSORPACKAGE */
+#endif /* HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */
+
+/* element type consumed by the legacy GetLogicalProcessorInformation() API */
+#ifndef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION
+typedef struct _SYSTEM_LOGICAL_PROCESSOR_INFORMATION {
+  ULONG_PTR ProcessorMask;
+  LOGICAL_PROCESSOR_RELATIONSHIP Relationship;
+  _ANONYMOUS_UNION
+  union {
+    struct {
+      BYTE flags;
+    } ProcessorCore;
+    struct {
+      DWORD NodeNumber;
+    } NumaNode;
+    CACHE_DESCRIPTOR Cache;
+    ULONGLONG Reserved[2];
+  } DUMMYUNIONNAME;
+} SYSTEM_LOGICAL_PROCESSOR_INFORMATION, *PSYSTEM_LOGICAL_PROCESSOR_INFORMATION;
+#endif
+
+/* Extended interface, for group support.
+ * These types back the processor-group-aware APIs used below:
+ * GetLogicalProcessorInformationEx(), Set/GetThreadGroupAffinity(),
+ * GetCurrentProcessorNumberEx() and QueryWorkingSetEx().
+ * Same configure-time HAVE_* guard scheme as the basic types above. */
+
+#ifndef HAVE_GROUP_AFFINITY
+typedef struct _GROUP_AFFINITY {
+  KAFFINITY Mask;
+  WORD Group;
+  WORD Reserved[3];
+} GROUP_AFFINITY, *PGROUP_AFFINITY;
+#endif
+
+#ifndef HAVE_PROCESSOR_RELATIONSHIP
+typedef struct _PROCESSOR_RELATIONSHIP {
+  BYTE Flags;
+  BYTE Reserved[21];
+  WORD GroupCount;
+  GROUP_AFFINITY GroupMask[ANYSIZE_ARRAY];
+} PROCESSOR_RELATIONSHIP, *PPROCESSOR_RELATIONSHIP;
+#endif
+
+#ifndef HAVE_NUMA_NODE_RELATIONSHIP
+typedef struct _NUMA_NODE_RELATIONSHIP {
+  DWORD NodeNumber;
+  BYTE Reserved[20];
+  GROUP_AFFINITY GroupMask;
+} NUMA_NODE_RELATIONSHIP, *PNUMA_NODE_RELATIONSHIP;
+#endif
+
+#ifndef HAVE_CACHE_RELATIONSHIP
+typedef struct _CACHE_RELATIONSHIP {
+  BYTE Level;
+  BYTE Associativity;
+  WORD LineSize;
+  DWORD CacheSize;
+  PROCESSOR_CACHE_TYPE Type;
+  BYTE Reserved[20];
+  GROUP_AFFINITY GroupMask;
+} CACHE_RELATIONSHIP, *PCACHE_RELATIONSHIP;
+#endif
+
+#ifndef HAVE_PROCESSOR_GROUP_INFO
+typedef struct _PROCESSOR_GROUP_INFO {
+  BYTE MaximumProcessorCount;
+  BYTE ActiveProcessorCount;
+  BYTE Reserved[38];
+  KAFFINITY ActiveProcessorMask;
+} PROCESSOR_GROUP_INFO, *PPROCESSOR_GROUP_INFO;
+#endif
+
+#ifndef HAVE_GROUP_RELATIONSHIP
+typedef struct _GROUP_RELATIONSHIP {
+  WORD MaximumGroupCount;
+  WORD ActiveGroupCount;
+  ULONGLONG Reserved[2];
+  PROCESSOR_GROUP_INFO GroupInfo[ANYSIZE_ARRAY];
+} GROUP_RELATIONSHIP, *PGROUP_RELATIONSHIP;
+#endif
+
+/* variable-size element returned by GetLogicalProcessorInformationEx();
+ * Size gives the byte length of this element, used for iteration */
+#ifndef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX
+typedef struct _SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX {
+  LOGICAL_PROCESSOR_RELATIONSHIP Relationship;
+  DWORD Size;
+  _ANONYMOUS_UNION
+  union {
+    PROCESSOR_RELATIONSHIP Processor;
+    NUMA_NODE_RELATIONSHIP NumaNode;
+    CACHE_RELATIONSHIP Cache;
+    GROUP_RELATIONSHIP Group;
+    /* Odd: no member to tell the cpu mask of the package... */
+  } DUMMYUNIONNAME;
+} SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, *PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;
+#endif
+
+/* per-page attributes filled by QueryWorkingSetEx(); the Node field carries
+ * the NUMA node of a Valid (resident) page */
+#ifndef HAVE_PSAPI_WORKING_SET_EX_BLOCK
+typedef union _PSAPI_WORKING_SET_EX_BLOCK {
+  ULONG_PTR Flags;
+  struct {
+    unsigned Valid  :1;
+    unsigned ShareCount  :3;
+    unsigned Win32Protection  :11;
+    unsigned Shared  :1;
+    unsigned Node  :6;
+    unsigned Locked  :1;
+    unsigned LargePage  :1;
+  };
+} PSAPI_WORKING_SET_EX_BLOCK;
+#endif
+
+#ifndef HAVE_PSAPI_WORKING_SET_EX_INFORMATION
+typedef struct _PSAPI_WORKING_SET_EX_INFORMATION {
+  PVOID VirtualAddress;
+  PSAPI_WORKING_SET_EX_BLOCK VirtualAttributes;
+} PSAPI_WORKING_SET_EX_INFORMATION;
+#endif
+
+/* (group, number) pair identifying a logical processor, as returned by
+ * GetCurrentProcessorNumberEx() */
+#ifndef HAVE_PROCESSOR_NUMBER
+typedef struct _PROCESSOR_NUMBER {
+  WORD Group;
+  BYTE Number;
+  BYTE Reserved;
+} PROCESSOR_NUMBER, *PPROCESSOR_NUMBER;
+#endif
+
+/* Function pointers.
+ * All of these entry points are optional: they are resolved at runtime with
+ * GetProcAddress() in hwloc_win_get_function_ptrs() below, and a pointer
+ * stays NULL when the running Windows does not export the function. */
+
+typedef WORD (WINAPI *PFN_GETACTIVEPROCESSORGROUPCOUNT)(void);
+static PFN_GETACTIVEPROCESSORGROUPCOUNT GetActiveProcessorGroupCountProc;
+
+/* number of Windows processor groups; stays 1 until updated in
+ * hwloc_win_get_function_ptrs() */
+static unsigned long nr_processor_groups = 1;
+/* highest NUMA node OS index seen during discovery */
+static unsigned long max_numanode_index = 0;
+
+typedef WORD (WINAPI *PFN_GETACTIVEPROCESSORCOUNT)(WORD);
+static PFN_GETACTIVEPROCESSORCOUNT GetActiveProcessorCountProc;
+
+typedef DWORD (WINAPI *PFN_GETCURRENTPROCESSORNUMBER)(void);
+static PFN_GETCURRENTPROCESSORNUMBER GetCurrentProcessorNumberProc;
+
+typedef VOID (WINAPI *PFN_GETCURRENTPROCESSORNUMBEREX)(PPROCESSOR_NUMBER);
+static PFN_GETCURRENTPROCESSORNUMBEREX GetCurrentProcessorNumberExProc;
+
+typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATION)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION Buffer, PDWORD ReturnLength);
+static PFN_GETLOGICALPROCESSORINFORMATION GetLogicalProcessorInformationProc;
+
+typedef BOOL (WINAPI *PFN_GETLOGICALPROCESSORINFORMATIONEX)(LOGICAL_PROCESSOR_RELATIONSHIP relationship, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX Buffer, PDWORD ReturnLength);
+static PFN_GETLOGICALPROCESSORINFORMATIONEX GetLogicalProcessorInformationExProc;
+
+typedef BOOL (WINAPI *PFN_SETTHREADGROUPAFFINITY)(HANDLE hThread, const GROUP_AFFINITY *GroupAffinity, PGROUP_AFFINITY PreviousGroupAffinity);
+static PFN_SETTHREADGROUPAFFINITY SetThreadGroupAffinityProc;
+
+typedef BOOL (WINAPI *PFN_GETTHREADGROUPAFFINITY)(HANDLE hThread, PGROUP_AFFINITY GroupAffinity);
+static PFN_GETTHREADGROUPAFFINITY GetThreadGroupAffinityProc;
+
+typedef BOOL (WINAPI *PFN_GETNUMAAVAILABLEMEMORYNODE)(UCHAR Node, PULONGLONG AvailableBytes);
+static PFN_GETNUMAAVAILABLEMEMORYNODE GetNumaAvailableMemoryNodeProc;
+
+typedef BOOL (WINAPI *PFN_GETNUMAAVAILABLEMEMORYNODEEX)(USHORT Node, PULONGLONG AvailableBytes);
+static PFN_GETNUMAAVAILABLEMEMORYNODEEX GetNumaAvailableMemoryNodeExProc;
+
+typedef LPVOID (WINAPI *PFN_VIRTUALALLOCEXNUMA)(HANDLE hProcess, LPVOID lpAddress, SIZE_T dwSize, DWORD flAllocationType, DWORD flProtect, DWORD nndPreferred);
+static PFN_VIRTUALALLOCEXNUMA VirtualAllocExNumaProc;
+
+typedef BOOL (WINAPI *PFN_VIRTUALFREEEX)(HANDLE hProcess, LPVOID lpAddress, SIZE_T dwSize, DWORD dwFreeType);
+static PFN_VIRTUALFREEEX VirtualFreeExProc;
+
+typedef BOOL (WINAPI *PFN_QUERYWORKINGSETEX)(HANDLE hProcess, PVOID pv, DWORD cb);
+static PFN_QUERYWORKINGSETEX QueryWorkingSetExProc;
+
+/* Resolve the optional Windows API entry points declared above and update
+ * nr_processor_groups.  Pointers for entry points that this Windows version
+ * does not export remain NULL. */
+static void hwloc_win_get_function_ptrs(void)
+{
+    HMODULE kernel32;
+
+    kernel32 = LoadLibrary("kernel32.dll");
+    if (kernel32) {
+      GetActiveProcessorGroupCountProc =
+	(PFN_GETACTIVEPROCESSORGROUPCOUNT) GetProcAddress(kernel32, "GetActiveProcessorGroupCount");
+      GetActiveProcessorCountProc =
+	(PFN_GETACTIVEPROCESSORCOUNT) GetProcAddress(kernel32, "GetActiveProcessorCount");
+      GetLogicalProcessorInformationProc =
+	(PFN_GETLOGICALPROCESSORINFORMATION) GetProcAddress(kernel32, "GetLogicalProcessorInformation");
+      GetCurrentProcessorNumberProc =
+	(PFN_GETCURRENTPROCESSORNUMBER) GetProcAddress(kernel32, "GetCurrentProcessorNumber");
+      GetCurrentProcessorNumberExProc =
+	(PFN_GETCURRENTPROCESSORNUMBEREX) GetProcAddress(kernel32, "GetCurrentProcessorNumberEx");
+      SetThreadGroupAffinityProc =
+	(PFN_SETTHREADGROUPAFFINITY) GetProcAddress(kernel32, "SetThreadGroupAffinity");
+      GetThreadGroupAffinityProc =
+	(PFN_GETTHREADGROUPAFFINITY) GetProcAddress(kernel32, "GetThreadGroupAffinity");
+      GetNumaAvailableMemoryNodeProc =
+	(PFN_GETNUMAAVAILABLEMEMORYNODE) GetProcAddress(kernel32, "GetNumaAvailableMemoryNode");
+      GetNumaAvailableMemoryNodeExProc =
+	(PFN_GETNUMAAVAILABLEMEMORYNODEEX) GetProcAddress(kernel32, "GetNumaAvailableMemoryNodeEx");
+      GetLogicalProcessorInformationExProc =
+	(PFN_GETLOGICALPROCESSORINFORMATIONEX)GetProcAddress(kernel32, "GetLogicalProcessorInformationEx");
+      QueryWorkingSetExProc =
+	(PFN_QUERYWORKINGSETEX) GetProcAddress(kernel32, "K32QueryWorkingSetEx");
+      VirtualAllocExNumaProc =
+	(PFN_VIRTUALALLOCEXNUMA) GetProcAddress(kernel32, "VirtualAllocExNuma");
+      VirtualFreeExProc =
+	(PFN_VIRTUALFREEEX) GetProcAddress(kernel32, "VirtualFreeEx");
+    }
+
+    if (GetActiveProcessorGroupCountProc)
+      nr_processor_groups = GetActiveProcessorGroupCountProc();
+
+    /* fall back to psapi.dll's QueryWorkingSetEx when kernel32 does not
+     * export K32QueryWorkingSetEx */
+    if (!QueryWorkingSetExProc) {
+      HMODULE psapi = LoadLibrary("psapi.dll");
+      if (psapi)
+        QueryWorkingSetExProc = (PFN_QUERYWORKINGSETEX) GetProcAddress(psapi, "QueryWorkingSetEx");
+    }
+}
+
+/*
+ * ULONG_PTR and DWORD_PTR are 64/32bits depending on the arch
+ * while bitmaps use unsigned long (always 32bits)
+ */
+
+/* Overwrite `set` with the bits of `mask`.
+ * On 64-bit builds one ULONG_PTR spans two 32-bit bitmap ulongs. */
+static void hwloc_bitmap_from_ULONG_PTR(hwloc_bitmap_t set, ULONG_PTR mask)
+{
+#if SIZEOF_VOID_P == 8
+  hwloc_bitmap_from_ulong(set, mask & 0xffffffff);
+  hwloc_bitmap_set_ith_ulong(set, 1, mask >> 32);
+#else
+  hwloc_bitmap_from_ulong(set, mask);
+#endif
+}
+
+/* Overwrite `set` with `mask` placed at ULONG_PTR slot `i`
+ * (slot i maps to bitmap ulongs 2*i and 2*i+1 on 64-bit). */
+static void hwloc_bitmap_from_ith_ULONG_PTR(hwloc_bitmap_t set, unsigned i, ULONG_PTR mask)
+{
+#if SIZEOF_VOID_P == 8
+  hwloc_bitmap_from_ith_ulong(set, 2*i, mask & 0xffffffff);
+  hwloc_bitmap_set_ith_ulong(set, 2*i+1, mask >> 32);
+#else
+  hwloc_bitmap_from_ith_ulong(set, i, mask);
+#endif
+}
+
+/* OR `mask` into `set` at ULONG_PTR slot `i`, keeping existing bits
+ * (unlike hwloc_bitmap_from_ith_ULONG_PTR, which resets the bitmap). */
+static void hwloc_bitmap_set_ith_ULONG_PTR(hwloc_bitmap_t set, unsigned i, ULONG_PTR mask)
+{
+#if SIZEOF_VOID_P == 8
+  hwloc_bitmap_set_ith_ulong(set, 2*i, mask & 0xffffffff);
+  hwloc_bitmap_set_ith_ulong(set, 2*i+1, mask >> 32);
+#else
+  hwloc_bitmap_set_ith_ulong(set, i, mask);
+#endif
+}
+
+/* Return the first ULONG_PTR worth of bits of `set`.
+ * On 64-bit builds this recombines bitmap ulongs 0 (low) and 1 (high). */
+static ULONG_PTR hwloc_bitmap_to_ULONG_PTR(hwloc_const_bitmap_t set)
+{
+#if SIZEOF_VOID_P == 8
+  ULONG_PTR up = hwloc_bitmap_to_ith_ulong(set, 1);
+  up <<= 32;
+  up |= hwloc_bitmap_to_ulong(set);
+  return up;
+#else
+  return hwloc_bitmap_to_ulong(set);
+#endif
+}
+
+/* Return ULONG_PTR slot `i` of `set`
+ * (bitmap ulongs 2*i low / 2*i+1 high on 64-bit builds). */
+static ULONG_PTR hwloc_bitmap_to_ith_ULONG_PTR(hwloc_const_bitmap_t set, unsigned i)
+{
+#if SIZEOF_VOID_P == 8
+  ULONG_PTR up = hwloc_bitmap_to_ith_ulong(set, 2*i+1);
+  up <<= 32;
+  up |= hwloc_bitmap_to_ith_ulong(set, 2*i);
+  return up;
+#else
+  return hwloc_bitmap_to_ith_ulong(set, i);
+#endif
+}
+
+/* convert set into index+mask if all set bits are in the same ULONG_PTR.
+ * otherwise return -1.
+ */
+static int hwloc_bitmap_to_single_ULONG_PTR(hwloc_const_bitmap_t set, unsigned *index, ULONG_PTR *mask)
+{
+  unsigned first_ulp, last_ulp;
+  /* a weight of -1 means an infinitely-set bitmap, which cannot fit */
+  if (hwloc_bitmap_weight(set) == -1)
+    return -1;
+  /* NOTE(review): an empty set makes hwloc_bitmap_first() return -1, which
+   * yields a huge first_ulp here — presumably callers never pass empty sets;
+   * confirm */
+  first_ulp = hwloc_bitmap_first(set) / (sizeof(ULONG_PTR)*8);
+  last_ulp = hwloc_bitmap_last(set) / (sizeof(ULONG_PTR)*8);
+  if (first_ulp != last_ulp)
+    return -1;
+  *mask = hwloc_bitmap_to_ith_ULONG_PTR(set, first_ulp);
+  *index = first_ulp;
+  return 0;
+}
+
+/**************************************************************
+ * hwloc PU numbering with respect to Windows processor groups
+ *
+ * Everywhere below we reserve 64 physical indexes per processor groups because that's
+ * the maximum (MAXIMUM_PROC_PER_GROUP). Windows may actually use less bits than that
+ * in some groups (either to avoid splitting NUMA nodes across groups, or because of OS
+ * tweaks such as "bcdedit /set groupsize 8") but we keep some unused indexes for simplicity.
+ * That means PU physical indexes and cpusets may be non-contiguous.
+ * That also means hwloc_fallback_nbprocessors() below must return the last PU index + 1
+ * instead of the actual number of processors.
+ */
+
+/********************
+ * last_cpu_location
+ */
+
+/* Fill `set` with the single PU the current thread last ran on.
+ * Uses GetCurrentProcessorNumberEx() (group-aware) when there are several
+ * processor groups or the legacy call is missing; otherwise the group-0-only
+ * GetCurrentProcessorNumber().  Always returns 0. */
+static int
+hwloc_win_get_thisthread_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_cpuset_t set, int flags __hwloc_attribute_unused)
+{
+  assert(GetCurrentProcessorNumberExProc || (GetCurrentProcessorNumberProc && nr_processor_groups == 1));
+
+  if (nr_processor_groups > 1 || !GetCurrentProcessorNumberProc) {
+    PROCESSOR_NUMBER num;
+    GetCurrentProcessorNumberExProc(&num);
+    hwloc_bitmap_from_ith_ULONG_PTR(set, num.Group, ((ULONG_PTR)1) << num.Number);
+    return 0;
+  }
+
+  /* single processor group: the PU index fits in ULONG_PTR slot 0 */
+  hwloc_bitmap_from_ith_ULONG_PTR(set, 0, ((ULONG_PTR)1) << GetCurrentProcessorNumberProc());
+  return 0;
+}
+
+/* TODO: hwloc_win_get_thisproc_last_cpu_location() using
+ * CreateToolhelp32Snapshot(), Thread32First/Next()
+ * th.th32OwnerProcessID == GetCurrentProcessId() for filtering within process
+ * OpenThread(THREAD_SET_INFORMATION|THREAD_QUERY_INFORMATION, FALSE, te32.th32ThreadID) to get a handle.
+ */
+
+
+/******************************
+ * set cpu/membind for threads
+ */
+
+/* TODO: SetThreadIdealProcessor{,Ex} */
+
+/* Bind `thread` to the PUs in `hwloc_set`.
+ * The set must fit inside one processor group (one ULONG_PTR); otherwise
+ * this fails with ENOSYS.  HWLOC_CPUBIND_NOMEMBIND is not supported. */
+static int
+hwloc_win_set_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t thread, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  DWORD_PTR mask;
+  unsigned group;
+
+  if (flags & HWLOC_CPUBIND_NOMEMBIND) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  /* reject sets spanning multiple processor groups */
+  if (hwloc_bitmap_to_single_ULONG_PTR(hwloc_set, &group, &mask) < 0) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  assert(nr_processor_groups == 1 || SetThreadGroupAffinityProc);
+
+  if (nr_processor_groups > 1) {
+    GROUP_AFFINITY aff;
+    memset(&aff, 0, sizeof(aff)); /* we get Invalid Parameter error if Reserved field isn't cleared */
+    aff.Group = group;
+    aff.Mask = mask;
+    if (!SetThreadGroupAffinityProc(thread, &aff, NULL))
+      return -1;
+
+  } else {
+    /* SetThreadAffinityMask() only changes the mask inside the current processor group */
+    /* The resulting binding is always strict */
+    if (!SetThreadAffinityMask(thread, mask))
+      return -1;
+  }
+  return 0;
+}
+
+/* Bind the calling thread; thin wrapper around hwloc_win_set_thread_cpubind(). */
+static int
+hwloc_win_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  return hwloc_win_set_thread_cpubind(topology, GetCurrentThread(), hwloc_set, flags);
+}
+
+/* Emulate memory binding for the current thread by CPU-binding it to the
+ * cpuset covering `nodeset`.  Only HWLOC_MEMBIND_DEFAULT and
+ * HWLOC_MEMBIND_BIND are supported; other policies, or
+ * HWLOC_MEMBIND_NOCPUBIND, fail with ENOSYS. */
+static int
+hwloc_win_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  int ret;
+  hwloc_const_cpuset_t cpuset;
+  hwloc_cpuset_t _cpuset = NULL;
+
+  if ((policy != HWLOC_MEMBIND_DEFAULT && policy != HWLOC_MEMBIND_BIND)
+      || flags & HWLOC_MEMBIND_NOCPUBIND) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  if (policy == HWLOC_MEMBIND_DEFAULT) {
+    cpuset = hwloc_topology_get_complete_cpuset(topology);
+  } else {
+    /* convert the nodeset into the cpuset covering the same NUMA nodes */
+    cpuset = _cpuset = hwloc_bitmap_alloc();
+    hwloc_cpuset_from_nodeset(topology, _cpuset, nodeset);
+  }
+
+  ret = hwloc_win_set_thisthread_cpubind(topology, cpuset,
+					 (flags & HWLOC_MEMBIND_STRICT) ? HWLOC_CPUBIND_STRICT : 0);
+  hwloc_bitmap_free(_cpuset);
+  return ret;
+}
+
+
+/******************************
+ * get cpu/membind for threads
+ */
+
+/* Retrieve `thread`'s binding with GetThreadGroupAffinity().
+ * A thread affinity lives in a single group, so one (group, mask) pair
+ * fully describes it. */
+static int
+hwloc_win_get_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t thread, hwloc_cpuset_t set, int flags __hwloc_attribute_unused)
+{
+  GROUP_AFFINITY aff;
+
+  assert(GetThreadGroupAffinityProc);
+
+  if (!GetThreadGroupAffinityProc(thread, &aff))
+    return -1;
+  hwloc_bitmap_from_ith_ULONG_PTR(set, aff.Group, aff.Mask);
+  return 0;
+}
+
+/* Binding of the calling thread; wrapper around hwloc_win_get_thread_cpubind(). */
+static int
+hwloc_win_get_thisthread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_cpuset_t set, int flags __hwloc_attribute_unused)
+{
+  return hwloc_win_get_thread_cpubind(topology, GetCurrentThread(), set, flags);
+}
+
+/* Report the current thread's membind as HWLOC_MEMBIND_BIND over the NUMA
+ * nodes covering its cpubind (membind is emulated through cpubind here). */
+static int
+hwloc_win_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  int ret;
+  hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
+  ret = hwloc_win_get_thread_cpubind(topology, GetCurrentThread(), cpuset, flags);
+  if (!ret) {
+    *policy = HWLOC_MEMBIND_BIND;
+    hwloc_cpuset_to_nodeset(topology, cpuset, nodeset);
+  }
+  hwloc_bitmap_free(cpuset);
+  return ret;
+}
+
+
+/********************************
+ * set cpu/membind for processes
+ */
+
+/* Bind process `proc` with SetProcessAffinityMask().
+ * Only usable on machines with a single processor group (see assert);
+ * HWLOC_CPUBIND_NOMEMBIND is not supported. */
+static int
+hwloc_win_set_proc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t proc, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  DWORD_PTR mask;
+
+  assert(nr_processor_groups == 1);
+
+  if (flags & HWLOC_CPUBIND_NOMEMBIND) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  /* TODO: SetThreadGroupAffinity() for all threads doesn't enforce the whole process affinity,
+   * maybe because of process-specific resource locality */
+  /* TODO: if we are in a single group (check with GetProcessGroupAffinity()),
+   * SetProcessAffinityMask() changes the binding within that same group.
+   */
+  /* TODO: NtSetInformationProcess() works very well for binding to any mask in a single group,
+   * but it's an internal routine.
+   */
+  /* TODO: checks whether hwloc-bind.c needs to pass INHERIT_PARENT_AFFINITY to CreateProcess() instead of execvp(). */
+
+  /* The resulting binding is always strict */
+  mask = hwloc_bitmap_to_ULONG_PTR(hwloc_set);
+  if (!SetProcessAffinityMask(proc, mask))
+    return -1;
+  return 0;
+}
+
+/* Bind the calling process; wrapper around hwloc_win_set_proc_cpubind(). */
+static int
+hwloc_win_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  return hwloc_win_set_proc_cpubind(topology, GetCurrentProcess(), hwloc_set, flags);
+}
+
+/* Emulate memory binding for process `pid` by CPU-binding it to the cpuset
+ * covering `nodeset`.  Only DEFAULT and BIND policies are supported;
+ * other policies or HWLOC_MEMBIND_NOCPUBIND fail with ENOSYS. */
+static int
+hwloc_win_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  int ret;
+  hwloc_const_cpuset_t cpuset;
+  hwloc_cpuset_t _cpuset = NULL;
+
+  if ((policy != HWLOC_MEMBIND_DEFAULT && policy != HWLOC_MEMBIND_BIND)
+      || flags & HWLOC_MEMBIND_NOCPUBIND) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  if (policy == HWLOC_MEMBIND_DEFAULT) {
+    cpuset = hwloc_topology_get_complete_cpuset(topology);
+  } else {
+    /* convert the nodeset into the cpuset covering the same NUMA nodes */
+    cpuset = _cpuset = hwloc_bitmap_alloc();
+    hwloc_cpuset_from_nodeset(topology, _cpuset, nodeset);
+  }
+
+  ret = hwloc_win_set_proc_cpubind(topology, pid, cpuset,
+				   (flags & HWLOC_MEMBIND_STRICT) ? HWLOC_CPUBIND_STRICT : 0);
+  hwloc_bitmap_free(_cpuset);
+  return ret;
+}
+
+/* Membind the calling process; wrapper around hwloc_win_set_proc_membind(). */
+static int
+hwloc_win_set_thisproc_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_win_set_proc_membind(topology, GetCurrentProcess(), nodeset, policy, flags);
+}
+
+
+/********************************
+ * get cpu/membind for processes
+ */
+
+/* Retrieve process binding with GetProcessAffinityMask().
+ * Only usable on machines with a single processor group (see assert);
+ * HWLOC_CPUBIND_NOMEMBIND is not supported. */
+static int
+hwloc_win_get_proc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t proc, hwloc_bitmap_t hwloc_set, int flags)
+{
+  DWORD_PTR proc_mask, sys_mask;
+
+  assert(nr_processor_groups == 1);
+
+  if (flags & HWLOC_CPUBIND_NOMEMBIND) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  /* TODO: if we are in a single group (check with GetProcessGroupAffinity()),
+   * GetProcessAffinityMask() gives the mask within that group.
+   */
+  /* TODO: if we are in multiple groups, GetProcessGroupAffinity() gives their IDs,
+   * but we don't know their masks.
+   */
+  /* TODO: GetThreadGroupAffinity() for all threads can be smaller than the whole process affinity,
+   * maybe because of process-specific resource locality.
+   */
+
+  if (!GetProcessAffinityMask(proc, &proc_mask, &sys_mask))
+    return -1;
+  hwloc_bitmap_from_ULONG_PTR(hwloc_set, proc_mask);
+  return 0;
+}
+
+/* Report process `pid`'s membind as HWLOC_MEMBIND_BIND over the NUMA nodes
+ * covering its cpubind (membind is emulated through cpubind here). */
+static int
+hwloc_win_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  int ret;
+  hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
+  ret = hwloc_win_get_proc_cpubind(topology, pid, cpuset,
+				   (flags & HWLOC_MEMBIND_STRICT) ? HWLOC_CPUBIND_STRICT : 0);
+  if (!ret) {
+    *policy = HWLOC_MEMBIND_BIND;
+    hwloc_cpuset_to_nodeset(topology, cpuset, nodeset);
+  }
+  hwloc_bitmap_free(cpuset);
+  return ret;
+}
+
+/* Binding of the calling process; wrapper around hwloc_win_get_proc_cpubind(). */
+static int
+hwloc_win_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_cpuset, int flags)
+{
+  return hwloc_win_get_proc_cpubind(topology, GetCurrentProcess(), hwloc_cpuset, flags);
+}
+
+/* Membind of the calling process; wrapper around hwloc_win_get_proc_membind(). */
+static int
+hwloc_win_get_thisproc_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  return hwloc_win_get_proc_membind(topology, GetCurrentProcess(), nodeset, policy, flags);
+}
+
+
+/************************
+ * membind alloc/free
+ */
+
+/* Allocate `len` bytes of committed memory with no NUMA preference.
+ * NOTE(review): pages are mapped PAGE_EXECUTE_READWRITE (writable AND
+ * executable) — confirm the executable permission is intentional. */
+static void *
+hwloc_win_alloc(hwloc_topology_t topology __hwloc_attribute_unused, size_t len) {
+  return VirtualAlloc(NULL, len, MEM_COMMIT|MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+}
+
+/* Allocate `len` bytes preferably on the NUMA node in `nodeset` using
+ * VirtualAllocExNuma().  Supports DEFAULT and BIND policies only, a single
+ * target node, and no STRICT flag; otherwise falls back per
+ * hwloc_alloc_or_fail() or fails. */
+static void *
+hwloc_win_alloc_membind(hwloc_topology_t topology __hwloc_attribute_unused, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) {
+  int node;
+
+  switch (policy) {
+    case HWLOC_MEMBIND_DEFAULT:
+    case HWLOC_MEMBIND_BIND:
+      break;
+    default:
+      errno = ENOSYS;
+      return hwloc_alloc_or_fail(topology, len, flags);
+  }
+
+  /* VirtualAllocExNuma() only expresses a preference, so STRICT cannot be honored */
+  if (flags & HWLOC_MEMBIND_STRICT) {
+    errno = ENOSYS;
+    return NULL;
+  }
+
+  /* binding to all nodes is the same as no binding at all */
+  if (policy == HWLOC_MEMBIND_DEFAULT
+      || hwloc_bitmap_isequal(nodeset, hwloc_topology_get_complete_nodeset(topology)))
+    return hwloc_win_alloc(topology, len);
+
+  if (hwloc_bitmap_weight(nodeset) != 1) {
+    /* Not a single node, can't do this */
+    errno = EXDEV;
+    return hwloc_alloc_or_fail(topology, len, flags);
+  }
+
+  /* NOTE(review): assumes VirtualAllocExNumaProc was resolved — presumably
+   * these hooks are only registered when it is available; confirm */
+  node = hwloc_bitmap_first(nodeset);
+  return VirtualAllocExNumaProc(GetCurrentProcess(), NULL, len, MEM_COMMIT|MEM_RESERVE, PAGE_EXECUTE_READWRITE, node);
+}
+
+/* Release memory obtained from hwloc_win_alloc_membind().
+ * `len` is unused: VirtualFreeEx() with MEM_RELEASE requires a size of 0
+ * and frees the whole original reservation. */
+static int
+hwloc_win_free_membind(hwloc_topology_t topology __hwloc_attribute_unused, void *addr, size_t len __hwloc_attribute_unused) {
+  if (!addr)
+    return 0;
+  if (!VirtualFreeExProc(GetCurrentProcess(), addr, 0, MEM_RELEASE))
+    return -1;
+  return 0;
+}
+
+
+/**********************
+ * membind for areas
+ */
+
+/* Fill `nodeset` with the NUMA nodes hosting the pages of [addr, addr+len),
+ * using QueryWorkingSetEx() per-page attributes.  Pages whose attributes are
+ * not Valid are skipped. */
+static int
+hwloc_win_get_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags __hwloc_attribute_unused)
+{
+  SYSTEM_INFO SystemInfo;
+  DWORD page_size;
+  uintptr_t start;
+  unsigned nb;
+  PSAPI_WORKING_SET_EX_INFORMATION *pv;
+  unsigned i;
+
+  GetSystemInfo(&SystemInfo);
+  page_size = SystemInfo.dwPageSize;
+
+  /* round the range outwards to whole pages */
+  start = (((uintptr_t) addr) / page_size) * page_size;
+  nb = (unsigned)((((uintptr_t) addr + len - start) + page_size - 1) / page_size);
+
+  /* query at least the page containing addr, even for len == 0 */
+  if (!nb)
+    nb = 1;
+
+  pv = calloc(nb, sizeof(*pv));
+  if (!pv)
+    return -1;
+
+  for (i = 0; i < nb; i++)
+    pv[i].VirtualAddress = (void*) (start + i * page_size);
+  /* NOTE(review): assumes QueryWorkingSetExProc was resolved (kernel32 or
+   * psapi fallback) — confirm callers only register this hook when set */
+  if (!QueryWorkingSetExProc(GetCurrentProcess(), pv, nb * sizeof(*pv))) {
+    free(pv);
+    return -1;
+  }
+
+  for (i = 0; i < nb; i++) {
+    if (pv[i].VirtualAttributes.Valid)
+      hwloc_bitmap_set(nodeset, pv[i].VirtualAttributes.Node);
+  }
+
+  free(pv);
+  return 0;
+}
+
+
+/*************************
+ * discovery
+ */
+
+static int
+hwloc_look_windows(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  hwloc_bitmap_t groups_pu_set = NULL;
+  SYSTEM_INFO SystemInfo;
+  DWORD length;
+  int gotnuma = 0;
+  int gotnumamemory = 0;
+
+  if (topology->levels[0][0]->cpuset)
+    /* somebody discovered things */
+    return -1;
+
+  hwloc_alloc_root_sets(topology->levels[0][0]);
+
+  GetSystemInfo(&SystemInfo);
+
+  if (!GetLogicalProcessorInformationExProc && GetLogicalProcessorInformationProc) {
+      PSYSTEM_LOGICAL_PROCESSOR_INFORMATION procInfo, tmpprocInfo;
+      unsigned id;
+      unsigned i;
+      struct hwloc_obj *obj;
+      hwloc_obj_type_t type;
+
+      length = 0;
+      procInfo = NULL;
+
+      while (1) {
+	if (GetLogicalProcessorInformationProc(procInfo, &length))
+	  break;
+	if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+	  return -1;
+	tmpprocInfo = realloc(procInfo, length);
+	if (!tmpprocInfo) {
+	  free(procInfo);
+	  goto out;
+	}
+	procInfo = tmpprocInfo;
+      }
+
+      assert(!length || procInfo);
+
+      for (i = 0; i < length / sizeof(*procInfo); i++) {
+
+        /* Ignore unknown caches */
+	if (procInfo->Relationship == RelationCache
+		&& procInfo->Cache.Type != CacheUnified
+		&& procInfo->Cache.Type != CacheData
+		&& procInfo->Cache.Type != CacheInstruction)
+	  continue;
+
+	id = HWLOC_UNKNOWN_INDEX;
+	switch (procInfo[i].Relationship) {
+	  case RelationNumaNode:
+	    type = HWLOC_OBJ_NUMANODE;
+	    id = procInfo[i].NumaNode.NodeNumber;
+	    gotnuma++;
+	    if (id > max_numanode_index)
+	      max_numanode_index = id;
+	    break;
+	  case RelationProcessorPackage:
+	    type = HWLOC_OBJ_PACKAGE;
+	    break;
+	  case RelationCache:
+	    type = (procInfo[i].Cache.Type == CacheInstruction ? HWLOC_OBJ_L1ICACHE : HWLOC_OBJ_L1CACHE) + procInfo[i].Cache.Level - 1;
+	    break;
+	  case RelationProcessorCore:
+	    type = HWLOC_OBJ_CORE;
+	    break;
+	  case RelationGroup:
+	  default:
+	    type = HWLOC_OBJ_GROUP;
+	    break;
+	}
+
+	if (!hwloc_filter_check_keep_object_type(topology, type))
+	  continue;
+
+	obj = hwloc_alloc_setup_object(topology, type, id);
+        obj->cpuset = hwloc_bitmap_alloc();
+	hwloc_debug("%s#%u mask %llx\n", hwloc_obj_type_string(type), id, (unsigned long long) procInfo[i].ProcessorMask);
+	/* ProcessorMask is a ULONG_PTR */
+	hwloc_bitmap_set_ith_ULONG_PTR(obj->cpuset, 0, procInfo[i].ProcessorMask);
+	hwloc_debug_2args_bitmap("%s#%u bitmap %s\n", hwloc_obj_type_string(type), id, obj->cpuset);
+
+	switch (type) {
+	  case HWLOC_OBJ_NUMANODE:
+	    {
+	      ULONGLONG avail;
+	      obj->nodeset = hwloc_bitmap_alloc();
+	      hwloc_bitmap_set(obj->nodeset, id);
+	      if ((GetNumaAvailableMemoryNodeExProc && GetNumaAvailableMemoryNodeExProc(id, &avail))
+		  || (GetNumaAvailableMemoryNodeProc && GetNumaAvailableMemoryNodeProc(id, &avail))) {
+		obj->attr->numanode.local_memory = avail;
+		gotnumamemory++;
+	      }
+	      obj->attr->numanode.page_types_len = 2;
+	      obj->attr->numanode.page_types = malloc(2 * sizeof(*obj->attr->numanode.page_types));
+	      memset(obj->attr->numanode.page_types, 0, 2 * sizeof(*obj->attr->numanode.page_types));
+	      obj->attr->numanode.page_types_len = 1;
+	      obj->attr->numanode.page_types[0].size = SystemInfo.dwPageSize;
+#if HAVE_DECL__SC_LARGE_PAGESIZE
+	      obj->attr->numanode.page_types_len++;
+	      obj->attr->numanode.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+	      break;
+	    }
+	  case HWLOC_OBJ_L1CACHE:
+	  case HWLOC_OBJ_L2CACHE:
+	  case HWLOC_OBJ_L3CACHE:
+	  case HWLOC_OBJ_L4CACHE:
+	  case HWLOC_OBJ_L5CACHE:
+	  case HWLOC_OBJ_L1ICACHE:
+	  case HWLOC_OBJ_L2ICACHE:
+	  case HWLOC_OBJ_L3ICACHE:
+	    obj->attr->cache.size = procInfo[i].Cache.Size;
+	    obj->attr->cache.associativity = procInfo[i].Cache.Associativity == CACHE_FULLY_ASSOCIATIVE ? -1 : procInfo[i].Cache.Associativity ;
+	    obj->attr->cache.linesize = procInfo[i].Cache.LineSize;
+	    obj->attr->cache.depth = procInfo[i].Cache.Level;
+	    switch (procInfo->Cache.Type) {
+	      case CacheUnified:
+		obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+		break;
+	      case CacheData:
+		obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+		break;
+	      case CacheInstruction:
+		obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+		break;
+	      default:
+		hwloc_free_unlinked_object(obj);
+		continue;
+	    }
+	    break;
+	  case HWLOC_OBJ_GROUP:
+	    obj->attr->group.kind = procInfo[i].Relationship == RelationGroup ? HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP : HWLOC_GROUP_KIND_WINDOWS_RELATIONSHIP_UNKNOWN;
+	    break;
+	  default:
+	    break;
+	}
+	hwloc_insert_object_by_cpuset(topology, obj);
+      }
+
+      free(procInfo);
+  }
+
+  if (GetLogicalProcessorInformationExProc) {
+      PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX procInfoTotal, tmpprocInfoTotal, procInfo;
+      unsigned id;
+      struct hwloc_obj *obj;
+      hwloc_obj_type_t type;
+
+      length = 0;
+      procInfoTotal = NULL;
+
+      while (1) {
+	if (GetLogicalProcessorInformationExProc(RelationAll, procInfoTotal, &length))
+	  break;
+	if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+	  return -1;
+        tmpprocInfoTotal = realloc(procInfoTotal, length);
+	if (!tmpprocInfoTotal) {
+	  free(procInfoTotal);
+	  goto out;
+	}
+	procInfoTotal = tmpprocInfoTotal;
+      }
+
+      for (procInfo = procInfoTotal;
+	   (void*) procInfo < (void*) ((uintptr_t) procInfoTotal + length);
+	   procInfo = (void*) ((uintptr_t) procInfo + procInfo->Size)) {
+        unsigned num, i;
+        GROUP_AFFINITY *GroupMask;
+
+        /* Ignore unknown caches */
+	if (procInfo->Relationship == RelationCache
+		&& procInfo->Cache.Type != CacheUnified
+		&& procInfo->Cache.Type != CacheData
+		&& procInfo->Cache.Type != CacheInstruction)
+	  continue;
+
+	id = HWLOC_UNKNOWN_INDEX;
+	switch (procInfo->Relationship) {
+	  case RelationNumaNode:
+	    type = HWLOC_OBJ_NUMANODE;
+            num = 1;
+            GroupMask = &procInfo->NumaNode.GroupMask;
+	    id = procInfo->NumaNode.NodeNumber;
+	    gotnuma++;
+	    if (id > max_numanode_index)
+	      max_numanode_index = id;
+	    break;
+	  case RelationProcessorPackage:
+	    type = HWLOC_OBJ_PACKAGE;
+            num = procInfo->Processor.GroupCount;
+            GroupMask = procInfo->Processor.GroupMask;
+	    break;
+	  case RelationCache:
+	    type = (procInfo->Cache.Type == CacheInstruction ? HWLOC_OBJ_L1ICACHE : HWLOC_OBJ_L1CACHE) + procInfo->Cache.Level - 1;
+            num = 1;
+            GroupMask = &procInfo->Cache.GroupMask;
+	    break;
+	  case RelationProcessorCore:
+	    type = HWLOC_OBJ_CORE;
+            num = procInfo->Processor.GroupCount;
+            GroupMask = procInfo->Processor.GroupMask;
+	    break;
+	  case RelationGroup:
+	    /* So strange an interface... */
+	    for (id = 0; id < procInfo->Group.ActiveGroupCount; id++) {
+              KAFFINITY mask;
+	      hwloc_bitmap_t set;
+
+	      set = hwloc_bitmap_alloc();
+	      mask = procInfo->Group.GroupInfo[id].ActiveProcessorMask;
+	      hwloc_debug("group %u %d cpus mask %lx\n", id,
+			  procInfo->Group.GroupInfo[id].ActiveProcessorCount, mask);
+	      /* KAFFINITY is ULONG_PTR */
+	      hwloc_bitmap_set_ith_ULONG_PTR(set, id, mask);
+	      /* FIXME: what if running 32bits on a 64bits windows with 64-processor groups?
+	       * ULONG_PTR is 32bits, so half the group is invisible?
+	       * maybe scale id to id*8/sizeof(ULONG_PTR) so that groups are 64-PU aligned?
+	       */
+	      hwloc_debug_2args_bitmap("group %u %d bitmap %s\n", id, procInfo->Group.GroupInfo[id].ActiveProcessorCount, set);
+
+	      /* save the set of PUs so that we can create them at the end */
+	      if (!groups_pu_set)
+		groups_pu_set = hwloc_bitmap_alloc();
+	      hwloc_bitmap_or(groups_pu_set, groups_pu_set, set);
+
+	      if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
+		obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, id);
+		obj->cpuset = set;
+		obj->attr->group.kind = HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP;
+		hwloc_insert_object_by_cpuset(topology, obj);
+	      } else
+		hwloc_bitmap_free(set);
+	    }
+	    continue;
+	  default:
+	    /* Don't know how to get the mask.  */
+            hwloc_debug("unknown relation %d\n", procInfo->Relationship);
+	    continue;
+	}
+
+	if (!hwloc_filter_check_keep_object_type(topology, type))
+	  continue;
+
+	obj = hwloc_alloc_setup_object(topology, type, id);
+        obj->cpuset = hwloc_bitmap_alloc();
+        for (i = 0; i < num; i++) {
+          hwloc_debug("%s#%u %d: mask %d:%lx\n", hwloc_obj_type_string(type), id, i, GroupMask[i].Group, GroupMask[i].Mask);
+	  /* GROUP_AFFINITY.Mask is KAFFINITY, which is ULONG_PTR */
+	  hwloc_bitmap_set_ith_ULONG_PTR(obj->cpuset, GroupMask[i].Group, GroupMask[i].Mask);
+	  /* FIXME: scale id to id*8/sizeof(ULONG_PTR) as above? */
+        }
+	hwloc_debug_2args_bitmap("%s#%u bitmap %s\n", hwloc_obj_type_string(type), id, obj->cpuset);
+	switch (type) {
+	  case HWLOC_OBJ_NUMANODE:
+	    {
+	      ULONGLONG avail;
+	      obj->nodeset = hwloc_bitmap_alloc();
+	      hwloc_bitmap_set(obj->nodeset, id);
+	      if ((GetNumaAvailableMemoryNodeExProc && GetNumaAvailableMemoryNodeExProc(id, &avail))
+		  || (GetNumaAvailableMemoryNodeProc && GetNumaAvailableMemoryNodeProc(id, &avail))) {
+	        obj->attr->numanode.local_memory = avail;
+		gotnumamemory++;
+	      }
+	      obj->attr->numanode.page_types = malloc(2 * sizeof(*obj->attr->numanode.page_types));
+	      memset(obj->attr->numanode.page_types, 0, 2 * sizeof(*obj->attr->numanode.page_types));
+	      obj->attr->numanode.page_types_len = 1;
+	      obj->attr->numanode.page_types[0].size = SystemInfo.dwPageSize;
+#if HAVE_DECL__SC_LARGE_PAGESIZE
+	      obj->attr->numanode.page_types_len++;
+	      obj->attr->numanode.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+	      break;
+	    }
+	  case HWLOC_OBJ_L1CACHE:
+	  case HWLOC_OBJ_L2CACHE:
+	  case HWLOC_OBJ_L3CACHE:
+	  case HWLOC_OBJ_L4CACHE:
+	  case HWLOC_OBJ_L5CACHE:
+	  case HWLOC_OBJ_L1ICACHE:
+	  case HWLOC_OBJ_L2ICACHE:
+	  case HWLOC_OBJ_L3ICACHE:
+	    obj->attr->cache.size = procInfo->Cache.CacheSize;
+	    obj->attr->cache.associativity = procInfo->Cache.Associativity == CACHE_FULLY_ASSOCIATIVE ? -1 : procInfo->Cache.Associativity ;
+	    obj->attr->cache.linesize = procInfo->Cache.LineSize;
+	    obj->attr->cache.depth = procInfo->Cache.Level;
+	    switch (procInfo->Cache.Type) {
+	      case CacheUnified:
+		obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+		break;
+	      case CacheData:
+		obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+		break;
+	      case CacheInstruction:
+		obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+		break;
+	      default:
+		hwloc_free_unlinked_object(obj);
+		continue;
+	    }
+	    break;
+	  default:
+	    break;
+	}
+	hwloc_insert_object_by_cpuset(topology, obj);
+      }
+      free(procInfoTotal);
+  }
+
+  topology->support.discovery->pu = 1;
+  topology->support.discovery->numa = gotnuma;
+  topology->support.discovery->numa_memory = gotnumamemory;
+
+  if (groups_pu_set) {
+    /* the system supports multiple Groups.
+     * PU indexes may be discontiguous, especially if Groups contain less than 64 procs.
+     */
+    hwloc_obj_t obj;
+    unsigned idx;
+    hwloc_bitmap_foreach_begin(idx, groups_pu_set) {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, idx);
+      obj->cpuset = hwloc_bitmap_alloc();
+      hwloc_bitmap_only(obj->cpuset, idx);
+      hwloc_debug_1arg_bitmap("cpu %u has cpuset %s\n",
+			      idx, obj->cpuset);
+      hwloc_insert_object_by_cpuset(topology, obj);
+    } hwloc_bitmap_foreach_end();
+    hwloc_bitmap_free(groups_pu_set);
+  } else {
+    /* no processor groups */
+    SYSTEM_INFO sysinfo;
+    hwloc_obj_t obj;
+    unsigned idx;
+    GetSystemInfo(&sysinfo);
+    for(idx=0; idx<32; idx++)
+      if (sysinfo.dwActiveProcessorMask & (((DWORD_PTR)1)<<idx)) {
+	obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, idx);
+	obj->cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_only(obj->cpuset, idx);
+	hwloc_debug_1arg_bitmap("cpu %u has cpuset %s\n",
+				idx, obj->cpuset);
+	hwloc_insert_object_by_cpuset(topology, obj);
+      }
+  }
+
+ out:
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Windows");
+  hwloc_add_uname_info(topology, NULL);
+  return 0;
+}
+
+/* Install the Windows CPU/memory binding hooks.
+ * Hooks that cannot address processors outside the current processor group
+ * are only installed when a single group exists; group-aware hooks are
+ * installed whenever the corresponding API entry point was resolved.
+ */
+void
+hwloc_set_windows_hooks(struct hwloc_binding_hooks *hooks,
+			struct hwloc_topology_support *support)
+{
+  if (GetCurrentProcessorNumberExProc || (GetCurrentProcessorNumberProc && nr_processor_groups == 1))
+    hooks->get_thisthread_last_cpu_location = hwloc_win_get_thisthread_last_cpu_location;
+
+  /* process-wide binding: only meaningful with a single processor group */
+  if (nr_processor_groups == 1) {
+    hooks->set_proc_cpubind = hwloc_win_set_proc_cpubind;
+    hooks->get_proc_cpubind = hwloc_win_get_proc_cpubind;
+    hooks->set_thisproc_cpubind = hwloc_win_set_thisproc_cpubind;
+    hooks->get_thisproc_cpubind = hwloc_win_get_thisproc_cpubind;
+    hooks->set_proc_membind = hwloc_win_set_proc_membind;
+    hooks->get_proc_membind = hwloc_win_get_proc_membind;
+    hooks->set_thisproc_membind = hwloc_win_set_thisproc_membind;
+    hooks->get_thisproc_membind = hwloc_win_get_thisproc_membind;
+  }
+  /* thread binding works across groups when SetThreadGroupAffinity is available */
+  if (nr_processor_groups == 1 || SetThreadGroupAffinityProc) {
+    hooks->set_thread_cpubind = hwloc_win_set_thread_cpubind;
+    hooks->set_thisthread_cpubind = hwloc_win_set_thisthread_cpubind;
+    hooks->set_thisthread_membind = hwloc_win_set_thisthread_membind;
+  }
+  if (GetThreadGroupAffinityProc) {
+    hooks->get_thread_cpubind = hwloc_win_get_thread_cpubind;
+    hooks->get_thisthread_cpubind = hwloc_win_get_thisthread_cpubind;
+    hooks->get_thisthread_membind = hwloc_win_get_thisthread_membind;
+  }
+
+  /* NUMA-aware allocation requires VirtualAllocExNuma */
+  if (VirtualAllocExNumaProc) {
+    hooks->alloc_membind = hwloc_win_alloc_membind;
+    hooks->alloc = hwloc_win_alloc;
+    hooks->free_membind = hwloc_win_free_membind;
+    support->membind->bind_membind = 1;
+  }
+
+  if (QueryWorkingSetExProc && max_numanode_index <= 63 /* PSAPI_WORKING_SET_EX_BLOCK.Node is 6 bits only */)
+    hooks->get_area_memlocation = hwloc_win_get_area_memlocation;
+}
+
+/* Component init callback: resolve the Windows API entry points once. */
+static int hwloc_windows_component_init(unsigned long flags __hwloc_attribute_unused)
+{
+  hwloc_win_get_function_ptrs();
+  return 0;
+}
+
+/* Component finalize callback: nothing to release for this backend. */
+static void hwloc_windows_component_finalize(unsigned long flags __hwloc_attribute_unused)
+{
+}
+
+/* Allocate a backend object whose discovery callback is hwloc_look_windows.
+ * Returns NULL when the backend allocation fails. */
+static struct hwloc_backend *
+hwloc_windows_component_instantiate(struct hwloc_disc_component *component,
+				    const void *_data1 __hwloc_attribute_unused,
+				    const void *_data2 __hwloc_attribute_unused,
+				    const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend;
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    return NULL;
+  backend->discover = hwloc_look_windows;
+  return backend;
+}
+
+/* Discovery component descriptor for the Windows backend.
+ * NOTE(review): positional initializer — field meanings are defined by
+ * struct hwloc_disc_component (private/components.h); verify there. */
+static struct hwloc_disc_component hwloc_windows_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "windows",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_windows_component_instantiate,
+  50,
+  1,
+  NULL
+};
+
+/* Top-level component descriptor exported to the hwloc core,
+ * wrapping the discovery component above with its init/finalize callbacks. */
+const struct hwloc_component hwloc_windows_component = {
+  HWLOC_COMPONENT_ABI,
+  hwloc_windows_component_init, hwloc_windows_component_finalize,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_windows_disc_component
+};
+
+/* Fallback processor count for when discovery cannot do better.
+ * Returns the number of processors in the current group, extended to all
+ * processor groups when more than one exists (assuming all groups except
+ * the last are full, matching how cpusets are laid out above). */
+int
+hwloc_fallback_nbprocessors(struct hwloc_topology *topology __hwloc_attribute_unused) {
+  int n;
+  SYSTEM_INFO sysinfo;
+
+  /* by default, ignore groups (return only the number in the current group) */
+  GetSystemInfo(&sysinfo);
+  n = sysinfo.dwNumberOfProcessors; /* FIXME could be non-contiguous, rather return a mask from dwActiveProcessorMask? */
+
+  if (nr_processor_groups > 1) {
+    /* assume n-1 groups are complete, since that's how we store things in cpusets */
+    if (GetActiveProcessorCountProc)
+      n = MAXIMUM_PROC_PER_GROUP*(nr_processor_groups-1)
+	+ GetActiveProcessorCountProc((WORD)nr_processor_groups-1);
+    else
+      n = MAXIMUM_PROC_PER_GROUP*nr_processor_groups;
+  }
+
+  return n;
+}
diff --git a/src/3rdparty/hwloc/src/topology-x86.c b/src/3rdparty/hwloc/src/topology-x86.c
new file mode 100644
index 000000000..4aefdcf1f
--- /dev/null
+++ b/src/3rdparty/hwloc/src/topology-x86.c
@@ -0,0 +1,1583 @@
+/*
+ * Copyright © 2010-2019 Inria.  All rights reserved.
+ * Copyright © 2010-2013 Université Bordeaux
+ * Copyright © 2010-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ *
+ *
+ * This backend is only used when the operating system does not export
+ * the necessary hardware topology information to user-space applications.
+ * Currently, only the FreeBSD backend relies on this x86 backend.
+ *
+ * Other backends such as Linux have their own way to retrieve various
+ * pieces of hardware topology information from the operating system
+ * on various architectures, without having to use this x86-specific code.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <private/misc.h>
+
+#include <private/cpuid-x86.h>
+
+#include <sys/types.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_VALGRIND_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
+/* Private state of the x86 backend. */
+struct hwloc_x86_backend_data_s {
+  unsigned nbprocs;
+  hwloc_bitmap_t apicid_set;  /* APIC ids seen so far */
+  int apicid_unique;          /* cleared when a duplicate APIC id is seen (see end of look_proc) */
+  char *src_cpuiddump_path;   /* directory of dumped cpuid files to replay instead of live cpuid, or NULL — see cpuiddump_read */
+  int is_knl;                 /* Intel KNL/KNM quirk: their L3 cpuid info is bogus and must be skipped */
+};
+
+/************************************
+ * Management of cpuid dump as input
+ */
+
+/* A recorded dump of cpuid results that can be replayed instead of
+ * executing the cpuid instruction on the live CPU. */
+struct cpuiddump {
+  unsigned nr;  /* number of valid entries in the array below */
+  struct cpuiddump_entry {
+    unsigned inmask; /* which of ine[abcd]x are set on input */
+    unsigned ineax;
+    unsigned inebx;
+    unsigned inecx;
+    unsigned inedx;
+    unsigned outeax;
+    unsigned outebx;
+    unsigned outecx;
+    unsigned outedx;
+  } *entries;
+};
+
+/* Release a dump and its entry array.
+ * NOTE(review): entries is only freed when nr != 0; if cpuiddump_read can
+ * return a dump with an allocated entries array but nr == 0 (e.g. a file
+ * containing only '#' comment lines), that array would leak — confirm. */
+static void
+cpuiddump_free(struct cpuiddump *cpuiddump)
+{
+  if (cpuiddump->nr)
+    free(cpuiddump->entries);
+  free(cpuiddump);
+}
+
+/* Load the cpuid dump for PU #idx from file "<dirpath>/pu<idx>".
+ * Each non-comment line holds hex fields
+ *   "inmask ineax inebx inecx inedx => outeax outebx outecx outedx";
+ * lines starting with '#' are ignored.
+ * Returns the parsed dump, or NULL (with a message on stderr) on failure. */
+static struct cpuiddump *
+cpuiddump_read(const char *dirpath, unsigned idx)
+{
+  struct cpuiddump *cpuiddump;
+  struct cpuiddump_entry *cur;
+  FILE *file;
+  char line[128];
+  unsigned nr;
+
+  cpuiddump = malloc(sizeof(*cpuiddump));
+  if (!cpuiddump) {
+    fprintf(stderr, "Failed to allocate cpuiddump for PU #%u, ignoring cpuiddump.\n", idx);
+    goto out;
+  }
+
+ {
+  size_t filenamelen = strlen(dirpath) + 15;
+  HWLOC_VLA(char, filename, filenamelen);
+  snprintf(filename, filenamelen, "%s/pu%u", dirpath, idx);
+  file = fopen(filename, "r");
+  if (!file) {
+    fprintf(stderr, "Could not read dumped cpuid file %s, ignoring cpuiddump.\n", filename);
+    goto out_with_dump;
+  }
+ }
+
+  /* first pass: count lines to size the entry array (comment lines included,
+   * so the array may be slightly larger than needed) */
+  nr = 0;
+  while (fgets(line, sizeof(line), file))
+    nr++;
+  cpuiddump->entries = malloc(nr * sizeof(struct cpuiddump_entry));
+  if (!cpuiddump->entries) {
+    fprintf(stderr, "Failed to allocate %u cpuiddump entries for PU #%u, ignoring cpuiddump.\n", nr, idx);
+    goto out_with_file;
+  }
+
+  /* second pass: parse the entries for real */
+  fseek(file, 0, SEEK_SET);
+  cur = &cpuiddump->entries[0];
+  nr = 0;
+  while (fgets(line, sizeof(line), file)) {
+    if (*line == '#')
+      continue;
+    if (sscanf(line, "%x %x %x %x %x => %x %x %x %x",
+	      &cur->inmask,
+	      &cur->ineax, &cur->inebx, &cur->inecx, &cur->inedx,
+	      &cur->outeax, &cur->outebx, &cur->outecx, &cur->outedx) == 9) {
+      cur++;
+      nr++;
+    }
+  }
+
+  cpuiddump->nr = nr;
+  fclose(file);
+  return cpuiddump;
+
+ out_with_file:
+  fclose(file);
+ out_with_dump:
+  free(cpuiddump);
+ out:
+  return NULL;
+}
+
+/* Replay a cpuid query from a dump: find the entry whose input registers
+ * match (only registers flagged in inmask are compared) and copy its output
+ * registers into eax/ebx/ecx/edx.
+ * On miss, warns on stderr and returns all-zero registers. */
+static void
+cpuiddump_find_by_input(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx, struct cpuiddump *cpuiddump)
+{
+  unsigned i;
+
+  for(i=0; i<cpuiddump->nr; i++) {
+    struct cpuiddump_entry *entry = &cpuiddump->entries[i];
+    if ((entry->inmask & 0x1) && *eax != entry->ineax)
+      continue;
+    if ((entry->inmask & 0x2) && *ebx != entry->inebx)
+      continue;
+    if ((entry->inmask & 0x4) && *ecx != entry->inecx)
+      continue;
+    if ((entry->inmask & 0x8) && *edx != entry->inedx)
+      continue;
+    *eax = entry->outeax;
+    *ebx = entry->outebx;
+    *ecx = entry->outecx;
+    *edx = entry->outedx;
+    return;
+  }
+
+  fprintf(stderr, "Couldn't find %x,%x,%x,%x in dumped cpuid, returning 0s.\n",
+	  *eax, *ebx, *ecx, *edx);
+  *eax = 0;
+  *ebx = 0;
+  *ecx = 0;
+  *edx = 0;
+}
+
+/* Execute cpuid on the live CPU, or replay it from the dump when one is loaded. */
+static void cpuid_or_from_dump(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx, struct cpuiddump *src_cpuiddump)
+{
+  if (src_cpuiddump) {
+    cpuiddump_find_by_input(eax, ebx, ecx, edx, src_cpuiddump);
+  } else {
+    hwloc_x86_cpuid(eax, ebx, ecx, edx);
+  }
+}
+
+/*******************************
+ * Core detection routines and structures
+ */
+
+/* Feature-bit tests on the cached cpuid feature words.
+ * NOTE(review): the features[] word indices are filled by the caller of
+ * look_proc (not visible here) — verify the index/bit mapping there. */
+#define has_topoext(features) ((features)[6] & (1 << 22))
+#define has_x2apic(features) ((features)[4] & (1 << 21))
+
+/* One cache as described by cpuid, collected per-PU in look_proc. */
+struct cacheinfo {
+  hwloc_obj_cache_type_t type;
+  unsigned level;
+  unsigned nbthreads_sharing;  /* number of APIC ids sharing this cache */
+  unsigned cacheid;            /* id used to group siblings, computed at the end of look_proc */
+
+  unsigned linesize;
+  unsigned linepart;
+  int inclusive;
+  int ways;                    /* -1 means fully associative */
+  unsigned sets;
+  unsigned long size;          /* in bytes */
+};
+
+/* Per-PU information gathered from cpuid, later analyzed globally by summarize(). */
+struct procinfo {
+  unsigned present;         /* set once look_proc ran for this PU */
+  unsigned apicid;
+  unsigned packageid;
+  unsigned dieid;
+  unsigned unitid;          /* NOTE(review): field order here is dieid/nodeid/unitid per declaration below */
+  unsigned nodeid;
+  unsigned threadid;
+  unsigned coreid;
+  unsigned *otherids;       /* ids of unrecognized x2APIC levels (UINT_MAX when none) */
+  unsigned levels;          /* number of x2APIC enumeration levels */
+  unsigned numcaches;
+  struct cacheinfo *cache;  /* array of numcaches entries */
+  char cpuvendor[13];       /* 12 chars + NUL, from cpuid 0x00 */
+  char cpumodel[3*4*4+1];   /* 48 chars + NUL, from cpuid 0x80000002-4 */
+  unsigned cpustepping;
+  unsigned cpumodelnumber;
+  unsigned cpufamilynumber;
+};
+
+/* CPU vendor classes used to select vendor-specific cpuid behavior and quirks. */
+enum cpuid_type {
+  intel,
+  amd,
+  zhaoxin,
+  hygon,
+  unknown
+};
+
+/* Record one cache described by the AMD legacy cpuid leaves 0x80000005/0x80000006.
+ * cpuid is the relevant output register; level (1-3) selects the size encoding.
+ * Caches reported with size 0 (absent) and allocation failures are silently ignored. */
+static void fill_amd_cache(struct procinfo *infos, unsigned level, hwloc_obj_cache_type_t type, unsigned nbthreads_sharing, unsigned cpuid)
+{
+  struct cacheinfo *cache, *tmpcaches;
+  unsigned cachenum;
+  unsigned long size = 0;
+
+  if (level == 1)
+    size = ((cpuid >> 24)) << 10;
+  else if (level == 2)
+    size = ((cpuid >> 16)) << 10;
+  else if (level == 3)
+    size = ((cpuid >> 18)) << 19;
+  if (!size)
+    return;
+
+  tmpcaches = realloc(infos->cache, (infos->numcaches+1)*sizeof(*infos->cache));
+  if (!tmpcaches)
+    /* failed to allocate, ignore that cache */
+    return;
+  infos->cache = tmpcaches;
+  cachenum = infos->numcaches++;
+
+  cache = &infos->cache[cachenum];
+
+  cache->type = type;
+  cache->level = level;
+  cache->nbthreads_sharing = nbthreads_sharing;
+  cache->linesize = cpuid & 0xff;
+  cache->linepart = 0;
+  cache->inclusive = 0; /* old AMD (K8-K10) supposed to have exclusive caches */
+
+  if (level == 1) {
+    cache->ways = (cpuid >> 16) & 0xff;
+    if (cache->ways == 0xff)
+      /* Fully associative */
+      cache->ways = -1;
+  } else {
+    /* L2/L3 associativity is encoded through a lookup table */
+    static const unsigned ways_tab[] = { 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, -1 };
+    unsigned ways = (cpuid >> 12) & 0xf;
+    cache->ways = ways_tab[ways];
+  }
+  cache->size = size;
+  cache->sets = 0;
+
+  hwloc_debug("cache L%u t%u linesize %u ways %d size %luKB\n", cache->level, cache->nbthreads_sharing, cache->linesize, cache->ways, cache->size >> 10);
+}
+
+/* Parse the Extended Topology Enumeration cpuid leaf (0x0b for v1, 0x1f for v2).
+ * A first pass counts the enumeration levels and finds the package shift;
+ * a second pass records, per level type, this PU's thread/core/die id,
+ * keeping ids of unrecognized level types in infos->otherids[]. */
+static void look_exttopoenum(struct procinfo *infos, unsigned leaf, struct cpuiddump *src_cpuiddump)
+{
+  unsigned level, apic_nextshift, apic_number, apic_type, apic_id = 0, apic_shift = 0, id;
+  unsigned threadid __hwloc_attribute_unused = 0; /* shut-up compiler */
+  unsigned eax, ebx, ecx = 0, edx;
+  int apic_packageshift = 0;
+
+  for (level = 0; ; level++) {
+    ecx = level;
+    eax = leaf;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    if (!eax && !ebx)
+      break;
+    apic_packageshift = eax & 0x1f;
+  }
+
+  if (level) {
+    infos->otherids = malloc(level * sizeof(*infos->otherids));
+    if (infos->otherids) {
+      infos->levels = level;
+      for (level = 0; ; level++) {
+	ecx = level;
+	eax = leaf;
+	cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+	if (!eax && !ebx)
+	  break;
+	apic_nextshift = eax & 0x1f;
+	apic_number = ebx & 0xffff;
+	apic_type = (ecx & 0xff00) >> 8;
+	apic_id = edx;
+	id = (apic_id >> apic_shift) & ((1 << (apic_packageshift - apic_shift)) - 1);
+	hwloc_debug("x2APIC %08x %u: nextshift %u num %2u type %u id %2u\n", apic_id, level, apic_nextshift, apic_number, apic_type, id);
+	infos->apicid = apic_id;
+	infos->otherids[level] = UINT_MAX;
+	switch (apic_type) {
+	case 1:
+	  /* SMT level */
+	  threadid = id;
+	  /* apic_number is the actual number of threads per core */
+	  break;
+	case 2:
+	  /* core level */
+	  infos->coreid = id;
+	  /* apic_number is the actual number of threads per module */
+	  break;
+	case 5:
+	  /* die level */
+	  infos->dieid = id;
+	  /* apic_number is the actual number of threads per package */
+	  break;
+	default:
+	  hwloc_debug("x2APIC %u: unknown type %u\n", level, apic_type);
+	  infos->otherids[level] = apic_id >> apic_shift;
+	  break;
+	}
+	apic_shift = apic_nextshift;
+      }
+      infos->apicid = apic_id;
+      infos->packageid = apic_id >> apic_shift;
+      hwloc_debug("x2APIC remainder: %u\n", infos->packageid);
+      hwloc_debug("this is thread %u of core %u\n", threadid, infos->coreid);
+    }
+  }
+}
+
+/* Fetch information from the processor itself thanks to cpuid and store it in
+ * infos for summarize to analyze them globally.
+ * Also updates data->is_knl, data->apicid_set and data->apicid_unique. */
+static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type, struct cpuiddump *src_cpuiddump)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  unsigned eax, ebx, ecx = 0, edx;
+  unsigned cachenum;
+  struct cacheinfo *cache;
+  unsigned regs[4];
+  unsigned legacy_max_log_proc; /* not valid on Intel processors with > 256 threads, or when cpuid 0x80000008 is supported */
+  unsigned legacy_log_proc_id;
+  unsigned _model, _extendedmodel, _family, _extendedfamily;
+
+  infos->present = 1;
+
+  /* Get apicid, legacy_max_log_proc, packageid, legacy_log_proc_id from cpuid 0x01 */
+  eax = 0x01;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  infos->apicid = ebx >> 24;
+  if (edx & (1 << 28))
+    legacy_max_log_proc = 1 << hwloc_flsl(((ebx >> 16) & 0xff) - 1);
+  else
+    legacy_max_log_proc = 1;
+  hwloc_debug("APIC ID 0x%02x legacy_max_log_proc %u\n", infos->apicid, legacy_max_log_proc);
+  infos->packageid = infos->apicid / legacy_max_log_proc;
+  legacy_log_proc_id = infos->apicid % legacy_max_log_proc;
+  hwloc_debug("phys %u legacy thread %u\n", infos->packageid, legacy_log_proc_id);
+
+  /* Get cpu model/family/stepping numbers from same cpuid */
+  _model          = (eax>>4) & 0xf;
+  _extendedmodel  = (eax>>16) & 0xf;
+  _family         = (eax>>8) & 0xf;
+  _extendedfamily = (eax>>20) & 0xff;
+  if ((cpuid_type == intel || cpuid_type == amd || cpuid_type == hygon) && _family == 0xf) {
+    infos->cpufamilynumber = _family + _extendedfamily;
+  } else {
+    infos->cpufamilynumber = _family;
+  }
+  if ((cpuid_type == intel && (_family == 0x6 || _family == 0xf))
+      || ((cpuid_type == amd || cpuid_type == hygon) && _family == 0xf)
+      || (cpuid_type == zhaoxin && (_family == 0x6 || _family == 0x7))) {
+    infos->cpumodelnumber = _model + (_extendedmodel << 4);
+  } else {
+    infos->cpumodelnumber = _model;
+  }
+  infos->cpustepping = eax & 0xf;
+
+  if (cpuid_type == intel && infos->cpufamilynumber == 0x6 &&
+      (infos->cpumodelnumber == 0x57 || infos->cpumodelnumber == 0x85))
+    data->is_knl = 1; /* KNM is the same as KNL */
+
+  /* Get cpu vendor string from cpuid 0x00 */
+  memset(regs, 0, sizeof(regs));
+  regs[0] = 0;
+  cpuid_or_from_dump(&regs[0], &regs[1], &regs[3], &regs[2], src_cpuiddump);
+  memcpy(infos->cpuvendor, regs+1, 4*3);
+  /* infos was calloc'ed, already ends with \0 */
+
+  /* Get cpu model string from cpuid 0x80000002-4 */
+  if (highest_ext_cpuid >= 0x80000004) {
+    memset(regs, 0, sizeof(regs));
+    regs[0] = 0x80000002;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel, regs, 4*4);
+    regs[0] = 0x80000003;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel + 4*4, regs, 4*4);
+    regs[0] = 0x80000004;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel + 4*4*2, regs, 4*4);
+    /* infos was calloc'ed, already ends with \0 */
+  }
+
+  /* Get core/thread information from cpuid 0x80000008
+   * (not supported on Intel)
+   */
+  if (cpuid_type != intel && cpuid_type != zhaoxin && highest_ext_cpuid >= 0x80000008) {
+    unsigned max_nbcores;
+    unsigned max_nbthreads;
+    unsigned coreidsize;
+    unsigned logprocid;
+    eax = 0x80000008;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    coreidsize = (ecx >> 12) & 0xf;
+    hwloc_debug("core ID size: %u\n", coreidsize);
+    if (!coreidsize) {
+      max_nbcores = (ecx & 0xff) + 1;
+    } else
+      max_nbcores = 1 << coreidsize;
+    hwloc_debug("Thus max # of cores: %u\n", max_nbcores);
+    /* Still no multithreaded AMD */
+    max_nbthreads = 1 ;
+    hwloc_debug("and max # of threads: %u\n", max_nbthreads);
+    /* legacy_max_log_proc is deprecated, it can be smaller than max_nbcores,
+     * which is the maximum number of cores that the processor could theoretically support
+     * (see "Multiple Core Calculation" in the AMD CPUID specification).
+     * Recompute packageid/threadid/coreid accordingly.
+     */
+    infos->packageid = infos->apicid / max_nbcores;
+    logprocid = infos->apicid % max_nbcores;
+    infos->threadid = logprocid % max_nbthreads;
+    infos->coreid = logprocid / max_nbthreads;
+    hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+  }
+
+  infos->numcaches = 0;
+  infos->cache = NULL;
+
+  /* Get apicid, nodeid, unitid from cpuid 0x8000001e
+   * and cache information from cpuid 0x8000001d
+   * (AMD topology extension)
+   */
+  if (cpuid_type != intel && cpuid_type != zhaoxin && has_topoext(features)) {
+    unsigned apic_id, node_id, nodes_per_proc;
+
+    /* the code below doesn't want any other cache yet */
+    assert(!infos->numcaches);
+
+    eax = 0x8000001e;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    infos->apicid = apic_id = eax;
+
+    if (infos->cpufamilynumber == 0x16) {
+      /* ecx is reserved */
+      node_id = 0;
+      nodes_per_proc = 1;
+    } else {
+      /* AMD other families or Hygon family 18h */
+      node_id = ecx & 0xff;
+      nodes_per_proc = ((ecx >> 8) & 7) + 1;
+    }
+    infos->nodeid = node_id;
+    if ((infos->cpufamilynumber == 0x15 && nodes_per_proc > 2)
+	|| ((infos->cpufamilynumber == 0x17 || infos->cpufamilynumber == 0x18) && nodes_per_proc > 4)) {
+      hwloc_debug("warning: undefined nodes_per_proc value %u, assuming it means %u\n", nodes_per_proc, nodes_per_proc);
+    }
+
+    if (infos->cpufamilynumber <= 0x16) { /* topoext appeared in 0x15 and compute-units were only used in 0x15 and 0x16 */
+      unsigned unit_id, cores_per_unit;
+      infos->unitid = unit_id = ebx & 0xff;
+      cores_per_unit = ((ebx >> 8) & 0xff) + 1;
+      hwloc_debug("topoext %08x, %u nodes, node %u, %u cores in unit %u\n", apic_id, nodes_per_proc, node_id, cores_per_unit, unit_id);
+      /* coreid and unitid are package-wide (core 0-15 and unit 0-7 on 16-core 2-NUMAnode processor).
+       * The Linux kernel reduces these to NUMA-node-wide (by applying %core_per_node and %unit_per node respectively).
+       * It's not clear if we should do this as well.
+       */
+    } else {
+      unsigned core_id, threads_per_core;
+      infos->coreid = core_id = ebx & 0xff;
+      threads_per_core = ((ebx >> 8) & 0xff) + 1;
+      hwloc_debug("topoext %08x, %u nodes, node %u, %u threads in core %u\n", apic_id, nodes_per_proc, node_id, threads_per_core, core_id);
+    }
+
+    /* first pass: count the caches reported by leaf 0x8000001d */
+    for (cachenum = 0; ; cachenum++) {
+      eax = 0x8000001d;
+      ecx = cachenum;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      if ((eax & 0x1f) == 0)
+	break;
+      infos->numcaches++;
+    }
+
+    cache = infos->cache = malloc(infos->numcaches * sizeof(*infos->cache));
+    if (cache) {
+     for (cachenum = 0; ; cachenum++) {
+      unsigned long linesize, linepart, ways, sets;
+      eax = 0x8000001d;
+      ecx = cachenum;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+      if ((eax & 0x1f) == 0)
+	break;
+      switch (eax & 0x1f) {
+      case 1: cache->type = HWLOC_OBJ_CACHE_DATA; break;
+      case 2: cache->type = HWLOC_OBJ_CACHE_INSTRUCTION; break;
+      default: cache->type = HWLOC_OBJ_CACHE_UNIFIED; break;
+      }
+
+      cache->level = (eax >> 5) & 0x7;
+      /* Note: actually number of cores */
+      cache->nbthreads_sharing = ((eax >> 14) &  0xfff) + 1;
+
+      cache->linesize = linesize = (ebx & 0xfff) + 1;
+      cache->linepart = linepart = ((ebx >> 12) & 0x3ff) + 1;
+      ways = ((ebx >> 22) & 0x3ff) + 1;
+
+      if (eax & (1 << 9))
+	/* Fully associative */
+	cache->ways = -1;
+      else
+	cache->ways = ways;
+      cache->sets = sets = ecx + 1;
+      cache->size = linesize * linepart * ways * sets;
+      cache->inclusive = edx & 0x2;
+
+      hwloc_debug("cache %u L%u%c t%u linesize %lu linepart %lu ways %lu sets %lu, size %luKB\n",
+		  cachenum, cache->level,
+		  cache->type == HWLOC_OBJ_CACHE_DATA ? 'd' : cache->type == HWLOC_OBJ_CACHE_INSTRUCTION ? 'i' : 'u',
+		  cache->nbthreads_sharing, linesize, linepart, ways, sets, cache->size >> 10);
+
+      cache++;
+     }
+    } else {
+     infos->numcaches = 0;
+    }
+  } else {
+    /* If there's no topoext,
+     * get cache information from cpuid 0x80000005 and 0x80000006
+     * (not supported on Intel)
+     */
+    if (cpuid_type != intel && cpuid_type != zhaoxin && highest_ext_cpuid >= 0x80000005) {
+      eax = 0x80000005;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      fill_amd_cache(infos, 1, HWLOC_OBJ_CACHE_DATA, 1, ecx); /* private L1d */
+      fill_amd_cache(infos, 1, HWLOC_OBJ_CACHE_INSTRUCTION, 1, edx); /* private L1i */
+    }
+    if (cpuid_type != intel && cpuid_type != zhaoxin && highest_ext_cpuid >= 0x80000006) {
+      eax = 0x80000006;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      if (ecx & 0xf000)
+	/* This is actually supported on Intel but LinePerTag isn't returned in bits 8-11.
+	 * Could be useful if some Intels (at least before Core micro-architecture)
+	 * support this leaf without leaf 0x4.
+	 */
+	fill_amd_cache(infos, 2, HWLOC_OBJ_CACHE_UNIFIED, 1, ecx); /* private L2u */
+      if (edx & 0xf000)
+	fill_amd_cache(infos, 3, HWLOC_OBJ_CACHE_UNIFIED, legacy_max_log_proc, edx); /* package-wide L3u */
+    }
+  }
+
+  /* Get thread/core + cache information from cpuid 0x04
+   * (not supported on AMD)
+   */
+  if ((cpuid_type != amd && cpuid_type != hygon) && highest_cpuid >= 0x04) {
+    unsigned max_nbcores;
+    unsigned max_nbthreads;
+    unsigned level;
+    struct cacheinfo *tmpcaches;
+    unsigned oldnumcaches = infos->numcaches; /* in case we got caches above */
+
+    for (cachenum = 0; ; cachenum++) {
+      eax = 0x04;
+      ecx = cachenum;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+      hwloc_debug("cache %u type %u\n", cachenum, eax & 0x1f);
+      if ((eax & 0x1f) == 0)
+	break;
+      level = (eax >> 5) & 0x7;
+      if (data->is_knl && level == 3)
+	/* KNL reports wrong L3 information (size always 0, cpuset always the entire machine), ignore it */
+	break;
+      infos->numcaches++;
+
+      if (!cachenum) {
+	/* by the way, get thread/core information from the first cache */
+	max_nbcores = ((eax >> 26) & 0x3f) + 1;
+	max_nbthreads = legacy_max_log_proc / max_nbcores;
+	hwloc_debug("thus %u threads\n", max_nbthreads);
+	infos->threadid = legacy_log_proc_id % max_nbthreads;
+	infos->coreid = legacy_log_proc_id / max_nbthreads;
+	hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+      }
+    }
+
+    tmpcaches = realloc(infos->cache, infos->numcaches * sizeof(*infos->cache));
+    if (!tmpcaches) {
+     infos->numcaches = oldnumcaches;
+    } else {
+     infos->cache = tmpcaches;
+     cache = &infos->cache[oldnumcaches];
+
+     for (cachenum = 0; ; cachenum++) {
+      unsigned long linesize, linepart, ways, sets;
+      eax = 0x04;
+      ecx = cachenum;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+      if ((eax & 0x1f) == 0)
+	break;
+      level = (eax >> 5) & 0x7;
+      if (data->is_knl && level == 3)
+	/* KNL reports wrong L3 information (size always 0, cpuset always the entire machine), ignore it */
+	break;
+      switch (eax & 0x1f) {
+      case 1: cache->type = HWLOC_OBJ_CACHE_DATA; break;
+      case 2: cache->type = HWLOC_OBJ_CACHE_INSTRUCTION; break;
+      default: cache->type = HWLOC_OBJ_CACHE_UNIFIED; break;
+      }
+
+      cache->level = level;
+      cache->nbthreads_sharing = ((eax >> 14) & 0xfff) + 1;
+
+      cache->linesize = linesize = (ebx & 0xfff) + 1;
+      cache->linepart = linepart = ((ebx >> 12) & 0x3ff) + 1;
+      ways = ((ebx >> 22) & 0x3ff) + 1;
+      if (eax & (1 << 9))
+        /* Fully associative */
+        cache->ways = -1;
+      else
+        cache->ways = ways;
+      cache->sets = sets = ecx + 1;
+      cache->size = linesize * linepart * ways * sets;
+      cache->inclusive = edx & 0x2;
+
+      hwloc_debug("cache %u L%u%c t%u linesize %lu linepart %lu ways %lu sets %lu, size %luKB\n",
+		  cachenum, cache->level,
+		  cache->type == HWLOC_OBJ_CACHE_DATA ? 'd' : cache->type == HWLOC_OBJ_CACHE_INSTRUCTION ? 'i' : 'u',
+		  cache->nbthreads_sharing, linesize, linepart, ways, sets, cache->size >> 10);
+      cache++;
+     }
+    }
+  }
+
+  if ((cpuid_type == intel) && highest_cpuid >= 0x1f) {
+    /* Get package/die/module/tile/core/thread information from cpuid 0x1f
+     * (Intel v2 Extended Topology Enumeration)
+     */
+    look_exttopoenum(infos, 0x1f, src_cpuiddump);
+
+  } else if ((cpuid_type == intel || cpuid_type == zhaoxin) && highest_cpuid >= 0x0b && has_x2apic(features)) {
+    /* Get package/core/thread information from cpuid 0x0b
+     * (Intel v1 Extended Topology Enumeration)
+     */
+    look_exttopoenum(infos, 0x0b, src_cpuiddump);
+  }
+
+  /* Now that we have all info, compute cacheids and apply quirks */
+  for (cachenum = 0; cachenum < infos->numcaches; cachenum++) {
+    cache = &infos->cache[cachenum];
+
+    /* default cacheid value */
+    cache->cacheid = infos->apicid / cache->nbthreads_sharing;
+
+    if (cpuid_type == amd) {
+      /* AMD quirks */
+      if (infos->cpufamilynumber == 0x17
+	  && cache->level == 3 && cache->nbthreads_sharing == 6) {
+	/* AMD family 0x17 always shares L3 between 8 APIC ids,
+	 * even when only 6 APIC ids are enabled and reported in nbthreads_sharing
+	 * (on 24-core CPUs).
+	 */
+	cache->cacheid = infos->apicid / 8;
+
+      } else if (infos->cpufamilynumber== 0x10 && infos->cpumodelnumber == 0x9
+	  && cache->level == 3
+	  && (cache->ways == -1 || (cache->ways % 2 == 0)) && cache->nbthreads_sharing >= 8) {
+	/* Fix AMD family 0x10 model 0x9 (Magny-Cours) with 8 or 12 cores.
+	 * The L3 (and its associativity) is actually split into two halves).
+	 */
+	if (cache->nbthreads_sharing == 16)
+	  cache->nbthreads_sharing = 12; /* nbthreads_sharing is a power of 2 but the processor actually has 8 or 12 cores */
+	cache->nbthreads_sharing /= 2;
+	cache->size /= 2;
+	if (cache->ways != -1)
+	  cache->ways /= 2;
+	/* AMD Magny-Cours 12-cores processor reserve APIC ids as AAAAAABBBBBB....
+	 * among first L3 (A), second L3 (B), and non-existing cores (.).
+	 * On multi-socket servers, L3 in non-first sockets may have APIC id ranges
+	 * such as [16-21] that are not aligned on multiple of nbthreads_sharing (6).
+	 * That means, we can't just compare apicid/nbthreads_sharing to identify siblings.
+	 */
+	cache->cacheid = (infos->apicid % legacy_max_log_proc) / cache->nbthreads_sharing /* cacheid within the package */
+	  + 2 * (infos->apicid / legacy_max_log_proc); /* add 2 caches per previous package */
+
+      } else if (infos->cpufamilynumber == 0x15
+		 && (infos->cpumodelnumber == 0x1 /* Bulldozer */ || infos->cpumodelnumber == 0x2 /* Piledriver */)
+		 && cache->level == 3 && cache->nbthreads_sharing == 6) {
+	/* AMD Bulldozer and Piledriver 12-core processors have same APIC ids as Magny-Cours below,
+	 * but we can't merge the checks because the original nbthreads_sharing must be exactly 6 here.
+	 */
+	cache->cacheid = (infos->apicid % legacy_max_log_proc) / cache->nbthreads_sharing /* cacheid within the package */
+	  + 2 * (infos->apicid / legacy_max_log_proc); /* add 2 cache per previous package */
+      }
+    } else if (cpuid_type == hygon) {
+      if (infos->cpufamilynumber == 0x18
+	  && cache->level == 3 && cache->nbthreads_sharing == 6) {
+        /* Hygon family 0x18 always shares L3 between 8 APIC ids,
+         * even when only 6 APIC ids are enabled and reported in nbthreads_sharing
+         * (on 24-core CPUs).
+         */
+        cache->cacheid = infos->apicid / 8;
+      }
+    }
+  }
+
+  /* remember whether APIC ids are globally unique; duplicates mean the
+   * discovery could not bind to each PU and the result is unusable */
+  if (hwloc_bitmap_isset(data->apicid_set, infos->apicid))
+    data->apicid_unique = 0;
+  else
+    hwloc_bitmap_set(data->apicid_set, infos->apicid);
+}
+
+/* Attach CPUVendor/CPUFamilyNumber/CPUModelNumber/CPUModel/CPUStepping
+ * info attributes from `info' to object `obj'.
+ * If `replace' is set, pass it down to hwloc__add_info_nodup() so that
+ * existing same-name attributes are replaced instead of kept.
+ */
+static void
+hwloc_x86_add_cpuinfos(hwloc_obj_t obj, struct procinfo *info, int replace)
+{
+  char number[12]; /* large enough for a decimal 32-bit unsigned */
+  if (info->cpuvendor[0])
+    hwloc__add_info_nodup(&obj->infos, &obj->infos_count, "CPUVendor", info->cpuvendor, replace);
+  snprintf(number, sizeof(number), "%u", info->cpufamilynumber);
+  hwloc__add_info_nodup(&obj->infos, &obj->infos_count, "CPUFamilyNumber", number, replace);
+  snprintf(number, sizeof(number), "%u", info->cpumodelnumber);
+  hwloc__add_info_nodup(&obj->infos, &obj->infos_count, "CPUModelNumber", number, replace);
+  if (info->cpumodel[0]) {
+    const char *c = info->cpumodel;
+    /* skip leading spaces in the cpuid-provided model string */
+    while (*c == ' ')
+      c++;
+    hwloc__add_info_nodup(&obj->infos, &obj->infos_count, "CPUModel", c, replace);
+  }
+  snprintf(number, sizeof(number), "%u", info->cpustepping);
+  hwloc__add_info_nodup(&obj->infos, &obj->infos_count, "CPUStepping", number, replace);
+}
+
+/* Analyse information stored in infos, and build/annotate topology levels accordingly */
+static void summarize(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscovery)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  unsigned nbprocs = data->nbprocs;
+  hwloc_bitmap_t complete_cpuset = hwloc_bitmap_alloc();
+  unsigned i, j, l, level;
+  int one = -1; /* index of some present processor, -1 if none is present */
+  hwloc_bitmap_t remaining_cpuset;
+  int gotnuma = 0; /* incremented per NUMA node added, sets the numa support flag below */
+
+  for (i = 0; i < nbprocs; i++)
+    if (infos[i].present) {
+      hwloc_bitmap_set(complete_cpuset, i);
+      one = i;
+    }
+
+  /* nothing was discovered at all, just give up */
+  if (one == -1) {
+    hwloc_bitmap_free(complete_cpuset);
+    return;
+  }
+
+  remaining_cpuset = hwloc_bitmap_alloc();
+
+  /* Ideally, when fulldiscovery=0, we could add any object that doesn't exist yet.
+   * But what if the x86 and the native backends disagree because one is buggy? Which one to trust?
+   * We only add missing caches, and annotate other existing objects for now.
+   */
+
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_PACKAGE)) {
+    /* Look for packages */
+    hwloc_obj_t package;
+
+    hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+    while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+      if (fulldiscovery) {
+	unsigned packageid = infos[i].packageid;
+	hwloc_bitmap_t package_cpuset = hwloc_bitmap_alloc();
+
+	/* gather all PUs with the same packageid into one cpuset */
+	for (j = i; j < nbprocs; j++) {
+	  if (infos[j].packageid == packageid) {
+	    hwloc_bitmap_set(package_cpuset, j);
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	  }
+	}
+	package = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PACKAGE, packageid);
+	package->cpuset = package_cpuset;
+
+	hwloc_x86_add_cpuinfos(package, &infos[i], 0);
+
+	hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
+				packageid, package_cpuset);
+	hwloc_insert_object_by_cpuset(topology, package);
+
+      } else {
+	/* Annotate packages previously-existing packages */
+	hwloc_bitmap_t set = hwloc_bitmap_alloc();
+	hwloc_bitmap_set(set, i);
+	package = hwloc_get_next_obj_covering_cpuset_by_type(topology, set, HWLOC_OBJ_PACKAGE, NULL);
+	hwloc_bitmap_free(set);
+	if (package) {
+	  /* Found package above that PU, annotate if no such attribute yet */
+	  hwloc_x86_add_cpuinfos(package, &infos[i], 1);
+	  hwloc_bitmap_andnot(remaining_cpuset, remaining_cpuset, package->cpuset);
+	} else {
+	  /* No package, annotate the root object */
+	  hwloc_x86_add_cpuinfos(hwloc_get_root_obj(topology), &infos[i], 1);
+	  break;
+	}
+      }
+    }
+  }
+
+  /* Look for Numa nodes inside packages (cannot be filtered-out) */
+  if (fulldiscovery && getenv("HWLOC_X86_TOPOEXT_NUMANODES")) {
+    hwloc_bitmap_t node_cpuset;
+    hwloc_obj_t node;
+
+    /* FIXME: if there's memory inside the root object, divide it into NUMA nodes? */
+
+    hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+    while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+      unsigned packageid = infos[i].packageid;
+      unsigned nodeid = infos[i].nodeid;
+
+      /* -1 means no NUMA information was found for this PU */
+      if (nodeid == (unsigned)-1) {
+        hwloc_bitmap_clr(remaining_cpuset, i);
+	continue;
+      }
+
+      node_cpuset = hwloc_bitmap_alloc();
+      for (j = i; j < nbprocs; j++) {
+	if (infos[j].nodeid == (unsigned) -1) {
+	  hwloc_bitmap_clr(remaining_cpuset, j);
+	  continue;
+	}
+
+        if (infos[j].packageid == packageid && infos[j].nodeid == nodeid) {
+          hwloc_bitmap_set(node_cpuset, j);
+          hwloc_bitmap_clr(remaining_cpuset, j);
+        }
+      }
+      node = hwloc_alloc_setup_object(topology, HWLOC_OBJ_NUMANODE, nodeid);
+      node->cpuset = node_cpuset;
+      node->nodeset = hwloc_bitmap_alloc();
+      hwloc_bitmap_set(node->nodeset, nodeid);
+      hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
+          nodeid, node_cpuset);
+      hwloc_insert_object_by_cpuset(topology, node);
+      gotnuma++;
+    }
+  }
+
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
+    if (fulldiscovery) {
+      char *env;
+      int dont_merge;
+      hwloc_bitmap_t unit_cpuset, die_cpuset;
+      hwloc_obj_t unit, die;
+
+      /* Look for Compute units inside packages */
+      hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+      while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+	unsigned packageid = infos[i].packageid;
+	unsigned unitid = infos[i].unitid;
+
+	if (unitid == (unsigned)-1) {
+	  hwloc_bitmap_clr(remaining_cpuset, i);
+	  continue;
+	}
+
+	unit_cpuset = hwloc_bitmap_alloc();
+	for (j = i; j < nbprocs; j++) {
+	  if (infos[j].unitid == (unsigned) -1) {
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	    continue;
+	  }
+
+	  if (infos[j].packageid == packageid && infos[j].unitid == unitid) {
+	    hwloc_bitmap_set(unit_cpuset, j);
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	  }
+	}
+	unit = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, unitid);
+	unit->cpuset = unit_cpuset;
+	unit->subtype = strdup("ComputeUnit");
+	unit->attr->group.kind = HWLOC_GROUP_KIND_AMD_COMPUTE_UNIT;
+	hwloc_debug_1arg_bitmap("os unit %u has cpuset %s\n",
+				unitid, unit_cpuset);
+	hwloc_insert_object_by_cpuset(topology, unit);
+      }
+
+      /* Look for Dies inside packages */
+      env = getenv("HWLOC_DONT_MERGE_DIE_GROUPS");
+      dont_merge = env && atoi(env);
+      hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+      while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+	unsigned packageid = infos[i].packageid;
+	unsigned dieid = infos[i].dieid;
+
+	if (dieid == (unsigned)-1) {
+	  hwloc_bitmap_clr(remaining_cpuset, i);
+	  continue;
+	}
+
+	die_cpuset = hwloc_bitmap_alloc();
+	for (j = i; j < nbprocs; j++) {
+	  if (infos[j].dieid == (unsigned) -1) {
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	    continue;
+	  }
+
+	  if (infos[j].packageid == packageid && infos[j].dieid == dieid) {
+	    hwloc_bitmap_set(die_cpuset, j);
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	  }
+	}
+	die = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, dieid);
+	die->cpuset = die_cpuset;
+	die->subtype = strdup("Die");
+	die->attr->group.kind = HWLOC_GROUP_KIND_INTEL_DIE;
+	die->attr->group.dont_merge = dont_merge;
+	hwloc_debug_1arg_bitmap("os die %u has cpuset %s\n",
+				dieid, die_cpuset);
+	hwloc_insert_object_by_cpuset(topology, die);
+      }
+
+      /* Look for unknown objects */
+      if (infos[one].otherids) {
+	/* downward loop on an unsigned index: when level wraps below 0
+	 * the `level <= levels-1' condition fails and the loop stops */
+	for (level = infos[one].levels-1; level <= infos[one].levels-1; level--) {
+	  if (infos[one].otherids[level] != UINT_MAX) {
+	    hwloc_bitmap_t unknown_cpuset;
+	    hwloc_obj_t unknown_obj;
+
+	    hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+	    while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+	      unsigned unknownid = infos[i].otherids[level];
+
+	      unknown_cpuset = hwloc_bitmap_alloc();
+	      for (j = i; j < nbprocs; j++) {
+		if (infos[j].otherids[level] == unknownid) {
+		  hwloc_bitmap_set(unknown_cpuset, j);
+		  hwloc_bitmap_clr(remaining_cpuset, j);
+		}
+	      }
+	      unknown_obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, unknownid);
+	      unknown_obj->cpuset = unknown_cpuset;
+	      unknown_obj->attr->group.kind = HWLOC_GROUP_KIND_INTEL_EXTTOPOENUM_UNKNOWN;
+	      unknown_obj->attr->group.subkind = level;
+	      hwloc_debug_2args_bitmap("os unknown%u %u has cpuset %s\n",
+				       level, unknownid, unknown_cpuset);
+	      hwloc_insert_object_by_cpuset(topology, unknown_obj);
+	    }
+	  }
+	}
+      }
+    }
+  }
+
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
+    /* Look for cores */
+    if (fulldiscovery) {
+      hwloc_bitmap_t core_cpuset;
+      hwloc_obj_t core;
+
+      hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+      while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+	unsigned packageid = infos[i].packageid;
+	unsigned nodeid = infos[i].nodeid;
+	unsigned coreid = infos[i].coreid;
+
+	if (coreid == (unsigned) -1) {
+	  hwloc_bitmap_clr(remaining_cpuset, i);
+	  continue;
+	}
+
+	core_cpuset = hwloc_bitmap_alloc();
+	for (j = i; j < nbprocs; j++) {
+	  if (infos[j].coreid == (unsigned) -1) {
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	    continue;
+	  }
+
+	  /* coreids are only unique within a package+node, compare all three */
+	  if (infos[j].packageid == packageid && infos[j].nodeid == nodeid && infos[j].coreid == coreid) {
+	    hwloc_bitmap_set(core_cpuset, j);
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	  }
+	}
+	core = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, coreid);
+	core->cpuset = core_cpuset;
+	hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
+				coreid, core_cpuset);
+	hwloc_insert_object_by_cpuset(topology, core);
+      }
+    }
+  }
+
+  /* Look for PUs (cannot be filtered-out) */
+  if (fulldiscovery) {
+    hwloc_debug("%s", "\n\n * CPU cpusets *\n\n");
+    for (i=0; i<nbprocs; i++)
+      if(infos[i].present) { /* Only add present PU. We don't know if others actually exist */
+       struct hwloc_obj *obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, i);
+       obj->cpuset = hwloc_bitmap_alloc();
+       hwloc_bitmap_only(obj->cpuset, i);
+       hwloc_debug_1arg_bitmap("PU %u has cpuset %s\n", i, obj->cpuset);
+       hwloc_insert_object_by_cpuset(topology, obj);
+     }
+  }
+
+  /* Look for caches */
+  /* First find max level */
+  level = 0;
+  for (i = 0; i < nbprocs; i++)
+    for (j = 0; j < infos[i].numcaches; j++)
+      if (infos[i].cache[j].level > level)
+        level = infos[i].cache[j].level;
+  /* then insert caches from the deepest level down to L1 */
+  while (level > 0) {
+    hwloc_obj_cache_type_t type;
+    HWLOC_BUILD_ASSERT(HWLOC_OBJ_CACHE_DATA == HWLOC_OBJ_CACHE_UNIFIED+1);
+    HWLOC_BUILD_ASSERT(HWLOC_OBJ_CACHE_INSTRUCTION == HWLOC_OBJ_CACHE_DATA+1);
+    for (type = HWLOC_OBJ_CACHE_UNIFIED; type <= HWLOC_OBJ_CACHE_INSTRUCTION; type++) {
+      /* Look for caches of that type at level level */
+      hwloc_obj_type_t otype;
+      hwloc_obj_t cache;
+
+      otype = hwloc_cache_type_by_depth_type(level, type);
+      if (otype == HWLOC_OBJ_TYPE_NONE)
+	continue;
+      if (!hwloc_filter_check_keep_object_type(topology, otype))
+	continue;
+
+      hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+      while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+	hwloc_bitmap_t puset;
+
+	for (l = 0; l < infos[i].numcaches; l++) {
+	  if (infos[i].cache[l].level == level && infos[i].cache[l].type == type)
+	    break;
+	}
+	if (l == infos[i].numcaches) {
+	  /* no cache Llevel of that type in i */
+	  hwloc_bitmap_clr(remaining_cpuset, i);
+	  continue;
+	}
+
+	puset = hwloc_bitmap_alloc();
+	hwloc_bitmap_set(puset, i);
+	cache = hwloc_get_next_obj_covering_cpuset_by_type(topology, puset, otype, NULL);
+	hwloc_bitmap_free(puset);
+
+	if (cache) {
+	  /* Found cache above that PU, annotate if no such attribute yet */
+	  if (!hwloc_obj_get_info_by_name(cache, "Inclusive"))
+	    hwloc_obj_add_info(cache, "Inclusive", infos[i].cache[l].inclusive ? "1" : "0");
+	  hwloc_bitmap_andnot(remaining_cpuset, remaining_cpuset, cache->cpuset);
+	} else {
+	  /* Add the missing cache */
+	  hwloc_bitmap_t cache_cpuset;
+	  unsigned packageid = infos[i].packageid;
+	  unsigned cacheid = infos[i].cache[l].cacheid;
+	  /* Now look for others sharing it */
+	  cache_cpuset = hwloc_bitmap_alloc();
+	  for (j = i; j < nbprocs; j++) {
+	    unsigned l2;
+	    for (l2 = 0; l2 < infos[j].numcaches; l2++) {
+	      if (infos[j].cache[l2].level == level && infos[j].cache[l2].type == type)
+		break;
+	    }
+	    if (l2 == infos[j].numcaches) {
+	      /* no cache Llevel of that type in j */
+	      hwloc_bitmap_clr(remaining_cpuset, j);
+	      continue;
+	    }
+	    if (infos[j].packageid == packageid && infos[j].cache[l2].cacheid == cacheid) {
+	      hwloc_bitmap_set(cache_cpuset, j);
+	      hwloc_bitmap_clr(remaining_cpuset, j);
+	    }
+	  }
+	  cache = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX);
+	  cache->attr->cache.depth = level;
+	  cache->attr->cache.size = infos[i].cache[l].size;
+	  cache->attr->cache.linesize = infos[i].cache[l].linesize;
+	  cache->attr->cache.associativity = infos[i].cache[l].ways;
+	  cache->attr->cache.type = infos[i].cache[l].type;
+	  cache->cpuset = cache_cpuset;
+	  hwloc_obj_add_info(cache, "Inclusive", infos[i].cache[l].inclusive ? "1" : "0");
+	  hwloc_debug_2args_bitmap("os L%u cache %u has cpuset %s\n",
+				   level, cacheid, cache_cpuset);
+	  hwloc_insert_object_by_cpuset(topology, cache);
+	}
+      }
+    }
+    level--;
+  }
+
+  /* FIXME: if KNL and L2 disabled, add tiles instead of L2 */
+
+  hwloc_bitmap_free(remaining_cpuset);
+  hwloc_bitmap_free(complete_cpuset);
+
+  if (gotnuma)
+    topology->support.discovery->numa = 1;
+}
+
+/* Gather cpuid information for every processor: either read a per-PU cpuid
+ * dump, or bind the current thread/process to each PU in turn and run cpuid,
+ * then restore the original binding.  Calls summarize() afterwards unless
+ * duplicate APIC ids were detected during discovery (data->apicid_unique
+ * cleared), in which case only partial summarizing is done.
+ * Returns 0 on success, -1 if the original binding could not be saved.
+ */
+static int
+look_procs(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscovery,
+	   unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type,
+	   int (*get_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags),
+	   int (*set_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags))
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  struct hwloc_topology *topology = backend->topology;
+  unsigned nbprocs = data->nbprocs;
+  hwloc_bitmap_t orig_cpuset = NULL;
+  hwloc_bitmap_t set = NULL;
+  unsigned i;
+
+  if (!data->src_cpuiddump_path) {
+    /* save the original binding so it can be restored at the end */
+    orig_cpuset = hwloc_bitmap_alloc();
+    if (get_cpubind(topology, orig_cpuset, HWLOC_CPUBIND_STRICT)) {
+      hwloc_bitmap_free(orig_cpuset);
+      return -1;
+    }
+    set = hwloc_bitmap_alloc();
+  }
+
+  for (i = 0; i < nbprocs; i++) {
+    struct cpuiddump *src_cpuiddump = NULL;
+    if (data->src_cpuiddump_path) {
+      src_cpuiddump = cpuiddump_read(data->src_cpuiddump_path, i);
+      if (!src_cpuiddump)
+	continue;
+    } else {
+      hwloc_bitmap_only(set, i);
+      hwloc_debug("binding to CPU%u\n", i);
+      if (set_cpubind(topology, set, HWLOC_CPUBIND_STRICT)) {
+	hwloc_debug("could not bind to CPU%u: %s\n", i, strerror(errno));
+	continue;
+      }
+    }
+
+    look_proc(backend, &infos[i], highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
+
+    if (data->src_cpuiddump_path) {
+      cpuiddump_free(src_cpuiddump);
+    }
+  }
+
+  if (!data->src_cpuiddump_path) {
+    /* restore the initial binding (non-strict) */
+    set_cpubind(topology, orig_cpuset, 0);
+    hwloc_bitmap_free(set);
+    hwloc_bitmap_free(orig_cpuset);
+  }
+
+  if (!data->apicid_unique)
+    fulldiscovery = 0;
+  else
+    summarize(backend, infos, fulldiscovery);
+  return 0;
+}
+
+/* On FreeBSD with cpuset_setid() support, discovery temporarily switches the
+ * process to the root cpuset so all CPUs can be bound to; elsewhere these
+ * save/restore hooks are no-ops.  Nothing is done when reading a cpuid dump. */
+#if defined HWLOC_FREEBSD_SYS && defined HAVE_CPUSET_SETID
+#include <sys/param.h>
+#include <sys/cpuset.h>
+typedef cpusetid_t hwloc_x86_os_state_t;
+static void hwloc_x86_os_state_save(hwloc_x86_os_state_t *state, struct cpuiddump *src_cpuiddump)
+{
+  if (!src_cpuiddump) {
+    /* temporary make all cpus available during discovery */
+    cpuset_getid(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, state);
+    cpuset_setid(CPU_WHICH_PID, -1, 0);
+  }
+}
+static void hwloc_x86_os_state_restore(hwloc_x86_os_state_t *state, struct cpuiddump *src_cpuiddump)
+{
+  if (!src_cpuiddump) {
+    /* restore initial cpuset */
+    cpuset_setid(CPU_WHICH_PID, -1, *state);
+  }
+}
+#else /* !defined HWLOC_FREEBSD_SYS || !defined HAVE_CPUSET_SETID */
+typedef void * hwloc_x86_os_state_t;
+static void hwloc_x86_os_state_save(hwloc_x86_os_state_t *state __hwloc_attribute_unused, struct cpuiddump *src_cpuiddump __hwloc_attribute_unused) { }
+static void hwloc_x86_os_state_restore(hwloc_x86_os_state_t *state __hwloc_attribute_unused, struct cpuiddump *src_cpuiddump __hwloc_attribute_unused) { }
+#endif /* !defined HWLOC_FREEBSD_SYS || !defined HAVE_CPUSET_SETID */
+
+/* CPUID leaf 0 vendor strings, packed as the little-endian 4-character
+ * register constants they appear as in EBX/EDX/ECX (compared in
+ * hwloc_look_x86() below). */
+/* GenuineIntel */
+#define INTEL_EBX ('G' | ('e'<<8) | ('n'<<16) | ('u'<<24))
+#define INTEL_EDX ('i' | ('n'<<8) | ('e'<<16) | ('I'<<24))
+#define INTEL_ECX ('n' | ('t'<<8) | ('e'<<16) | ('l'<<24))
+
+/* AuthenticAMD */
+#define AMD_EBX ('A' | ('u'<<8) | ('t'<<16) | ('h'<<24))
+#define AMD_EDX ('e' | ('n'<<8) | ('t'<<16) | ('i'<<24))
+#define AMD_ECX ('c' | ('A'<<8) | ('M'<<16) | ('D'<<24))
+
+/* HYGON "HygonGenuine" */
+#define HYGON_EBX ('H' | ('y'<<8) | ('g'<<16) | ('o'<<24))
+#define HYGON_EDX ('n' | ('G'<<8) | ('e'<<16) | ('n'<<24))
+#define HYGON_ECX ('u' | ('i'<<8) | ('n'<<16) | ('e'<<24))
+
+/* (Zhaoxin) CentaurHauls */
+#define ZX_EBX ('C' | ('e'<<8) | ('n'<<16) | ('t'<<24))
+#define ZX_EDX ('a' | ('u'<<8) | ('r'<<16) | ('H'<<24))
+#define ZX_ECX ('a' | ('u'<<8) | ('l'<<16) | ('s'<<24))
+/* (Zhaoxin) Shanghai */
+#define SH_EBX (' ' | (' '<<8) | ('S'<<16) | ('h'<<24))
+#define SH_EDX ('a' | ('n'<<8) | ('g'<<16) | ('h'<<24))
+#define SH_ECX ('a' | ('i'<<8) | (' '<<16) | (' '<<24))
+
+/* fake cpubind for when nbprocs=1 and no binding support:
+ * with a single PU there is nothing to bind to, so both hooks
+ * just report success without doing anything. */
+static int fake_get_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
+			    hwloc_cpuset_t set __hwloc_attribute_unused,
+			    int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+static int fake_set_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
+			    hwloc_const_cpuset_t set __hwloc_attribute_unused,
+			    int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+
+/* Main x86 discovery: select binding hooks (or a cpuid dump), identify the
+ * CPUID vendor from leaf 0, gather feature flags, then walk all processors
+ * via look_procs().  Falls back to probing only the current processor when
+ * nbprocs==1.  Returns 0 on success, -1 on error. */
+static
+int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  unsigned nbprocs = data->nbprocs;
+  unsigned eax, ebx, ecx = 0, edx;
+  unsigned i;
+  unsigned highest_cpuid;
+  unsigned highest_ext_cpuid;
+  /* This stores cpuid features with the same indexing as Linux */
+  unsigned features[10] = { 0 };
+  struct procinfo *infos = NULL;
+  enum cpuid_type cpuid_type = unknown;
+  hwloc_x86_os_state_t os_state;
+  struct hwloc_binding_hooks hooks;
+  struct hwloc_topology_support support;
+  struct hwloc_topology_membind_support memsupport __hwloc_attribute_unused;
+  int (*get_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags) = NULL;
+  int (*set_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags) = NULL;
+  struct cpuiddump *src_cpuiddump = NULL;
+  int ret = -1;
+
+  if (data->src_cpuiddump_path) {
+    /* just read cpuid from the dump */
+    src_cpuiddump = cpuiddump_read(data->src_cpuiddump_path, 0);
+    if (!src_cpuiddump)
+      goto out;
+
+  } else {
+    /* otherwise check if binding works */
+    memset(&hooks, 0, sizeof(hooks));
+    support.membind = &memsupport;
+    hwloc_set_native_binding_hooks(&hooks, &support);
+    /* prefer per-thread binding over per-process binding */
+    if (hooks.get_thisthread_cpubind && hooks.set_thisthread_cpubind) {
+      get_cpubind = hooks.get_thisthread_cpubind;
+      set_cpubind = hooks.set_thisthread_cpubind;
+    } else if (hooks.get_thisproc_cpubind && hooks.set_thisproc_cpubind) {
+      /* FIXME: if called by a multithreaded program, we will restore the original process binding
+       * for each thread instead of their own original thread binding.
+       * See issue #158.
+       */
+      get_cpubind = hooks.get_thisproc_cpubind;
+      set_cpubind = hooks.set_thisproc_cpubind;
+    } else {
+      /* we need binding support if there are multiple PUs */
+      if (nbprocs > 1)
+	goto out;
+      get_cpubind = fake_get_cpubind;
+      set_cpubind = fake_set_cpubind;
+    }
+  }
+
+  if (!src_cpuiddump && !hwloc_have_x86_cpuid())
+    goto out;
+
+  infos = calloc(nbprocs, sizeof(struct procinfo));
+  if (NULL == infos)
+    goto out;
+  /* initialize all ids to "unknown" */
+  for (i = 0; i < nbprocs; i++) {
+    infos[i].nodeid = (unsigned) -1;
+    infos[i].packageid = (unsigned) -1;
+    infos[i].dieid = (unsigned) -1;
+    infos[i].unitid = (unsigned) -1;
+    infos[i].coreid = (unsigned) -1;
+    infos[i].threadid = (unsigned) -1;
+  }
+
+  /* leaf 0: highest standard leaf, and vendor string in ebx/edx/ecx */
+  eax = 0x00;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  highest_cpuid = eax;
+  if (ebx == INTEL_EBX && ecx == INTEL_ECX && edx == INTEL_EDX)
+    cpuid_type = intel;
+  else if (ebx == AMD_EBX && ecx == AMD_ECX && edx == AMD_EDX)
+    cpuid_type = amd;
+  else if ((ebx == ZX_EBX && ecx == ZX_ECX && edx == ZX_EDX)
+	   || (ebx == SH_EBX && ecx == SH_ECX && edx == SH_EDX))
+    cpuid_type = zhaoxin;
+  else if (ebx == HYGON_EBX && ecx == HYGON_ECX && edx == HYGON_EDX)
+    cpuid_type = hygon;
+
+  hwloc_debug("highest cpuid %x, cpuid type %u\n", highest_cpuid, cpuid_type);
+  if (highest_cpuid < 0x01) {
+      goto out_with_infos;
+  }
+
+  /* leaf 1: basic feature bits */
+  eax = 0x01;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  features[0] = edx;
+  features[4] = ecx;
+
+  /* extended leaf 0x80000000: highest extended leaf */
+  eax = 0x80000000;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  highest_ext_cpuid = eax;
+
+  hwloc_debug("highest extended cpuid %x\n", highest_ext_cpuid);
+
+  if (highest_cpuid >= 0x7) {
+    /* leaf 7 subleaf 0: structured extended feature bits */
+    eax = 0x7;
+    ecx = 0;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    features[9] = ebx;
+  }
+
+  if (cpuid_type != intel && highest_ext_cpuid >= 0x80000001) {
+    eax = 0x80000001;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    features[1] = edx;
+    features[6] = ecx;
+  }
+
+  hwloc_x86_os_state_save(&os_state, src_cpuiddump);
+
+  ret = look_procs(backend, infos, fulldiscovery,
+		   highest_cpuid, highest_ext_cpuid, features, cpuid_type,
+		   get_cpubind, set_cpubind);
+  if (!ret)
+    /* success, we're done */
+    goto out_with_os_state;
+
+  if (nbprocs == 1) {
+    /* only one processor, no need to bind */
+    look_proc(backend, &infos[0], highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
+    summarize(backend, infos, fulldiscovery);
+    ret = 0;
+  }
+
+out_with_os_state:
+  hwloc_x86_os_state_restore(&os_state, src_cpuiddump);
+
+out_with_infos:
+  if (NULL != infos) {
+    for (i = 0; i < nbprocs; i++) {
+      free(infos[i].cache);
+      free(infos[i].otherids);
+    }
+    free(infos);
+  }
+
+out:
+  if (src_cpuiddump)
+    cpuiddump_free(src_cpuiddump);
+  return ret;
+}
+
+/* Backend discovery entry point: determine the number of processors, then
+ * either do full discovery (empty or PU-only topology) or only annotate
+ * objects already discovered by another backend.  Returns 1 when objects
+ * were (or may have been) added, 0 otherwise. */
+static int
+hwloc_x86_discover(struct hwloc_backend *backend)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  struct hwloc_topology *topology = backend->topology;
+  int alreadypus = 0;
+  int ret;
+
+#if HAVE_DECL_RUNNING_ON_VALGRIND
+  /* Valgrind does not emulate cpuid faithfully, bail out unless reading a dump */
+  if (RUNNING_ON_VALGRIND && !data->src_cpuiddump_path) {
+    fprintf(stderr, "hwloc x86 backend cannot work under Valgrind, disabling.\n"
+	    "May be reenabled by dumping CPUIDs with hwloc-gather-cpuid\n"
+	    "and reloading them under Valgrind with HWLOC_CPUID_PATH.\n");
+    return 0;
+  }
+#endif
+
+  if (data->src_cpuiddump_path) {
+    assert(data->nbprocs > 0); /* enforced by hwloc_x86_component_instantiate() */
+    topology->support.discovery->pu = 1;
+  } else {
+    int nbprocs = hwloc_fallback_nbprocessors(topology);
+    if (nbprocs >= 1)
+      topology->support.discovery->pu = 1;
+    else
+      nbprocs = 1;
+    data->nbprocs = (unsigned) nbprocs;
+  }
+
+  if (topology->levels[0][0]->cpuset) {
+    /* somebody else discovered things */
+    if (topology->nb_levels == 2 && topology->level_nbobjects[1] == data->nbprocs) {
+      /* only PUs were discovered, as much as we would, complete the topology with everything else */
+      alreadypus = 1;
+      goto fulldiscovery;
+    }
+
+    /* several object types were added, we can't easily complete, just do partial discovery */
+    hwloc_topology_reconnect(topology, 0);
+    ret = hwloc_look_x86(backend, 0);
+    if (ret)
+      hwloc_obj_add_info(topology->levels[0][0], "Backend", "x86");
+    return 0;
+  } else {
+    /* topology is empty, initialize it */
+    hwloc_alloc_root_sets(topology->levels[0][0]);
+  }
+
+fulldiscovery:
+  if (hwloc_look_x86(backend, 1) < 0) {
+    /* if failed, create PUs */
+    if (!alreadypus)
+      hwloc_setup_pu_level(topology, data->nbprocs);
+  }
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "x86");
+
+  if (!data->src_cpuiddump_path) { /* CPUID dump works for both x86 and x86_64 */
+#ifdef HAVE_UNAME
+    hwloc_add_uname_info(topology, NULL); /* we already know is_thissystem() is true */
+#else
+    /* uname isn't available, manually setup the "Architecture" info */
+#ifdef HWLOC_X86_64_ARCH
+    hwloc_obj_add_info(topology->levels[0][0], "Architecture", "x86_64");
+#else
+    hwloc_obj_add_info(topology->levels[0][0], "Architecture", "x86");
+#endif
+#endif
+  }
+
+  return 1;
+}
+
+/* Validate a dumped-cpuid directory (given through HWLOC_CPUID_PATH):
+ * checks that the "hwloc-cpuid-info" summary file reports an x86
+ * architecture, and fills `set' with the PU indexes found as "pu%u"
+ * directory entries.  The set must be non-empty and contiguous from 0.
+ * Returns 0 on success, -1 on error (always -1 on native Windows builds,
+ * where the dirent-based scan below is not available).
+ */
+static int
+hwloc_x86_check_cpuiddump_input(const char *src_cpuiddump_path, hwloc_bitmap_t set)
+{
+
+#if !(defined HWLOC_WIN_SYS && !defined __MINGW32__ && !defined __CYGWIN__) /* needs a lot of work */
+  struct dirent *dirent;
+  DIR *dir;
+  FILE *file;
+  char line [32];
+
+  dir = opendir(src_cpuiddump_path);
+  if (!dir)
+    return -1;
+
+  char path[strlen(src_cpuiddump_path) + strlen("/hwloc-cpuid-info") + 1];
+  sprintf(path, "%s/hwloc-cpuid-info", src_cpuiddump_path);
+  file = fopen(path, "r");
+  if (!file) {
+    fprintf(stderr, "Couldn't open dumped cpuid summary %s\n", path);
+    goto out_with_dir;
+  }
+  if (!fgets(line, sizeof(line), file)) {
+    fprintf(stderr, "Failed to read dumped cpuid summary in %s\n", path);
+    fclose(file);
+    goto out_with_dir;
+  }
+  fclose(file);
+  if (strcmp(line, "Architecture: x86\n")) {
+    fprintf(stderr, "Found non-x86 dumped cpuid summary in %s: %s\n", path, line);
+    goto out_with_dir;
+  }
+
+  /* collect pu%u entries into the bitmap */
+  while ((dirent = readdir(dir)) != NULL) {
+    if (!strncmp(dirent->d_name, "pu", 2)) {
+      char *end;
+      unsigned long idx = strtoul(dirent->d_name+2, &end, 10);
+      if (!*end)
+	hwloc_bitmap_set(set, idx);
+      else
+	fprintf(stderr, "Ignoring invalid dirent `%s' in dumped cpuid directory `%s'\n",
+		dirent->d_name, src_cpuiddump_path);
+    }
+  }
+  closedir(dir);
+
+  if (hwloc_bitmap_iszero(set)) {
+    fprintf(stderr, "Did not find any valid pu%%u entry in dumped cpuid directory `%s'\n",
+	    src_cpuiddump_path);
+    return -1;
+  } else if (hwloc_bitmap_last(set) != hwloc_bitmap_weight(set) - 1) {
+    /* The x86 backend enforces a contiguous set of PUs starting at 0 so far */
+    fprintf(stderr, "Found non-contigous pu%%u range in dumped cpuid directory `%s'\n",
+	    src_cpuiddump_path);
+    return -1;
+  }
+
+  return 0;
+
+out_with_dir:
+  closedir(dir);
+#endif /* HWLOC_WIN_SYS & !__MINGW32__ needs a lot of work */
+  return -1;
+}
+
+/* Release backend private data: the APIC-id bitmap, the optional
+ * cpuid-dump path, and the data structure itself. */
+static void
+hwloc_x86_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  hwloc_bitmap_free(data->apicid_set);
+  free(data->src_cpuiddump_path);
+  free(data);
+}
+
+/* Instantiate the x86 backend.  Honors the HWLOC_CPUID_PATH environment
+ * variable: when it points to a valid cpuid dump directory, discovery reads
+ * from the dump instead of the live system (and is_thissystem is cleared).
+ * Returns NULL on allocation failure. */
+static struct hwloc_backend *
+hwloc_x86_component_instantiate(struct hwloc_disc_component *component,
+				const void *_data1 __hwloc_attribute_unused,
+				const void *_data2 __hwloc_attribute_unused,
+				const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend;
+  struct hwloc_x86_backend_data_s *data;
+  const char *src_cpuiddump_path;
+
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    goto out;
+
+  data = malloc(sizeof(*data));
+  if (!data) {
+    errno = ENOMEM;
+    goto out_with_backend;
+  }
+
+  backend->private_data = data;
+  backend->discover = hwloc_x86_discover;
+  backend->disable = hwloc_x86_backend_disable;
+
+  /* default values */
+  data->is_knl = 0;
+  data->apicid_set = hwloc_bitmap_alloc();
+  data->apicid_unique = 1;
+  data->src_cpuiddump_path = NULL;
+
+  src_cpuiddump_path = getenv("HWLOC_CPUID_PATH");
+  if (src_cpuiddump_path) {
+    hwloc_bitmap_t set = hwloc_bitmap_alloc();
+    if (!hwloc_x86_check_cpuiddump_input(src_cpuiddump_path, set)) {
+      /* valid dump: discover from it rather than from this system */
+      backend->is_thissystem = 0;
+      data->src_cpuiddump_path = strdup(src_cpuiddump_path);
+      assert(!hwloc_bitmap_iszero(set)); /* enforced by hwloc_x86_check_cpuiddump_input() */
+      data->nbprocs = hwloc_bitmap_weight(set);
+    } else {
+      fprintf(stderr, "Ignoring dumped cpuid directory.\n");
+    }
+    hwloc_bitmap_free(set);
+  }
+
+  return backend;
+
+ out_with_backend:
+  free(backend);
+ out:
+  return NULL;
+}
+
+/* Discovery-component descriptor registering this backend as the "x86"
+ * CPU discovery component with priority 45. */
+static struct hwloc_disc_component hwloc_x86_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "x86",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_x86_component_instantiate,
+  45, /* between native and no_os */
+  1,
+  NULL
+};
+
+/* Generic, ABI-versioned component descriptor wrapping
+ * hwloc_x86_disc_component. */
+const struct hwloc_component hwloc_x86_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_x86_disc_component
+};
diff --git a/src/3rdparty/hwloc/src/topology-xml-nolibxml.c b/src/3rdparty/hwloc/src/topology-xml-nolibxml.c
new file mode 100644
index 000000000..5a0d02da4
--- /dev/null
+++ b/src/3rdparty/hwloc/src/topology-xml-nolibxml.c
@@ -0,0 +1,919 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/plugins.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/xml.h>
+#include <private/debug.h>
+
+#include <string.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+/*******************
+ * Import routines *
+ *******************/
+
+struct hwloc__nolibxml_backend_data_s {
+  size_t buflen; /* size of both buffer and copy buffers, set during backend_init() */
+  char *buffer; /* allocated and filled during backend_init() */
+  char *copy; /* allocated during backend_init(), used later during actual parsing */
+};
+
+typedef struct hwloc__nolibxml_import_state_data_s {
+  char *tagbuffer; /* buffer containing the next tag */
+  char *attrbuffer; /* buffer containing the next attribute of the current node */
+  char *tagname; /* tag name of the current node */
+  int closed; /* set if the current node is auto-closing */
+} __hwloc_attribute_may_alias * hwloc__nolibxml_import_state_data_t;
+
+static char *
+hwloc__nolibxml_import_ignore_spaces(char *buffer)
+{
+  return buffer + strspn(buffer, " \t\n");
+}
+
+static int
+hwloc__nolibxml_import_next_attr(hwloc__xml_import_state_t state, char **namep, char **valuep)
+{
+  hwloc__nolibxml_import_state_data_t nstate = (void*) state->data;
+  size_t namelen;
+  size_t len, escaped;
+  char *buffer, *value, *end;
+
+  if (!nstate->attrbuffer)
+    return -1;
+
+  /* find the beginning of an attribute */
+  buffer = hwloc__nolibxml_import_ignore_spaces(nstate->attrbuffer);
+  namelen = strspn(buffer, "abcdefghijklmnopqrstuvwxyz_");
+  if (buffer[namelen] != '=' || buffer[namelen+1] != '\"')
+    return -1;
+  buffer[namelen] = '\0';
+  *namep = buffer;
+
+  /* find the beginning of its value, and unescape it */
+  *valuep = value = buffer+namelen+2;
+  len = 0; escaped = 0;
+  while (value[len+escaped] != '\"') {
+    if (value[len+escaped] == '&') {
+      if (!strncmp(&value[1+len+escaped], "#10;", 4)) {
+	escaped += 4;
+	value[len] = '\n';
+      } else if (!strncmp(&value[1+len+escaped], "#13;", 4)) {
+	escaped += 4;
+	value[len] = '\r';
+      } else if (!strncmp(&value[1+len+escaped], "#9;", 3)) {
+	escaped += 3;
+	value[len] = '\t';
+      } else if (!strncmp(&value[1+len+escaped], "quot;", 5)) {
+	escaped += 5;
+	value[len] = '\"';
+      } else if (!strncmp(&value[1+len+escaped], "lt;", 3)) {
+	escaped += 3;
+	value[len] = '<';
+      } else if (!strncmp(&value[1+len+escaped], "gt;", 3)) {
+	escaped += 3;
+	value[len] = '>';
+      } else if (!strncmp(&value[1+len+escaped], "amp;", 4)) {
+	escaped += 4;
+	value[len] = '&';
+      } else {
+	return -1;
+      }
+    } else {
+      value[len] = value[len+escaped];
+    }
+    len++;
+    if (value[len+escaped] == '\0')
+      return -1;
+  }
+  value[len] = '\0';
+
+  /* find next attribute */
+  end = &value[len+escaped+1]; /* skip the ending " */
+  nstate->attrbuffer = hwloc__nolibxml_import_ignore_spaces(end);
+  return 0;
+}
+
+static int
+hwloc__nolibxml_import_find_child(hwloc__xml_import_state_t state,
+				  hwloc__xml_import_state_t childstate,
+				  char **tagp)
+{
+  hwloc__nolibxml_import_state_data_t nstate = (void*) state->data;
+  hwloc__nolibxml_import_state_data_t nchildstate = (void*) childstate->data;
+  char *buffer = nstate->tagbuffer;
+  char *end;
+  char *tag;
+  size_t namelen;
+
+  childstate->parent = state;
+  childstate->global = state->global;
+
+  /* auto-closed tags have no children */
+  if (nstate->closed)
+    return 0;
+
+  /* find the beginning of the tag */
+  buffer = hwloc__nolibxml_import_ignore_spaces(buffer);
+  if (buffer[0] != '<')
+    return -1;
+  buffer++;
+
+  /* if closing tag, return nothing and do not advance */
+  if (buffer[0] == '/')
+    return 0;
+
+  /* normal tag */
+  tag = nchildstate->tagname = buffer;
+
+  /* find the end, mark it and return it */
+  end = strchr(buffer, '>');
+  if (!end)
+    return -1;
+  end[0] = '\0';
+  nchildstate->tagbuffer = end+1;
+
+  /* handle auto-closing tags */
+  if (end[-1] == '/') {
+    nchildstate->closed = 1;
+    end[-1] = '\0';
+  } else
+    nchildstate->closed = 0;
+
+  /* find attributes */
+  namelen = strspn(buffer, "abcdefghijklmnopqrstuvwxyz1234567890_");
+
+  if (buffer[namelen] == '\0') {
+    /* no attributes */
+    nchildstate->attrbuffer = NULL;
+    *tagp = tag;
+    return 1;
+  }
+
+  if (buffer[namelen] != ' ')
+    return -1;
+
+  /* found a space, likely starting attributes */
+  buffer[namelen] = '\0';
+  nchildstate->attrbuffer = buffer+namelen+1;
+  *tagp = tag;
+  return 1;
+}
+
+static int
+hwloc__nolibxml_import_close_tag(hwloc__xml_import_state_t state)
+{
+  hwloc__nolibxml_import_state_data_t nstate = (void*) state->data;
+  char *buffer = nstate->tagbuffer;
+  char *end;
+
+  /* auto-closed tags need nothing */
+  if (nstate->closed)
+    return 0;
+
+  /* find the beginning of the tag */
+  buffer = hwloc__nolibxml_import_ignore_spaces(buffer);
+  if (buffer[0] != '<')
+    return -1;
+  buffer++;
+
+  /* find the end, mark it and return it to the parent */
+  end = strchr(buffer, '>');
+  if (!end)
+    return -1;
+  end[0] = '\0';
+  nstate->tagbuffer = end+1;
+
+  /* if closing tag, return nothing */
+  if (buffer[0] != '/' || strcmp(buffer+1, nstate->tagname) )
+    return -1;
+  return 0;
+}
+
+static void
+hwloc__nolibxml_import_close_child(hwloc__xml_import_state_t state)
+{
+  hwloc__nolibxml_import_state_data_t nstate = (void*) state->data;
+  hwloc__nolibxml_import_state_data_t nparent = (void*) state->parent->data;
+  nparent->tagbuffer = nstate->tagbuffer;
+}
+
+static int
+hwloc__nolibxml_import_get_content(hwloc__xml_import_state_t state,
+				   char **beginp, size_t expected_length)
+{
+  hwloc__nolibxml_import_state_data_t nstate = (void*) state->data;
+  char *buffer = nstate->tagbuffer;
+  size_t length;
+  char *end;
+
+  /* auto-closed tags have no content */
+  if (nstate->closed) {
+    if (expected_length)
+      return -1;
+    *beginp = (char *) "";
+    return 0;
+  }
+
+  /* find the next tag, where the content ends */
+  end = strchr(buffer, '<');
+  if (!end)
+    return -1;
+
+  length = (size_t) (end-buffer);
+  if (length != expected_length)
+    return -1;
+  nstate->tagbuffer = end;
+  *end = '\0'; /* mark as 0-terminated for now */
+  *beginp = buffer;
+  return 1;
+}
+
+static void
+hwloc__nolibxml_import_close_content(hwloc__xml_import_state_t state)
+{
+  /* put back the '<' that we overwrote to 0-terminate the content */
+  hwloc__nolibxml_import_state_data_t nstate = (void*) state->data;
+  if (!nstate->closed)
+    *nstate->tagbuffer = '<';
+}
+
+static int
+hwloc_nolibxml_look_init(struct hwloc_xml_backend_data_s *bdata,
+			 struct hwloc__xml_import_state_s *state)
+{
+  hwloc__nolibxml_import_state_data_t nstate = (void*) state->data;
+  struct hwloc__nolibxml_backend_data_s *nbdata = bdata->data;
+  unsigned major, minor;
+  char *end;
+  char *buffer;
+
+  HWLOC_BUILD_ASSERT(sizeof(*nstate) <= sizeof(state->data));
+
+  /* use a copy in the temporary buffer, we may modify during parsing */
+  buffer = nbdata->copy;
+  memcpy(buffer, nbdata->buffer, nbdata->buflen);
+
+  /* skip headers */
+  while (!strncmp(buffer, "<?xml ", 6) || !strncmp(buffer, "<!DOCTYPE ", 10)) {
+    buffer = strchr(buffer, '\n');
+    if (!buffer)
+      goto failed;
+    buffer++;
+  }
+
+  /* find topology tag */
+  if (sscanf(buffer, "<topology version=\"%u.%u\">", &major, &minor) == 2) {
+    bdata->version_major = major;
+    bdata->version_minor = minor;
+    end = strchr(buffer, '>') + 1;
+  } else if (!strncmp(buffer, "<topology>", 10)) {
+    bdata->version_major = 1;
+    bdata->version_minor = 0;
+    end = buffer + 10;
+  } else if (!strncmp(buffer, "<root>", 6)) {
+    bdata->version_major = 0;
+    bdata->version_minor = 9;
+    end = buffer + 6;
+  } else
+    goto failed;
+
+  state->global->next_attr = hwloc__nolibxml_import_next_attr;
+  state->global->find_child = hwloc__nolibxml_import_find_child;
+  state->global->close_tag = hwloc__nolibxml_import_close_tag;
+  state->global->close_child = hwloc__nolibxml_import_close_child;
+  state->global->get_content = hwloc__nolibxml_import_get_content;
+  state->global->close_content = hwloc__nolibxml_import_close_content;
+  state->parent = NULL;
+  nstate->closed = 0;
+  nstate->tagbuffer = end;
+  nstate->tagname = (char *) "topology";
+  nstate->attrbuffer = NULL;
+  return 0; /* success */
+
+ failed:
+  return -1; /* failed */
+}
+
+/* can be called at the end of the import (to cleanup things early),
+ * or by backend_exit() if load failed for other reasons.
+ */
+static void
+hwloc_nolibxml_free_buffers(struct hwloc_xml_backend_data_s *bdata)
+{
+  struct hwloc__nolibxml_backend_data_s *nbdata = bdata->data;
+  if (nbdata->buffer) {
+    free(nbdata->buffer);
+    nbdata->buffer = NULL;
+  }
+  if (nbdata->copy) {
+    free(nbdata->copy);
+    nbdata->copy = NULL;
+  }
+}
+
+static void
+hwloc_nolibxml_look_done(struct hwloc_xml_backend_data_s *bdata, int result)
+{
+  hwloc_nolibxml_free_buffers(bdata);
+
+  if (result < 0 && hwloc__xml_verbose())
+    fprintf(stderr, "Failed to parse XML input with the minimalistic parser. If it was not\n"
+	    "generated by hwloc, try enabling full XML support with libxml2.\n");
+}
+
+/********************
+ * Backend routines *
+ ********************/
+
+static void
+hwloc_nolibxml_backend_exit(struct hwloc_xml_backend_data_s *bdata)
+{
+  struct hwloc__nolibxml_backend_data_s *nbdata = bdata->data;
+  hwloc_nolibxml_free_buffers(bdata);
+  free(nbdata);
+}
+
+static int
+hwloc_nolibxml_read_file(const char *xmlpath, char **bufferp, size_t *buflenp)
+{
+  FILE * file;
+  size_t buflen, offset, readlen;
+  struct stat statbuf;
+  char *buffer, *tmp;
+  size_t ret;
+
+  if (!strcmp(xmlpath, "-"))
+    xmlpath = "/dev/stdin";
+
+  file = fopen(xmlpath, "r");
+  if (!file)
+    goto out;
+
+  /* find the required buffer size for regular files, or use 4k when unknown, we'll realloc later if needed */
+  buflen = 4096;
+  if (!stat(xmlpath, &statbuf))
+    if (S_ISREG(statbuf.st_mode))
+      buflen = statbuf.st_size+1; /* one additional byte so that the first fread() gets EOF too */
+
+  buffer = malloc(buflen+1); /* one more byte for the ending \0 */
+  if (!buffer)
+    goto out_with_file;
+
+  offset = 0; readlen = buflen;
+  while (1) {
+    ret = fread(buffer+offset, 1, readlen, file);
+
+    offset += ret;
+    buffer[offset] = 0;
+
+    if (ret != readlen)
+      break;
+
+    buflen *= 2;
+    tmp = realloc(buffer, buflen+1);
+    if (!tmp)
+      goto out_with_buffer;
+    buffer = tmp;
+    readlen = buflen/2;
+  }
+
+  fclose(file);
+  *bufferp = buffer;
+  *buflenp = offset+1;
+  return 0;
+
+ out_with_buffer:
+  free(buffer);
+ out_with_file:
+  fclose(file);
+ out:
+  return -1;
+}
+
+static int
+hwloc_nolibxml_backend_init(struct hwloc_xml_backend_data_s *bdata,
+			    const char *xmlpath, const char *xmlbuffer, int xmlbuflen)
+{
+  struct hwloc__nolibxml_backend_data_s *nbdata = malloc(sizeof(*nbdata));
+
+  if (!nbdata)
+    goto out;
+  bdata->data = nbdata;
+
+  if (xmlbuffer) {
+    nbdata->buffer = malloc(xmlbuflen+1);
+    if (!nbdata->buffer)
+      goto out_with_nbdata;
+    nbdata->buflen = xmlbuflen+1;
+    memcpy(nbdata->buffer, xmlbuffer, xmlbuflen);
+    nbdata->buffer[xmlbuflen] = '\0';
+
+  } else {
+    int err = hwloc_nolibxml_read_file(xmlpath, &nbdata->buffer, &nbdata->buflen);
+    if (err < 0)
+      goto out_with_nbdata;
+  }
+
+  /* allocate a temporary copy buffer that we may modify during parsing */
+  nbdata->copy = malloc(nbdata->buflen+1);
+  if (!nbdata->copy)
+    goto out_with_buffer;
+  nbdata->copy[nbdata->buflen] = '\0';
+
+  bdata->look_init = hwloc_nolibxml_look_init;
+  bdata->look_done = hwloc_nolibxml_look_done;
+  bdata->backend_exit = hwloc_nolibxml_backend_exit;
+  return 0;
+
+out_with_buffer:
+  free(nbdata->buffer);
+out_with_nbdata:
+  free(nbdata);
+out:
+  return -1;
+}
+
+static int
+hwloc_nolibxml_import_diff(struct hwloc__xml_import_state_s *state,
+			   const char *xmlpath, const char *xmlbuffer, int xmlbuflen,
+			   hwloc_topology_diff_t *firstdiffp, char **refnamep)
+{
+  hwloc__nolibxml_import_state_data_t nstate = (void*) state->data;
+  struct hwloc__xml_import_state_s childstate;
+  char *refname = NULL;
+  char *buffer, *tmp, *tag;
+  size_t buflen;
+  int ret;
+
+  HWLOC_BUILD_ASSERT(sizeof(*nstate) <= sizeof(state->data));
+
+  if (xmlbuffer) {
+    buffer = malloc(xmlbuflen);
+    if (!buffer)
+      goto out;
+    memcpy(buffer, xmlbuffer, xmlbuflen);
+    buflen = xmlbuflen;
+
+  } else {
+    ret = hwloc_nolibxml_read_file(xmlpath, &buffer, &buflen);
+    if (ret < 0)
+      goto out;
+  }
+
+  /* skip headers */
+  tmp = buffer;
+  while (!strncmp(tmp, "<?xml ", 6) || !strncmp(tmp, "<!DOCTYPE ", 10)) {
+    tmp = strchr(tmp, '\n');
+    if (!tmp)
+      goto out_with_buffer;
+    tmp++;
+  }
+
+  state->global->next_attr = hwloc__nolibxml_import_next_attr;
+  state->global->find_child = hwloc__nolibxml_import_find_child;
+  state->global->close_tag = hwloc__nolibxml_import_close_tag;
+  state->global->close_child = hwloc__nolibxml_import_close_child;
+  state->global->get_content = hwloc__nolibxml_import_get_content;
+  state->global->close_content = hwloc__nolibxml_import_close_content;
+  state->parent = NULL;
+  nstate->closed = 0;
+  nstate->tagbuffer = tmp;
+  nstate->tagname = NULL;
+  nstate->attrbuffer = NULL;
+
+  /* find root */
+  ret = hwloc__nolibxml_import_find_child(state, &childstate, &tag);
+  if (ret < 0)
+    goto out_with_buffer;
+  if (!tag || strcmp(tag, "topologydiff"))
+    goto out_with_buffer;
+
+  while (1) {
+    char *attrname, *attrvalue;
+    if (hwloc__nolibxml_import_next_attr(&childstate, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "refname")) {
+      free(refname);
+      refname = strdup(attrvalue);
+    } else
+      goto out_with_buffer;
+  }
+
+  ret = hwloc__xml_import_diff(&childstate, firstdiffp);
+  if (refnamep && !ret)
+    *refnamep = refname;
+  else
+    free(refname);
+
+  free(buffer);
+  return ret;
+
+out_with_buffer:
+  free(buffer);
+  free(refname);
+out:
+  return -1;
+}
+
+/*******************
+ * Export routines *
+ *******************/
+
+typedef struct hwloc__nolibxml_export_state_data_s {
+  char *buffer; /* (moving) buffer where to write */
+  size_t written; /* how many bytes were written (or would have be written if not truncated) */
+  size_t remaining; /* how many bytes are still available in the buffer */
+  unsigned indent; /* indentation level for the next line */
+  unsigned nr_children;
+  unsigned has_content;
+} __hwloc_attribute_may_alias * hwloc__nolibxml_export_state_data_t;
+
+static void
+hwloc__nolibxml_export_update_buffer(hwloc__nolibxml_export_state_data_t ndata, int res)
+{
+  if (res >= 0) {
+    ndata->written += res;
+    if (res >= (int) ndata->remaining)
+      res = ndata->remaining>0 ? (int)ndata->remaining-1 : 0;
+    ndata->buffer += res;
+    ndata->remaining -= res;
+  }
+}
+
+static char *
+hwloc__nolibxml_export_escape_string(const char *src)
+{
+  size_t fulllen, sublen;
+  char *escaped, *dst;
+
+  fulllen = strlen(src);
+
+  sublen = strcspn(src, "\n\r\t\"<>&");
+  if (sublen == fulllen)
+    return NULL; /* nothing to escape */
+
+  escaped = malloc(fulllen*6+1); /* escaped chars are replaced by at most 6 char */
+  dst = escaped;
+
+  memcpy(dst, src, sublen);
+  src += sublen;
+  dst += sublen;
+
+  while (*src) {
+    int replen;
+    switch (*src) {
+    case '\n': strcpy(dst, "&#10;");  replen=5; break;
+    case '\r': strcpy(dst, "&#13;");  replen=5; break;
+    case '\t': strcpy(dst, "&#9;");   replen=4; break;
+    case '\"': strcpy(dst, "&quot;"); replen=6; break;
+    case '<':  strcpy(dst, "&lt;");   replen=4; break;
+    case '>':  strcpy(dst, "&gt;");   replen=4; break;
+    case '&':  strcpy(dst, "&amp;");  replen=5; break;
+    default: replen=0; break;
+    }
+    dst+=replen; src++;
+
+    sublen = strcspn(src, "\n\r\t\"<>&");
+    memcpy(dst, src, sublen);
+    src += sublen;
+    dst += sublen;
+  }
+
+  *dst = 0;
+  return escaped;
+}
+
+static void
+hwloc__nolibxml_export_new_child(hwloc__xml_export_state_t parentstate,
+				 hwloc__xml_export_state_t state,
+				 const char *name)
+{
+  hwloc__nolibxml_export_state_data_t npdata = (void *) parentstate->data;
+  hwloc__nolibxml_export_state_data_t ndata = (void *) state->data;
+  int res;
+
+  assert(!npdata->has_content);
+  if (!npdata->nr_children) {
+    res = hwloc_snprintf(npdata->buffer, npdata->remaining, ">\n");
+    hwloc__nolibxml_export_update_buffer(npdata, res);
+  }
+  npdata->nr_children++;
+
+  state->parent = parentstate;
+  state->new_child = parentstate->new_child;
+  state->new_prop = parentstate->new_prop;
+  state->add_content = parentstate->add_content;
+  state->end_object = parentstate->end_object;
+  state->global = parentstate->global;
+
+  ndata->buffer = npdata->buffer;
+  ndata->written = npdata->written;
+  ndata->remaining = npdata->remaining;
+  ndata->indent = npdata->indent + 2;
+
+  ndata->nr_children = 0;
+  ndata->has_content = 0;
+
+  res = hwloc_snprintf(ndata->buffer, ndata->remaining, "%*s<%s", (int) npdata->indent, "", name);
+  hwloc__nolibxml_export_update_buffer(ndata, res);
+}
+
+static void
+hwloc__nolibxml_export_new_prop(hwloc__xml_export_state_t state, const char *name, const char *value)
+{
+  hwloc__nolibxml_export_state_data_t ndata = (void *) state->data;
+  char *escaped = hwloc__nolibxml_export_escape_string(value);
+  int res = hwloc_snprintf(ndata->buffer, ndata->remaining, " %s=\"%s\"", name, escaped ? (const char *) escaped : value);
+  hwloc__nolibxml_export_update_buffer(ndata, res);
+  free(escaped);
+}
+
+static void
+hwloc__nolibxml_export_end_object(hwloc__xml_export_state_t state, const char *name)
+{
+  hwloc__nolibxml_export_state_data_t ndata = (void *) state->data;
+  hwloc__nolibxml_export_state_data_t npdata = (void *) state->parent->data;
+  int res;
+
+  assert (!(ndata->has_content && ndata->nr_children));
+  if (ndata->has_content) {
+    res = hwloc_snprintf(ndata->buffer, ndata->remaining, "</%s>\n", name);
+  } else if (ndata->nr_children) {
+    res = hwloc_snprintf(ndata->buffer, ndata->remaining, "%*s</%s>\n", (int) npdata->indent, "", name);
+  } else {
+    res = hwloc_snprintf(ndata->buffer, ndata->remaining, "/>\n");
+  }
+  hwloc__nolibxml_export_update_buffer(ndata, res);
+
+  npdata->buffer = ndata->buffer;
+  npdata->written = ndata->written;
+  npdata->remaining = ndata->remaining;
+}
+
+static void
+hwloc__nolibxml_export_add_content(hwloc__xml_export_state_t state, const char *buffer, size_t length)
+{
+  hwloc__nolibxml_export_state_data_t ndata = (void *) state->data;
+  int res;
+
+  assert(!ndata->nr_children);
+  if (!ndata->has_content) {
+    res = hwloc_snprintf(ndata->buffer, ndata->remaining, ">");
+    hwloc__nolibxml_export_update_buffer(ndata, res);
+  }
+  ndata->has_content = 1;
+
+  res = hwloc_snprintf(ndata->buffer, ndata->remaining, buffer, length);
+  hwloc__nolibxml_export_update_buffer(ndata, res);
+}
+
+static size_t
+hwloc___nolibxml_prepare_export(hwloc_topology_t topology, struct hwloc__xml_export_data_s *edata,
+				char *xmlbuffer, int buflen, unsigned long flags)
+{
+  struct hwloc__xml_export_state_s state, childstate;
+  hwloc__nolibxml_export_state_data_t ndata = (void *) &state.data;
+  int v1export = flags & HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1;
+  int res;
+
+  HWLOC_BUILD_ASSERT(sizeof(*ndata) <= sizeof(state.data));
+
+  state.new_child = hwloc__nolibxml_export_new_child;
+  state.new_prop = hwloc__nolibxml_export_new_prop;
+  state.add_content = hwloc__nolibxml_export_add_content;
+  state.end_object = hwloc__nolibxml_export_end_object;
+  state.global = edata;
+
+  ndata->indent = 0;
+  ndata->written = 0;
+  ndata->buffer = xmlbuffer;
+  ndata->remaining = buflen;
+
+  ndata->nr_children = 1; /* don't close a non-existing previous tag when opening the topology tag */
+  ndata->has_content = 0;
+
+  res = hwloc_snprintf(ndata->buffer, ndata->remaining,
+		 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+		 "<!DOCTYPE topology SYSTEM \"%s\">\n", v1export ? "hwloc.dtd" : "hwloc2.dtd");
+  hwloc__nolibxml_export_update_buffer(ndata, res);
+  hwloc__nolibxml_export_new_child(&state, &childstate, "topology");
+  if (!(flags & HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1))
+    hwloc__nolibxml_export_new_prop(&childstate, "version", "2.0");
+  hwloc__xml_export_topology (&childstate, topology, flags);
+  hwloc__nolibxml_export_end_object(&childstate, "topology");
+
+  return ndata->written+1; /* ending \0 */
+}
+
+static int
+hwloc_nolibxml_export_buffer(hwloc_topology_t topology, struct hwloc__xml_export_data_s *edata,
+			     char **bufferp, int *buflenp, unsigned long flags)
+{
+  char *buffer;
+  size_t bufferlen, res;
+
+  bufferlen = 16384; /* random guess for large enough default */
+  buffer = malloc(bufferlen);
+  if (!buffer)
+    return -1;
+  res = hwloc___nolibxml_prepare_export(topology, edata, buffer, (int)bufferlen, flags);
+
+  if (res > bufferlen) {
+    char *tmp = realloc(buffer, res);
+    if (!tmp) {
+      free(buffer);
+      return -1;
+    }
+    buffer = tmp;
+    hwloc___nolibxml_prepare_export(topology, edata, buffer, (int)res, flags);
+  }
+
+  *bufferp = buffer;
+  *buflenp = (int)res;
+  return 0;
+}
+
+static int
+hwloc_nolibxml_export_file(hwloc_topology_t topology, struct hwloc__xml_export_data_s *edata,
+			   const char *filename, unsigned long flags)
+{
+  FILE *file;
+  char *buffer;
+  int bufferlen;
+  int ret;
+
+  ret = hwloc_nolibxml_export_buffer(topology, edata, &buffer, &bufferlen, flags);
+  if (ret < 0)
+    return -1;
+
+  if (!strcmp(filename, "-")) {
+    file = stdout;
+  } else {
+    file = fopen(filename, "w");
+    if (!file) {
+      free(buffer);
+      return -1;
+    }
+  }
+
+  ret = (int)fwrite(buffer, 1, bufferlen-1 /* don't write the ending \0 */, file);
+  if (ret == bufferlen-1) {
+    ret = 0;
+  } else {
+    errno = ferror(file);
+    ret = -1;
+  }
+
+  free(buffer);
+
+  if (file != stdout)
+    fclose(file);
+  return ret;
+}
+
+static size_t
+hwloc___nolibxml_prepare_export_diff(hwloc_topology_diff_t diff, const char *refname, char *xmlbuffer, int buflen)
+{
+  struct hwloc__xml_export_state_s state, childstate;
+  hwloc__nolibxml_export_state_data_t ndata = (void *) &state.data;
+  int res;
+
+  HWLOC_BUILD_ASSERT(sizeof(*ndata) <= sizeof(state.data));
+
+  state.new_child = hwloc__nolibxml_export_new_child;
+  state.new_prop = hwloc__nolibxml_export_new_prop;
+  state.add_content = hwloc__nolibxml_export_add_content;
+  state.end_object = hwloc__nolibxml_export_end_object;
+
+  ndata->indent = 0;
+  ndata->written = 0;
+  ndata->buffer = xmlbuffer;
+  ndata->remaining = buflen;
+
+  ndata->nr_children = 1; /* don't close a non-existing previous tag when opening the topology tag */
+  ndata->has_content = 0;
+
+  res = hwloc_snprintf(ndata->buffer, ndata->remaining,
+		 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+		 "<!DOCTYPE topologydiff SYSTEM \"hwloc2-diff.dtd\">\n");
+  hwloc__nolibxml_export_update_buffer(ndata, res);
+  hwloc__nolibxml_export_new_child(&state, &childstate, "topologydiff");
+  if (refname)
+    hwloc__nolibxml_export_new_prop(&childstate, "refname", refname);
+  hwloc__xml_export_diff (&childstate, diff);
+  hwloc__nolibxml_export_end_object(&childstate, "topologydiff");
+
+  return ndata->written+1;
+}
+
+static int
+hwloc_nolibxml_export_diff_buffer(hwloc_topology_diff_t diff, const char *refname, char **bufferp, int *buflenp)
+{
+  char *buffer;
+  size_t bufferlen, res;
+
+  bufferlen = 16384; /* random guess for large enough default */
+  buffer = malloc(bufferlen);
+  if (!buffer)
+    return -1;
+  res = hwloc___nolibxml_prepare_export_diff(diff, refname, buffer, (int)bufferlen);
+
+  if (res > bufferlen) {
+    char *tmp = realloc(buffer, res);
+    if (!tmp) {
+      free(buffer);
+      return -1;
+    }
+    buffer = tmp;
+    hwloc___nolibxml_prepare_export_diff(diff, refname, buffer, (int)res);
+  }
+
+  *bufferp = buffer;
+  *buflenp = (int)res;
+  return 0;
+}
+
+static int
+hwloc_nolibxml_export_diff_file(hwloc_topology_diff_t diff, const char *refname, const char *filename)
+{
+  FILE *file;
+  char *buffer;
+  int bufferlen;
+  int ret;
+
+  ret = hwloc_nolibxml_export_diff_buffer(diff, refname, &buffer, &bufferlen);
+  if (ret < 0)
+    return -1;
+
+  if (!strcmp(filename, "-")) {
+    file = stdout;
+  } else {
+    file = fopen(filename, "w");
+    if (!file) {
+      free(buffer);
+      return -1;
+    }
+  }
+
+  ret = (int)fwrite(buffer, 1, bufferlen-1 /* don't write the ending \0 */, file);
+  if (ret == bufferlen-1) {
+    ret = 0;
+  } else {
+    errno = ferror(file);
+    ret = -1;
+  }
+
+  free(buffer);
+
+  if (file != stdout)
+    fclose(file);
+  return ret;
+}
+
+static void
+hwloc_nolibxml_free_buffer(void *xmlbuffer)
+{
+  free(xmlbuffer);
+}
+
+/*************
+ * Callbacks *
+ *************/
+
+static struct hwloc_xml_callbacks hwloc_xml_nolibxml_callbacks = {
+  hwloc_nolibxml_backend_init,
+  hwloc_nolibxml_export_file,
+  hwloc_nolibxml_export_buffer,
+  hwloc_nolibxml_free_buffer,
+  hwloc_nolibxml_import_diff,
+  hwloc_nolibxml_export_diff_file,
+  hwloc_nolibxml_export_diff_buffer
+};
+
+static struct hwloc_xml_component hwloc_nolibxml_xml_component = {
+  &hwloc_xml_nolibxml_callbacks,
+  NULL
+};
+
+const struct hwloc_component hwloc_xml_nolibxml_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_XML,
+  0,
+  &hwloc_nolibxml_xml_component
+};
diff --git a/src/3rdparty/hwloc/src/topology-xml.c b/src/3rdparty/hwloc/src/topology-xml.c
new file mode 100644
index 000000000..e7c5ef621
--- /dev/null
+++ b/src/3rdparty/hwloc/src/topology-xml.c
@@ -0,0 +1,2886 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2019 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2009-2018 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/xml.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <math.h>
+
+/* Whether verbose XML warnings are enabled, from the HWLOC_XML_VERBOSE
+ * environment variable.  The environment is consulted only on the first
+ * call; the result is cached for subsequent calls.
+ */
+int
+hwloc__xml_verbose(void)
+{
+  static int initialized = 0;
+  static int enabled = 0;
+  if (initialized)
+    return enabled;
+  {
+    const char *setting = getenv("HWLOC_XML_VERBOSE");
+    if (setting != NULL)
+      enabled = atoi(setting);
+  }
+  initialized = 1;
+  return enabled;
+}
+
+/* Return nonzero when XML import must use the minimal built-in parser
+ * instead of libxml2.  HWLOC_LIBXML (0 disables libxml) takes precedence
+ * over the more specific HWLOC_LIBXML_IMPORT.  Result is cached.
+ */
+static int
+hwloc_nolibxml_import(void)
+{
+  static int cached = 0;
+  static int force_nolibxml = 0;
+  if (!cached) {
+    const char *setting = getenv("HWLOC_LIBXML");
+    if (!setting)
+      setting = getenv("HWLOC_LIBXML_IMPORT");
+    if (setting)
+      force_nolibxml = !atoi(setting);
+    cached = 1;
+  }
+  return force_nolibxml;
+}
+
+/* Return nonzero when XML export must use the minimal built-in writer
+ * instead of libxml2.  HWLOC_LIBXML (0 disables libxml) takes precedence
+ * over the more specific HWLOC_LIBXML_EXPORT.  Result is cached.
+ */
+static int
+hwloc_nolibxml_export(void)
+{
+  static int cached = 0;
+  static int force_nolibxml = 0;
+  if (!cached) {
+    const char *setting = getenv("HWLOC_LIBXML");
+    if (!setting)
+      setting = getenv("HWLOC_LIBXML_EXPORT");
+    if (setting)
+      force_nolibxml = !atoi(setting);
+    cached = 1;
+  }
+  return force_nolibxml;
+}
+
+/* Characters produced by base64-encoding a 'length'-byte buffer
+ * (4 output chars per 3 input bytes, rounded up). */
+#define BASE64_ENCODED_LENGTH(length) (4*(((length)+2)/3))
+
+/*********************************
+ ********* XML callbacks *********
+ *********************************/
+
+/* set when registering nolibxml and libxml components.
+ * modifications protected by the components mutex.
+ * read by the common XML code in topology-xml.c to jump to the right XML backend.
+ */
+static struct hwloc_xml_callbacks *hwloc_nolibxml_callbacks = NULL, *hwloc_libxml_callbacks = NULL;
+
+/* Record the callback tables of a registering XML component.
+ * Only the first registered table of each kind is kept; modifications
+ * are protected by the components mutex. */
+void
+hwloc_xml_callbacks_register(struct hwloc_xml_component *comp)
+{
+  if (!hwloc_nolibxml_callbacks)
+    hwloc_nolibxml_callbacks = comp->nolibxml_callbacks;
+  if (!hwloc_libxml_callbacks)
+    hwloc_libxml_callbacks = comp->libxml_callbacks;
+}
+
+/* Forget both registered XML callback tables (used when unloading components). */
+void
+hwloc_xml_callbacks_reset(void)
+{
+  hwloc_nolibxml_callbacks = NULL;
+  hwloc_libxml_callbacks = NULL;
+}
+
+/************************************************
+ ********* XML import (common routines) *********
+ ************************************************/
+
+#define _HWLOC_OBJ_CACHE_OLD (HWLOC_OBJ_TYPE_MAX+1) /* temporarily used when importing pre-v2.0 attribute-less cache types */
+#define _HWLOC_OBJ_FUTURE    (HWLOC_OBJ_TYPE_MAX+2) /* temporarily used when ignoring future types */
+
+/* Parse one name/value attribute of an <object> tag and store it into the
+ * object being imported.  The "type" attribute has already been handled by
+ * the caller.  Attributes from pre-2.0 (and pre-1.0) XML exports are
+ * accepted for backward compatibility.  Unknown or misplaced attributes
+ * are ignored, with a message on stderr when HWLOC_XML_VERBOSE is set;
+ * errors never abort the import at this level.
+ */
+static void
+hwloc__xml_import_object_attr(struct hwloc_topology *topology,
+			      struct hwloc_xml_backend_data_s *data,
+			      struct hwloc_obj *obj,
+			      const char *name, const char *value,
+			      hwloc__xml_import_state_t state)
+{
+  if (!strcmp(name, "type")) {
+    /* already handled */
+    return;
+  }
+
+  else if (!strcmp(name, "os_index"))
+    obj->os_index = strtoul(value, NULL, 10);
+  else if (!strcmp(name, "gp_index")) {
+    obj->gp_index = strtoull(value, NULL, 10);
+    if (!obj->gp_index && hwloc__xml_verbose())
+      fprintf(stderr, "%s: unexpected zero gp_index, topology may be invalid\n", state->global->msgprefix);
+    /* keep next_gp_index above every imported index */
+    if (obj->gp_index >= topology->next_gp_index)
+      topology->next_gp_index = obj->gp_index + 1;
+  } else if (!strcmp(name, "cpuset")) {
+    if (!obj->cpuset)
+      obj->cpuset = hwloc_bitmap_alloc();
+    hwloc_bitmap_sscanf(obj->cpuset, value);
+  } else if (!strcmp(name, "complete_cpuset")) {
+    if (!obj->complete_cpuset)
+      obj->complete_cpuset = hwloc_bitmap_alloc();
+    hwloc_bitmap_sscanf(obj->complete_cpuset, value);
+  } else if (!strcmp(name, "allowed_cpuset")) {
+    /* ignored except for root */
+    if (!obj->parent)
+      hwloc_bitmap_sscanf(topology->allowed_cpuset, value);
+  } else if (!strcmp(name, "nodeset")) {
+    if (!obj->nodeset)
+      obj->nodeset = hwloc_bitmap_alloc();
+    hwloc_bitmap_sscanf(obj->nodeset, value);
+  } else if (!strcmp(name, "complete_nodeset")) {
+    if (!obj->complete_nodeset)
+      obj->complete_nodeset = hwloc_bitmap_alloc();
+    hwloc_bitmap_sscanf(obj->complete_nodeset, value);
+  } else if (!strcmp(name, "allowed_nodeset")) {
+    /* ignored except for root */
+    if (!obj->parent)
+      hwloc_bitmap_sscanf(topology->allowed_nodeset, value);
+  } else if (!strcmp(name, "name")) {
+    if (obj->name)
+      free(obj->name);
+    obj->name = strdup(value);
+  } else if (!strcmp(name, "subtype")) {
+    if (obj->subtype)
+      free(obj->subtype);
+    obj->subtype = strdup(value);
+  }
+
+  else if (!strcmp(name, "cache_size")) {
+    unsigned long long lvalue = strtoull(value, NULL, 10);
+    if (hwloc__obj_type_is_cache(obj->type) || obj->type == _HWLOC_OBJ_CACHE_OLD)
+      obj->attr->cache.size = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring cache_size attribute for non-cache object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "cache_linesize")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+    if (hwloc__obj_type_is_cache(obj->type) || obj->type == _HWLOC_OBJ_CACHE_OLD)
+      obj->attr->cache.linesize = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring cache_linesize attribute for non-cache object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "cache_associativity")) {
+    int lvalue = atoi(value);
+    if (hwloc__obj_type_is_cache(obj->type) || obj->type == _HWLOC_OBJ_CACHE_OLD)
+      obj->attr->cache.associativity = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring cache_associativity attribute for non-cache object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "cache_type")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+    if (hwloc__obj_type_is_cache(obj->type) || obj->type == _HWLOC_OBJ_CACHE_OLD) {
+      /* only accept known enum values, never store arbitrary ints */
+      if (lvalue == HWLOC_OBJ_CACHE_UNIFIED
+	  || lvalue == HWLOC_OBJ_CACHE_DATA
+	  || lvalue == HWLOC_OBJ_CACHE_INSTRUCTION)
+	obj->attr->cache.type = (hwloc_obj_cache_type_t) lvalue;
+      else
+	fprintf(stderr, "%s: ignoring invalid cache_type attribute %lu\n",
+		state->global->msgprefix, lvalue);
+    } else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring cache_type attribute for non-cache object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "local_memory")) {
+    unsigned long long lvalue = strtoull(value, NULL, 10);
+    if (obj->type == HWLOC_OBJ_NUMANODE)
+      obj->attr->numanode.local_memory = lvalue;
+    else if (!obj->parent)
+      topology->machine_memory.local_memory = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring local_memory attribute for non-NUMAnode non-root object\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "depth")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+     if (hwloc__obj_type_is_cache(obj->type) || obj->type == _HWLOC_OBJ_CACHE_OLD) {
+	obj->attr->cache.depth = lvalue;
+     } else if (obj->type == HWLOC_OBJ_GROUP || obj->type == HWLOC_OBJ_BRIDGE) {
+       /* will be overwritten by the core */
+     } else if (hwloc__xml_verbose())
+       fprintf(stderr, "%s: ignoring depth attribute for object type without depth\n",
+	       state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "kind")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+    if (obj->type == HWLOC_OBJ_GROUP)
+      obj->attr->group.kind = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring kind attribute for non-group object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "subkind")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+    if (obj->type == HWLOC_OBJ_GROUP)
+      obj->attr->group.subkind = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring subkind attribute for non-group object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "dont_merge")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+    if (obj->type == HWLOC_OBJ_GROUP)
+      obj->attr->group.dont_merge = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring dont_merge attribute for non-group object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "pci_busid")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_PCI_DEVICE:
+    case HWLOC_OBJ_BRIDGE: {
+      unsigned domain, bus, dev, func;
+      if (sscanf(value, "%04x:%02x:%02x.%01x",
+		 &domain, &bus, &dev, &func) != 4) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring invalid pci_busid format string %s\n",
+		  state->global->msgprefix, value);
+      } else {
+	obj->attr->pcidev.domain = domain;
+	obj->attr->pcidev.bus = bus;
+	obj->attr->pcidev.dev = dev;
+	obj->attr->pcidev.func = func;
+      }
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring pci_busid attribute for non-PCI object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (!strcmp(name, "pci_type")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_PCI_DEVICE:
+    case HWLOC_OBJ_BRIDGE: {
+      unsigned classid, vendor, device, subvendor, subdevice, revision;
+      if (sscanf(value, "%04x [%04x:%04x] [%04x:%04x] %02x",
+		 &classid, &vendor, &device, &subvendor, &subdevice, &revision) != 6) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring invalid pci_type format string %s\n",
+		  state->global->msgprefix, value);
+      } else {
+	obj->attr->pcidev.class_id = classid;
+	obj->attr->pcidev.vendor_id = vendor;
+	obj->attr->pcidev.device_id = device;
+	obj->attr->pcidev.subvendor_id = subvendor;
+	obj->attr->pcidev.subdevice_id = subdevice;
+	obj->attr->pcidev.revision = revision;
+      }
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring pci_type attribute for non-PCI object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (!strcmp(name, "pci_link_speed")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_PCI_DEVICE:
+    case HWLOC_OBJ_BRIDGE: {
+      obj->attr->pcidev.linkspeed = (float) atof(value);
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring pci_link_speed attribute for non-PCI object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (!strcmp(name, "bridge_type")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_BRIDGE: {
+      unsigned upstream_type, downstream_type;
+      if (sscanf(value, "%u-%u", &upstream_type, &downstream_type) != 2) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring invalid bridge_type format string %s\n",
+		  state->global->msgprefix, value);
+      } else {
+	obj->attr->bridge.upstream_type = (hwloc_obj_bridge_type_t) upstream_type;
+	obj->attr->bridge.downstream_type = (hwloc_obj_bridge_type_t) downstream_type;
+      };
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring bridge_type attribute for non-bridge object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (!strcmp(name, "bridge_pci")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_BRIDGE: {
+      unsigned domain, secbus, subbus;
+      if (sscanf(value, "%04x:[%02x-%02x]",
+		 &domain, &secbus, &subbus) != 3) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring invalid bridge_pci format string %s\n",
+		  state->global->msgprefix, value);
+      } else {
+	obj->attr->bridge.downstream.pci.domain = domain;
+	obj->attr->bridge.downstream.pci.secondary_bus = secbus;
+	obj->attr->bridge.downstream.pci.subordinate_bus = subbus;
+      }
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring bridge_pci attribute for non-bridge object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (!strcmp(name, "osdev_type")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_OS_DEVICE: {
+      unsigned osdev_type;
+      if (sscanf(value, "%u", &osdev_type) != 1) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring invalid osdev_type format string %s\n",
+		  state->global->msgprefix, value);
+      } else
+	obj->attr->osdev.type = (hwloc_obj_osdev_type_t) osdev_type;
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring osdev_type attribute for non-osdev object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (data->version_major < 2) {
+    /************************
+     * deprecated from 1.x
+     */
+    if (!strcmp(name, "os_level")
+	|| !strcmp(name, "online_cpuset"))
+      { /* ignored */ }
+
+    /*************************
+     * deprecated from 1.0
+     */
+    else if (!strcmp(name, "dmi_board_vendor")) {
+      if (value[0])
+	hwloc_obj_add_info(obj, "DMIBoardVendor", value);
+    }
+    else if (!strcmp(name, "dmi_board_name")) {
+      if (value[0])
+	hwloc_obj_add_info(obj, "DMIBoardName", value);
+    }
+
+    else if (data->version_major < 1) {
+      /*************************
+       * deprecated from 0.9
+       */
+      if (!strcmp(name, "memory_kB")) {
+	unsigned long long lvalue = strtoull(value, NULL, 10);
+	if (obj->type == _HWLOC_OBJ_CACHE_OLD)
+	  obj->attr->cache.size = lvalue << 10;
+	else if (obj->type == HWLOC_OBJ_NUMANODE)
+	  obj->attr->numanode.local_memory = lvalue << 10;
+	else if (!obj->parent)
+	  topology->machine_memory.local_memory = lvalue << 10;
+	else if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring memory_kB attribute for non-NUMAnode non-root object\n",
+		  state->global->msgprefix);
+      }
+      else if (!strcmp(name, "huge_page_size_kB")) {
+	unsigned long lvalue = strtoul(value, NULL, 10);
+	if (obj->type == HWLOC_OBJ_NUMANODE || !obj->parent) {
+	  struct hwloc_numanode_attr_s *memory = obj->type == HWLOC_OBJ_NUMANODE ? &obj->attr->numanode : &topology->machine_memory;
+	  if (!memory->page_types) {
+	    /* NOTE(review): malloc result is unchecked; on failure the
+	     * write to page_types[0] below would dereference NULL. */
+	    memory->page_types = malloc(sizeof(*memory->page_types));
+	    memory->page_types_len = 1;
+	  }
+	  memory->page_types[0].size = lvalue << 10;
+	} else if (hwloc__xml_verbose()) {
+	  fprintf(stderr, "%s: ignoring huge_page_size_kB attribute for non-NUMAnode non-root object\n",
+		  state->global->msgprefix);
+	}
+      }
+      else if (!strcmp(name, "huge_page_free")) {
+	unsigned long lvalue = strtoul(value, NULL, 10);
+	if (obj->type == HWLOC_OBJ_NUMANODE || !obj->parent) {
+	  struct hwloc_numanode_attr_s *memory = obj->type == HWLOC_OBJ_NUMANODE ? &obj->attr->numanode : &topology->machine_memory;
+	  if (!memory->page_types) {
+	    /* NOTE(review): same unchecked malloc as in huge_page_size_kB above. */
+	    memory->page_types = malloc(sizeof(*memory->page_types));
+	    memory->page_types_len = 1;
+	  }
+	  memory->page_types[0].count = lvalue;
+	} else if (hwloc__xml_verbose()) {
+	  fprintf(stderr, "%s: ignoring huge_page_free attribute for non-NUMAnode non-root object\n",
+		  state->global->msgprefix);
+	}
+      }
+      /* end of deprecated from 0.9 */
+      else goto unknown;
+    }
+    /* end of deprecated from 1.0 */
+    else goto unknown;
+  }
+  else {
+  unknown:
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring unknown object attribute %s\n",
+	      state->global->msgprefix, name);
+  }
+}
+
+
+/* Import an <info> child tag: read its name/value attributes and attach
+ * them to obj as an info pair.  For pre-2.0 XML, "Type"/"CoProcType"
+ * infos are converted into the object's subtype instead.
+ * Returns the result of closing the tag, or -1 on an unknown attribute.
+ */
+static int
+hwloc__xml_import_info(struct hwloc_xml_backend_data_s *data,
+		       hwloc_obj_t obj,
+		       hwloc__xml_import_state_t state)
+{
+  char *infoname = NULL;
+  char *infovalue = NULL;
+
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "name"))
+      infoname = attrvalue;
+    else if (!strcmp(attrname, "value"))
+      infovalue = attrvalue;
+    else
+      return -1;
+  }
+
+  if (infoname) {
+    /* empty strings are ignored by libxml */
+    if (data->version_major < 2 &&
+	(!strcmp(infoname, "Type") || !strcmp(infoname, "CoProcType"))) {
+      /* 1.x stored subtype in Type or CoProcType */
+      if (infovalue) {
+	if (obj->subtype)
+	  free(obj->subtype);
+	obj->subtype = strdup(infovalue);
+      }
+    } else {
+      if (infovalue)
+	hwloc_obj_add_info(obj, infoname, infovalue);
+    }
+  }
+
+  return state->global->close_tag(state);
+}
+
+/* Import a <page_type> child of a NUMA node (or of the root machine memory):
+ * parse the size/count attributes and append an entry to memory->page_types.
+ * Entries with zero size are skipped; allocation failure silently drops
+ * the entry rather than aborting the import.
+ */
+static int
+hwloc__xml_import_pagetype(hwloc_topology_t topology __hwloc_attribute_unused, struct hwloc_numanode_attr_s *memory,
+			   hwloc__xml_import_state_t state)
+{
+  uint64_t size = 0, count = 0;
+
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "size"))
+      size = strtoull(attrvalue, NULL, 10);
+    else if (!strcmp(attrname, "count"))
+      count = strtoull(attrvalue, NULL, 10);
+    else
+      return -1;
+  }
+
+  if (size) {
+    unsigned idx = memory->page_types_len;
+    struct hwloc_memory_page_type_s *tmp;
+    tmp = realloc(memory->page_types, (idx+1)*sizeof(*memory->page_types));
+    if (tmp) { /* if failed to allocate, ignore this page_type entry */
+      memory->page_types = tmp;
+      memory->page_types_len = idx+1;
+      memory->page_types[idx].size = size;
+      memory->page_types[idx].count = count;
+    }
+  }
+
+  return state->global->close_tag(state);
+}
+
+/* Import a pre-2.0 <distances> tag attached to obj.
+ * The v1 format stores an nbobjs*nbobjs latency matrix (values relative to
+ * latency_base) for objects at a relative depth below obj.  Only matrices
+ * attached to the root object are kept; they are queued on
+ * data->first/last_v1dist for conversion at the end of the import, since
+ * objects may still move or disappear during insertion.
+ * Returns the result of closing the tag, or -1 on malformed input.
+ */
+static int
+hwloc__xml_v1import_distances(struct hwloc_xml_backend_data_s *data,
+			      hwloc_obj_t obj,
+			      hwloc__xml_import_state_t state)
+{
+  unsigned long reldepth = 0, nbobjs = 0;
+  float latbase = 0;
+  char *tag;
+  int ret;
+
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "nbobjs"))
+      nbobjs = strtoul(attrvalue, NULL, 10);
+    else if (!strcmp(attrname, "relative_depth"))
+      reldepth = strtoul(attrvalue, NULL, 10);
+    else if (!strcmp(attrname, "latency_base"))
+      latbase = (float) atof(attrvalue);
+    else
+      return -1;
+  }
+
+  if (nbobjs && reldepth && latbase) {
+    unsigned i;
+    float *matrix;
+    struct hwloc__xml_imported_v1distances_s *v1dist;
+
+    /* NOTE(review): nbobjs*nbobjs may overflow for huge values coming
+     * from untrusted XML; the allocation below would then be undersized. */
+    matrix = malloc(nbobjs*nbobjs*sizeof(float));
+    v1dist = malloc(sizeof(*v1dist));
+    if (!matrix || !v1dist) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: failed to allocate v1distance matrix for %lu objects\n",
+		state->global->msgprefix, nbobjs);
+      free(v1dist);
+      free(matrix);
+      return -1;
+    }
+
+    v1dist->kind = HWLOC_DISTANCES_KIND_FROM_OS|HWLOC_DISTANCES_KIND_MEANS_LATENCY;
+    /* TODO: we can't know for sure if it comes from the OS.
+     * On Linux/x86, it would be 10 on the diagonal.
+     * On Solaris/T5, 15 on the diagonal.
+     * Just check whether all values are integers, and that all values on the diagonal are minimal and identical?
+     */
+
+    v1dist->nbobjs = nbobjs;
+    v1dist->floats = matrix;
+
+    /* read the nbobjs^2 <latency> children, row-major */
+    for(i=0; i<nbobjs*nbobjs; i++) {
+      struct hwloc__xml_import_state_s childstate;
+      char *attrname, *attrvalue;
+      float val;
+
+      ret = state->global->find_child(state, &childstate, &tag);
+      if (ret <= 0 || strcmp(tag, "latency")) {
+	/* a latency child is needed */
+	free(matrix);
+	free(v1dist);
+	return -1;
+      }
+
+      ret = state->global->next_attr(&childstate, &attrname, &attrvalue);
+      if (ret < 0 || strcmp(attrname, "value")) {
+	free(matrix);
+	free(v1dist);
+	return -1;
+      }
+
+      val = (float) atof((char *) attrvalue);
+      matrix[i] = val * latbase;
+
+      ret = state->global->close_tag(&childstate);
+      if (ret < 0) {
+	free(matrix);
+	free(v1dist);
+	return -1;
+      }
+
+      state->global->close_child(&childstate);
+    }
+
+    if (nbobjs < 2) {
+      /* distances with a single object are useless, even if the XML isn't invalid */
+      assert(nbobjs == 1);
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring invalid distance matrix with only 1 object\n",
+		state->global->msgprefix);
+      free(matrix);
+      free(v1dist);
+
+    } else if (obj->parent) {
+      /* we currently only import distances attached to root.
+       * we can't save obj in v1dist because obj could be dropped during insert if ignored.
+       * we could save its complete_cpu/nodeset instead to find it back later.
+       * but it doesn't matter much since only NUMA distances attached to root matter.
+       */
+      free(matrix);
+      free(v1dist);
+
+    } else {
+      /* queue the distance for real */
+      v1dist->prev = data->last_v1dist;
+      v1dist->next = NULL;
+      if (data->last_v1dist)
+	data->last_v1dist->next = v1dist;
+      else
+	data->first_v1dist = v1dist;
+      data->last_v1dist = v1dist;
+    }
+  }
+
+  return state->global->close_tag(state);
+}
+
+/* Import a <userdata> child tag.  Content may be raw or base64-encoded
+ * (encoding="base64").  If the application installed a userdata import
+ * callback, the content is decoded (unless userdata_not_decoded asked for
+ * raw forwarding with a synthesized "base64:"/"normal:" name) and passed
+ * to it; otherwise the content is merely consumed so parsing can continue.
+ * Returns the result of closing the tag, or -1 on error.
+ */
+static int
+hwloc__xml_import_userdata(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj,
+			   hwloc__xml_import_state_t state)
+{
+  size_t length = 0;
+  int encoded = 0;
+  char *name = NULL; /* optional */
+  int ret;
+
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "length"))
+      length = strtoul(attrvalue, NULL, 10);
+    else if (!strcmp(attrname, "encoding"))
+      encoded = !strcmp(attrvalue, "base64");
+    else if (!strcmp(attrname, "name"))
+      name = attrvalue;
+    else
+      return -1;
+  }
+
+  if (!topology->userdata_import_cb) {
+    /* no callback installed: consume the content and drop it */
+    char *buffer;
+    size_t reallength = encoded ? BASE64_ENCODED_LENGTH(length) : length;
+    ret = state->global->get_content(state, &buffer, reallength);
+    if (ret < 0)
+      return -1;
+
+  } else if (topology->userdata_not_decoded) {
+      /* forward the raw (possibly encoded) bytes, tagging the name with the encoding */
+      char *buffer, *fakename;
+      size_t reallength = encoded ? BASE64_ENCODED_LENGTH(length) : length;
+      ret = state->global->get_content(state, &buffer, reallength);
+      if (ret < 0)
+        return -1;
+      fakename = malloc(6 + 1 + (name ? strlen(name) : 4) + 1);
+      if (!fakename)
+	return -1;
+      sprintf(fakename, encoded ? "base64%c%s" : "normal%c%s", name ? ':' : '-', name ? name : "anon");
+      topology->userdata_import_cb(topology, obj, fakename, buffer, length);
+      free(fakename);
+
+  } else if (encoded && length) {
+      /* decode base64 content before forwarding */
+      char *encoded_buffer;
+      size_t encoded_length = BASE64_ENCODED_LENGTH(length);
+      ret = state->global->get_content(state, &encoded_buffer, encoded_length);
+      if (ret < 0)
+        return -1;
+      if (ret) {
+	char *decoded_buffer = malloc(length+1);
+	if (!decoded_buffer)
+	  return -1;
+	assert(encoded_buffer[encoded_length] == 0);
+	ret = hwloc_decode_from_base64(encoded_buffer, decoded_buffer, length+1);
+	if (ret != (int) length) {
+	  free(decoded_buffer);
+	  return -1;
+	}
+	topology->userdata_import_cb(topology, obj, name, decoded_buffer, length);
+	free(decoded_buffer);
+      }
+
+  } else { /* always handle length==0 in the non-encoded case */
+      char *buffer = (char *) "";
+      if (length) {
+	ret = state->global->get_content(state, &buffer, length);
+	if (ret < 0)
+	  return -1;
+      }
+      topology->userdata_import_cb(topology, obj, name, buffer, length);
+  }
+
+  state->global->close_content(state);
+  return state->global->close_tag(state);
+}
+
+/* Print a prominent stderr warning when XML import detects children listed
+ * in an order inconsistent with their cpusets (out-of-order topology load).
+ * Purely diagnostic: gathers type/cpuset strings for the two offending
+ * objects plus the producing hwloc version/process recorded in the XML,
+ * prints a banner, and frees the temporary strings.
+ * Fix: corrected the user-visible message typo "occured" -> "occurred".
+ */
+static void hwloc__xml_import_report_outoforder(hwloc_topology_t topology, hwloc_obj_t new, hwloc_obj_t old)
+{
+  char *progname = hwloc_progname(topology);
+  const char *origversion = hwloc_obj_get_info_by_name(topology->levels[0][0], "hwlocVersion");
+  const char *origprogname = hwloc_obj_get_info_by_name(topology->levels[0][0], "ProcessName");
+  char *c1, *cc1, t1[64];
+  char *c2 = NULL, *cc2 = NULL, t2[64];
+
+  hwloc_bitmap_asprintf(&c1, new->cpuset);
+  hwloc_bitmap_asprintf(&cc1, new->complete_cpuset);
+  hwloc_obj_type_snprintf(t1, sizeof(t1), new, 0);
+
+  /* old may lack cpusets; print "none" for missing ones below */
+  if (old->cpuset)
+    hwloc_bitmap_asprintf(&c2, old->cpuset);
+  if (old->complete_cpuset)
+    hwloc_bitmap_asprintf(&cc2, old->complete_cpuset);
+  hwloc_obj_type_snprintf(t2, sizeof(t2), old, 0);
+
+  fprintf(stderr, "****************************************************************************\n");
+  fprintf(stderr, "* hwloc has encountered an out-of-order XML topology load.\n");
+  fprintf(stderr, "* Object %s cpuset %s complete %s\n",
+	  t1, c1, cc1);
+  fprintf(stderr, "* was inserted after object %s with %s and %s.\n",
+	  t2, c2 ? c2 : "none", cc2 ? cc2 : "none");
+  fprintf(stderr, "* The error occurred in hwloc %s inside process `%s', while\n",
+	  HWLOC_VERSION,
+	  progname ? progname : "<unknown>");
+  if (origversion || origprogname)
+    fprintf(stderr, "* the input XML was generated by hwloc %s inside process `%s'.\n",
+	    origversion ? origversion : "(unknown version)",
+	    origprogname ? origprogname : "<unknown>");
+  else
+    fprintf(stderr, "* the input XML was generated by an unspecified ancient hwloc release.\n");
+  fprintf(stderr, "* Please check that your input topology XML file is valid.\n");
+  fprintf(stderr, "* Set HWLOC_DEBUG_CHECK=1 in the environment to detect further issues.\n");
+  fprintf(stderr, "****************************************************************************\n");
+
+  free(c1);
+  free(cc1);
+  free(c2);
+  free(cc2);
+  free(progname);
+}
+
+static int
+hwloc__xml_import_object(hwloc_topology_t topology,
+			 struct hwloc_xml_backend_data_s *data,
+			 hwloc_obj_t parent, hwloc_obj_t obj, int *gotignored,
+			 hwloc__xml_import_state_t state)
+{
+  int ignored = 0;
+  int childrengotignored = 0;
+  int attribute_less_cache = 0;
+  int numa_was_root = 0;
+  char *tag;
+  struct hwloc__xml_import_state_s childstate;
+
+  /* set parent now since it's used during import below or in subfunctions */
+  obj->parent = parent;
+
+  /* process attributes */
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "type")) {
+      if (hwloc_type_sscanf(attrvalue, &obj->type, NULL, 0) < 0) {
+	if (!strcasecmp(attrvalue, "Cache")) {
+	  obj->type = _HWLOC_OBJ_CACHE_OLD; /* will be fixed below */
+	  attribute_less_cache = 1;
+	} else if (!strcasecmp(attrvalue, "System")) {
+	  if (!parent)
+	    obj->type = HWLOC_OBJ_MACHINE;
+	  else {
+	    if (hwloc__xml_verbose())
+	      fprintf(stderr, "%s: obsolete System object only allowed at root\n",
+		      state->global->msgprefix);
+	    goto error_with_object;
+	  }
+	} else if (!strcasecmp(attrvalue, "Die")) {
+	  /* deal with possible future type */
+	  obj->type = HWLOC_OBJ_GROUP;
+	  obj->subtype = strdup("Die");
+	  obj->attr->group.kind = HWLOC_GROUP_KIND_INTEL_DIE;
+	  obj->attr->group.dont_merge = data->dont_merge_die_groups;
+	} else if (!strcasecmp(attrvalue, "Tile")) {
+	  /* deal with possible future type */
+	  obj->type = HWLOC_OBJ_GROUP;
+	  obj->subtype = strdup("Tile");
+	  obj->attr->group.kind = HWLOC_GROUP_KIND_INTEL_TILE;
+	} else if (!strcasecmp(attrvalue, "Module")) {
+	  /* deal with possible future type */
+	  obj->type = HWLOC_OBJ_GROUP;
+	  obj->subtype = strdup("Module");
+	  obj->attr->group.kind = HWLOC_GROUP_KIND_INTEL_MODULE;
+	} else if (!strcasecmp(attrvalue, "MemCache")) {
+	  /* ignore possible future type */
+	  obj->type = _HWLOC_OBJ_FUTURE;
+	  ignored = 1;
+	  if (hwloc__xml_verbose())
+	    fprintf(stderr, "%s: %s object not-supported, will be ignored\n",
+		    state->global->msgprefix, attrvalue);
+	} else {
+	  if (hwloc__xml_verbose())
+	    fprintf(stderr, "%s: unrecognized object type string %s\n",
+		    state->global->msgprefix, attrvalue);
+	  goto error_with_object;
+	}
+      }
+    } else {
+      /* type needed first */
+      if (obj->type == HWLOC_OBJ_TYPE_NONE) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: object attribute %s found before type\n",
+		  state->global->msgprefix,  attrname);
+	goto error_with_object;
+      }
+      hwloc__xml_import_object_attr(topology, data, obj, attrname, attrvalue, state);
+    }
+  }
+
+  /* process non-object subnodes to get info attrs (as well as page_types, etc) */
+  while (1) {
+    int ret;
+
+    tag = NULL;
+    ret = state->global->find_child(state, &childstate, &tag);
+    if (ret < 0)
+      goto error;
+    if (!ret)
+      break;
+
+    if (!strcmp(tag, "object")) {
+      /* we'll handle children later */
+      break;
+
+    } else if (!strcmp(tag, "page_type")) {
+      if (obj->type == HWLOC_OBJ_NUMANODE) {
+	ret = hwloc__xml_import_pagetype(topology, &obj->attr->numanode, &childstate);
+      } else if (!parent) {
+	ret = hwloc__xml_import_pagetype(topology, &topology->machine_memory, &childstate);
+      } else {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: invalid non-NUMAnode object child %s\n",
+		  state->global->msgprefix, tag);
+	ret = -1;
+      }
+
+    } else if (!strcmp(tag, "info")) {
+      ret = hwloc__xml_import_info(data, obj, &childstate);
+    } else if (data->version_major < 2 && !strcmp(tag, "distances")) {
+      ret = hwloc__xml_v1import_distances(data, obj, &childstate);
+    } else if (!strcmp(tag, "userdata")) {
+      ret = hwloc__xml_import_userdata(topology, obj, &childstate);
+    } else {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: invalid special object child %s\n",
+		state->global->msgprefix, tag);
+      ret = -1;
+    }
+
+    if (ret < 0)
+      goto error;
+
+    state->global->close_child(&childstate);
+  }
+
+  if (parent && obj->type == HWLOC_OBJ_MACHINE) {
+    /* replace non-root Machine with Groups */
+    obj->type = HWLOC_OBJ_GROUP;
+  }
+
+  if (parent && data->version_major >= 2) {
+    /* check parent/child types for 2.x */
+    if (hwloc__obj_type_is_normal(obj->type)) {
+      if (!hwloc__obj_type_is_normal(parent->type)) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "normal object %s cannot be child of non-normal parent %s\n",
+		  hwloc_obj_type_string(obj->type), hwloc_obj_type_string(parent->type));
+	goto error_with_object;
+      }
+    } else if (hwloc__obj_type_is_memory(obj->type)) {
+      if (hwloc__obj_type_is_io(parent->type) || HWLOC_OBJ_MISC == parent->type) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "Memory object %s cannot be child of non-normal-or-memory parent %s\n",
+		  hwloc_obj_type_string(obj->type), hwloc_obj_type_string(parent->type));
+	goto error_with_object;
+      }
+    } else if (hwloc__obj_type_is_io(obj->type)) {
+      if (hwloc__obj_type_is_memory(parent->type) || HWLOC_OBJ_MISC == parent->type) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "I/O object %s cannot be child of non-normal-or-I/O parent %s\n",
+		  hwloc_obj_type_string(obj->type), hwloc_obj_type_string(parent->type));
+	goto error_with_object;
+      }
+    }
+
+  } else if (parent && data->version_major < 2) {
+    /* check parent/child types for pre-v2.0 */
+    if (hwloc__obj_type_is_normal(obj->type) || HWLOC_OBJ_NUMANODE == obj->type) {
+      if (hwloc__obj_type_is_special(parent->type)) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "v1.x normal v1.x object %s cannot be child of special parent %s\n",
+		  hwloc_obj_type_string(obj->type), hwloc_obj_type_string(parent->type));
+	goto error_with_object;
+      }
+    } else if (hwloc__obj_type_is_io(obj->type)) {
+      if (HWLOC_OBJ_MISC == parent->type) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "I/O object %s cannot be child of Misc parent\n",
+		  hwloc_obj_type_string(obj->type));
+	goto error_with_object;
+      }
+    }
+  }
+
+  if (data->version_major < 2) {
+    /***************************
+     * 1.x specific checks
+     */
+
+    /* attach pre-v2.0 children of NUMA nodes to normal parent */
+    if (parent && parent->type == HWLOC_OBJ_NUMANODE) {
+      parent = parent->parent;
+      assert(parent);
+    }
+
+    /* insert a group above pre-v2.0 NUMA nodes if needed */
+    if (obj->type == HWLOC_OBJ_NUMANODE) {
+      if (!parent) {
+	/* crazy case of NUMA node root (only possible when filtering Machine keep_structure in v1.x),
+	 * reinsert a Machine object
+	 */
+	hwloc_obj_t machine = hwloc_alloc_setup_object(topology, HWLOC_OBJ_MACHINE, HWLOC_UNKNOWN_INDEX);
+	machine->cpuset = hwloc_bitmap_dup(obj->cpuset);
+	machine->complete_cpuset = hwloc_bitmap_dup(obj->cpuset);
+	machine->nodeset = hwloc_bitmap_dup(obj->nodeset);
+	machine->complete_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+	topology->levels[0][0] = machine;
+	parent = machine;
+	numa_was_root = 1;
+
+      } else if (!hwloc_bitmap_isequal(obj->complete_cpuset, parent->complete_cpuset)) {
+	/* This NUMA node has a different locality from its parent.
+	 * Don't attach it to this parent, or it well get its parent cpusets.
+	 * Add an intermediate Group with the desired locality.
+	 */
+	int needgroup = 1;
+	hwloc_obj_t sibling;
+
+	sibling = parent->memory_first_child;
+	if (sibling && !sibling->subtype
+	    && !sibling->next_sibling
+	    && obj->subtype && !strcmp(obj->subtype, "MCDRAM")
+	    && hwloc_bitmap_iszero(obj->complete_cpuset)) {
+	  /* this is KNL MCDRAM, we want to attach it near its DDR sibling */
+	  needgroup = 0;
+	}
+	/* Ideally we would also detect similar cases on future non-KNL platforms with multiple local NUMA nodes.
+	 * That's unlikely to occur with v1.x.
+	 * And we have no way to be sure if this CPU-less node is desired or not.
+	 */
+
+	if (needgroup
+	    && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
+	  hwloc_obj_t group = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+	  group->gp_index = 0; /* will be initialized at the end of the discovery once we know the max */
+	  group->cpuset = hwloc_bitmap_dup(obj->cpuset);
+	  group->complete_cpuset = hwloc_bitmap_dup(obj->cpuset);
+	  group->nodeset = hwloc_bitmap_dup(obj->nodeset);
+	  group->complete_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+	  group->attr->group.kind = HWLOC_GROUP_KIND_MEMORY;
+	  hwloc_insert_object_by_parent(topology, parent, group);
+	  parent = group;
+	}
+      }
+    }
+
+    /* fixup attribute-less caches imported from pre-v2.0 XMLs */
+    if (attribute_less_cache) {
+      assert(obj->type == _HWLOC_OBJ_CACHE_OLD);
+      obj->type = hwloc_cache_type_by_depth_type(obj->attr->cache.depth, obj->attr->cache.type);
+    }
+
+    /* fixup Misc objects inserted by cpusets in pre-v2.0 XMLs */
+    if (obj->type == HWLOC_OBJ_MISC && obj->cpuset)
+      obj->type = HWLOC_OBJ_GROUP;
+
+    /* check set consistency.
+     * 1.7.2 and earlier reported I/O Groups with only a cpuset, we don't want to reject those XMLs yet.
+     * Ignore those Groups since fixing the missing sets is hard (would need to look at children sets which are not available yet).
+     * Just abort the XML for non-Groups.
+     */
+    if (!obj->cpuset != !obj->complete_cpuset) {
+      /* has some cpuset without others */
+      if (obj->type == HWLOC_OBJ_GROUP) {
+	ignored = 1;
+      } else {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: invalid object %s P#%u with some missing cpusets\n",
+		  state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+	goto error_with_object;
+      }
+    } else if (!obj->nodeset != !obj->complete_nodeset) {
+      /* has some nodeset without others */
+      if (obj->type == HWLOC_OBJ_GROUP) {
+	ignored = 1;
+      } else {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: invalid object %s P#%u with some missing nodesets\n",
+		  state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+	goto error_with_object;
+      }
+    } else if (obj->nodeset && !obj->cpuset) {
+      /* has nodesets without cpusets (the contrary is allowed in pre-2.0) */
+      if (obj->type == HWLOC_OBJ_GROUP) {
+	ignored = 1;
+      } else {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: invalid object %s P#%u with either cpuset or nodeset missing\n",
+		  state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+	goto error_with_object;
+      }
+    }
+    /* end of 1.x specific checks */
+  }
+
+  /* check that cache attributes are coherent with the actual type */
+  if (hwloc__obj_type_is_cache(obj->type)
+      && obj->type != hwloc_cache_type_by_depth_type(obj->attr->cache.depth, obj->attr->cache.type)) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid cache type %s with attribute depth %u and type %d\n",
+	      state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->attr->cache.depth, (int) obj->attr->cache.type);
+    goto error_with_object;
+  }
+
+  /* check special types vs cpuset */
+  if (!obj->cpuset && !hwloc__obj_type_is_special(obj->type)) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid normal object %s P#%u without cpuset\n",
+	      state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+    goto error_with_object;
+  }
+  if (obj->cpuset && hwloc__obj_type_is_special(obj->type)) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid special object %s with cpuset\n",
+	      state->global->msgprefix, hwloc_obj_type_string(obj->type));
+    goto error_with_object;
+  }
+
+  /* check parent vs child sets */
+  if (obj->cpuset && parent && !parent->cpuset) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid object %s P#%u with cpuset while parent has none\n",
+	      state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+    goto error_with_object;
+  }
+  if (obj->nodeset && parent && !parent->nodeset) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid object %s P#%u with nodeset while parent has none\n",
+	      state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+    goto error_with_object;
+  }
+
+  /* check NUMA nodes */
+  if (obj->type == HWLOC_OBJ_NUMANODE) {
+    if (!obj->nodeset) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: invalid NUMA node object P#%u without nodeset\n",
+		state->global->msgprefix, obj->os_index);
+      goto error_with_object;
+    }
+    data->nbnumanodes++;
+    obj->prev_cousin = data->last_numanode;
+    obj->next_cousin = NULL;
+    if (data->last_numanode)
+      data->last_numanode->next_cousin = obj;
+    else
+      data->first_numanode = obj;
+    data->last_numanode = obj;
+  }
+
+  if (!hwloc_filter_check_keep_object(topology, obj)) {
+    /* Ignore this object instead of inserting it.
+     *
+     * Well, let the core ignore the root object later
+     * because we don't know yet if root has more than one child.
+     */
+    if (parent)
+      ignored = 1;
+  }
+
+  if (parent && !ignored) {
+    /* root->parent is NULL, and root is already inserted */
+    hwloc_insert_object_by_parent(topology, parent, obj);
+    /* insert_object_by_parent() doesn't merge during insert, so obj is still valid */
+  }
+
+  /* process object subnodes, if we found one win the above loop */
+  while (tag) {
+    int ret;
+
+    if (!strcmp(tag, "object")) {
+      hwloc_obj_t childobj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_TYPE_MAX, HWLOC_UNKNOWN_INDEX);
+      childobj->parent = ignored ? parent : obj;
+      ret = hwloc__xml_import_object(topology, data, ignored ? parent : obj, childobj,
+				     &childrengotignored,
+				     &childstate);
+    } else {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: invalid special object child %s while looking for objects\n",
+		state->global->msgprefix, tag);
+      ret = -1;
+    }
+
+    if (ret < 0)
+      goto error;
+
+    state->global->close_child(&childstate);
+
+    tag = NULL;
+    ret = state->global->find_child(state, &childstate, &tag);
+    if (ret < 0)
+      goto error;
+    if (!ret)
+      break;
+  }
+
+  if (numa_was_root) {
+    /* duplicate NUMA infos to root, most of them are likely root-specific */
+    unsigned i;
+    for(i=0; i<obj->infos_count; i++) {
+      struct hwloc_info_s *info = &obj->infos[i];
+      hwloc_obj_add_info(parent, info->name, info->value);
+    }
+    /* TODO some infos are root-only (hwlocVersion, ProcessName, etc), remove them from obj? */
+  }
+
+  if (ignored) {
+    /* drop that object, and tell the parent that one child got ignored */
+    hwloc_free_unlinked_object(obj);
+    *gotignored = 1;
+
+  } else if (obj->first_child) {
+    /* now that all children are inserted, make sure they are in-order,
+     * so that the core doesn't have to deal with crappy children list.
+     */
+    hwloc_obj_t cur, next;
+    for(cur = obj->first_child, next = cur->next_sibling;
+	next;
+	cur = next, next = next->next_sibling) {
+      /* If reordering is needed, at least one pair of consecutive children will be out-of-order.
+       * So just check pairs of consecutive children.
+       *
+       * We checked above that complete_cpuset is always set.
+       */
+      if (hwloc_bitmap_compare_first(next->complete_cpuset, cur->complete_cpuset) < 0) {
+	/* next should be before cur */
+	if (!childrengotignored) {
+	  static int reported = 0;
+	  if (!reported && !hwloc_hide_errors()) {
+	    hwloc__xml_import_report_outoforder(topology, next, cur);
+	    reported = 1;
+	  }
+	}
+	hwloc__reorder_children(obj);
+	break;
+      }
+    }
+    /* no need to reorder memory children as long as there are no intermediate memory objects
+     * that could cause reordering when filtered-out.
+     */
+  }
+
+  return state->global->close_tag(state);
+
+ error_with_object:
+  if (parent)
+    /* root->parent is NULL, and root is already inserted. the caller will cleanup that root. */
+    hwloc_free_unlinked_object(obj);
+ error:
+  return -1;
+}
+
+/* Import one v2 "distances2" XML element and register the matrix through
+ * hwloc_internal_distances_add_by_index().
+ * Parses the nbobjs/type/indexing/kind attributes, then the "indexes" and
+ * "u64values" children.  Matrices that are useless (single object) or use
+ * the wrong indexing for their type are silently ignored.
+ * Returns 0 on success or ignore, -1 on error.
+ */
+static int
+hwloc__xml_v2import_distances(hwloc_topology_t topology,
+			      hwloc__xml_import_state_t state)
+{
+  hwloc_obj_type_t type = HWLOC_OBJ_TYPE_NONE;
+  unsigned nbobjs = 0;
+  int indexing = 0;
+  int os_indexing = 0;
+  int gp_indexing = 0;
+  unsigned long kind = 0;
+  unsigned nr_indexes, nr_u64values;
+  uint64_t *indexes;
+  uint64_t *u64values;
+  int ret;
+
+  /* process attributes */
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "nbobjs"))
+      nbobjs = strtoul(attrvalue, NULL, 10);
+    else if (!strcmp(attrname, "type")) {
+      if (hwloc_type_sscanf(attrvalue, &type, NULL, 0) < 0)
+	goto out;
+    }
+    else if (!strcmp(attrname, "indexing")) {
+      indexing = 1;
+      if (!strcmp(attrvalue, "os"))
+	os_indexing = 1;
+      else if (!strcmp(attrvalue, "gp"))
+	gp_indexing = 1;
+    }
+    else if (!strcmp(attrname, "kind")) {
+      kind = strtoul(attrvalue, NULL, 10);
+    }
+    else {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring unknown distance attribute %s\n",
+		state->global->msgprefix, attrname);
+    }
+  }
+
+  /* abort if missing attribute */
+  if (!nbobjs || type == HWLOC_OBJ_TYPE_NONE || !indexing || !kind) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: distance2 missing some attributes\n",
+	      state->global->msgprefix);
+    goto out;
+  }
+
+  indexes = malloc(nbobjs*sizeof(*indexes));
+  u64values = malloc(nbobjs*nbobjs*sizeof(*u64values));
+  if (!indexes || !u64values) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: failed to allocate distances arrays for %u objects\n",
+	      state->global->msgprefix, nbobjs);
+    goto out_with_arrays;
+  }
+
+  /* process children */
+  nr_indexes = 0;
+  nr_u64values = 0;
+  while (1) {
+    struct hwloc__xml_import_state_s childstate;
+    char *attrname, *attrvalue, *tag, *buffer;
+    int length;
+    int is_index = 0;
+    int is_u64values = 0;
+
+    ret = state->global->find_child(state, &childstate, &tag);
+    if (ret <= 0)
+      break;
+
+    if (!strcmp(tag, "indexes"))
+      is_index = 1;
+    else if (!strcmp(tag, "u64values"))
+      is_u64values = 1;
+    if (!is_index && !is_u64values) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: distance2 with unrecognized child %s\n",
+		state->global->msgprefix, tag);
+      goto out_with_arrays;
+    }
+
+    if (state->global->next_attr(&childstate, &attrname, &attrvalue) < 0
+	|| strcmp(attrname, "length")) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: distance2 child must have length attribute\n",
+		state->global->msgprefix);
+      goto out_with_arrays;
+    }
+    length = atoi(attrvalue);
+
+    ret = state->global->get_content(&childstate, &buffer, length);
+    if (ret < 0) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: distance2 child needs content of length %d\n",
+		state->global->msgprefix, length);
+      goto out_with_arrays;
+    }
+
+    if (is_index) {
+      /* get indexes */
+      char *tmp;
+      if (nr_indexes >= nbobjs) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: distance2 with more than %u indexes\n",
+		  state->global->msgprefix, nbobjs);
+	goto out_with_arrays;
+      }
+      tmp = buffer;
+      while (1) {
+	char *next;
+	unsigned long long u = strtoull(tmp, &next, 0);
+	if (next == tmp)
+	  break;
+	indexes[nr_indexes++] = u;
+	if (*next != ' ')
+	  break;
+	if (nr_indexes == nbobjs)
+	  break;
+	tmp = next+1;
+      }
+
+    } else if (is_u64values) {
+      /* get uint64_t values */
+      char *tmp;
+      if (nr_u64values >= nbobjs*nbobjs) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: distance2 with more than %u u64values\n",
+		  state->global->msgprefix, nbobjs*nbobjs);
+	goto out_with_arrays;
+      }
+      tmp = buffer;
+      while (1) {
+	char *next;
+	unsigned long long u = strtoull(tmp, &next, 0);
+	if (next == tmp)
+	  break;
+	u64values[nr_u64values++] = u;
+	if (*next != ' ')
+	  break;
+	if (nr_u64values == nbobjs*nbobjs)
+	  break;
+	tmp = next+1;
+      }
+    }
+
+    state->global->close_content(&childstate);
+
+    ret = state->global->close_tag(&childstate);
+    if (ret < 0) {
+      /* fixed copy-pasted diagnostic: this is a tag-closing failure,
+       * not an index-count overflow */
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: distance2 failed to close child tag\n",
+		state->global->msgprefix);
+      goto out_with_arrays;
+    }
+
+    state->global->close_child(&childstate);
+  }
+
+  if (nr_indexes != nbobjs) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: distance2 with less than %u indexes\n",
+	      state->global->msgprefix, nbobjs);
+    goto out_with_arrays;
+  }
+  if (nr_u64values != nbobjs*nbobjs) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: distance2 with less than %u u64values\n",
+	      state->global->msgprefix, nbobjs*nbobjs);
+    goto out_with_arrays;
+  }
+
+  if (nbobjs < 2) {
+    /* distances with a single object are useless, even if the XML isn't invalid */
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring distances2 with only %u objects\n",
+	      state->global->msgprefix, nbobjs);
+    goto out_ignore;
+  }
+  if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE) {
+    if (!os_indexing) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring PU or NUMA distances2 without os_indexing\n",
+		state->global->msgprefix);
+      goto out_ignore;
+    }
+  } else {
+    if (!gp_indexing) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring !PU or !NUMA distances2 without gp_indexing\n",
+		state->global->msgprefix);
+      goto out_ignore;
+    }
+  }
+
+  hwloc_internal_distances_add_by_index(topology, type, nbobjs, indexes, u64values, kind, 0);
+
+  /* prevent freeing below */
+  indexes = NULL;
+  u64values = NULL;
+
+ out_ignore:
+  free(indexes);
+  free(u64values);
+  return state->global->close_tag(state);
+
+ out_with_arrays:
+  free(indexes);
+  free(u64values);
+ out:
+  return -1;
+}
+
+/* Import one <diff> XML element as a hwloc_topology_diff_t entry and
+ * append it to the (*firstdiffp, *lastdiffp) list.
+ * Elements with a missing or unrecognized "type" attribute are skipped
+ * without error.  Returns the result of closing the tag, or -1 on error.
+ */
+static int
+hwloc__xml_import_diff_one(hwloc__xml_import_state_t state,
+			   hwloc_topology_diff_t *firstdiffp,
+			   hwloc_topology_diff_t *lastdiffp)
+{
+  char *type_s = NULL;
+  char *obj_depth_s = NULL;
+  char *obj_index_s = NULL;
+  char *obj_attr_type_s = NULL;
+/* char *obj_attr_index_s = NULL; unused for now */
+  char *obj_attr_name_s = NULL;
+  char *obj_attr_oldvalue_s = NULL;
+  char *obj_attr_newvalue_s = NULL;
+
+  /* collect the attribute value strings; unknown attributes abort the import */
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "type"))
+      type_s = attrvalue;
+    else if (!strcmp(attrname, "obj_depth"))
+      obj_depth_s = attrvalue;
+    else if (!strcmp(attrname, "obj_index"))
+      obj_index_s = attrvalue;
+    else if (!strcmp(attrname, "obj_attr_type"))
+      obj_attr_type_s = attrvalue;
+    else if (!strcmp(attrname, "obj_attr_index"))
+      { /* obj_attr_index_s = attrvalue; unused for now */ }
+    else if (!strcmp(attrname, "obj_attr_name"))
+      obj_attr_name_s = attrvalue;
+    else if (!strcmp(attrname, "obj_attr_oldvalue"))
+      obj_attr_oldvalue_s = attrvalue;
+    else if (!strcmp(attrname, "obj_attr_newvalue"))
+      obj_attr_newvalue_s = attrvalue;
+    else {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring unknown diff attribute %s\n",
+		state->global->msgprefix, attrname);
+      return -1;
+    }
+  }
+
+  if (type_s) {
+    switch (atoi(type_s)) {
+    default:
+      break;
+    case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR: {
+      /* object attribute diff */
+      hwloc_topology_diff_obj_attr_type_t obj_attr_type;
+      hwloc_topology_diff_t diff;
+
+      /* obj_attr mandatory generic attributes */
+      if (!obj_depth_s || !obj_index_s || !obj_attr_type_s) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: missing mandatory obj attr generic attributes\n",
+		  state->global->msgprefix);
+	break;
+      }
+
+      /* obj_attr mandatory attributes common to all subtypes */
+      if (!obj_attr_oldvalue_s || !obj_attr_newvalue_s) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: missing mandatory obj attr value attributes\n",
+		  state->global->msgprefix);
+	break;
+      }
+
+      /* mandatory attributes for obj_attr_info subtype */
+      obj_attr_type = atoi(obj_attr_type_s);
+      if (obj_attr_type == HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO && !obj_attr_name_s) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: missing mandatory obj attr info name attribute\n",
+		  state->global->msgprefix);
+	break;
+      }
+
+      /* now we know we have everything we need */
+      diff = malloc(sizeof(*diff));
+      if (!diff)
+	return -1;
+      diff->obj_attr.type = HWLOC_TOPOLOGY_DIFF_OBJ_ATTR;
+      diff->obj_attr.obj_depth = atoi(obj_depth_s);
+      diff->obj_attr.obj_index = atoi(obj_index_s);
+      memset(&diff->obj_attr.diff, 0, sizeof(diff->obj_attr.diff));
+      diff->obj_attr.diff.generic.type = obj_attr_type;
+
+      /* NOTE(review): strdup() results below are unchecked; the string
+       * fields may end up NULL after an OOM. */
+      switch (atoi(obj_attr_type_s)) {
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE:
+	diff->obj_attr.diff.uint64.oldvalue = strtoull(obj_attr_oldvalue_s, NULL, 0);
+	diff->obj_attr.diff.uint64.newvalue = strtoull(obj_attr_newvalue_s, NULL, 0);
+	break;
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO:
+	diff->obj_attr.diff.string.name = strdup(obj_attr_name_s);
+	/* FALLTHRU */
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME:
+	diff->obj_attr.diff.string.oldvalue = strdup(obj_attr_oldvalue_s);
+	diff->obj_attr.diff.string.newvalue = strdup(obj_attr_newvalue_s);
+	break;
+      }
+
+      /* append the new entry to the caller's list */
+      if (*firstdiffp)
+	(*lastdiffp)->generic.next = diff;
+      else
+        *firstdiffp = diff;
+      *lastdiffp = diff;
+      diff->generic.next = NULL;
+    }
+    }
+  }
+
+  return state->global->close_tag(state);
+}
+
+/* Import a list of <diff> XML elements below the current state.
+ * On success, *firstdiffp points to the head of the imported list.
+ */
+int
+hwloc__xml_import_diff(hwloc__xml_import_state_t state,
+		       hwloc_topology_diff_t *firstdiffp)
+{
+  hwloc_topology_diff_t head = NULL;
+  hwloc_topology_diff_t tail = NULL;
+
+  *firstdiffp = NULL;
+
+  for (;;) {
+    struct hwloc__xml_import_state_s childstate;
+    char *tag;
+    int ret = state->global->find_child(state, &childstate, &tag);
+
+    if (ret < 0)
+      return -1;
+    if (ret == 0)
+      break;
+
+    /* only <diff> children are accepted here */
+    ret = strcmp(tag, "diff") ? -1
+      : hwloc__xml_import_diff_one(&childstate, &head, &tail);
+    if (ret < 0)
+      return ret;
+
+    state->global->close_child(&childstate);
+  }
+
+  *firstdiffp = head;
+  return 0;
+}
+
+/***********************************
+ ********* main XML import *********
+ ***********************************/
+
+/* Convert a v1 float distance matrix (nbobjs*nbobjs entries) into the
+ * uint64_t values used by v2.  If every value is (nearly) an integer,
+ * values are rounded and stored as-is; otherwise they are multiplied by
+ * HWLOC_XML_V1DIST_SCALE (default 1000) and the scale is recorded in the
+ * root object infos as "xmlv1DistancesScale".
+ */
+static void
+hwloc_convert_from_v1dist_floats(hwloc_topology_t topology, unsigned nbobjs, float *floats, uint64_t *u64s)
+{
+  unsigned i;
+  int is_uint;
+  char *env;
+  float scale = 1000.f;
+  char scalestring[20];
+
+  env = getenv("HWLOC_XML_V1DIST_SCALE");
+  if (env) {
+    scale = (float) atof(env);
+    goto scale;
+  }
+
+  is_uint = 1;
+  /* find out if all values are integers */
+  for(i=0; i<nbobjs*nbobjs; i++) {
+    float f, iptr, fptr;
+    f = floats[i];
+    if (f < 0.f) {
+      is_uint = 0;
+      break;
+    }
+    fptr = modff(f, &iptr);
+    if (fptr > .001f && fptr < .999f) {
+      is_uint = 0;
+      break;
+    }
+    /* round to nearest; cast directly to uint64_t so large distances
+     * don't overflow the intermediate int used previously */
+    u64s[i] = (uint64_t)(f+.5f);
+  }
+  if (is_uint)
+    return;
+
+ scale:
+  /* TODO heuristic to find a good scale */
+  for(i=0; i<nbobjs*nbobjs; i++)
+    u64s[i] = (uint64_t)(scale * floats[i]);
+
+  /* save the scale in root info attrs.
+   * Not perfect since we may have multiple of them,
+   * and some distances might disappear in case of restrict, etc.
+   */
+  snprintf(scalestring, sizeof(scalestring), "%f", scale);
+  hwloc_obj_add_info(hwloc_get_root_obj(topology), "xmlv1DistancesScale", scalestring);
+}
+
+/* this canNOT be the first XML call */
+/* Main XML discovery entry point: imports the root object tree, then the
+ * v2 distances, applies pre-2.0 compatibility fixups (memory-group
+ * gp_index, v1 float distances), and sets discovery support bits.
+ * On failure, all root children are freed and the root sets are zeroed
+ * so the core aborts the topology load.
+ */
+static int
+hwloc_look_xml(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_xml_backend_data_s *data = backend->private_data;
+  struct hwloc__xml_import_state_s state, childstate;
+  struct hwloc_obj *root = topology->levels[0][0];
+  char *tag;
+  int gotignored = 0;
+  hwloc_localeswitch_declare;
+  char *env;
+  int ret;
+
+  state.global = data;
+
+  assert(!root->cpuset);
+
+  hwloc_localeswitch_init();
+
+  data->nbnumanodes = 0;
+  data->first_numanode = data->last_numanode = NULL;
+  data->first_v1dist = data->last_v1dist = NULL;
+
+  env = getenv("HWLOC_DONT_MERGE_DIE_GROUPS");
+  data->dont_merge_die_groups = env && atoi(env);
+
+  ret = data->look_init(data, &state);
+  if (ret < 0)
+    goto failed;
+
+  if (data->version_major > 2) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: cannot import XML version %u.%u > 2\n",
+	      data->msgprefix, data->version_major, data->version_minor);
+    goto err;
+  }
+
+  /* find root object tag and import it */
+  ret = state.global->find_child(&state, &childstate, &tag);
+  if (ret < 0 || !ret || strcmp(tag, "object"))
+    goto failed;
+  ret = hwloc__xml_import_object(topology, data, NULL /*  no parent */, root,
+				 &gotignored,
+				 &childstate);
+  if (ret < 0)
+    goto failed;
+  state.global->close_child(&childstate);
+  /* the root cannot be ignored: import only sets "ignored" when there is a parent */
+  assert(!gotignored);
+
+  /* the root may have changed if we had to reinsert a Machine */
+  root = topology->levels[0][0];
+
+  if (data->version_major >= 2) {
+    /* find v2 distances */
+    while (1) {
+      ret = state.global->find_child(&state, &childstate, &tag);
+      if (ret < 0)
+	goto failed;
+      if (!ret)
+	break;
+      if (strcmp(tag, "distances2")) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring unknown tag `%s' after root object, expected `distances2'\n",
+		  data->msgprefix, tag);
+	goto done;
+      }
+      ret = hwloc__xml_v2import_distances(topology, &childstate);
+      if (ret < 0)
+	goto failed;
+      state.global->close_child(&childstate);
+    }
+  }
+
+  /* find end of topology tag */
+  state.global->close_tag(&state);
+
+/* checks/fixups below are shared by normal completion and the early exit above */
+done:
+  if (!root->cpuset) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid root object without cpuset\n",
+	      data->msgprefix);
+    goto err;
+  }
+
+  /* update pre-v2.0 memory group gp_index */
+  if (data->version_major < 2 && data->first_numanode) {
+    hwloc_obj_t node = data->first_numanode;
+    do {
+      if (node->parent->type == HWLOC_OBJ_GROUP
+	  && !node->parent->gp_index)
+	node->parent->gp_index = topology->next_gp_index++;
+      node = node->next_cousin;
+    } while (node);
+  }
+
+  if (data->version_major < 2 && data->first_v1dist) {
+    /* handle v1 distances */
+    struct hwloc__xml_imported_v1distances_s *v1dist, *v1next = data->first_v1dist;
+    while ((v1dist = v1next) != NULL) {
+      unsigned nbobjs = v1dist->nbobjs;
+      v1next = v1dist->next;
+      /* Handle distances as NUMA node distances if nbobjs matches.
+       * Otherwise drop, only NUMA distances really matter.
+       *
+       * We could also attach to a random level with the right nbobjs,
+       * but it would require to have those objects in the original XML order (like the first_numanode cousin-list).
+       * because the topology order can be different if some parents are ignored during load.
+       */
+      if (nbobjs == data->nbnumanodes) {
+	hwloc_obj_t *objs = malloc(nbobjs*sizeof(hwloc_obj_t));
+	uint64_t *values = malloc(nbobjs*nbobjs*sizeof(*values));
+	if (objs && values) {
+	  hwloc_obj_t node;
+	  unsigned i;
+	  for(i=0, node = data->first_numanode;
+	      i<nbobjs;
+	      i++, node = node->next_cousin)
+	    objs[i] = node;
+hwloc_convert_from_v1dist_floats(topology, nbobjs, v1dist->floats, values);
+	  hwloc_internal_distances_add(topology, nbobjs, objs, values, v1dist->kind, 0);
+	} else {
+	  free(objs);
+	  free(values);
+	}
+      }
+      free(v1dist->floats);
+      free(v1dist);
+    }
+    data->first_v1dist = data->last_v1dist = NULL;
+  }
+
+  /* FIXME:
+   * We should check that the existing object sets are consistent:
+   * no intersection between objects of a same level,
+   * object sets included in parent sets.
+   * hwloc never generated such buggy XML, but users could create one.
+   *
+   * We want to add these checks to the existing core code that
+   * adds missing sets and propagates parent/children sets
+   * (in case another backend ever generates buggy object sets as well).
+   */
+
+  if (data->version_major >= 2) {
+    /* v2 must have non-empty nodesets since at least one NUMA node is required */
+    if (!root->nodeset) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: invalid root object without nodeset\n",
+		data->msgprefix);
+      goto err;
+    }
+    if (hwloc_bitmap_iszero(root->nodeset)) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: invalid root object with empty nodeset\n",
+		data->msgprefix);
+      goto err;
+    }
+  } else {
+    /* if v1 without nodeset, the core will add a default NUMA node and nodesets */
+  }
+
+  /* allocate default cpusets and nodesets if missing, the core will restrict them */
+  hwloc_alloc_root_sets(root);
+
+  /* keep the "Backend" information intact */
+  /* we could add "BackendSource=XML" to notify that XML was used between the actual backend and here */
+
+  topology->support.discovery->pu = 1;
+  if (data->nbnumanodes) {
+    topology->support.discovery->numa = 1;
+    topology->support.discovery->numa_memory = 1; // FIXME
+  }
+
+  if (data->look_done)
+    data->look_done(data, 0);
+
+  hwloc_localeswitch_fini();
+  return 0;
+
+ failed:
+  if (data->look_done)
+    data->look_done(data, -1);
+  if (hwloc__xml_verbose())
+    fprintf(stderr, "%s: XML component discovery failed.\n",
+	    data->msgprefix);
+ err:
+  hwloc_free_object_siblings_and_children(root->first_child);
+  root->first_child = NULL;
+  hwloc_free_object_siblings_and_children(root->memory_first_child);
+  root->memory_first_child = NULL;
+  hwloc_free_object_siblings_and_children(root->io_first_child);
+  root->io_first_child = NULL;
+  hwloc_free_object_siblings_and_children(root->misc_first_child);
+  root->misc_first_child = NULL;
+
+  /* make sure the core will abort */
+  if (root->cpuset)
+    hwloc_bitmap_zero(root->cpuset);
+  if (root->nodeset)
+    hwloc_bitmap_zero(root->nodeset);
+
+  hwloc_localeswitch_fini();
+  return -1;
+}
+
+/* this can be the first XML call */
+int
+hwloc_topology_diff_load_xml(const char *xmlpath,
+			     hwloc_topology_diff_t *firstdiffp, char **refnamep)
+{
+  struct hwloc__xml_import_state_s state;
+  struct hwloc_xml_backend_data_s fakedata; /* only for storing global info during parsing */
+  hwloc_localeswitch_declare;
+  const char *msgbase;
+  int prefer_nolibxml;
+  int ret;
+
+  state.global = &fakedata;
+
+  /* use the file basename as the diagnostic message prefix */
+  msgbase = strrchr(xmlpath, '/');
+  msgbase = msgbase ? msgbase + 1 : xmlpath;
+  fakedata.msgprefix = strdup(msgbase);
+
+  hwloc_components_init();
+  assert(hwloc_nolibxml_callbacks);
+
+  hwloc_localeswitch_init();
+
+  *firstdiffp = NULL;
+
+  prefer_nolibxml = hwloc_nolibxml_import();
+retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && prefer_nolibxml)) {
+    ret = hwloc_nolibxml_callbacks->import_diff(&state, xmlpath, NULL, 0, firstdiffp, refnamep);
+  } else {
+    ret = hwloc_libxml_callbacks->import_diff(&state, xmlpath, NULL, 0, firstdiffp, refnamep);
+    if (ret < 0 && errno == ENOSYS) {
+      /* libxml backend unavailable, fall back to the internal parser */
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+
+  hwloc_localeswitch_fini();
+  hwloc_components_fini();
+  free(fakedata.msgprefix);
+  return ret;
+}
+
+/* this can be the first XML call */
+int
+hwloc_topology_diff_load_xmlbuffer(const char *xmlbuffer, int buflen,
+				   hwloc_topology_diff_t *firstdiffp, char **refnamep)
+{
+  struct hwloc__xml_import_state_s state;
+  struct hwloc_xml_backend_data_s fakedata; /* only for storing global info during parsing */
+  hwloc_localeswitch_declare;
+  int prefer_nolibxml;
+  int ret;
+
+  state.global = &fakedata;
+  fakedata.msgprefix = strdup("xmldiffbuffer");
+
+  hwloc_components_init();
+  assert(hwloc_nolibxml_callbacks);
+
+  hwloc_localeswitch_init();
+
+  *firstdiffp = NULL;
+
+  prefer_nolibxml = hwloc_nolibxml_import();
+ retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && prefer_nolibxml)) {
+    ret = hwloc_nolibxml_callbacks->import_diff(&state, NULL, xmlbuffer, buflen, firstdiffp, refnamep);
+  } else {
+    ret = hwloc_libxml_callbacks->import_diff(&state, NULL, xmlbuffer, buflen, firstdiffp, refnamep);
+    if (ret < 0 && errno == ENOSYS) {
+      /* libxml backend unavailable, fall back to the internal parser */
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+
+  hwloc_localeswitch_fini();
+  hwloc_components_fini();
+  free(fakedata.msgprefix);
+  return ret;
+}
+
+/************************************************
+ ********* XML export (common routines) *********
+ ************************************************/
+
+#define HWLOC_XML_CHAR_VALID(c) (((c) >= 32 && (c) <= 126) || (c) == '\t' || (c) == '\n' || (c) == '\r')
+
+/* Return 0 if all LENGTH bytes of BUF are XML-exportable characters
+ * (per HWLOC_XML_CHAR_VALID), -1 as soon as an invalid byte is found.
+ */
+static int
+hwloc__xml_export_check_buffer(const char *buf, size_t length)
+{
+  size_t i; /* size_t, not unsigned: an unsigned index wraps for lengths > UINT_MAX */
+  for(i=0; i<length; i++)
+    if (!HWLOC_XML_CHAR_VALID(buf[i]))
+      return -1;
+  return 0;
+}
+
+/* strdup and remove ugly chars from random string */
+static char*
+hwloc__xml_export_safestrdup(const char *old)
+{
+  const char *src = old;
+  char *copy = malloc(strlen(old)+1);
+  char *dst = copy;
+
+  /* copy only XML-exportable characters; result is at most as long as the input */
+  for( ; *src; src++) {
+    if (HWLOC_XML_CHAR_VALID(*src))
+      *dst++ = *src;
+  }
+  *dst = '\0';
+  return copy;
+}
+
+/* Emit the XML properties and property-like children ("page_type", "info",
+ * v1 "distances") describing a single topology object into 'state'.
+ * When 'flags' contains HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1, the legacy v1
+ * schema is produced instead: "Socket" for packages, plain "Cache" types,
+ * extra online/allowed cpusets, and subtype exported as an "info" pair.
+ */
+static void
+hwloc__xml_export_object_contents (hwloc__xml_export_state_t state, hwloc_topology_t topology, hwloc_obj_t obj, unsigned long flags)
+{
+  char *setstring = NULL, *setstring2 = NULL;
+  char tmp[255];
+  int v1export = flags & HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1;
+  unsigned i,j;
+
+  if (v1export && obj->type == HWLOC_OBJ_PACKAGE)
+    state->new_prop(state, "type", "Socket");
+  else if (v1export && hwloc__obj_type_is_cache(obj->type))
+    state->new_prop(state, "type", "Cache");
+  else
+    state->new_prop(state, "type", hwloc_obj_type_string(obj->type));
+
+  if (obj->os_index != HWLOC_UNKNOWN_INDEX) {
+    sprintf(tmp, "%u", obj->os_index);
+    state->new_prop(state, "os_index", tmp);
+  }
+
+  if (obj->cpuset) {
+    if (v1export && obj->type == HWLOC_OBJ_NUMANODE && obj->sibling_rank > 0) {
+      /* v1 non-first NUMA nodes have empty cpusets */
+      state->new_prop(state, "cpuset", "0x0");
+      state->new_prop(state, "online_cpuset", "0x0");
+      state->new_prop(state, "complete_cpuset", "0x0");
+      state->new_prop(state, "allowed_cpuset", "0x0");
+
+    } else {
+      /* normal case */
+      hwloc_bitmap_asprintf(&setstring, obj->cpuset);
+      state->new_prop(state, "cpuset", setstring);
+
+      hwloc_bitmap_asprintf(&setstring2, obj->complete_cpuset);
+      state->new_prop(state, "complete_cpuset", setstring2);
+      free(setstring2);
+
+      if (v1export)
+	state->new_prop(state, "online_cpuset", setstring);
+      free(setstring);
+
+      /* allowed_cpuset = cpuset masked by the topology-wide allowed set;
+       * only exported for v1 or on the root object in v2 */
+      if (v1export || !obj->parent) {
+	hwloc_bitmap_t allowed_cpuset = hwloc_bitmap_dup(obj->cpuset);
+	hwloc_bitmap_and(allowed_cpuset, allowed_cpuset, topology->allowed_cpuset);
+	hwloc_bitmap_asprintf(&setstring, allowed_cpuset);
+	state->new_prop(state, "allowed_cpuset", setstring);
+	free(setstring);
+	hwloc_bitmap_free(allowed_cpuset);
+      }
+    }
+
+    /* If exporting v1, we should clear second local NUMA bits from nodeset,
+     * but the importer will clear them anyway.
+     */
+    hwloc_bitmap_asprintf(&setstring, obj->nodeset);
+    state->new_prop(state, "nodeset", setstring);
+    free(setstring);
+
+    hwloc_bitmap_asprintf(&setstring, obj->complete_nodeset);
+    state->new_prop(state, "complete_nodeset", setstring);
+    free(setstring);
+
+    if (v1export || !obj->parent) {
+      hwloc_bitmap_t allowed_nodeset = hwloc_bitmap_dup(obj->nodeset);
+      hwloc_bitmap_and(allowed_nodeset, allowed_nodeset, topology->allowed_nodeset);
+      hwloc_bitmap_asprintf(&setstring, allowed_nodeset);
+      state->new_prop(state, "allowed_nodeset", setstring);
+      free(setstring);
+      hwloc_bitmap_free(allowed_nodeset);
+    }
+  }
+
+  /* gp_index is a v2-only global persistent identifier */
+  if (!v1export) {
+    sprintf(tmp, "%llu", (unsigned long long) obj->gp_index);
+    state->new_prop(state, "gp_index", tmp);
+  }
+
+  if (obj->name) {
+    char *name = hwloc__xml_export_safestrdup(obj->name);
+    state->new_prop(state, "name", name);
+    free(name);
+  }
+  if (!v1export && obj->subtype) {
+    char *subtype = hwloc__xml_export_safestrdup(obj->subtype);
+    state->new_prop(state, "subtype", subtype);
+    free(subtype);
+  }
+
+  /* type-specific attributes */
+  switch (obj->type) {
+  case HWLOC_OBJ_NUMANODE:
+    if (obj->attr->numanode.local_memory) {
+      sprintf(tmp, "%llu", (unsigned long long) obj->attr->numanode.local_memory);
+      state->new_prop(state, "local_memory", tmp);
+    }
+    for(i=0; i<obj->attr->numanode.page_types_len; i++) {
+      struct hwloc__xml_export_state_s childstate;
+      state->new_child(state, &childstate, "page_type");
+      sprintf(tmp, "%llu", (unsigned long long) obj->attr->numanode.page_types[i].size);
+      childstate.new_prop(&childstate, "size", tmp);
+      sprintf(tmp, "%llu", (unsigned long long) obj->attr->numanode.page_types[i].count);
+      childstate.new_prop(&childstate, "count", tmp);
+      childstate.end_object(&childstate, "page_type");
+    }
+    break;
+  case HWLOC_OBJ_L1CACHE:
+  case HWLOC_OBJ_L2CACHE:
+  case HWLOC_OBJ_L3CACHE:
+  case HWLOC_OBJ_L4CACHE:
+  case HWLOC_OBJ_L5CACHE:
+  case HWLOC_OBJ_L1ICACHE:
+  case HWLOC_OBJ_L2ICACHE:
+  case HWLOC_OBJ_L3ICACHE:
+    sprintf(tmp, "%llu", (unsigned long long) obj->attr->cache.size);
+    state->new_prop(state, "cache_size", tmp);
+    sprintf(tmp, "%u", obj->attr->cache.depth);
+    state->new_prop(state, "depth", tmp);
+    sprintf(tmp, "%u", (unsigned) obj->attr->cache.linesize);
+    state->new_prop(state, "cache_linesize", tmp);
+    sprintf(tmp, "%d", obj->attr->cache.associativity);
+    state->new_prop(state, "cache_associativity", tmp);
+    sprintf(tmp, "%d", (int) obj->attr->cache.type);
+    state->new_prop(state, "cache_type", tmp);
+    break;
+  case HWLOC_OBJ_GROUP:
+    /* v1 groups carry depth; v2 groups carry kind/subkind instead */
+    if (v1export) {
+      sprintf(tmp, "%u", obj->attr->group.depth);
+      state->new_prop(state, "depth", tmp);
+      if (obj->attr->group.dont_merge)
+        state->new_prop(state, "dont_merge", "1");
+    } else {
+      sprintf(tmp, "%u", obj->attr->group.kind);
+      state->new_prop(state, "kind", tmp);
+      sprintf(tmp, "%u", obj->attr->group.subkind);
+      state->new_prop(state, "subkind", tmp);
+      if (obj->attr->group.dont_merge)
+        state->new_prop(state, "dont_merge", "1");
+    }
+    break;
+  case HWLOC_OBJ_BRIDGE:
+    sprintf(tmp, "%d-%d", (int) obj->attr->bridge.upstream_type, (int) obj->attr->bridge.downstream_type);
+    state->new_prop(state, "bridge_type", tmp);
+    sprintf(tmp, "%u", obj->attr->bridge.depth);
+    state->new_prop(state, "depth", tmp);
+    if (obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI) {
+      sprintf(tmp, "%04x:[%02x-%02x]",
+	      (unsigned) obj->attr->bridge.downstream.pci.domain,
+	      (unsigned) obj->attr->bridge.downstream.pci.secondary_bus,
+	      (unsigned) obj->attr->bridge.downstream.pci.subordinate_bus);
+      state->new_prop(state, "bridge_pci", tmp);
+    }
+    if (obj->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_PCI)
+      break;
+    /* FALLTHRU */
+  case HWLOC_OBJ_PCI_DEVICE:
+    sprintf(tmp, "%04x:%02x:%02x.%01x",
+	    (unsigned) obj->attr->pcidev.domain,
+	    (unsigned) obj->attr->pcidev.bus,
+	    (unsigned) obj->attr->pcidev.dev,
+	    (unsigned) obj->attr->pcidev.func);
+    state->new_prop(state, "pci_busid", tmp);
+    sprintf(tmp, "%04x [%04x:%04x] [%04x:%04x] %02x",
+	    (unsigned) obj->attr->pcidev.class_id,
+	    (unsigned) obj->attr->pcidev.vendor_id, (unsigned) obj->attr->pcidev.device_id,
+	    (unsigned) obj->attr->pcidev.subvendor_id, (unsigned) obj->attr->pcidev.subdevice_id,
+	    (unsigned) obj->attr->pcidev.revision);
+    state->new_prop(state, "pci_type", tmp);
+    sprintf(tmp, "%f", obj->attr->pcidev.linkspeed);
+    state->new_prop(state, "pci_link_speed", tmp);
+    break;
+  case HWLOC_OBJ_OS_DEVICE:
+    sprintf(tmp, "%d", (int) obj->attr->osdev.type);
+    state->new_prop(state, "osdev_type", tmp);
+    break;
+  default:
+    break;
+  }
+
+  /* arbitrary name/value info pairs attached to the object */
+  for(i=0; i<obj->infos_count; i++) {
+    char *name = hwloc__xml_export_safestrdup(obj->infos[i].name);
+    char *value = hwloc__xml_export_safestrdup(obj->infos[i].value);
+    struct hwloc__xml_export_state_s childstate;
+    state->new_child(state, &childstate, "info");
+    childstate.new_prop(&childstate, "name", name);
+    childstate.new_prop(&childstate, "value", value);
+    childstate.end_object(&childstate, "info");
+    free(name);
+    free(value);
+  }
+  /* v1 has no subtype attribute; export it as a Type/CoProcType info pair */
+  if (v1export && obj->subtype) {
+    char *subtype = hwloc__xml_export_safestrdup(obj->subtype);
+    struct hwloc__xml_export_state_s childstate;
+    int is_coproctype = (obj->type == HWLOC_OBJ_OS_DEVICE && obj->attr->osdev.type == HWLOC_OBJ_OSDEV_COPROC);
+    state->new_child(state, &childstate, "info");
+    childstate.new_prop(&childstate, "name", is_coproctype ? "CoProcType" : "Type");
+    childstate.new_prop(&childstate, "value", subtype);
+    childstate.end_object(&childstate, "info");
+    free(subtype);
+  }
+
+  if (v1export && !obj->parent) {
+    /* only latency matrices covering the entire machine can be exported to v1 */
+    struct hwloc_internal_distances_s *dist;
+    /* refresh distances since we need objects below */
+    hwloc_internal_distances_refresh(topology);
+    for(dist = topology->first_dist; dist; dist = dist->next) {
+      struct hwloc__xml_export_state_s childstate;
+      unsigned nbobjs = dist->nbobjs;
+      int depth;
+
+      if (nbobjs != (unsigned) hwloc_get_nbobjs_by_type(topology, dist->type))
+	continue;
+      if (!(dist->kind & HWLOC_DISTANCES_KIND_MEANS_LATENCY))
+	continue;
+     {
+      /* map logical indexes to positions in the distances arrays */
+      HWLOC_VLA(unsigned, logical_to_v2array, nbobjs);
+      for(i=0; i<nbobjs; i++)
+	logical_to_v2array[dist->objs[i]->logical_index] = i;
+
+      /* compute the relative depth */
+      if (dist->type == HWLOC_OBJ_NUMANODE) {
+	/* for NUMA nodes, use the highest normal-parent depth + 1 */
+	depth = -1;
+	for(i=0; i<nbobjs; i++) {
+	  hwloc_obj_t parent = dist->objs[i]->parent;
+	  while (hwloc__obj_type_is_memory(parent->type))
+	    parent = parent->parent;
+	  if (parent->depth+1 > depth)
+	    depth = parent->depth+1;
+	}
+      } else {
+	/* for non-NUMA nodes, increase the object depth if any of them has memory above */
+	int parent_with_memory = 0;
+	for(i=0; i<nbobjs; i++) {
+	  hwloc_obj_t parent = dist->objs[i]->parent;
+	  while (parent) {
+	    if (parent->memory_first_child) {
+	      parent_with_memory = 1;
+	      goto done;
+	    }
+	    parent = parent->parent;
+	  }
+	}
+      done:
+	depth = hwloc_get_type_depth(topology, dist->type) + parent_with_memory;
+      }
+
+      state->new_child(state, &childstate, "distances");
+      sprintf(tmp, "%u", nbobjs);
+      childstate.new_prop(&childstate, "nbobjs", tmp);
+      sprintf(tmp, "%d", depth);
+      childstate.new_prop(&childstate, "relative_depth", tmp);
+      sprintf(tmp, "%f", 1.f);
+      childstate.new_prop(&childstate, "latency_base", tmp);
+      for(i=0; i<nbobjs; i++) {
+        for(j=0; j<nbobjs; j++) {
+	  /* we should export i*nbobjs+j, we translate using logical_to_v2array[] */
+	  unsigned k = logical_to_v2array[i]*nbobjs+logical_to_v2array[j];
+	  struct hwloc__xml_export_state_s greatchildstate;
+	  childstate.new_child(&childstate, &greatchildstate, "latency");
+	  sprintf(tmp, "%f", (float) dist->values[k]);
+	  greatchildstate.new_prop(&greatchildstate, "value", tmp);
+	  greatchildstate.end_object(&greatchildstate, "latency");
+	}
+      }
+      childstate.end_object(&childstate, "distances");
+     }
+    }
+  }
+
+  /* let the application append its own "userdata" children via its callback */
+  if (obj->userdata && topology->userdata_export_cb)
+    topology->userdata_export_cb((void*) state, topology, obj);
+}
+
+/* Recursively export 'obj' and its entire subtree in v2 schema:
+ * one <object> element per object, with memory, normal, I/O and misc
+ * children nested inside, in that order.
+ */
+static void
+hwloc__xml_v2export_object (hwloc__xml_export_state_t parentstate, hwloc_topology_t topology, hwloc_obj_t obj, unsigned long flags)
+{
+  struct hwloc__xml_export_state_s state;
+  hwloc_obj_t child;
+
+  parentstate->new_child(parentstate, &state, "object");
+
+  hwloc__xml_export_object_contents(&state, topology, obj, flags);
+
+  for_each_memory_child(child, obj)
+    hwloc__xml_v2export_object (&state, topology, child, flags);
+  for_each_child(child, obj)
+    hwloc__xml_v2export_object (&state, topology, child, flags);
+  for_each_io_child(child, obj)
+    hwloc__xml_v2export_object (&state, topology, child, flags);
+  for_each_misc_child(child, obj)
+    hwloc__xml_v2export_object (&state, topology, child, flags);
+
+  state.end_object(&state, "object");
+}
+
+static void
+hwloc__xml_v1export_object (hwloc__xml_export_state_t parentstate, hwloc_topology_t topology, hwloc_obj_t obj, unsigned long flags);
+
+/* v1 export of an object that has memory children.
+ * v1 has no separate memory-child list, so the first NUMA node is emitted
+ * as the parent of 'obj'; extra NUMA siblings follow afterwards.  When the
+ * object has siblings and several memory children, everything is wrapped in
+ * a temporary Group (borrowed from the export data) whose sets are
+ * temporarily aliased to obj's sets and reset to NULL before returning.
+ */
+static void
+hwloc__xml_v1export_object_with_memory(hwloc__xml_export_state_t parentstate, hwloc_topology_t topology, hwloc_obj_t obj, unsigned long flags)
+{
+  struct hwloc__xml_export_state_s gstate, mstate, ostate, *state = parentstate;
+  hwloc_obj_t child;
+
+  if (obj->parent->arity > 1 && obj->memory_arity > 1 && parentstate->global->v1_memory_group) {
+    /* child has sibling, we must add a Group around those memory children */
+    hwloc_obj_t group = parentstate->global->v1_memory_group;
+    parentstate->new_child(parentstate, &gstate, "object");
+    group->cpuset = obj->cpuset;
+    group->complete_cpuset = obj->complete_cpuset;
+    group->nodeset = obj->nodeset;
+    group->complete_nodeset = obj->complete_nodeset;
+    hwloc__xml_export_object_contents (&gstate, topology, group, flags);
+    group->cpuset = NULL;
+    group->complete_cpuset = NULL;
+    group->nodeset = NULL;
+    group->complete_nodeset = NULL;
+    state = &gstate;
+  }
+
+  /* export first memory child */
+  child = obj->memory_first_child;
+  assert(child->type == HWLOC_OBJ_NUMANODE);
+  state->new_child(state, &mstate, "object");
+  hwloc__xml_export_object_contents (&mstate, topology, child, flags);
+
+  /* then the actual object */
+  mstate.new_child(&mstate, &ostate, "object");
+  hwloc__xml_export_object_contents (&ostate, topology, obj, flags);
+
+  /* then its normal/io/misc children */
+  for_each_child(child, obj)
+    hwloc__xml_v1export_object (&ostate, topology, child, flags);
+  for_each_io_child(child, obj)
+    hwloc__xml_v1export_object (&ostate, topology, child, flags);
+  for_each_misc_child(child, obj)
+    hwloc__xml_v1export_object (&ostate, topology, child, flags);
+
+  /* close object and first memory child */
+  ostate.end_object(&ostate, "object");
+  mstate.end_object(&mstate, "object");
+
+  /* now other memory children */
+  for_each_memory_child(child, obj)
+    if (child->sibling_rank > 0)
+      hwloc__xml_v1export_object (state, topology, child, flags);
+
+  if (state == &gstate) {
+    /* close group if any */
+    gstate.end_object(&gstate, "object");
+  }
+}
+
+/* Recursively export 'obj' and its subtree in v1 schema.  Normal children
+ * with memory children are routed through
+ * hwloc__xml_v1export_object_with_memory() to get v1's NUMA-as-parent layout.
+ */
+static void
+hwloc__xml_v1export_object (hwloc__xml_export_state_t parentstate, hwloc_topology_t topology, hwloc_obj_t obj, unsigned long flags)
+{
+  struct hwloc__xml_export_state_s state;
+  hwloc_obj_t child;
+
+  parentstate->new_child(parentstate, &state, "object");
+
+  hwloc__xml_export_object_contents(&state, topology, obj, flags);
+
+  for_each_child(child, obj) {
+    if (!child->memory_arity) {
+      /* no memory child, just export normally */
+      hwloc__xml_v1export_object (&state, topology, child, flags);
+    } else {
+      hwloc__xml_v1export_object_with_memory(&state, topology, child, flags);
+    }
+  }
+
+  for_each_io_child(child, obj)
+    hwloc__xml_v1export_object (&state, topology, child, flags);
+  for_each_misc_child(child, obj)
+    hwloc__xml_v1export_object (&state, topology, child, flags);
+
+  state.end_object(&state, "object");
+}
+
+/* Export 'nr' entries of array 'values' (cast to 'type', printed with
+ * 'format') as a sequence of 'tagname' children, at most 'maxperline'
+ * values per child; each child carries a "length" property and the
+ * space-separated values as content.
+ */
+#define EXPORT_ARRAY(state, type, nr, values, tagname, format, maxperline) do { \
+  unsigned _i = 0; \
+  while (_i<(nr)) { \
+    char _tmp[255]; /* must hold maxperline formatted values plus separating spaces */ \
+    char _tmp2[16]; \
+    size_t _len = 0; \
+    unsigned _j; \
+    struct hwloc__xml_export_state_s _childstate; \
+    (state)->new_child(state, &_childstate, tagname); \
+    for(_j=0; \
+	_i+_j<(nr) && _j<maxperline; \
+	_j++) \
+      _len += sprintf(_tmp+_len, format " ", (type) (values)[_i+_j]); \
+    _i += _j; \
+    sprintf(_tmp2, "%lu", (unsigned long) _len); \
+    _childstate.new_prop(&_childstate, "length", _tmp2); \
+    _childstate.add_content(&_childstate, _tmp, _len); \
+    _childstate.end_object(&_childstate, tagname); \
+  } \
+} while (0)
+
+/* Export every distances matrix of the topology as a v2 "distances2"
+ * element: type/nbobjs/kind properties, an "indexing" hint (OS indexes for
+ * NUMA nodes and PUs, gp_index otherwise), then the indexes and u64 values
+ * in chunked arrays.
+ */
+static void
+hwloc__xml_v2export_distances(hwloc__xml_export_state_t parentstate, hwloc_topology_t topology)
+{
+  struct hwloc_internal_distances_s *dist;
+  for(dist = topology->first_dist; dist; dist = dist->next) {
+    char tmp[255];
+    unsigned nbobjs = dist->nbobjs;
+    struct hwloc__xml_export_state_s state;
+
+    parentstate->new_child(parentstate, &state, "distances2");
+
+    state.new_prop(&state, "type", hwloc_obj_type_string(dist->type));
+    sprintf(tmp, "%u", nbobjs);
+    state.new_prop(&state, "nbobjs", tmp);
+    sprintf(tmp, "%lu", dist->kind);
+    state.new_prop(&state, "kind", tmp);
+
+    state.new_prop(&state, "indexing",
+		   (dist->type == HWLOC_OBJ_NUMANODE || dist->type == HWLOC_OBJ_PU) ? "os" : "gp");
+    /* TODO don't hardwire 10 below. either snprintf the max to guess it, or just append until the end of the buffer */
+    EXPORT_ARRAY(&state, unsigned long long, nbobjs, dist->indexes, "indexes", "%llu", 10);
+    EXPORT_ARRAY(&state, unsigned long long, nbobjs*nbobjs, dist->values, "u64values", "%llu", 10);
+    state.end_object(&state, "distances2");
+  }
+}
+
+/* Top-level XML export entry used by both backends: walks the whole
+ * topology from the root.  v1 needs special handling when the root itself
+ * has memory children (root must stay above the first NUMA node); v2 simply
+ * exports the object tree followed by the distances matrices.
+ */
+void
+hwloc__xml_export_topology(hwloc__xml_export_state_t state, hwloc_topology_t topology, unsigned long flags)
+{
+  hwloc_obj_t root = hwloc_get_root_obj(topology);
+
+  if (flags & HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1) {
+    if (root->memory_first_child) {
+      /* we don't use hwloc__xml_v1export_object_with_memory() because we want/can keep root above the numa node */
+      struct hwloc__xml_export_state_s rstate, mstate;
+      hwloc_obj_t child;
+      /* export the root */
+      state->new_child(state, &rstate, "object");
+      hwloc__xml_export_object_contents (&rstate, topology, root, flags);
+      /* export first memory child */
+      child = root->memory_first_child;
+      assert(child->type == HWLOC_OBJ_NUMANODE);
+      rstate.new_child(&rstate, &mstate, "object");
+      hwloc__xml_export_object_contents (&mstate, topology, child, flags);
+      /* then its normal/io/misc children */
+      for_each_child(child, root)
+	hwloc__xml_v1export_object (&mstate, topology, child, flags);
+      for_each_io_child(child, root)
+	hwloc__xml_v1export_object (&mstate, topology, child, flags);
+      for_each_misc_child(child, root)
+	hwloc__xml_v1export_object (&mstate, topology, child, flags);
+      /* close first memory child */
+      mstate.end_object(&mstate, "object");
+      /* now other memory children */
+      for_each_memory_child(child, root)
+	if (child->sibling_rank > 0)
+	  hwloc__xml_v1export_object (&rstate, topology, child, flags);
+      /* close the root */
+      rstate.end_object(&rstate, "object");
+    } else {
+      hwloc__xml_v1export_object(state, topology, root, flags);
+    }
+
+  } else {
+    hwloc__xml_v2export_object (state, topology, root, flags);
+    hwloc__xml_v2export_distances (state, topology);
+  }
+}
+
+/* Export a topology-diff list as a sequence of "diff" elements, one per
+ * entry, following the generic.next chain.  Only OBJ_ATTR diffs are
+ * expected here; TOO_COMPLEX entries are rejected by the callers before
+ * reaching this point (hence the assert).
+ */
+void
+hwloc__xml_export_diff(hwloc__xml_export_state_t parentstate, hwloc_topology_diff_t diff)
+{
+  while (diff) {
+    struct hwloc__xml_export_state_s state;
+    char tmp[255];
+
+    parentstate->new_child(parentstate, &state, "diff");
+
+    sprintf(tmp, "%d", (int) diff->generic.type);
+    state.new_prop(&state, "type", tmp);
+
+    switch (diff->generic.type) {
+    case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR:
+      sprintf(tmp, "%d", diff->obj_attr.obj_depth);
+      state.new_prop(&state, "obj_depth", tmp);
+      sprintf(tmp, "%u", diff->obj_attr.obj_index);
+      state.new_prop(&state, "obj_index", tmp);
+
+      sprintf(tmp, "%d", (int) diff->obj_attr.diff.generic.type);
+      state.new_prop(&state, "obj_attr_type", tmp);
+
+      switch (diff->obj_attr.diff.generic.type) {
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE:
+	sprintf(tmp, "%llu", (unsigned long long) diff->obj_attr.diff.uint64.index);
+	state.new_prop(&state, "obj_attr_index", tmp);
+	sprintf(tmp, "%llu", (unsigned long long) diff->obj_attr.diff.uint64.oldvalue);
+	state.new_prop(&state, "obj_attr_oldvalue", tmp);
+	sprintf(tmp, "%llu", (unsigned long long) diff->obj_attr.diff.uint64.newvalue);
+	state.new_prop(&state, "obj_attr_newvalue", tmp);
+	break;
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME:
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO:
+	if (diff->obj_attr.diff.string.name)
+	  state.new_prop(&state, "obj_attr_name", diff->obj_attr.diff.string.name);
+	state.new_prop(&state, "obj_attr_oldvalue", diff->obj_attr.diff.string.oldvalue);
+	state.new_prop(&state, "obj_attr_newvalue", diff->obj_attr.diff.string.newvalue);
+	break;
+      }
+
+      break;
+    default:
+      assert(0);
+    }
+    state.end_object(&state, "diff");
+
+    diff = diff->generic.next;
+  }
+}
+
+/**********************************
+ ********* main XML export ********
+ **********************************/
+
+/* this can be the first XML call */
+/* Public API: export 'topology' to the XML file 'filename'.
+ * Returns 0 on success, -1 with errno set (EINVAL for an unloaded topology
+ * or unknown flags).  Prefers the libxml2 backend unless forced off, and
+ * silently falls back to the built-in writer when libxml2 reports ENOSYS.
+ */
+int hwloc_topology_export_xml(hwloc_topology_t topology, const char *filename, unsigned long flags)
+{
+  hwloc_localeswitch_declare;
+  struct hwloc__xml_export_data_s edata;
+  int force_nolibxml;
+  int ret;
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  assert(hwloc_nolibxml_callbacks); /* the core called components_init() for the topology */
+
+  /* the only accepted flag is the v1-format request */
+  if (flags & ~HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  hwloc_internal_distances_refresh(topology);
+
+  hwloc_localeswitch_init();
+
+  edata.v1_memory_group = NULL;
+  if (flags & HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1)
+    /* temporary group to be used during v1 export of memory children */
+    edata.v1_memory_group = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+
+  force_nolibxml = hwloc_nolibxml_export();
+retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml))
+    ret = hwloc_nolibxml_callbacks->export_file(topology, &edata, filename, flags);
+  else {
+    ret = hwloc_libxml_callbacks->export_file(topology, &edata, filename, flags);
+    if (ret < 0 && errno == ENOSYS) {
+      /* libxml2 backend unavailable: disable it and retry with the built-in one */
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+
+  if (edata.v1_memory_group)
+    hwloc_free_unlinked_object(edata.v1_memory_group);
+
+  hwloc_localeswitch_fini();
+  return ret;
+}
+
+/* this can be the first XML call */
+/* Public API: export 'topology' into a newly allocated XML buffer returned
+ * in *xmlbuffer (length in *buflen); free it with hwloc_free_xmlbuffer().
+ * Same flag handling and libxml2->builtin fallback as
+ * hwloc_topology_export_xml().
+ */
+int hwloc_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlbuffer, int *buflen, unsigned long flags)
+{
+  hwloc_localeswitch_declare;
+  struct hwloc__xml_export_data_s edata;
+  int force_nolibxml;
+  int ret;
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  assert(hwloc_nolibxml_callbacks); /* the core called components_init() for the topology */
+
+  if (flags & ~HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  hwloc_internal_distances_refresh(topology);
+
+  hwloc_localeswitch_init();
+
+  edata.v1_memory_group = NULL;
+  if (flags & HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1)
+    /* temporary group to be used during v1 export of memory children */
+    edata.v1_memory_group = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+
+  force_nolibxml = hwloc_nolibxml_export();
+retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml))
+    ret = hwloc_nolibxml_callbacks->export_buffer(topology, &edata, xmlbuffer, buflen, flags);
+  else {
+    ret = hwloc_libxml_callbacks->export_buffer(topology, &edata, xmlbuffer, buflen, flags);
+    if (ret < 0 && errno == ENOSYS) {
+      /* libxml2 backend unavailable: disable it and retry with the built-in one */
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+
+  if (edata.v1_memory_group)
+    hwloc_free_unlinked_object(edata.v1_memory_group);
+
+  hwloc_localeswitch_fini();
+  return ret;
+}
+
+/* this can be the first XML call */
+/* Public API: export a topology diff to the XML file 'filename' under the
+ * reference name 'refname'.  Fails with EINVAL if any entry in the diff
+ * list is TOO_COMPLEX.  Initializes components itself since no topology is
+ * involved.
+ */
+int
+hwloc_topology_diff_export_xml(hwloc_topology_diff_t diff, const char *refname,
+			       const char *filename)
+{
+  hwloc_localeswitch_declare;
+  hwloc_topology_diff_t tmpdiff;
+  int force_nolibxml;
+  int ret;
+
+  /* reject lists containing entries that cannot be serialized */
+  tmpdiff = diff;
+  while (tmpdiff) {
+    if (tmpdiff->generic.type == HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX) {
+      errno = EINVAL;
+      return -1;
+    }
+    tmpdiff = tmpdiff->generic.next;
+  }
+
+  hwloc_components_init();
+  assert(hwloc_nolibxml_callbacks);
+
+  hwloc_localeswitch_init();
+
+  force_nolibxml = hwloc_nolibxml_export();
+retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml))
+    ret = hwloc_nolibxml_callbacks->export_diff_file(diff, refname, filename);
+  else {
+    ret = hwloc_libxml_callbacks->export_diff_file(diff, refname, filename);
+    if (ret < 0 && errno == ENOSYS) {
+      /* libxml2 backend unavailable: disable it and retry with the built-in one */
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+
+  hwloc_localeswitch_fini();
+  hwloc_components_fini();
+  return ret;
+}
+
+/* this can be the first XML call */
+/* Public API: export a topology diff into a newly allocated XML buffer
+ * (*xmlbuffer/*buflen).  Same validation and backend fallback as
+ * hwloc_topology_diff_export_xml().
+ */
+int
+hwloc_topology_diff_export_xmlbuffer(hwloc_topology_diff_t diff, const char *refname,
+				     char **xmlbuffer, int *buflen)
+{
+  hwloc_localeswitch_declare;
+  hwloc_topology_diff_t tmpdiff;
+  int force_nolibxml;
+  int ret;
+
+  /* reject lists containing entries that cannot be serialized */
+  tmpdiff = diff;
+  while (tmpdiff) {
+    if (tmpdiff->generic.type == HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX) {
+      errno = EINVAL;
+      return -1;
+    }
+    tmpdiff = tmpdiff->generic.next;
+  }
+
+  hwloc_components_init();
+  assert(hwloc_nolibxml_callbacks);
+
+  hwloc_localeswitch_init();
+
+  force_nolibxml = hwloc_nolibxml_export();
+retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml))
+    ret = hwloc_nolibxml_callbacks->export_diff_buffer(diff, refname, xmlbuffer, buflen);
+  else {
+    ret = hwloc_libxml_callbacks->export_diff_buffer(diff, refname, xmlbuffer, buflen);
+    if (ret < 0 && errno == ENOSYS) {
+      /* libxml2 backend unavailable: disable it and retry with the built-in one */
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+
+  hwloc_localeswitch_fini();
+  hwloc_components_fini();
+  return ret;
+}
+
+/* Public API: free a buffer returned by the export_buffer/export_diff_buffer
+ * calls, using the same backend (builtin vs libxml2) that allocated it.
+ */
+void hwloc_free_xmlbuffer(hwloc_topology_t topology __hwloc_attribute_unused, char *xmlbuffer)
+{
+  int force_nolibxml;
+
+  assert(hwloc_nolibxml_callbacks); /* the core called components_init() for the topology */
+
+  force_nolibxml = hwloc_nolibxml_export();
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml))
+    hwloc_nolibxml_callbacks->free_buffer(xmlbuffer);
+  else
+    hwloc_libxml_callbacks->free_buffer(xmlbuffer);
+}
+
+/* Public API: register the callback invoked for each object carrying
+ * userdata during XML export (see hwloc__xml_export_object_contents).
+ */
+void
+hwloc_topology_set_userdata_export_callback(hwloc_topology_t topology,
+					    void (*export)(void *reserved, struct hwloc_topology *topology, struct hwloc_obj *obj))
+{
+  topology->userdata_export_cb = export;
+}
+
+/* Emit one "userdata" element: optional name, the ORIGINAL (decoded) length,
+ * an "encoding=base64" marker when 'encoded', and the (possibly encoded)
+ * content bytes.
+ */
+static void
+hwloc__export_obj_userdata(hwloc__xml_export_state_t parentstate, int encoded,
+			   const char *name, size_t length, const void *buffer, size_t encoded_length)
+{
+  struct hwloc__xml_export_state_s state;
+  char tmp[255];
+  parentstate->new_child(parentstate, &state, "userdata");
+  if (name)
+    state.new_prop(&state, "name", name);
+  sprintf(tmp, "%lu", (unsigned long) length);
+  state.new_prop(&state, "length", tmp);
+  if (encoded)
+    state.new_prop(&state, "encoding", "base64");
+  if (encoded_length)
+    state.add_content(&state, buffer, encoded ? encoded_length : length);
+  state.end_object(&state, "userdata");
+}
+
+/* Public API helper for export callbacks: write 'buffer' as a "userdata"
+ * element.  Both the name and the content must already be XML-safe
+ * (EINVAL otherwise).  If the topology was imported without decoding
+ * userdata, names are expected to carry a "base64"/"normal" prefix
+ * (with ":name" or "-anon" suffix) describing how the data was stored.
+ * Returns 0 on success, -1 with errno set.
+ */
+int
+hwloc_export_obj_userdata(void *reserved,
+			  struct hwloc_topology *topology, struct hwloc_obj *obj __hwloc_attribute_unused,
+			  const char *name, const void *buffer, size_t length)
+{
+  hwloc__xml_export_state_t state = reserved;
+
+  if (!buffer) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if ((name && hwloc__xml_export_check_buffer(name, strlen(name)) < 0)
+      || hwloc__xml_export_check_buffer(buffer, length) < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (topology->userdata_not_decoded) {
+    int encoded;
+    size_t encoded_length;
+    const char *realname;
+    /* NOTE(review): 'name' is dereferenced without a NULL check in this
+     * branch -- presumably non-decoded userdata always has a prefixed name;
+     * confirm against the import path. */
+    if (!strncmp(name, "base64", 6)) {
+      encoded = 1;
+      encoded_length = BASE64_ENCODED_LENGTH(length);
+    } else {
+      assert(!strncmp(name, "normal", 6));
+      encoded = 0;
+      encoded_length = length;
+    }
+    if (name[6] == ':')
+      realname = name+7;
+    else {
+      assert(!strcmp(name+6, "-anon"));
+      realname = NULL;
+    }
+    hwloc__export_obj_userdata(state, encoded, realname, length, buffer, encoded_length);
+
+  } else
+    hwloc__export_obj_userdata(state, 0, name, length, buffer, length);
+
+  return 0;
+}
+
+/* Public API helper for export callbacks: base64-encode 'buffer' and write
+ * it as an encoded "userdata" element.  The name (if any) must be XML-safe.
+ * Returns 0 on success, -1 with errno set (EINVAL/ENOMEM).
+ */
+int
+hwloc_export_obj_userdata_base64(void *reserved,
+				 struct hwloc_topology *topology __hwloc_attribute_unused, struct hwloc_obj *obj __hwloc_attribute_unused,
+				 const char *name, const void *buffer, size_t length)
+{
+  hwloc__xml_export_state_t state = reserved;
+  size_t encoded_length;
+  char *encoded_buffer;
+  int ret __hwloc_attribute_unused;
+
+  if (!buffer) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  assert(!topology->userdata_not_decoded);
+
+  if (name && hwloc__xml_export_check_buffer(name, strlen(name)) < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* +1 for the NUL terminator written by the encoder */
+  encoded_length = BASE64_ENCODED_LENGTH(length);
+  encoded_buffer = malloc(encoded_length+1);
+  if (!encoded_buffer) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  ret = hwloc_encode_to_base64(buffer, length, encoded_buffer, encoded_length+1);
+  assert(ret == (int) encoded_length);
+
+  hwloc__export_obj_userdata(state, 1, name, length, encoded_buffer, encoded_length);
+
+  free(encoded_buffer);
+  return 0;
+}
+
+/* Public API: register the callback invoked for each "userdata" element
+ * encountered during XML import.
+ */
+void
+hwloc_topology_set_userdata_import_callback(hwloc_topology_t topology,
+					    void (*import)(struct hwloc_topology *topology, struct hwloc_obj *obj, const char *name, const void *buffer, size_t length))
+{
+  topology->userdata_import_cb = import;
+}
+
+/***************************************
+ ************ XML component ************
+ ***************************************/
+
+/* Backend teardown: let the active XML reader clean up its own state,
+ * then release the message prefix and the backend private data.
+ */
+static void
+hwloc_xml_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_xml_backend_data_s *data = backend->private_data;
+  data->backend_exit(data);
+  free(data->msgprefix);
+  free(data);
+}
+
+/* Create the XML discovery backend.  _data1/_data2/_data3 carry an XML file
+ * path, an in-memory XML buffer and its length; when neither is given, the
+ * HWLOC_XMLFILE environment variable is consulted.  Picks the libxml2
+ * reader unless forced off, falling back to the built-in reader when
+ * libxml2 reports ENOSYS.  Returns NULL with errno set on failure.
+ */
+static struct hwloc_backend *
+hwloc_xml_component_instantiate(struct hwloc_disc_component *component,
+				const void *_data1,
+				const void *_data2,
+				const void *_data3)
+{
+  struct hwloc_xml_backend_data_s *data;
+  struct hwloc_backend *backend;
+  const char *env;
+  int force_nolibxml;
+  const char * xmlpath = (const char *) _data1;
+  const char * xmlbuffer = (const char *) _data2;
+  int xmlbuflen = (int)(uintptr_t) _data3;
+  const char *local_basename;
+  int err;
+
+  assert(hwloc_nolibxml_callbacks); /* the core called components_init() for the component's topology */
+
+  if (!xmlpath && !xmlbuffer) {
+    env = getenv("HWLOC_XMLFILE");
+    if (env) {
+      /* 'xml' was given in HWLOC_COMPONENTS without a filename */
+      xmlpath = env;
+    } else {
+      errno = EINVAL;
+      goto out;
+    }
+  }
+
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    goto out;
+
+  data = malloc(sizeof(*data));
+  if (!data) {
+    errno = ENOMEM;
+    goto out_with_backend;
+  }
+
+  backend->private_data = data;
+  backend->discover = hwloc_look_xml;
+  backend->disable = hwloc_xml_backend_disable;
+  backend->is_thissystem = 0;
+
+  /* message prefix for diagnostics: basename of the file, or "xmlbuffer" */
+  if (xmlpath) {
+    local_basename = strrchr(xmlpath, '/');
+    if (local_basename)
+      local_basename++;
+    else
+      local_basename = xmlpath;
+  } else {
+    local_basename = "xmlbuffer";
+  }
+  /* NOTE(review): strdup() result unchecked; a NULL msgprefix is later
+   * passed to free() safely, but confirm readers tolerate it. */
+  data->msgprefix = strdup(local_basename);
+
+  force_nolibxml = hwloc_nolibxml_import();
+retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml))
+    err = hwloc_nolibxml_callbacks->backend_init(data, xmlpath, xmlbuffer, xmlbuflen);
+  else {
+    err = hwloc_libxml_callbacks->backend_init(data, xmlpath, xmlbuffer, xmlbuflen);
+    if (err < 0 && errno == ENOSYS) {
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+  if (err < 0)
+    goto out_with_data;
+
+  return backend;
+
+ out_with_data:
+  free(data->msgprefix);
+  free(data);
+ out_with_backend:
+  free(backend);
+ out:
+  return NULL;
+}
+
+/* Discovery-component descriptor for the "xml" global backend.
+ * NOTE(review): field meanings below follow positional order in
+ * struct hwloc_disc_component (declared in private/components.h, not
+ * visible here) -- confirm against that header. */
+static struct hwloc_disc_component hwloc_xml_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  "xml",
+  ~0,
+  hwloc_xml_component_instantiate,
+  30,
+  1,
+  NULL
+};
+
+/* Generic component wrapper registered with the hwloc core. */
+const struct hwloc_component hwloc_xml_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_xml_disc_component
+};
diff --git a/src/3rdparty/hwloc/src/topology.c b/src/3rdparty/hwloc/src/topology.c
new file mode 100644
index 000000000..55678a084
--- /dev/null
+++ b/src/3rdparty/hwloc/src/topology.c
@@ -0,0 +1,4484 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#define _ATFILE_SOURCE
+#include <assert.h>
+#include <sys/types.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <float.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <private/misc.h>
+
+#ifdef HAVE_MACH_MACH_INIT_H
+#include <mach/mach_init.h>
+#endif
+#ifdef HAVE_MACH_MACH_HOST_H
+#include <mach/mach_host.h>
+#endif
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#ifdef HAVE_SYS_SYSCTL_H
+#include <sys/sysctl.h>
+#endif
+
+#ifdef HWLOC_WIN_SYS
+#include <windows.h>
+#endif
+
+unsigned hwloc_get_api_version(void)
+{
+  return HWLOC_API_VERSION;
+}
+
+int hwloc_topology_abi_check(hwloc_topology_t topology)
+{
+  return topology->topology_abi != HWLOC_TOPOLOGY_ABI ? -1 : 0;
+}
+
+int hwloc_hide_errors(void)
+{
+  static int hide = 0;
+  static int checked = 0;
+  if (!checked) {
+    const char *envvar = getenv("HWLOC_HIDE_ERRORS");
+    if (envvar)
+      hide = atoi(envvar);
+    checked = 1;
+  }
+  return hide;
+}
+
+void hwloc_report_os_error(const char *msg, int line)
+{
+  static int reported = 0;
+
+  if (!reported && !hwloc_hide_errors()) {
+    fprintf(stderr, "****************************************************************************\n");
+    fprintf(stderr, "* hwloc %s received invalid information from the operating system.\n", HWLOC_VERSION);
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* %s\n", msg);
+    fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* The following FAQ entry in the hwloc documentation may help:\n");
+    fprintf(stderr, "*   What should I do when hwloc reports \"operating system\" warnings?\n");
+    fprintf(stderr, "* Otherwise please report this error message to the hwloc user's mailing list,\n");
+#ifdef HWLOC_LINUX_SYS
+    fprintf(stderr, "* along with the files generated by the hwloc-gather-topology script.\n");
+#else
+    fprintf(stderr, "* along with any relevant topology information from your platform.\n");
+#endif
+    fprintf(stderr, "* \n");
+    fprintf(stderr, "* hwloc will now ignore this invalid topology information and continue.\n");
+    fprintf(stderr, "****************************************************************************\n");
+    reported = 1;
+  }
+}
+
+#if defined(HAVE_SYSCTLBYNAME)
+int hwloc_get_sysctlbyname(const char *name, int64_t *ret)
+{
+  union {
+    int32_t i32;
+    int64_t i64;
+  } n;
+  size_t size = sizeof(n);
+  if (sysctlbyname(name, &n, &size, NULL, 0))
+    return -1;
+  switch (size) {
+    case sizeof(n.i32):
+      *ret = n.i32;
+      break;
+    case sizeof(n.i64):
+      *ret = n.i64;
+      break;
+    default:
+      return -1;
+  }
+  return 0;
+}
+#endif
+
+#if defined(HAVE_SYSCTL)
+int hwloc_get_sysctl(int name[], unsigned namelen, int *ret)
+{
+  int n;
+  size_t size = sizeof(n);
+  if (sysctl(name, namelen, &n, &size, NULL, 0))
+    return -1;
+  if (size != sizeof(n))
+    return -1;
+  *ret = n;
+  return 0;
+}
+#endif
+
+/* Return the OS-provided number of processors.  Unlike other methods such as
+   reading sysfs on Linux, this method is not virtualizable; thus it's only
+   used as a fall-back method, allowing virtual backends (FSROOT, etc) to
+   have the desired effect.  */
+#ifndef HWLOC_WIN_SYS /* The windows implementation is in topology-windows.c */
+int
+hwloc_fallback_nbprocessors(struct hwloc_topology *topology __hwloc_attribute_unused) {
+  int n;
+#if HAVE_DECL__SC_NPROCESSORS_ONLN
+  n = sysconf(_SC_NPROCESSORS_ONLN);
+#elif HAVE_DECL__SC_NPROC_ONLN
+  n = sysconf(_SC_NPROC_ONLN);
+#elif HAVE_DECL__SC_NPROCESSORS_CONF
+  n = sysconf(_SC_NPROCESSORS_CONF);
+#elif HAVE_DECL__SC_NPROC_CONF
+  n = sysconf(_SC_NPROC_CONF);
+#elif defined(HAVE_HOST_INFO) && HAVE_HOST_INFO
+  struct host_basic_info info;
+  mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
+  host_info(mach_host_self(), HOST_BASIC_INFO, (integer_t*) &info, &count);
+  n = info.avail_cpus;
+#elif defined(HAVE_SYSCTLBYNAME)
+  int64_t nn;
+  if (hwloc_get_sysctlbyname("hw.ncpu", &nn))
+    nn = -1;
+  n = nn;
+#elif defined(HAVE_SYSCTL) && HAVE_DECL_CTL_HW && HAVE_DECL_HW_NCPU
+  static int name[2] = {CTL_HW, HW_NCPU};
+  if (hwloc_get_sysctl(name, sizeof(name)/sizeof(*name), &n))
+    n = -1;
+#else
+#ifdef __GNUC__
+#warning No known way to discover number of available processors on this system
+#endif
+  n = -1;
+#endif
+  return n;
+}
+#endif /* !HWLOC_WIN_SYS */
+
+/*
+ * Use the given number of processors to set a PU level.
+ */
+void
+hwloc_setup_pu_level(struct hwloc_topology *topology,
+		     unsigned nb_pus)
+{
+  struct hwloc_obj *obj;
+  unsigned oscpu,cpu;
+
+  hwloc_debug("%s", "\n\n * CPU cpusets *\n\n");
+  for (cpu=0,oscpu=0; cpu<nb_pus; oscpu++)
+    {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, oscpu);
+      obj->cpuset = hwloc_bitmap_alloc();
+      hwloc_bitmap_only(obj->cpuset, oscpu);
+
+      hwloc_debug_2args_bitmap("cpu %u (os %u) has cpuset %s\n",
+		 cpu, oscpu, obj->cpuset);
+      hwloc_insert_object_by_cpuset(topology, obj);
+
+      cpu++;
+    }
+}
+
+/* Traverse children of a parent in a safe way: reread the next pointer as
+ * appropriate to prevent crash on child deletion:  */
+#define for_each_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+#define for_each_memory_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->memory_first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+#define for_each_io_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->io_first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+#define for_each_misc_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->misc_first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+
+#ifdef HWLOC_DEBUG
+/* Just for debugging.  */
+static void
+hwloc_debug_print_object(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  char type[64], idx[12], attr[1024], *cpuset = NULL;
+  hwloc_debug("%*s", 2*indent, "");
+  hwloc_obj_type_snprintf(type, sizeof(type), obj, 1);
+  if (obj->os_index != HWLOC_UNKNOWN_INDEX)
+    snprintf(idx, sizeof(idx), "#%u", obj->os_index);
+  else
+    *idx = '\0';
+  hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", 1);
+  hwloc_debug("%s%s%s%s%s", type, idx, *attr ? "(" : "", attr, *attr ? ")" : "");
+  if (obj->name)
+    hwloc_debug(" name \"%s\"", obj->name);
+  if (obj->subtype)
+    hwloc_debug(" subtype \"%s\"", obj->subtype);
+  if (obj->cpuset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->cpuset);
+    hwloc_debug(" cpuset %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->complete_cpuset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->complete_cpuset);
+    hwloc_debug(" complete %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->nodeset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->nodeset);
+    hwloc_debug(" nodeset %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->complete_nodeset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->complete_nodeset);
+    hwloc_debug(" completeN %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->arity)
+    hwloc_debug(" arity %u", obj->arity);
+  hwloc_debug("%s", "\n");
+}
+
+static void
+hwloc_debug_print_objects(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  hwloc_obj_t child;
+  hwloc_debug_print_object(indent, obj);
+  for_each_child (child, obj)
+    hwloc_debug_print_objects(indent + 1, child);
+  for_each_memory_child (child, obj)
+    hwloc_debug_print_objects(indent + 1, child);
+  for_each_io_child (child, obj)
+    hwloc_debug_print_objects(indent + 1, child);
+  for_each_misc_child (child, obj)
+    hwloc_debug_print_objects(indent + 1, child);
+}
+#else /* !HWLOC_DEBUG */
+#define hwloc_debug_print_object(indent, obj) do { /* nothing */ } while (0)
+#define hwloc_debug_print_objects(indent, obj) do { /* nothing */ } while (0)
+#endif /* !HWLOC_DEBUG */
+
+void hwloc__free_infos(struct hwloc_info_s *infos, unsigned count)
+{
+  unsigned i;
+  for(i=0; i<count; i++) {
+    free(infos[i].name);
+    free(infos[i].value);
+  }
+  free(infos);
+}
+
+int hwloc__add_info(struct hwloc_info_s **infosp, unsigned *countp, const char *name, const char *value)
+{
+  unsigned count = *countp;
+  struct hwloc_info_s *infos = *infosp;
+#define OBJECT_INFO_ALLOC 8
+  /* nothing allocated initially, (re-)allocate by multiple of 8 */
+  unsigned alloccount = (count + 1 + (OBJECT_INFO_ALLOC-1)) & ~(OBJECT_INFO_ALLOC-1);
+  if (count != alloccount) {
+    struct hwloc_info_s *tmpinfos = realloc(infos, alloccount*sizeof(*infos));
+    if (!tmpinfos)
+      /* failed to allocate, ignore this info */
+      goto out_with_array;
+    *infosp = infos = tmpinfos;
+  }
+  infos[count].name = strdup(name);
+  if (!infos[count].name)
+    goto out_with_array;
+  infos[count].value = strdup(value);
+  if (!infos[count].value)
+    goto out_with_name;
+  *countp = count+1;
+  return 0;
+
+ out_with_name:
+  free(infos[count].name);
+ out_with_array:
+  /* don't bother reducing the array */
+  return -1;
+}
+
+int hwloc__add_info_nodup(struct hwloc_info_s **infosp, unsigned *countp,
+			  const char *name, const char *value,
+			  int replace)
+{
+  struct hwloc_info_s *infos = *infosp;
+  unsigned count = *countp;
+  unsigned i;
+  for(i=0; i<count; i++) {
+    if (!strcmp(infos[i].name, name)) {
+      if (replace) {
+	char *new = strdup(value);
+	if (!new)
+	  return -1;
+	free(infos[i].value);
+	infos[i].value = new;
+      }
+      return 0;
+    }
+  }
+  return hwloc__add_info(infosp, countp, name, value);
+}
+
+int hwloc__move_infos(struct hwloc_info_s **dst_infosp, unsigned *dst_countp,
+		      struct hwloc_info_s **src_infosp, unsigned *src_countp)
+{
+  unsigned dst_count = *dst_countp;
+  struct hwloc_info_s *dst_infos = *dst_infosp;
+  unsigned src_count = *src_countp;
+  struct hwloc_info_s *src_infos = *src_infosp;
+  unsigned i;
+#define OBJECT_INFO_ALLOC 8
+  /* nothing allocated initially, (re-)allocate by multiple of 8 */
+  unsigned alloccount = (dst_count + src_count + (OBJECT_INFO_ALLOC-1)) & ~(OBJECT_INFO_ALLOC-1);
+  if (dst_count != alloccount) {
+    struct hwloc_info_s *tmp_infos = realloc(dst_infos, alloccount*sizeof(*dst_infos));
+    if (!tmp_infos)
+      /* Failed to realloc, ignore the appended infos */
+      goto drop;
+    dst_infos = tmp_infos;
+  }
+  for(i=0; i<src_count; i++, dst_count++) {
+    dst_infos[dst_count].name = src_infos[i].name;
+    dst_infos[dst_count].value = src_infos[i].value;
+  }
+  *dst_infosp = dst_infos;
+  *dst_countp = dst_count;
+  free(src_infos);
+  *src_infosp = NULL;
+  *src_countp = 0;
+  return 0;
+
+ drop:
+  /* drop src infos, don't modify dst_infos at all */
+  for(i=0; i<src_count; i++) {
+    free(src_infos[i].name);
+    free(src_infos[i].value);
+  }
+  free(src_infos);
+  *src_infosp = NULL;
+  *src_countp = 0;
+  return -1;
+}
+
+int hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value)
+{
+  return hwloc__add_info(&obj->infos, &obj->infos_count, name, value);
+}
+
+/* This function may be called with topology->tma set, it cannot free() or realloc() */
+static int hwloc__tma_dup_infos(struct hwloc_tma *tma, hwloc_obj_t new, hwloc_obj_t src)
+{
+  unsigned i, j;
+  new->infos = hwloc_tma_calloc(tma, src->infos_count * sizeof(*src->infos));
+  if (!new->infos)
+    return -1;
+  for(i=0; i<src->infos_count; i++) {
+    new->infos[i].name = hwloc_tma_strdup(tma, src->infos[i].name);
+    new->infos[i].value = hwloc_tma_strdup(tma, src->infos[i].value);
+    if (!new->infos[i].name || !new->infos[i].value)
+      goto failed;
+  }
+  new->infos_count = src->infos_count;
+  return 0;
+
+ failed:
+  assert(!tma || !tma->dontfree); /* this tma cannot fail to allocate */
+  for(j=0; j<=i; j++) {
+    free(new->infos[j].name);
+    free(new->infos[j].value);
+  }
+  free(new->infos);
+  new->infos = NULL;
+  return -1;
+}
+
+static void
+hwloc__free_object_contents(hwloc_obj_t obj)
+{
+  switch (obj->type) {
+  case HWLOC_OBJ_NUMANODE:
+    free(obj->attr->numanode.page_types);
+    break;
+  default:
+    break;
+  }
+  hwloc__free_infos(obj->infos, obj->infos_count);
+  free(obj->attr);
+  free(obj->children);
+  free(obj->subtype);
+  free(obj->name);
+  hwloc_bitmap_free(obj->cpuset);
+  hwloc_bitmap_free(obj->complete_cpuset);
+  hwloc_bitmap_free(obj->nodeset);
+  hwloc_bitmap_free(obj->complete_nodeset);
+}
+
+/* Free an object and all its content.  */
+void
+hwloc_free_unlinked_object(hwloc_obj_t obj)
+{
+  hwloc__free_object_contents(obj);
+  free(obj);
+}
+
+/* Replace old with contents of new object, and make new freeable by the caller.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery.
+ */
+static void
+hwloc_replace_linked_object(hwloc_obj_t old, hwloc_obj_t new)
+{
+  /* drop old fields */
+  hwloc__free_object_contents(old);
+  /* copy old tree pointers to new */
+  new->parent = old->parent;
+  new->next_sibling = old->next_sibling;
+  new->first_child = old->first_child;
+  new->memory_first_child = old->memory_first_child;
+  new->io_first_child = old->io_first_child;
+  new->misc_first_child = old->misc_first_child;
+  /* copy new contents to old now that tree pointers are OK */
+  memcpy(old, new, sizeof(*old));
+  /* clear new to that we may free it */
+  memset(new, 0,sizeof(*new));
+}
+
+/* Remove an object and its children from its parent and free them.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery or during destroy.
+ */
+static void
+unlink_and_free_object_and_children(hwloc_obj_t *pobj)
+{
+  hwloc_obj_t obj = *pobj, child, *pchild;
+
+  for_each_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+  for_each_memory_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+  for_each_io_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+  for_each_misc_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+
+  *pobj = obj->next_sibling;
+  hwloc_free_unlinked_object(obj);
+}
+
+/* Free an object and its children without unlinking from parent.
+ */
+void
+hwloc_free_object_and_children(hwloc_obj_t obj)
+{
+  unlink_and_free_object_and_children(&obj);
+}
+
+/* Free an object, its next siblings and their children without unlinking from parent.
+ */
+void
+hwloc_free_object_siblings_and_children(hwloc_obj_t obj)
+{
+  while (obj)
+    unlink_and_free_object_and_children(&obj);
+}
+
+/* insert the (non-empty) list of sibling starting at firstnew as new children of newparent,
+ * and return the address of the pointer to the next one
+ */
+static hwloc_obj_t *
+insert_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
+{
+  hwloc_obj_t tmp;
+  assert(firstnew);
+  *firstp = tmp = firstnew;
+  tmp->parent = newparent;
+  while (tmp->next_sibling) {
+    tmp = tmp->next_sibling;
+    tmp->parent = newparent;
+  }
+  return &tmp->next_sibling;
+}
+
+/* Take the new list starting at firstnew and prepend it to the old list starting at *firstp,
+ * and mark the new children as children of newparent.
+ * May be used during early or late discovery (updates prev_sibling and sibling_rank).
+ * List firstnew must be non-NULL.
+ */
+static void
+prepend_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
+{
+  hwloc_obj_t *tmpp, tmp, last;
+  unsigned length;
+
+  /* update parent pointers and find the length and end of the new list */
+  for(length = 0, tmpp = &firstnew, last = NULL ; *tmpp; length++, last = *tmpp, tmpp = &((*tmpp)->next_sibling))
+    (*tmpp)->parent = newparent;
+
+  /* update sibling_rank */
+  for(tmp = *firstp; tmp; tmp = tmp->next_sibling)
+    tmp->sibling_rank += length; /* if it wasn't initialized yet, it'll be overwritten later */
+
+  /* place the existing list at the end of the new one */
+  *tmpp = *firstp;
+  if (*firstp)
+    (*firstp)->prev_sibling = last;
+
+  /* use the beginning of the new list now */
+  *firstp = firstnew;
+}
+
+/* Take the new list starting at firstnew and append it to the old list starting at *firstp,
+ * and mark the new children as children of newparent.
+ * May be used during early or late discovery (updates prev_sibling and sibling_rank).
+ */
+static void
+append_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
+{
+  hwloc_obj_t *tmpp, tmp, last;
+  unsigned length;
+
+  /* find the length and end of the existing list */
+  for(length = 0, tmpp = firstp, last = NULL ; *tmpp; length++, last = *tmpp, tmpp = &((*tmpp)->next_sibling));
+
+  /* update parent pointers and sibling_rank */
+  for(tmp = firstnew; tmp; tmp = tmp->next_sibling) {
+    tmp->parent = newparent;
+    tmp->sibling_rank += length; /* if it wasn't set yet, it'll be overwritten later */
+  }
+
+  /* place new list at the end of the old one */
+  *tmpp = firstnew;
+  if (firstnew)
+    firstnew->prev_sibling = last;
+}
+
+/* Remove an object from its parent and free it.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery.
+ *
+ * Children are inserted in the parent.
+ * If children should be inserted somewhere else (e.g. when merging with a child),
+ * the caller should move them before calling this function.
+ */
+static void
+unlink_and_free_single_object(hwloc_obj_t *pparent)
+{
+  hwloc_obj_t old = *pparent;
+  hwloc_obj_t *lastp;
+
+  if (old->type == HWLOC_OBJ_MISC) {
+    /* Misc object */
+
+    /* no normal children */
+    assert(!old->first_child);
+    /* no memory children */
+    assert(!old->memory_first_child);
+    /* no I/O children */
+    assert(!old->io_first_child);
+
+    if (old->misc_first_child)
+      /* insert old misc object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->misc_first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+  } else if (hwloc__obj_type_is_io(old->type)) {
+    /* I/O object */
+
+    /* no normal children */
+    assert(!old->first_child);
+    /* no memory children */
+    assert(!old->memory_first_child);
+
+    if (old->io_first_child)
+      /* insert old I/O object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->io_first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+    /* append old Misc children to parent */
+    if (old->misc_first_child)
+      append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
+
+  } else if (hwloc__obj_type_is_memory(old->type)) {
+    /* memory object */
+
+    /* no normal children */
+    assert(!old->first_child);
+    /* no I/O children */
+    assert(!old->io_first_child);
+
+    if (old->memory_first_child)
+      /* insert old memory object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->memory_first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+    /* append old Misc children to parent */
+    if (old->misc_first_child)
+      append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
+
+  } else {
+    /* Normal object */
+
+    if (old->first_child)
+      /* insert old object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+    /* append old memory, I/O and Misc children to parent
+     * old->parent cannot be NULL (removing root), misc children should have been moved by the caller earlier.
+     */
+    if (old->memory_first_child)
+      append_siblings_list(&old->parent->memory_first_child, old->memory_first_child, old->parent);
+    if (old->io_first_child)
+      append_siblings_list(&old->parent->io_first_child, old->io_first_child, old->parent);
+    if (old->misc_first_child)
+      append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
+  }
+
+  hwloc_free_unlinked_object(old);
+}
+
+/* This function may use a tma, it cannot free() or realloc() */
+static int
+hwloc__duplicate_object(struct hwloc_topology *newtopology,
+			struct hwloc_obj *newparent,
+			struct hwloc_obj *newobj,
+			struct hwloc_obj *src)
+{
+  struct hwloc_tma *tma = newtopology->tma;
+  hwloc_obj_t *level;
+  unsigned level_width;
+  size_t len;
+  unsigned i;
+  hwloc_obj_t child, prev;
+  int err = 0;
+
+  /* either we're duplicating to an already allocated new root, which has no newparent,
+   * or we're duplicating to a non-yet allocated new non-root, which will have a newparent.
+   */
+  assert(!newparent == !!newobj);
+
+  if (!newobj) {
+    newobj = hwloc_alloc_setup_object(newtopology, src->type, src->os_index);
+    if (!newobj)
+      return -1;
+  }
+
+  /* duplicate all non-object-pointer fields */
+  newobj->logical_index = src->logical_index;
+  newobj->depth = src->depth;
+  newobj->sibling_rank = src->sibling_rank;
+
+  newobj->type = src->type;
+  newobj->os_index = src->os_index;
+  newobj->gp_index = src->gp_index;
+  newobj->symmetric_subtree = src->symmetric_subtree;
+
+  if (src->name)
+    newobj->name = hwloc_tma_strdup(tma, src->name);
+  if (src->subtype)
+    newobj->subtype = hwloc_tma_strdup(tma, src->subtype);
+  newobj->userdata = src->userdata;
+
+  newobj->total_memory = src->total_memory;
+
+  memcpy(newobj->attr, src->attr, sizeof(*newobj->attr));
+
+  if (src->type == HWLOC_OBJ_NUMANODE && src->attr->numanode.page_types_len) {
+    len = src->attr->numanode.page_types_len * sizeof(struct hwloc_memory_page_type_s);
+    newobj->attr->numanode.page_types = hwloc_tma_malloc(tma, len);
+    memcpy(newobj->attr->numanode.page_types, src->attr->numanode.page_types, len);
+  }
+
+  newobj->cpuset = hwloc_bitmap_tma_dup(tma, src->cpuset);
+  newobj->complete_cpuset = hwloc_bitmap_tma_dup(tma, src->complete_cpuset);
+  newobj->nodeset = hwloc_bitmap_tma_dup(tma, src->nodeset);
+  newobj->complete_nodeset = hwloc_bitmap_tma_dup(tma, src->complete_nodeset);
+
+  hwloc__tma_dup_infos(tma, newobj, src);
+
+  /* find our level */
+  if (src->depth < 0) {
+    i = HWLOC_SLEVEL_FROM_DEPTH(src->depth);
+    level = newtopology->slevels[i].objs;
+    level_width = newtopology->slevels[i].nbobjs;
+    /* deal with first/last pointers of special levels, even if not really needed */
+    if (!newobj->logical_index)
+      newtopology->slevels[i].first = newobj;
+    if (newobj->logical_index == newtopology->slevels[i].nbobjs - 1)
+      newtopology->slevels[i].last = newobj;
+  } else {
+    level = newtopology->levels[src->depth];
+    level_width = newtopology->level_nbobjects[src->depth];
+  }
+  /* place us for real */
+  assert(newobj->logical_index < level_width);
+  level[newobj->logical_index] = newobj;
+  /* link to already-inserted cousins
+   * (hwloc_pci_belowroot_apply_locality() can cause out-of-order logical indexes)
+   */
+  if (newobj->logical_index > 0 && level[newobj->logical_index-1]) {
+    newobj->prev_cousin = level[newobj->logical_index-1];
+    level[newobj->logical_index-1]->next_cousin = newobj;
+  }
+  if (newobj->logical_index < level_width-1 && level[newobj->logical_index+1]) {
+    newobj->next_cousin = level[newobj->logical_index+1];
+    level[newobj->logical_index+1]->prev_cousin = newobj;
+  }
+
+  /* prepare for children */
+  if (src->arity) {
+    newobj->children = hwloc_tma_malloc(tma, src->arity * sizeof(*newobj->children));
+    if (!newobj->children)
+      return -1;
+  }
+  newobj->arity = src->arity;
+  newobj->memory_arity = src->memory_arity;
+  newobj->io_arity = src->io_arity;
+  newobj->misc_arity = src->misc_arity;
+
+  /* actually insert children now */
+  for_each_child(child, src) {
+    err = hwloc__duplicate_object(newtopology, newobj, NULL, child);
+    if (err < 0)
+      goto out_with_children;
+  }
+  for_each_memory_child(child, src) {
+    err = hwloc__duplicate_object(newtopology, newobj, NULL, child);
+    if (err < 0)
+      goto out_with_children;
+  }
+  for_each_io_child(child, src) {
+    err = hwloc__duplicate_object(newtopology, newobj, NULL, child);
+    if (err < 0)
+      goto out_with_children;
+  }
+  for_each_misc_child(child, src) {
+    err = hwloc__duplicate_object(newtopology, newobj, NULL, child);
+    if (err < 0)
+      goto out_with_children;
+  }
+
+ out_with_children:
+
+  /* link children if all of them where inserted */
+  if (!err) {
+    /* only next_sibling is set by insert_by_parent().
+     * sibling_rank was set above.
+     */
+    if (newobj->arity) {
+      newobj->children[0]->prev_sibling = NULL;
+      for(i=1; i<newobj->arity; i++)
+	newobj->children[i]->prev_sibling = newobj->children[i-1];
+      newobj->last_child = newobj->children[newobj->arity-1];
+    }
+    if (newobj->memory_arity) {
+      child = newobj->memory_first_child;
+      prev = NULL;
+      while (child) {
+	child->prev_sibling = prev;
+	prev = child;
+	child = child->next_sibling;
+      }
+    }
+    if (newobj->io_arity) {
+      child = newobj->io_first_child;
+      prev = NULL;
+      while (child) {
+	child->prev_sibling = prev;
+	prev = child;
+	child = child->next_sibling;
+      }
+    }
+    if (newobj->misc_arity) {
+      child = newobj->misc_first_child;
+      prev = NULL;
+      while (child) {
+	child->prev_sibling = prev;
+	prev = child;
+	child = child->next_sibling;
+      }
+    }
+  }
+
+  /* some children insertion may have failed, but some children may have been inserted below us already.
+   * keep inserting ourself and let the caller clean the entire tree if we return an error.
+   */
+
+  if (newparent) {
+    /* no need to check the children insert order here, the source topology
+     * is supposed to be OK already, and we have debug asserts.
+     */
+    hwloc_insert_object_by_parent(newtopology, newparent, newobj);
+
+    /* place us inside our parent children array */
+    if (hwloc__obj_type_is_normal(newobj->type))
+      newparent->children[newobj->sibling_rank] = newobj;
+  }
+
+  return err;
+}
+
+static int
+hwloc__topology_init (struct hwloc_topology **topologyp, unsigned nblevels, struct hwloc_tma *tma);
+
+/* This function may use a tma, it cannot free() or realloc() */
+int
+hwloc__topology_dup(hwloc_topology_t *newp,
+		    hwloc_topology_t old,
+		    struct hwloc_tma *tma)
+{
+  hwloc_topology_t new;
+  hwloc_obj_t newroot;
+  hwloc_obj_t oldroot = hwloc_get_root_obj(old);
+  unsigned i;
+  int err;
+
+  if (!old->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  err = hwloc__topology_init(&new, old->nb_levels_allocated, tma);
+  if (err < 0)
+    goto out;
+
+  new->flags = old->flags;
+  memcpy(new->type_filter, old->type_filter, sizeof(old->type_filter));
+  new->is_thissystem = old->is_thissystem;
+  new->is_loaded = 1;
+  new->pid = old->pid;
+  new->next_gp_index = old->next_gp_index;
+
+  memcpy(&new->binding_hooks, &old->binding_hooks, sizeof(old->binding_hooks));
+
+  memcpy(new->support.discovery, old->support.discovery, sizeof(*old->support.discovery));
+  memcpy(new->support.cpubind, old->support.cpubind, sizeof(*old->support.cpubind));
+  memcpy(new->support.membind, old->support.membind, sizeof(*old->support.membind));
+
+  new->allowed_cpuset = hwloc_bitmap_tma_dup(tma, old->allowed_cpuset);
+  new->allowed_nodeset = hwloc_bitmap_tma_dup(tma, old->allowed_nodeset);
+
+  new->userdata_export_cb = old->userdata_export_cb;
+  new->userdata_import_cb = old->userdata_import_cb;
+  new->userdata_not_decoded = old->userdata_not_decoded;
+
+  assert(!old->machine_memory.local_memory);
+  assert(!old->machine_memory.page_types_len);
+  assert(!old->machine_memory.page_types);
+
+  for(i = HWLOC_OBJ_TYPE_MIN; i < HWLOC_OBJ_TYPE_MAX; i++)
+    new->type_depth[i] = old->type_depth[i];
+
+  /* duplicate levels and we'll place objects there when duplicating objects */
+  new->nb_levels = old->nb_levels;
+  assert(new->nb_levels_allocated >= new->nb_levels);
+  for(i=1 /* root level already allocated */ ; i<new->nb_levels; i++) {
+    new->level_nbobjects[i] = old->level_nbobjects[i];
+    new->levels[i] = hwloc_tma_calloc(tma, new->level_nbobjects[i] * sizeof(*new->levels[i]));
+  }
+  for(i=0; i<HWLOC_NR_SLEVELS; i++) {
+    new->slevels[i].nbobjs = old->slevels[i].nbobjs;
+    if (new->slevels[i].nbobjs)
+      new->slevels[i].objs = hwloc_tma_calloc(tma, new->slevels[i].nbobjs * sizeof(*new->slevels[i].objs));
+  }
+
+  /* recursively duplicate object children */
+  newroot = hwloc_get_root_obj(new);
+  err = hwloc__duplicate_object(new, NULL, newroot, oldroot);
+  if (err < 0)
+    goto out_with_topology;
+
+  err = hwloc_internal_distances_dup(new, old);
+  if (err < 0)
+    goto out_with_topology;
+
+  /* we connected everything during duplication */
+  new->modified = 0;
+
+  /* no need to duplicate backends, topology is already loaded */
+  new->backends = NULL;
+  new->get_pci_busid_cpuset_backend = NULL;
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(new);
+
+  *newp = new;
+  return 0;
+
+ out_with_topology:
+  assert(!tma || !tma->dontfree); /* this tma cannot fail to allocate */
+  hwloc_topology_destroy(new);
+ out:
+  return -1;
+}
+
+int
+hwloc_topology_dup(hwloc_topology_t *newp,
+		   hwloc_topology_t old)
+{
+  return hwloc__topology_dup(newp, old, NULL);
+}
+
+/* WARNING: The indexes of this array MUST match the ordering of
+   the obj_order_type[] array, below.  Specifically, the values must
+   be laid out such that:
+
+       obj_order_type[obj_type_order[N]] = N
+
+   for all HWLOC_OBJ_* values of N.  Put differently:
+
+       obj_type_order[A] = B
+
+   where the A values are in order of the hwloc_obj_type_t enum, and
+   the B values are the corresponding indexes of obj_order_type.
+
+   We can't use C99 syntax to initialize this in a little safer manner
+   -- bummer.  :-(
+
+   Correctness is asserted in hwloc_topology_init() when debug is enabled.
+   */
+/***** Make sure you update obj_type_priority[] below as well. *****/
+/* maps each hwloc_obj_type_t value to its depth-like rank (0 = root). */
+static const unsigned obj_type_order[] = {
+    /* first entry is HWLOC_OBJ_MACHINE */  0,
+    /* next entry is HWLOC_OBJ_PACKAGE */  3,
+    /* next entry is HWLOC_OBJ_CORE */     12,
+    /* next entry is HWLOC_OBJ_PU */       16,
+    /* next entry is HWLOC_OBJ_L1CACHE */  10,
+    /* next entry is HWLOC_OBJ_L2CACHE */  8,
+    /* next entry is HWLOC_OBJ_L3CACHE */  6,
+    /* next entry is HWLOC_OBJ_L4CACHE */  5,
+    /* next entry is HWLOC_OBJ_L5CACHE */  4,
+    /* next entry is HWLOC_OBJ_L1ICACHE */ 11,
+    /* next entry is HWLOC_OBJ_L2ICACHE */ 9,
+    /* next entry is HWLOC_OBJ_L3ICACHE */ 7,
+    /* next entry is HWLOC_OBJ_GROUP */    1,
+    /* next entry is HWLOC_OBJ_NUMANODE */ 2,
+    /* next entry is HWLOC_OBJ_BRIDGE */   13,
+    /* next entry is HWLOC_OBJ_PCI_DEVICE */  14,
+    /* next entry is HWLOC_OBJ_OS_DEVICE */   15,
+    /* next entry is HWLOC_OBJ_MISC */     17
+};
+
+#ifndef NDEBUG /* only used in debug check assert if !NDEBUG */
+/* inverse of obj_type_order[]: rank -> type, from root (Machine) to leaves.
+ * Must stay the exact inverse of the table above. */
+static const hwloc_obj_type_t obj_order_type[] = {
+  HWLOC_OBJ_MACHINE,
+  HWLOC_OBJ_GROUP,
+  HWLOC_OBJ_NUMANODE,
+  HWLOC_OBJ_PACKAGE,
+  HWLOC_OBJ_L5CACHE,
+  HWLOC_OBJ_L4CACHE,
+  HWLOC_OBJ_L3CACHE,
+  HWLOC_OBJ_L3ICACHE,
+  HWLOC_OBJ_L2CACHE,
+  HWLOC_OBJ_L2ICACHE,
+  HWLOC_OBJ_L1CACHE,
+  HWLOC_OBJ_L1ICACHE,
+  HWLOC_OBJ_CORE,
+  HWLOC_OBJ_BRIDGE,
+  HWLOC_OBJ_PCI_DEVICE,
+  HWLOC_OBJ_OS_DEVICE,
+  HWLOC_OBJ_PU,
+  HWLOC_OBJ_MISC /* Misc is always a leaf */
+};
+#endif
+/***** Make sure you update obj_type_priority[] below as well. *****/
+
+/* priority to be used when merging identical parent/children object
+ * (in merge_useless_child), keep the highest priority one.
+ *
+ * Always keep Machine/NUMANode/PU/PCIDev/OSDev
+ * then Core
+ * then Package
+ * then Cache,
+ * then Instruction Caches
+ * then always drop Group/Misc/Bridge.
+ *
+ * Some types won't actually ever be involved in such merging.
+ */
+/***** Make sure you update this array when changing the list of types. *****/
+static const int obj_type_priority[] = {
+  /* first entry is HWLOC_OBJ_MACHINE */     90,
+  /* next entry is HWLOC_OBJ_PACKAGE */     40,
+  /* next entry is HWLOC_OBJ_CORE */        60,
+  /* next entry is HWLOC_OBJ_PU */          100,
+  /* next entry is HWLOC_OBJ_L1CACHE */     20,
+  /* next entry is HWLOC_OBJ_L2CACHE */     20,
+  /* next entry is HWLOC_OBJ_L3CACHE */     20,
+  /* next entry is HWLOC_OBJ_L4CACHE */     20,
+  /* next entry is HWLOC_OBJ_L5CACHE */     20,
+  /* next entry is HWLOC_OBJ_L1ICACHE */    19,
+  /* next entry is HWLOC_OBJ_L2ICACHE */    19,
+  /* next entry is HWLOC_OBJ_L3ICACHE */    19,
+  /* next entry is HWLOC_OBJ_GROUP */       0,
+  /* next entry is HWLOC_OBJ_NUMANODE */    100,
+  /* next entry is HWLOC_OBJ_BRIDGE */      0,
+  /* next entry is HWLOC_OBJ_PCI_DEVICE */  100,
+  /* next entry is HWLOC_OBJ_OS_DEVICE */   100,
+  /* next entry is HWLOC_OBJ_MISC */        0
+};
+
+/* Compare two object types by their global topology ordering.
+ * Returns a strcmp-like signed difference of their obj_type_order[] ranks,
+ * or HWLOC_TYPE_UNORDERED when the pair is not comparable. */
+int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2)
+{
+  int normal1 = hwloc__obj_type_is_normal(type1);
+  int normal2 = hwloc__obj_type_is_normal(type2);
+
+  /* only normal objects are comparable. others are only comparable with machine */
+  if (!normal1 && normal2 && type2 != HWLOC_OBJ_MACHINE)
+    return HWLOC_TYPE_UNORDERED;
+  if (!normal2 && normal1 && type1 != HWLOC_OBJ_MACHINE)
+    return HWLOC_TYPE_UNORDERED;
+
+  return obj_type_order[type1] - obj_type_order[type2];
+}
+
+/* Result of comparing two objects by their sets; the values deliberately
+ * alias the hwloc_bitmap_compare_inclusion() return codes. */
+enum hwloc_obj_cmp_e {
+  HWLOC_OBJ_EQUAL = HWLOC_BITMAP_EQUAL,			/**< \brief Equal */
+  HWLOC_OBJ_INCLUDED = HWLOC_BITMAP_INCLUDED,		/**< \brief Strictly included into */
+  HWLOC_OBJ_CONTAINS = HWLOC_BITMAP_CONTAINS,		/**< \brief Strictly contains */
+  HWLOC_OBJ_INTERSECTS = HWLOC_BITMAP_INTERSECTS,	/**< \brief Intersects, but no inclusion! */
+  HWLOC_OBJ_DIFFERENT = HWLOC_BITMAP_DIFFERENT		/**< \brief No intersection */
+};
+
+/* Compare two objects by type only, mapping the type ordering onto the
+ * inclusion results of enum hwloc_obj_cmp_e. Groups of different
+ * kind/subkind are never considered equal. */
+static enum hwloc_obj_cmp_e
+hwloc_type_cmp(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  int order = hwloc_compare_types(obj1->type, obj2->type);
+
+  if (order == HWLOC_TYPE_UNORDERED)
+    return HWLOC_OBJ_DIFFERENT; /* we cannot do better */
+  if (order)
+    /* deeper-ordered type is included in the shallower one */
+    return order > 0 ? HWLOC_OBJ_INCLUDED : HWLOC_OBJ_CONTAINS;
+
+  /* same type; Groups only match when kind and subkind both agree */
+  if (obj1->type == HWLOC_OBJ_GROUP
+      && (obj1->attr->group.kind != obj2->attr->group.kind
+	  || obj1->attr->group.subkind != obj2->attr->group.subkind))
+    return HWLOC_OBJ_DIFFERENT; /* we cannot do better */
+
+  return HWLOC_OBJ_EQUAL;
+}
+
+/*
+ * How to compare objects based on cpusets.
+ */
+
+/* Compare OBJ1 against OBJ2 using their sets and return an
+ * enum hwloc_obj_cmp_e value. Cpusets are compared first; nodesets then
+ * refine the result or turn contradictions into HWLOC_OBJ_INTERSECTS. */
+static int
+hwloc_obj_cmp_sets(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  hwloc_bitmap_t set1, set2;
+  int res = HWLOC_OBJ_DIFFERENT;
+
+  assert(!hwloc__obj_type_is_special(obj1->type));
+  assert(!hwloc__obj_type_is_special(obj2->type));
+
+  /* compare cpusets first; prefer complete_cpuset when both have one */
+  if (obj1->complete_cpuset && obj2->complete_cpuset) {
+    set1 = obj1->complete_cpuset;
+    set2 = obj2->complete_cpuset;
+  } else {
+    set1 = obj1->cpuset;
+    set2 = obj2->cpuset;
+  }
+  if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2)) {
+    res = hwloc_bitmap_compare_inclusion(set1, set2);
+    if (res == HWLOC_OBJ_INTERSECTS)
+      return HWLOC_OBJ_INTERSECTS;
+  }
+
+  /* then compare nodesets, and combine the results */
+  if (obj1->complete_nodeset && obj2->complete_nodeset) {
+    set1 = obj1->complete_nodeset;
+    set2 = obj2->complete_nodeset;
+  } else {
+    set1 = obj1->nodeset;
+    set2 = obj2->nodeset;
+  }
+  if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2)) {
+    int noderes = hwloc_bitmap_compare_inclusion(set1, set2);
+    /* deal with conflicting cpusets/nodesets inclusions */
+    if (noderes == HWLOC_OBJ_INCLUDED) {
+      if (res == HWLOC_OBJ_CONTAINS)
+	/* contradicting order for cpusets and nodesets */
+	return HWLOC_OBJ_INTERSECTS;
+      res = HWLOC_OBJ_INCLUDED;
+
+    } else if (noderes == HWLOC_OBJ_CONTAINS) {
+      if (res == HWLOC_OBJ_INCLUDED)
+	/* contradicting order for cpusets and nodesets */
+	return HWLOC_OBJ_INTERSECTS;
+      res = HWLOC_OBJ_CONTAINS;
+
+    } else if (noderes == HWLOC_OBJ_INTERSECTS) {
+      return HWLOC_OBJ_INTERSECTS;
+
+    } else {
+      /* nodesets are different, keep the cpuset order */
+
+    }
+  }
+
+  return res;
+}
+
+/* Compare object cpusets based on complete_cpuset if defined (always correctly ordered),
+ * or fallback to the main cpusets (only correctly ordered during early insert before disallowed bits are cleared).
+ *
+ * This is the sane way to compare object among a horizontal level.
+ */
+int
+hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  /* try each pair of sets from most to least reliable,
+   * and compare with the first pair both objects provide */
+  if (obj1->complete_cpuset && obj2->complete_cpuset)
+    return hwloc_bitmap_compare_first(obj1->complete_cpuset, obj2->complete_cpuset);
+  if (obj1->cpuset && obj2->cpuset)
+    return hwloc_bitmap_compare_first(obj1->cpuset, obj2->cpuset);
+  if (obj1->complete_nodeset && obj2->complete_nodeset)
+    return hwloc_bitmap_compare_first(obj1->complete_nodeset, obj2->complete_nodeset);
+  if (obj1->nodeset && obj2->nodeset)
+    return hwloc_bitmap_compare_first(obj1->nodeset, obj2->nodeset);
+  return 0; /* no comparable pair of sets */
+}
+
+/* format the obj info to print in error messages */
+static void
+hwloc__report_error_format_obj(char *buf, size_t buflen, hwloc_obj_t obj)
+{
+	char typestr[64];
+	char *cpusetstr;
+	char *nodesetstr = NULL;
+	hwloc_obj_type_snprintf(typestr, sizeof(typestr), obj, 0);
+	hwloc_bitmap_asprintf(&cpusetstr, obj->cpuset);
+	if (obj->nodeset) /* may be missing during insert */
+	  hwloc_bitmap_asprintf(&nodesetstr, obj->nodeset);
+	/* include P# only when the OS index is known */
+	if (obj->os_index != HWLOC_UNKNOWN_INDEX)
+	  snprintf(buf, buflen, "%s (P#%u cpuset %s%s%s)",
+		   typestr, obj->os_index, cpusetstr,
+		   nodesetstr ? " nodeset " : "",
+		   nodesetstr ? nodesetstr : "");
+	else
+	  snprintf(buf, buflen, "%s (cpuset %s%s%s)",
+		   typestr, cpusetstr,
+		   nodesetstr ? " nodeset " : "",
+		   nodesetstr ? nodesetstr : "");
+	free(cpusetstr);
+	free(nodesetstr); /* free(NULL) is a no-op when nodeset was missing */
+}
+
+/*
+ * How to insert objects into the topology.
+ *
+ * Note: during detection, only the first_child and next_sibling pointers are
+ * kept up to date.  Others are computed only once topology detection is
+ * complete.
+ */
+
+/* merge new object attributes in old.
+ * use old if defined, otherwise use new.
+ * Moved-out fields (infos, name, subtype, page_types) are cleared in new
+ * so that freeing new afterwards doesn't free what old now owns.
+ */
+static void
+merge_insert_equal(hwloc_obj_t new, hwloc_obj_t old)
+{
+  if (old->os_index == HWLOC_UNKNOWN_INDEX)
+    old->os_index = new->os_index;
+
+  if (new->infos_count) {
+    /* FIXME: dedup */
+    hwloc__move_infos(&old->infos, &old->infos_count,
+		      &new->infos, &new->infos_count);
+  }
+
+  if (new->name && !old->name) {
+    old->name = new->name;
+    new->name = NULL;
+  }
+  if (new->subtype && !old->subtype) {
+    old->subtype = new->subtype;
+    new->subtype = NULL;
+  }
+
+  /* Ignore userdata. It will be NULL before load().
+   * It may be non-NULL if alloc+insert_group() after load().
+   */
+
+  switch(new->type) {
+  case HWLOC_OBJ_NUMANODE:
+    if (new->attr->numanode.local_memory && !old->attr->numanode.local_memory) {
+      /* no memory in old, use new memory */
+      old->attr->numanode.local_memory = new->attr->numanode.local_memory;
+      free(old->attr->numanode.page_types);
+      old->attr->numanode.page_types_len = new->attr->numanode.page_types_len;
+      old->attr->numanode.page_types = new->attr->numanode.page_types;
+      new->attr->numanode.page_types = NULL;
+      new->attr->numanode.page_types_len = 0;
+    }
+    /* old->attr->numanode.total_memory will be updated by propagate_total_memory() */
+    break;
+  case HWLOC_OBJ_L1CACHE:
+  case HWLOC_OBJ_L2CACHE:
+  case HWLOC_OBJ_L3CACHE:
+  case HWLOC_OBJ_L4CACHE:
+  case HWLOC_OBJ_L5CACHE:
+  case HWLOC_OBJ_L1ICACHE:
+  case HWLOC_OBJ_L2ICACHE:
+  case HWLOC_OBJ_L3ICACHE:
+    /* fill missing cache attributes in old from new.
+     * BUGFIX: linesize and associativity were previously copied into
+     * old->attr->cache.size by a copy-paste mistake. */
+    if (!old->attr->cache.size)
+      old->attr->cache.size = new->attr->cache.size;
+    if (!old->attr->cache.linesize)
+      old->attr->cache.linesize = new->attr->cache.linesize;
+    if (!old->attr->cache.associativity)
+      old->attr->cache.associativity = new->attr->cache.associativity;
+    break;
+  default:
+    break;
+  }
+}
+
+/* returns the result of merge, or NULL if not merged */
+static __hwloc_inline hwloc_obj_t
+hwloc__insert_try_merge_group(hwloc_obj_t old, hwloc_obj_t new)
+{
+  if (new->type == HWLOC_OBJ_GROUP && old->type == HWLOC_OBJ_GROUP) {
+    /* which group do we keep? */
+    if (new->attr->group.dont_merge) {
+      if (old->attr->group.dont_merge)
+	/* nobody wants to be merged */
+	return NULL;
+
+      /* keep the new one, it doesn't want to be merged */
+      hwloc_replace_linked_object(old, new);
+      return new;
+
+    } else {
+      if (old->attr->group.dont_merge)
+	/* keep the old one, it doesn't want to be merged */
+	return old;
+
+      /* compare subkinds to decide who to keep */
+      if (new->attr->group.kind < old->attr->group.kind)
+	hwloc_replace_linked_object(old, new);
+      return old;
+    }
+  }
+
+  if (new->type == HWLOC_OBJ_GROUP && !new->attr->group.dont_merge) {
+
+    if (old->type == HWLOC_OBJ_PU && new->attr->group.kind == HWLOC_GROUP_KIND_MEMORY)
+      /* Never merge Memory groups with PU, we don't want to attach Memory under PU */
+      return NULL;
+
+    /* Remove the Group now. The normal ignore code path wouldn't tell us whether the Group was removed or not,
+     * while some callers need to know (at least hwloc_topology_insert_group()).
+     */
+    return old;
+
+  } else if (old->type == HWLOC_OBJ_GROUP && !old->attr->group.dont_merge) {
+
+    if (new->type == HWLOC_OBJ_PU && old->attr->group.kind == HWLOC_GROUP_KIND_MEMORY)
+      /* Never merge Memory groups with PU, we don't want to attach Memory under PU */
+      return NULL;
+
+    /* Replace the Group with the new object contents
+     * and let the caller free the new object
+     */
+    hwloc_replace_linked_object(old, new);
+    return old;
+
+  } else {
+    /* cannot merge */
+    return NULL;
+  }
+}
+
+/* Try to insert OBJ in CUR, recurse if needed.
+ * Returns the object if it was inserted,
+ * the remaining object if it was merged,
+ * NULL if failed to insert.
+ */
+static struct hwloc_obj *
+hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur, hwloc_obj_t obj,
+			        hwloc_report_error_t report_error)
+{
+  hwloc_obj_t child, next_child = NULL;
+  /* These will always point to the pointer to their next last child. */
+  hwloc_obj_t *cur_children = &cur->first_child;
+  hwloc_obj_t *obj_children = &obj->first_child;
+  /* Pointer where OBJ should be put */
+  hwloc_obj_t *putp = NULL; /* OBJ position isn't found yet */
+
+  assert(!hwloc__obj_type_is_memory(obj->type));
+
+  /* Iteration with prefetching to be completely safe against CHILD removal.
+   * The list is already sorted by cpuset, and there's no intersection between siblings.
+   */
+  for (child = cur->first_child, child ? next_child = child->next_sibling : NULL;
+       child;
+       child = next_child, child ? next_child = child->next_sibling : NULL) {
+
+    int res = hwloc_obj_cmp_sets(obj, child);
+    int setres = res;
+
+    if (res == HWLOC_OBJ_EQUAL) {
+      hwloc_obj_t merged = hwloc__insert_try_merge_group(child, obj);
+      if (merged)
+	return merged;
+      /* otherwise compare actual types to decide of the inclusion */
+      res = hwloc_type_cmp(obj, child);
+    }
+
+    switch (res) {
+      case HWLOC_OBJ_EQUAL:
+	/* Two objects with same type.
+	 * Groups are handled above.
+	 */
+	merge_insert_equal(obj, child);
+	/* Already present, no need to insert.  */
+	return child;
+
+      case HWLOC_OBJ_INCLUDED:
+	/* OBJ is strictly contained is some child of CUR, go deeper.  */
+	return hwloc___insert_object_by_cpuset(topology, child, obj, report_error);
+
+      case HWLOC_OBJ_INTERSECTS:
+        if (report_error) {
+	  char childstr[512];
+	  char objstr[512];
+	  char msg[1100];
+	  hwloc__report_error_format_obj(objstr, sizeof(objstr), obj);
+	  hwloc__report_error_format_obj(childstr, sizeof(childstr), child);
+	  snprintf(msg, sizeof(msg), "%s intersects with %s without inclusion!", objstr, childstr);
+	  report_error(msg, __LINE__);
+	}
+	goto putback;
+
+      case HWLOC_OBJ_DIFFERENT:
+        /* OBJ should be a child of CUR before CHILD, mark its position if not found yet. */
+	if (!putp && hwloc__object_cpusets_compare_first(obj, child) < 0)
+	  /* Don't insert yet, there could be intersect errors later */
+	  putp = cur_children;
+	/* Advance cur_children.  */
+	cur_children = &child->next_sibling;
+	break;
+
+      case HWLOC_OBJ_CONTAINS:
+	/* OBJ contains CHILD, remove CHILD from CUR */
+	*cur_children = child->next_sibling;
+	child->next_sibling = NULL;
+	/* Put CHILD in OBJ */
+	*obj_children = child;
+	obj_children = &child->next_sibling;
+	child->parent = obj;
+	if (setres == HWLOC_OBJ_EQUAL) {
+	  obj->memory_first_child = child->memory_first_child;
+	  child->memory_first_child = NULL;
+	}
+	break;
+    }
+  }
+  /* cur/obj_children points to last CUR/OBJ child next_sibling pointer, which must be NULL. */
+  assert(!*obj_children);
+  assert(!*cur_children);
+
+  /* Put OBJ where it belongs, or in last in CUR's children.  */
+  if (!putp)
+    putp = cur_children;
+  obj->next_sibling = *putp;
+  *putp = obj;
+  obj->parent = cur;
+
+  topology->modified = 1;
+  return obj;
+
+ putback:
+  /* Put-back OBJ children in CUR and return an error. */
+  if (putp)
+    cur_children = putp; /* No need to try to insert before where OBJ was supposed to go */
+  else
+    cur_children = &cur->first_child; /* Start from the beginning */
+  /* We can insert in order, but there can be holes in the middle. */
+  while ((child = obj->first_child) != NULL) {
+    /* Remove from OBJ */
+    obj->first_child = child->next_sibling;
+    /* BUGFIX: reparent the CHILD being put back (was "obj->parent = cur",
+     * which left children pointing at OBJ that the caller will free). */
+    child->parent = cur;
+    /* Find child position in CUR, and insert. */
+    while (*cur_children && hwloc__object_cpusets_compare_first(*cur_children, child) < 0)
+      cur_children = &(*cur_children)->next_sibling;
+    child->next_sibling = *cur_children;
+    *cur_children = child;
+  }
+  return NULL;
+}
+
+/* this differs from hwloc_get_obj_covering_cpuset() by:
+ * - not looking at the parent cpuset first, which means we can insert
+ *   below root even if root PU bits are not set yet (PU are inserted later).
+ * - returning the first child that exactly matches instead of walking down in case
+ *   of identical children.
+ */
+static struct hwloc_obj *
+hwloc__find_obj_covering_memory_cpuset(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_bitmap_t cpuset)
+{
+  hwloc_obj_t child = hwloc_get_child_covering_cpuset(topology, cpuset, parent);
+  if (!child)
+    /* no child covers the cpuset, PARENT is the best we have */
+    return parent;
+  /* child is non-NULL here; the redundant check was removed */
+  if (hwloc_bitmap_isequal(child->cpuset, cpuset))
+    return child;
+  /* child covers strictly more than cpuset, look for a tighter match below it */
+  return hwloc__find_obj_covering_memory_cpuset(topology, child, cpuset);
+}
+
+/* Find (or create a Group as) the normal object that a memory object with
+ * OBJ's cpuset should be attached under. Falls back to a larger parent
+ * whenever a tighter Group cannot be created or inserted. */
+static struct hwloc_obj *
+hwloc__find_insert_memory_parent(struct hwloc_topology *topology, hwloc_obj_t obj,
+				 hwloc_report_error_t report_error)
+{
+  hwloc_obj_t parent, group, result;
+
+  if (hwloc_bitmap_iszero(obj->cpuset)) {
+    /* CPU-less go in dedicated group below root */
+    parent = topology->levels[0][0];
+
+  } else {
+    /* find the highest obj covering the cpuset */
+    parent = hwloc__find_obj_covering_memory_cpuset(topology, topology->levels[0][0], obj->cpuset);
+    if (!parent) {
+      /* fallback to root */
+      parent = hwloc_get_root_obj(topology);
+    }
+
+    if (parent->type == HWLOC_OBJ_PU) {
+      /* Never attach to PU, try parent */
+      parent = parent->parent;
+      assert(parent);
+    }
+
+    /* TODO: if root->cpuset was updated earlier, we would be sure whether the group will remain identical to root */
+    if (parent != topology->levels[0][0] && hwloc_bitmap_isequal(parent->cpuset, obj->cpuset))
+      /* that parent is fine */
+      return parent;
+  }
+
+  if (!hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP))
+    /* even if parent isn't perfect, we don't want an intermediate group */
+    return parent;
+
+  /* need to insert an intermediate group for attaching the NUMA node */
+  group = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+  if (!group)
+    /* failed to create the group, fallback to larger parent */
+    return parent;
+
+  group->attr->group.kind = HWLOC_GROUP_KIND_MEMORY;
+  group->cpuset = hwloc_bitmap_dup(obj->cpuset);
+  group->complete_cpuset = hwloc_bitmap_dup(obj->complete_cpuset);
+  /* we could duplicate nodesets too but hwloc__insert_object_by_cpuset()
+   * doesn't actually need it. and it could prevent future calls from reusing
+   * that group for other NUMA nodes.
+   */
+  if (!group->cpuset != !obj->cpuset
+      || !group->complete_cpuset != !obj->complete_cpuset) {
+    /* a hwloc_bitmap_dup() failed (dup of non-NULL returned NULL),
+     * fallback to larger parent */
+    hwloc_free_unlinked_object(group);
+    return parent;
+  }
+
+  result = hwloc__insert_object_by_cpuset(topology, parent, group, report_error);
+  if (!result) {
+    /* failed to insert, fallback to larger parent */
+    return parent;
+  }
+
+  assert(result == group);
+  return group;
+}
+
+/* Attach the given memory object below the given normal parent. */
+struct hwloc_obj *
+hwloc__attach_memory_object(struct hwloc_topology *topology, hwloc_obj_t parent,
+			    hwloc_obj_t obj,
+			    hwloc_report_error_t report_error __hwloc_attribute_unused)
+{
+  hwloc_obj_t *cur_children;
+
+  assert(parent);
+  assert(hwloc__obj_type_is_normal(parent->type));
+
+#if 0
+  /* TODO: enable this instead of hack in fixup_sets once NUMA nodes are inserted late */
+  /* copy the parent cpuset in case it's larger than expected.
+   * we could also keep the cpuset smaller than the parent and say that a normal-parent
+   * can have multiple memory children with smaller cpusets.
+   * However, the user decided the ignore Groups, so hierarchy/locality loss is expected.
+   */
+  hwloc_bitmap_copy(obj->cpuset, parent->cpuset);
+#endif
+
+  /* only NUMA nodes are memory for now, just append to the end of the list */
+  assert(obj->type == HWLOC_OBJ_NUMANODE);
+  assert(obj->nodeset);
+  cur_children = &parent->memory_first_child;
+  while (*cur_children) {
+    /* TODO check that things are inserted in order.
+     * it's OK for KNL, the only user so far
+     */
+    cur_children = &(*cur_children)->next_sibling;
+  }
+  *cur_children = obj;
+  obj->next_sibling = NULL;
+
+  /* Initialize the complete nodeset if needed */
+  if (!obj->complete_nodeset) {
+    obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
+  }
+
+  /* Add the bit to the top sets, and to the parent CPU-side object */
+  if (obj->type == HWLOC_OBJ_NUMANODE) {
+    /* main nodeset only gets the bit when the object itself has it */
+    if (hwloc_bitmap_isset(obj->nodeset, obj->os_index))
+      hwloc_bitmap_set(topology->levels[0][0]->nodeset, obj->os_index);
+    hwloc_bitmap_set(topology->levels[0][0]->complete_nodeset, obj->os_index);
+  }
+
+  topology->modified = 1;
+  return obj;
+}
+
+/* insertion routine that lets you change the error reporting callback */
+struct hwloc_obj *
+hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t root,
+			       hwloc_obj_t obj,
+			       hwloc_report_error_t report_error)
+{
+  struct hwloc_obj *result;
+
+#ifdef HWLOC_DEBUG
+  assert(!hwloc__obj_type_is_special(obj->type));
+
+  /* we need at least one non-NULL set (normal or complete, cpuset or nodeset) */
+  assert(obj->cpuset || obj->complete_cpuset || obj->nodeset || obj->complete_nodeset);
+  /* we support the case where all of them are empty.
+   * it may happen when hwloc__find_insert_memory_parent()
+   * inserts a Group for a CPU-less NUMA-node.
+   */
+#endif
+
+  /* memory objects get attached via the dedicated memory children list */
+  if (hwloc__obj_type_is_memory(obj->type)) {
+    if (!root) {
+      root = hwloc__find_insert_memory_parent(topology, obj, report_error);
+      if (!root) {
+	hwloc_free_unlinked_object(obj);
+	return NULL;
+      }
+    }
+    return hwloc__attach_memory_object(topology, root, obj, report_error);
+  }
+
+  if (!root)
+    /* Start at the top. */
+    root = topology->levels[0][0];
+
+  result = hwloc___insert_object_by_cpuset(topology, root, obj, report_error);
+  if (result && result->type == HWLOC_OBJ_PU) {
+      /* Add the bit to the top sets */
+      if (hwloc_bitmap_isset(result->cpuset, result->os_index))
+	hwloc_bitmap_set(topology->levels[0][0]->cpuset, result->os_index);
+      hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, result->os_index);
+  }
+  if (result != obj) {
+    /* either failed to insert, or got merged, free the original object */
+    hwloc_free_unlinked_object(obj);
+  }
+  return result;
+}
+
+/* the default insertion routine warns in case of error.
+ * it's used by most backends.
+ * OBJ is freed on failure or merge; callers must use the returned object. */
+struct hwloc_obj *
+hwloc_insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj)
+{
+  return hwloc__insert_object_by_cpuset(topology, NULL, obj, hwloc_report_os_error);
+}
+
+/* Append OBJ at the end of PARENT's child list matching OBJ's type class
+ * (misc, I/O, memory, or normal). The caller guarantees ordering. */
+void
+hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_obj_t obj)
+{
+  hwloc_obj_t *current;
+
+  if (obj->type == HWLOC_OBJ_MISC) {
+    /* Append to the end of the Misc list */
+    for (current = &parent->misc_first_child; *current; current = &(*current)->next_sibling);
+  } else if (hwloc__obj_type_is_io(obj->type)) {
+    /* Append to the end of the I/O list */
+    for (current = &parent->io_first_child; *current; current = &(*current)->next_sibling);
+  } else if (hwloc__obj_type_is_memory(obj->type)) {
+    /* Append to the end of the memory list */
+    for (current = &parent->memory_first_child; *current; current = &(*current)->next_sibling);
+    /* Add the bit to the top sets */
+    if (obj->type == HWLOC_OBJ_NUMANODE) {
+      if (hwloc_bitmap_isset(obj->nodeset, obj->os_index))
+	hwloc_bitmap_set(topology->levels[0][0]->nodeset, obj->os_index);
+      hwloc_bitmap_set(topology->levels[0][0]->complete_nodeset, obj->os_index);
+    }
+  } else {
+    /* Append to the end of the list.
+     * The caller takes care of inserting children in the right cpuset order, without intersection between them.
+     * Duplicating doesn't need to check the order since the source topology is supposed to be OK already.
+     * XML reorders if needed, and fails on intersecting siblings.
+     * Other callers just insert random objects such as I/O or Misc, no cpuset issue there.
+     */
+    for (current = &parent->first_child; *current; current = &(*current)->next_sibling);
+    /* Add the bit to the top sets */
+    if (obj->type == HWLOC_OBJ_PU) {
+      if (hwloc_bitmap_isset(obj->cpuset, obj->os_index))
+	hwloc_bitmap_set(topology->levels[0][0]->cpuset, obj->os_index);
+      hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, obj->os_index);
+    }
+  }
+
+  *current = obj;
+  obj->parent = parent;
+  obj->next_sibling = NULL;
+  topology->modified = 1;
+}
+
+/* Allocate (from the topology's tma) and minimally initialize a new object
+ * with the given type and os_index; gp_index comes from the global counter.
+ * NOTE(review): the hwloc_tma_malloc() results are not NULL-checked before
+ * the memsets — presumably the tma cannot fail here; confirm tma semantics. */
+hwloc_obj_t
+hwloc_alloc_setup_object(hwloc_topology_t topology,
+			 hwloc_obj_type_t type, unsigned os_index)
+{
+  struct hwloc_obj *obj = hwloc_tma_malloc(topology->tma, sizeof(*obj));
+  memset(obj, 0, sizeof(*obj));
+  obj->type = type;
+  obj->os_index = os_index;
+  obj->gp_index = topology->next_gp_index++;
+  obj->attr = hwloc_tma_malloc(topology->tma, sizeof(*obj->attr));
+  memset(obj->attr, 0, sizeof(*obj->attr));
+  /* do not allocate the cpuset here, let the caller do it */
+  return obj;
+}
+
+/* Public helper: allocate an empty Group object for later insertion
+ * via hwloc_topology_insert_group_object(). Requires a loaded topology. */
+hwloc_obj_t
+hwloc_topology_alloc_group_object(struct hwloc_topology *topology)
+{
+  if (!topology->is_loaded) {
+    /* this could actually work, see insert() below */
+    errno = EINVAL;
+    return NULL;
+  }
+  return hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+}
+
+static void hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root);
+static void propagate_total_memory(hwloc_obj_t obj);
+static void hwloc_set_group_depth(hwloc_topology_t topology);
+
+/* Public entry point: insert a user-provided Group object into a loaded
+ * topology. OBJ's sets are clipped to the root's sets first; OBJ is freed
+ * on error or merge, and the surviving object is returned. */
+hwloc_obj_t
+hwloc_topology_insert_group_object(struct hwloc_topology *topology, hwloc_obj_t obj)
+{
+  hwloc_obj_t res, root;
+  int cmp;
+
+  if (!topology->is_loaded) {
+    /* this could actually work, we would just need to disable connect_children/levels below */
+    hwloc_free_unlinked_object(obj);
+    errno = EINVAL;
+    return NULL;
+  }
+
+  if (topology->type_filter[HWLOC_OBJ_GROUP] == HWLOC_TYPE_FILTER_KEEP_NONE) {
+    hwloc_free_unlinked_object(obj);
+    errno = EINVAL;
+    return NULL;
+  }
+
+  /* clip all of OBJ's sets to the root's sets */
+  root = hwloc_get_root_obj(topology);
+  if (obj->cpuset)
+    hwloc_bitmap_and(obj->cpuset, obj->cpuset, root->cpuset);
+  if (obj->complete_cpuset)
+    hwloc_bitmap_and(obj->complete_cpuset, obj->complete_cpuset, root->complete_cpuset);
+  if (obj->nodeset)
+    hwloc_bitmap_and(obj->nodeset, obj->nodeset, root->nodeset);
+  if (obj->complete_nodeset)
+    hwloc_bitmap_and(obj->complete_nodeset, obj->complete_nodeset, root->complete_nodeset);
+
+  /* all sets NULL or empty after clipping: nothing to group */
+  if ((!obj->cpuset || hwloc_bitmap_iszero(obj->cpuset))
+      && (!obj->complete_cpuset || hwloc_bitmap_iszero(obj->complete_cpuset))
+      && (!obj->nodeset || hwloc_bitmap_iszero(obj->nodeset))
+      && (!obj->complete_nodeset || hwloc_bitmap_iszero(obj->complete_nodeset))) {
+    hwloc_free_unlinked_object(obj);
+    errno = EINVAL;
+    return NULL;
+  }
+
+  cmp = hwloc_obj_cmp_sets(obj, root);
+  if (cmp == HWLOC_OBJ_INCLUDED) {
+    res = hwloc__insert_object_by_cpuset(topology, NULL, obj, NULL /* do not show errors on stdout */);
+  } else {
+    /* just merge root */
+    res = root;
+  }
+
+  if (!res)
+    return NULL;
+  if (res != obj)
+    /* merged */
+    return res;
+
+  /* properly inserted */
+  hwloc_obj_add_children_sets(obj);
+  if (hwloc_topology_reconnect(topology, 0) < 0)
+    return NULL;
+
+  hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
+  hwloc_set_group_depth(topology);
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(topology);
+
+  return obj;
+}
+
+/* Public entry point: create a Misc object named NAME and attach it under
+ * PARENT. Requires a loaded topology and the Misc filter to allow it. */
+hwloc_obj_t
+hwloc_topology_insert_misc_object(struct hwloc_topology *topology, hwloc_obj_t parent, const char *name)
+{
+  hwloc_obj_t obj;
+
+  if (topology->type_filter[HWLOC_OBJ_MISC] == HWLOC_TYPE_FILTER_KEEP_NONE) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_MISC, HWLOC_UNKNOWN_INDEX);
+  if (name)
+    obj->name = strdup(name);
+
+  hwloc_insert_object_by_parent(topology, parent, obj);
+
+  /* FIXME: only connect misc parent children and misc level,
+   * but this API is likely not performance critical anyway
+   */
+  hwloc_topology_reconnect(topology, 0);
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(topology);
+
+  return obj;
+}
+
+/* assuming set is included in the topology complete_cpuset
+ * and all objects have a proper complete_cpuset,
+ * return the best one containing set.
+ * if some objects are equivalent (same complete_cpuset), return the highest one.
+ */
+static hwloc_obj_t
+hwloc_get_highest_obj_covering_complete_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  hwloc_obj_t current = hwloc_get_root_obj(topology);
+  hwloc_obj_t child;
+
+  if (hwloc_bitmap_isequal(set, current->complete_cpuset))
+    /* root cpuset is exactly what we want, no need to look at children, we want the highest */
+    return current;
+
+ recurse:
+  /* find the right child */
+  for_each_child(child, current) {
+    if (hwloc_bitmap_isequal(set, child->complete_cpuset))
+      /* child cpuset is exactly what we want, no need to look at children, we want the highest */
+      return child;
+    if (!hwloc_bitmap_iszero(child->complete_cpuset) && hwloc_bitmap_isincluded(set, child->complete_cpuset))
+      break;
+  }
+
+  if (child) {
+    /* descend into the covering child and keep looking */
+    current = child;
+    goto recurse;
+  }
+
+  /* no better child */
+  return current;
+}
+
+/* Find (or create an I/O Group as) the object that an I/O device with the
+ * given cpuset should be attached under. CPUSET is clipped in place.
+ * Falls back to the larger covering parent whenever a Group cannot be made. */
+hwloc_obj_t
+hwloc_find_insert_io_parent_by_complete_cpuset(struct hwloc_topology *topology, hwloc_cpuset_t cpuset)
+{
+  hwloc_obj_t group_obj, largeparent, parent;
+
+  /* restrict to the existing complete cpuset to avoid errors later */
+  hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_complete_cpuset(topology));
+  if (hwloc_bitmap_iszero(cpuset))
+    /* remaining cpuset is empty, invalid */
+    return NULL;
+
+  largeparent = hwloc_get_highest_obj_covering_complete_cpuset(topology, cpuset);
+  if (hwloc_bitmap_isequal(largeparent->complete_cpuset, cpuset)
+      || !hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP))
+    /* Found a valid object (normal case) */
+    return largeparent;
+
+  /* we need to insert an intermediate group */
+  group_obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+  if (!group_obj)
+    /* Failed to insert the exact Group, fallback to largeparent */
+    return largeparent;
+
+  group_obj->complete_cpuset = hwloc_bitmap_dup(cpuset);
+  hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_topology_cpuset(topology));
+  group_obj->cpuset = hwloc_bitmap_dup(cpuset);
+  group_obj->attr->group.kind = HWLOC_GROUP_KIND_IO;
+  parent = hwloc__insert_object_by_cpuset(topology, largeparent, group_obj, hwloc_report_os_error);
+  if (!parent)
+    /* Failed to insert the Group, maybe a conflicting cpuset */
+    return largeparent;
+
+  /* Group couldn't get merged or we would have gotten the right largeparent earlier */
+  assert(parent == group_obj);
+
+  /* Group inserted without being merged, everything OK, setup its sets */
+  hwloc_obj_add_children_sets(group_obj);
+
+  return parent;
+}
+
+/* qsort comparator: ascending page size, with 0-size entries last.
+ * BUGFIX: the previous version returned -1 for (zero, positive) AND for
+ * (positive, zero) — an inconsistent ordering, which qsort does not allow
+ * and which could leave 0-size page_types at the front. */
+static int hwloc_memory_page_type_compare(const void *_a, const void *_b)
+{
+  const struct hwloc_memory_page_type_s *a = _a;
+  const struct hwloc_memory_page_type_s *b = _b;
+  /* consider 0 as larger so that 0-size page_type go to the end */
+  if (!a->size)
+    return b->size ? 1 : 0;
+  if (!b->size)
+    return -1;
+  /* don't cast a-b in int since those are ullongs */
+  if (b->size == a->size)
+    return 0;
+  return a->size < b->size ? -1 : 1;
+}
+
+/* Propagate memory counts */
+static void
+propagate_total_memory(hwloc_obj_t obj)
+{
+  hwloc_obj_t child;
+  unsigned i;
+
+  /* reset total before counting local and children memory */
+  obj->total_memory = 0;
+
+  /* Propagate memory up. */
+  for_each_child(child, obj) {
+    propagate_total_memory(child);
+    obj->total_memory += child->total_memory;
+  }
+  for_each_memory_child(child, obj) {
+    propagate_total_memory(child);
+    obj->total_memory += child->total_memory;
+  }
+  /* No memory under I/O or Misc */
+
+  if (obj->type == HWLOC_OBJ_NUMANODE) {
+    obj->total_memory += obj->attr->numanode.local_memory;
+
+    /* By the way, sort the page_type array.
+     * Cannot do it on insert since some backends (e.g. XML) add page_types after inserting the object.
+     */
+    qsort(obj->attr->numanode.page_types, obj->attr->numanode.page_types_len, sizeof(*obj->attr->numanode.page_types), hwloc_memory_page_type_compare);
+    /* Ignore 0-size page_types, they are at the end.
+     * Scan backwards for the last non-zero size; i is unsigned so the
+     * loop stops at 1 (i==0 means all entries were 0-size). */
+    for(i=obj->attr->numanode.page_types_len; i>=1; i--)
+      if (obj->attr->numanode.page_types[i-1].size)
+	break;
+    obj->attr->numanode.page_types_len = i;
+  }
+}
+
+/* Now that root sets are ready, propagate them to children
+ * by allocating missing sets and restricting existing ones.
+ */
+static void
+fixup_sets(hwloc_obj_t obj)
+{
+  int in_memory_list;
+  hwloc_obj_t child;
+
+  child = obj->first_child;
+  in_memory_list = 0;
+  /* iterate over normal children first, we'll come back for memory children later */
+
+ iterate:
+  while (child) {
+    /* our cpuset must be included in our parent's one */
+    hwloc_bitmap_and(child->cpuset, child->cpuset, obj->cpuset);
+    hwloc_bitmap_and(child->nodeset, child->nodeset, obj->nodeset);
+    /* our complete_cpuset must be included in our parent's one, but can be larger than our cpuset */
+    if (child->complete_cpuset) {
+      hwloc_bitmap_and(child->complete_cpuset, child->complete_cpuset, obj->complete_cpuset);
+    } else {
+      child->complete_cpuset = hwloc_bitmap_dup(child->cpuset);
+    }
+    if (child->complete_nodeset) {
+      hwloc_bitmap_and(child->complete_nodeset, child->complete_nodeset, obj->complete_nodeset);
+    } else {
+      child->complete_nodeset = hwloc_bitmap_dup(child->nodeset);
+    }
+
+    fixup_sets(child);
+    child = child->next_sibling;
+  }
+
+  /* switch to memory children list if any */
+  if (!in_memory_list && obj->memory_first_child) {
+    child = obj->memory_first_child;
+    in_memory_list = 1;
+    goto iterate;
+  }
+
+  /* No sets in I/O or Misc */
+}
+
+/* Setup object cpusets/nodesets by OR'ing its children. */
+int
+hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src)
+{
+#define ADD_OTHER_OBJ_SET(_dst, _src, _set)			\
+  if ((_src)->_set) {						\
+    if (!(_dst)->_set)						\
+      (_dst)->_set = hwloc_bitmap_alloc();			\
+    hwloc_bitmap_or((_dst)->_set, (_dst)->_set, (_src)->_set);	\
+  }
+  ADD_OTHER_OBJ_SET(dst, src, cpuset);
+  ADD_OTHER_OBJ_SET(dst, src, complete_cpuset);
+  ADD_OTHER_OBJ_SET(dst, src, nodeset);
+  ADD_OTHER_OBJ_SET(dst, src, complete_nodeset);
+  return 0;
+}
+
+int
+hwloc_obj_add_children_sets(hwloc_obj_t obj)
+{
+  hwloc_obj_t child;
+  for_each_child(child, obj) {
+    hwloc_obj_add_other_obj_sets(obj, child);
+  }
+  /* No need to look at Misc children, they contain no PU. */
+  return 0;
+}
+
/* CPU objects are inserted by cpusets, we know their cpusets are properly included.
 * We just need fixup_sets() to make sure they aren't too wide.
 *
 * Memory objects are inserted by cpusets to find their CPU parent,
 * but nodesets are only used inside the memory hierarchy below that parent.
 * Thus we need to propagate nodesets to CPU-side parents and children.
 *
 * A memory object nodeset consists of NUMA nodes below it.
 * A normal object nodeset consists in NUMA nodes attached to any
 * of its children or parents.
 *
 * Recursion order matters: memory children are handled before normal
 * children, and sibling-local nodesets are only merged upward after
 * every child has been recursed into (see the last loop).
 */
static void
propagate_nodeset(hwloc_obj_t obj)
{
  hwloc_obj_t child;

  /* Start our nodeset from the parent one.
   * It was emptied at root, and it's being filled with local nodes
   * in that branch of the tree as we recurse down.
   */
  if (!obj->nodeset)
    obj->nodeset = hwloc_bitmap_alloc();
  if (obj->parent)
    hwloc_bitmap_copy(obj->nodeset, obj->parent->nodeset);
  else
    hwloc_bitmap_zero(obj->nodeset);

  /* Don't clear complete_nodeset, just make sure it contains nodeset.
   * We cannot clear the complete_nodeset at root and rebuild it down because
   * some bits may correspond to offline/disallowed NUMA nodes missing in the topology.
   */
  if (!obj->complete_nodeset)
    obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
  else
    hwloc_bitmap_or(obj->complete_nodeset, obj->complete_nodeset, obj->nodeset);

  /* now add our local nodeset */
  for_each_memory_child(child, obj) {
    /* FIXME rather recurse in the memory hierarchy */

    /* first, update children complete_nodeset if needed */
    if (!child->complete_nodeset)
      child->complete_nodeset = hwloc_bitmap_dup(child->nodeset);
    else
      hwloc_bitmap_or(child->complete_nodeset, child->complete_nodeset, child->nodeset);

    /* add memory children nodesets to ours */
    hwloc_bitmap_or(obj->nodeset, obj->nodeset, child->nodeset);
    hwloc_bitmap_or(obj->complete_nodeset, obj->complete_nodeset, child->complete_nodeset);

    /* by the way, copy our cpusets to memory children
     * (memory objects share the cpuset of their CPU-side parent) */
    if (child->cpuset)
      hwloc_bitmap_copy(child->cpuset, obj->cpuset);
    else
      child->cpuset = hwloc_bitmap_dup(obj->cpuset);
    if (child->complete_cpuset)
      hwloc_bitmap_copy(child->complete_cpuset, obj->complete_cpuset);
    else
      child->complete_cpuset = hwloc_bitmap_dup(obj->complete_cpuset);
  }

  /* Propagate our nodeset to CPU children. */
  for_each_child(child, obj) {
    propagate_nodeset(child);
  }

  /* Propagate CPU children specific nodesets back to us.
   *
   * We cannot merge these two loops because we don't want the first child's
   * nodeset to be propagated back to us and then down to the second child.
   * Each child may have its own local nodeset,
   * each of them is propagated to us, but not to other children.
   */
  for_each_child(child, obj) {
    hwloc_bitmap_or(obj->nodeset, obj->nodeset, child->nodeset);
    hwloc_bitmap_or(obj->complete_nodeset, obj->complete_nodeset, child->complete_nodeset);
  }

  /* No nodeset under I/O or Misc */

}
+
+static void
+remove_unused_sets(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  hwloc_obj_t child;
+
+  hwloc_bitmap_and(obj->cpuset, obj->cpuset, topology->allowed_cpuset);
+  hwloc_bitmap_and(obj->nodeset, obj->nodeset, topology->allowed_nodeset);
+
+  for_each_child(child, obj)
+    remove_unused_sets(topology, child);
+  for_each_memory_child(child, obj)
+    remove_unused_sets(topology, child);
+  /* No cpuset under I/O or Misc */
+}
+
+static void
+hwloc__filter_bridges(hwloc_topology_t topology, hwloc_obj_t root, unsigned depth)
+{
+  hwloc_obj_t child, *pchild;
+
+  /* filter I/O children and recurse */
+  for_each_io_child_safe(child, root, pchild) {
+    enum hwloc_type_filter_e filter = topology->type_filter[child->type];
+
+    /* recurse into grand-children */
+    hwloc__filter_bridges(topology, child, depth+1);
+
+    child->attr->bridge.depth = depth;
+
+    if (child->type == HWLOC_OBJ_BRIDGE
+	&& filter == HWLOC_TYPE_FILTER_KEEP_IMPORTANT
+	&& !child->io_first_child) {
+      unlink_and_free_single_object(pchild);
+      topology->modified = 1;
+    }
+  }
+}
+
+static void
+hwloc_filter_bridges(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+  hwloc_obj_t child = parent->first_child;
+  while (child) {
+    hwloc_filter_bridges(topology, child);
+    child = child->next_sibling;
+  }
+
+  hwloc__filter_bridges(topology, parent, 0);
+}
+
+void
+hwloc__reorder_children(hwloc_obj_t parent)
+{
+  /* move the children list on the side */
+  hwloc_obj_t *prev, child, children = parent->first_child;
+  parent->first_child = NULL;
+  while (children) {
+    /* dequeue child */
+    child = children;
+    children = child->next_sibling;
+    /* find where to enqueue it */
+    prev = &parent->first_child;
+    while (*prev && hwloc__object_cpusets_compare_first(child, *prev) > 0)
+      prev = &((*prev)->next_sibling);
+    /* enqueue */
+    child->next_sibling = *prev;
+    *prev = child;
+  }
+  /* No ordering to enforce for Misc or I/O children. */
+}
+
+/* Remove all normal children whose cpuset is empty,
+ * and memory children whose nodeset is empty.
+ * Also don't remove objects that have I/O children, but ignore Misc.
+ */
+static void
+remove_empty(hwloc_topology_t topology, hwloc_obj_t *pobj)
+{
+  hwloc_obj_t obj = *pobj, child, *pchild;
+
+  for_each_child_safe(child, obj, pchild)
+    remove_empty(topology, pchild);
+  for_each_memory_child_safe(child, obj, pchild)
+    remove_empty(topology, pchild);
+  /* No cpuset under I/O or Misc */
+
+  if (obj->first_child /* only remove if all children were removed above, so that we don't remove parents of NUMAnode */
+      || obj->memory_first_child /* only remove if no memory attached there */
+      || obj->io_first_child /* only remove if no I/O is attached there */)
+    /* ignore Misc */
+    return;
+
+  if (hwloc__obj_type_is_normal(obj->type)) {
+    if (!hwloc_bitmap_iszero(obj->cpuset))
+      return;
+  } else {
+    assert(hwloc__obj_type_is_memory(obj->type));
+    if (!hwloc_bitmap_iszero(obj->nodeset))
+      return;
+  }
+
+  hwloc_debug("%s", "\nRemoving empty object ");
+  hwloc_debug_print_object(0, obj);
+  unlink_and_free_single_object(pobj);
+  topology->modified = 1;
+}
+
+/* reset type depth before modifying levels (either reconnecting or filtering/keep_structure) */
+static void
+hwloc_reset_normal_type_depths(hwloc_topology_t topology)
+{
+  unsigned i;
+  for (i=HWLOC_OBJ_TYPE_MIN; i<=HWLOC_OBJ_GROUP; i++)
+    topology->type_depth[i] = HWLOC_TYPE_DEPTH_UNKNOWN;
+  /* type contiguity is asserted in topology_check() */
+}
+
+static int
+hwloc_dont_merge_group_level(hwloc_topology_t topology, unsigned i)
+{
+  unsigned j;
+
+  /* Don't merge some groups in that level? */
+  for(j=0; j<topology->level_nbobjects[i]; j++)
+    if (topology->levels[i][j]->attr->group.dont_merge)
+      return 1;
+
+  return 0;
+}
+
+/* compare i-th and i-1-th levels structure */
+static int
+hwloc_compare_levels_structure(hwloc_topology_t topology, unsigned i)
+{
+  int checkmemory = (topology->levels[i][0]->type == HWLOC_OBJ_PU);
+  unsigned j;
+
+  if (topology->level_nbobjects[i-1] != topology->level_nbobjects[i])
+    return -1;
+
+  for(j=0; j<topology->level_nbobjects[i]; j++) {
+    if (topology->levels[i-1][j]->arity != 1)
+      return -1;
+    if (checkmemory && topology->levels[i-1][j]->memory_arity)
+      /* don't merge PUs if there's memory above */
+      return -1;
+  }
+  /* same number of objects with arity 1 above, no problem */
+  return 0;
+}
+
+/* return > 0 if any level was removed, which means reconnect is needed */
+static void
+hwloc_filter_levels_keep_structure(hwloc_topology_t topology)
+{
+  unsigned i, j;
+  int res = 0;
+
+  /* start from the bottom since we'll remove intermediate levels */
+  for(i=topology->nb_levels-1; i>0; i--) {
+    int replacechild = 0, replaceparent = 0;
+    hwloc_obj_t obj1 = topology->levels[i-1][0];
+    hwloc_obj_t obj2 = topology->levels[i][0];
+    hwloc_obj_type_t type1 = obj1->type;
+    hwloc_obj_type_t type2 = obj2->type;
+
+    /* Check whether parents and/or children can be replaced */
+    if (topology->type_filter[type1] == HWLOC_TYPE_FILTER_KEEP_STRUCTURE) {
+      /* Parents can be ignored in favor of children.  */
+      replaceparent = 1;
+      if (type1 == HWLOC_OBJ_GROUP && hwloc_dont_merge_group_level(topology, i-1))
+	replaceparent = 0;
+    }
+    if (topology->type_filter[type2] == HWLOC_TYPE_FILTER_KEEP_STRUCTURE) {
+      /* Children can be ignored in favor of parents.  */
+      replacechild = 1;
+      if (type1 == HWLOC_OBJ_GROUP && hwloc_dont_merge_group_level(topology, i))
+	replacechild = 0;
+    }
+    if (!replacechild && !replaceparent)
+      /* no ignoring */
+      continue;
+    /* Decide which one to actually replace */
+    if (replaceparent && replacechild) {
+      /* If both may be replaced, look at obj_type_priority */
+      if (obj_type_priority[type1] >= obj_type_priority[type2])
+	replaceparent = 0;
+      else
+	replacechild = 0;
+    }
+    /* Are these levels actually identical? */
+    if (hwloc_compare_levels_structure(topology, i) < 0)
+      continue;
+    hwloc_debug("may merge levels #%u=%s and #%u=%s\n",
+		i-1, hwloc_obj_type_string(type1), i, hwloc_obj_type_string(type2));
+
+    /* OK, remove intermediate objects from the tree. */
+    for(j=0; j<topology->level_nbobjects[i]; j++) {
+      hwloc_obj_t parent = topology->levels[i-1][j];
+      hwloc_obj_t child = topology->levels[i][j];
+      unsigned k;
+      if (replacechild) {
+	/* move child's children to parent */
+	parent->first_child = child->first_child;
+	parent->last_child = child->last_child;
+	parent->arity = child->arity;
+	free(parent->children);
+	parent->children = child->children;
+	child->children = NULL;
+	/* update children parent */
+	for(k=0; k<parent->arity; k++)
+	  parent->children[k]->parent = parent;
+	/* append child memory/io/misc children to parent */
+	if (child->memory_first_child) {
+	  append_siblings_list(&parent->memory_first_child, child->memory_first_child, parent);
+	  parent->memory_arity += child->memory_arity;
+	}
+	if (child->io_first_child) {
+	  append_siblings_list(&parent->io_first_child, child->io_first_child, parent);
+	  parent->io_arity += child->io_arity;
+	}
+	if (child->misc_first_child) {
+	  append_siblings_list(&parent->misc_first_child, child->misc_first_child, parent);
+	  parent->misc_arity += child->misc_arity;
+	}
+	hwloc_free_unlinked_object(child);
+      } else {
+	/* replace parent with child in grand-parent */
+	if (parent->parent) {
+	  parent->parent->children[parent->sibling_rank] = child;
+	  child->sibling_rank = parent->sibling_rank;
+	  if (!parent->sibling_rank) {
+	    parent->parent->first_child = child;
+	    /* child->prev_sibling was already NULL, child was single */
+	  } else {
+	    child->prev_sibling = parent->parent->children[parent->sibling_rank-1];
+	    child->prev_sibling->next_sibling = child;
+	  }
+	  if (parent->sibling_rank == parent->parent->arity-1) {
+	    parent->parent->last_child = child;
+	    /* child->next_sibling was already NULL, child was single */
+	  } else {
+	    child->next_sibling = parent->parent->children[parent->sibling_rank+1];
+	    child->next_sibling->prev_sibling = child;
+	  }
+	  /* update child parent */
+	  child->parent = parent->parent;
+	} else {
+	  /* make child the new root */
+	  topology->levels[0][0] = child;
+	  child->parent = NULL;
+	}
+	/* prepend parent memory/io/misc children to child */
+	if (parent->memory_first_child) {
+	  prepend_siblings_list(&child->memory_first_child, parent->memory_first_child, child);
+	  child->memory_arity += parent->memory_arity;
+	}
+	if (parent->io_first_child) {
+	  prepend_siblings_list(&child->io_first_child, parent->io_first_child, child);
+	  child->io_arity += parent->io_arity;
+	}
+	if (parent->misc_first_child) {
+	  prepend_siblings_list(&child->misc_first_child, parent->misc_first_child, child);
+	  child->misc_arity += parent->misc_arity;
+	}
+	hwloc_free_unlinked_object(parent);
+	/* prev/next_sibling will be updated below in another loop */
+      }
+    }
+    if (replaceparent && i>1) {
+      /* Update sibling list within modified parent->parent arrays */
+      for(j=0; j<topology->level_nbobjects[i]; j++) {
+	hwloc_obj_t child = topology->levels[i][j];
+	unsigned rank = child->sibling_rank;
+	child->prev_sibling = rank > 0 ? child->parent->children[rank-1] : NULL;
+	child->next_sibling = rank < child->parent->arity-1 ? child->parent->children[rank+1] : NULL;
+      }
+    }
+
+    /* Update levels so that the next reconnect isn't confused */
+    if (replaceparent) {
+      /* Removing level i-1, so move levels [i..nb_levels-1] to [i-1..] */
+      free(topology->levels[i-1]);
+      memmove(&topology->levels[i-1],
+	      &topology->levels[i],
+	      (topology->nb_levels-i)*sizeof(topology->levels[i]));
+      memmove(&topology->level_nbobjects[i-1],
+	      &topology->level_nbobjects[i],
+	      (topology->nb_levels-i)*sizeof(topology->level_nbobjects[i]));
+      hwloc_debug("removed parent level %s at depth %u\n",
+		  hwloc_obj_type_string(type1), i-1);
+    } else {
+      /* Removing level i, so move levels [i+1..nb_levels-1] and later to [i..] */
+      free(topology->levels[i]);
+      memmove(&topology->levels[i],
+	      &topology->levels[i+1],
+	      (topology->nb_levels-1-i)*sizeof(topology->levels[i]));
+      memmove(&topology->level_nbobjects[i],
+	      &topology->level_nbobjects[i+1],
+	      (topology->nb_levels-1-i)*sizeof(topology->level_nbobjects[i]));
+      hwloc_debug("removed child level %s at depth %u\n",
+		  hwloc_obj_type_string(type2), i);
+    }
+    topology->level_nbobjects[topology->nb_levels-1] = 0;
+    topology->levels[topology->nb_levels-1] = NULL;
+    topology->nb_levels--;
+
+    res++;
+  }
+
+  if (res > 0) {
+    /* Update object and type depths if some levels were removed */
+    hwloc_reset_normal_type_depths(topology);
+    for(i=0; i<topology->nb_levels; i++) {
+      hwloc_obj_type_t type = topology->levels[i][0]->type;
+      for(j=0; j<topology->level_nbobjects[i]; j++)
+	topology->levels[i][j]->depth = (int)i;
+      if (topology->type_depth[type] == HWLOC_TYPE_DEPTH_UNKNOWN)
+	topology->type_depth[type] = (int)i;
+      else
+	topology->type_depth[type] = HWLOC_TYPE_DEPTH_MULTIPLE;
+    }
+  }
+}
+
/* Compute root->symmetric_subtree: set to 1 when every subtree below root
 * has identical structure (same depths and arities when walking down the
 * first child of each sibling), so that distributing work over the subtree
 * is straightforward. Recurses over normal children only.
 */
static void
hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root)
{
  hwloc_obj_t child;
  unsigned arity = root->arity;
  int ok;

  /* assume we're not symmetric by default */
  root->symmetric_subtree = 0;

  /* if no child, we are symmetric */
  if (!arity)
    goto good;

  /* FIXME ignore memory just like I/O and Misc? */

  /* look at normal children only, I/O and Misc are ignored.
   * return if any child is not symmetric.
   */
  ok = 1;
  for_each_child(child, root) {
    hwloc_propagate_symmetric_subtree(topology, child);
    if (!child->symmetric_subtree)
      ok = 0;
  }
  if (!ok)
    return;
  /* Misc and I/O children do not care about symmetric_subtree */

  /* if single child is symmetric, we're good */
  if (arity == 1)
    goto good;

  /* now check that children subtrees are identical.
   * just walk down the first child in each tree and compare their depth and arities
   */
{
  /* block scope so the VLA is only live for this comparison */
  HWLOC_VLA(hwloc_obj_t, array, arity);
  memcpy(array, root->children, arity * sizeof(*array));
  while (1) {
    unsigned i;
    /* check current level arities and depth;
     * NOTE: the braces below belong to the if — any mismatch with
     * array[0] returns immediately, leaving symmetric_subtree == 0 */
    for(i=1; i<arity; i++)
      if (array[i]->depth != array[0]->depth
	  || array[i]->arity != array[0]->arity) {
      return;
    }
    if (!array[0]->arity)
      /* no more children level, we're ok */
      break;
    /* look at first child of each element now */
    for(i=0; i<arity; i++)
      array[i] = array[i]->first_child;
  }
}

  /* everything went fine, we're symmetric */
 good:
  root->symmetric_subtree = 1;
}
+
+static void hwloc_set_group_depth(hwloc_topology_t topology)
+{
+  unsigned groupdepth = 0;
+  unsigned i, j;
+  for(i=0; i<topology->nb_levels; i++)
+    if (topology->levels[i][0]->type == HWLOC_OBJ_GROUP) {
+      for (j = 0; j < topology->level_nbobjects[i]; j++)
+	topology->levels[i][j]->attr->group.depth = groupdepth;
+      groupdepth++;
+    }
+}
+
/*
 * Initialize handy pointers in the whole topology.
 * The topology only had first_child and next_sibling pointers.
 * When this function returns, all parent/children pointers are initialized.
 * The remaining fields (levels, cousins, logical_index, depth, ...) will
 * be setup later in hwloc_connect_levels().
 *
 * Can be called several times, so may have to update the array.
 */
static void
hwloc_connect_children(hwloc_obj_t parent)
{
  unsigned n, oldn = parent->arity;
  hwloc_obj_t child, prev_child;
  int ok;

  /* Main children list */

  /* `ok` tracks whether the existing children array already matches the
   * sibling list, so we can skip reallocating/refilling it */
  ok = 1;
  prev_child = NULL;
  for (n = 0, child = parent->first_child;
       child;
       n++,   prev_child = child, child = child->next_sibling) {
    child->sibling_rank = n;
    child->prev_sibling = prev_child;
    /* already OK in the array? */
    if (n >= oldn || parent->children[n] != child)
      ok = 0;
    /* recurse */
    hwloc_connect_children(child);
  }
  parent->last_child = prev_child;
  parent->arity = n;
  if (!n) {
    /* no need for an array anymore */
    free(parent->children);
    parent->children = NULL;
    goto memory;
  }
  if (ok)
    /* array is already OK (even if too large) */
    goto memory;

  /* alloc a larger array if needed */
  if (oldn < n) {
    free(parent->children);
    parent->children = malloc(n * sizeof(*parent->children));
  }
  /* refill */
  for (n = 0, child = parent->first_child;
       child;
       n++,   child = child->next_sibling) {
    parent->children[n] = child;
  }



 memory:
  /* Memory children list: no children array, only sibling links,
   * ranks and parent pointers to set up */

  prev_child = NULL;
  for (n = 0, child = parent->memory_first_child;
       child;
       n++,   prev_child = child, child = child->next_sibling) {
    child->parent = parent;
    child->sibling_rank = n;
    child->prev_sibling = prev_child;
    hwloc_connect_children(child);
  }
  parent->memory_arity = n;

  /* I/O children list */

  prev_child = NULL;
  for (n = 0, child = parent->io_first_child;
       child;
       n++,   prev_child = child, child = child->next_sibling) {
    child->parent = parent;
    child->sibling_rank = n;
    child->prev_sibling = prev_child;
    hwloc_connect_children(child);
  }
  parent->io_arity = n;

  /* Misc children list */

  prev_child = NULL;
  for (n = 0, child = parent->misc_first_child;
       child;
       n++,   prev_child = child, child = child->next_sibling) {
    child->parent = parent;
    child->sibling_rank = n;
    child->prev_sibling = prev_child;
    hwloc_connect_children(child);
  }
  parent->misc_arity = n;
}
+
+/*
+ * Check whether there is an object below ROOT that has the same type as OBJ
+ */
+static int
+find_same_type(hwloc_obj_t root, hwloc_obj_t obj)
+{
+  hwloc_obj_t child;
+
+  if (hwloc_type_cmp(root, obj) == HWLOC_OBJ_EQUAL)
+    return 1;
+
+  for_each_child (child, root)
+    if (find_same_type(child, obj))
+      return 1;
+
+  return 0;
+}
+
+/* traverse the array of current object and compare them with top_obj.
+ * if equal, take the object and put its children into the remaining objs.
+ * if not equal, put the object into the remaining objs.
+ */
+static unsigned
+hwloc_level_take_objects(hwloc_obj_t top_obj,
+			 hwloc_obj_t *current_objs, unsigned n_current_objs,
+			 hwloc_obj_t *taken_objs, unsigned n_taken_objs __hwloc_attribute_unused,
+			 hwloc_obj_t *remaining_objs, unsigned n_remaining_objs __hwloc_attribute_unused)
+{
+  unsigned taken_i = 0;
+  unsigned new_i = 0;
+  unsigned i, j;
+
+  for (i = 0; i < n_current_objs; i++)
+    if (hwloc_type_cmp(top_obj, current_objs[i]) == HWLOC_OBJ_EQUAL) {
+      /* Take it, add main children.  */
+      taken_objs[taken_i++] = current_objs[i];
+      for (j = 0; j < current_objs[i]->arity; j++)
+	remaining_objs[new_i++] = current_objs[i]->children[j];
+    } else {
+      /* Leave it.  */
+      remaining_objs[new_i++] = current_objs[i];
+    }
+
+#ifdef HWLOC_DEBUG
+  /* Make sure we didn't mess up.  */
+  assert(taken_i == n_taken_objs);
+  assert(new_i == n_current_objs - n_taken_objs + n_remaining_objs);
+#endif
+
+  return new_i;
+}
+
+static int
+hwloc_build_level_from_list(struct hwloc_special_level_s *slevel)
+{
+  unsigned i, nb;
+  struct hwloc_obj * obj;
+
+  /* count */
+  obj = slevel->first;
+  i = 0;
+  while (obj) {
+    i++;
+    obj = obj->next_cousin;
+  }
+  nb = i;
+
+  if (nb) {
+    /* allocate and fill level */
+    slevel->objs = malloc(nb * sizeof(struct hwloc_obj *));
+    obj = slevel->first;
+    i = 0;
+    while (obj) {
+      obj->logical_index = i;
+      slevel->objs[i] = obj;
+      i++;
+      obj = obj->next_cousin;
+    }
+  }
+
+  slevel->nbobjs = nb;
+  return 0;
+}
+
+static void
+hwloc_append_special_object(struct hwloc_special_level_s *level, hwloc_obj_t obj)
+{
+  if (level->first) {
+    obj->prev_cousin = level->last;
+    obj->prev_cousin->next_cousin = obj;
+    level->last = obj;
+  } else {
+    obj->prev_cousin = NULL;
+    level->first = level->last = obj;
+  }
+}
+
+/* Append special objects to their lists */
+static void
+hwloc_list_special_objects(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  hwloc_obj_t child;
+
+  if (obj->type == HWLOC_OBJ_NUMANODE) {
+    obj->next_cousin = NULL;
+    obj->depth = HWLOC_TYPE_DEPTH_NUMANODE;
+    /* Insert the main NUMA node list */
+    hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_NUMANODE], obj);
+
+    /* Recurse */
+    for_each_memory_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+    for_each_misc_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+
+  } else if (obj->type == HWLOC_OBJ_MISC) {
+    obj->next_cousin = NULL;
+    obj->depth = HWLOC_TYPE_DEPTH_MISC;
+    /* Insert the main Misc list */
+    hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_MISC], obj);
+    /* Recurse, Misc only have Misc children */
+    for_each_misc_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+
+  } else if (hwloc__obj_type_is_io(obj->type)) {
+    obj->next_cousin = NULL;
+
+    if (obj->type == HWLOC_OBJ_BRIDGE) {
+      obj->depth = HWLOC_TYPE_DEPTH_BRIDGE;
+      /* Insert in the main bridge list */
+      hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_BRIDGE], obj);
+
+    } else if (obj->type == HWLOC_OBJ_PCI_DEVICE) {
+      obj->depth = HWLOC_TYPE_DEPTH_PCI_DEVICE;
+      /* Insert in the main pcidev list */
+      hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_PCIDEV], obj);
+
+    } else if (obj->type == HWLOC_OBJ_OS_DEVICE) {
+      obj->depth = HWLOC_TYPE_DEPTH_OS_DEVICE;
+      /* Insert in the main osdev list */
+      hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_OSDEV], obj);
+    }
+    /* Recurse, I/O only have I/O and Misc children */
+    for_each_io_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+    for_each_misc_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+
+  } else {
+    /* Recurse */
+    for_each_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+    for_each_memory_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+    for_each_io_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+    for_each_misc_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+  }
+}
+
+/* Build I/O levels */
+static void
+hwloc_connect_io_misc_levels(hwloc_topology_t topology)
+{
+  unsigned i;
+
+  for(i=0; i<HWLOC_NR_SLEVELS; i++)
+    free(topology->slevels[i].objs);
+  memset(&topology->slevels, 0, sizeof(topology->slevels));
+
+  hwloc_list_special_objects(topology, topology->levels[0][0]);
+
+  for(i=0; i<HWLOC_NR_SLEVELS; i++)
+    hwloc_build_level_from_list(&topology->slevels[i]);
+}
+
+/*
+ * Do the remaining work that hwloc_connect_children() did not do earlier.
+ * Requires object arity and children list to be properly initialized (by hwloc_connect_children()).
+ */
+static int
+hwloc_connect_levels(hwloc_topology_t topology)
+{
+  unsigned l, i=0;
+  hwloc_obj_t *objs, *taken_objs, *new_objs, top_obj, root;
+  unsigned n_objs, n_taken_objs, n_new_objs;
+
+  /* reset non-root levels (root was initialized during init and will not change here) */
+  for(l=1; l<topology->nb_levels; l++)
+    free(topology->levels[l]);
+  memset(topology->levels+1, 0, (topology->nb_levels-1)*sizeof(*topology->levels));
+  memset(topology->level_nbobjects+1, 0, (topology->nb_levels-1)*sizeof(*topology->level_nbobjects));
+  topology->nb_levels = 1;
+
+  /* initialize all non-IO/non-Misc depths to unknown */
+  hwloc_reset_normal_type_depths(topology);
+
+  /* initialize root type depth */
+  root = topology->levels[0][0];
+  root->depth = 0;
+  topology->type_depth[root->type] = 0;
+  /* root level */
+  root->logical_index = 0;
+  root->prev_cousin = NULL;
+  root->next_cousin = NULL;
+  /* root as a child of nothing */
+  root->parent = NULL;
+  root->sibling_rank = 0;
+  root->prev_sibling = NULL;
+  root->next_sibling = NULL;
+
+  /* Start with children of the whole system.  */
+  n_objs = topology->levels[0][0]->arity;
+  objs = malloc(n_objs * sizeof(objs[0]));
+  if (!objs) {
+    errno = ENOMEM;
+    return -1;
+  }
+  memcpy(objs, topology->levels[0][0]->children, n_objs*sizeof(objs[0]));
+
+  /* Keep building levels while there are objects left in OBJS.  */
+  while (n_objs) {
+    /* At this point, the objs array contains only objects that may go into levels */
+
+    /* First find which type of object is the topmost.
+     * Don't use PU if there are other types since we want to keep PU at the bottom.
+     */
+
+    /* Look for the first non-PU object, and use the first PU if we really find nothing else */
+    for (i = 0; i < n_objs; i++)
+      if (objs[i]->type != HWLOC_OBJ_PU)
+        break;
+    top_obj = i == n_objs ? objs[0] : objs[i];
+
+    /* See if this is actually the topmost object */
+    for (i = 0; i < n_objs; i++) {
+      if (hwloc_type_cmp(top_obj, objs[i]) != HWLOC_OBJ_EQUAL) {
+	if (find_same_type(objs[i], top_obj)) {
+	  /* OBJS[i] is strictly above an object of the same type as TOP_OBJ, so it
+	   * is above TOP_OBJ.  */
+	  top_obj = objs[i];
+	}
+      }
+    }
+
+    /* Now peek all objects of the same type, build a level with that and
+     * replace them with their children.  */
+
+    /* First count them.  */
+    n_taken_objs = 0;
+    n_new_objs = 0;
+    for (i = 0; i < n_objs; i++)
+      if (hwloc_type_cmp(top_obj, objs[i]) == HWLOC_OBJ_EQUAL) {
+	n_taken_objs++;
+	n_new_objs += objs[i]->arity;
+      }
+
+    /* New level.  */
+    taken_objs = malloc((n_taken_objs + 1) * sizeof(taken_objs[0]));
+    /* New list of pending objects.  */
+    if (n_objs - n_taken_objs + n_new_objs) {
+      new_objs = malloc((n_objs - n_taken_objs + n_new_objs) * sizeof(new_objs[0]));
+    } else {
+#ifdef HWLOC_DEBUG
+      assert(!n_new_objs);
+      assert(n_objs == n_taken_objs);
+#endif
+      new_objs = NULL;
+    }
+
+    n_new_objs = hwloc_level_take_objects(top_obj,
+					  objs, n_objs,
+					  taken_objs, n_taken_objs,
+					  new_objs, n_new_objs);
+
+    /* Ok, put numbers in the level and link cousins.  */
+    for (i = 0; i < n_taken_objs; i++) {
+      taken_objs[i]->depth = (int) topology->nb_levels;
+      taken_objs[i]->logical_index = i;
+      if (i) {
+	taken_objs[i]->prev_cousin = taken_objs[i-1];
+	taken_objs[i-1]->next_cousin = taken_objs[i];
+      }
+    }
+    taken_objs[0]->prev_cousin = NULL;
+    taken_objs[n_taken_objs-1]->next_cousin = NULL;
+
+    /* One more level!  */
+    hwloc_debug("--- %s level", hwloc_obj_type_string(top_obj->type));
+    hwloc_debug(" has number %u\n\n", topology->nb_levels);
+
+    if (topology->type_depth[top_obj->type] == HWLOC_TYPE_DEPTH_UNKNOWN)
+      topology->type_depth[top_obj->type] = (int) topology->nb_levels;
+    else
+      topology->type_depth[top_obj->type] = HWLOC_TYPE_DEPTH_MULTIPLE; /* mark as unknown */
+
+    taken_objs[n_taken_objs] = NULL;
+
+    if (topology->nb_levels == topology->nb_levels_allocated) {
+      /* extend the arrays of levels */
+      void *tmplevels, *tmpnbobjs;
+      tmplevels = realloc(topology->levels,
+			  2 * topology->nb_levels_allocated * sizeof(*topology->levels));
+      tmpnbobjs = realloc(topology->level_nbobjects,
+			  2 * topology->nb_levels_allocated * sizeof(*topology->level_nbobjects));
+      if (!tmplevels || !tmpnbobjs) {
+	fprintf(stderr, "hwloc failed to realloc level arrays to %u\n", topology->nb_levels_allocated * 2);
+
+	/* if one realloc succeeded, make sure the caller will free the new buffer */
+	if (tmplevels)
+	  topology->levels = tmplevels;
+	if (tmpnbobjs)
+	  topology->level_nbobjects = tmpnbobjs;
+	/* the realloc that failed left topology->level_foo untouched, will be freed by the caller */
+
+	free(objs);
+	free(taken_objs);
+	free(new_objs);
+	errno = ENOMEM;
+	return -1;
+      }
+      topology->levels = tmplevels;
+      topology->level_nbobjects = tmpnbobjs;
+      memset(topology->levels + topology->nb_levels_allocated,
+	     0, topology->nb_levels_allocated * sizeof(*topology->levels));
+      memset(topology->level_nbobjects + topology->nb_levels_allocated,
+	     0, topology->nb_levels_allocated * sizeof(*topology->level_nbobjects));
+      topology->nb_levels_allocated *= 2;
+    }
+    /* add the new level */
+    topology->level_nbobjects[topology->nb_levels] = n_taken_objs;
+    topology->levels[topology->nb_levels] = taken_objs;
+
+    topology->nb_levels++;
+
+    free(objs);
+
+    /* Switch to new_objs */
+    objs = new_objs;
+    n_objs = n_new_objs;
+  }
+
+  /* It's empty now.  */
+  free(objs);
+
+  return 0;
+}
+
+/* Rebuild all inter-object pointers (children arrays, levels, cousins)
+ * after objects were added or removed.
+ * No flags are currently defined; non-zero flags fail with EINVAL.
+ * Returns 0 on success, -1 on error.
+ * NOTE(review): the call order below matters — children must be connected
+ * before levels can be built, and I/O+Misc levels come last.
+ */
+int
+hwloc_topology_reconnect(struct hwloc_topology *topology, unsigned long flags)
+{
+  if (flags) {
+    errno = EINVAL;
+    return -1;
+  }
+  /* Fast path: nothing changed since the last reconnect. */
+  if (!topology->modified)
+    return 0;
+
+  hwloc_connect_children(topology->levels[0][0]);
+
+  if (hwloc_connect_levels(topology) < 0)
+    return -1;
+
+  hwloc_connect_io_misc_levels(topology);
+
+  topology->modified = 0;
+
+  return 0;
+}
+
+void hwloc_alloc_root_sets(hwloc_obj_t root)
+{
+  /* Ensure the root object has all four sets allocated.
+   *
+   * All sets start out NULL; at least one backend must call this before
+   * inserting objects.  Any set that is already allocated (e.g. partially
+   * imported from XML) is left untouched, so a backend may also test
+   * root->cpuset != NULL to detect that somebody discovered things first.
+   */
+  if (!root->cpuset)
+    root->cpuset = hwloc_bitmap_alloc();
+  if (!root->complete_cpuset)
+    root->complete_cpuset = hwloc_bitmap_alloc();
+  if (!root->nodeset)
+    root->nodeset = hwloc_bitmap_alloc();
+  if (!root->complete_nodeset)
+    root->complete_nodeset = hwloc_bitmap_alloc();
+}
+
+/* Main discovery loop: run CPU/global backends, sanitize the root sets,
+ * run the remaining (I/O, Misc) backends, then filter objects and
+ * reconnect levels.  Returns 0 on success, -1 (with errno set) on error. */
+static int
+hwloc_discover(struct hwloc_topology *topology)
+{
+  struct hwloc_backend *backend;
+
+  topology->modified = 0; /* no need to reconnect yet */
+
+  topology->allowed_cpuset = hwloc_bitmap_alloc_full();
+  topology->allowed_nodeset = hwloc_bitmap_alloc_full();
+
+  /* discover() callbacks should use hwloc_insert to add objects initialized
+   * through hwloc_alloc_setup_object.
+   * For node levels, nodeset and memory must be initialized.
+   * For cache levels, memory and type/depth must be initialized.
+   * For group levels, depth must be initialized.
+   */
+
+  /* There must be at least a PU object for each logical processor, at worst
+   * produced by hwloc_setup_pu_level()
+   */
+
+  /* To be able to just use hwloc_insert_object_by_cpuset to insert the object
+   * in the topology according to the cpuset, the cpuset field must be
+   * initialized.
+   */
+
+  /* A priori, all processors are visible in the topology, and allowed
+   * for the application.
+   *
+   * - If some processors exist but topology information is unknown for them
+   *   (and thus the backend couldn't create objects for them), they should be
+   *   added to the complete_cpuset field of the lowest object where the object
+   *   could reside.
+   *
+   * - If some processors are not allowed for the application (e.g. for
+   *   administration reasons), they should be dropped from the allowed_cpuset
+   *   field.
+   *
+   * The same applies to the node sets complete_nodeset and allowed_nodeset.
+   *
+   * If such field doesn't exist yet, it can be allocated, and initialized to
+   * zero (for complete), or to full (for allowed). The values are
+   * automatically propagated to the whole tree after detection.
+   */
+
+  /*
+   * Discover CPUs first
+   */
+  backend = topology->backends;
+  while (NULL != backend) {
+    if (backend->component->type != HWLOC_DISC_COMPONENT_TYPE_CPU
+	&& backend->component->type != HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+      /* not yet */
+      goto next_cpubackend;
+    if (!backend->discover)
+      goto next_cpubackend;
+    backend->discover(backend);
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+next_cpubackend:
+    backend = backend->next;
+  }
+
+  /* One backend should have called hwloc_alloc_root_sets()
+   * and set bits during PU and NUMA insert.
+   */
+  if (!topology->levels[0][0]->cpuset || hwloc_bitmap_iszero(topology->levels[0][0]->cpuset)) {
+    hwloc_debug("%s", "No PU added by any CPU and global backend\n");
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (topology->binding_hooks.get_allowed_resources && topology->is_thissystem) {
+    const char *env = getenv("HWLOC_THISSYSTEM_ALLOWED_RESOURCES");
+    if ((env && atoi(env))
+	|| (topology->flags & HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES))
+      topology->binding_hooks.get_allowed_resources(topology);
+  }
+
+  /* If there's no NUMA node, add one with all the memory.
+   * root->complete_nodeset wouldn't be empty if any NUMA was ever added:
+   * - insert_by_cpuset() adds bits when PU/NUMA are added.
+   * - XML takes care of sanitizing nodesets.
+   */
+  if (hwloc_bitmap_iszero(topology->levels[0][0]->complete_nodeset)) {
+    hwloc_obj_t node;
+    hwloc_debug("%s", "\nAdd missing single NUMA node\n");
+    node = hwloc_alloc_setup_object(topology, HWLOC_OBJ_NUMANODE, 0);
+    node->cpuset = hwloc_bitmap_dup(topology->levels[0][0]->cpuset);
+    node->nodeset = hwloc_bitmap_alloc();
+    /* other nodesets will be filled below */
+    hwloc_bitmap_set(node->nodeset, 0);
+    memcpy(&node->attr->numanode, &topology->machine_memory, sizeof(topology->machine_memory));
+    memset(&topology->machine_memory, 0, sizeof(topology->machine_memory));
+    hwloc_insert_object_by_cpuset(topology, node);
+  } else {
+    /* if we're sure we found all NUMA nodes without their sizes (x86 backend?),
+     * we could split topology->total_memory in all of them.
+     */
+    free(topology->machine_memory.page_types);
+    memset(&topology->machine_memory, 0, sizeof(topology->machine_memory));
+  }
+
+  hwloc_debug("%s", "\nFixup root sets\n");
+  hwloc_bitmap_and(topology->levels[0][0]->cpuset, topology->levels[0][0]->cpuset, topology->levels[0][0]->complete_cpuset);
+  hwloc_bitmap_and(topology->levels[0][0]->nodeset, topology->levels[0][0]->nodeset, topology->levels[0][0]->complete_nodeset);
+
+  hwloc_bitmap_and(topology->allowed_cpuset, topology->allowed_cpuset, topology->levels[0][0]->cpuset);
+  hwloc_bitmap_and(topology->allowed_nodeset, topology->allowed_nodeset, topology->levels[0][0]->nodeset);
+
+  hwloc_debug("%s", "\nPropagate sets\n");
+  /* cpuset are already there thanks to the _by_cpuset insertion,
+   * but nodeset have to be propagated below and above NUMA nodes
+   */
+  propagate_nodeset(topology->levels[0][0]);
+  /* now fixup parent/children sets */
+  fixup_sets(topology->levels[0][0]);
+
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM)) {
+    hwloc_debug("%s", "\nRemoving unauthorized sets from all sets\n");
+    remove_unused_sets(topology, topology->levels[0][0]);
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+  }
+
+  /* see if we should ignore the root now that we know how many children it has */
+  if (!hwloc_filter_check_keep_object(topology, topology->levels[0][0])
+      && topology->levels[0][0]->first_child && !topology->levels[0][0]->first_child->next_sibling) {
+    hwloc_obj_t oldroot = topology->levels[0][0];
+    hwloc_obj_t newroot = oldroot->first_child;
+    /* switch to the new root */
+    newroot->parent = NULL;
+    topology->levels[0][0] = newroot;
+    /* move oldroot memory/io/misc children before newroot children */
+    if (oldroot->memory_first_child)
+      prepend_siblings_list(&newroot->memory_first_child, oldroot->memory_first_child, newroot);
+    if (oldroot->io_first_child)
+      prepend_siblings_list(&newroot->io_first_child, oldroot->io_first_child, newroot);
+    if (oldroot->misc_first_child)
+      prepend_siblings_list(&newroot->misc_first_child, oldroot->misc_first_child, newroot);
+    /* destroy oldroot and use the new one */
+    hwloc_free_unlinked_object(oldroot);
+  }
+
+  /*
+   * All object cpusets and nodesets are properly set now.
+   */
+
+  /* Now connect handy pointers to make remaining discovery easier. */
+  hwloc_debug("%s", "\nOk, finished tweaking, now connect\n");
+  if (hwloc_topology_reconnect(topology, 0) < 0)
+    return -1;
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  /*
+   * Additional discovery with other backends
+   */
+
+  backend = topology->backends;
+  while (NULL != backend) {
+    if (backend->component->type == HWLOC_DISC_COMPONENT_TYPE_CPU
+	|| backend->component->type == HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+      /* already done above */
+      goto next_noncpubackend;
+    if (!backend->discover)
+      goto next_noncpubackend;
+    backend->discover(backend);
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+next_noncpubackend:
+    backend = backend->next;
+  }
+
+  hwloc_pci_belowroot_apply_locality(topology);
+
+  hwloc_debug("%s", "\nNow reconnecting\n");
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  /* Remove some stuff */
+
+  hwloc_debug("%s", "\nRemoving bridge objects if needed\n");
+  hwloc_filter_bridges(topology, topology->levels[0][0]);
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  hwloc_debug("%s", "\nRemoving empty objects\n");
+  remove_empty(topology, &topology->levels[0][0]);
+  if (!topology->levels[0][0]) {
+    fprintf(stderr, "Topology became empty, aborting!\n");
+    return -1;
+  }
+  if (hwloc_bitmap_iszero(topology->levels[0][0]->cpuset)) {
+    fprintf(stderr, "Topology does not contain any PU, aborting!\n");
+    return -1;
+  }
+  if (hwloc_bitmap_iszero(topology->levels[0][0]->nodeset)) {
+    fprintf(stderr, "Topology does not contain any NUMA node, aborting!\n");
+    return -1;
+  }
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  /* Reconnect things after all these changes.
+   * Often needed because of Groups inserted for I/Os.
+   * And required for KEEP_STRUCTURE below.
+   */
+  if (hwloc_topology_reconnect(topology, 0) < 0)
+    return -1;
+
+  hwloc_debug("%s", "\nRemoving levels with HWLOC_TYPE_FILTER_KEEP_STRUCTURE\n");
+  hwloc_filter_levels_keep_structure(topology);
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  /* accumulate children memory in total_memory fields (only once parent is set) */
+  hwloc_debug("%s", "\nPropagate total memory up\n");
+  propagate_total_memory(topology->levels[0][0]);
+
+  /* setup the symmetric_subtree attribute */
+  hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
+
+  /* apply group depths */
+  hwloc_set_group_depth(topology);
+
+  /* add some identification attributes if not loading from XML */
+  if (topology->backends
+      && strcmp(topology->backends->component->name, "xml")) {
+    char *value;
+    /* add a hwlocVersion */
+    hwloc_obj_add_info(topology->levels[0][0], "hwlocVersion", HWLOC_VERSION);
+    /* add a ProcessName */
+    value = hwloc_progname(topology);
+    if (value) {
+      hwloc_obj_add_info(topology->levels[0][0], "ProcessName", value);
+      free(value);
+    }
+  }
+
+  return 0;
+}
+
+/* To be called before discovery is actually launched;
+ * resets everything in case a previous load initialized some stuff.
+ * Leaves the topology with a single Machine root object and sane
+ * type_depth defaults.
+ */
+void
+hwloc_topology_setup_defaults(struct hwloc_topology *topology)
+{
+  struct hwloc_obj *root_obj;
+
+  /* reset support */
+  memset(&topology->binding_hooks, 0, sizeof(topology->binding_hooks));
+  memset(topology->support.discovery, 0, sizeof(*topology->support.discovery));
+  memset(topology->support.cpubind, 0, sizeof(*topology->support.cpubind));
+  memset(topology->support.membind, 0, sizeof(*topology->support.membind));
+
+  /* Only the System object on top by default */
+  topology->next_gp_index = 1; /* keep 0 as an invalid value */
+  topology->nb_levels = 1; /* there's at least SYSTEM */
+  topology->levels[0] = hwloc_tma_malloc (topology->tma, sizeof (hwloc_obj_t)); /* may be a shmem tma allocation */
+  topology->level_nbobjects[0] = 1;
+
+  /* Machine-wide memory */
+  topology->machine_memory.local_memory = 0;
+  topology->machine_memory.page_types_len = 0;
+  topology->machine_memory.page_types = NULL;
+
+  /* Allowed stuff */
+  topology->allowed_cpuset = NULL;
+  topology->allowed_nodeset = NULL;
+
+  /* NULLify other special levels */
+  memset(&topology->slevels, 0, sizeof(topology->slevels));
+  /* assert the indexes of special levels */
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_NUMANODE == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_NUMANODE));
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_MISC == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_MISC));
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_BRIDGE == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_BRIDGE));
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_PCIDEV == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_PCI_DEVICE));
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_OSDEV == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_OS_DEVICE));
+
+  /* sane values to type_depth */
+  hwloc_reset_normal_type_depths(topology);
+  topology->type_depth[HWLOC_OBJ_NUMANODE] = HWLOC_TYPE_DEPTH_NUMANODE;
+  topology->type_depth[HWLOC_OBJ_MISC] = HWLOC_TYPE_DEPTH_MISC;
+  topology->type_depth[HWLOC_OBJ_BRIDGE] = HWLOC_TYPE_DEPTH_BRIDGE;
+  topology->type_depth[HWLOC_OBJ_PCI_DEVICE] = HWLOC_TYPE_DEPTH_PCI_DEVICE;
+  topology->type_depth[HWLOC_OBJ_OS_DEVICE] = HWLOC_TYPE_DEPTH_OS_DEVICE;
+
+  /* Create the actual machine object, but don't touch its attributes yet
+   * since the OS backend may still change the object into something else
+   * (for instance System)
+   */
+  root_obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_MACHINE, 0);
+  topology->levels[0][0] = root_obj;
+}
+
+static void hwloc__topology_filter_init(struct hwloc_topology *topology);
+
+/* This function may use a tma, it cannot free() or realloc().
+ * Internal constructor shared by hwloc_topology_init() and topology
+ * duplication: allocates the topology structure, initializes components,
+ * backends, filters and distances, then installs an empty default tree.
+ * Returns 0 on success, -1 if the topology structure allocation failed.
+ * NOTE(review): allocation failures of the later hwloc_tma_malloc/calloc
+ * calls are not checked here — presumably handled by callers/ABI; confirm
+ * against upstream before changing.
+ */
+static int
+hwloc__topology_init (struct hwloc_topology **topologyp,
+		      unsigned nblevels,
+		      struct hwloc_tma *tma)
+{
+  struct hwloc_topology *topology;
+
+  topology = hwloc_tma_malloc (tma, sizeof (struct hwloc_topology));
+  if(!topology)
+    return -1;
+
+  topology->tma = tma;
+
+  hwloc_components_init(); /* uses malloc without tma, but won't need it since dup() caller already took a reference */
+  hwloc_backends_init(topology);
+  hwloc_pci_discovery_init(topology); /* make sure both dup() and load() get sane variables */
+
+  /* Setup topology context */
+  topology->is_loaded = 0;
+  topology->flags = 0;
+  topology->is_thissystem = 1;
+  topology->pid = 0;
+  topology->userdata = NULL;
+  topology->topology_abi = HWLOC_TOPOLOGY_ABI;
+  topology->adopted_shmem_addr = NULL;
+  topology->adopted_shmem_length = 0;
+
+  topology->support.discovery = hwloc_tma_malloc(tma, sizeof(*topology->support.discovery));
+  topology->support.cpubind = hwloc_tma_malloc(tma, sizeof(*topology->support.cpubind));
+  topology->support.membind = hwloc_tma_malloc(tma, sizeof(*topology->support.membind));
+
+  topology->nb_levels_allocated = nblevels; /* enough for default 9 levels = Mach+Pack+NUMA+L3+L2+L1d+L1i+Co+PU */
+  topology->levels = hwloc_tma_calloc(tma, topology->nb_levels_allocated * sizeof(*topology->levels));
+  topology->level_nbobjects = hwloc_tma_calloc(tma, topology->nb_levels_allocated * sizeof(*topology->level_nbobjects));
+
+  hwloc__topology_filter_init(topology);
+
+  hwloc_internal_distances_init(topology);
+
+  topology->userdata_export_cb = NULL;
+  topology->userdata_import_cb = NULL;
+  topology->userdata_not_decoded = 0;
+
+  /* Make the topology look like something coherent but empty */
+  hwloc_topology_setup_defaults(topology);
+
+  *topologyp = topology;
+  return 0;
+}
+
+int
+hwloc_topology_init (struct hwloc_topology **topologyp)
+{
+  /* Public constructor.  No TMA is used for normal topologies (too many
+   * allocations to funnel through one).  16 level slots comfortably cover
+   * the default 9 levels: Machine+Package+NUMA+L3+L2+L1d+L1i+Core+PU. */
+  const unsigned default_nblevels = 16;
+  return hwloc__topology_init(topologyp, default_nblevels, NULL);
+}
+
+int
+hwloc_topology_set_pid(struct hwloc_topology *topology __hwloc_attribute_unused,
+                       hwloc_pid_t pid __hwloc_attribute_unused)
+{
+  /* Select the process whose topology/binding should be observed.
+   * Refused once the topology is loaded. */
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+#ifdef HWLOC_LINUX_SYS
+  /* Only the Linux port supports remote processes;
+   * note that this does *not* change the backend. */
+  topology->pid = pid;
+  return 0;
+#else /* HWLOC_LINUX_SYS */
+  /* Unsupported elsewhere. */
+  errno = ENOSYS;
+  return -1;
+#endif /* HWLOC_LINUX_SYS */
+}
+
+int
+hwloc_topology_set_synthetic(struct hwloc_topology *topology, const char *description)
+{
+  /* Force the "synthetic" discovery component, fed by the given
+   * description string.  Refused once the topology is loaded. */
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  return hwloc_disc_component_force_enable(topology, 0 /* api */,
+                                           -1, "synthetic",
+                                           description, NULL, NULL);
+}
+
+int
+hwloc_topology_set_xml(struct hwloc_topology *topology,
+		       const char *xmlpath)
+{
+  /* Force the "xml" discovery component, reading from the given file path.
+   * Refused once the topology is loaded. */
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  return hwloc_disc_component_force_enable(topology, 0 /* api */,
+                                           -1, "xml",
+                                           xmlpath, NULL, NULL);
+}
+
+int
+hwloc_topology_set_xmlbuffer(struct hwloc_topology *topology,
+                             const char *xmlbuffer,
+                             int size)
+{
+  /* Force the "xml" discovery component, reading from an in-memory buffer.
+   * The buffer size is smuggled through the last (void*) argument.
+   * Refused once the topology is loaded. */
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  return hwloc_disc_component_force_enable(topology, 0 /* api */,
+                                           -1, "xml", NULL,
+                                           xmlbuffer, (void*) (uintptr_t) size);
+}
+
+int
+hwloc_topology_set_flags (struct hwloc_topology *topology, unsigned long flags)
+{
+  /* The complete set of flags this version understands. */
+  const unsigned long supported_flags =
+    HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM
+    | HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM
+    | HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES;
+
+  if (topology->is_loaded) {
+    /* actually harmless, but stay consistent with the other setters */
+    errno = EBUSY;
+    return -1;
+  }
+
+  /* Reject any unknown flag bit. */
+  if (flags & ~supported_flags) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  topology->flags = flags;
+  return 0;
+}
+
+unsigned long
+hwloc_topology_get_flags (struct hwloc_topology *topology)
+{
+  /* Plain accessor for the flags set via hwloc_topology_set_flags(). */
+  return topology->flags;
+}
+
+static void
+hwloc__topology_filter_init(struct hwloc_topology *topology)
+{
+  hwloc_obj_type_t type;
+
+  /* Start from "keep everything"... */
+  for(type = HWLOC_OBJ_TYPE_MIN; type < HWLOC_OBJ_TYPE_MAX; type++)
+    topology->type_filter[type] = HWLOC_TYPE_FILTER_KEEP_ALL;
+
+  /* ...then drop the usually-useless cruft by default:
+   * instruction caches, Misc and I/O objects are ignored entirely,
+   * while Groups are only kept when they add structure. */
+  topology->type_filter[HWLOC_OBJ_L1ICACHE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_L2ICACHE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_L3ICACHE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_MISC] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_BRIDGE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_PCI_DEVICE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_OS_DEVICE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_GROUP] = HWLOC_TYPE_FILTER_KEEP_STRUCTURE;
+}
+
+static int
+hwloc__topology_set_type_filter(struct hwloc_topology *topology, hwloc_obj_type_t type, enum hwloc_type_filter_e filter)
+{
+  if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE || type == HWLOC_OBJ_MACHINE) {
+    /* Machine, PU and NUMA levels are mandatory; they may only keep everything. */
+    if (filter != HWLOC_TYPE_FILTER_KEEP_ALL) {
+      errno = EINVAL;
+      return -1;
+    }
+  } else if (hwloc__obj_type_is_special(type)) {
+    /* I/O and Misc objects live outside the main tree,
+     * so KEEP_STRUCTURE makes no sense for them. */
+    if (filter == HWLOC_TYPE_FILTER_KEEP_STRUCTURE) {
+      errno = EINVAL;
+      return -1;
+    }
+  } else if (type == HWLOC_OBJ_GROUP) {
+    /* Groups are never kept unconditionally; KEEP_STRUCTURE is the maximum. */
+    if (filter == HWLOC_TYPE_FILTER_KEEP_ALL) {
+      errno = EINVAL;
+      return -1;
+    }
+  }
+
+  /* For normal (non-I/O, non-Misc) types, "important" degenerates to "all". */
+  if (filter == HWLOC_TYPE_FILTER_KEEP_IMPORTANT && !hwloc__obj_type_is_special(type))
+    filter = HWLOC_TYPE_FILTER_KEEP_ALL;
+
+  topology->type_filter[type] = filter;
+  return 0;
+}
+
+int
+hwloc_topology_set_type_filter(struct hwloc_topology *topology, hwloc_obj_type_t type, enum hwloc_type_filter_e filter)
+{
+  /* Public wrapper: validate the type value and the topology state,
+   * then delegate to the internal setter.
+   * Check order matters: an out-of-range type reports EINVAL even if
+   * the topology is already loaded. */
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_TYPE_MIN == 0);
+  if ((unsigned) type >= HWLOC_OBJ_TYPE_MAX) {
+    errno = EINVAL;
+    return -1;
+  }
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+  return hwloc__topology_set_type_filter(topology, type, filter);
+}
+
+int
+hwloc_topology_set_all_types_filter(struct hwloc_topology *topology, enum hwloc_type_filter_e filter)
+{
+  hwloc_obj_type_t t;
+
+  /* Refused once the topology is loaded. */
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  /* Apply to every type; per-type errors (mandatory levels, Groups, ...)
+   * are deliberately ignored so the others still get the filter. */
+  for(t = HWLOC_OBJ_TYPE_MIN; t < HWLOC_OBJ_TYPE_MAX; t++)
+    hwloc__topology_set_type_filter(topology, t, filter);
+  return 0;
+}
+
+int
+hwloc_topology_set_cache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter)
+{
+  unsigned i;
+  /* Apply the filter to all cache types: L1..L5 data/unified caches and
+   * L1i..L3i instruction caches (contiguous in the hwloc_obj_type_t enum).
+   * Fix: the loop bound must be inclusive (<=), otherwise the last type
+   * (HWLOC_OBJ_L3ICACHE) was silently skipped and left unfiltered. */
+  for(i=HWLOC_OBJ_L1CACHE; i<=HWLOC_OBJ_L3ICACHE; i++)
+    hwloc_topology_set_type_filter(topology, (hwloc_obj_type_t) i, filter);
+  return 0;
+}
+
+int
+hwloc_topology_set_icache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter)
+{
+  unsigned i;
+  /* Apply the filter to all instruction-cache types L1i..L3i.
+   * Fix: the loop bound must be inclusive (<=); with the previous '<'
+   * only L1i and L2i were filtered and HWLOC_OBJ_L3ICACHE was skipped. */
+  for(i=HWLOC_OBJ_L1ICACHE; i<=HWLOC_OBJ_L3ICACHE; i++)
+    hwloc_topology_set_type_filter(topology, (hwloc_obj_type_t) i, filter);
+  return 0;
+}
+
+int
+hwloc_topology_set_io_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter)
+{
+  /* Apply the same filter to every I/O object type. */
+  static const hwloc_obj_type_t io_types[] = {
+    HWLOC_OBJ_BRIDGE,
+    HWLOC_OBJ_PCI_DEVICE,
+    HWLOC_OBJ_OS_DEVICE
+  };
+  unsigned i;
+  for(i = 0; i < sizeof(io_types)/sizeof(io_types[0]); i++)
+    hwloc_topology_set_type_filter(topology, io_types[i], filter);
+  return 0;
+}
+
+int
+hwloc_topology_get_type_filter(struct hwloc_topology *topology, hwloc_obj_type_t type, enum hwloc_type_filter_e *filterp)
+{
+  /* Store the current filter for `type' into *filterp.
+   * Fails with EINVAL for out-of-range types. */
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_TYPE_MIN == 0);
+  if ((unsigned) type >= HWLOC_OBJ_TYPE_MAX) {
+    errno = EINVAL;
+    return -1;
+  }
+  *filterp = topology->type_filter[type];
+  return 0;
+}
+
+/* Free everything the topology owns (object tree, sets, level arrays,
+ * special levels, distances, machine memory page types) while keeping the
+ * topology structure itself and its levels/level_nbobjects arrays alive,
+ * so that setup_defaults() can rebuild an empty topology afterwards. */
+void
+hwloc_topology_clear (struct hwloc_topology *topology)
+{
+  /* no need to set to NULL after free() since callers will call setup_defaults() or just destroy the rest of the topology */
+  unsigned l;
+  hwloc_internal_distances_destroy(topology);
+  hwloc_free_object_and_children(topology->levels[0][0]); /* frees the whole object tree from the root */
+  hwloc_bitmap_free(topology->allowed_cpuset);
+  hwloc_bitmap_free(topology->allowed_nodeset);
+  for (l=0; l<topology->nb_levels; l++)
+    free(topology->levels[l]);
+  for(l=0; l<HWLOC_NR_SLEVELS; l++)
+    free(topology->slevels[l].objs);
+  free(topology->machine_memory.page_types);
+}
+
+/* Destroy a topology and release all its resources.
+ * Adopted shared-memory topologies take a different teardown path.
+ * Teardown order matters: backends and components are shut down before
+ * the contents are cleared, then the remaining arrays are freed. */
+void
+hwloc_topology_destroy (struct hwloc_topology *topology)
+{
+  if (topology->adopted_shmem_addr) {
+    /* shmem-adopted topology: disadopt instead of freeing in place */
+    hwloc__topology_disadopt(topology);
+    return;
+  }
+
+  hwloc_backends_disable_all(topology);
+  hwloc_components_fini();
+
+  hwloc_topology_clear(topology);
+
+  free(topology->levels);
+  free(topology->level_nbobjects);
+
+  free(topology->support.discovery);
+  free(topology->support.cpubind);
+  free(topology->support.membind);
+  free(topology);
+}
+
+/* Run the actual discovery and populate the topology.
+ * May only be called once per topology (EBUSY otherwise).
+ * Environment variables (unless HWLOC_COMPONENTS is set) can force a
+ * backend; only the first match wins, in priority order:
+ * HWLOC_FSROOT > HWLOC_CPUID_PATH > HWLOC_SYNTHETIC > HWLOC_XMLFILE.
+ * On failure the topology is reset to a coherent empty state. */
+int
+hwloc_topology_load (struct hwloc_topology *topology)
+{
+  int err;
+
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  hwloc_internal_distances_prepare(topology);
+
+  if (getenv("HWLOC_XML_USERDATA_NOT_DECODED"))
+    topology->userdata_not_decoded = 1;
+
+  /* Ignore variables if HWLOC_COMPONENTS is set. It will be processed later */
+  if (!getenv("HWLOC_COMPONENTS")) {
+    /* Only apply variables if we have not changed the backend yet.
+     * Only the first one will be kept.
+     * Check for FSROOT first since it's for debugging so likely needs to override everything else.
+     * Check for XML last (that's the one that may be set system-wide by administrators)
+     * so that it's only used if other variables are not set,
+     * to allow users to override easily.
+     */
+    if (!topology->backends) {
+      const char *fsroot_path_env = getenv("HWLOC_FSROOT");
+      if (fsroot_path_env)
+	hwloc_disc_component_force_enable(topology,
+					  1 /* env force */,
+					  HWLOC_DISC_COMPONENT_TYPE_CPU, "linux",
+					  NULL /* backend will getenv again */, NULL, NULL);
+    }
+    if (!topology->backends) {
+      const char *cpuid_path_env = getenv("HWLOC_CPUID_PATH");
+      if (cpuid_path_env)
+	hwloc_disc_component_force_enable(topology,
+					  1 /* env force */,
+					  HWLOC_DISC_COMPONENT_TYPE_CPU, "x86",
+					  NULL /* backend will getenv again */, NULL, NULL);
+    }
+    if (!topology->backends) {
+      const char *synthetic_env = getenv("HWLOC_SYNTHETIC");
+      if (synthetic_env)
+	hwloc_disc_component_force_enable(topology,
+					  1 /* env force */,
+					  -1, "synthetic",
+					  synthetic_env, NULL, NULL);
+    }
+    if (!topology->backends) {
+      const char *xmlpath_env = getenv("HWLOC_XMLFILE");
+      if (xmlpath_env)
+	hwloc_disc_component_force_enable(topology,
+					  1 /* env force */,
+					  -1, "xml",
+					  xmlpath_env, NULL, NULL);
+    }
+  }
+
+  /* instantiate all possible other backends now */
+  hwloc_disc_components_enable_others(topology);
+  /* now that backends are enabled, update the thissystem flag and some callbacks */
+  hwloc_backends_is_thissystem(topology);
+  hwloc_backends_find_callbacks(topology);
+  /*
+   * Now set binding hooks according to topology->is_thissystem
+   * and what the native OS backend offers.
+   */
+  hwloc_set_binding_hooks(topology);
+
+  hwloc_pci_discovery_prepare(topology);
+
+  /* actual topology discovery */
+  err = hwloc_discover(topology);
+  if (err < 0)
+    goto out;
+
+  hwloc_pci_discovery_exit(topology);
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(topology);
+
+  /* Mark distances objs arrays as invalid since we may have removed objects
+   * from the topology after adding the distances (remove_empty, etc).
+   * It would be hard to actually verify whether it's needed.
+   */
+  hwloc_internal_distances_invalidate_cached_objs(topology);
+  /* And refresh distances so that multithreaded concurrent distances_get()
+   * don't refresh() concurrently (disallowed).
+   */
+  hwloc_internal_distances_refresh(topology);
+
+  topology->is_loaded = 1;
+  return 0;
+
+ out:
+  /* discovery failed: restore a coherent empty topology */
+  hwloc_pci_discovery_exit(topology);
+  hwloc_topology_clear(topology);
+  hwloc_topology_setup_defaults(topology);
+  hwloc_backends_disable_all(topology);
+  return -1;
+}
+
+/* adjust object cpusets according the given droppedcpuset,
+ * drop object whose cpuset becomes empty and that have no children,
+ * and propagate NUMA node removal as nodeset changes in parents.
+ * droppednodeset may be NULL when no NUMA node is being removed.
+ * Recurses depth-first through normal and memory children; the removal
+ * check at the end must run after recursion so that children are
+ * unlinked before their parent is considered empty.
+ */
+static void
+restrict_object_by_cpuset(hwloc_topology_t topology, unsigned long flags, hwloc_obj_t *pobj,
+			  hwloc_bitmap_t droppedcpuset, hwloc_bitmap_t droppednodeset)
+{
+  hwloc_obj_t obj = *pobj, child, *pchild;
+  int modified = 0; /* whether we must recurse into this object's children */
+
+  if (hwloc_bitmap_intersects(obj->complete_cpuset, droppedcpuset)) {
+    hwloc_bitmap_andnot(obj->cpuset, obj->cpuset, droppedcpuset);
+    hwloc_bitmap_andnot(obj->complete_cpuset, obj->complete_cpuset, droppedcpuset);
+    modified = 1;
+  } else {
+    if ((flags & HWLOC_RESTRICT_FLAG_REMOVE_CPULESS)
+	&& hwloc_bitmap_iszero(obj->complete_cpuset)) {
+      /* we're empty, there's a NUMAnode below us, it'll be removed this time */
+      modified = 1;
+    }
+    /* nodeset cannot intersect unless cpuset intersects or is empty */
+    if (droppednodeset)
+      assert(!hwloc_bitmap_intersects(obj->complete_nodeset, droppednodeset)
+	     || hwloc_bitmap_iszero(obj->complete_cpuset));
+  }
+  if (droppednodeset) {
+    hwloc_bitmap_andnot(obj->nodeset, obj->nodeset, droppednodeset);
+    hwloc_bitmap_andnot(obj->complete_nodeset, obj->complete_nodeset, droppednodeset);
+  }
+
+  if (modified) {
+    for_each_child_safe(child, obj, pchild)
+      restrict_object_by_cpuset(topology, flags, pchild, droppedcpuset, droppednodeset);
+    /* if some hwloc_bitmap_first(child->complete_cpuset) changed, children might need to be reordered */
+    hwloc__reorder_children(obj);
+
+    for_each_memory_child_safe(child, obj, pchild)
+      restrict_object_by_cpuset(topology, flags, pchild, droppedcpuset, droppednodeset);
+    /* local NUMA nodes have the same cpusets, no need to reorder them */
+
+    /* Nothing to restrict under I/O or Misc */
+  }
+
+  if (!obj->first_child && !obj->memory_first_child /* arity not updated before connect_children() */
+      && hwloc_bitmap_iszero(obj->cpuset)
+      && (obj->type != HWLOC_OBJ_NUMANODE || (flags & HWLOC_RESTRICT_FLAG_REMOVE_CPULESS))) {
+    /* remove object */
+    hwloc_debug("%s", "\nRemoving object during restrict");
+    hwloc_debug_print_object(0, obj);
+
+    if (!(flags & HWLOC_RESTRICT_FLAG_ADAPT_IO)) {
+      hwloc_free_object_siblings_and_children(obj->io_first_child);
+      obj->io_first_child = NULL;
+    }
+    if (!(flags & HWLOC_RESTRICT_FLAG_ADAPT_MISC)) {
+      hwloc_free_object_siblings_and_children(obj->misc_first_child);
+      obj->misc_first_child = NULL;
+    }
+    assert(!obj->first_child);
+    assert(!obj->memory_first_child);
+    unlink_and_free_single_object(pobj);
+    topology->modified = 1; /* a later reconnect is now required */
+  }
+}
+
+/* Restrict a loaded topology to the given cpuset.
+ * Computes the cpuset/nodeset bits to drop, prunes the object tree,
+ * then reconnects levels and refreshes derived data.
+ * Fails early with EINVAL (topology untouched) when the request would
+ * remove everything; only a failed reconnect resets the topology. */
+int
+hwloc_topology_restrict(struct hwloc_topology *topology, hwloc_const_cpuset_t cpuset, unsigned long flags)
+{
+  hwloc_bitmap_t droppedcpuset, droppednodeset;
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (flags & ~(HWLOC_RESTRICT_FLAG_REMOVE_CPULESS
+		|HWLOC_RESTRICT_FLAG_ADAPT_MISC|HWLOC_RESTRICT_FLAG_ADAPT_IO)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* make sure we'll keep something in the topology */
+  if (!hwloc_bitmap_intersects(cpuset, topology->allowed_cpuset)) {
+    errno = EINVAL; /* easy failure, just don't touch the topology */
+    return -1;
+  }
+
+  droppedcpuset = hwloc_bitmap_alloc();
+  droppednodeset = hwloc_bitmap_alloc();
+  if (!droppedcpuset || !droppednodeset) {
+    /* allocation failure: free whichever succeeded and bail out */
+    hwloc_bitmap_free(droppedcpuset);
+    hwloc_bitmap_free(droppednodeset);
+    return -1;
+  }
+
+  /* cpuset to clear */
+  hwloc_bitmap_not(droppedcpuset, cpuset);
+  /* nodeset to clear */
+  if (flags & HWLOC_RESTRICT_FLAG_REMOVE_CPULESS) {
+    hwloc_obj_t node = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, 0);
+    do {
+      /* node will be removed if nodeset gets or was empty */
+      if (hwloc_bitmap_iszero(node->cpuset)
+	  || hwloc_bitmap_isincluded(node->cpuset, droppedcpuset))
+	hwloc_bitmap_set(droppednodeset, node->os_index);
+      node = node->next_cousin;
+    } while (node);
+
+    /* check we're not removing all NUMA nodes */
+    if (hwloc_bitmap_isincluded(topology->allowed_nodeset, droppednodeset)) {
+      errno = EINVAL; /* easy failure, just don't touch the topology */
+      hwloc_bitmap_free(droppedcpuset);
+      hwloc_bitmap_free(droppednodeset);
+      return -1;
+    }
+  }
+  /* remove nodeset if empty */
+  if (!(flags & HWLOC_RESTRICT_FLAG_REMOVE_CPULESS)
+      || hwloc_bitmap_iszero(droppednodeset)) {
+    hwloc_bitmap_free(droppednodeset);
+    droppednodeset = NULL; /* restrict_object_by_cpuset() accepts NULL here */
+  }
+
+  /* now recurse to filter sets and drop things */
+  restrict_object_by_cpuset(topology, flags, &topology->levels[0][0], droppedcpuset, droppednodeset);
+  hwloc_bitmap_andnot(topology->allowed_cpuset, topology->allowed_cpuset, droppedcpuset);
+  if (droppednodeset)
+    hwloc_bitmap_andnot(topology->allowed_nodeset, topology->allowed_nodeset, droppednodeset);
+
+  hwloc_bitmap_free(droppedcpuset);
+  hwloc_bitmap_free(droppednodeset);
+
+  if (hwloc_topology_reconnect(topology, 0) < 0)
+    goto out;
+
+  /* some objects may have disappeared, we need to update distances objs arrays */
+  hwloc_internal_distances_invalidate_cached_objs(topology);
+
+  hwloc_filter_levels_keep_structure(topology);
+  hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
+  propagate_total_memory(topology->levels[0][0]);
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(topology);
+
+  return 0;
+
+ out:
+  /* unrecoverable failure, re-init the topology */
+   hwloc_topology_clear(topology);
+   hwloc_topology_setup_defaults(topology);
+   return -1;
+}
+
+/* Report whether this topology describes the system we are running on. */
+int
+hwloc_topology_is_thissystem(struct hwloc_topology *topology)
+{
+  return topology->is_thissystem;
+}
+
+/* Return the number of normal levels in the topology. */
+int
+hwloc_topology_get_depth(struct hwloc_topology *topology)
+{
+  return (int) topology->nb_levels;
+}
+
+/* Expose the feature-support flags stored on the topology. */
+const struct hwloc_topology_support *
+hwloc_topology_get_support(struct hwloc_topology * topology)
+{
+  return &topology->support;
+}
+
+/* Attach an opaque caller-owned pointer to the topology. */
+void hwloc_topology_set_userdata(struct hwloc_topology * topology, const void *userdata)
+{
+  topology->userdata = (void *) userdata;
+}
+
+/* Retrieve the pointer previously stored with hwloc_topology_set_userdata(). */
+void * hwloc_topology_get_userdata(struct hwloc_topology * topology)
+{
+  return topology->userdata;
+}
+
+/* Accessors for topology-wide CPU and NUMA-node sets.  All of them return
+ * pointers to bitmaps owned internally (either by the root object or by the
+ * topology itself): callers must not free or modify the returned sets. */
+
+hwloc_const_cpuset_t
+hwloc_topology_get_complete_cpuset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->complete_cpuset;
+}
+
+hwloc_const_cpuset_t
+hwloc_topology_get_topology_cpuset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->cpuset;
+}
+
+hwloc_const_cpuset_t
+hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology)
+{
+  return topology->allowed_cpuset;
+}
+
+hwloc_const_nodeset_t
+hwloc_topology_get_complete_nodeset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->complete_nodeset;
+}
+
+hwloc_const_nodeset_t
+hwloc_topology_get_topology_nodeset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->nodeset;
+}
+
+hwloc_const_nodeset_t
+hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology)
+{
+  return topology->allowed_nodeset;
+}
+
+
+/****************
+ * Debug Checks *
+ ****************/
+
+#ifndef NDEBUG /* assert only enabled if !NDEBUG */
+
+/* Verify one child's sibling links: its parent pointer, its sibling_rank,
+ * its slot in the parent's child array (when the caller provides one), and
+ * the prev/next sibling chain including the NULL terminators at both ends. */
+static void
+hwloc__check_child_siblings(hwloc_obj_t parent, hwloc_obj_t *array,
+			    unsigned arity, unsigned i,
+			    hwloc_obj_t child, hwloc_obj_t prev)
+{
+  assert(child->parent == parent);
+
+  assert(child->sibling_rank == i);
+  if (array)
+    assert(child == array[i]);
+
+  if (prev)
+    assert(prev->next_sibling == child);
+  assert(child->prev_sibling == prev);
+
+  if (!i)
+    assert(child->prev_sibling == NULL);
+  else
+    assert(child->prev_sibling != NULL);
+
+  if (i == arity-1)
+    assert(child->next_sibling == NULL);
+  else
+    assert(child->next_sibling != NULL);
+}
+
+static void
+hwloc__check_object(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t obj);
+
+/* Check the normal children of a parent object: all are normal-type objects
+ * deeper than the parent, sibling links are consistent, the count matches
+ * arity, and PUs have no normal children.  Recurses into each child. */
+static void
+hwloc__check_normal_children(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t parent)
+{
+  hwloc_obj_t child, prev;
+  unsigned j;
+
+  if (!parent->arity) {
+    /* check whether that parent has no children for real */
+    assert(!parent->children);
+    assert(!parent->first_child);
+    assert(!parent->last_child);
+    return;
+  }
+  /* check whether that parent has children for real */
+  assert(parent->children);
+  assert(parent->first_child);
+  assert(parent->last_child);
+
+  /* sibling checks */
+  for(prev = NULL, child = parent->first_child, j = 0;
+      child;
+      prev = child, child = child->next_sibling, j++) {
+    /* normal child */
+    assert(hwloc__obj_type_is_normal(child->type));
+    /* check depth */
+    assert(child->depth > parent->depth);
+    /* check siblings */
+    hwloc__check_child_siblings(parent, parent->children, parent->arity, j, child, prev);
+    /* recurse */
+    hwloc__check_object(topology, gp_indexes, child);
+  }
+  /* check arity */
+  assert(j == parent->arity);
+
+  assert(parent->first_child == parent->children[0]);
+  assert(parent->last_child == parent->children[parent->arity-1]);
+
+  /* no normal children below a PU */
+  if (parent->type == HWLOC_OBJ_PU)
+    assert(!parent->arity);
+}
+
+/* Check cpuset invariants between an object and its children: a PU cpuset is
+ * the singleton of its os_index, a memory object's cpuset equals its parent's,
+ * and any other non-special object's cpuset is the exclusive union of its
+ * normal children's cpusets.  Also checks children complete_cpuset ordering. */
+static void
+hwloc__check_children_cpusets(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  /* we already checked in the caller that objects have either all sets or none */
+  hwloc_obj_t child;
+  int prev_first, prev_empty;
+
+  if (obj->type == HWLOC_OBJ_PU) {
+    /* PU cpuset is just itself, with no normal children */
+    assert(hwloc_bitmap_weight(obj->cpuset) == 1);
+    assert(hwloc_bitmap_first(obj->cpuset) == (int) obj->os_index);
+    assert(hwloc_bitmap_weight(obj->complete_cpuset) == 1);
+    assert(hwloc_bitmap_first(obj->complete_cpuset) == (int) obj->os_index);
+    if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM)) {
+      assert(hwloc_bitmap_isset(topology->allowed_cpuset, (int) obj->os_index));
+    }
+    assert(!obj->arity);
+  } else if (hwloc__obj_type_is_memory(obj->type)) {
+    /* memory object cpuset is equal to its parent */
+    assert(hwloc_bitmap_isequal(obj->parent->cpuset, obj->cpuset));
+    assert(!obj->arity);
+  } else if (!hwloc__obj_type_is_special(obj->type)) {
+    hwloc_bitmap_t set;
+    /* other obj cpuset is an exclusive OR of normal children, except for PUs */
+    set = hwloc_bitmap_alloc();
+    for_each_child(child, obj) {
+      assert(!hwloc_bitmap_intersects(set, child->cpuset));
+      hwloc_bitmap_or(set, set, child->cpuset);
+    }
+    assert(hwloc_bitmap_isequal(set, obj->cpuset));
+    hwloc_bitmap_free(set);
+  }
+
+  /* check that memory children have same cpuset */
+  for_each_memory_child(child, obj)
+    assert(hwloc_bitmap_isequal(obj->cpuset, child->cpuset));
+
+  /* check that children complete_cpusets are properly ordered, empty ones may be anywhere
+   * (can be wrong for main cpuset since removed PUs can break the ordering).
+   */
+  prev_first = -1; /* -1 works fine with first comparisons below */
+  prev_empty = 0; /* no empty cpuset in previous children */
+  for_each_child(child, obj) {
+    int first = hwloc_bitmap_first(child->complete_cpuset);
+    if (first >= 0) {
+      assert(!prev_empty); /* no objects with CPU after objects without CPU */
+      assert(prev_first < first);
+    } else {
+      prev_empty = 1;
+    }
+    prev_first = first;
+  }
+}
+
+/* Check the memory children of a parent: all are memory-type objects with
+ * consistent sibling links and no normal/I/O children of their own; the
+ * count matches memory_arity, and NUMA nodes carry no memory children. */
+static void
+hwloc__check_memory_children(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t parent)
+{
+  unsigned j;
+  hwloc_obj_t child, prev;
+
+  if (!parent->memory_arity) {
+    /* check whether that parent has no children for real */
+    assert(!parent->memory_first_child);
+    return;
+  }
+  /* check whether that parent has children for real */
+  assert(parent->memory_first_child);
+
+  for(prev = NULL, child = parent->memory_first_child, j = 0;
+      child;
+      prev = child, child = child->next_sibling, j++) {
+    assert(hwloc__obj_type_is_memory(child->type));
+    /* check siblings */
+    hwloc__check_child_siblings(parent, NULL, parent->memory_arity, j, child, prev);
+    /* only Memory and Misc children, recurse */
+    assert(!child->first_child);
+    assert(!child->io_first_child);
+    hwloc__check_object(topology, gp_indexes, child);
+  }
+  /* check arity */
+  assert(j == parent->memory_arity);
+
+  /* no memory children below a NUMA node */
+  if (parent->type == HWLOC_OBJ_NUMANODE)
+    assert(!parent->memory_arity);
+}
+
+/* Check the I/O children of a parent: all are I/O-type objects with
+ * consistent sibling links and no normal/memory children of their own;
+ * the count matches io_arity.  Recurses into each child. */
+static void
+hwloc__check_io_children(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t parent)
+{
+  unsigned j;
+  hwloc_obj_t child, prev;
+
+  if (!parent->io_arity) {
+    /* check whether that parent has no children for real */
+    assert(!parent->io_first_child);
+    return;
+  }
+  /* check whether that parent has children for real */
+  assert(parent->io_first_child);
+
+  for(prev = NULL, child = parent->io_first_child, j = 0;
+      child;
+      prev = child, child = child->next_sibling, j++) {
+    /* all children must be I/O */
+    assert(hwloc__obj_type_is_io(child->type));
+    /* check siblings */
+    hwloc__check_child_siblings(parent, NULL, parent->io_arity, j, child, prev);
+    /* only I/O and Misc children, recurse */
+    assert(!child->first_child);
+    assert(!child->memory_first_child);
+    hwloc__check_object(topology, gp_indexes, child);
+  }
+  /* check arity */
+  assert(j == parent->io_arity);
+}
+
+/* Check the Misc children of a parent: all are Misc objects with consistent
+ * sibling links and only Misc children below them; count matches misc_arity. */
+static void
+hwloc__check_misc_children(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t parent)
+{
+  unsigned j;
+  hwloc_obj_t child, prev;
+
+  if (!parent->misc_arity) {
+    /* check whether that parent has no children for real */
+    assert(!parent->misc_first_child);
+    return;
+  }
+  /* check whether that parent has children for real */
+  assert(parent->misc_first_child);
+
+  for(prev = NULL, child = parent->misc_first_child, j = 0;
+      child;
+      prev = child, child = child->next_sibling, j++) {
+    /* all children must be Misc */
+    assert(child->type == HWLOC_OBJ_MISC);
+    /* check siblings */
+    hwloc__check_child_siblings(parent, NULL, parent->misc_arity, j, child, prev);
+    /* only Misc children, recurse */
+    assert(!child->first_child);
+    assert(!child->memory_first_child);
+    assert(!child->io_first_child);
+    hwloc__check_object(topology, gp_indexes, child);
+  }
+  /* check arity */
+  assert(j == parent->misc_arity);
+}
+
+/* Check one object: gp_index uniqueness (recorded in gp_indexes), a valid
+ * type, compliance with the type filters, set/depth invariants per type,
+ * cache attribute coherency, then recurse over all four child lists.
+ * Nodesets are verified separately by hwloc__check_nodesets(). */
+static void
+hwloc__check_object(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t obj)
+{
+  assert(!hwloc_bitmap_isset(gp_indexes, obj->gp_index));
+  hwloc_bitmap_set(gp_indexes, obj->gp_index);
+
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_TYPE_MIN == 0);
+  assert((unsigned) obj->type < HWLOC_OBJ_TYPE_MAX);
+
+  assert(hwloc_filter_check_keep_object(topology, obj));
+
+  /* check sets and depth against the object type */
+  if (hwloc__obj_type_is_special(obj->type)) {
+    assert(!obj->cpuset);
+    if (obj->type == HWLOC_OBJ_BRIDGE)
+      assert(obj->depth == HWLOC_TYPE_DEPTH_BRIDGE);
+    else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
+      assert(obj->depth == HWLOC_TYPE_DEPTH_PCI_DEVICE);
+    else if (obj->type == HWLOC_OBJ_OS_DEVICE)
+      assert(obj->depth == HWLOC_TYPE_DEPTH_OS_DEVICE);
+    else if (obj->type == HWLOC_OBJ_MISC)
+      assert(obj->depth == HWLOC_TYPE_DEPTH_MISC);
+  } else {
+    assert(obj->cpuset);
+    if (obj->type == HWLOC_OBJ_NUMANODE)
+      assert(obj->depth == HWLOC_TYPE_DEPTH_NUMANODE);
+    else
+      assert(obj->depth >= 0);
+  }
+
+  /* group depth cannot be -1 anymore in v2.0+ */
+  if (obj->type == HWLOC_OBJ_GROUP) {
+    assert(obj->attr->group.depth != (unsigned) -1);
+  }
+
+  /* there's other cpusets and nodesets if and only if there's a main cpuset */
+  assert(!!obj->cpuset == !!obj->complete_cpuset);
+  assert(!!obj->cpuset == !!obj->nodeset);
+  assert(!!obj->nodeset == !!obj->complete_nodeset);
+
+  /* check that complete/inline sets are larger than the main sets */
+  if (obj->cpuset) {
+    assert(hwloc_bitmap_isincluded(obj->cpuset, obj->complete_cpuset));
+    assert(hwloc_bitmap_isincluded(obj->nodeset, obj->complete_nodeset));
+  }
+
+  /* check cache type/depth vs type */
+  if (hwloc__obj_type_is_cache(obj->type)) {
+    if (hwloc__obj_type_is_icache(obj->type))
+      assert(obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION);
+    else if (hwloc__obj_type_is_dcache(obj->type))
+      assert(obj->attr->cache.type == HWLOC_OBJ_CACHE_DATA
+	     || obj->attr->cache.type == HWLOC_OBJ_CACHE_UNIFIED);
+    else
+      assert(0);
+    assert(hwloc_cache_type_by_depth_type(obj->attr->cache.depth, obj->attr->cache.type) == obj->type);
+  }
+
+  /* check children */
+  hwloc__check_normal_children(topology, gp_indexes, obj);
+  hwloc__check_memory_children(topology, gp_indexes, obj);
+  hwloc__check_io_children(topology, gp_indexes, obj);
+  hwloc__check_misc_children(topology, gp_indexes, obj);
+  hwloc__check_children_cpusets(topology, obj);
+  /* nodesets are checked during another recursion with state below */
+}
+
+/* Recursively verify nodesets.  On input, parentset holds the nodes
+ * contributed by ancestors; on output it also includes this subtree's
+ * contribution.  For non-NUMA objects the final parentset must equal
+ * obj->nodeset (parent + local memory children + normal children). */
+static void
+hwloc__check_nodesets(hwloc_topology_t topology, hwloc_obj_t obj, hwloc_bitmap_t parentset)
+{
+  hwloc_obj_t child;
+  int prev_first;
+
+  if (obj->type == HWLOC_OBJ_NUMANODE) {
+    /* NUMANODE nodeset is just itself, with no memory/normal children */
+    assert(hwloc_bitmap_weight(obj->nodeset) == 1);
+    assert(hwloc_bitmap_first(obj->nodeset) == (int) obj->os_index);
+    assert(hwloc_bitmap_weight(obj->complete_nodeset) == 1);
+    assert(hwloc_bitmap_first(obj->complete_nodeset) == (int) obj->os_index);
+    if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM)) {
+      assert(hwloc_bitmap_isset(topology->allowed_nodeset, (int) obj->os_index));
+    }
+    assert(!obj->arity);
+    assert(!obj->memory_arity);
+    assert(hwloc_bitmap_isincluded(obj->nodeset, parentset));
+  } else {
+    hwloc_bitmap_t myset;
+    hwloc_bitmap_t childset;
+
+    /* the local nodeset is an exclusive OR of memory children */
+    myset = hwloc_bitmap_alloc();
+    for_each_memory_child(child, obj) {
+      assert(!hwloc_bitmap_intersects(myset, child->nodeset));
+      hwloc_bitmap_or(myset, myset, child->nodeset);
+    }
+    /* the local nodeset cannot intersect with parents' local nodeset */
+    assert(!hwloc_bitmap_intersects(myset, parentset));
+    hwloc_bitmap_or(parentset, parentset, myset);
+    hwloc_bitmap_free(myset);
+    /* parentset now contains parent+local contribution */
+
+    /* for each children, recurse to check/get its contribution */
+    childset = hwloc_bitmap_alloc();
+    for_each_child(child, obj) {
+      hwloc_bitmap_t set = hwloc_bitmap_dup(parentset); /* don't touch parentset, we don't want to propagate the first child contribution to other children */
+      hwloc__check_nodesets(topology, child, set);
+      /* extract this child contribution */
+      hwloc_bitmap_andnot(set, set, parentset);
+      /* save it */
+      assert(!hwloc_bitmap_intersects(childset, set));
+      hwloc_bitmap_or(childset, childset, set);
+      hwloc_bitmap_free(set);
+    }
+    /* combine child contribution into parentset */
+    assert(!hwloc_bitmap_intersects(parentset, childset));
+    hwloc_bitmap_or(parentset, parentset, childset);
+    hwloc_bitmap_free(childset);
+    /* now check that our nodeset is combination of parent, local and children */
+    assert(hwloc_bitmap_isequal(obj->nodeset, parentset));
+  }
+
+  /* check that children complete_nodesets are properly ordered, empty ones may be anywhere
+   * (can be wrong for main nodeset since removed PUs can break the ordering).
+   */
+  prev_first = -1; /* -1 works fine with first comparisons below */
+  for_each_memory_child(child, obj) {
+    int first = hwloc_bitmap_first(child->complete_nodeset);
+    assert(prev_first < first);
+    prev_first = first;
+  }
+}
+
+/* Check one level (normal or special): every object's depth, logical index,
+ * type uniformity and cousin links; type/depth lookup coherency; and for
+ * special levels, that first/last match the slevel bounds passed in. */
+static void
+hwloc__check_level(struct hwloc_topology *topology, int depth,
+		   hwloc_obj_t first, hwloc_obj_t last)
+{
+  unsigned width = hwloc_get_nbobjs_by_depth(topology, depth);
+  struct hwloc_obj *prev = NULL;
+  hwloc_obj_t obj;
+  unsigned j;
+
+  /* check each object of the level */
+  for(j=0; j<width; j++) {
+    obj = hwloc_get_obj_by_depth(topology, depth, j);
+    /* check that the object is correctly placed horizontally and vertically */
+    assert(obj);
+    assert(obj->depth == depth);
+    assert(obj->logical_index == j);
+    /* check that all objects in the level have the same type */
+    if (prev) {
+      assert(hwloc_type_cmp(obj, prev) == HWLOC_OBJ_EQUAL);
+      assert(prev->next_cousin == obj);
+    }
+    assert(obj->prev_cousin == prev);
+
+    /* check that PUs and NUMA nodes have correct cpuset/nodeset */
+    if (obj->type == HWLOC_OBJ_NUMANODE) {
+      assert(hwloc_bitmap_weight(obj->complete_nodeset) == 1);
+      assert(hwloc_bitmap_first(obj->complete_nodeset) == (int) obj->os_index);
+    }
+    prev = obj;
+  }
+  if (prev)
+    assert(prev->next_cousin == NULL);
+
+  if (width) {
+    /* check first object of the level */
+    obj = hwloc_get_obj_by_depth(topology, depth, 0);
+    assert(obj);
+    assert(!obj->prev_cousin);
+    /* check type */
+    assert(hwloc_get_depth_type(topology, depth) == obj->type);
+    assert(depth == hwloc_get_type_depth(topology, obj->type)
+	   || HWLOC_TYPE_DEPTH_MULTIPLE == hwloc_get_type_depth(topology, obj->type));
+    /* check last object of the level */
+    obj = hwloc_get_obj_by_depth(topology, depth, width-1);
+    assert(obj);
+    assert(!obj->next_cousin);
+  }
+
+  if (depth < 0) {
+    assert(first == hwloc_get_obj_by_depth(topology, depth, 0));
+    assert(last == hwloc_get_obj_by_depth(topology, depth, width-1));
+  } else {
+    assert(!first);
+    assert(!last);
+  }
+
+  /* check last+1 object of the level */
+  obj = hwloc_get_obj_by_depth(topology, depth, width);
+  assert(!obj);
+}
+
+/* check a whole topology structure:
+ * build-time assertions on type-enum ordering and array sizes, coherency of
+ * levels and per-type depths, root-object properties, each level's cousin
+ * chains, then a recursive walk over every object and over the nodesets. */
+void
+hwloc_topology_check(struct hwloc_topology *topology)
+{
+  struct hwloc_obj *obj;
+  hwloc_bitmap_t gp_indexes, set;
+  hwloc_obj_type_t type;
+  unsigned i;
+  int j, depth;
+
+  /* make sure we can use ranges to check types */
+
+  /* hwloc__obj_type_is_{,d,i}cache() want cache types to be ordered like this */
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L2CACHE == HWLOC_OBJ_L1CACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L3CACHE == HWLOC_OBJ_L2CACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L4CACHE == HWLOC_OBJ_L3CACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L5CACHE == HWLOC_OBJ_L4CACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L1ICACHE == HWLOC_OBJ_L5CACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L2ICACHE == HWLOC_OBJ_L1ICACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L3ICACHE == HWLOC_OBJ_L2ICACHE + 1);
+
+  /* hwloc__obj_type_is_normal(), hwloc__obj_type_is_memory(), hwloc__obj_type_is_io(), hwloc__obj_type_is_special()
+   * and hwloc_reset_normal_type_depths()
+   * want special types to be ordered like this, after all normal types.
+   */
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_NUMANODE   + 1 == HWLOC_OBJ_BRIDGE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_BRIDGE     + 1 == HWLOC_OBJ_PCI_DEVICE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_PCI_DEVICE + 1 == HWLOC_OBJ_OS_DEVICE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_OS_DEVICE  + 1 == HWLOC_OBJ_MISC);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_MISC       + 1 == HWLOC_OBJ_TYPE_MAX);
+
+  /* make sure order and priority arrays have the right size */
+  HWLOC_BUILD_ASSERT(sizeof(obj_type_order)/sizeof(*obj_type_order) == HWLOC_OBJ_TYPE_MAX);
+  HWLOC_BUILD_ASSERT(sizeof(obj_order_type)/sizeof(*obj_order_type) == HWLOC_OBJ_TYPE_MAX);
+  HWLOC_BUILD_ASSERT(sizeof(obj_type_priority)/sizeof(*obj_type_priority) == HWLOC_OBJ_TYPE_MAX);
+
+  /* make sure group are not entirely ignored */
+  assert(topology->type_filter[HWLOC_OBJ_GROUP] != HWLOC_TYPE_FILTER_KEEP_ALL);
+
+  /* make sure order arrays are coherent */
+  for(type=HWLOC_OBJ_TYPE_MIN; type<HWLOC_OBJ_TYPE_MAX; type++)
+    assert(obj_order_type[obj_type_order[type]] == type);
+  for(i=HWLOC_OBJ_TYPE_MIN; i<HWLOC_OBJ_TYPE_MAX; i++)
+    assert(obj_type_order[obj_order_type[i]] == i);
+
+  depth = hwloc_topology_get_depth(topology);
+
+  assert(!topology->modified);
+
+  /* check that first level is Machine.
+   * Root object cannot be ignored. And Machine can only be merged into PU,
+   * but there must be a NUMA node below Machine, and it cannot be below PU.
+   */
+  assert(hwloc_get_depth_type(topology, 0) == HWLOC_OBJ_MACHINE);
+
+  /* check that last level is PU and that it doesn't have memory */
+  assert(hwloc_get_depth_type(topology, depth-1) == HWLOC_OBJ_PU);
+  assert(hwloc_get_nbobjs_by_depth(topology, depth-1) > 0);
+  for(i=0; i<hwloc_get_nbobjs_by_depth(topology, depth-1); i++) {
+    obj = hwloc_get_obj_by_depth(topology, depth-1, i);
+    assert(obj);
+    assert(obj->type == HWLOC_OBJ_PU);
+    assert(!obj->memory_first_child);
+  }
+  /* check that other levels are not PU or Machine */
+  for(j=1; j<depth-1; j++) {
+    assert(hwloc_get_depth_type(topology, j) != HWLOC_OBJ_PU);
+    assert(hwloc_get_depth_type(topology, j) != HWLOC_OBJ_MACHINE);
+  }
+
+  /* check normal levels */
+  for(j=0; j<depth; j++) {
+    int d;
+    type = hwloc_get_depth_type(topology, j);
+    assert(type != HWLOC_OBJ_NUMANODE);
+    assert(type != HWLOC_OBJ_PCI_DEVICE);
+    assert(type != HWLOC_OBJ_BRIDGE);
+    assert(type != HWLOC_OBJ_OS_DEVICE);
+    assert(type != HWLOC_OBJ_MISC);
+    d = hwloc_get_type_depth(topology, type);
+    assert(d == j || d == HWLOC_TYPE_DEPTH_MULTIPLE);
+  }
+
+  /* check type depths, even if there's no such level */
+  for(type=HWLOC_OBJ_TYPE_MIN; type<HWLOC_OBJ_TYPE_MAX; type++) {
+    int d;
+    d = hwloc_get_type_depth(topology, type);
+    if (type == HWLOC_OBJ_NUMANODE) {
+      assert(d == HWLOC_TYPE_DEPTH_NUMANODE);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_NUMANODE);
+    } else if (type == HWLOC_OBJ_BRIDGE) {
+      assert(d == HWLOC_TYPE_DEPTH_BRIDGE);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_BRIDGE);
+    } else if (type == HWLOC_OBJ_PCI_DEVICE) {
+      assert(d == HWLOC_TYPE_DEPTH_PCI_DEVICE);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_PCI_DEVICE);
+    } else if (type == HWLOC_OBJ_OS_DEVICE) {
+      assert(d == HWLOC_TYPE_DEPTH_OS_DEVICE);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_OS_DEVICE);
+    } else if (type == HWLOC_OBJ_MISC) {
+      assert(d == HWLOC_TYPE_DEPTH_MISC);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_MISC);
+    } else {
+      assert(d >=0 || d == HWLOC_TYPE_DEPTH_UNKNOWN || d == HWLOC_TYPE_DEPTH_MULTIPLE);
+    }
+  }
+
+  /* top-level specific checks */
+  assert(hwloc_get_nbobjs_by_depth(topology, 0) == 1);
+  obj = hwloc_get_root_obj(topology);
+  assert(obj);
+  assert(!obj->parent);
+  assert(obj->cpuset);
+  assert(!obj->depth);
+
+  /* check that allowed sets are larger than the main sets */
+  if (topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM) {
+    assert(hwloc_bitmap_isincluded(topology->allowed_cpuset, obj->cpuset));
+    assert(hwloc_bitmap_isincluded(topology->allowed_nodeset, obj->nodeset));
+  } else {
+    assert(hwloc_bitmap_isequal(topology->allowed_cpuset, obj->cpuset));
+    assert(hwloc_bitmap_isequal(topology->allowed_nodeset, obj->nodeset));
+  }
+
+  /* check each level */
+  for(j=0; j<depth; j++)
+    hwloc__check_level(topology, j, NULL, NULL);
+  for(j=0; j<HWLOC_NR_SLEVELS; j++)
+    hwloc__check_level(topology, HWLOC_SLEVEL_TO_DEPTH(j), topology->slevels[j].first, topology->slevels[j].last);
+
+  /* recurse and check the tree of children, and type-specific checks */
+  gp_indexes = hwloc_bitmap_alloc(); /* TODO prealloc to topology->next_gp_index */
+  hwloc__check_object(topology, gp_indexes, obj);
+  hwloc_bitmap_free(gp_indexes);
+
+  /* recurse and check the nodesets of children */
+  set = hwloc_bitmap_alloc();
+  hwloc__check_nodesets(topology, obj, set);
+  hwloc_bitmap_free(set);
+}
+
+#else /* NDEBUG */
+
+/* Assertions are compiled out: checking is a no-op in NDEBUG builds. */
+void
+hwloc_topology_check(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+}
+
+#endif /* NDEBUG */
diff --git a/src/3rdparty/hwloc/src/traversal.c b/src/3rdparty/hwloc/src/traversal.c
new file mode 100644
index 000000000..9c5e6268c
--- /dev/null
+++ b/src/3rdparty/hwloc/src/traversal.c
@@ -0,0 +1,616 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif /* HAVE_STRINGS_H */
+
+/* Return the depth associated with the given object type, as cached in
+ * topology->type_depth, or HWLOC_TYPE_DEPTH_UNKNOWN for out-of-range
+ * type values. */
+int
+hwloc_get_type_depth (struct hwloc_topology *topology, hwloc_obj_type_t type)
+{
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_TYPE_MIN == 0);
+  if ((unsigned) type >= HWLOC_OBJ_TYPE_MAX)
+    return HWLOC_TYPE_DEPTH_UNKNOWN;
+  else
+    return topology->type_depth[type];
+}
+
+/* Return the type of objects at the given depth.  Virtual (negative)
+ * depths map directly to their special type; any other depth outside the
+ * normal levels yields HWLOC_OBJ_TYPE_NONE. */
+hwloc_obj_type_t
+hwloc_get_depth_type (hwloc_topology_t topology, int depth)
+{
+  if ((unsigned)depth >= topology->nb_levels)
+    switch (depth) {
+    case HWLOC_TYPE_DEPTH_NUMANODE:
+      return HWLOC_OBJ_NUMANODE;
+    case HWLOC_TYPE_DEPTH_BRIDGE:
+      return HWLOC_OBJ_BRIDGE;
+    case HWLOC_TYPE_DEPTH_PCI_DEVICE:
+      return HWLOC_OBJ_PCI_DEVICE;
+    case HWLOC_TYPE_DEPTH_OS_DEVICE:
+      return HWLOC_OBJ_OS_DEVICE;
+    case HWLOC_TYPE_DEPTH_MISC:
+      return HWLOC_OBJ_MISC;
+    default:
+      return HWLOC_OBJ_TYPE_NONE;
+    }
+  return topology->levels[depth][0]->type;
+}
+
+/* Return the depth of the normal parents of memory objects, or
+ * HWLOC_TYPE_DEPTH_MULTIPLE if NUMA nodes are attached to normal parents
+ * at different depths.  Walks up from each NUMA node past any intermediate
+ * memory objects to its first non-memory ancestor. */
+int
+hwloc_get_memory_parents_depth (hwloc_topology_t topology)
+{
+  int depth = HWLOC_TYPE_DEPTH_UNKNOWN;
+  /* memory leaves are always NUMA nodes for now, no need to check parents of other memory types */
+  hwloc_obj_t numa = hwloc_get_obj_by_depth(topology, HWLOC_TYPE_DEPTH_NUMANODE, 0);
+  assert(numa);
+  while (numa) {
+    hwloc_obj_t parent = numa->parent;
+    /* walk-up the memory hierarchy */
+    while (hwloc__obj_type_is_memory(parent->type))
+      parent = parent->parent;
+
+    if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+      depth = parent->depth;
+    else if (depth != parent->depth)
+      return HWLOC_TYPE_DEPTH_MULTIPLE;
+
+    numa = numa->next_cousin;
+  }
+
+  assert(depth >= 0);
+  return depth;
+}
+
+/* Return the number of objects at the given depth.  Special (negative)
+ * depths are served from the slevels arrays; any other out-of-range depth
+ * yields 0. */
+unsigned
+hwloc_get_nbobjs_by_depth (struct hwloc_topology *topology, int depth)
+{
+  if ((unsigned)depth >= topology->nb_levels) {
+    unsigned l = HWLOC_SLEVEL_FROM_DEPTH(depth);
+    if (l < HWLOC_NR_SLEVELS)
+      return topology->slevels[l].nbobjs;
+    else
+      return 0;
+  }
+  return topology->level_nbobjects[depth];
+}
+
+/* Return the idx-th object at the given depth, or NULL when depth or idx
+ * is out of range.  Special (negative) depths are served from the slevels
+ * arrays. */
+struct hwloc_obj *
+hwloc_get_obj_by_depth (struct hwloc_topology *topology, int depth, unsigned idx)
+{
+  if ((unsigned)depth >= topology->nb_levels) {
+    unsigned l = HWLOC_SLEVEL_FROM_DEPTH(depth);
+    if (l < HWLOC_NR_SLEVELS)
+      return idx < topology->slevels[l].nbobjs ? topology->slevels[l].objs[idx] : NULL;
+    else
+      return NULL;
+  }
+  if (idx >= topology->level_nbobjects[depth])
+    return NULL;
+  return topology->levels[depth][idx];
+}
+
+/* Public wrappers exposing the internal type-classification helpers. */
+
+int
+hwloc_obj_type_is_normal(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_normal(type);
+}
+
+int
+hwloc_obj_type_is_memory(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_memory(type);
+}
+
+int
+hwloc_obj_type_is_io(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_io(type);
+}
+
+int
+hwloc_obj_type_is_cache(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_cache(type);
+}
+
+int
+hwloc_obj_type_is_dcache(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_dcache(type);
+}
+
+int
+hwloc_obj_type_is_icache(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_icache(type);
+}
+
+/* Fill objs[] with up to max objects from src's own level, ordered by
+ * increasing distance from src (objects sharing a closer ancestor cpuset
+ * come first).  Returns the number stored; 0 if src has no cpuset. */
+unsigned hwloc_get_closest_objs (struct hwloc_topology *topology, struct hwloc_obj *src, struct hwloc_obj **objs, unsigned max)
+{
+  struct hwloc_obj *parent, *nextparent, **src_objs;
+  unsigned i,src_nbobjects;
+  unsigned stored = 0;
+
+  if (!src->cpuset)
+    return 0;
+
+  src_nbobjects = topology->level_nbobjects[src->depth];
+  src_objs = topology->levels[src->depth];
+
+  parent = src;
+  while (stored < max) {
+    /* climb until an ancestor with a strictly larger cpuset is found */
+    while (1) {
+      nextparent = parent->parent;
+      if (!nextparent)
+	goto out;
+      if (!hwloc_bitmap_isequal(parent->cpuset, nextparent->cpuset))
+	break;
+      parent = nextparent;
+    }
+
+    /* traverse src's objects and find those that are in nextparent and were not in parent */
+    for(i=0; i<src_nbobjects; i++) {
+      if (hwloc_bitmap_isincluded(src_objs[i]->cpuset, nextparent->cpuset)
+	  && !hwloc_bitmap_isincluded(src_objs[i]->cpuset, parent->cpuset)) {
+	objs[stored++] = src_objs[i];
+	if (stored == max)
+	  goto out;
+      }
+    }
+    parent = nextparent;
+  }
+
+ out:
+  return stored;
+}
+
+/* Recursive helper: append to *res the largest objects below current whose
+ * cpusets exactly cover the given set, decrementing *max for each slot used.
+ * Returns the number of objects stored by this invocation. */
+static int
+hwloc__get_largest_objs_inside_cpuset (struct hwloc_obj *current, hwloc_const_bitmap_t set,
+				       struct hwloc_obj ***res, int *max)
+{
+  int gotten = 0;
+  unsigned i;
+
+  /* the caller must ensure this */
+  if (*max <= 0)
+    return 0;
+
+  if (hwloc_bitmap_isequal(current->cpuset, set)) {
+    **res = current;
+    (*res)++;
+    (*max)--;
+    return 1;
+  }
+
+  for (i=0; i<current->arity; i++) {
+    hwloc_bitmap_t subset;
+    int ret;
+
+    /* split out the cpuset part corresponding to this child and see if there's anything to do */
+    if (!hwloc_bitmap_intersects(set,current->children[i]->cpuset))
+      continue;
+
+    subset = hwloc_bitmap_dup(set);
+    hwloc_bitmap_and(subset, subset, current->children[i]->cpuset);
+    ret = hwloc__get_largest_objs_inside_cpuset (current->children[i], subset, res, max);
+    gotten += ret;
+    hwloc_bitmap_free(subset);
+
+    /* if no more room to store remaining objects, return what we got so far */
+    if (!*max)
+      break;
+  }
+
+  return gotten;
+}
+
+/* Fill objs[] with up to max of the largest objects exactly covering set.
+ * Returns the number stored, or -1 if set is not included in the root
+ * object's cpuset. */
+int
+hwloc_get_largest_objs_inside_cpuset (struct hwloc_topology *topology, hwloc_const_bitmap_t set,
+				      struct hwloc_obj **objs, int max)
+{
+  struct hwloc_obj *current = topology->levels[0][0];
+
+  if (!hwloc_bitmap_isincluded(set, current->cpuset))
+    return -1;
+
+  if (max <= 0)
+    return 0;
+
+  return hwloc__get_largest_objs_inside_cpuset (current, set, &objs, &max);
+}
+
+/* Return a static human-readable name for an object type
+ * ("Unknown" for any value outside the known enum). */
+const char *
+hwloc_obj_type_string (hwloc_obj_type_t obj)
+{
+  switch (obj)
+    {
+    case HWLOC_OBJ_MACHINE: return "Machine";
+    case HWLOC_OBJ_MISC: return "Misc";
+    case HWLOC_OBJ_GROUP: return "Group";
+    case HWLOC_OBJ_NUMANODE: return "NUMANode";
+    case HWLOC_OBJ_PACKAGE: return "Package";
+    case HWLOC_OBJ_L1CACHE: return "L1Cache";
+    case HWLOC_OBJ_L2CACHE: return "L2Cache";
+    case HWLOC_OBJ_L3CACHE: return "L3Cache";
+    case HWLOC_OBJ_L4CACHE: return "L4Cache";
+    case HWLOC_OBJ_L5CACHE: return "L5Cache";
+    case HWLOC_OBJ_L1ICACHE: return "L1iCache";
+    case HWLOC_OBJ_L2ICACHE: return "L2iCache";
+    case HWLOC_OBJ_L3ICACHE: return "L3iCache";
+    case HWLOC_OBJ_CORE: return "Core";
+    case HWLOC_OBJ_BRIDGE: return "Bridge";
+    case HWLOC_OBJ_PCI_DEVICE: return "PCIDev";
+    case HWLOC_OBJ_OS_DEVICE: return "OSDev";
+    case HWLOC_OBJ_PU: return "PU";
+    default: return "Unknown";
+    }
+}
+
+/* Parse an object type, and optionally its attributes, from a string.
+ * Matching is by case-insensitive prefix (hwloc_strncasecmp with a fixed
+ * length), never against the terminating \0, so strings such as "core:2"
+ * still match "core". Branch order matters: osdev subtypes are tested
+ * first to keep "copro"/"co-pro" from being swallowed by "core", and
+ * "hostbridge"/"pcibridge" are tested before the bare "pci" prefix.
+ * On success *typep receives the type and, when attrp is non-NULL and
+ * attrsize is large enough for the relevant union member, the matching
+ * attribute fields are filled in (unmatched fields stay (unsigned)-1 /
+ * enum -1, i.e. "unspecified"). Returns 0 on success, -1 if the string
+ * matches no known type. */
+int
+hwloc_type_sscanf(const char *string, hwloc_obj_type_t *typep,
+		  union hwloc_obj_attr_u *attrp, size_t attrsize)
+{
+  hwloc_obj_type_t type = (hwloc_obj_type_t) -1;
+  unsigned depthattr = (unsigned) -1;
+  hwloc_obj_cache_type_t cachetypeattr = (hwloc_obj_cache_type_t) -1; /* unspecified */
+  hwloc_obj_bridge_type_t ubtype = (hwloc_obj_bridge_type_t) -1;
+  hwloc_obj_osdev_type_t ostype = (hwloc_obj_osdev_type_t) -1;
+  char *end;
+
+  /* never match the ending \0 since we want to match things like core:2 too.
+   * just use hwloc_strncasecmp() everywhere.
+   */
+
+  /* types without a custom depth */
+
+  /* osdev subtype first to avoid conflicts coproc/core etc */
+  if (!hwloc_strncasecmp(string, "os", 2)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+  } else if (!hwloc_strncasecmp(string, "bloc", 4)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_BLOCK;
+  } else if (!hwloc_strncasecmp(string, "net", 3)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_NETWORK;
+  } else if (!hwloc_strncasecmp(string, "openfab", 7)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_OPENFABRICS;
+  } else if (!hwloc_strncasecmp(string, "dma", 3)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_DMA;
+  } else if (!hwloc_strncasecmp(string, "gpu", 3)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_GPU;
+  } else if (!hwloc_strncasecmp(string, "copro", 5)
+	     || !hwloc_strncasecmp(string, "co-pro", 6)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_COPROC;
+
+  } else if (!hwloc_strncasecmp(string, "machine", 2)) {
+    type = HWLOC_OBJ_MACHINE;
+  } else if (!hwloc_strncasecmp(string, "node", 2)
+	     || !hwloc_strncasecmp(string, "numa", 2)) { /* matches node and numanode */
+    type = HWLOC_OBJ_NUMANODE;
+  } else if (!hwloc_strncasecmp(string, "package", 2)
+	     || !hwloc_strncasecmp(string, "socket", 2)) { /* backward compat with v1.10 */
+    type = HWLOC_OBJ_PACKAGE;
+  } else if (!hwloc_strncasecmp(string, "core", 2)) {
+    type = HWLOC_OBJ_CORE;
+  } else if (!hwloc_strncasecmp(string, "pu", 2)) {
+    type = HWLOC_OBJ_PU;
+  } else if (!hwloc_strncasecmp(string, "misc", 4)) {
+    type = HWLOC_OBJ_MISC;
+
+  } else if (!hwloc_strncasecmp(string, "bridge", 4)) {
+    type = HWLOC_OBJ_BRIDGE;
+  } else if (!hwloc_strncasecmp(string, "hostbridge", 6)) {
+    type = HWLOC_OBJ_BRIDGE;
+    ubtype = HWLOC_OBJ_BRIDGE_HOST;
+  } else if (!hwloc_strncasecmp(string, "pcibridge", 5)) {
+    type = HWLOC_OBJ_BRIDGE;
+    ubtype = HWLOC_OBJ_BRIDGE_PCI;
+
+  } else if (!hwloc_strncasecmp(string, "pci", 3)) {
+    type = HWLOC_OBJ_PCI_DEVICE;
+
+  /* types with depthattr */
+  /* "L<digits>[i|d]..." names a cache level: levels 1-5 unified/data,
+   * levels 1-3 instruction. Relies on the L1..L5 and L1i..L3i enum
+   * values being consecutive so the depth can be added to the base. */
+  } else if ((string[0] == 'l' || string[0] == 'L') && string[1] >= '0' && string[1] <= '9') {
+    depthattr = strtol(string+1, &end, 10);
+    if (*end == 'i') {
+      if (depthattr >= 1 && depthattr <= 3) {
+	type = HWLOC_OBJ_L1ICACHE + depthattr-1;
+	cachetypeattr = HWLOC_OBJ_CACHE_INSTRUCTION;
+      } else
+	return -1;
+    } else {
+      if (depthattr >= 1 && depthattr <= 5) {
+	type = HWLOC_OBJ_L1CACHE + depthattr-1;
+	cachetypeattr = *end == 'd' ? HWLOC_OBJ_CACHE_DATA : HWLOC_OBJ_CACHE_UNIFIED;
+      } else
+	return -1;
+    }
+
+  } else if (!hwloc_strncasecmp(string, "group", 2)) {
+    size_t length;
+    type = HWLOC_OBJ_GROUP;
+    /* accept an optional trailing depth, e.g. "group3" or "gr3" */
+    length = strcspn(string, "0123456789");
+    if (length <= 5 && !hwloc_strncasecmp(string, "group", length)
+	&& string[length] >= '0' && string[length] <= '9') {
+      depthattr = strtol(string+length, &end, 10);
+    }
+
+  } else
+    return -1;
+
+  /* export the type, plus attributes if the caller gave enough room */
+  *typep = type;
+  if (attrp) {
+    if (hwloc__obj_type_is_cache(type) && attrsize >= sizeof(attrp->cache)) {
+      attrp->cache.depth = depthattr;
+      attrp->cache.type = cachetypeattr;
+    } else if (type == HWLOC_OBJ_GROUP && attrsize >= sizeof(attrp->group)) {
+      attrp->group.depth = depthattr;
+    } else if (type == HWLOC_OBJ_BRIDGE && attrsize >= sizeof(attrp->bridge)) {
+      attrp->bridge.upstream_type = ubtype;
+      attrp->bridge.downstream_type = HWLOC_OBJ_BRIDGE_PCI; /* nothing else so far */
+    } else if (type == HWLOC_OBJ_OS_DEVICE && attrsize >= sizeof(attrp->osdev)) {
+      attrp->osdev.type = ostype;
+    }
+  }
+  return 0;
+}
+
+/* Like hwloc_type_sscanf() but additionally resolves the parsed type to
+ * a depth in the given topology. When the type is Group and several
+ * Group levels exist (hwloc_get_type_depth() returned
+ * HWLOC_TYPE_DEPTH_MULTIPLE), a depth attribute parsed from the string
+ * (e.g. "group2") selects the matching Group level; if no level matches,
+ * depth is HWLOC_TYPE_DEPTH_UNKNOWN. typep may be NULL when the caller
+ * only wants the depth. Returns 0 on success, or the negative error
+ * from hwloc_type_sscanf() if the string did not parse. */
+int
+hwloc_type_sscanf_as_depth(const char *string, hwloc_obj_type_t *typep,
+			   hwloc_topology_t topology, int *depthp)
+{
+  union hwloc_obj_attr_u attr;
+  hwloc_obj_type_t type;
+  int depth;
+  int err;
+
+  err = hwloc_type_sscanf(string, &type, &attr, sizeof(attr));
+  if (err < 0)
+    return err;
+
+  depth = hwloc_get_type_depth(topology, type);
+  if (type == HWLOC_OBJ_GROUP
+      && depth == HWLOC_TYPE_DEPTH_MULTIPLE
+      && attr.group.depth != (unsigned)-1) {
+    /* disambiguate among multiple Group levels: scan every level for a
+     * Group whose group.depth attribute equals the one we parsed */
+    unsigned l;
+    depth = HWLOC_TYPE_DEPTH_UNKNOWN;
+    for(l=0; l<topology->nb_levels; l++) {
+      if (topology->levels[l][0]->type == HWLOC_OBJ_GROUP
+	  && topology->levels[l][0]->attr->group.depth == attr.group.depth) {
+	depth = (int)l;
+	break;
+      }
+    }
+  }
+
+  if (typep)
+    *typep = type;
+  *depthp = depth;
+  return 0;
+}
+
+/* Suffix letter used when printing a cache type name:
+ * "" for unified, "d" for data, "i" for instruction
+ * (e.g. "L1dCache", "L2iCache"). */
+static const char* hwloc_obj_cache_type_letter(hwloc_obj_cache_type_t type)
+{
+  switch (type) {
+  case HWLOC_OBJ_CACHE_UNIFIED: return "";
+  case HWLOC_OBJ_CACHE_DATA: return "d";
+  case HWLOC_OBJ_CACHE_INSTRUCTION: return "i";
+  default: return "unknown";
+  }
+}
+
+/* Print the type name of obj into string (at most size bytes, including
+ * the NUL). Caches render as "L<depth><letter>" with a "Cache" suffix
+ * only when verbose; Groups append their depth attribute when known;
+ * bridges/PCI/OS devices render their subtype. Unknown subtypes/types
+ * produce an empty string and return 0. Otherwise returns the
+ * snprintf-style length. */
+int
+hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, int verbose)
+{
+  hwloc_obj_type_t type = obj->type;
+  switch (type) {
+  case HWLOC_OBJ_MISC:
+  case HWLOC_OBJ_MACHINE:
+  case HWLOC_OBJ_NUMANODE:
+  case HWLOC_OBJ_PACKAGE:
+  case HWLOC_OBJ_CORE:
+  case HWLOC_OBJ_PU:
+    return hwloc_snprintf(string, size, "%s", hwloc_obj_type_string(type));
+  case HWLOC_OBJ_L1CACHE:
+  case HWLOC_OBJ_L2CACHE:
+  case HWLOC_OBJ_L3CACHE:
+  case HWLOC_OBJ_L4CACHE:
+  case HWLOC_OBJ_L5CACHE:
+  case HWLOC_OBJ_L1ICACHE:
+  case HWLOC_OBJ_L2ICACHE:
+  case HWLOC_OBJ_L3ICACHE:
+    /* e.g. "L2dCache" when verbose, "L2d" otherwise */
+    return hwloc_snprintf(string, size, "L%u%s%s", obj->attr->cache.depth,
+			  hwloc_obj_cache_type_letter(obj->attr->cache.type),
+			  verbose ? "Cache" : "");
+  case HWLOC_OBJ_GROUP:
+    /* (unsigned)-1 means the group has no depth attribute set */
+    if (obj->attr->group.depth != (unsigned) -1)
+      return hwloc_snprintf(string, size, "%s%u", hwloc_obj_type_string(type), obj->attr->group.depth);
+    else
+      return hwloc_snprintf(string, size, "%s", hwloc_obj_type_string(type));
+  case HWLOC_OBJ_BRIDGE:
+    return hwloc_snprintf(string, size, obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI ? "PCIBridge" : "HostBridge");
+  case HWLOC_OBJ_PCI_DEVICE:
+    return hwloc_snprintf(string, size, "PCI");
+  case HWLOC_OBJ_OS_DEVICE:
+    switch (obj->attr->osdev.type) {
+    case HWLOC_OBJ_OSDEV_BLOCK: return hwloc_snprintf(string, size, "Block");
+    case HWLOC_OBJ_OSDEV_NETWORK: return hwloc_snprintf(string, size, verbose ? "Network" : "Net");
+    case HWLOC_OBJ_OSDEV_OPENFABRICS: return hwloc_snprintf(string, size, "OpenFabrics");
+    case HWLOC_OBJ_OSDEV_DMA: return hwloc_snprintf(string, size, "DMA");
+    case HWLOC_OBJ_OSDEV_GPU: return hwloc_snprintf(string, size, "GPU");
+    case HWLOC_OBJ_OSDEV_COPROC: return hwloc_snprintf(string, size, verbose ? "Co-Processor" : "CoProc");
+    default:
+      /* unknown OS device subtype: emit an empty string */
+      if (size > 0)
+	*string = '\0';
+      return 0;
+    }
+    break;
+  default:
+    /* unknown object type: emit an empty string */
+    if (size > 0)
+      *string = '\0';
+    return 0;
+  }
+}
+
+/* Print the attributes of obj into string (at most size bytes), fields
+ * joined by separator: memory sizes first, then type-specific
+ * attributes (cache geometry, bridge/PCI ids), then, when verbose, the
+ * object's info key=value pairs. Each section appends through the
+ * tmp/tmplen cursor while ret accumulates the snprintf-style total;
+ * prefix becomes separator once anything has been printed. Returns the
+ * would-be total length (may exceed size on truncation), or -1 if any
+ * snprintf failed. */
+int
+hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, const char * separator, int verbose)
+{
+  const char *prefix = "";
+  char *tmp = string;
+  ssize_t tmplen = size;
+  int ret = 0;
+  int res;
+
+  /* make sure we output at least an empty string */
+  if (size)
+    *string = '\0';
+
+  /* print memory attributes */
+  res = 0;
+  if (verbose) {
+    if (obj->type == HWLOC_OBJ_NUMANODE && obj->attr->numanode.local_memory)
+      res = hwloc_snprintf(tmp, tmplen, "%slocal=%lu%s%stotal=%lu%s",
+			   prefix,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->attr->numanode.local_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->attr->numanode.local_memory, verbose),
+			   separator,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->total_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->total_memory, verbose));
+    else if (obj->total_memory)
+      res = hwloc_snprintf(tmp, tmplen, "%stotal=%lu%s",
+			   prefix,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->total_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->total_memory, verbose));
+  } else {
+    if (obj->type == HWLOC_OBJ_NUMANODE && obj->attr->numanode.local_memory)
+      res = hwloc_snprintf(tmp, tmplen, "%s%lu%s",
+			   prefix,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->attr->numanode.local_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->attr->numanode.local_memory, verbose));
+  }
+  if (res < 0)
+    return -1;
+  ret += res;
+  if (ret > 0)
+    prefix = separator;
+  /* clamp the cursor advance so tmp never moves past the NUL on truncation */
+  if (res >= tmplen)
+    res = tmplen>0 ? (int)tmplen - 1 : 0;
+  tmp += res;
+  tmplen -= res;
+
+  /* printf type-specific attributes */
+  res = 0;
+  switch (obj->type) {
+  case HWLOC_OBJ_L1CACHE:
+  case HWLOC_OBJ_L2CACHE:
+  case HWLOC_OBJ_L3CACHE:
+  case HWLOC_OBJ_L4CACHE:
+  case HWLOC_OBJ_L5CACHE:
+  case HWLOC_OBJ_L1ICACHE:
+  case HWLOC_OBJ_L2ICACHE:
+  case HWLOC_OBJ_L3ICACHE:
+    if (verbose) {
+      char assoc[32];
+      /* associativity: -1 = fully associative, 0 = unknown (omitted) */
+      if (obj->attr->cache.associativity == -1)
+	snprintf(assoc, sizeof(assoc), "%sfully-associative", separator);
+      else if (obj->attr->cache.associativity == 0)
+	*assoc = '\0';
+      else
+	snprintf(assoc, sizeof(assoc), "%sways=%d", separator, obj->attr->cache.associativity);
+      res = hwloc_snprintf(tmp, tmplen, "%ssize=%lu%s%slinesize=%u%s",
+			   prefix,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->attr->cache.size, verbose),
+			   hwloc_memory_size_printf_unit(obj->attr->cache.size, verbose),
+			   separator, obj->attr->cache.linesize,
+			   assoc);
+    } else
+      res = hwloc_snprintf(tmp, tmplen, "%s%lu%s",
+			   prefix,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->attr->cache.size, verbose),
+			   hwloc_memory_size_printf_unit(obj->attr->cache.size, verbose));
+    break;
+  case HWLOC_OBJ_BRIDGE:
+    if (verbose) {
+      char up[128], down[64];
+      /* upstream is PCI or HOST */
+      if (obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI) {
+        char linkspeed[64]= "";
+        if (obj->attr->pcidev.linkspeed)
+          snprintf(linkspeed, sizeof(linkspeed), "%slink=%.2fGB/s", separator, obj->attr->pcidev.linkspeed);
+	snprintf(up, sizeof(up), "busid=%04x:%02x:%02x.%01x%sid=%04x:%04x%sclass=%04x(%s)%s",
+		 obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func, separator,
+		 obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id, separator,
+		 obj->attr->pcidev.class_id, hwloc_pci_class_string(obj->attr->pcidev.class_id), linkspeed);
+      } else
+        *up = '\0';
+      /* downstream is_PCI */
+      snprintf(down, sizeof(down), "buses=%04x:[%02x-%02x]",
+	       obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus);
+      /* NOTE(review): prints into string/size rather than tmp/tmplen;
+       * presumably safe because no memory attrs are printed above for
+       * bridges, so tmp==string here — confirm against upstream hwloc */
+      if (*up)
+	res = hwloc_snprintf(string, size, "%s%s%s", up, separator, down);
+      else
+	res = hwloc_snprintf(string, size, "%s", down);
+    }
+    break;
+  case HWLOC_OBJ_PCI_DEVICE:
+    if (verbose) {
+      char linkspeed[64]= "";
+      if (obj->attr->pcidev.linkspeed)
+        snprintf(linkspeed, sizeof(linkspeed), "%slink=%.2fGB/s", separator, obj->attr->pcidev.linkspeed);
+      /* NOTE(review): also uses string/size directly, same rationale as
+       * the bridge case above */
+      res = hwloc_snprintf(string, size, "busid=%04x:%02x:%02x.%01x%sid=%04x:%04x%sclass=%04x(%s)%s",
+			   obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func, separator,
+			   obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id, separator,
+			   obj->attr->pcidev.class_id, hwloc_pci_class_string(obj->attr->pcidev.class_id), linkspeed);
+    }
+    break;
+  default:
+    break;
+  }
+  if (res < 0)
+    return -1;
+  ret += res;
+  if (ret > 0)
+    prefix = separator;
+  if (res >= tmplen)
+    res = tmplen>0 ? (int)tmplen - 1 : 0;
+  tmp += res;
+  tmplen -= res;
+
+  /* printf infos */
+  if (verbose) {
+    unsigned i;
+    for(i=0; i<obj->infos_count; i++) {
+      struct hwloc_info_s *info = &obj->infos[i];
+      /* quote values that contain a space */
+      const char *quote = strchr(info->value, ' ') ? "\"" : "";
+      res = hwloc_snprintf(tmp, tmplen, "%s%s=%s%s%s",
+			     prefix,
+			     info->name,
+			     quote, info->value, quote);
+      if (res < 0)
+        return -1;
+      ret += res;
+      if (res >= tmplen)
+        res = tmplen>0 ? (int)tmplen - 1 : 0;
+      tmp += res;
+      tmplen -= res;
+      if (ret > 0)
+        prefix = separator;
+    }
+  }
+
+  return ret;
+}
diff --git a/src/backend/cpu/cpu.cmake b/src/backend/cpu/cpu.cmake
index b685d7e4b..1072df088 100644
--- a/src/backend/cpu/cpu.cmake
+++ b/src/backend/cpu/cpu.cmake
@@ -19,12 +19,18 @@ set(SOURCES_BACKEND_CPU
 
 
 if (WITH_HWLOC)
-    find_package(HWLOC REQUIRED)
+    if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+        add_subdirectory(src/3rdparty/hwloc)
+        include_directories(src/3rdparty/hwloc/include)
+        set(CPUID_LIB hwloc)
+    else()
+        find_package(HWLOC REQUIRED)
+        include_directories(${HWLOC_INCLUDE_DIR})
+        set(CPUID_LIB ${HWLOC_LIBRARY})
+    endif()
 
     set(WITH_LIBCPUID OFF)
 
-    include_directories(${HWLOC_INCLUDE_DIR})
-
     remove_definitions(/DXMRIG_FEATURE_LIBCPUID)
     add_definitions(/DXMRIG_FEATURE_HWLOC)
 
@@ -32,7 +38,6 @@ if (WITH_HWLOC)
         add_definitions(/DXMRIG_HWLOC_DEBUG)
     endif()
 
-    set(CPUID_LIB "")
     set(SOURCES_CPUID
         src/backend/cpu/platform/BasicCpuInfo.cpp
         src/backend/cpu/platform/BasicCpuInfo.h