From d1c5f18373834a35b4fcece0ec4037bcedff7022 Mon Sep 17 00:00:00 2001 From: "Andrew J. Hesford" Date: Thu, 25 Feb 2021 09:24:17 -0500 Subject: [PATCH] openmpi: backport AVX improvements, make mca_op_avx work --- srcpkgs/openmpi/patches/avx.patch | 1682 +++++++++++++++++++++++++++++ srcpkgs/openmpi/template | 9 +- 2 files changed, 1687 insertions(+), 4 deletions(-) create mode 100644 srcpkgs/openmpi/patches/avx.patch diff --git a/srcpkgs/openmpi/patches/avx.patch b/srcpkgs/openmpi/patches/avx.patch new file mode 100644 index 00000000000..7088fbd4636 --- /dev/null +++ b/srcpkgs/openmpi/patches/avx.patch @@ -0,0 +1,1682 @@ +From 6de8dfd236b7180b04c911d2306c3b805abf7182 Mon Sep 17 00:00:00 2001 +From: George Bosilca +Date: Mon, 28 Dec 2020 15:36:05 -0500 +Subject: [PATCH 1/3] Major update to the AVX* detection and support + +1. Consistent march flag order between configure and make. + +2. op/avx: give the option to skip some tests + +it is possible to skip some intrinsic tests by setting some environment variables to "no" before invoking configure: + - ompi_cv_op_avx_check_avx512 + - ompi_cv_op_avx_check_avx2 + - ompi_cv_op_avx_check_avx + - ompi_cv_op_avx_check_sse41 + - ompi_cv_op_avx_check_sse3 + +3. op/avx: update AVX512 flags + +try +-mavx512f -mavx512bw -mavx512vl -mavx512dq +instead of +-march=skylake-avx512 + +since the former is less likely to conflict with user provided CFLAGS +(e.g. -march=...) + +Thanks Bart Oldeman for pointing this. + +4. op/avx: have the op/avx library depend on libmpi.so + +Refs. open-mpi/ompi#8323 + +Signed-off-by: Gilles Gouaillardet +Signed-off-by: George Bosilca +--- + ompi/mca/op/avx/Makefile.am | 4 +- + ompi/mca/op/avx/configure.m4 | 325 ++++++++++++++++++----------------- + 2 files changed, 174 insertions(+), 155 deletions(-) + +diff --git ompi/mca/op/avx/Makefile.am ompi/mca/op/avx/Makefile.am +index 41dcf2e1834..b1d84d90b33 100644 +--- ompi/mca/op/avx/Makefile.am ++++ ompi/mca/op/avx/Makefile.am +@@ -2,7 +2,7 @@ + # Copyright (c) 2019-2020 The University of Tennessee and The University + # of Tennessee Research Foundation. All rights + # reserved. +-# Copyright (c) 2020 Research Organization for Information Science ++# Copyright (c) 2020-2021 Research Organization for Information Science + # and Technology (RIST). All rights reserved. + # $COPYRIGHT$ + # +@@ -86,7 +86,7 @@ mcacomponentdir = $(ompilibdir) + mcacomponent_LTLIBRARIES = $(component_install) + mca_op_avx_la_SOURCES = $(sources) + mca_op_avx_la_LIBADD = $(specialized_op_libs) +-mca_op_avx_la_LDFLAGS = -module -avoid-version ++mca_op_avx_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + + + # Specific information for static builds. +diff --git ompi/mca/op/avx/configure.m4 ompi/mca/op/avx/configure.m4 +index 09d8b374c8e..f61b7100ef4 100644 +--- ompi/mca/op/avx/configure.m4 ++++ ompi/mca/op/avx/configure.m4 +@@ -29,6 +29,13 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + op_avx_support=0 + op_avx2_support=0 + op_avx512_support=0 ++ ++ AS_VAR_PUSHDEF([op_avx_check_sse3], [ompi_cv_op_avx_check_sse3]) ++ AS_VAR_PUSHDEF([op_avx_check_sse41], [ompi_cv_op_avx_check_sse41]) ++ AS_VAR_PUSHDEF([op_avx_check_avx], [ompi_cv_op_avx_check_avx]) ++ AS_VAR_PUSHDEF([op_avx_check_avx2], [ompi_cv_op_avx_check_avx2]) ++ AS_VAR_PUSHDEF([op_avx_check_avx512], [ompi_cv_op_avx_check_avx512]) ++ + OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save]) + + AS_IF([test "$opal_cv_asm_arch" = "X86_64"], +@@ -37,21 +44,9 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + # + # Check for AVX512 support + # +- AC_MSG_CHECKING([for AVX512 support (no additional flags)]) +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ +- __m512 vA, vB; +- _mm512_add_ps(vA, vB) +- ]])], +- [op_avx512_support=1 +- AC_MSG_RESULT([yes])], +- [AC_MSG_RESULT([no])]) +- +- AS_IF([test $op_avx512_support -eq 0], +- [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)]) +- op_avx_cflags_save="$CFLAGS" +- CFLAGS="$CFLAGS -march=skylake-avx512" ++ AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes)) ++ AS_IF([test "$op_avx_check_avx512" = "yes"], ++ [AC_MSG_CHECKING([for AVX512 support (no additional flags)]) + AC_LINK_IFELSE( + [AC_LANG_PROGRAM([[#include ]], + [[ +@@ -59,99 +54,115 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + _mm512_add_ps(vA, vB) + ]])], + [op_avx512_support=1 +- MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512" + AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])]) +- CFLAGS="$op_avx_cflags_save" +- ]) +- # +- # Some combination of gcc and older as would not correctly build the code generated by +- # _mm256_loadu_si256. Screen them out. +- # +- AS_IF([test $op_avx512_support -eq 1], +- [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled]) +- op_avx_cflags_save="$CFLAGS" +- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS" +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ ++ ++ AS_IF([test $op_avx512_support -eq 0], ++ [AC_MSG_CHECKING([for AVX512 support (with -mavx512f -mavx512bw -mavx512vl -mavx512dq)]) ++ op_avx_cflags_save="$CFLAGS" ++ CFLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq $CFLAGS" ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ ++ __m512 vA, vB; ++ _mm512_add_ps(vA, vB) ++ ]])], ++ [op_avx512_support=1 ++ MCA_BUILD_OP_AVX512_FLAGS="-mavx512f -mavx512bw -mavx512vl -mavx512dq" ++ AC_MSG_RESULT([yes])], ++ [AC_MSG_RESULT([no])]) ++ CFLAGS="$op_avx_cflags_save" ++ ]) ++ # ++ # Some combination of gcc and older as would not correctly build the code generated by ++ # _mm256_loadu_si256. Screen them out. ++ # ++ AS_IF([test $op_avx512_support -eq 1], ++ [AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled]) ++ op_avx_cflags_save="$CFLAGS" ++ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS" ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ + int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + __m512i vA = _mm512_loadu_si512((__m512i*)&(A[1])) +- ]])], +- [AC_MSG_RESULT([yes])], +- [op_avx512_support=0 +- MCA_BUILD_OP_AVX512_FLAGS="" +- AC_MSG_RESULT([no])]) +- CFLAGS="$op_avx_cflags_save" +- ]) +- # +- # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out. +- # +- AS_IF([test $op_avx512_support -eq 1], +- [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled]) +- op_avx_cflags_save="$CFLAGS" +- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS" +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ ++ ]])], ++ [AC_MSG_RESULT([yes])], ++ [op_avx512_support=0 ++ MCA_BUILD_OP_AVX512_FLAGS="" ++ AC_MSG_RESULT([no])]) ++ CFLAGS="$op_avx_cflags_save" ++ ]) ++ # ++ # Some PGI compilers do not define _mm512_mullo_epi64. Screen them out. ++ # ++ AS_IF([test $op_avx512_support -eq 1], ++ [AC_MSG_CHECKING([if _mm512_mullo_epi64 generates code that can be compiled]) ++ op_avx_cflags_save="$CFLAGS" ++ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS" ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ + __m512i vA, vB; + _mm512_mullo_epi64(vA, vB) +- ]])], +- [AC_MSG_RESULT([yes])], +- [op_avx512_support=0 +- MCA_BUILD_OP_AVX512_FLAGS="" +- AC_MSG_RESULT([no])]) +- CFLAGS="$op_avx_cflags_save" +- ]) ++ ]])], ++ [AC_MSG_RESULT([yes])], ++ [op_avx512_support=0 ++ MCA_BUILD_OP_AVX512_FLAGS="" ++ AC_MSG_RESULT([no])]) ++ CFLAGS="$op_avx_cflags_save" ++ ])]) + # + # Check support for AVX2 + # +- AC_MSG_CHECKING([for AVX2 support (no additional flags)]) +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ ++ AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes)) ++ AS_IF([test "$op_avx_check_avx2" = "yes"], ++ [AC_MSG_CHECKING([for AVX2 support (no additional flags)]) ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ + __m256 vA, vB; + _mm256_add_ps(vA, vB) +- ]])], +- [op_avx2_support=1 +- AC_MSG_RESULT([yes])], +- [AC_MSG_RESULT([no])]) +- AS_IF([test $op_avx2_support -eq 0], +- [AC_MSG_CHECKING([for AVX2 support (with -mavx2)]) +- op_avx_cflags_save="$CFLAGS" +- CFLAGS="$CFLAGS -mavx2" +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ ++ ]])], ++ [op_avx2_support=1 ++ AC_MSG_RESULT([yes])], ++ [AC_MSG_RESULT([no])]) ++ AS_IF([test $op_avx2_support -eq 0], ++ [AC_MSG_CHECKING([for AVX2 support (with -mavx2)]) ++ op_avx_cflags_save="$CFLAGS" ++ CFLAGS="-mavx2 $CFLAGS" ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ + __m256 vA, vB; + _mm256_add_ps(vA, vB) +- ]])], +- [op_avx2_support=1 +- MCA_BUILD_OP_AVX2_FLAGS="-mavx2" +- AC_MSG_RESULT([yes])], +- [AC_MSG_RESULT([no])]) +- CFLAGS="$op_avx_cflags_save" +- ]) +- # +- # Some combination of gcc and older as would not correctly build the code generated by +- # _mm256_loadu_si256. Screen them out. +- # +- AS_IF([test $op_avx2_support -eq 1], +- [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled]) +- op_avx_cflags_save="$CFLAGS" +- CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS" +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ ++ ]])], ++ [op_avx2_support=1 ++ MCA_BUILD_OP_AVX2_FLAGS="-mavx2" ++ AC_MSG_RESULT([yes])], ++ [AC_MSG_RESULT([no])]) ++ CFLAGS="$op_avx_cflags_save" ++ ]) ++ # ++ # Some combination of gcc and older as would not correctly build the code generated by ++ # _mm256_loadu_si256. Screen them out. ++ # ++ AS_IF([test $op_avx2_support -eq 1], ++ [AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled]) ++ op_avx_cflags_save="$CFLAGS" ++ CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS" ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ + int A[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + __m256i vA = _mm256_loadu_si256((__m256i*)&A) +- ]])], +- [AC_MSG_RESULT([yes])], +- [op_avx2_support=0 +- MCA_BUILD_OP_AVX2_FLAGS="" +- AC_MSG_RESULT([no])]) +- CFLAGS="$op_avx_cflags_save" +- ]) ++ ]])], ++ [AC_MSG_RESULT([yes])], ++ [op_avx2_support=0 ++ MCA_BUILD_OP_AVX2_FLAGS="" ++ AC_MSG_RESULT([no])]) ++ CFLAGS="$op_avx_cflags_save" ++ ])]) + # + # What about early AVX support. The rest of the logic is slightly different as + # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check +@@ -160,90 +171,92 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3 + # instructions. + # +- AC_MSG_CHECKING([for AVX support (no additional flags)]) +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ ++ AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes)) ++ AS_IF([test "$op_avx_check_avx" = "yes"], ++ [AC_MSG_CHECKING([for AVX support (no additional flags)]) ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ + __m128 vA, vB; + _mm_add_ps(vA, vB) +- ]])], +- [op_avx_support=1 +- AC_MSG_RESULT([yes])], +- [AC_MSG_RESULT([no])]) ++ ]])], ++ [op_avx_support=1 ++ AC_MSG_RESULT([yes])], ++ [AC_MSG_RESULT([no])])]) + # + # Check for SSE4.1 support + # +- AS_IF([test $op_avx_support -eq 1], +- [AC_MSG_CHECKING([for SSE4.1 support]) +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ ++ AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes)) ++ AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"], ++ [AC_MSG_CHECKING([for SSE4.1 support]) ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ + __m128i vA, vB; + (void)_mm_max_epi8(vA, vB) +- ]])], +- [op_sse41_support=1 +- AC_MSG_RESULT([yes])], +- [AC_MSG_RESULT([no])]) +- ]) ++ ]])], ++ [op_sse41_support=1 ++ AC_MSG_RESULT([yes])], ++ [AC_MSG_RESULT([no])]) ++ ]) + # + # Check for SSE3 support + # +- AS_IF([test $op_avx_support -eq 1], +- [AC_MSG_CHECKING([for SSE3 support]) +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ ++ AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes)) ++ AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"], ++ [AC_MSG_CHECKING([for SSE3 support]) ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ + int A[4] = {0, 1, 2, 3}; + __m128i vA = _mm_lddqu_si128((__m128i*)&A) +- ]])], +- [op_sse3_support=1 +- AC_MSG_RESULT([yes])], +- [AC_MSG_RESULT([no])]) +- ]) ++ ]])], ++ [op_sse3_support=1 ++ AC_MSG_RESULT([yes])], ++ [AC_MSG_RESULT([no])]) ++ ]) + # Second pass, do we need to add the AVX flag ? + AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0], +- [AC_MSG_CHECKING([for AVX support (with -mavx)]) +- op_avx_cflags_save="$CFLAGS" +- CFLAGS="$CFLAGS -mavx" +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ ++ [AS_IF([test "$op_avx_check_avx" = "yes"], ++ [AC_MSG_CHECKING([for AVX support (with -mavx)]) ++ op_avx_cflags_save="$CFLAGS" ++ CFLAGS="-mavx $CFLAGS" ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ + __m128 vA, vB; + _mm_add_ps(vA, vB) + ]])], +- [op_avx_support=1 +- MCA_BUILD_OP_AVX_FLAGS="-mavx" +- op_sse41_support=0 +- op_sse3_support=0 +- AC_MSG_RESULT([yes])], +- [AC_MSG_RESULT([no])]) ++ [op_avx_support=1 ++ MCA_BUILD_OP_AVX_FLAGS="-mavx" ++ op_sse41_support=0 ++ op_sse3_support=0 ++ AC_MSG_RESULT([yes])], ++ [AC_MSG_RESULT([no])])]) + +- AS_IF([test $op_sse41_support -eq 0], +- [AC_MSG_CHECKING([for SSE4.1 support]) +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], +- [[ ++ AS_IF([test "$op_avx_check_sse41" = "yes" && test $op_sse41_support -eq 0], ++ [AC_MSG_CHECKING([for SSE4.1 support]) ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], ++ [[ + __m128i vA, vB; + (void)_mm_max_epi8(vA, vB) +- ]])], +- [op_sse41_support=1 +- AC_MSG_RESULT([yes])], +- [AC_MSG_RESULT([no])]) +- ]) +- AS_IF([test $op_sse3_support -eq 0], +- [AC_MSG_CHECKING([for SSE3 support]) +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM([[#include ]], ++ ]])], ++ [op_sse41_support=1 ++ AC_MSG_RESULT([yes])], ++ [AC_MSG_RESULT([no])])]) ++ AS_IF([test "$op_avx_check_sse3" = "yes" && test $op_sse3_support -eq 0], ++ [AC_MSG_CHECKING([for SSE3 support]) ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM([[#include ]], + [[ + int A[4] = {0, 1, 2, 3}; + __m128i vA = _mm_lddqu_si128((__m128i*)&A) + ]])], +- [op_sse3_support=1 +- AC_MSG_RESULT([yes])], +- [AC_MSG_RESULT([no])]) +- ]) +- CFLAGS="$op_avx_cflags_save" +- ]) ++ [op_sse3_support=1 ++ AC_MSG_RESULT([yes])], ++ [AC_MSG_RESULT([no])])]) ++ CFLAGS="$op_avx_cflags_save"]) + + AC_LANG_POP([C]) + ]) +@@ -276,6 +289,12 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS) + AC_SUBST(MCA_BUILD_OP_AVX_FLAGS) + ++ AS_VAR_POPDEF([op_avx_check_avx512]) ++ AS_VAR_POPDEF([op_avx_check_avx2]) ++ AS_VAR_POPDEF([op_avx_check_avx]) ++ AS_VAR_POPDEF([op_avx_check_sse41]) ++ AS_VAR_POPDEF([op_avx_check_sse3]) ++ + OPAL_VAR_SCOPE_POP + # Enable this component iff we have at least the most basic form of support + # for vectorial ISA + +From 1c386327da5757632392aa6c967acfbebc85c7ff Mon Sep 17 00:00:00 2001 +From: George Bosilca +Date: Mon, 28 Dec 2020 12:18:07 -0500 +Subject: [PATCH 2/3] AVX code generation improvements + +1. Allow fallback to a lesser AVX support during make + +Due to the fact that some distro restrict the compiule architecture +during make (while not setting any restrictions during configure) we +need to detect the target architecture also during make in order to +restrict the code we generate. + +2. Add comments and better protect the arch specific code. + +Identify all the vectorial functions used and clasify them according to +the neccesary hardware capabilities. +Use these requirements to protect the code for load and stores (the rest +of the code being automatically generated it is more difficult to +protect). + +3. Correctly check for AVX* support. + +Signed-off-by: George Bosilca +--- + ompi/mca/op/avx/configure.m4 | 28 +-- + ompi/mca/op/avx/op_avx_functions.c | 322 ++++++++++++++++++++++++----- + 2 files changed, 288 insertions(+), 62 deletions(-) + +diff --git ompi/mca/op/avx/configure.m4 ompi/mca/op/avx/configure.m4 +index f61b7100ef4..f3651f09d43 100644 +--- ompi/mca/op/avx/configure.m4 ++++ ompi/mca/op/avx/configure.m4 +@@ -44,7 +44,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + # + # Check for AVX512 support + # +- AC_CACHE_CHECK([if we are checking for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes)) ++ AC_CACHE_CHECK([for AVX512 support], op_avx_check_avx512, AS_VAR_SET(op_avx_check_avx512, yes)) + AS_IF([test "$op_avx_check_avx512" = "yes"], + [AC_MSG_CHECKING([for AVX512 support (no additional flags)]) + AC_LINK_IFELSE( +@@ -115,14 +115,14 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + # + # Check support for AVX2 + # +- AC_CACHE_CHECK([if we are checking for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes)) ++ AC_CACHE_CHECK([for AVX2 support], op_avx_check_avx2, AS_VAR_SET(op_avx_check_avx2, yes)) + AS_IF([test "$op_avx_check_avx2" = "yes"], + [AC_MSG_CHECKING([for AVX2 support (no additional flags)]) + AC_LINK_IFELSE( + [AC_LANG_PROGRAM([[#include ]], + [[ +- __m256 vA, vB; +- _mm256_add_ps(vA, vB) ++ __m256i vA, vB, vC; ++ vC = _mm256_and_si256(vA, vB) + ]])], + [op_avx2_support=1 + AC_MSG_RESULT([yes])], +@@ -134,8 +134,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + AC_LINK_IFELSE( + [AC_LANG_PROGRAM([[#include ]], + [[ +- __m256 vA, vB; +- _mm256_add_ps(vA, vB) ++ __m256i vA, vB, vC; ++ vC = _mm256_and_si256(vA, vB) + ]])], + [op_avx2_support=1 + MCA_BUILD_OP_AVX2_FLAGS="-mavx2" +@@ -164,21 +164,21 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + CFLAGS="$op_avx_cflags_save" + ])]) + # +- # What about early AVX support. The rest of the logic is slightly different as ++ # What about early AVX support? The rest of the logic is slightly different as + # we need to include some of the SSE4.1 and SSE3 instructions. So, we first check + # if we can compile AVX code without a flag, then we validate that we have support + # for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of + # the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3 + # instructions. + # +- AC_CACHE_CHECK([if we are checking for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes)) ++ AC_CACHE_CHECK([for AVX support], op_avx_check_avx, AS_VAR_SET(op_avx_check_avx, yes)) + AS_IF([test "$op_avx_check_avx" = "yes"], + [AC_MSG_CHECKING([for AVX support (no additional flags)]) + AC_LINK_IFELSE( + [AC_LANG_PROGRAM([[#include ]], + [[ +- __m128 vA, vB; +- _mm_add_ps(vA, vB) ++ __m256 vA, vB, vC; ++ vC = _mm256_add_ps(vA, vB) + ]])], + [op_avx_support=1 + AC_MSG_RESULT([yes])], +@@ -186,7 +186,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + # + # Check for SSE4.1 support + # +- AC_CACHE_CHECK([if we are checking for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes)) ++ AC_CACHE_CHECK([for SSE4.1 support], op_avx_check_sse41, AS_VAR_SET(op_avx_check_sse41, yes)) + AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse41" = "yes"], + [AC_MSG_CHECKING([for SSE4.1 support]) + AC_LINK_IFELSE( +@@ -202,7 +202,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + # + # Check for SSE3 support + # +- AC_CACHE_CHECK([if we are checking for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes)) ++ AC_CACHE_CHECK([for SSE3 support], op_avx_check_sse3, AS_VAR_SET(op_avx_check_sse3, yes)) + AS_IF([test $op_avx_support -eq 1 && test "$op_avx_check_sse3" = "yes"], + [AC_MSG_CHECKING([for SSE3 support]) + AC_LINK_IFELSE( +@@ -224,8 +224,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[ + AC_LINK_IFELSE( + [AC_LANG_PROGRAM([[#include ]], + [[ +- __m128 vA, vB; +- _mm_add_ps(vA, vB) ++ __m256 vA, vB, vC; ++ vC = _mm256_add_ps(vA, vB) + ]])], + [op_avx_support=1 + MCA_BUILD_OP_AVX_FLAGS="-mavx" +diff --git ompi/mca/op/avx/op_avx_functions.c ompi/mca/op/avx/op_avx_functions.c +index c79a8b34506..575ebf95d6a 100644 +--- ompi/mca/op/avx/op_avx_functions.c ++++ ompi/mca/op/avx/op_avx_functions.c +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 2019-2020 The University of Tennessee and The University ++ * Copyright (c) 2019-2021 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Research Organization for Information Science +@@ -24,16 +24,42 @@ + #include "ompi/mca/op/avx/op_avx.h" + + #include +- ++/** ++ * The following logic is necessary to cope with distro maintainer's desire to change the compilation ++ * flags after the configure step, leading to inconsistencies between what OMPI has detected and what ++ * code can be generated during make. If we detect that the current code generation architecture has ++ * been changed from our own setting and cannot generate the code we need (AVX512, AVX2) we fall back ++ * to a lesser support (AVX512 -> AVX2, AVX2 -> AVX, AVX -> error out). ++ */ + #if defined(GENERATE_AVX512_CODE) +-#define PREPEND _avx512 +-#elif defined(GENERATE_AVX2_CODE) +-#define PREPEND _avx2 +-#elif defined(GENERATE_AVX_CODE) +-#define PREPEND _avx +-#else +-#error This file should not be compiled in this conditions +-#endif ++# if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__) ++# define PREPEND _avx512 ++# else ++# undef GENERATE_AVX512_CODE ++# endif /* defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512VL__) */ ++#endif /* defined(GENERATE_AVX512_CODE) */ ++ ++#if !defined(PREPEND) && defined(GENERATE_AVX2_CODE) ++# if defined(__AVX2__) ++# define PREPEND _avx2 ++# else ++# undef GENERATE_AVX2_CODE ++# endif /* defined(__AVX2__) */ ++#endif /* !defined(PREPEND) && defined(GENERATE_AVX2_CODE) */ ++ ++#if !defined(PREPEND) && defined(GENERATE_AVX_CODE) ++# if defined(__AVX__) ++# define PREPEND _avx ++# endif ++#endif /* !defined(PREPEND) && defined(GENERATE_AVX_CODE) */ ++ ++#if !defined(PREPEND) ++# if OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2 ++# error The configure step has detected possible support for AVX512 and/or AVX2 but the compiler flags during make are too restrictive. Please disable the AVX component by adding --enable-mca-no-build=op-avx to your configure step. ++# else ++# error This file should not be compiled in this conditions. Please provide the config.log file to the OMPI developers. ++# endif /* OMPI_MCA_OP_HAVE_AVX512 || OMPI_MCA_OP_HAVE_AVX2 */ ++#endif /* !defined(PREPEND) */ + + /* + * Concatenate preprocessor tokens A and B without expanding macro definitions +@@ -46,6 +72,102 @@ + */ + #define OP_CONCAT(A, B) OP_CONCAT_NX(A, B) + ++/* ++ * grep -e "_mm[125][251][862]_.*(" avx512.c -o | sed 's/(//g' | sort | uniq ++ * ++ * https://software.intel.com/sites/landingpage/IntrinsicsGuide ++ * ++ * _mm_add_epi[8,16,32,64] SSE2 ++ * _mm_add_pd SSE2 ++ * _mm_add_ps SSE ++ * _mm_adds_epi[8,16] SSE2 ++ * _mm_adds_epu[8,16] SSE2 ++ * _mm_and_si128 SSE2 ++ * _mm_lddqu_si128 SSE3 ++ * _mm_loadu_pd SSE2 ++ * _mm_loadu_ps SSE ++ * _mm_max_epi8 SSE4.1 ++ * _mm_max_epi16 SSE2 ++ * _mm_max_epi32 SSE4.1 ++ * _mm_max_epi64 AVX512VL + AVX512F ++ * _mm_max_epu8 SSE2 ++ * _mm_max_epu[16,32] SSE4.1 ++ * _mm_max_epu64 AVX512VL + AVX512F ++ * _mm_max_pd SSE2 ++ * _mm_max_ps SSE ++ * _mm_min_epi8 SSE4.1 ++ * _mm_min_epi16 SSE2 ++ * _mm_min_epi32 SSE4.1 ++ * _mm_min_epi64 AVX512VL + AVX512F ++ * _mm_min_epu8 SSE2 ++ * _mm_min_epu[16,32] SSE4.1 ++ * _mm_min_epu64 AVX512VL + AVX512F ++ * _mm_min_pd SSE2 ++ * _mm_min_ps SSE ++ * _mm_mul_pd SSE2 ++ * _mm_mul_ps SSE ++ * _mm_mullo_epi16 SSE2 ++ * _mm_mullo_epi32 SSE4.1 ++ * _mm_mullo_epi64 AVX512VL + AVX512DQ ++ * _mm_or_si128 SSE2 ++ * _mm_storeu_pd SSE2 ++ * _mm_storeu_ps SSE ++ * _mm_storeu_si128 SSE2 ++ * _mm_xor_si128 SSE2 ++ * _mm256_add_epi[8,16,32,64] AVX2 ++ * _mm256_add_p[s,d] AVX ++ * _mm256_adds_epi[8,16] AVX2 ++ * _mm256_adds_epu[8,16] AVX2 ++ * _mm256_and_si256 AVX2 ++ * _mm256_loadu_p[s,d] AVX ++ * _mm256_loadu_si256 AVX ++ * _mm256_max_epi[8,16,32] AVX2 ++ * _mm256_max_epi64 AVX512VL + AVX512F ++ * _mm256_max_epu[8,16,32] AVX2 ++ * _mm256_max_epu64 AVX512VL + AVX512F ++ * _mm256_max_p[s,d] AVX ++ * _mm256_min_epi[8,16,32] AVX2 ++ * _mm256_min_epi64 AVX512VL + AVX512F ++ * _mm256_min_epu[8,16,32] AVX2 ++ * _mm256_min_epu64 AVX512VL + AVX512F ++ * _mm256_min_p[s,d] AVX ++ * _mm256_mul_p[s,d] AVX ++ * _mm256_mullo_epi[16,32] AVX2 ++ * _mm256_mullo_epi64 AVX512VL + AVX512DQ ++ * _mm256_or_si256 AVX2 ++ * _mm256_storeu_p[s,d] AVX ++ * _mm256_storeu_si256 AVX ++ * _mm256_xor_si256 AVX2 ++ * _mm512_add_epi[8,16] AVX512BW ++ * _mm512_add_epi[32,64] AVX512F ++ * _mm512_add_p[s,d] AVX512F ++ * _mm512_adds_epi[8,16] AVX512BW ++ * _mm512_adds_epu[8,16] AVX512BW ++ * _mm512_and_si512 AVX512F ++ * _mm512_cvtepi16_epi8 AVX512BW ++ * _mm512_cvtepi8_epi16 AVX512BW ++ * _mm512_loadu_p[s,d] AVX512F ++ * _mm512_loadu_si512 AVX512F ++ * _mm512_max_epi[8,16] AVX512BW ++ * _mm512_max_epi[32,64] AVX512F ++ * _mm512_max_epu[8,16] AVX512BW ++ * _mm512_max_epu[32,64] AVX512F ++ * _mm512_max_p[s,d] AVX512F ++ * _mm512_min_epi[8,16] AVX512BW ++ * _mm512_min_epi[32,64] AVX512F ++ * _mm512_min_epu[8,16] AVX512BW ++ * _mm512_min_epu[32,64] AVX512F ++ * _mm512_min_p[s,d] AVX512F ++ * _mm512_mul_p[s,d] AVX512F ++ * _mm512_mullo_epi16 AVX512BW ++ * _mm512_mullo_epi32 AVX512F ++ * _mm512_mullo_epi64 AVX512DQ ++ * _mm512_or_si512 AVX512F ++ * _mm512_storeu_p[s,d] AVX512F ++ * _mm512_storeu_si512 AVX512F ++ * _mm512_xor_si512 AVX512F ++ */ ++ + /* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation +@@ -62,13 +184,14 @@ + (((_flag) & mca_op_avx_component.flags) == (_flag)) + + #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) ++#if __AVX512F__ + #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ + int types_per_step = (512 / 8) / sizeof(type); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ +- __m512i vecA = _mm512_loadu_si512((__m512*)in); \ ++ __m512i vecA = _mm512_loadu_si512((__m512*)in); \ + in += types_per_step; \ +- __m512i vecB = _mm512_loadu_si512((__m512*)out); \ ++ __m512i vecB = _mm512_loadu_si512((__m512*)out); \ + __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm512_storeu_si512((__m512*)out, res); \ + out += types_per_step; \ +@@ -76,10 +199,14 @@ + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512 ++#endif /* __AVX512F__ */ ++#else + #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + + #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) ++#if __AVX__ + #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + int types_per_step = (256 / 8) / sizeof(type); /* AVX2 */ \ +@@ -87,30 +214,37 @@ + __m256i vecA = _mm256_loadu_si256((__m256i*)in); \ + in += types_per_step; \ + __m256i vecB = _mm256_loadu_si256((__m256i*)out); \ +- __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ ++ __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm256_storeu_si256((__m256i*)out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256 ++#endif /* __AVX__ */ ++#else + #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + + #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) ++#if __SSE3__ + #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \ +- int types_per_step = (128 / 8) / sizeof(type); /* AVX */ \ ++ int types_per_step = (128 / 8) / sizeof(type); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ + __m128i vecA = _mm_lddqu_si128((__m128i*)in); \ + in += types_per_step; \ + __m128i vecB = _mm_lddqu_si128((__m128i*)out); \ +- __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \ ++ __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm_storeu_si128((__m128i*)out, res); \ + out += types_per_step; \ + } \ + } + #else ++#error Target architecture lacks SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128 ++#endif /* __SSE3__ */ ++#else + #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +@@ -143,12 +277,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in + } + + #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) ++#if __AVX512BW__ && __AVX__ + #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ + int types_per_step = (256 / 8) / sizeof(type); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ +- __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in); \ +- __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out); \ ++ __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in); \ ++ __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out); \ + in += types_per_step; \ + __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \ + __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \ +@@ -160,6 +295,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16 ++#endif /* __AVX512BW__ && __AVX__ */ ++#else + #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + /** +@@ -201,13 +339,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_ + * + */ + #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) ++#if __AVX512F__ + #define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS( OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ + types_per_step = (512 / 8) / sizeof(type); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ +- __m512i vecA = _mm512_loadu_si512((__m512i*)in); \ ++ __m512i vecA = _mm512_loadu_si512((__m512i*)in); \ + in += types_per_step; \ +- __m512i vecB = _mm512_loadu_si512((__m512i*)out); \ ++ __m512i vecB = _mm512_loadu_si512((__m512i*)out); \ + __m512i res = _mm512_##op##_si512(vecA, vecB); \ + _mm512_storeu_si512((__m512i*)out, res); \ + out += types_per_step; \ +@@ -215,10 +354,14 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_ + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512 ++#endif /* __AVX512F__ */ ++#else + #define OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + + #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) ++#if __AVX__ + #define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + types_per_step = (256 / 8) / sizeof(type); \ +@@ -226,17 +369,21 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_ + __m256i vecA = _mm256_loadu_si256((__m256i*)in); \ + in += types_per_step; \ + __m256i vecB = _mm256_loadu_si256((__m256i*)out); \ +- __m256i res = _mm256_##op##_si256(vecA, vecB); \ ++ __m256i res = _mm256_##op##_si256(vecA, vecB); \ + _mm256_storeu_si256((__m256i*)out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256 ++#endif /* __AVX__ */ ++#else + #define OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + + #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) ++#if __SSE3__ && __SSE2__ + #define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) { \ + types_per_step = (128 / 8) / sizeof(type); \ +@@ -244,12 +391,15 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_ + __m128i vecA = _mm_lddqu_si128((__m128i*)in); \ + in += types_per_step; \ + __m128i vecB = _mm_lddqu_si128((__m128i*)out); \ +- __m128i res = _mm_##op##_si128(vecA, vecB); \ ++ __m128i res = _mm_##op##_si128(vecA, vecB); \ + _mm_storeu_si128((__m128i*)out, res); \ + out += types_per_step; \ + } \ + } + #else ++#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128 ++#endif /* __SSE3__ && __SSE2__ */ ++#else + #define OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +@@ -282,12 +432,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in + } + + #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) ++#if __AVX512F__ + #define OP_AVX_AVX512_FLOAT_FUNC(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ + types_per_step = (512 / 8) / sizeof(float); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ +- __m512 vecA = _mm512_loadu_ps((__m512*)in); \ +- __m512 vecB = _mm512_loadu_ps((__m512*)out); \ ++ __m512 vecA = _mm512_loadu_ps((__m512*)in); \ ++ __m512 vecB = _mm512_loadu_ps((__m512*)out); \ + in += types_per_step; \ + __m512 res = _mm512_##op##_ps(vecA, vecB); \ + _mm512_storeu_ps((__m512*)out, res); \ +@@ -296,28 +447,36 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps ++#endif /* __AVX512F__ */ ++#else + #define OP_AVX_AVX512_FLOAT_FUNC(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + + #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) ++#if __AVX__ + #define OP_AVX_AVX_FLOAT_FUNC(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + types_per_step = (256 / 8) / sizeof(float); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ +- __m256 vecA = _mm256_loadu_ps(in); \ ++ __m256 vecA = _mm256_loadu_ps(in); \ + in += types_per_step; \ +- __m256 vecB = _mm256_loadu_ps(out); \ ++ __m256 vecB = _mm256_loadu_ps(out); \ + __m256 res = _mm256_##op##_ps(vecA, vecB); \ +- _mm256_storeu_ps(out, res); \ ++ _mm256_storeu_ps(out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps ++#endif /* __AVX__ */ ++#else + #define OP_AVX_AVX_FLOAT_FUNC(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + + #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) ++#if __SSE__ + #define OP_AVX_SSE_FLOAT_FUNC(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \ + types_per_step = (128 / 8) / sizeof(float); \ +@@ -331,6 +490,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in + } \ + } + #else ++#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps ++#endif /* __SSE__ */ ++#else + #define OP_AVX_SSE_FLOAT_FUNC(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +@@ -363,13 +525,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v + } + + #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) ++#if __AVX512F__ + #define OP_AVX_AVX512_DOUBLE_FUNC(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ + types_per_step = (512 / 8) / sizeof(double); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ +- __m512d vecA = _mm512_loadu_pd(in); \ ++ __m512d vecA = _mm512_loadu_pd(in); \ + in += types_per_step; \ +- __m512d vecB = _mm512_loadu_pd(out); \ ++ __m512d vecB = _mm512_loadu_pd(out); \ + __m512d res = _mm512_##op##_pd(vecA, vecB); \ + _mm512_storeu_pd((out), res); \ + out += types_per_step; \ +@@ -377,17 +540,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVXF512 support needed for _mm512_loadu_pd and _mm512_storeu_pd ++#endif /* __AVXF512__ */ ++#else + #define OP_AVX_AVX512_DOUBLE_FUNC(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + + #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) ++#if __AVX__ + #define OP_AVX_AVX_DOUBLE_FUNC(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + types_per_step = (256 / 8) / sizeof(double); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ +- __m256d vecA = _mm256_loadu_pd(in); \ ++ __m256d vecA = _mm256_loadu_pd(in); \ + in += types_per_step; \ +- __m256d vecB = _mm256_loadu_pd(out); \ ++ __m256d vecB = _mm256_loadu_pd(out); \ + __m256d res = _mm256_##op##_pd(vecA, vecB); \ + _mm256_storeu_pd(out, res); \ + out += types_per_step; \ +@@ -395,10 +562,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd ++#endif /* __AVX__ */ ++#else + #define OP_AVX_AVX_DOUBLE_FUNC(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + + #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) ++#if __SSE2__ + #define OP_AVX_SSE2_DOUBLE_FUNC(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \ + types_per_step = (128 / 8) / sizeof(double); \ +@@ -412,6 +583,9 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v + } \ + } + #else ++#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd ++#endif /* __SSE2__ */ ++#else + #define OP_AVX_SSE2_DOUBLE_FUNC(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +@@ -580,12 +754,13 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, + * routines, needed for some optimizations. + */ + #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) ++#if __AVX512F__ + #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ + int types_per_step = (512 / 8) / sizeof(type); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ +- __m512i vecA = _mm512_loadu_si512(in1); \ +- __m512i vecB = _mm512_loadu_si512(in2); \ ++ __m512i vecA = _mm512_loadu_si512(in1); \ ++ __m512i vecB = _mm512_loadu_si512(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \ +@@ -595,10 +770,14 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512 ++#endif /* __AVX512F__ */ ++#else + #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + + #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) ++#if __AVX__ + #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + int types_per_step = (256 / 8) / sizeof(type); \ +@@ -607,17 +786,21 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, + __m256i vecB = _mm256_loadu_si256((__m256i*)in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ +- __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ ++ __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm256_storeu_si256((__m256i*)out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256 ++#endif /* __AVX__ */ ++#else + #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + + #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_SSE41) && (1 == OMPI_MCA_OP_HAVE_SSE41) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) ++#if __SSE3__ && __SSE2__ + #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \ + int types_per_step = (128 / 8) / sizeof(type); \ +@@ -626,12 +809,15 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, + __m128i vecB = _mm_lddqu_si128((__m128i*)in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ +- __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \ ++ __m128i res = _mm_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm_storeu_si128((__m128i*)out, res); \ + out += types_per_step; \ + } \ + } + #else ++#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128 ++#endif /* __SSE3__ && __SSE2__ */ ++#else + #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +@@ -667,12 +853,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re + } + + #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) ++#if __AVX512BW__ && __AVX__ + #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ + int types_per_step = (256 / 8) / sizeof(type); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ +- __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1); \ +- __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2); \ ++ __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1); \ ++ __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \ +@@ -685,6 +872,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX512BW and AVX support needed for _mm256_loadu_si256, _mm256_storeu_si256 and _mm512_cvtepi8_epi16 ++#endif /* __AVX512BW__ && __AVX__ */ ++#else + #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + /** +@@ -723,12 +913,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re + } + + #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) ++#if __AVX512F__ + #define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ + types_per_step = (512 / 8) / sizeof(type); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ +- __m512i vecA = _mm512_loadu_si512(in1); \ +- __m512i vecB = _mm512_loadu_si512(in2); \ ++ __m512i vecA = _mm512_loadu_si512(in1); \ ++ __m512i vecB = _mm512_loadu_si512(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i res = _mm512_##op##_si512(vecA, vecB); \ +@@ -738,10 +929,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX512F support needed for _mm512_loadu_si512 and _mm512_storeu_si512 ++#endif /* __AVX512F__ */ ++#else + #define OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + + #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) ++#if __AVX__ + #define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + types_per_step = (256 / 8) / sizeof(type); \ +@@ -750,17 +945,21 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re + __m256i vecB = _mm256_loadu_si256((__m256i*)in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ +- __m256i res = _mm256_##op##_si256(vecA, vecB); \ ++ __m256i res = _mm256_##op##_si256(vecA, vecB); \ + _mm256_storeu_si256((__m256i*)out, res); \ + out += types_per_step; \ + } \ + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX support needed for _mm256_loadu_si256 and _mm256_storeu_si256 ++#endif /* __AVX__ */ ++#else + #define OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + + #if defined(GENERATE_SSE3_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) ++#if __SSE3__ && __SSE2__ + #define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG) ) { \ + types_per_step = (128 / 8) / sizeof(type); \ +@@ -769,12 +968,15 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re + __m128i vecB = _mm_lddqu_si128((__m128i*)in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ +- __m128i res = _mm_##op##_si128(vecA, vecB); \ ++ __m128i res = _mm_##op##_si128(vecA, vecB); \ + _mm_storeu_si128((__m128i*)out, res); \ + out += types_per_step; \ + } \ + } + #else ++#error Target architecture lacks SSE2 and SSE3 support needed for _mm_lddqu_si128 and _mm_storeu_si128 ++#endif /* __SSE3__ && __SSE2__ */ ++#else + #define OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +@@ -809,12 +1011,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, + } + + #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) ++#if __AVX512F__ + #define OP_AVX_AVX512_FLOAT_FUNC_3(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ + types_per_step = (512 / 8) / sizeof(float); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ +- __m512 vecA = _mm512_loadu_ps(in1); \ +- __m512 vecB = _mm512_loadu_ps(in2); \ ++ __m512 vecA = _mm512_loadu_ps(in1); \ ++ __m512 vecB = _mm512_loadu_ps(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512 res = _mm512_##op##_ps(vecA, vecB); \ +@@ -824,16 +1027,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX512F support needed for _mm512_loadu_ps and _mm512_storeu_ps ++#endif /* __AVX512F__ */ ++#else + #define OP_AVX_AVX512_FLOAT_FUNC_3(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + + #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) ++#if __AVX__ + #define OP_AVX_AVX_FLOAT_FUNC_3(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + types_per_step = (256 / 8) / sizeof(float); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ +- __m256 vecA = _mm256_loadu_ps(in1); \ +- __m256 vecB = _mm256_loadu_ps(in2); \ ++ __m256 vecA = _mm256_loadu_ps(in1); \ ++ __m256 vecB = _mm256_loadu_ps(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m256 res = _mm256_##op##_ps(vecA, vecB); \ +@@ -843,10 +1050,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX support needed for _mm256_loadu_ps and _mm256_storeu_ps ++#endif /* __AVX__ */ ++#else + #define OP_AVX_AVX_FLOAT_FUNC_3(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + + #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) ++#if __SSE__ + #define OP_AVX_SSE_FLOAT_FUNC_3(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \ + types_per_step = (128 / 8) / sizeof(float); \ +@@ -861,6 +1072,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, + } \ + } + #else ++#error Target architecture lacks SSE support needed for _mm_loadu_ps and _mm_storeu_ps ++#endif /* __SSE__ */ ++#else + #define OP_AVX_SSE_FLOAT_FUNC_3(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + +@@ -895,12 +1109,13 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, + } + + #if defined(GENERATE_AVX512_CODE) && defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) ++#if __AVX512F__ + #define OP_AVX_AVX512_DOUBLE_FUNC_3(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \ + types_per_step = (512 / 8) / sizeof(double); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ +- __m512d vecA = _mm512_loadu_pd((in1)); \ +- __m512d vecB = _mm512_loadu_pd((in2)); \ ++ __m512d vecA = _mm512_loadu_pd((in1)); \ ++ __m512d vecB = _mm512_loadu_pd((in2)); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512d res = _mm512_##op##_pd(vecA, vecB); \ +@@ -910,16 +1125,20 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVXF512 support needed for _mm512_loadu_pd and _mm512_storeu_pd ++#endif /* __AVXF512__ */ ++#else + #define OP_AVX_AVX512_DOUBLE_FUNC_3(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX512) && (1 == OMPI_MCA_OP_HAVE_AVX512) */ + + #if defined(GENERATE_AVX2_CODE) && defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) ++#if __AVX__ + #define OP_AVX_AVX_DOUBLE_FUNC_3(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ + types_per_step = (256 / 8) / sizeof(double); \ + for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ +- __m256d vecA = _mm256_loadu_pd(in1); \ +- __m256d vecB = _mm256_loadu_pd(in2); \ ++ __m256d vecA = _mm256_loadu_pd(in1); \ ++ __m256d vecB = _mm256_loadu_pd(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m256d res = _mm256_##op##_pd(vecA, vecB); \ +@@ -929,10 +1148,14 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, + if( 0 == left_over ) return; \ + } + #else ++#error Target architecture lacks AVX support needed for _mm256_loadu_pd and _mm256_storeu_pd ++#endif /* __AVX__ */ ++#else + #define OP_AVX_AVX_DOUBLE_FUNC_3(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX2) && (1 == OMPI_MCA_OP_HAVE_AVX2) */ + + #if defined(GENERATE_AVX_CODE) && defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) ++#if __SSE2__ + #define OP_AVX_SSE2_DOUBLE_FUNC_3(op) \ + if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \ + types_per_step = (128 / 8) / sizeof(double); \ +@@ -947,6 +1170,9 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, + } \ + } + #else ++#error Target architecture lacks SSE2 support needed for _mm_loadu_pd and _mm_storeu_pd ++#endif /* __SSE2__ */ ++#else + #define OP_AVX_SSE2_DOUBLE_FUNC_3(op) {} + #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ + + +From ac6f658a4127bbfd916f26946330d9906dcbae54 Mon Sep 17 00:00:00 2001 +From: George Bosilca +Date: Tue, 5 Jan 2021 22:40:26 -0500 +Subject: [PATCH 3/3] A better test for MPI_OP performance. + +The test now has the ability to add a shift to all or to any of the +input and output buffers to assess the impact of unaligned operations. + +Signed-off-by: George Bosilca +--- + test/datatype/reduce_local.c | 161 ++++++++++++++++++++++------------- + 1 file changed, 104 insertions(+), 57 deletions(-) + +diff --git test/datatype/reduce_local.c test/datatype/reduce_local.c +index 97890f94227..f227439b714 100644 +--- test/datatype/reduce_local.c ++++ test/datatype/reduce_local.c +@@ -59,7 +59,7 @@ static int total_errors = 0; + _a < _b ? _a : _b; }) + + static void print_status(char* op, char* type, int type_size, +- int count, double duration, ++ int count, int max_shift, double *duration, int repeats, + int correct ) + { + if(correct) { +@@ -68,7 +68,15 @@ static void print_status(char* op, char* type, int type_size, + printf("%-10s %s [\033[1;31mfail\033[0m]", op, type); + total_errors++; + } +- printf(" count %-10d time %.6f seconds\n", count, duration); ++ if( 1 == max_shift ) { ++ printf(" count %-10d time (seconds) %.8f seconds\n", count, duration[0] / repeats); ++ } else { ++ printf(" count %-10d time (seconds / shifts) ", count); ++ for( int i = 0; i < max_shift; i++ ) { ++ printf("%.8f ", duration[i] / repeats ); ++ } ++ printf("\n"); ++ } + } + + static int do_ops_built = 0; +@@ -115,19 +123,23 @@ do { \ + const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \ + TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \ + skip_op_type = 0; \ +- for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \ +- memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \ +- tstart = MPI_Wtime(); \ +- MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \ +- tend = MPI_Wtime(); \ +- if( check ) { \ +- for( i = 0; i < (COUNT)-_k; i++ ) { \ +- if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \ +- continue; \ +- printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \ +- _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \ +- correctness = 0; \ +- break; \ ++ for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \ ++ duration[_k] = 0.0; \ ++ for(int _r = repeats; _r > 0; _r--) { \ ++ memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \ ++ tstart = MPI_Wtime(); \ ++ MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \ ++ tend = MPI_Wtime(); \ ++ duration[_k] += (tend - tstart); \ ++ if( check ) { \ ++ for( i = 0; i < (COUNT)-_k; i++ ) { \ ++ if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \ ++ continue; \ ++ printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \ ++ _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \ ++ correctness = 0; \ ++ break; \ ++ } \ + } \ + } \ + } \ +@@ -139,20 +151,24 @@ do { \ + const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \ + TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \ + skip_op_type = 0; \ +- for(int _k = 0; _k < min((COUNT), 4); +_k++ ) { \ +- memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \ +- tstart = MPI_Wtime(); \ +- MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \ +- tend = MPI_Wtime(); \ +- if( check ) { \ +- for( i = 0; i < (COUNT); i++ ) { \ +- TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \ +- if(_v2 == OPNAME(_v1, _v3)) \ +- continue; \ +- printf("First error at alignment %d position %d (%" TYPE_PREFIX " != %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \ +- _k, i, _v1, (#OPNAME), _v3, _v2); \ +- correctness = 0; \ +- break; \ ++ for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \ ++ duration[_k] = 0.0; \ ++ for(int _r = repeats; _r > 0; _r--) { \ ++ memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \ ++ tstart = MPI_Wtime(); \ ++ MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT), (MPITYPE), (MPIOP)); \ ++ tend = MPI_Wtime(); \ ++ duration[_k] += (tend - tstart); \ ++ if( check ) { \ ++ for( i = 0; i < (COUNT); i++ ) { \ ++ TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \ ++ if(_v2 == OPNAME(_v1, _v3)) \ ++ continue; \ ++ printf("First error at alignment %d position %d (%" TYPE_PREFIX " != %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \ ++ _k, i, _v1, (#OPNAME), _v3, _v2); \ ++ correctness = 0; \ ++ break; \ ++ } \ + } \ + } \ + } \ +@@ -163,24 +179,36 @@ int main(int argc, char **argv) + { + static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL; + int count, type_size = 8, rank, size, provided, correctness = 1; +- int repeats = 1, i, c; +- double tstart, tend; ++ int repeats = 1, i, c, op1_alignment = 0, res_alignment = 0; ++ int max_shift = 4; ++ double *duration, tstart, tend; + bool check = true; + char type[5] = "uifd", *op = "sum", *mpi_type; + int lower = 1, upper = 1000000, skip_op_type; + MPI_Op mpi_op; + +- while( -1 != (c = getopt(argc, argv, "l:u:t:o:s:n:vfh")) ) { ++ while( -1 != (c = getopt(argc, argv, "l:u:r:t:o:i:s:n:1:2:vfh")) ) { + switch(c) { + case 'l': + lower = atoi(optarg); + if( lower <= 0 ) { +- fprintf(stderr, "The number of elements must be positive\n"); ++ fprintf(stderr, "The lower number of elements must be positive\n"); + exit(-1); + } + break; + case 'u': + upper = atoi(optarg); ++ if( lower <= 0 ) { ++ fprintf(stderr, "The upper number of elements must be positive\n"); ++ exit(-1); ++ } ++ break; ++ case 'i': ++ max_shift = atoi(optarg); ++ if( max_shift <= 0 ) { ++ fprintf(stderr, "The max shift must be positive\n"); ++ exit(-1); ++ } + break; + case 'f': + check = false; +@@ -216,14 +244,32 @@ int main(int argc, char **argv) + exit(-1); + } + break; ++ case '1': ++ op1_alignment = atoi(optarg); ++ if( op1_alignment < 0 ) { ++ fprintf(stderr, "alignment for the first operand must be positive\n"); ++ exit(-1); ++ } ++ break; ++ case '2': ++ res_alignment = atoi(optarg); ++ if( res_alignment < 0 ) { ++ fprintf(stderr, "alignment for the result must be positive\n"); ++ exit(-1); ++ } ++ break; + case 'h': + fprintf(stdout, "%s options are:\n" + " -l : lower number of elements\n" + " -u : upper number of elements\n" + " -s : 8, 16, 32 or 64 bits elements\n" + " -t [i,u,f,d] : type of the elements to apply the operations on\n" ++ " -r : number of repetitions for each test\n" + " -o : comma separated list of operations to execute among\n" + " sum, min, max, prod, bor, bxor, band\n" ++ " -i : shift on all buffers to check alignment\n" ++ " -1 : (mis)alignment in elements for the first op\n" ++ " -2 : (mis)alignment in elements for the result\n" + " -v: increase the verbosity level\n" + " -h: this help message\n", argv[0]); + exit(0); +@@ -233,9 +279,10 @@ int main(int argc, char **argv) + if( !do_ops_built ) { /* not yet done, take the default */ + build_do_ops( "all", do_ops); + } +- in_buf = malloc(upper * sizeof(double)); +- inout_buf = malloc(upper * sizeof(double)); +- inout_check_buf = malloc(upper * sizeof(double)); ++ posix_memalign( &in_buf, 64, (upper + op1_alignment) * sizeof(double)); ++ posix_memalign( &inout_buf, 64, (upper + res_alignment) * sizeof(double)); ++ posix_memalign( &inout_check_buf, 64, upper * sizeof(double)); ++ duration = (double*)malloc(max_shift * sizeof(double)); + + ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false); + +@@ -253,8 +300,8 @@ int main(int argc, char **argv) + correctness = 1; + if('i' == type[type_idx]) { + if( 8 == type_size ) { +- int8_t *in_int8 = (int8_t*)in_buf, +- *inout_int8 = (int8_t*)inout_buf, ++ int8_t *in_int8 = (int8_t*)((char*)in_buf + op1_alignment * sizeof(int8_t)), ++ *inout_int8 = (int8_t*)((char*)inout_buf + res_alignment * sizeof(int8_t)), + *inout_int8_for_check = (int8_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int8[i] = 5; +@@ -299,8 +346,8 @@ int main(int argc, char **argv) + } + } + if( 16 == type_size ) { +- int16_t *in_int16 = (int16_t*)in_buf, +- *inout_int16 = (int16_t*)inout_buf, ++ int16_t *in_int16 = (int16_t*)((char*)in_buf + op1_alignment * sizeof(int16_t)), ++ *inout_int16 = (int16_t*)((char*)inout_buf + res_alignment * sizeof(int16_t)), + *inout_int16_for_check = (int16_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int16[i] = 5; +@@ -345,8 +392,8 @@ int main(int argc, char **argv) + } + } + if( 32 == type_size ) { +- int32_t *in_int32 = (int32_t*)in_buf, +- *inout_int32 = (int32_t*)inout_buf, ++ int32_t *in_int32 = (int32_t*)((char*)in_buf + op1_alignment * sizeof(int32_t)), ++ *inout_int32 = (int32_t*)((char*)inout_buf + res_alignment * sizeof(int32_t)), + *inout_int32_for_check = (int32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int32[i] = 5; +@@ -391,8 +438,8 @@ int main(int argc, char **argv) + } + } + if( 64 == type_size ) { +- int64_t *in_int64 = (int64_t*)in_buf, +- *inout_int64 = (int64_t*)inout_buf, ++ int64_t *in_int64 = (int64_t*)((char*)in_buf + op1_alignment * sizeof(int64_t)), ++ *inout_int64 = (int64_t*)((char*)inout_buf + res_alignment * sizeof(int64_t)), + *inout_int64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int64[i] = 5; +@@ -440,8 +487,8 @@ int main(int argc, char **argv) + + if( 'u' == type[type_idx] ) { + if( 8 == type_size ) { +- uint8_t *in_uint8 = (uint8_t*)in_buf, +- *inout_uint8 = (uint8_t*)inout_buf, ++ uint8_t *in_uint8 = (uint8_t*)((char*)in_buf + op1_alignment * sizeof(uint8_t)), ++ *inout_uint8 = (uint8_t*)((char*)inout_buf + res_alignment * sizeof(uint8_t)), + *inout_uint8_for_check = (uint8_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint8[i] = 5; +@@ -486,8 +533,8 @@ int main(int argc, char **argv) + } + } + if( 16 == type_size ) { +- uint16_t *in_uint16 = (uint16_t*)in_buf, +- *inout_uint16 = (uint16_t*)inout_buf, ++ uint16_t *in_uint16 = (uint16_t*)((char*)in_buf + op1_alignment * sizeof(uint16_t)), ++ *inout_uint16 = (uint16_t*)((char*)inout_buf + res_alignment * sizeof(uint16_t)), + *inout_uint16_for_check = (uint16_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint16[i] = 5; +@@ -532,8 +579,8 @@ int main(int argc, char **argv) + } + } + if( 32 == type_size ) { +- uint32_t *in_uint32 = (uint32_t*)in_buf, +- *inout_uint32 = (uint32_t*)inout_buf, ++ uint32_t *in_uint32 = (uint32_t*)((char*)in_buf + op1_alignment * sizeof(uint32_t)), ++ *inout_uint32 = (uint32_t*)((char*)inout_buf + res_alignment * sizeof(uint32_t)), + *inout_uint32_for_check = (uint32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint32[i] = 5; +@@ -578,8 +625,8 @@ int main(int argc, char **argv) + } + } + if( 64 == type_size ) { +- uint64_t *in_uint64 = (uint64_t*)in_buf, +- *inout_uint64 = (uint64_t*)inout_buf, ++ uint64_t *in_uint64 = (uint64_t*)((char*)in_buf + op1_alignment * sizeof(uint64_t)), ++ *inout_uint64 = (uint64_t*)((char*)inout_buf + res_alignment * sizeof(uint64_t)), + *inout_uint64_for_check = (uint64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint64[i] = 5; +@@ -626,8 +673,8 @@ int main(int argc, char **argv) + } + + if( 'f' == type[type_idx] ) { +- float *in_float = (float*)in_buf, +- *inout_float = (float*)inout_buf, ++ float *in_float = (float*)((char*)in_buf + op1_alignment * sizeof(float)), ++ *inout_float = (float*)((char*)inout_buf + res_alignment * sizeof(float)), + *inout_float_for_check = (float*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_float[i] = 1000.0+1; +@@ -658,8 +705,8 @@ int main(int argc, char **argv) + } + + if( 'd' == type[type_idx] ) { +- double *in_double = (double*)in_buf, +- *inout_double = (double*)inout_buf, ++ double *in_double = (double*)((char*)in_buf + op1_alignment * sizeof(double)), ++ *inout_double = (double*)((char*)inout_buf + res_alignment * sizeof(double)), + *inout_double_for_check = (double*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_double[i] = 10.0+1; +@@ -691,7 +738,7 @@ int main(int argc, char **argv) + check_and_continue: + if( !skip_op_type ) + print_status(array_of_ops[do_ops[op_idx]].mpi_op_name, +- mpi_type, type_size, count, tend-tstart, correctness); ++ mpi_type, type_size, count, max_shift, duration, repeats, correctness); + } + if( !skip_op_type ) + printf("\n"); diff --git a/srcpkgs/openmpi/template b/srcpkgs/openmpi/template index 5f962b4fd38..c0eeb69b1e3 100644 --- a/srcpkgs/openmpi/template +++ b/srcpkgs/openmpi/template @@ -1,12 +1,11 @@ # Template file for 'openmpi' pkgname=openmpi version=4.1.0 -revision=2 +revision=3 build_style=gnu-configure configure_args="--enable-ipv6 --with-hwloc=${XBPS_CROSS_BASE}/usr" -hostmakedepends="gcc-fortran libgomp-devel perl pkg-config" -makedepends="libgfortran-devel libgomp-devel libhwloc-devel zlib-devel - libevent-devel" +hostmakedepends="gcc-fortran libgomp-devel perl pkg-config automake libtool" +makedepends="libgfortran-devel libgomp-devel libhwloc-devel zlib-devel libevent-devel" conf_files=" /etc/openmpi-default-hostfile /etc/openmpi-mca-params.conf @@ -26,6 +25,8 @@ post_extract() { } pre_configure() { + ./autogen.pl --force + # used wrongly upstream? but nocross anyway export CFLAGS="$CFLAGS $LDFLAGS" export CXXFLAGS="$CXXFLAGS $LDFLAGS"