diff --git a/.appveyor.yml b/.appveyor.yml index f4f56fa159..cafad4817f 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -23,6 +23,12 @@ environment: CC: clang THREADING: openmp + - LIB_TYPE: static + CONFIG: auto + CC: clang + THREADING: openmp + SANDBOX: yes + install: - set "PATH=C:\msys64\mingw64\bin;C:\msys64\bin;%PATH%" - if [%CC%]==[clang] set "PATH=C:\Program Files\LLVM\bin;%PATH%" @@ -34,6 +40,7 @@ build_script: - if [%LIB_TYPE%]==[shared] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% --enable-shared --disable-static" - if [%LIB_TYPE%]==[static] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% --disable-shared --enable-static" - if not [%CBLAS%]==[no] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% --enable-cblas" +- if [%SANDBOX%]==[yes] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% -s gemmlike" - set RANLIB=echo - set LIBPTHREAD= - set "PATH=%PATH%;C:\blis\lib" diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000000..ad2bf582ac --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,264 @@ +version: 2.1 + +branches: + only: + - master + - dev + - amd + +executors: + linux: # Docker using the Base Convenience Image + docker: + - image: cimg/base:2024.10 + linuxnew: # Docker using the Base Convenience Image + docker: + - image: cimg/base:current-22.04 + macos: &macos-executor # macos executor running Xcode + macos: + xcode: 14.2.0 + linuxvm: # executor type + machine: + image: ubuntu-2204:current + +workflows: + build: + jobs: + # Default: + # - build: + # os: linux + # CC: gcc + # OOT: 0 + # TEST: FAST + # SDE: 0 + # LEVEL0: 0 + # THR: none + # CONF: auto + # BLD: '' + # LDFLAGS: '' + # TESTSUITE_WRAPPER: '' + # PACKAGES: '' + + # full testsuite (all tests + mixed datatype (gemm_nn only) + salt + OOT) + - build: + OOT: 1 + TEST: ALL + THR: openmp,pthreads + CONF: x86_64 + + # SDE testing for x86_64 + # Also test LEVEL0 here because g++ uses tons of memory for test_taxpbys.cxx + - build: + # linuxvm must be used because it provides 8G RAM and SDE fails with 4G 
RAM + os: linuxvm + SDE: 1 + LEVEL0: 1 + CONF: x86_64 + + # test generic kernels + - build: + CONF: generic_broadcast + + # clang build + - build: + CC: clang + THR: openmp,pthreads + CXX: clang++ + PACKAGES: clang libomp-dev + + # macOS with system compiler (clang) + - build: + os: macos + THR: pthreads + CC: clang + CXX: clang++ + + # cortexa15 build and fast testsuite (qemu) + - build: + CC: arm-linux-gnueabihf-gcc + CXX: arm-linux-gnueabihf-g++ + CONF: cortexa15 + PACKAGES: 'gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-system-arm qemu-user' + TESTSUITE_WRAPPER: 'qemu-arm -cpu cortex-a15 -L /usr/arm-linux-gnueabihf/' + + # cortexa57 build and fast testsuite (qemu) + - build: + CC: aarch64-linux-gnu-gcc + CXX: aarch64-linux-gnu-g++ + CONF: cortexa57 + PACKAGES: 'gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user' + TESTSUITE_WRAPPER: 'qemu-aarch64 -L /usr/aarch64-linux-gnu/' + + # Apple M1 (firestorm) build and fast testsuite (qemu) + - build: + CC: aarch64-linux-gnu-gcc + CXX: aarch64-linux-gnu-g++ + CONF: firestorm + PACKAGES: 'gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user' + TESTSUITE_WRAPPER: 'qemu-aarch64 -L /usr/aarch64-linux-gnu/' + + # armsve build and fast testsuite (qemu) + - build: + CC: aarch64-linux-gnu-gcc-10 + CXX: aarch64-linux-gnu-g++-10 + CONF: armsve + PACKAGES: 'gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user' + TESTSUITE_WRAPPER: 'qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/' + + # arm64 build and fast testsuite (qemu) + # NOTE: This entry omits the -cpu flag so that while both NEON and SVE kernels + # are compiled, only NEON kernels will be tested. 
(h/t to RuQing Xu) + - build: + CC: aarch64-linux-gnu-gcc-10 + CXX: aarch64-linux-gnu-g++-10 + CONF: arm64 + PACKAGES: 'gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user' + TESTSUITE_WRAPPER: 'qemu-aarch64 -L /usr/aarch64-linux-gnu/' + + # The RISC-V targets require the qemu version available in jammy or newer. + # When CI is upgraded, the packages should be activated and do_script.sh + # cleaned up. + # PACKAGES="qemu-user qemu-user-binfmt" + - build: + CONF: rv64iv + BLD: --disable-shared + LDFLAGS: -static + - build: + CONF: rv32iv + BLD: --disable-shared + LDFLAGS: -static + - build: + CONF: sifive_x280 + BLD: --disable-shared + LDFLAGS: -static + +jobs: + build: + parameters: + os: + type: executor + default: linux + CC: + type: string + default: gcc + CXX: + type: string + default: g++ + OOT: + type: integer + default: 0 + TEST: + type: string + default: FAST + SDE: + type: integer + default: 0 + LEVEL0: + type: integer + default: 0 + THR: + type: string + default: none + CONF: + type: string + default: auto + BLD: + type: string + default: '' + LDFLAGS: + type: string + default: '' + TESTSUITE_WRAPPER: + type: string + default: '' + PACKAGES: + type: string + default: '' + executor: << parameters.os >> + steps: + - checkout + + - when: + condition: + not: + equal: [ *macos-executor, << parameters.os >> ] + steps: + - run: + name: Installing Dependencies + command: + sudo apt-get update && sudo NEEDRESTART_MODE=a apt-get install -y make python3 << parameters.PACKAGES >> + + - run: + name: Configuring, Building, Testing + command: | + export DIST_PATH=. 
+ export CC="<< parameters.CC >>" + export CXX="<< parameters.CXX >>" + export OOT="<< parameters.OOT >>" + export CONF="<< parameters.CONF >>" + export TEST="<< parameters.TEST >>" + export BLD="<< parameters.BLD >>" + export LDFLAGS="<< parameters.LDFLAGS >>" + export SDE="<< parameters.SDE >>" + export LEVEL0="<< parameters.LEVEL0 >>" + export THR="<< parameters.THR >>" + export TESTSUITE_WRAPPER="<< parameters.TESTSUITE_WRAPPER >>" + + pwd + if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi + pwd + + if [ "$CONF" = "rv64iv" ]; then + $DIST_PATH/ci/do_riscv.sh "$CONF"; + export CC=$DIST_PATH/../toolchain/riscv/bin/riscv64-unknown-linux-gnu-gcc; + export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv64-unknown-linux-gnu-g++; + export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=128 -B 0x100000"; + fi + if [ "$CONF" = "rv32iv" ]; then + $DIST_PATH/ci/do_riscv.sh "$CONF"; + export CC=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-gcc; + export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-g++; + export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv32 -cpu rv32,vext_spec=v1.0,v=true,vlen=128 -B 0x100000"; + fi + if [ "$CONF" = "sifive_x280" ]; then + $DIST_PATH/ci/do_riscv.sh "$CONF"; + export CC=$DIST_PATH/../toolchain/riscv/bin/clang; + export CXX=$DIST_PATH/../toolchain/riscv/bin/clang++; + export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=512 -B 0x100000"; + fi + + if [ "$CONF" = "generic_broadcast" ]; then + export CONF=generic + export CFLAGS="-DBLIS_BBM_s=2 -DBLIS_BBM_d=2 -DBLIS_BBM_c=2 -DBLIS_BBM_z=2 -DBLIS_BBN_s=4 -DBLIS_BBN_d=4 -DBLIS_BBN_c=4 -DBLIS_BBN_z=4" + fi + + echo "Configuration:" + echo "CC = $CC" + echo "CXX = $CXX" + echo "OOT = $OOT" + echo "CONF = $CONF" + echo "THR = $THR" + echo "TEST = $TEST" + echo "BLD = $BLD" + echo "SDE = $SDE" + echo "LEVEL0 = $LEVEL0" + 
echo "DIST_PATH = $DIST_PATH" + echo "CFLAGS = $CFLAGS" + echo "LDFLAGS = $LDFLAGS" + echo "TESTSUITE_WRAPPER = $TESTSUITE_WRAPPER" + + $DIST_PATH/configure -p `pwd`/../install -t $THR $BLD CC=$CC $CONF + pwd + ls -l + $CC --version + $CC -v + + make V=1 -j2 + make install + + if [ "$BLD" = "" ] && [ "$TESTSUITE_WRAPPER" = "" ] ; then $DIST_PATH/ci/cxx/cxx-test.sh $DIST_PATH $(ls -1 include); fi + # Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed + # on real chip (A64fx). + if [ "$CONF" = "armsve" ]; then sed -i 's/.*\.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi + if [ "$TEST" != "0" ]; then $DIST_PATH/ci/do_testsuite.sh; fi + if [ "$SDE" = "1" ]; then $DIST_PATH/ci/do_sde.sh; fi + if [ "$LEVEL0" = "1" ]; then $DIST_PATH/ci/do_level0.sh; fi diff --git a/.dir-locals.el b/.dir-locals.el index fccb205020..c0dc5741b7 100644 --- a/.dir-locals.el +++ b/.dir-locals.el @@ -1,9 +1,32 @@ -;; First (minimal) attempt at configuring Emacs CC mode for the BLIS -;; layout requirements. -((c-mode . ((c-file-style . "stroustrup") +;; Emacs formatting for the BLIS layout requirements. + +( + ;; Recognize *.mk files as Makefile fragments + (auto-mode-alist . (("\\.mk\\'" . makefile-mode)) ) + + ;; Makefiles require tabs and are almost always width 8 + (makefile-mode . ( + (indent-tabs-mode . t) + (tab-width . 8) + ) + ) + + ;; C code formatting roughly according to docs/CodingConventions.md + (c-mode . ( + (c-file-style . "bsd") (c-basic-offset . 4) (comment-start . "// ") (comment-end . "") - (indent-tabs-mode . t) - (tab-width . 4) - (parens-require-spaces . nil)))) + (parens-require-spaces . nil) + ) + ) + + ;; Default formatting for all source files not overridden above + (prog-mode . ( + (indent-tabs-mode . nil) + (tab-width . 4) + (require-final-newline . 
t) + (eval add-hook `before-save-hook `delete-trailing-whitespace) + ) + ) +) diff --git a/.gitignore b/.gitignore index 49b22c2b8e..5255bcb739 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,7 @@ config.mk bli_config.h +bli_addon.h # -- monolithic headers -- @@ -43,6 +44,7 @@ include/*/*.h # -- misc. -- # BLIS testsuite output file +output.testsuite output.testsuite.* # BLAS test output files @@ -52,3 +54,6 @@ out.* GPATH GRTAGS GTAGS + +# Mac DS.store files +.DS_Store diff --git a/.travis.yml b/.travis.yml index 555e9a11a2..df955764f8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,22 +12,23 @@ matrix: - os: linux compiler: gcc env: OOT=1 TEST=ALL SDE=1 THR="none" CONF="x86_64" \ - PACKAGES="gcc-8 binutils" + PACKAGES="gcc-9 binutils" # openmp build - os: linux compiler: gcc env: OOT=0 TEST=FAST SDE=0 THR="openmp" CONF="auto" \ - PACKAGES="gcc-8 binutils" + PACKAGES="gcc-9 binutils" # pthreads build - os: linux compiler: gcc env: OOT=0 TEST=FAST SDE=0 THR="pthreads" CONF="auto" \ - PACKAGES="gcc-8 binutils" + PACKAGES="gcc-9 binutils" # clang build - os: linux compiler: clang env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="auto" - # There seems to be some difficulty installing 2 Clang toolchains of different versions. + # There seems to be some difficulty installing two Clang toolchains of + # different versions. # Use the TravisCI default. # PACKAGES="clang-8 binutils" # macOS with system compiler (clang) @@ -62,22 +63,70 @@ matrix: CC=aarch64-linux-gnu-gcc-10 CXX=aarch64-linux-gnu-g++-10 \ PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/" + # arm64 build and fast testsuite (qemu) + # NOTE: This entry omits the -cpu flag so that while both NEON and SVE kernels + # are compiled, only NEON kernels will be tested. 
(h/t to RuQing Xu) + - os: linux + compiler: aarch64-linux-gnu-gcc-10 + env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="arm64" \ + CC=aarch64-linux-gnu-gcc-10 CXX=aarch64-linux-gnu-g++-10 \ + PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ + TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/" + # The RISC-V targets require the qemu version available in jammy or newer. + # When CI is upgraded, the packages should be activated and do_script.sh + # cleaned up. + # PACKAGES="qemu-user qemu-user-binfmt" + - os: linux + compiler: riscv64-unknown-linux-gcc + env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="rv64iv" \ + CC=riscv64-unknown-linux-gnu-gcc \ + LDFLAGS=-static + - os: linux + compiler: riscv32-unknown-linux-gcc + env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="rv32iv" \ + CC=riscv32-unknown-linux-gnu-gcc \ + LDFLAGS=-static + - os: linux + compiler: clang + env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="sifive_x280" \ + CC=clang \ + LDFLAGS=-static install: -- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8"; fi +- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-9"; fi - if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi script: - export DIST_PATH=. 
- pwd - if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi - pwd -- $DIST_PATH/configure -p `pwd`/../install -t $THR CC=$CC $CONF +- if [ "$CONF" = "rv64iv" ]; then + $DIST_PATH/ci/do_riscv.sh "$CONF"; + export CC=$DIST_PATH/../toolchain/riscv/bin/riscv64-unknown-linux-gnu-gcc; + export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv64-unknown-linux-gnu-g++; + export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=128 -B 0x100000"; + fi +- if [ "$CONF" = "rv32iv" ]; then + $DIST_PATH/ci/do_riscv.sh "$CONF"; + export CC=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-gcc; + export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-g++; + export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv32 -cpu rv32,vext_spec=v1.0,v=true,vlen=128 -B 0x100000"; + fi +- if [ "$CONF" = "sifive_x280" ]; then + $DIST_PATH/ci/do_riscv.sh "$CONF"; + export CC=$DIST_PATH/../toolchain/riscv/bin/clang; + export CXX=$DIST_PATH/../toolchain/riscv/bin/clang++; + export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=512 -B 0x100000"; + fi +- $DIST_PATH/configure -p `pwd`/../install -t $THR $BLD CC=$CC $CONF - pwd - ls -l - $CC --version +- $CC -v - make -j 2 - make install -- $DIST_PATH/travis/cxx/cxx-test.sh $DIST_PATH $(ls -1 include) -# Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx). +- if [ "$BLD" = "" ]; then $DIST_PATH/ci/cxx/cxx-test.sh $DIST_PATH $(ls -1 include); fi +# Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed +# on real chip (A64fx). 
- if [ "$CONF" = "armsve" ]; then sed -i 's/.*\.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi -- if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi -- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi +- if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/ci/do_testsuite.sh; fi +- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/ci/do_sde.sh; fi diff --git a/CHANGELOG b/CHANGELOG index 13eaa52caa..76691e13d0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,10 +1,5898 @@ -commit 8535b3e11d2297854991c4272932ce4974dda629 (HEAD -> master, tag: 0.8.1) +commit c2af113c7ba6d0dcc128ba36ec6e140d89180cf3 (HEAD -> master) +Author: Field G. Van Zee +Date: Mon May 6 13:37:47 2024 -0500 + + Version file update (1.0) + +commit 5ab286f61525f8ead35ecc258305a5ccd4ee096b (origin/master, origin/HEAD) +Author: Field G. Van Zee +Date: Mon May 6 13:14:52 2024 -0500 + + Added a script to help create new rc branches. + + Details: + - Added a new script, build/start-new-rc.sh, which: + 1. Updates the version file with a new version string. + 2. Commits (locally) the version string update. + 3. Updates the CHANGELOG file with the output of 'git log'. + 4. Commits (locally) the CHANGELOG file update. + 5. Creates a new branch whose name is equal to "<version>-rc0" where + <version> is the new version string. + 6. Reminds the user to execute some final steps if everything looks + good. + This new script will help in the future when it's time to start a new + release candidate branch/lineage off of 'master'. Note that this + script is based on build/bump-version.sh (which itself may change in + the future due to changes in the way versions/releases will be handled + going forward). + +commit cad51491e8a0b306015a5a02881dc2a9b60dd8d9 +Author: Field G. Van Zee +Date: Tue Apr 30 16:46:54 2024 -0500 + + Use "-i auto" by default in test/3 drivers. 
+ + Details: + - Request default induced method behavior of BLIS via "-i auto" when + running the standalone performance drivers in test/3 via the runme.sh + script present in that directory. (Previously, the runme.sh script + would use "-i native" by default.) This change was originally intended + for fd1a7e3. + +commit fd1a7e3ca9547718aa61c806848099705216182b +Author: Field G. Van Zee +Date: Thu Apr 25 15:00:59 2024 -0500 + + Allow test/3 drivers to use default ind_t method. (#804) + + Details: + - Previously, the standalone performance drivers in test/3 were written + under the assumption that the user would want to explicitly test + either native execution *or* 1m. But because the accompanying runme.sh + script defaults to passing "native" in for the -i command line option + (which explicitly sets the induced method type), running the script + without modification causes the test drivers to use slow reference + microkernels on systems where native complex-domain microkernels are + not registered -- which will yield poor performance for complex-domain + level-3 operations. Furthermore, even if a user was aware of this, the + test drivers did not support any single value for the -i option that + would test BLIS using the library's default behavior -- that is, using + 1m on systems where it is needed and native execution on systems that + have native microkernels implemented and registered. + - This commit addresses the aforementioned issue by supporting a new + value for the -i option: "auto". The "auto" value causes the driver + to avoid explicitly setting the induced method altogether, leaving + BLIS's default behavior in place. This "auto" option is also now the + default setting within the runme.sh script. Thanks to Leick Robinson + for finding and reporting this issue. + - Also added support for "nat" as a shorthand for "native", which + the help text already (erroneously) claimed was supported. 
+ +commit a49238e6141c96a41aa3c2a4adb0b0663d0b4968 +Author: Devin Matthews +Date: Wed Apr 24 15:07:18 2024 -0500 + + Refactor the control tree and other infrastructure (#710) + + Details: + 1. A "plugin" architecture. + - Users are now able to register new kernels, kernel preferences, and + blocksizes at runtime, directly from user applications. + - Plugins can be created, configured, and built using only an installed + version of BLIS -- no source or source code changes required. + - Plugins support both reference and optimized kernels, as well as + custom configuration-to-kernel-set mappings. + - Building plugins (including reference and relevant optimized kernels) + for enabled architectures or architecture families is automated, as is + linking into the final library. + - The configure script is now installed as 'configure-plugin'. In this + mode, it can be used to initialize a plugin from a template including + optional example code, and prepare a build system for compiling the + plugin into a shared or static library. + - Additional configuration files, templates, and build system components + are also installed to '%prefix%/share/blis'. + - The cntx_t struct now has extensible data structures for holding + kernels, preferences, and blocksizes. These are based on a "stack" + structure which contains a list of fixed-size data blocks. Adding a + new entry (which may require allocating a new block or reallocating + the block pointer array) requires locking, but looking up entries is + lock-free and takes O(1) time. + - Kernels can depend on either 1 or 2 type parameters (e.g. + mixed-precision packing requires 2). The func2_t struct supports + the latter, but can be implicitly cast to func_t if only "diagonal" + entries are needed. The number of type parameters can be inferred from + the kernel ID for type safety. + - Functions have been added to register new kernels, preferences, and + blocksizes with the global kernel structure (gks). 
This creates + corresponding entries in each allocated context and returns the next + available ID. Plugins use this API to register user kernels, although + the user is responsible for tracking the returned IDs for later + lookup. Setting newly-registered reference kernels, as well as + overriding these with optimized kernels is done in exactly the same + manner as in bli_cntx_init_<arch>_ref() and bli_cntx_init_<arch>(). + + 2. Restructuring of the control and thread control trees. + - The control tree has been substantially restructured to support more + flexibility. + - The "default" control trees for gemm (also used for + hemm/symm/herk/her2k/syrk/syr2k/trmm/trmm3) and trsm are now + represented as a single structure containing all necessary control + tree nodes and parameters. + - An API has been added to modify the default gemm/trsm control trees. + - This same API is used by the framework and packm/gemm/trsm variants + to access specific control tree nodes. + - Users can alternatively create a custom control tree from scratch. + - The blocksizes are now encoded directly in the control tree, rather + than via loop IDs. The logic for adjusting blocksizes for certain + operations has been moved to the control tree initialization. + - Type information is encoded in the control tree to drive proper + selection of packing and computational kernels provided by the user. + - The packing microkernel now receives an opaque "params" struct which + is user-definable and can be used to pass additional information + through the call stack. + - The auxinfo_t struct has been updated with a .params field for + opaque user data as well as the global offsets of the current + microtile. + - The packm and gemm variants can be overridden by the user, and also + receive an opaque params struct via the associated control tree + node. 
+ - The structure-aware packing kernel bli_packm_struc_cxk() is no longer + hard-coded to be called from the default packm variant, but can be + overridden by the user. It also supports mixed-precision/mixed-domain + natively now. + - The thread control tree (thrinfo_t) is now created entirely up-front + by inspecting the control tree. The required number of threads at each + level is encoded in the control tree via loop IDs (actually a bitfield + of loop IDs), although the ordering and number of such IDs is + arbitrary. The logic for adjusting the number of threads at each level + based on operation type (e.g. trmm) is now in the control tree + initialization and expressed by combining loop IDs from multiple + levels into a single level. + - The mem_t object containing the pack buffer pointer has been moved + from the control tree to the thread control tree. NOTE: **The control + tree is now strictly const throughout the operation, and only a + single copy is shared by all threads.** + - The thread control tree node for packing has been changed so that + there is no longer a "fake" node indicating a team of single threads. + Instead, the number of threads and thread IDs in the "normal" thread + control tree node are used. This change has also been made to the + gemmsup thread control tree and packing variants, as well as to the + gemmlike sandbox. + - Parameters controlling packing (e.g. inversion of the diagonal, + direction, schema) are not stored directly in the control tree but in + the opaque params struct. The packing control tree node and its + default params struct are stored together in the "combined" + gemm/trsm control tree structure and initialized as a unit. Users can + update these parameters individually or substitute a custom packm + variant and params struct. + - The "target" and "execution" datatypes has been removed from the obj_t + struct and replaced by type information in the control tree. 
+ - The "sub-node" and "sub-prenode" of a control tree node have been + replaced by an arbitrary number of sub-nodes accessed by index. There + is a hard cap on the number of sub-nodes (currently 2). Sub-nodes are + added during control tree initialization, *after* + creation/initialization of the parent node through an updated API. + - The level-3 thread decorator has been significantly simplified and + directly calls bli_l3_int(). The control tree is created externally, + and it is no longer necessary to alias matrices or set object pack + schemas. Also, the rntm_t passed in may be NULL. Finally, family + and scalar information is no longer needed here. + - bli_l3_int() is now a simple inline function which extracts the next + control tree node and variant and calls it. + - bli_*_front() have been removed and inlined into the expert object + API with significant simplification. + - 1m (or other induced method) no longer uses an alternative cntx_t. + - The .pack_fn/.ker_fn pointers and associated params fields on the + obj_t were removed in favor of the present solution. + + 3. Overhaul of variable substitution in configure script. + - The configure script has been somewhat re-written to use a + centralized mechanism for substituting variables into build system and + other configuration files. + - All substitution variables go through the same pathway now, which + necessitated some variable naming changes for variables which were + named the same in e.g. Makefile and bli_config.h but with + different definitions. + - CC and CXX variables can now contain spaces, e.g. 'g++ -std=c++17'. + This provides better support for integration with build tooling such + as autotools. + + 4. Overhaul of packing kernels. + - Previously there were two packing kernels referenced in the cntx_t + structure for MRxk and NRxk shaped micropanels, respectively. These + have now been merged into one kernel which is responsible for packing + any dense rectangular portion of either A or B. 
+ - The packing kernel now receives information about the register + blocksize (cdim_max) and duplication factor (the "broadcast-B" + format, although this can also apply to the A matrix). + - The structure-aware packing kernel (bli_packm_struc_cxk(), which is + now user-overridable) also receives global offsets of the current + micropanel within A or B. + - Explicit kernels for packing the diagonal blocks of + triangular/symmetric/Hermitian matrices have been added to the + cntx_t. This means that the bli_packm_struc_ckx() "kernel" no longer + needs to directly touch data (except to zero out some regions). + - bli_packm_struc_cxk() has also been updated to work only in terms of + fundamental elements (i.e., real datatypes) when computing offsets and + when zeroing data, which greatly simplifies mixed-domain/1m packing. + - bli_packm_scalar() has been updated to better support complex scalars + in mixed-domain operations. + - Pack schemas for PACKED_ROW_PANELS* and PACKED_COL_PANELS* have + been merged into simply PACKED_PANELS*. This reflects the merging of + the packing kernels into a single generic kernel. There were only a + very few places which needed the row/column information and this is + now supplied by alternative means. + - Packing variants always behave "as if" the A matrix were being packed + (i.e. the code assumes packing column-stored row panels). Packing of B + is handled by applying an implicit or explicit transpose before + packing. This change also applies to gemmsup. + + 5. Improved MD/MP support. + - All level-3 operations (except trsm) now support full + mixed-domain/mixed-precision operation. + - Explicit 1m packing kernels have been added in the cntx_t. + - An explicit 1m microkernel wrapper has been added to the cntx_t. + - An extra packing kernel for the "ro" format has been added, along with + the pack_t enumeration value. 
This supports the packing for + real*complex -> real, including potential scaling by a complex alpha, + support for structured matrices, etc. + - Extra microkernel wrappers for mixed-domain operations have been added + to support the 'ccr' (and by extension, 'crc'), 'rcc', and 'crr' + cases. Notably this includes full support for general stride storage + and complex alpha/beta. + - Packing kernels and gemm microkernels are now "templated" based on two + type parameters rather than one. For packing this allows direct + optimization of mixed-precision kernels, and for gemm microkernels + this allows direct optimization of mixed-precision without writing to + a temporary buffer. Reference packing kernels are directly + instantiated for all mixes of precisions, while by default + mixed-precision gemm microkernels are supported via a microkernel + wrapper. The "old" way of specifying optimized kernels using a single + type parameter works unchanged. + - alpha and beta are typecast appropriately to the computational or + output datatype, respectively, and **always** to the complex domain. + Scalar typecasting has also been added to gemmsup for safety. + - The gemm macrokernel doesn't have to do any typecasting anymore, as a + microkernel wrapper or optimized mixed-precision/mixed-domain kernel + now handles this. + - 1m and mixed-domain operations now always use a microkernel wrapper, + rather than adjusting parameters in the gemm macrokernel. + - The gemmt macrokernel **does** still have to handle explicit + write-back of microtiles which intersect the diagonal, although + typecasting has already been performed. + - The gemmt_x_ker_var2(), trmm_xx_ker_var2(), and trsm_xx_ker_var2() + functions have been removed. The appropriate macrokernel pointer is + selected during control tree initialization. + - Real domain MR/NR are checked for even-ness based on the gemm + microkernel's row preference in order to guarantee proper 1m and + mixed-domain operation. 
+ - Full range of mixed-domain/mixed-precision functionality tested in the + testsuite ('input.*.mixed'). + + 6. Other changes: + - The build system has been updated to support C++ source files + throughout the framework. While the intent is not to add such files to + BLIS itself, this supports plugins written in C++. + - Many instances of configuration-specific code have been simplified by + introducing an INSERT_GENTCONF macro which instantiates a block of + code for each enabled sub-configuration. The ConfigurationHowTo.md + document has been updated accordingly. + - PASTEMAC?/PASTECH?/PASTEF77? have been removed in favor of + variadic macros which accept any number of arguments (up to a + reasonable limit). + - The INSERT_GENTFUNC* macros have been updated to clean up + mixed-precision and mixed-domain instantiations. + - bli_align_dim_to_mult() has been updated to support rounding either up + or down based on a flag. + - Checking for empty matrices and other early exits (level-3 only) has + been consolidated into a single utility function. + - The auxinfo_t struct is always passed as const. + - The new function bli_obj_alias_submatrix() aliases a matrix while also + resetting the root to NULL, offsets to zero (while adjusting the + buffer), and applying any implicit transpose. + - Level-3 pruning functions now only check matrix structure to see what + to do, not the operation family. + - gemmsup packing has been updated to use the "normal" pack buffer + allocation routines. + - Remove duplicate checks for early return from gemmsup handler. + - bli_determine_blocksize() has been significantly simplified. + - Partitioning packed panels is no longer allowed. + - Added bli_xxsame macros. + - Automated the calculation of info bit shifts and masks based on + predefined bit sizes for various flags. This greatly simplifies + reordering, adding, or removing flags from the info/info2 bitfields. 
+ - Moved more BLIS_NUM_* macros into the corresponding enums as the + last entry so that the value is automatically computed. + - Better const-correctness in some level0 scalar macros. + - Better mixed-precision support in some level0 scalar macros. + - Added a bli_axpbys_mxn() macro. + - bli_thread_range_sub() takes explicit thread ID and number of threads + rather than a thrinfo_t node. + - "De-templated" BLIS gemmlike sandbox (specifically, bls_gemm_bp_var1() + and bls_packm_var1()). + - Combined bls_l3_packm_[ab]() into one function with thin wrappers. + - Deleted bls_packm_var[23](). + - Add a "termination tag" to the testsuite output so that + 'make check-blis' can accurately check for successful completion. + - Add a new function to centrally compute FLOPs for level-3 operations + in the testsuite. + +commit a316d2c6c33fc1f8f7c58c4210ab203f48349041 +Author: Devin Matthews +Date: Thu Mar 28 12:52:00 2024 -0500 + + Fix incorrect commenting of `BLIS_RNTM_INITIALIZER` and `BLIS_OBJECT_INITIALIZER`. + +commit 664cc6bc3ea610b4ecea63d78c6024c48f045635 +Author: Devin Matthews +Date: Tue Mar 26 16:25:17 2024 -0500 + + Update BLIS_*_INITIALIZER macros for C++ compatibility. (#802) + + Details: + - Remove designated initializer syntax. This isn't officially supported + until C++20. + - Arrange initializers in the order in which they are defined in the + struct. Even with standard or extension support for designated + initializers, initializing non-static members out-of-order is an + error in C++. + - Remove the conditional code which uses '-1' as the default value of + the 'pack_buf' member of 'mem_t' in C, but 'BLIS_BUFFER_FOR_GEN_USE' + in C++. Simply use the latter as a common-sense default. + +commit 1a8c8180b32cf5988bf9eb5d2f0f8111a729993a +Author: John <50754967+j-bm@users.noreply.github.com> +Date: Thu Feb 15 12:35:10 2024 -0400 + + Add cpu part codes for various manufacturers and use in the code (#794) + + * Add cpu_id symbols for arm v8. 
+ + * Add symbols for arm v7. + + * Always assume firestorm on Apple aarch64. + + * Fixes incorrect usage of model vs. part in some places. + + * Fixes #793 + + --------- + + Co-authored-by: J + +commit c382d8bdccc07e22a341fe04960f0cbf4eec083b +Author: Igor Zhuravlov +Date: Sun Jan 14 04:03:31 2024 +0000 + + Fix errors and typos in docs/BLIS*API.md (#791) + + Details: + - Fixed errors and unified formatting in docs/BLIS*API.md docs. + +commit a72e4569f2a03cc3578c019bf7ce25491a44137d +Author: Field G. Van Zee +Date: Wed Dec 6 18:21:47 2023 -0600 + + Include bli_config.h before bli_system.h in cblas.h. (#789) + + Details: + - Previously, in cblas.h, bli_config.h was being #included *after* + bli_system.h, which meant that the BLIS_ENABLE_SYSTEM macro was + never defined in time for proper OS detection. This bug only + affected cblas.h -- blis.h had been correctly #including + bli_config.h before bli_system.h since fb93d24. Thanks to + Edward Smyth for reporting this bug and suggesting the fix. + +commit 1236ddab455ef3a6293ab394ff06b3a19c2913d9 +Author: Field G. Van Zee +Date: Sun Dec 3 16:42:34 2023 -0600 + + Fixed random segfault in test/3 drivers. (#788) + + Details: + - Fixed a segfault in the non-gemm test drivers in test/3 that was the + result of sometimes leaving either .n_str or .k_str fields of the + params_t struct uninitialized, depending on the operation in question. + For example, in test_hemm.c, init_def_params() would only initialize + the .m_str and .n_str fields, but not the .k_str field. Even though + hemm doesn't use a 'k' dimension, the proc_params() function (called + via parse_cl_params()) universally attempts to convert all three into + integers via sscanf(), which was understandably failing when one of + those strings was a NULL pointer. I'm not sure how this code ever + worked to begin with. Special thanks to Leick Robinson for finding and + reporting this bug. + +commit 141a6c9a8e7557d9c7d28aecedec9dc5377dba13 +Author: Field G. 
Van Zee +Date: Tue Nov 21 12:26:43 2023 -0600 + + Install helper headers to INCDIR prefix. (#787) + + Details: + - Install one-line headers to INCDIR whose entire purpose is to + #include the actual headers within the local 'blis' header directory + so that applications can #include "blis.h" instead of #include <blis/blis.h> + (and/or "cblas.h" instead of <blis/cblas.h> if CBLAS is + enabled) when headers are installed to global paths. (Note that + INCDIR is the installation prefix for headers as specified by + '--includedir=INCDIR', which defaults to 'PREFIX/include' if not + specified.) Not sure how this problem went unreported for so long, + since presumably any user trying to #include "blis.h" from a global + installation would have encountered a compiler error. + - The one-line blis.h and cblas.h headers now reside in the 'build' + directory, ready to install as is. + - Thanks to Jed Brown for reporting this via Issue #786, and to + Devin Matthews and Mo Zhou for their engagement. + - Harmonized the rule in the top-level Makefile for installing blis.pc + into SHAREDIR/pkgconfig with conventions for others vis-a-vis + verbosity/non-verbosity. + +commit 2d9439298b336aa6d0ee000a5285a3adb4e6d462 +Author: Devin Matthews +Date: Tue Nov 21 12:18:07 2023 -0600 + + Allow users to defines [sd]complex using std::complex (#784) + + Details: + - In C++ applications, it makes a lot of sense to interface to BLIS + using C++'s standard complex number library, which uses a template + class std::complex. Obviously BLIS doesn't know anything about this + and defaults to a custom struct to represent complex numbers. This PR + updates the bli_[cz]{real,imag}() functions to accept std::complex + numbers when a C++ compiler is being used. Note that this has no + effect on the compilation of the BLIS library (or testsuite), and only + comes into play when including blis.h into a C++ project and forcing + the use of std::complex for scomplex and dcomplex. 
+ - The application can explicitly request std::complex-based types via: + + #define BLIS_ENABLE_STD_COMPLEX + #include <blis.h> + // Call BLIS functions using std::complex here. + + - Fixed a bug in the definition of some scalar level-0 macros, since + bli_creal()/bli_cimag() and bli_zreal()/bli_zimag() are no longer + interchangeable. + +commit f7ce54a252028483e4c6af619015eb22063d5541 (origin/1.0-rc0) +Author: Field G. Van Zee +Date: Fri Nov 3 15:52:57 2023 -0500 + + CREDITS file update. + +commit 05388ddb66f8bf2d62009b162d64bf2d99226b83 +Author: Aaron Hutchinson <113382047+Aaron-Hutchinson@users.noreply.github.com> +Date: Fri Nov 3 13:30:31 2023 -0700 + + Added 'sifive_x280' subconfig, kernel set. (#737) + + Details: + - Added a new 'sifive_x280' subconfiguration for SiFive's x280 RISC-V + instruction set architecture. The subconfig registers kernels from a + correspondingly new kernel set, also named 'sifive_x280'. + - Added the aforementioned kernel set, which includes intrinsics- and + assembly-based implementations of most level-1v kernels along with + level-1f kernels axpy2v dotaxpyv, packm kernels, and level-3 gemm, + gemmtrsm_l, and gemmtrsm_u microkernels (plus supporting files). + - Registered the 'sifive_x280' subconfig as belonging to a singleton + family by the same name. + - Added an entry to '.travis.yml' to test the new subconfig via qemu. + - Updates to 'travis/do_riscv.sh' script to support the 'sifive_x280' + subconfig and to reflect updated tarball names. + - Special thanks to Lee Killough, Devin Matthews, and Angelika Schwarz + for their engagement on this commit. + +commit 7a87e57b69d697a9b06231a5c0423c00fa375dc1 (origin/10.0-rc0) +Author: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com> +Date: Sat Oct 14 02:05:41 2023 -0500 + + Fixed HPX barrier synchronization (#783) + + Details: + - Fixed hpx barrier synchronization. HPX was hanging on larger cores + because blis was using non-hpx synchronization primitives. 
But when + using hpx-runtime only hpx-synchronization primitives should be used. + Hence, a C style wrapper hpx_barrier_t is introduced to perform hpx + barrier operations. + - Replaced hpx::for_loop with hpx::futures. Using hpx::for_loop with + hpx::barrier on n_threads greater than actual hardware thread count + causes synchronization issues causing hpx to hang. This can be avoided + by using hpx::futures, which are relatively very lightweight, robust + and scalable. + +commit 8fff1e31da1c87e46cacec112b0ac280ab47cd8b +Author: Field G. Van Zee +Date: Thu Oct 12 15:51:41 2023 -0500 + + Fixed bug in sup threshold registration. (#782) + + Details: + - Fixed a bug that resulted in BLIS non-deterministically calling the + gemmsup handler, irrespective of the thresholds that are registered + via bli_cntx_set_blkszs(). + - Deep dive: In bli_cntx_init_ref.c, the default values for the gemmsup + thresholds (BLIS_[MNK]T blocksizes) were being set to zero so that no + operation ever matched the criteria for gemmsup (unless specific sup + thresholds are registered). HOWEVER, these thresholds are set via + bli_cntx_set_blkszs() which calls bli_blksz_copy_if_pos(), which was + only copying the thresholds into the gks' cntx_t if the values were + strictly positive. Thus, the zero values passed into + bli_cntx_set_blkszs() were being ignored and those threshold slots + within the gks were left uninitialized. The upshot of this is that the + reference gemmsup handler was being called for gemm problems + essentially at random (and as it turns out, very rarely the reference + gemmsup implementation would encounter a divide-by-zero error). + - The problem was fixed by changing bli_blksz_copy_if_pos() so that it + copies values that are non-negative (values >= 0 instead of > 0). 
The + function was also renamed to bli_blksz_copy_if_nonneg() + - Also needed to standardize use of -1 as the sole value to embed into + blksz_t structs as a signal to bli_cntx_set_blkszs() to *not* register + a value for that slot (and instead let whatever existing values + remain). This required updates to the bli_cntx_init_*() functions for + bgq, cortexa9, knc, penryn, power7, and template subconfigs, as some + of these codes were using 0 instead of -1. + - Fixes #781. Thanks to Devin Matthews for identifying, diagnosing, and + proposing a fix for this issue. + +commit 1e264a42474b535431768ef925bbd518412d392e +Author: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com> +Date: Mon Oct 2 18:29:46 2023 -0500 + + Update zen3 subconfig to support NVHPC compilers. (#779) + + Details: + - Parse $(CC_VENDOR) values of "nvc" in 'zen3' make_defs.mk file. + - Minor refactor to accommodate above edit. + - CREDITS file update. + +commit c2099ed2519dcac8ee421faf999b36e1c2260be7 +Author: Field G. Van Zee +Date: Mon Oct 2 14:56:48 2023 -0500 + + Fixed brokenness when sba is disabled. (#777) + + Details: + - Previously, disabling the sba via --disable-sba-pools resulted in a + segfault due to a sanity-check-triggering abort(). The problem was + that the sba, as currently used in the l3 thread decorators, did not + yet (fully) support pools being disabled. The solution entailed + creating wrapper function, bli_sba_array_elem(), which either calls + bli_apool_array_elem() (when sba pools are enabled at configure time) + or returns a NULL sba_pool pointer (when sba pools are disabled), and + calling bli_sba_array_elem() in place of bli_apool_array_elem(). Note + that the NULL pointer returned by bli_sba_array_elem() when the sba + pools are disabled does no harm since in that situation the pointer + goes unreferenced when acquiring and releasing small blocks. Thanks to + John Mather for reporting this bug. 
+ - Guarded the bodies of bli_sba_init() and bli_sba_finalize() with + #ifdef BLIS_ENABLE_SBA_POOLS. I don't think this was actually necessary + to fix the aforementioned bug, but it seems like good practice. + - Moved the code in bli_l3_thrinfo_create() that checked that the array* + pointer is non-NULL before calling bli_sba_array_elem() (previously + bli_apool_array_elem()) into the definition of bli_sba_array_elem(). + - Renamed various instances of 'pool' variables and function parameters + to 'sba_pool' to emphasize what kind of pool it represents. + - Whitespace changes. + +commit 37ca4fd168525a71937d16aaf6a13c0de5b4daef +Author: Field G. Van Zee +Date: Thu Sep 28 16:37:57 2023 -0500 + + Implemented [cz]symv_(), [cz]syr_(), [cz]rot_(). (#778) + + Details: + - Expanded existing BLAS compatibility APIs to provide interfaces to + [cz]symv_(), [cz]syr_(). This was easy since those operations were + already implemented natively in BLIS; the APIs were previously + omitted only because they were not formally part of the BLAS. + - Implemented [cz]rot_() by feeding code from LAPACK 3.11 through + f2c. + - Thanks to James Foster for pointing out that LAPACK contains these + additional symbols, which prompted these additions, as well as for + testing the [cz]rot_() functions from Julia's test infrastructure. + - CREDITS file update. + +commit 6f412204004666abac266409a203cb635efbabf3 +Author: Field G. Van Zee +Date: Tue Sep 26 18:00:54 2023 -0500 + + Added 'altra', 'altramax' subconfigs. (#775) + + Details: + - Forward-ported 'altra' and 'altramax' subconfigurations from the + older 'stable' branch lineage [1]. These subconfigs primarily target + the Ampere Altra and AltraMax (ARM) processors. They also contain + "QuickStart" directories with information and scripts to help + use BLIS on these microarchitectures. Thanks to Jeff Diamond and + Leick Robinson for developing these subconfigs and resources. 
+ - Updated kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c according to + changes in the 'stable' lineage, mostly related to re-enabling of + assembly code branches that target general stride IO. + + [1] Note that the 'stable' branch is being used to make sure that more + recent commits do not introduce unreasonable performance + regressions. As such, the name should be interpreted as shorthand + for "performance stable," not "API stable." + +commit a4a63295b96ed5b32f4df6477d24db07bf431202 +Author: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com> +Date: Tue Sep 26 17:58:38 2023 -0500 + + Fixes to HPC runtime code path. (#773) + + Details: + - Fixed hpx::for_each invocation and replace with hpx::for_loop. The HPX + runtime was initialized using hpx::start, but the hpx::for_each + function was being called on a non-hpx runtime (i.e standard BLIS + runtime - single main thread). To run hpx::for_each on HPX runtime + correctly, the code now uses hpx::run_as_hpx_thread(func, args...). + - Replaced hpx::for_each with hpx::for_loop, which eliminates use of + hpx::util::counting_iterator. + - Employ hpx::execution::chunk_size(1) to make sure that a thread + resides on a particular core. + - Replaced hpx::apply() with updated version hpx::post(). + - Initialize tdata->id = 0 in libblis.c to 0, as it is the main thread + and is needed for writing results to output file. + - By default, if not specified, the HPX runtime uses all N threads/cores + available in the system. But, if we want to only specify n_threads out + N threads, we use hpx::execution::experimental::num_cores(n_threads). + +commit c6546c1131b1ddd45ef13f9f2b620ce2e955dbf8 +Author: John Mather <54645798+jmather-sesi@users.noreply.github.com> +Date: Wed Sep 20 13:41:07 2023 -0400 + + Fixed broken link in Multithreading.md. (#774) + + Details: + - Replaced 404'd link in docs/Multithreading.md with an archive from + The Wayback Machine. + - CREDITS file update. 
+ +commit 6dcf7666eff14348e82fbc2750be4b199321e1b9 +Author: Field G. Van Zee +Date: Sun Aug 27 14:18:57 2023 -0500 + + Revamped bli_init() to use TLS where feasible. (#767) + + Details: + - Revamped bli_init_apis() and bli_finalize_apis() to use separate + bli_pthread_switch_t objects for each of the five sub-API init + functions, with the objects for the 'ind' and 'rntm' sub-APIs being + declared with BLIS_THREAD_LOCAL. This allows some APIs to be treated + as thread-local and the rest as thread-shared. Thanks to Edward Smyth + for requesting application thread-specific rntm_t structs, which + inspired these changes. + - Combined bli_thread_init_from_env() and bli_pack_init_from_env() into + a new function, bli_rntm_init_rntm_from_env(), and placed the combined + code in bli_rntm.c inside of a new bli_rntm_init() function. Then + removed the (now empty) bli_pack_init() and _finalize() function defs. + - Deprecated bli_rntm_init() for the purposes of initializing a rntm_t + (temporarily preserving it as bli_rntm_clear() in a cpp-undefined code + block) so that the function name could be used for the aforementioned + bli_rntm_init() function. + - Updated libblis_test_pobj_create() in test_libblis.c to use a static + rntm_t initializer instead of the deprecated bli_rntm_init() + function-based option. + - Minor updates to docs/Multithreading.md, including removal of + bli_rntm_init() in the example of how to initialize rntm_t structs. + - Changed the return value of bli_gks_init(), bli_ind_init(), + bli_memsys_init(), bli_thread_init(), and bli_rntm_init() (and their + finalize() counterparts) from 'void' to 'int' so that those functions + match the function type expected by bli_pthread_switch_on()/_off(). + Those init/finalize functions now return 0 to indicate success, which + is needed so that the switch actually changes state from off to on + and vice versa. 
+ - Defined bli_thread_reset(), which copies the contents of the + global_rntm_at_init() struct into the global_rntm struct (for the + current application thread). + - Guard calls to bli_pthread_mutex_lock()/_unlock() in + - bli_pack_set_pack_a() and _pack_b() + - bli_rntm_init_from_global() + - bli_thread_set_ways() + - bli_thread_set_num_threads() + - bli_thread_set_thread_impl() + - bli_thread_reset() + - bli_l3_ind_oper_set_enable() + with #ifdef BLIS_DISABLE_TLS (since TLS precludes the possibility of + race conditions). + - In frame/base/bli_rntm.c, declare global_rntm, global_rntm_at_init, + and global_rntm_mutex as BLIS_THREAD_LOCAL so that separate + application threads can change the number of ways of BLIS parallelism + independently from one another. + - Access global_rntm only via a new private (not exported) function, + bli_global_rntm(). Defined a similar function for a rntm_t new to + this commit, global_rntm_at_init, which preserves the state of the + global rntm at initialization-time. + - In frame/3/bli_l3_ind.c, added a guard to the declaration of the + static variable oper_st_mutex with #ifdef BLIS_DISABLE_TLS so that the + mutex is omitted altogether when TLS is enabled (which prevents the + compiler from warning about an unused variable). + - Removed redundant code from bli_thread.c: + #ifdef BLIS_ENABLE_HPX + #include "bli_thread_hpx.h" + #endif + since this code is already present in bli_thread.h. + - Thanks to Minh Quan Ho for his review of and feedback on this commit. + - Comment updates. + +commit fa6a9b24ae2ddbd5f30f657d46004843581c768c +Author: Field G. Van Zee +Date: Sat Aug 19 12:44:34 2023 -0500 + + Fixed error when using common.mk from testsuite. (#768) + + Details: + - Commit 2db31e0 (#755) inserted logic into common.mk that attempts to + preprocess build/detect/android/bionic.h to determine whether the + __BIONIC__ macro is defined (in which case -lrt should not be included + in LDFLAGS). 
However, the path to bionic.h was encoded without regard + to DIST_PATH, and so utilizing common.mk anywhere that isn't the top- + level directory (such as in the testsuite directory) resulted in a + compiler error: + + gcc: error: build/detect/android/bionic.h: No such file or directory + gcc: fatal error: no input files + compilation terminated. + + This commit adds a $(DIST_PATH) prefix to the path to bionic.h so that + it can be located from other applications' Makefiles that use BLIS's + makefile fragments. + +commit 634e532c8dcce7383d96ba33276df65c656b2198 +Author: Field G. Van Zee +Date: Wed Aug 9 21:54:49 2023 -0500 + + Set thrcomm timpl_t id inside init functions. (#766) + + Details: + - Previously, the timpl_t id being used when a thrcomm_t is being + initialized was set within the bli_thrcomm_init() dispatch function + after the timpl_t-specific bli_thrcomm_init_*() function returned. But + it just occurred to me that each bli_thrcomm_init_*() function already + intrinsically knows its own timpl_t value. This commit shifts the + setting of the thrcomm_t.ti field into the corresponding + bli_thrcomm_init_*() function for each timpl_t type (e.g. single, + openmp, pthreads, hpx). + - Removed long-deprecated code dating back nearly 10 years. + - Whitespace changes + - Comment updates. + +commit 3cf17b4a91232709bc6a205b0e4d7ecc96579aa9 +Author: Field G. Van Zee +Date: Mon Aug 7 13:46:20 2023 -0500 + + Small fixes/improvements to docs/Multithreading.md. (#764) + + Details: + - Added reminders that #include "blis.h" must be added to source files + in order to access BLIS API function prototypes. Thanks to Barry Smith + for suggesting this improvement. + - Fixed pre-existing typos. + - CREDITS file update. + +commit dbc79812c390f812c7bf030bfcf87e947a1443c4 +Author: Field G. Van Zee +Date: Fri Jul 28 18:16:38 2023 -0500 + + CREDITS file update. + + Details: + - Thanks to Igor Zhuravlov for PR #753 (commit 915daaa). 
+ +commit 915daaa43cd189c86d93d72cd249714f126e9425 +Author: Igor Zhuravlov +Date: Thu Jul 27 20:33:59 2023 +0000 + + Fix typos in docs + example code comments. (#753) + + Details: + - Fixed various typos in API documentation in docs/BLIS*API.md and + comments in the source code examples within examples/?api/*.c. + +commit 2db31e057e7e9c97fc60021b5ae72a01a48d7588 +Author: Lee Killough <15950023+leekillough@users.noreply.github.com> +Date: Thu Jul 27 15:27:21 2023 -0500 + + Exclude -lrt on Android with Bionic libraries. (#755) + + Details: + - Added build/detect/android/bionic.h header to test whether the + __BIONIC__ cpp macro is defined. + - In common.mk, only add -lrt to LDFLAGS when Bionic is not present. + - CREDITS file update. + +commit 22ad8c1b752364784f320168b31995945ad84a59 +Author: ct-clmsn +Date: Thu Jul 27 16:23:29 2023 -0400 + + Small fixes to support hpx in the testsuite (#759) + + Details: + - Minor changes to test_libblis.c to support hpx. + +commit c91b41d022e33da82b3b06c82be047a29873d9b6 +Author: Lee Killough <15950023+leekillough@users.noreply.github.com> +Date: Wed Jul 26 14:37:08 2023 -0500 + + Auto-detect the RISC-V ABI of the compiler and use -mabi= during RISC-V Builds (#750) + + Details: + - Generate a build error if there is a 32/64-bit mismatch between the + RISC-V ABI or architecture and the BLIS configuration selected. + - Handle Q, Zicsr, ZiFencei, Zba, Zbb, Zbc, Zbs and Zfh extensions in + the RISC-V architecture auto-detection. ZiFencei and Zicsr are not + detectable with built-in RISC-V macros right now. + - ZiFencei is not important for BLIS because it doesn't have + Just-In-Time compilation or self-modifying code, and Zicsr is implied + by the floating-point extensions, which are required for good + performance in BLIS. + - Move RISC-V autodetect header files to build/detect/riscv/. + +commit a0b04e3c007f1207e5678bf20c07752906742fb7 (origin/aocl-blas, aocl-blas) +Author: Field G. 
Van Zee +Date: Mon Jun 26 17:59:21 2023 -0500 + + Rewrote regen-symbols.sh (gen-libblis-symbols.sh). (#751) + + Details: + - Wrote an alternative to regen-symbols.sh, gen-libblis-symbols.sh, + that generates a list of exported symbols from the monolithic blis.h + file rather than peeking inside of the shared object via nm. (This new + script lives in the 'build' directory and the older script has been + retired to build/old.) Special thanks to Devin Matthews for authoring + gen-libblis-symbols.sh. + - Added a 'symbols' target to the top-level Makefile which will refresh + build/libblis-symbols.def, with supporting changes to common.mk. + - Updates to build/libblis-symbols.def using the new symbol-generating + script. + +commit 6b894c30b9bb2c2518848d74e4c8d96844f77f24 +Author: Field G. Van Zee +Date: Mon Jun 12 17:22:44 2023 -0500 + + Rewrote/fixed broken tree barrier implementation. + + Details: + - Rewrote the definition of bli_thrcomm_tree_barrier() so that it (a) + actually worked again, and (b) used atomics instead of a basic C99 + spin loop. (Note that the conventional barrier implementation is + still enabled by default; the tree barrier must be toggled on + manually within the configuration.) + - Added an early return to the definition of bli_thrcomm_barrier() in + the cases where comm == NULL or comm->n_threads == 1. + - Reordered thread-related and thread-dependent header #include + directives in blis.h so that the BLIS_TREE_BARRIER and + BLIS_TREE_BARRIER_ARITY macros, which would be defined in the target + configuration's bli_family_*.h file, would be #included prior + to the inclusion of the thrcomm_t header that uses them. + - Changed the type of barrier_t.count from 'int' to 'dim_t'. + - Changed the type of barrier_t.signal from 'volatile int' to 'gint_t'. + - Special thanks to Leick Robinson for contributing these changes. + - Whitespace changes. + +commit d639554894b6252a86bd3164921bce6fbb9e3b5e +Author: Field G. 
Van Zee +Date: Wed Jun 7 16:11:14 2023 -0500 + + Pad thrcomm_t fields to avoid false sharing. + + Details: + - Inserted a cache line of padding between various fields of the + thrcomm_t and, in the case of the (presently defunct) tree barrier, + fields of the barrier_t. This additional padding ensures that these + fields, which both serve different purposes when performing a thread + barrier, are only accessed when needed (and not just due to their + spatial locality with their cache line neighbors). + - Added a new cpp macro constant, BLIS_CACHE_LINE_SIZE, to + bli_config_macro_defs. This new constant defines the size of a cache + line (in bytes) and defaults to 64. + - Special thanks to Leick Robinson for discovering this false sharing + issue and developing/submitting the patch. + +commit 89b7863fc9a88903917deedc6a5ad9fd17f83713 +Author: Devin Matthews +Date: Mon May 8 16:51:18 2023 -0500 + + Fix 1m enablement for herk/her2k/syrk/syr2k. (#743) + + Details: + - Ever since 28b0982, herk, her2k, syrk, and syr2k have been implemented + in terms of the gemmt expert API. And since the decision of which + induced method to use (1m or native) is made *below* the level of the + expert API, executing any of {herk,her2k,syrk,syr2k} results in BLIS + checking the enablement status for gemmt. + - This commit applies a band-aid of sorts to this issue by modifying + bli_l3_ind_oper_get_enable() and bli_l3_ind_oper_set_enable() so that + any attempts to query or modify the internal enablement status for + herk, her2k, syrk, or syr2k instead does so for gemmt. + - This solution isn't perfect since, in theory, the user could enable 1m + for, say, herk but then disable it for syrk, and then be confused when + herk runs via native execution. But we don't anticipate that users + modify 1m enablement at the operation level, and so in practice this + solution is likely fine for now. 
+ +commit 138de3b3e88c5bf7d8718c45c88811771cf42db8 +Author: Ajay Panyala +Date: Sun May 7 13:01:38 2023 -0700 + + add nvhpc compiler support (#719) + + Add detection of the NVIDIA nvhpc compiler (`nvc`) in `configure`, and adjust some warning options in `config.mk`. Currently, no specific options for `nvc` have been added in the relevant configurations so it may not be usable without further tweaks. + +commit 0873c0f6ed03fea321d1631b3d1a385a306aa797 +Author: Devin Matthews +Date: Sun May 7 14:03:19 2023 -0500 + + Consolidate INSERT_ macro sets via variadic macros. (#744) + + Details: + - Consolidated INSERT_GENTFUNC_* (and corresponding GENTPROT) macro sets + using variadic macros (__VA_ARGS__), which means we no longer need a + different INSERT_ macro for each possible number of arguments the + macro might take. This change seems reasonable given that variadic + macros are a standard C99 feature and widely supported. I took care + not to use variadic macros where 0 variadic arguments are expected + since that is a non-standard extension. + - Added pre-typecast parentheses to arithmetic expressions in printf() + statements in bli_thread_range_tlb.c. + +commit ef9d3e6675320a53e7cb477c16b01388e708b1da +Author: h-vetinari +Date: Sun May 7 04:59:35 2023 +1100 + + Added missing #include for Windows. (#747) + + Details: + - This commit fixes issue #746, in which the _access() function (called + from within blastest/f2c/open.c) is undeclared when compiling on + Windows with clang 16. + +commit 6fd9aabb03d172a792a7eeb106c7d965cf038421 +Author: Devin Matthews +Date: Fri May 5 14:22:52 2023 -0500 + + Fix bug in detecting Fortran compiler vendor (#745) + + `FC` was used instead of `found_fc`. + +commit 8215b02f99aa77ecc7d813508c247565115319d7 +Author: Lee Killough <15950023+leekillough@users.noreply.github.com> +Date: Wed Apr 12 12:59:27 2023 -0500 + + Apply #738 to make_defs.mk of RISC-V subconfigs. 
(#740) + + Details: + - PR #738 -- which moved -fPIC flag insertion responsibilities from + common.mk to the subconfigs' individual make_defs.mk files -- was + merged shortly before the introduction of new RISC-V subconfigs in + #693. This commit brings those RISC-V subconfigs up to date with the + new -fPIC conventions. + +commit 6b38c5ac07a2a27738674784e58aa699bf895447 +Author: angsch <17718454+angsch@users.noreply.github.com> +Date: Tue Apr 11 19:27:43 2023 +0200 + + Add RISC-V target (#693) + + Details: + - There are four RISC-V base configurations: 'rv32i', 'rv32iv', 'rv64i', + and 'rv64iv', namely the 32-bit and 64-bit implementations with and + without the 'V' vector extension. Additional extensions such as 'M' + (multiplication), 'A' (atomics), 'F' ('float' hardware support), 'D' + ('double' hardware support), and 'C' (compressed-length instructions), + are automatically used when available. If they are not available, then + software equivalents (e.g., softfloat and -latomic) are used. + - './configure auto' can be invoked on a RISC-V build platform, and will + automatically detect RISC-V CPU extensions through the RISC-V C API: + https://github.com/riscv-non-isa/riscv-c-api-doc/blob/master/riscv-c-api.md + - The assembly kernels assume the presence of the vector extension + RVV 1.0. + - It is possible to build 'rv[32,64]iv' for any value of VLEN. + However, if VLEN < 128, the targets will fall back to the generic + kernels and blocksizes. + - The vector microkernels are vector-length agnostic and work with + every VLEN >=128, but are expected to work best with smaller vector + lengths, i.e., VLEN <= 512. + - The assembly kernels cover column major storage (rs_c == 1). + - The blocksizes aim at being a good generic choice for out-of-order + cores. They are not tuned to a specific RISC-V HPC core. + - The vector kernels have been tested using vlen={128,256,512}. 
+ - The single- and double-precision assembly code routines for 'sgemm' + and 'dgemm', or for 'cgemm' and 'zgemm', are combined in their RISC-V + vector assembly source code, and are differentiated only with macros. + - The XLEN=32 and XLEN=64 versions of the RISC-V assembly code are + identical, except that callee-saved registers are saved and restored + differently. There are RISC-V assembly code #include files for + handling the saving and restoring of callee-saved registers, and they + are future-proof if ever XLEN=128. + - Multiplications, such as computing array strides and offsets, are + performed in C, and later passed to the RISC-V assembly kernels. This + is so that the compiler can determine whether the 'M' (multiply) + extension is available and use multiplication instructions, or call + library helper functions instead. + - A new macro called bli_static_assert() has been added to perform + static assertions at compile-time, regardless of the C/C++ dialect of + the compiler. The original motivation of this was to ensure that + calling RISC-V assembly kernels would not silently truncate arguments + of type 'dim_t' or 'inc_t' (so-called "narrowing conversions"). + - RISC-V CI tests have been added to Travis CI, using the + riscv-gnu-toolchain cross-compiler, and qemu simulator. + - Thanks to Lee Killough for collaborating on this commit. + +commit 593d01761910af6a9a16ee0ac097142732f73c29 +Author: Field G. Van Zee +Date: Sat Apr 8 16:44:16 2023 -0500 + + CREDITS file update. + +commit 259f68479671bbaf9c5986759aaa0004f9b05a24 +Author: Field G. Van Zee +Date: Fri Apr 7 16:11:34 2023 -0500 + + CREDITS file update. + + Details: + - Added attributions associated with commits: + - 98d4678 9b1beec: @bartoldeman + - 2b05948 059f151: @ct-clmsn + - Reordered attribution for @decandia50. + +commit aea8e1d9243631635ca788d5e14f0f29328e637d +Author: Field G. Van Zee +Date: Mon Apr 3 12:17:51 2023 -0500 + + Optionally disable thread-local storage. 
(#735) + + Details: + - Implemented a new configure option, --disable-tls, which allows the + user to optionally disable the use of thread-local storage qualifiers + on static variables in BLIS. This option will rarely be needed, but + in some situations may allow BLIS to compile when TLS is unavailable. + Thanks to Nick Knight for suggesting this option. + - Unlike the --disable-system option, --disable-tls does not forcibly + disable threading. Instead, warnings of the possible consequences of + using threading with TLS disabled are added to: + - the output of './configure --help'; + - the output of 'configure' when the --disable-tls option is parsed; + - the informational header output by the testsuite. + Thanks to Minh Quan Ho for suggesting these warnings. + - Modified frame/include/bli_lang_defs.h so that BLIS_THREAD_LOCAL is + defined to nothing when BLIS_ENABLE_TLS is not defined. + - Defined bli_info_get_enable_tls(), which returns whether the cpp macro + BLIS_ENABLE_TLS was defined. + - Edited --disable-system configure status output for clarity. + - Whitespace updates. + +commit 3f1432abe75cc306ef90a04381d7e0d8739fded8 +Author: Lee Killough <15950023+leekillough@users.noreply.github.com> +Date: Mon Apr 3 12:10:59 2023 -0500 + + Add output.testsuite to .gitignore (#736) + + Details: + - Added `output.testsuite` to .gitignore since it was previously not + being matched by `output.testsuite.*`. + +commit 38fc5237520a2f20914a9de8bb14d5999009b3fb +Author: Field G. Van Zee +Date: Thu Mar 30 17:30:07 2023 -0500 + + Added mm_algorithm pdf files (bp and pb). + + Details: + - Added PDF versions of the PowerPoint files added in 17cd260. + +commit 17cd260cb504b2f3997c32daec77f4c828fbb32b +Author: Field G. Van Zee +Date: Wed Mar 29 21:47:12 2023 -0500 + + Added mm_algorithm pptx files (bp and pb). + + Details: + - Added two PowerPoint files that contain slides depicting the classic + Goto algorithm for matrix multiplication as well as its sister + "panel-block" algorithm. 
These files reside in docs/diagrams. + +commit 9d778e0f7c94d8752dd578101e4fc6893a1f54ef +Author: Field G. Van Zee +Date: Wed Mar 29 17:36:49 2023 -0500 + + Move -fPIC insertion to subconfigs' make_defs.mk. (#738) + + * Move -fPIC insertion to subconfigs' make_defs.mk. + + Details: + - Previously, common.mk was appending -fPIC to the CPICFLAGS variables + set within the various subconfigurations' make_defs.mk files. This + seemed somewhat unintuitive, and so now the -fPIC flag is assigned to + the various subconfigs' CPICFLAGS variables in the respective + make_defs.mk files. + - This commit also changes the logic in common.mk so that instead of + appending, the variable is overwritten, but now *only* in the case + of Windows (since apparently -fPIC needs to be omitted there). Thanks + to Nick Knight for catching and reporting this weirdness. + +commit 04090df01175477394d1e73af2e5769751d47cd6 +Author: Field G. Van Zee +Date: Mon Mar 27 14:13:10 2023 -0500 + + Fixed compile errors with `BLIS_DISABLE_BLAS_DEFS`. (#730) + + * Fixed compile errors with BLIS_DISABLE_BLAS_DEFS. + + Details: + - This commit fixes a compile-time error related to the type definition + (prototype) of dsdot_() when BLIS_DISABLE_BLAS_DEFS is defined by the + application (or the configuration), which is actually a symptom of a + larger design issue when disabling BLAS prototypes. The macro was + intended to allow applications to bring their own BLAS prototypes and + suppress the inclusion of duplicate (or possibly conflicting) + prototypes within blis.h. However, prototypes are still needed during + compilation even if they are ultimately omitted from blis.h. The + problem is that almost every source file in BLIS--including the BLAS + compatibility layer--only includes one header (blis.h), and if we + were to #include a new header in the BLAS source files (to isolate + only the BLAS prototypes), we would also have to make the build system + aware of the location of those headers.
Thanks to Edward Smyth of AMD + for reporting this issue. + - The solution I settled upon was to remove all cpp guards from all BLAS + headers (by changing them to #if 1, for easy search-and-replace + anchoring in the future if we ever need to re-insert guards) and + modifying bli_blas.h so that the BLAS prototypes are #included if + either (a) BLIS_ENABLE_BLAS_DEFS is defined, or (b) + BLIS_ENABLE_BLAS_DEFS is *not* defined but BLIS_IS_BUILDING_LIBRARY + *is* defined. (Thanks to Devin Matthews for steering me away from an + inferior solution.) + - This commit also spins off the actual BLAS prototypes/definitions to + a separate file, bli_blas_defs.h. + - CREDITS file update. + +commit 5f841307f668f65b7ed5a479bd8374d2581208cf +Author: Field G. Van Zee +Date: Fri Mar 24 20:05:13 2023 -0500 + + Omit -fPIC if shared library build is disabled. (#732) + + Details: + - Updated common.mk so that when --disable-shared option is given to + configure: + 1. The -fPIC compiler flag is omitted from the individual + configuration family members' CPICFLAGS variables (which are + initialized in each subconfig's make_defs.mk file); and + 2. The BUILD_SYMFLAGS variable, which contains compiler flags needed + to control the symbol export behavior, is left blank. + - The net result of these changes is that flags specific to shared + library builds are only used when a shared library is actually + scheduled to be built. Thanks to Nick Knight for reporting this issue. + - CREDITS file update. + +commit 72c37eb80f964b7840377076e5009aec5b29d320 (origin/riscv) +Author: Lee Killough <15950023+leekillough@users.noreply.github.com> +Date: Thu Mar 23 16:01:55 2023 -0500 + + Updated configure to pass all shellcheck checks. (#729) + + Details: + - Modified configure so that it passes all 'shellcheck' checks, + disabling ones which we violate but which are just stylistic, or are + special cases in our code. 
+ - Miscellaneous other minor changes, such as rearranged redirections in + long sed/perl pipes to look more natural. + - Whitespace tweaks. + +commit 60f36347c16e6336215cd52b4e5f3c0f96e7c253 +Author: Field G. Van Zee +Date: Wed Feb 22 20:37:30 2023 -0600 + + Fixed bugs in scal2v ref kernel when alpha == 1. (#728) + + Details: + - Fixed a typo bug in ref_kernels/1/bli_scal2v_ref.c where the + conditional that was supposed to be checking for cases when alpha is + equal to 1.0 (so that copyv could be used instead of scal2v) was + instead erroneously comparing alpha against 0.0. + - Fixed another bug in the same function whereby BLIS_NO_CONJUGATE was + erroneously being passed into copyv instead of the kernel's conjx + parameter. This second bug was inert, however, due to the first bug + since the "alpha == 0.0" case was already being handled, resulting in + the code block never executing. + +commit fab18dca46618799bb0b4f652820b33d36a5d4d4 +Author: Field G. Van Zee +Date: Wed Feb 22 16:50:00 2023 -0600 + + Use 'void*' datatypes in kernel APIs. (#727) + + Details: + - Migrated all kernel APIs to use void* pointers instead of float*, + double*, scomplex*, and dcomplex* pointers. This allows us to define + many fewer kernel function pointer types, which also makes it much + easier to know which function pointer type to use at any given time. + (For example, whereas before there was ?axpyv_ker_ft, ?axpyv_ker_vft, + and axpyv_ker_vft, now there is just axpyv_ker_ft, which is equivalent + to what axpyv_ker_vft used to be.) + - Refactored how kernel function prototypes and kernel function types + are defined so as to reduce redundant code. Specifically, the + function signatures (excluding cntx_t* and, in the case of level-3 + microkernels, auxinfo_t*) are defined in new headers named, for + example, bli_l1v_ker_params.h. Those signatures are reused via macro + instantiation when defining both kernel prototypes and kernel function + types.
This will hopefully make it a little easier to update, add, and + manage kernel APIs going forward. + - Updated all reference kernels according to the aforementioned switch + to void* pointers. + - Updated all optimized kernels according to the aforementioned switch + to void* pointers. This sometimes required renaming variables, + inserting typecasting so that pointer arithmetic could continue to + function as intended, and related tweaks. + - Updated sandbox/gemmlike according to the aforementioned switch to + void* pointers. + - Renamed: + - frame/1/bli_l1v_ft_ker.h -> frame/1/bli_l1v_ker_ft.h + - frame/1f/bli_l1f_ft_ker.h -> frame/1f/bli_l1f_ker_ft.h + - frame/1m/bli_l1m_ft_ker.h -> frame/1m/bli_l1m_ker_ft.h + - frame/3/bli_l1m_ft_ukr.h -> frame/3/bli_l1m_ukr_ft.h + - frame/3/bli_l3_sup_ft_ker.h -> frame/3/bli_l3_sup_ker_ft.h + to better align with naming of neighboring files. + - Added the missing "void* params" argument to bli_?packm_struc_cxk() in + frame/1m/packm/bli_packm_struc_cxk.c. This argument is being passed + into the function from bli_packm_blk_var1(), but wasn't being "caught" + by the function definition itself. The function prototype for + bli_?packm_struc_cxk() also needed updating. + - Reordered the last two parameters in bli_?packm_struc_cxk(). + (Previously, the "void* params" was passed in after the + "const cntx_t* cntx", although because of the above bug the params + argument wasn't actually present in the function definition.) + +commit 93c63d1f469c4650df082d0fa2f29c46db0e25f5 +Author: Field G. Van Zee +Date: Mon Feb 20 11:14:23 2023 -0600 + + Use 'const' pointers in kernel APIs. (#722) + + Details: + - Qualified all input-only data pointers in the various kernel APIs with + the 'const' keyword while also removing 'restrict' from those kernel + APIs. (Use of 'restrict' was maintained in kernel implementations, + where appropriate.)
This affected the function pointer types defined + for all of the kernels, their prototypes, and the reference and + optimized kernel definitions' signatures. + - Templatized the definitions of copys_mxn and xpbys_mxn static inline + functions. + - Minor whitespace and style changes (e.g. combining local variable + declaration and initialization into a single statement). + - Removed some unused kernel code left in 'old' directories. + - Thanks to Nisanth M P for helping to validate changes to the power10 + microkernels. + +commit 4e18cd34f909c5045597f411340ede3a5e0bc5e1 +Author: RuQing Xu +Date: Sun Feb 19 04:18:41 2023 +0900 + + Restored ArmSVE general storage case. (#708) + + Details: + - Restored general storage case in armsve kernels. + - Reason for doing this: Though real `g`-storage is difficult to + speedup, `g`-codepath here can provide a good support for + transposed-storage. i.e. at least good for `GEMM_UKR_SETUP_CT_AMBI`. + - By experience, this solution is only *a little* slower than in-reg + transpose. Plus in-reg transpose is only possible for a fixed VL in + our case. + +commit 0ba6e9eafb1e667373d9dbc2aa045557921f33e2 +Author: Lee Killough <15950023+leekillough@users.noreply.github.com> +Date: Sat Feb 18 13:15:42 2023 -0600 + + Refined emacs handling of indentation. (#717) + + Details: + - This refines the emacs autoformatting to be better in line with + contribution guidelines. + - Removed a stray shebang in a .mk file which confuses emacs about the + file mode, which should be makefile-mode. (emacs also removes stray + whitespace at the ends of lines.) + +commit 059f15105b1643fe56084f883c22b3cadf368b39 +Author: ct-clmsn +Date: Sat Feb 18 14:13:23 2023 -0500 + + Updated hpx namespace for make_count_shape. (#725) + + Details: + - The hpx namespace for *counting_shape changed. This PR updates the use + of counting_shape in blis to comply with the change in hpx. 
+ - Co-authored-by: ctaylor + +commit 0b421eff130b5c896edcc09e7358d18564d177e9 +Author: Field G. Van Zee +Date: Sat Feb 18 13:11:41 2023 -0600 + + Added an 'arm64' entry to `.travis.yml`. (#726) + + Details: + - Added a new 'arm64' entry to the .travis.yml file in an attempt to get + Travis CI to compile both NEON and SVE kernels, even if only NEON + kernels are exercised in the testing. With this new 'arm64' entry, the + 'cortexa57' entry becomes redundant and may be removed. Thanks to + RuQing Xu for this suggestion. + - Previously, the macro BLIS_SIMD_MAX_SIZE was *not* being set in + bli_kernels_arm64.h, which meant that the default value of 64 was + being used. This caused a runtime consistency check to fail in + bli_gks.c (in Travis CI), one which requires that + + mr * nr * dt_size <= BLIS_STACK_BUF_MAX_SIZE + + for all datatype sizes dt_size, where BLIS_STACK_BUF_MAX_SIZE is + defined as + + BLIS_SIMD_MAX_NUM_REGISTERS * BLIS_SIMD_MAX_SIZE * 2 + + This commit increases BLIS_SIMD_MAX_SIZE to 128 for the 'arm64' + configuration, thus overriding the default and (hopefully) avoiding + the aforementioned consistency check failures. + - Appended '|| cat ./output.testsuite' to all 'make' commands in + travis/do_testsuite.sh. Thanks to RuQing Xu for this suggestion. + - Whitespace changes. + +commit b1d3fc7e5b0927086e336a23f16ea59aa3611ccb +Author: Field G. Van Zee +Date: Fri Feb 10 15:34:47 2023 -0600 + + Redirect grep stderr to /dev/null. (#723) + + Details: + - In common.mk, added a redirection of stderr to /dev/null for the grep + command being used to gather a list of header files #included from + bli_cntx_ref.c. The redirection is desirable because as of grep 3.8, + regular expressions with "stray" backslashes trigger warnings [1]. + But removing the backslash seems to break the BLIS build system when + using pre-3.8 versions of grep, so this seems to be the easiest way to + satisfy the BLIS build system for both pre- and post-3.8 grep + environments.
+ + [1] https://lists.gnu.org/archive/html/info-gnu/2022-09/msg00001.html + +commit e3d352f1fcc93e6a46fde1aa4a7f0a18fb27bd42 +Author: Nisanth M P +Date: Wed Feb 8 06:11:41 2023 +0530 + + Added runtime selection of 'power' config family. (#718) + + Details: + - Created a 'power' umbrella configuration family, which, when targeted + at configure-time, will build both 'power9' and 'power10' subconfigs. + (With this feature, a BLIS shared library could be compiled on a + power9 system and run on power10 and vice-versa. Unoptimised code + will execute if it is linked and run on any other generic system.) + - This new configuration family will only work with gcc, since that is + the only compiler supported by both power9 and power10 subconfigs in + BLIS. + - Documented power9 and power10 as supported microarchitectures in the + docs/HardwareSupport.md document. + +commit e730c685d09336b3bd09e86c94330c4eba967f3e +Author: Field G. Van Zee +Date: Mon Feb 6 15:31:54 2023 -0600 + + Define `BLIS_VERSION_STRING` in `blis.h`. (#720) + + Details: + - Previously, the version string was communicated from configure to + config.mk (via the config.mk.in template), where it was included via + the top-level Makefile, where it was then used to define the + preprocessor macro BLIS_VERSION_STRING via a command line argument to + the compiler (via -D). This macro is then used within bli_info.c to + initialize a static string which can then be queried via the + bli_info_get_version_str() function. However, there are some + applications that may find utility in being able to access the version + string by inspecting the monolithic (flattened) blis.h header file + that is created at compile time and installed alongside the library. + This commit moves the definition of BLIS_VERSION_STRING into + bli_config.h (via the bli_config.h.in template) so that it is + embedded in blis.h. 
The version string is now available in three + places: + - the static/shared library, which is installed in the 'lib' + subdirectory of the install prefix (query-able via the + bli_info_get_version_str() function); + - the config.mk makefile fragment, which is installed in the 'share' + subdirectory of the install prefix (in the VERSION variable); + - the blis.h header file, which is installed in the 'include' + subdirectory of the install prefix (via the BLIS_VERSION_STRING + macro constant). + Thanks to Mohsen Aznaveh and Tim Davis for providing the idea for this + change. + - CREDITS file update. + +commit dc5d00a6ce0350cd82859d8c24f23d98f205d8db +Author: Lee Killough <15950023+leekillough@users.noreply.github.com> +Date: Fri Jan 27 17:36:47 2023 -0600 + + Typecast printf() args to avoid compiler warnings. (#716) + + Details: + - In bli_thread_range_tlb.c, typecast integer arguments passed to + printf() -- which are typically disabled unless debugging -- to type + "long" to guarantee a match to the "%ld" format specifiers used in + those calls. This avoids spurious warnings with certain compilers in + certain toolchain environments, such as 32-bit RISC-V (rv32iv). + +commit ecbcf4008815035c695822fcaf106477debff89a +Author: Lee Killough <15950023+leekillough@users.noreply.github.com> +Date: Wed Jan 18 20:35:50 2023 -0600 + + Use here-document for 'configure --help' output. (#714) + + Details: + - Changed the configure script function that outputs "--help" text to do + so via so-called "here-document" syntax for improved readability and + maintainability. The change eliminates hundreds of echo statements and + makes it easier to change existing configure options' help text, along + with other benefits such as eliminating the need to escape double- + quote characters ("). + +commit c334ec278f5e2a101625629b2e13bbf1b38dede5 +Author: Devin Matthews +Date: Wed Jan 18 13:10:19 2023 -0600 + + Merge tlb- and slab/rr-specific gemm macrokernels. 
(#711) + + Details: + - Merged the tlb-specific gemm macrokernel (_var2b) with the slab/rr- + specific one (var2) so that a single function can be compiled with + either tlb or slab/rr support, depending on the value of the + BLIS_ENABLE_JRIR_TLB, _SLAB, and _RR. This is done by incorporating + information from both approaches: the start/end/inc for the JR and IR + loops from slab or rr partitioning; and the number of assigned + microtiles, plus the starting IR dimension offset for all iterations + after the first (ir_next). With these changes, slab, rr, and tlb can + all be parameterized by initializing a similar set of variables prior + to the jr loop. + - Removed the wrap-around logic that sets the "b_next" field of the + auxinfo_t struct, which executes during the last IR iteration of the + last JR iteration. The potential benefit of this code is so minor + (and hinges on the microkernel making use of the b_next field) that + it's arguably not worth including. The code also does the wrong + thing for some threads whenever JR_NT > 1, since only thread 0 (in the + JR group) would even compute with the first micropanel of B. + - Re-expressed the definition of bli_is_last_iter_slrr so that slab and + tlb use the same code rather than rr and tlb. + - Adjusted the initialization of the gemm control tree accordingly. + +commit 5793a77937aee9847a5692c8e44b36a6380800a1 +Author: HarshDave12 <122850830+HarshDave12@users.noreply.github.com> +Date: Tue Jan 17 21:55:02 2023 +0530 + + Fixed mis-mapped instruction for VEXTRACTF64X2. (#713) + + Details: + - This commit fixes a typo in the macro definition for the extended + inline assembly macro VEXTRACTF64X2 in bli_x86_asm_macros.h. The macro + was previously defined (incorrectly) in terms of the vextractf64x4 + instruction rather than vextractf64x2. + - CREDITS file update. + +commit 16d2e9ea9ca0853197b416eba701b840a8587bca +Author: Field G. Van Zee +Date: Fri Jan 13 20:03:01 2023 -0600 + + Defined lt, lte, gt, gte + misc. 
other updates. (#712) + + Details: + - Changed invertsc operation to be a non-destructive operation; that is, + it now takes separate input and output operands. This change applies + to both the object and typed APIs. + - Defined an alternative square root operation, sqrtrsc, which, when + operating on complex scalars, assumes the imaginary part of the input + to be zero. + - Changed the semantics of addm, subm, copym, axpym, scal2m, and xpbym + so that when the source matrix has an implicit unit diagonal, the + operation leaves the diagonal of the destination matrix untouched. + Previously, the operations would interpret an implicit unit diagonal + on the source matrix as a request to manifest the unit diagonal + *explicitly* on output (either as something to copy in the case of + copym, or something to compute with in the cases of addm, subm, axpym, + scal2m, and xpbym). It turns out that this behavior was too cute by + half and could cause unintended headaches for practical use cases. + (This change in behavior also required small modifications to the trmv + and trsv testsuite modules so that they would properly test matrices + with unit diagonals.) + - Added missing dependencies for copym to gemv, ger, hemv, trmv, and + trsv testsuite modules. + - Implemented level-0-like ltsc, ltesc, gtsc, gtesc operations in + frame/util, which use lt, lte, gt, and gte level-0 scalar macros. + - Trivial variable rename in bli_part.c to harmonize with other + variable naming conventions. + +commit 9a366b14fe52c469f4664ef5dd93d85be8d97baa +Author: Field G. Van Zee +Date: Thu Jan 12 13:07:22 2023 -0600 + + Implement cntx_t pointer caching in gks. 
(#709) + + Details: + - Refactored the gks cntx_t query functions so that: (1) there is a + clearer pattern of similarity between functions that query a native + context and those that query its induced (1m) counterpart; and (2) + queried cntx_t pointers (for both native and induced cntx_t pointers) + are cached (by default), or deep-queried upon each invocation, + depending on whether cpp macro BLIS_ENABLE_GKS_CACHING is defined. + - Refactored query-related functions in bli_arch.c to cache the queried + arch_t value (by default), or deep-query the arch_t value upon each + invocation, depending on whether cpp macro BLIS_ENABLE_GKS_CACHING is + defined. + - Tweaked the behavior of bli_gks_query_ind_cntx_impl() (formerly named + bli_gks_query_ind_cntx()) so that the induced method cntx_t struct is + repopulated each time the function is called. (It is still only + allocated once on first call.) This was mostly done in preparation for + some future in which the arch_t value might change at runtime. In such + a scenario, the induced method context would need to be recalculated + any time the native context changes. + - Added preprocessor logic to bli_config_macro_defs.h to handle enabling + or disabling of cntx_t pointer caching (via BLIS_ENABLE_GKS_CACHING). + - For now, cntx_t pointer caching is enabled by default and does not + correspond to any official configure option. Disabling can be done + by inserting a #define for BLIS_DISABLE_GKS_CACHING into the + appropriate bli_family_*.h header file within the configuration of + interest. + - Thanks to Harihara Sudhan S (AMD) for suggesting that cntx_t pointers + (and not just arch_t values) be cached. + - Comment updates.
+ +commit b895ec9f1f66fb93972589c06bff171337153a31 +Author: Nisanth M P +Date: Wed Jan 11 09:02:32 2023 +0530 + + Fixing type-mismatch errors in power10 sandbox (#701) + + Details: + - This commit fixes a mismatch between the function type signature of + bli_gemm_ex() required by BLIS and the version of the function defined + within the power10 sandbox. It also performs typecasting upon calling + bli_gemm_front() to attain type consistency with the type signature + defined by BLIS for bli_gemm_front(). + +commit 38d88d5c131253066cad4f98eea06fa9299cae3b +Author: Devin Matthews +Date: Tue Jan 10 21:24:58 2023 -0600 + + Define new global scalar (obj_t) constants. (#703) + + Details: + - This commit defines the following new global scalar constants: + - BLIS_ONE_I: This constant encodes the imaginary unit. + - BLIS_MINUS_ONE_I: This constant encodes the negative imaginary unit. + - BLIS_NAN: This constant encodes a not-a-number value. Both real and + imaginary parts are set to NaN for complex datatypes. + +commit cdb22b8ffa5b31a0c16ac1a7bcecefeb5216f669 +Author: Nisanth M P +Date: Wed Jan 11 08:50:57 2023 +0530 + + Disable power10 kernels other than sgemm, dgemm. (#705) + + Details: + - There is a power10 sandbox which uses microkernels for datatypes other + than float and double (or scomplex/dcomplex). In a regular power10- + configured build (that is, with the sandbox disabled), there were + compile errors for some of these other non-sgemm/non-dgemm + microkernels. This commit protects those kernels with a new cpp macro + guard (which is defined in sandbox/power10/bli_sandbox.h) that + prevents that kernel code from being compiled for normal, non-sandbox + power10 builds. + +commit d220f9c436c0dae409974724d42ab6c52f12a726 +Author: Nisanth M P +Date: Wed Jan 11 08:43:03 2023 +0530 + + Fix k = 0 edge case in power10 microkernels (#706) + + Details: + - When power10 sgemm and dgemm microkernels are called with k = 0, they + become caught in infinite loops and segfault. 
This is fixed now via an + early exit in the case of k = 0. + +commit 2e1ba9d13c23a06a7b6f8bd326af428f7ea68c31 +Author: Field G. Van Zee +Date: Tue Jan 10 21:05:54 2023 -0600 + + Tile-level partitioning in jr/ir loops (ex-trsm). (#695) + + Details: + - Reimplemented parallelization of the JR loop in gemmt (which is + recycled for herk, her2k, syrk, and syr2k). Previously, the + rectangular region of the current MC x NC panel of C would be + parallelized separately from the diagonal region of that same + submatrix, with the rectangular portion being assigned to threads via + slab or round-robin (rr) partitioning (as determined at configure- + time) and the diagonal region being assigned via round-robin. This + approach did not work well when extracting lots of parallelism from + the JR loop and was often suboptimal even for smaller degrees of + parallelism. This commit implements tile-level load balancing (tlb) in + which the IR loop is effectively subjugated in service of more + equitably dividing work in the JR loop. This approach is especially + potent for certain situations where the diagonal region of the MC x NR + panel of C is significant relative to the entire region. However, it + also seems to benefit many problem sizes of other level-3 operations + (excluding trsm, which has an inherent algorithmic dependency in the + IR loop that prevents the application of tlb). For now, tlb is + implemented as _var2b.c macrokernels for gemm (which forms the basis + for gemm, hemm, and symm), gemmt (which forms the basis of herk, + her2k, syrk, and syr2k), and trmm (which forms the basis of trmm and + trmm3). Which function pointers (_var2() or _var2b()) are embedded in + the control tree will depend on whether the BLIS_ENABLE_JRIR_TLB cpp + macro is defined, which is controlled by the value passed to the + existing --thread-part-jrir=METHOD (or -r METHOD) configure option.
+ This script adds 'tlb' as a valid option alongside the previously + supported values of 'slab' and 'rr'. ('slab' is still the default.) + Thanks to Leick Robinson for abstractly inspiring this work, and to + Minh Quan Ho for inquiring (in PR #562, and before that in Issue #437) + about the possibility of improved load balance in macrokernel loops, + and even prototyping what it might look like, long before I fully + understood the problem. + - In bli_thread_range_weighted_sub(), tweaked the way we compute the + area of the current MC x NC trapezoidal panel of C by better taking + into account the microtile structure along the diagonal. Previously, + it was an underestimate, as it assumed MR = NR = 1 (that is, it + assumed that the microtile column of C that overlapped with microtiles + exactly coincided with the diagonal). Now, we only assume MR = NR. + This is still a slight underestimate when MR != NR, so the additional + area is scaled by 1.5 in a hackish attempt to compensate for this, as + well as other additional effects that are difficult to model (such as + the increased cost of writing to temporary tiles before finally + updating C). The net effect of this better estimation of the + trapezoidal area should be (on average) slightly larger regions + assigned to threads that have little or no overlap with the diagonal + region (and correspondingly slightly smaller regions in the diagonal + region), which we expect will lead to slightly better load balancing + in most situations. + - Spun off the contents of bli_thread.[ch] that relate to computing + thread ranges into one of three source/header file pairs: + - bli_thread_range.[ch], which define functions that are not specific + to the jr/ir loops; + - bli_thread_range_slab_rr.[ch], which define functions that implement + slab or round-robin partitioning for the jr/ir loops; + - bli_thread_range_tlb.[ch], which define functions that implement + tlb for the jr/ir loops.
+ - Fixed the computation of a_next in the last iteration of the IR loop + in bli_gemmt_l_ker_var2(). Previously, it always "wrapped" back around + to the first micropanel of the current MC x KC packed block of A. + However, this is almost never actually the micropanel that is used + next. A new macro, bli_gemmt_l_wrap_a_upanel(), computes a_next + correctly, with a similarly named bli_gemmt_u_wrap_a_upanel() for use + in the upper-stored case (which *does* actually always choose the + first micropanel of A as its a_next at the end of the IR loop). + - Removed adjustments for a_next/b_next (a2/b2) for the diagonal- + intersecting case of gemmt_l_ker_var2() and the above-diagonal case + of gemmt_u_ker_var2() since these cases will only coincide with the + last iteration of the IR loop in very small problems. + - Defined bli_is_last_iter_l() and bli_is_last_iter_u(), the latter of + which explicitly considers whether the current microtile is the last + tile that intersects the diagonal. (The former does the same, but the + computation coincides with the original bli_is_last_iter().) These + functions are now used in gemmt to test when a_next (or a2) should + "wrap" (as discussed above). Also defined bli_is_last_iter_tlb_l() + and bli_is_last_iter_tlb_u(), which are similar to the aforementioned + functions but are used when employing tlb in gemmt. + - Redefined macros in bli_packm_thrinfo.h, which test whether an + iteration of work is assigned to a thread, as static inline functions + in bli_param_macro_defs.h (and then deleted bli_packm_thrinfo.h). + In the process of redefining these macros, I also renamed them from + bli_packm_my_iter_rr/sl() to bli_is_my_iter_rr/sl(). 
+ - Renamed + bli_thread_range_jrir_rr() -> bli_thread_range_rr() + bli_thread_range_jrir_sl() -> bli_thread_range_sl() + bli_thread_range_jrir() -> bli_thread_range_slrr() + - Renamed + bli_is_last_iter() -> bli_is_last_iter_slrr() + - Defined + bli_info_get_thread_jrir_tlb() + and renamed: + - bli_info_get_thread_part_jrir_slab() -> + bli_info_get_thread_jrir_slab() + - bli_info_get_thread_part_jrir_rr() -> + bli_info_get_thread_jrir_rr() + - Modified bli_rntm_set_ways_for_op() to redirect IR loop parallelism + into the JR loop when tlb is enabled for non-trsm level-3 operations. + - Added a sanity check to prevent bli_prune_unref_mparts() from being + used on packed objects. This prohibition is necessary because the + current implementation does not take into account the atomicity of + packed micropanel widths relative to the diagonal of structured + matrices. That is, the function prunes greedily without regard to + whether doing so would prune off part of a micropanel *which has + already been packed* and assigned to a thread for inclusion in the + computation. + - Further restricted early returns in bli_prune_unref_mparts() to + situations where the primary matrix is not only of general structure + but also dense (in terms of its uplo_t value). The addition of the + matrix's dense-ness to the conditional is required because gemmt is + somewhat unusual in that its C matrix has general structure but is + marked as lower- or upper-stored via its uplo_t. By only checking + for general structure, attempts to prune gemmt C matrices would + incorrectly result in early returns, even though that operation + effectively treats the matrix as symmetric (and stored in only one + triangle). + - Fixed a latent bug in bli_thread_range_rr() wherein incorrect ranges + were computed when 1 < bf. Thankfully, this bug was not yet + manifesting since all current invocations used bf == 1. 
+ - Fixed a latent bug in some unexercised code in bli_?gemmt_l_ker_var2() + that would perform incorrect pruning of unreferenced regions above + where the diagonal of a lower-stored matrix intersects the right edge. + Thankfully, the bug was not harming anything since those unreferenced + regions were being pruned prior to the macrokernel. + - Rewrote slab/rr-based gemmt macrokernels so that they no longer carved + C into rectangular and diagonal regions prior to parallelizing each + separately. The new macrokernels use a unified loop structure where + quadratic (slab) partitioning is used. + - Updated all level-3 macrokernels to have a more uniform coding style, + such as wrt combining variable declarations with initializations as + well as the use of const. + - Updated bls_l3_packm_var[123].c to use bli_thrinfo_n_way() and + bli_thrinfo_work_id() instead of bli_thrinfo_num_threads() and + bli_thrinfo_thread_id(), respectively. This change probably should + have been included in aeb5f0c. + - Removed old prototypes in bli_gemmt_var.h and bli_trmm_var.h that + corresponded to functions that were removed in aeb5f0c. + - Other very minor cleanups. + - Comment updates. + +commit b6735ca26b9d459d9253795dc5841ae8de9e84c9 +Author: Devin Matthews +Date: Fri Jan 6 14:10:01 2023 -0600 + + Refactor structure awareness in packm_blk_var1.c. (#707) + + Details: + - Factored some of the structure awareness out of the loop in + bli_packm_blk_var1(). So instead of having a single loop with + conditionals in the body to handle various kinds of structure (and + stored/unstored submatrix placement), we now have a conditional branch + to handle various structure/storage scenarios with a loop in each + section. This change was originally motivated to choose slab or round- + robin partitioning (in the context of triangular matrices) based on + the structure of the entire block (or panel) being packed rather than + each micropanel individually. 
Previously, the code would attempt to + limit rr to the portion of the block that intersects the diagonal and + use slab for the remainder. However, that approach was not well-thought + out and in many situations this would lead to inferior load balancing + when compared to using round-robin for the entire block (or panel). + This commit has the added benefit of incurring less overhead during + the packing process now that each of the new loops is simpler. + +commit f956b79922da412791e4c8b8b846b3aafc0a5ee0 +Author: Field G. Van Zee +Date: Sat Dec 31 20:18:08 2022 -0600 + + Switch to l3 sup decorator in gemmlike sandbox. (#704) + + Details: + - Modified the gemmlike sandbox to call bli_l3_sup_thread_decorator() + rather than a local analogue of that code. This reduces redundant + logic and makes it easier for the sandbox to inherit future + improvements to the framework's threading code. + - Moved addon/gemmd to addon/old/gemmd. This code has fallen out of date + and is taking too much effort to maintain. We will very likely + reimplement it completely once future changes are made to the + framework proper. + +commit 538150c5845ad903773ca797c740048174116aa4 +Author: Field G. Van Zee +Date: Sun Dec 25 22:28:09 2022 -0600 + + Applied race condition fix to sup thread decorator. + + Details: + - Applied the race condition bugfix in commit 7d23dc2 to the + corresponding sup code in bli_l3_sup_decor.c. Note that in the case + of sup, the race condition would have only manifested when optional + packing was enabled at runtime (typically via setting BLIS_PACK_A + and/or BLIS_PACK_B environment variables). + - Both the fix in this commit and the fix in 7d23dc2 address bugs + that were introduced when the thrinfo_t trees/communicators were + restructured in the October omnibus commit (aeb5f0c). 
+ +commit 7d23dc2a064a371dc9883e2c2c7236a70912428c +Author: Devin Matthews +Date: Sun Dec 25 19:09:14 2022 -0600 + + Fix a race condition which manifested as incorrect results (rarely). (#702) + + The problem occurs when there are at least two teams of threads packing different parts of a matrix, and where each team has at least two threads; call them team A and team B. The problematic sequence is: + + 1. The chief of team A checks out a block B and broadcasts the pointer to its teammates. + 2. Team A completely packs their data and performs a barrier amongst themselves. + 3. Team A commences computing with the packed data. + 4. The chief of team A finishes computing before its teammates, then calls bli_thrinfo_free on its thrinfo_t struct (which contains the mem_t object referencing the buffer B). This causes buffer B to be checked back in to the pba. + 5. The chief of team B checks out the *same* block B that was just checked back in and broadcasts the pointer to its teammates. + 6. DATA RACE: now the remaining threads of team A are reading *while* team B are writing to the same buffer B. If team B writes new data before team A is done computing then an incorrect result is generated. + + The solution is to place a global barrier before the call to bli_thrinfo_free at the end of the computation. + + Co-authored-by: Field G. Van Zee + +commit 3accacf57d11e9b109339754f91bf22329b6cb6a +Author: Field G. Van Zee +Date: Fri Dec 16 10:26:33 2022 -0600 + + Skip 1m optimization when forcing hemm_l/symm_l. (#697) + + Details: + - Fixed a bug in right-sided hemm when: + - using the 1m method, + - #defining BLIS_DISABLE_HEMM_RIGHT in the active subconfiguration, + and + - the storage of C matches the gemm microkernel IO preference PRIOR to + the right-sidedness being detected and recast in terms of the left- + side code path.
+ It turns out that bli_gemm_ind_recast_1m_params() was applying its + optimization (recasting a complex-domain macrokernel calling a 1m + virtual microkernel to a real-domain macrokernel calling the real- + domain microkernel) in situations in which it should not have. The + optimization was silently assuming that the storage of C always + matched that of the microkernel preference, since the front-end (in + this case, bli_hemm_front()) would have already had a chance to + transpose the operation to bring the two into agreement. However, by + disabling right-sided hemm, we deprive BLIS of that flexibility (as a + transposed left-sided case would necessarily have to become a right- + sided case), and thus the assumption was no longer holding in all + cases. Thanks to Nisanth M P for reporting this bug in Issue #621. + - The aforementioned bug, and its bugfix, also apply to symm when + BLIS_DISABLE_SYMM_RIGHT is defined. + - Comment updates. + - CREDITS file update. + +commit 4833ba224eba54df3f349bcb7e188bcc53442449 +Author: Field G. Van Zee +Date: Mon Dec 12 20:26:02 2022 -0600 + + Fixed perf of mt sup with packing, and mt gemmlike. (#696) + + Details: + - Brought the gemmsup code path up to date relative to the latest + thrinfo_t semantics introduced in the October Omnibus commit + (aeb5f0c). This was done by passing the prenode (instead of the + current node) into the packm variant within bli_l3_sup_packm.c as well + as creating the prenodes and attaching them to the thrinfo_t tree in + bli_l3_sup_thrinfo_create(). These changes erase the performance + degradation introduced in the omnibus when running multithreaded sup + with optional packing enabled. Special thanks to Devin Matthews for + sussing out this fix in short order. + - Fixed the gemmlike sandbox in a manner similar to that of sup with + packing, described above. This also involved passing the prenode into + the local gemmlike packm variant. 
(Recall that gemmlike recycles the + use of bli_l3_sup_thrinfo_create(), so it automatically inherits that + part of the sup fix described above.) + - Updated bls_l3_packm_var[123].c to use bli_thrinfo_n_way() and + bli_thrinfo_work_id() instead of bli_thrinfo_num_threads() and + bli_thrinfo_thread_id(), respectively. + +commit db10dd8e11a12d85017f84455558a82c0093b1da +Author: Field G. Van Zee +Date: Tue Nov 29 19:10:31 2022 -0600 + + Fixed _gemm_small() prototype; disabled gemm_small. + + Details: + - Fixed a mismatch between the prototype for bli_gemm_small() in + bli_gemm_front.h and the actual definition of bli_gemm_small() in + kernels/zen/3/bli_gemm_small.c. The former was erroneously declaring + the cntl_t* argument as 'const'. Thanks to Jeff Diamond for reporting + this issue. + - Commented out BLIS_ENABLE_SMALL_MATRIX, BLIS_ENABLE_SMALL_MATRIX_TRSM + macro definitions in config/zen3/bli_family_zen3.h. AMD's small matrix + implementation should probably remain disabled in vanilla BLIS, at + least for now. + +commit f0337b784d164ae505ca0e11277a1155680500d1 +Author: Field G. Van Zee +Date: Sun Nov 13 21:36:47 2022 -0600 + + Trivial whitespace/comment tweaks. + + Details: + - Trivial whitespace and comment changes, most of which ideally would + have been part of the previous commit pertaining to HPX (2b05948). + +commit 2b05948ad2c9785bc53f376d53a7141cbc917447 +Author: ct-clmsn +Date: Sun Nov 13 17:40:22 2022 -0500 + + blis support for hpx (#682) + + Implement threading backend via HPX. + + HPX is an asynchronous many task runtime system used in high performance computing applications. The runtime implements the ISO C++ parallelism specification and provides a user-space thread implementation. + + This PR provides BLIS a thread backend implementation using HPX and resolves feature request #681. The configuration script, makefiles, and testsuite have been updated to support an HPX build option.
The addition of HPX support provides other developers an exemplar for integrating other C++ threading backends into BLIS. + + Co-authored-by: ctaylor + Co-authored-by: Devin Matthews + +commit e1ea25da43508925e33d4e57e420cfc0a9de793f +Author: Field G. Van Zee +Date: Fri Nov 11 12:07:51 2022 -0600 + + Fixed subtle barrier_fpa bug in bli_thrcomm.c. (#690) + + Details: + - In bli_thrcomm.c, correctly initialize the BLIS_OPENMP element of the + barrier function pointer array (barrier_fpa) to NULL when + BLIS_ENABLE_OPENMP is *not* defined. Similarly, initialize the + BLIS_POSIX element of barrier_fpa to NULL when BLIS_ENABLE_PTHREADS is + not enabled. This bug was introduced in a1a5a9b and was likely the + result of an incomplete edit. The effects of the bug would have + likely manifested when querying a thrcomm_t that was initialized with + a timpl_t value corresponding to a threading implementation that was + omitted from the -t option at configure-time. + +commit dc6e5f3f5770074ba38554541b8b64711a68c084 +Author: leekillough <15950023+leekillough@users.noreply.github.com> +Date: Thu Nov 3 18:33:08 2022 -0500 + + Enhance emacs formatting of C files to remove trailing whitespace and ensure a newline at the end of file + +commit 713d078075a4a563a43d83fd0880ab5091c2e4a4 +Author: Field G. Van Zee +Date: Thu Nov 3 20:00:11 2022 -0500 + + Delete mpi_test garbage. (#689) + + Details: + - tlrmchlsmth: "What even is this? No comments, no commit message, not + used by anything. Trash." + +commit 8d813f7f12732d52c95570ae884d5defbfd19234 +Author: Field G. Van Zee +Date: Thu Nov 3 19:10:47 2022 -0500 + + Some decluttering of the top-level directory. + + Details: + - Relocated 'mpi_test' directory to test/mpi_test. + - Relocated 'so_version' and 'version' files from top-level directory to + 'build' directory. + - Updated build/bump-version.sh script to accommodate relocation of + 'version' file to 'build' directory.
+ - Updated configure script to accommodate relocation of 'so_version' + file to 'build' directory. + - Updated INSTALL file to replace pointers to blis-devel mailing list + with a pointer to docs/Discord.md. + - Updated RELEASING file to contain a reminder to consider whether the + so_version file should be updated prior to the release. + +commit 6774bf08c92fc6983706a91bbb93b960e8eef285 +Author: Lee Killough <15950023+leekillough@users.noreply.github.com> +Date: Thu Nov 3 15:20:47 2022 -0500 + + Fix typo in configure --help text. (#686) + + Details: + - Fixed a misspelling in the --help description for the --int-size (-i) + configure option. + +commit 872898d817f35702e7678ff7f3eeff0f12e641f5 +Author: Field G. Van Zee +Date: Wed Nov 2 21:53:22 2022 -0500 + + Fixed trmm[3]/trsm performance bug in cf7d616. (#685) + + Details: + - Fixed a performance bug in the packing of micropanels that intersect + the diagonal of triangular matrices (i.e., those found in trmm, trmm3, + and trsm). This bug was introduced in cf7d616 and stemmed from an + ill-formed boolean conditional expression in bli_packm_blk_var1(). + This conditional would choose when to use round-robin parallel work + allocation, but checked for the triangularity of the submatrix being + packed while failing also to check for whether the current micropanel + actually intersected the diagonal. The net result of this bug was that + *all* micropanels of a triangular matrix, no matter where the upanels + resided within the matrix, were assigned to threads via a round-robin + policy. This affected some microarchitectures and threading + configurations much worse than others, but it seems that overall the + effect was universally negative, likely because of the reduced spatial + locality during the packing with round-robin. Thanks to Leick Robinson + for his tireless efforts in helping track down this issue. + +commit edcc2f9940449f7d9cefcfc02159d27b013e7995 +Author: Field G.
Van Zee +Date: Wed Nov 2 19:04:49 2022 -0500 + + Support --nosup, --sup configure options. (#684) + + Details: + - Added --nosup and --sup as alternative ways of requesting that sup be + disabled or enabled. These are analogous to --disable-sup-handling and + --enable-sup-handling, respectively. (I got tired of typing out + --disable-sup-handling and needed a shorthand notation.) + - Tweaked message output by configure when sup is enabled/disabled for + clarity and specificity. + - Whitespace changes. + +commit 5eea6ad9eb25f37685d1ae4ae08c73cd1daca297 +Author: Field G. Van Zee +Date: Wed Nov 2 17:07:54 2022 -0500 + + Add mention of Wilkinson Prize to README.md. (#683) + + Details: + - Added blurbs and links to Wilkinson Prize to README.md. + - Added mention of both Best Paper and Wilkinson Prizes to the top of + README.md. + - Other minor tweaks. + +commit 29f79f030e939969d4f3876c4fdaac7b0c5daa63 +Author: Devin Matthews +Date: Mon Oct 31 18:57:45 2022 -0500 + + Fixed performance bug caused by redundant packing. (#680) + + Details: + - Fixed a performance bug whereby multiple threads were redundantly + packing the same (rather than separate) micropanels. This bug was + caused by different parts of the code using the num_threads/thread_id + field of the thrinfo_t vs. the n_way/work_id fields. The fix was to + standardize on the latter and provide a "fake" thrinfo_t sub-prenode + in the thrinfo tree which consists of single-member thread teams. The + single team with multiple threads node is still required since it and + only it can be used to perform barriers and broadcasts (e.g. of the + packed buffer pointer). + +commit aeb5f0cc19665456e990a7ffccdb09da2e3f504b +Author: Devin Matthews +Date: Thu Oct 27 12:39:11 2022 -0500 + + Omnibus PR - Oct 2023 (#678) + + Details: + - This is an "omnibus" commit, consisting of multiple medium-sized + commits that affect non-trivial aspects of BLIS.
The major highlights: + - Relocated the pba, sba pool (from the rntm_t), and mem_t (from the + cntl_t) to the thrinfo_t object. This allows the rntm_t to be + effectively const (although it is sometimes copied internally and + modified to reflect different ways of parallelism). Moving the mem_t + sets the stage for sharing a global control tree amongst all + threads. + - De-templatized the macrokernels for gemmt, trmm, and trsm to match + the macrokernel for gemm, which has been de-templatized since + 54fa28b. + - Reimplemented bli_l3_determine_kc() by separating out the logic for + adjusting KC based on MR/NR for triangular A and/or B into a new + function, bli_l3_adjust_kc(). For now, this function is still called + from bli_l3_determine_kc(), but in the future we plan to have it + called once when constructing the control tree. + - Refactored the level-3 thread decorator into two parts: + - One part deals only with launching threads, each one calling a + generic thread entry function. This code resides in frame/thread + and constitutes the definition of bli_thread_launch(). Note that + it is specific to the threading implementation (OpenMP, pthreads, + single, etc.) + - The other part deals with passing the matrix operands and related + information into bli_thread_launch(). This is the "l3 decorator" + and now resides in frame/3. It is agnostic to the threading + implementation. + - Modified the "level" of the thread control tree passed in at each + operation. Previously, each operation (e.g. bli_gemm_blk_var1()) was + passed in a communicator representing the active thread teams which + would share the available work. Now, the *parent* thread comm is + passed in. The operation then grabs the child comm and uses it to + partition the work. The difference is in bli_trsm_blk_var1(), where + there are now two children nodes for this single operation (i.e. the + thread control tree is split one level above where the control tree + is). 
The sub-prenode is used for the trsm subproblem while the + normal sub-node is used for the gemm part. Importantly, the parent + comm is used for the barrier between them. + - Removed cntl_t* arguments from bli_*_front() functions. These will be + added back in the future when the control tree's creation is moved so + that it happens much sooner (provided that bli_*_front() have not been + absorbed into their respective bli_*_ex() functions). + - Renamed various bli_thread_*() query functions to bli_thrinfo_*(), + for consistency. This includes _num_threads(), _thread_id(), _n_way(), + _work_id(), _sba_pool(), _pba(), _mem(), _barrier(), _broadcast(), and + _am_chief(). + - Removed extraneous barrier from _blk_var3() of gemm and trsm. + - Fixed a typo in bli_type_defs.h where BLIS_BLAS_INT_TYPE_SIZE was + misspelled. + +commit c803b03e52a7a6997a8d304a8cfa9acf7c1c555b +Author: Devin Matthews +Date: Wed Oct 26 18:20:00 2022 -0500 + + Add check to disable armsve on Apple M1. + +commit 2dd692b710b6a9889f7ebdd7934a2108be5c5530 +Author: Devin Matthews +Date: Wed Oct 26 18:10:26 2022 -0500 + + Fix auto-detection of firestorm (Apple M1). + +commit 88105dbecf0f9dfbfa30215743346e8bd6afb971 +Author: Field G. Van Zee +Date: Fri Oct 21 15:16:12 2022 -0500 + + Added Discord documentation (#677) + + Details: + - Added a docs/Discord.md markdown document that walks the reader + through creating a Discord account, obtaining the invite link, and + using the link to join the BLIS Discord server. + - Updated README.md to reference the new Discord.md document in multiple + places, including via the official Discord logo (used with explicit + permission from representatives at Discord Inc.). + +commit 23f5b8df3e802a27bacd92571184ec57bbdfa646 +Author: Field G. Van Zee +Date: Mon Oct 17 20:21:21 2022 -0500 + + Shuffled checked properties in bli_l3_check.c. 
(#676) + + Details: + - Added certain checks for matrix structure to the level-3 operations' + _check() functions, and slightly reorganized existing checks. + +commit 9453e0f163503f64a290256b4be53d8882224863 +Author: Field G. Van Zee +Date: Mon Oct 3 19:46:20 2022 -0500 + + CREDITS file update. + + Details: + - This attribution was intended to go in PR #647. + +commit 76a23bd8c33e161221891935a489df9a9fb9c8c0 +Author: Devin Matthews +Date: Mon Oct 3 15:55:07 2022 -0500 + + Reinstate sanity check in bli_pool_finalize. (#671) + + Details: + - Added a reinit argument to bli_pool_finalize(). This bool will signal + whether or not the function is being called from bli_pool_reinit(). If + it is not being called from _reinit(), we can safely check to confirm + that .top_index == 0 (i.e., all blocks have been checked in). But if + it *is* being called from _reinit(), then that check will be skipped + since one of the predicted use cases for bli_pool_reinit() anticipates + that some blocks are (probably) checked out when the pool_t is + reinitialized. + - Updated existing invocations of bli_pool_finalize() to pass in either + FALSE (from bli_apool_free_block() or bli_pba_finalize_pools()) or + TRUE (from bli_pool_reinit()) for the new reinit argument. + +commit 63470b49e3b9b15e00a8f666e86ccd70c6005fe9 +Author: Devin Matthews +Date: Thu Sep 29 18:52:08 2022 -0500 + + Fix some bugs in bli_pool.c (#670) + + Details: + - Add a check for premature pool exhaustion when checking in blocks via + bli_pool_checkin_block(). This detects "double-free" and other bad + conditions that don't necessarily result in a segfault. + - Make sure to copy all block pointers when growing the pool size. + Previously, checked-out block pointers (which are guaranteed to be set + to NULL) were not being copied, leading to the presence of + uninitialized data. 
+ +commit 42d0e66318b186d25eeb215b40ce26115401ed8b +Author: Devin Matthews +Date: Thu Sep 29 17:38:02 2022 -0500 + + Add AddressSanitizer (-fsanitize=address) option. (#669) + + Details: + - Added support for AddressSanitizer (ASan), a compiler-integrated + memory error detector. The option (disabled by default) enables + compiling and linking with the -fsanitize=address flag supported by + clang, gcc, and probably others. This flag is employed during + compilation of all BLIS source files *except* for optimized kernels, + which are exempted because ASan usually requires an extra register, + which violates the constraints for many gemm microkernels. + - Minor whitespace, comment, ordering, and configure help text updates. + +commit b861c71b50c6d48cb07282f44aa9dddffc1f1b3f +Author: Devin Matthews +Date: Fri Sep 23 13:22:27 2022 -0500 + + Add consistent NaN/Inf handling in sumsqv. (#668) + + Details: + - Changed sumsqv implementation as follows: + - If there is a NaN (either real or imaginary), then return a sum of + NaN and unit scale. + - Else, if there is an Inf (either real or imaginary), then return a + sum of +Inf and unit scale. + - Otherwise behave as normal. + +commit ee81efc7887374c974a78bfb3e0865776b2f97a8 +Author: Field G. Van Zee +Date: Thu Sep 22 19:15:07 2022 -0500 + + Parameterized test/3 drivers via command line args. (#667) + + Details: + - Rewrote the drivers in test/3, the Makefile, and the runme.sh script + so that most of the important parameters, including parameter combo, + datatype, storage combo, induced method, problem size range, dimension + bindings, number of repeats, and alpha/beta values can be passed in + via command line arguments. (Previously, most of these parameters were + hard-coded into the driver source, except a few that were hard-coded + into the Makefile.) If no argument is given for any particular option, + it will be assigned a sane default. 
Either way, the values employed at + runtime will be printed to stdout before the performance data in a + section that is commented out with '%' characters (which is used by + matlab and octave for comments), unless the -q option is given, in + which case the driver will proceed quietly and output only performance + data. Each driver also provides extensive help via the -h option, with + the help text tailored for the operation in question (e.g. gemm, hemm, + herk, etc.). In this help text, the driver reminds the user which + implementation it was linked to (e.g. blis, openblas, vendor, eigen). + Thanks to Jeff Diamond for suggesting this CLI-based reimagining of + the test/3 drivers. + - In the test/3 drivers: converted cpp macro string constants, as well + as two string literals (for the opname and pc_str) used in each test + driver, to global (or static) const char* strings, and replaced the + use of strncpy() for storing the results of the command line argument + parsing with pointer copies from the corresponding strings in argv. + This works because the argv array is guaranteed by the C99 standard + to persist throughout the life of the program. This new approach uses + less storage and executes faster. Thanks to Minh Quan Ho for + recommending this change. + - Renamed the IMP_STR cpp macro that gets defined on the command line, + via the test/3/Makefile, to IMPL_STR. + - Updated runme.sh to set the problem size ranges for single-threaded + and multithreaded execution independently from one another, as well as + on a per-system basis. + - Added a 'quiet' variable to runme.sh that can easily toggle quiet mode + for the test drivers' output. + - Very minor typecast fix in call to bli_getopt() in bli_utils.c. + - In bli_getopt(), changed the nextchar variable from being a local + static variable to a field of the getopt_t state struct. (Not sure why + it was ever declared static to begin with.) 
+ - Other minor changes to bli_getopt() to accommodate the rewritten test + drivers' command line parsing needs. + +commit 036a4f9d822df25a76a653e70be76fb02284d3d3 +Author: Field G. Van Zee +Date: Thu Sep 22 18:36:50 2022 -0500 + + Refactored some rntm_t management code. (#666) + + Details: + - Separated the "sanitizing" code from the auto-factorization code + in bli_rntm_set_ways_from_rntm() and _rntm_set_ways_from_rntm_sup(). + The sanitizing code now resides in bli_rntm_sanitize() while the + factorization code resides in bli_rntm_factorize() and + bli_rntm_factorize_sup(). (There are two different functions because + the conventional and sup factorization codes are currently somewhat + different.) Also note that the factorization code now relies on the + .auto_factor field to have already been set, either during + rntm_t initialization or when the rntm_t was previously updated and + sanitized. So rather than locally determining whether to auto- + factorize, those functions just read the .auto_factor field and + proceed accordingly. + - Refactored and removed most code from bli_thread_init_rntm_from_env(). + This function now reads the environment variables needed to set nt, + jc, pc, ic, jr, and ir; sets them into the global rntm_t; and then + calls bli_rntm_sanitize() in order to make sure that the contents are + in a "good" state. Thanks to Devin Matthews for suggesting this + refactoring. + - Redefined bli_rntm_set_num_threads() and bli_rntm_set_ways() such that + if multithreading is disabled at compile time (that is, if the cpp + macro BLIS_ENABLE_MULTITHREADING is undefined), they ignore the + caller's request and instead clear the nt and ways fields. + - Redefined bli_thread_set_num_threads() and bli_thread_set_ways() such + that if multithreading is disabled at compile time (that is, if the + cpp macro BLIS_ENABLE_MULTITHREADING is undefined), they ignore the + caller's request and do nothing.
+ - Redefined bli_rntm_set_num_threads() and bli_rntm_set_ways() as true + functions rather than static inline functions. + - In bli_rntm.c, statically initialize the global_rntm global variable + via the BLIS_RNTM_INITIALIZER macro. + - In bli_rntm.h, defined bli_rntm_clear_auto_factor(), which sets the + .auto_factor field of the rntm_t to FALSE. + - Reorganized order of some inline function definitions in bli_rntm.h. + - Changed the default value given to the .auto_factor field by the + BLIS_RNTM_INITIALIZER macro from TRUE to FALSE. + - Call bli_rntm_clear_auto_factor() instead of + bli_rntm_set_auto_factor_only() in bli_rntm_init(). + - Comment/whitespace updates. + +commit a1a5a9b4cbef9208da494c45a2f933a8e82559ac +Author: Field G. Van Zee +Date: Wed Sep 21 18:31:01 2022 -0500 + + Implemented support for fat multithreading. (#665) + + Details: + - Allow the user to configure BLIS in such a way that multiple threading + implementations get compiled into the library, with one of those + implementations chosen at runtime. For now, there are only three + implementations available: OpenMP, pthreads, and single. (Here, + 'single' merely refers to single-threaded mode.) The configure script + now allows the user to give the -t option with a comma-separated list + of values, such as '-t openmp,pthreads'. The first value in the list + will always be the default at library initialization time, and + 'single' is always silently appended to the end of the list. The user + can specify which implementation should execute in one of three ways: + by setting the BLIS_THREAD_IMPL environment variable prior to launch; + by calling the bli_thread_set_thread_impl() global runtime API; or by + encoding their choice into a rntm_t that is passed into one of the + expert interfaces. Any of these three choices overrides the + initialization-time default (i.e., the first value listed to the -t + configure option). 
Requesting an implementation that was not compiled + into the library will result in an error message followed by + bli_abort(). + - Relocated the 'auto' logic for the -t option from the top-level + Makefile to the configure script. (Currently, this logic is pretty + dumb, choosing 'openmp' for gcc and icc, and 'pthreads' for clang.) + - Defined a new 'timpl_t' enum in bli_type_defs.h, with three valid + values: BLIS_SINGLE, BLIS_OPENMP, BLIS_POSIX. + - Reorganized the thrcomm_t struct into a single definition with two + preprocessor blocks, one each for additional fields needed by OpenMP + and pthreads. + - Added timpl_t argument to bli_thrcomm_bcast(), bli_thrcomm_barrier(), + bli_thrcomm_init(), and bli_thrcomm_cleanup(), which these functions + need since they are now wrappers that choose the implementation- + specific function corresponding to the currently enabled threading + implementation. + - Added rntm_t* to bli_thread_broadcast(), bli_thread_barrier() so that + those functions can pass the timpl_t value into bli_thrcomm_bcast() + and bli_thrcomm_barrier(), respectively. + - Defined bli_env_get_str() in bli_env.c to allow the querying of + BLIS_THREAD_IMPL (which, unlike BLIS_NUM_THREADS and friends, is + expected to be a string). + - Defined bli_thread_get_thread_impl(), bli_thread_set_thread_impl() to + get and set the current threading implementation at runtime. + - Defined bli_rntm_thread_impl() and bli_rntm_set_thread_impl() to query + and set the threading implementation within a rntm_t. Also choose + BLIS_SINGLE as the default value when initializing rntm_t structs. + - Added bli_info_get_*() functions to query whether OpenMP or pthreads + would be chosen as the default at init-time.
Note that this only + tests whether OpenMP or pthreads is the first implementation in the + list passed to the threading configure option (-t) and is *not* the + same as querying which implementation is currently selected, since + that can be influenced by BLIS_THREAD_IMPL and/or + bli_thread_set_thread_impl(). + - Changed l3int_t to l3int_ft. + - Updated docs/Multithreading.md to document the new behavior. + - Updated sandbox/gemmlike and addon/gemmd to work with the new fat + threading feature. This included a few bugfixes to bring the codes up + to date, as necessary. + - Comment, whitespace updates. + +commit 89df7b8fa3a3e47ab2fc10ac4d65d0b9fde16942 +Author: Devin Matthews +Date: Sun Sep 18 18:46:57 2022 -0500 + + De-templatized _sup_var1n2m.c; unified _sup_packm_a/b(). (#659) + + Details: + - Re-expressed the two variants in frame/3/bli_l3_sup_var1n2m.c as a + single function each that performs char* pointer arithmetic rather + than four datatype-specific functions. Did the same for the functions + in bli_l3_sup_packm_a.c and _sup_packm_b.c, and then unified the two + into a single set of functions for packing either A or B, which now + resides in bli_l3_sup_packm.c. + - Pre-grow the cntl_t tree in both bli_l3_sup_var1n2m.c variants rather + than grow them incrementally. + - Relocated empty-matrix and scale-by-beta early return handling from + bli_gemm_front() and bli_gemmt_front() to their _ex() counterparts. + - Comment, whitespace updates. + +commit fb91337eff1ee2098f315a83888f6667b3a56f86 +Author: Field G. Van Zee +Date: Thu Sep 15 19:08:10 2022 -0500 + + Fixed a harmless pc_nt bug in 05a811e. + + Details: + - Added missing curly braces around some statements in bli_rntm.c, one + of which needed them in order for the relevant code to be executed in + the intended way. The consequence of 05a811e omitting those braces was + that a statement (pc_nt = 1;) was executed more often than it needed + to be.
+ - Also adjusted the analogous code in bli_thread.c to match that of + bli_rntm.c. + +commit e86076bf4461d1a78186fb21ba8320cfb430f62c +Author: Field G. Van Zee +Date: Thu Sep 15 14:22:59 2022 -0500 + + Test the 'gemmlike' sandbox via AppVeyor. (#664) + + Details: + - Added a fifth test to our .appveyor.yml that enables the 'gemmlike' + sandbox with OpenMP enabled (via clang, the 'auto' configuration + target, and building to a static library). Thanks to Jeff Diamond + for pointing out that this test would be useful. + +commit 63177dca48cb7d066576d884da4a7a599ececebf +Author: Field G. Van Zee +Date: Thu Sep 15 11:21:26 2022 -0500 + + Fixed gemmlike sandbox bug introduced in 7c07b47. + + Details: + - Fixed a bug in the 'gemmlike' sandbox that was introduced in 7c07b47. + This bug was the result of the fact that the gemmlike implementation + uses bli_thrinfo_sup_grow() to grow its thrinfo_t tree, but the + aforementioned commit added an optimization that kicks in when the + rntm_t .pack_a and .pack_b fields are both FALSE. Those fields were + originally added only for sup execution; for large code path, they + are intended to be ignored. But the default initial state of a rntm_t + has those fields set to FALSE, which was inadvertently activating the + optimization (which targeted single-threaded cases only) and would + cause multithreaded use cases of 'gemmlike' to segfault. The fix took + the form of setting the .pack_a and .pack_b fields to TRUE in + bls_gemm_ex(). + - Added minimal 'const' and 'const'-casting to 'gemmlike' so that gcc + stays quiet. + +commit 05a811e898b371a76581abd4afa416980cce7db9 +Author: Field G. Van Zee +Date: Tue Sep 13 19:24:05 2022 -0500 + + Initialize rntm_t nt/ways fields with 1 (not -1). (#663) + + Details: + - Changed the way that rntm_t structs are initialized, mainly so that + the global rntm_t that is set via environment variables at runtime + may be queried by the application prior to any computation taking + place.
(Strictly speaking, the application may already query these + fields, but they do not always contain valid values and often contain + -1 when they are unset.) These changes also served to clarify how + these parameters are treated, and homogenized the implementations of + bli_rntm_set_ways_from_rntm(), bli_rntm_set_ways_from_rntm_sup(), and + bli_thread_init_rntm_from_env(). Special thanks to Jeff Diamond, + Leick Robinson, and Devin Matthews for pointing out that the previous + behavior was needlessly confusing and could be improved. + - The aforementioned modifications also included subtle changes as to + what counts as "setting" a loop's ways of parallelism for the purposes + of deciding whether to use the ways or the total number of threads. + Previously, setting any loop's ways, even to 1, counted in favor of + using the ways. Now, only values greater than 1 will count as + "setting", and all other values will silently be mapped to 1, with + those parameters treated as if they were untouched all along. + - Updated bli_rntm.h and bli_thread.c so that any attempt to set the + PC_NT variable (or pc_nt field of a rntm_t) will either ignore the + request or reassert the value as 1. + - Updated bli_rntm_set_ways() so that rather than clear the + num_threads field, it is set to the product of all of the per-loop + ways of parallelism. + - Removed code from test_libblis.c that handled the possibility of unset + environment variables when printing out their values. + - Removed bli_rntm_equals() inline function from bli_rntm.h, which has + long been disabled. + - Updates to docs/Multithreading.md related to the aforementioned + changes. + - Comment updates. + +commit fd885cf98f4fe1d3bc46468e567776c37c670fcc +Author: Field G. Van Zee +Date: Tue Sep 13 11:50:23 2022 -0500 + + Use kernel CFLAGS for 'kernels' subdirs in addons. 
(#658) + + Details: + - Updated Makefile and common.mk so that the targeted configuration's + kernel CFLAGS are applied to source files that are found in a + 'kernels' subdirectory within an enabled addon. For now, this + behavior only applies when the 'kernels' directory is at the top + level of the addon directory structure. For example, if there is an + addon named 'foobar', the source code must be located in + addon/foobar/kernels/ in order for it to be compiled with the target + configuration's kernel CFLAGS. Any other source code within + addon/foobar/ will be compiled with general-purpose CFLAGS (the same + ones that were used on all addon code prior to this commit). Thanks + to AMD (esp. Mithun Mohan) for suggesting this change and catching an + intermediate bug in the PR. + - Comment/whitespace updates. + +commit cb74202db39dc8cb81fdd06f8a445f8837e27853 +Author: Field G. Van Zee +Date: Tue Sep 13 11:46:24 2022 -0500 + + Fixed incorrect sizeof(type) in edge case macros. (#662) + + Details: + - In bli_edge_case_macro_defs.h, the GEMM_UKR_SETUP_CT_PRE() and + GEMMTRSM_UKR_SETUP_CT_PRE() macros previously declared their temporary + ct microtiles as: + + PASTEMAC(ch,ctype) + _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ + __attribute__((aligned(alignment))); \ + + The problem here is that sizeof( PASTEMAC(ch,type) ) evaluates to + things like sizeof( BLIS_DOUBLE ), not sizeof( double ), and since + BLIS_DOUBLE is an enum, it is typically an int, which means the + sizeof() expression is evaluating to the wrong value. This was likely + a benign bug, though, since BLIS does not support any computational + datatypes that are smaller than sizeof( int ), which means the ct + array would be *over*-allocated rather than underallocated. Thanks + to @moon-chilled for identifying and reporting this bug in #624. + - CREDITS file update. 
+ +commit 6e5431e8494b06bd80efcab3abf0a6456d6c0381 +Author: Devin Matthews +Date: Sat Sep 10 15:16:58 2022 -0500 + + Fix line number issue in flattened blis.h. (#660) + + Details: + - Updated the top-level Makefile so that it invokes flatten-headers.py + without the -c option, which was requesting that comments be stripped + (since comment stripping is disabled by default). + - Updated flatten-headers.py to accept a new option (-l) to enable + insertion of #line directives into the output file. This new option + is enabled by default. + - Also added logic to flatten-headers.py that outputs a warning if both + comment stripping and line numbers are requested since the comment + stripping will cause the line numbers to become inaccurate. + +commit 4afe0cfdab0e069e027f97920ea604249e34df47 +Author: Field G. Van Zee +Date: Thu Sep 8 18:33:20 2022 -0500 + + Defined invscalv, invscalm, invscald operations. (#661) + + Details: + - Defined invert-scale (invscal) operation on vectors (level-1v), + matrices (level-1m), and diagonals (level-1d). + - Added test modules for invscalv and invscalm to the testsuite. + - Updated BLISObjectAPI.md and BLISTypedAPI.md API documentation to + reflect the new operations. Also updated KernelsHowTo.md accordingly. + - Renamed 'beta' to 'alpha' in scalv and scalm testsuite modules (and + input.operations files) so that the parameter name matches the + parameter used in the documentation. + +commit a87eae2b11408b556e562f1b04e673c6cd1612bc +Author: Field G. Van Zee +Date: Tue Sep 6 18:04:09 2022 -0500 + + Added '-q' quiet mode option to testsuite. (#657) + + Details: + - Added support for a '-q' command line option to the testsuite. This + option suppresses most informational output that would normally + clutter up the screen. By default, verbose mode (the previous + status quo) will be operative, and so quiet mode must be requested. 
+ +commit dfa54139664a42d29774e140ec9e5597af869a76 +Author: RuQing Xu +Date: Tue Aug 30 08:07:50 2022 +0800 + + Arm64 dgemmsup with extended MR&NR (#655) + + Details: + - Since the number of registers in NEON is large but their lengths are + short, I'm here extending both MR and NR. + - The approach is to represent the C microtile in registers optionally + in columns, so for sizes like 6x7m, the 'crr' kernel is the default + with 'rrr' supported through an in-register transpose. + - A few asm kernels are crafted for 'rv' to complete this extended size + support. + - For 'rd' I'm still relying heavily on C99 intrinsic kernels with + branching so the performance might not be optimal. (Sorry for that.) + - So far, these changes only affect the 'firestorm' subconfig. + - This commit also contains row-preferential s12x8 and d6x8 gemm + ukernels. These microkernels are templatized versions of the existing + s8x12 and d6x8 ukernels defined in bli_gemm_armv8a_asm_d6x8.c. + +commit 9e5594ad5fc41df8ef2825a025d7844ac2275c27 +Author: Field G. Van Zee +Date: Thu Aug 11 14:36:38 2022 -0500 + + Temporarily disabled #line directives from 6826c1c. + + Details: + - Commented out the inclusion of #line preprocessor directives in the + flattened header output provided by build/flatten-headers.py. This + output was added recently in 6826c1c, but was later found to have + thrown off the line numbering referenced by compiler warnings and + errors (possibly due to license comment blocks, which are stripped + from source headers as they are inlined into the monolithic header). + +commit 775148bcdbb1014b4881a76306f35f5d0fedecbe +Author: jdiamondGitHub +Date: Fri Aug 5 12:01:24 2022 -0500 + + Updated ARMv8a kernels to fix 2 prefetching issues. (#649) + + Details: + - The ARMv8a dgemm/sgemm microkernels had 2 prefetching issues that + impacted performance on modern ARM platforms. The most significant + issue was that only a single prefetch per C tile column was issued. 
+ When a column of C was not cache aligned, the second cache line would + not be prefetched at all, forcing the kernel to wait for an entire + load to update elements of C. This happened with roughly 50% of the + C prefetches. The fix was to have two prefetches per column, spaced + 64 bytes (1 cache line) apart. + - A secondary performance issue was that all the C prefetch instructions + were issued sequentially at the beginning of the kernel call. This + caused a noticeable performance slowdown. Interleaving the prefetch + calls every 2-3 instructions in the prologue code solved the issue. + +commit bbaf29abd942de47a3a99a80a67d12bab41b27db +Author: Field G. Van Zee +Date: Thu Aug 4 17:51:37 2022 -0500 + + Very minor variable updates to common.mk. + + Details: + - Fixed a harmless bug that would have allowed C++ headers into the list + of header suffixes specifically reserved for C99 headers. In practice, + this would have had no substantive effect on anything since the core + BLIS framework does not use C++ headers. + +commit a48e29d799091a833213efeafaf2d342ebdafde9 +Author: Field G. Van Zee +Date: Thu Jul 28 10:11:07 2022 -0500 + + CREDITS file update. + + Details: + - Thanks to Kihiro Bando for assisting with issue #644. + +commit 5b298935de7f20462bfad1893ed34ecd691cec5a +Author: Field G. Van Zee +Date: Wed Jul 27 19:14:15 2022 -0500 + + Removed buggy cruft from power10 subconfig. + + Details: + - Removed #defines for BLIS_BBN_s and BLIS_BBN_d from + bli_kernel_defs_power10.h. These were inadvertently set in ae10d949 + because the power10 subconfig was registering bb packm ukernels, but + only for 6xk (power10 uses s8x16 and d8x8 ukernels) and only because + the original author (probably) copy-pasted from power9 when getting + started. That 6xk packm registration was effectively "dead code" + prior to ae10d949, but was then mistaken as not-dead code during the + ae10d949 refactor. These improper bb factors may have been causing + bugs in power10 builds. 
Thanks to Nicholai Tukanov for helping remind + me what the power10 subconfig was supposed to look like. + - Removed extraneous microkernel preference registrations from power10 + subconfig. Preferences for single and double complex gemm were being + registered despite there being no complex gemm ukernels registered to + go with them. Similarly, there were trsm preferences registered + without any trsm ukernels registered (and BLIS doesn't actually use a + preference for the trsm ukernel anyway). These extraneous + registrations were almost surely not hurting anything, even if they + were quite misleading. + +commit 56de31b00fa0f1ba866321817cd1e5d83000ff11 +Author: Devin Matthews +Date: Wed Jul 27 13:54:17 2022 -0500 + + Disable modification of KC in the gemmsup kernels. (#648) + + This led to a ~50% performance reduction for certain gemm operations (but not others?). See #644 for example. + +commit 4dde947e2ec9e139c162801320c94e6a01a39708 +Author: Field G. Van Zee +Date: Tue Jul 26 17:29:32 2022 -0500 + + Fixed out-of-bounds bug in sup s6x16m haswell kernel. + + Details: + - Fixed another out-of-bounds read access bug in the haswell sup + assembly kernels. This bug is similar to the one fixed in 17b0caa + and affects bli_sgemmsup_rv_haswell_asm_6x2m(). Thanks to Madeesh + Kannan for reporting this bug (and a suitable fix) in #635. + - CREDITS file update. + +commit 6826c1cdfba855513786d9e3d606681316453398 +Author: Devin Matthews +Date: Mon Jul 25 18:21:05 2022 -0500 + + Add `#line` directives to flattened `blis.h`. (#643) + + Details: + - Modified flatten-headers.py so that #line directives are inserted into + the flattened blis.h file. This facilitates easier debugging when + something is amiss in the flattened blis.h because the compiler will + be able to refer to the line number within the original constituent + header file (which is where the fix would go) rather than the line + number within the flattened header (which is not as helpful). 
+ +commit af3a41e02534befdae026377592ce437bab83023 +Author: Alexander Grund +Date: Thu Jul 21 18:05:48 2022 +0200 + + Add autodetection for POWER7, POWER9 & POWER10 (#647) + + Read from `/proc/cpuinfo` as done for ARM. + Fixes #501 + +commit 17b0caa2b2bff439feb6d2b39cfa16e7591882b0 +Author: Field G. Van Zee +Date: Thu Jul 14 17:55:34 2022 -0500 + + Fixed out-of-bounds read in haswell gemmsup kernels. + + Details: + - Fixed memory access bugs in the bli_sgemmsup_rv_haswell_asm_Mx2() + kernels, where M = {1,2,3,4,5,6}. The bugs were caused by loading four + single-precision elements of C, via instructions such as: + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) + + in situations where only two elements are guaranteed to exist. (These + bugs may not have manifested in earlier tests due to the leading + dimension alignment that BLIS employs by default.) The issue was fixed + by replacing lines like the one above with: + + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) + + Thus, we use vmovsd to explicitly load only two elements of C into + registers, and then operate on those values using register addressing. + Thanks to Daniël de Kok for reporting these bugs in #635, and to + Bhaskar Nallani for proposing the fix. + - CREDITS file update. + +commit cc260fd7068f0fe449d818435aa11adb14c17fed +Author: Field G. Van Zee +Date: Wed Jul 13 16:16:01 2022 -0500 + + Allow uniform max problem sizes in test/3/runme.sh. + + Details: + - Tweaked test/3/runme.sh so that the test driver binaries for single- + threaded (st), single-socket (1s), and dual-socket (2s) execution can + be built using identical problem size ranges. Previously, this was not + possible because runme.sh used the maximum problem size, which was + embedded into the binary filename, to tell the three classes of + binaries apart from one another. Now, runme.sh uses the binary suffix + ("st", "1s", or "2s") to tell them apart. 
This required only a few + changes to the logic, but it also required a change in format to the + threading config strings themselves (replacing the max problem size + with "st", "1s", or "2s"). Thanks to Jeff Diamond for inspiring this + improvement. + - Comment updates. + +commit 9b1beec60be31c6ea20b85806d61551497b699e4 +Author: bartoldeman +Date: Mon Jul 11 20:15:12 2022 -0400 + + Use BLIS_ENABLE_COMPLEX_RETURN_INTEL in blastest files (#636) + + Details: + - Fixed a crash that occurs when either cblat1 or zblat1 are linked + with a build of BLIS that was compiled with '--complex-return=intel'. + This fix involved inserting preprocessor macro guards based on + BLIS_ENABLE_COMPLEX_RETURN_INTEL into blastest/src/cblat1.c and + blastest/src/zblat1.c to correctly handle situations where BLIS is + compiled with Intel/f2c-style calling conventions for complex numbers. + - Updated blastest/src/fortran/run-f2c.sh so that future executions + will insert the aforementioned cpp macro conditional where + appropriate. + +commit 98d467891b74021ace7f248cb0856bec734e39b6 +Author: bartoldeman +Date: Mon Jul 11 19:40:53 2022 -0400 + + Change complex_return='intel' for ifx. (#637) + + Details: + - When checking the version string of the Fortran compiler for the + purposes of determining a default return convention for complex + domain values, grep for "IFORT" instead of "ifort" since that string + is common to both the 'ifx' and 'ifort' binaries provided by Intel: + + $ ifx --version + ifx (IFORT) 2022.1.0 20220316 + Copyright (C) 1985-2022 Intel Corporation. All rights reserved. + + $ ifort --version + ifort (IFORT) 2021.6.0 20220226 + Copyright (C) 1985-2022 Intel Corporation. All rights reserved. + +commit ffde54cc5c334aca8eff4d6072ba49496bf3104c +Author: jdiamondGitHub +Date: Mon Jul 11 16:47:30 2022 -0500 + + Minor changes to .gitignore and LICENSE files. (#642) + + Details: + - Macs create .DS_Store files in every directory visited. 
Updated + .gitignore file so these files won't be reported as untracked by + 'git status'. + - Added Oracle Corporation to the LICENSE file. + - Updated UT copyright on behalf of SHPC. + +commit 7cba7ce3dd1533fcc4ca96ac902bdf218686139a +Author: Field G. Van Zee +Date: Fri Jul 8 11:15:18 2022 -0500 + + Minor cleanups, comment updates to bli_gks.c. + + Details: + - Removed a redundant registration of 'a64fx' subconfig in + bli_gks_init(). + - Reordered registration of 'armsve', 'a64fx', and 'firestorm' + subconfigs. Thanks to Jeff Diamond for his input on this reordering. + - Comment updates to bli_gks.c and arch_t enum in bli_type_defs.h. + +commit 667f201b7871da68622027d02bd6b7da3262f8e8 +Author: Field G. Van Zee +Date: Thu Jul 7 16:44:21 2022 -0500 + + Fixed type bug in bli_cntx_set_ukr_prefs(). + + Details: + - Fixed a bug in bli_cntx_set_ukr_prefs() which erroneously typecast the + num_t value read from va_args() down to a bool before being stored + within the cntx_t. This bug was introduced on April 6th 2022, in + ae10d94. This caused the ukernel preferences for double real and + double complex to go unchanged while the preferences for single real + and single complex were corrupted by the former datatypes' + preference values. The bug manifested as degraded performance for + subconfigurations that registered column-preferential ukernels. The + reason is that the erroneous preferences trigger unnecessary + transpositions in the operation, which forces the gemm ukernel to + compute on matrices that are not stored according to its preference. + Thanks to Devin Matthews, Jeff Diamond, and Leick Robinson for their + extensive efforts and assistance in tracking down this issue. + - Augmented the informational header that is output by the testsuite to + include ukernel preferences for gemm, gemmtrsm_[lu], and trsm_[lu]. + - CREDITS file update. 
+ +commit d429b6bfced21a63bf711224ac402f93f0080b52 +Author: Isuru Fernando +Date: Tue Jun 28 15:34:10 2022 -0500 + + Support clang targetting MinGW (#639) + + * Support clang targetting MinGW + + * Fix pthread linking + +commit d93df023348144e091f7b3e3053995648f348aa7 +Author: Field G. Van Zee +Date: Wed Jun 15 14:09:49 2022 -0500 + + Removed unused dt arg in bli_gks_query_ind_cntx(). + + Details: + - Removed the num_t datatype argument from bli_gks_query_ind_cntx(). + This argument stopped being needed by the function in commit e9da642. + Its only use in bli_gks_query_ind_cntx() was to be passed through to + the context initialization function for the chosen induced method, + but even then, commit log notes from e9da642 indicate that I could not + recall why the datatype argument was ever needed by the context init + function to begin with. + - Updated all invocations of bli_gks_query_ind_cntx() to omit the dt + argument. Most of these invocations resided in various standalone test + drivers (and the testsuite). + +commit 56772892450cc92b3fbd6a9d0460153a43fc47ab +Author: Field G. Van Zee +Date: Wed Jun 1 10:49:33 2022 -0500 + + Added SMU citation to README.md intro. + + Details: + - Added a citation to SMU and the Matthews Research Group to the general + attribution of maintainership and development in the Introduction of + the README.md file. Thanks to Robert van de Geijn and Devin Matthews + for suggesting this change. + +commit 4603324eb090dfceaad3693a70b2d60544036aa8 +Author: Field G. Van Zee +Date: Thu May 19 14:07:03 2022 -0500 + + Init/finalize via bli_pthread_switch_t API (#634). + + Details: + - Defined and implemented a new pthread-like abstract datatype and API + in bli_pthread.c. The new type, bli_pthread_switch_t, is similar to + bli_pthread_once_t in some respects. The idea is that like a switch in + your home that controls a light or ceiling fan, it can either be on or + off. The switch starts in the off state. 
Moving from one state to the + other (on to off; off to on) causes some action (i.e., a startup or + shutdown function) to be executed. Trying to move from one state to + the same state (on to on; off to off) is safe in that it results in + no action. Unlike bli_pthread_once(), the API for bli_pthread_switch_t + contains both _on() and _off() interfaces. Also, unlike the _once() + function, the _on() and _off() functions return error codes so that + the 'int' error code returned from the startup or shutdown functions + may be passed back to the caller. Thanks to Devin Matthews for his + input and feedback on this feature. + - Replaced the previous implementation of bli_init_once() and + bli_finalize_once() -- both of which used bli_pthread_once() -- with + ones that rely upon bli_pthread_switch_on() and _switch_off(), + respectively. This also required updating the return types of + _init_apis() and _finalize_apis() to match the function pointer type + required by bli_pthread_switch_on()/_switch_off(). + - Comment updates. + +commit 64a9b061f6032e2b59613aecdbe7bb52161605c1 +Author: Field G. Van Zee +Date: Tue May 10 14:54:22 2022 -0500 + + Fixed misspelling of 'xpbys' in gemm macrokernel. + + Details: + - Fixed a functionally harmless typo in bli_gemm_ker_var2.c where a few + instances of the substring "xpbys" were misspelled as "xbpys". The + misspellings were harmless because they were consistent, and because + they referenced only local symbols. + +commit 1c733402a95ab08b20f3332c2397fd52a2627cf6 +Author: Jed Brown +Date: Thu Apr 28 11:58:44 2022 -0600 + + Fix version check for znver3, which needs gcc >= 10.3 (#628) + + Apple's clang-12 lacks znver3 support, unlike upstream clang-12. + +commit 6431c9e13b86e4442b6aacba18a0ace12288c955 +Author: Field G. Van Zee +Date: Thu Apr 14 13:01:24 2022 -0500 + + Added missing 'const' to zen bli_gemm_small.c. 
+ + Details: + - Added missing 'const' qualifiers to signatures of functions defined in + kernels/zen/3/bli_gemm_small.c. This fixes compile-time errors when + targeting 'zen3' subconfig (which apparently is enabling AMD's + gemm_small code path by default). Thanks to Devin Matthews for + reporting this error. + +commit 9fea633748ed27ef3853bba7cd955690c61092b4 +Author: Devin Matthews +Date: Wed Apr 13 15:59:06 2022 -0500 + + Partial addition of 'const' to all interfaces above the (micro)kernels. (#625) + + Details: + - Added 'const' qualifier to applicable function arguments wherever the + pointed-to object is not internally modified. This change affects + all interfaces that reside above the level of the (micro)kernels. + - Typecast certain function return values to discard 'const' qualifier. + - Removed 'restrict' from various arguments, including cntx_t*, + auxinfo_t*, rntm_t*, thrinfo_t*, mem_t*, and others + - Removed parts of some APIs, such as bli_cntx_*(), due to limited use. + - Merged some variable declarations with their corresponding + initialization statements. + - Whitespace changes. + +commit ae10d9495486f589ed0320f0151b2d195574f1cf (origin/amd) +Author: Devin Matthews +Date: Wed Apr 6 20:31:11 2022 -0500 + + Simplify and rewrite reference packm kernels. (#610) + + Details: + - Reorganized the way kernels are stored within the cntx_t structure so + that rather than having a function pointer for every supported size of + unrolled packm kernel (2xk, 3xk, 4xk, etc.), we store only two packm + kernels per datatype: one to pack MRxk micropanels and one to pack + NRxk micropanels. + - NOTE: The "bb" (broadcast B) reference kernels have been merged into + the "standard" kernels (packm [including 1er and unpackm], gemm, + trsm, gemmtrsm). This replication factor is controlled by + BLIS_BB[MN]_[sdcz] etc. Power9/10 needs testing since only a + replication factor of 1 has been tested. 
armsve also needs testing + since the MR value isn't available as a macro. + - Simplified the bli_cntx_*() APIs to conform to the new unified kernel + array within the cntx_t. Updated existing bli_cntx_init_() + function definitions for all subconfigurations. + - Consolidated all kernel id types (e.g. l1vkr_t, l1mkr_t, l3ukr_t, + etc.) into one kernel id type: ukr_t. + - Various edits, updates, and rewrites of reference kernels pursuant to + the aforementioned changes. + - Define compile-time macro constants (BLIS_MR_[sdcz], BLIS_NR_[sdcz], + and friends) in bli_kernel_macro_defs.h, but only when the macro + BLIS_IN_REF_KERNEL is defined by the build system. + - Loose ends: + - Still need to update documentation, including: + - docs/ConfigurationHowTo.md + - docs/KernelsHowTo.md + to reflect changes made in this commit. + +commit b3e674db3c05ca586b159a71deb1b61d701ae5c9 +Author: Field G. Van Zee +Date: Mon Apr 4 17:31:02 2022 -0500 + + README.md update to link to releases page. + +commit 69fa915464c52f09a5971a60f521900d31a34e69 +Author: Field G. Van Zee +Date: Fri Apr 1 08:47:46 2022 -0500 + + Fixed broken "tagged releases" link in README.md. + +commit 88cab8383ca90ddbb4cf13e69b7d44a1663a4425 +Author: Field G. Van Zee +Date: Fri Apr 1 08:12:06 2022 -0500 + + CHANGELOG update (0.9.0) + +commit 14c86f66b20901b60ee276da355c1b62642c18d2 (tag: 0.9.0) +Author: Field G. Van Zee +Date: Fri Apr 1 08:12:06 2022 -0500 + + Version file update (0.9.0) + +commit 99bb9002f1aff598d347eae2821a3f7bdd1f48e8 +Author: Field G. Van Zee +Date: Fri Apr 1 08:10:59 2022 -0500 + + ReleaseNotes.md update in advance of next version. + +commit bee7678b2558a691ac850819dbe33fefe4fdbee3 +Author: Field G. Van Zee +Date: Thu Mar 31 14:09:39 2022 -0500 + + CREDITS file update. + +commit cf06364327bd2d21d606392371ff3c5962bee5ba +Author: Field G. Van Zee +Date: Tue Mar 29 16:18:25 2022 -0500 + + Fixed typo in BLAS gemm3m call to _check(). 
+ + Details: + - Fixed an unresolved symbol issue leftover from #590 whereby ?gemm3m_() + as defined in bla_gemm3m.c was referencing bla_gemm3m_check(), which + does not exist. It should have simply called the _check() function for + gemm. + +commit 1ec020b33ece1681c0041e2549eed2bd4c6cf356 +Author: Dipal M Zambare <71366780+dzambare@users.noreply.github.com> +Date: Wed Mar 30 02:45:36 2022 +0530 + + AMD kernel updates; frame-specific AMD updates. (#597) + + Details: + - Allow building BLIS with certain framework files (each with the '_amd' + suffix) that have been customized by AMD for Zen-based hardware. These + customized files were derived from portable versions of the same files + (i.e., those without the '_amd' suffix). Whether the portable or AMD- + specific files are compiled is now controlled by a new configure + option, --[en|dis]able-amd-frame-tweaks. This option is disabled by + default in vanilla BLIS, though AMD may choose to enable it by default + in their fork. For now, the added AMD-specific files are: + - bli_gemv_unf_var2_amd.c + - bla_copy_amd.c + - bla_gemv_amd.c + These files reside in 'amd' subdirectories found within the directory + housing their generic counterparts. + - Register optimized real-domain copyv, setv, and swapv kernels in + bli_cntx_init_zen.c. + - Various minor updates to level-1v kernels in 'zen' kernel set. + - Added caxpyf kernel as well as saxpyf and multiple daxpyf kernels to + the 'zen' kernel set + - If the problem passed to ?gemm_() in bla_gemm.c has a unit m or n dim, + call gemv instead and return early. + - Combined variable declarations with their initialization in various + level-2 and level-3 BLAS compatibility files, and also inserted + 'const' qualifier in those same declaration statements. + - Moved frame/compat/bla_gemmt.c and .h to frame/compat/extra/ . + - Added copyv and swapv test drivers to 'test' directory. + - Whitespace, comment changes. 
+ +commit 0db2bd5341c5c3ed5f1cc2bffa90952735efa45f +Author: Bhaskar Nallani +Date: Fri Mar 25 05:11:55 2022 +0530 + + Added BLAS/CBLAS APIs for gemm3m. (#590) + + Details: + - Created ?gemm3m_() and cblas_?gemm3m() APIs that (for now) simply + invoke the 1m implementation unconditionally. (Note that these APIs + bypass sup handling.) + - Added BLAS prototypes for gemm3m in frame/compat/bla_gemm3m.h. + - Added CBLAS prototypes for gemm3m in frame/compat/cblas/src/cblas.h. + - Relocated: + frame/compat/cblas/src/cblas_?gemmt.c + files into + frame/compat/cblas/src/extra/ + - Relocated frame/compat/bla_gemmt.? into frame/compat/extra/ . + - Minor reorganization of prototypes and cpp macro directives in + bli_blas.h, cblas.h, and cblas_f77.h. + - Trivial whitespace change to cblas_zgemm.c. + +commit d6810000e961fe807dc5a7db81180a8355f3eac0 +Author: Devin Matthews +Date: Mon Mar 14 10:29:54 2022 -0500 + + Update Multithreading.md + + Add notes about `BLIS_IR_NT` (should typically be 1) and `BLIS_JR_NT` (should typically be small, e.g. <= 4). [ci skip] + +commit f1dbb0e514f53a3240d3a6cbdc3306b01a2206f5 +Author: Field G. Van Zee +Date: Fri Mar 11 13:38:28 2022 -0600 + + Trivial whitespace change; commit log addendum. + + Details: + - A co-attribution to Mithun Mohan was inadvertently omitted from the + commit log for headline change in the previous commit, 7c07b47. + +commit 7c07b477e432adbbce5812ed9341ba3092b03976 +Author: Field G. Van Zee +Date: Fri Mar 11 13:28:50 2022 -0600 + + Avoid gemmsup barriers when not packing A or B. (#622) + + Details: + - Implemented a multithreaded optimization for the special (and common) + case of employing the gemmsup code path when the user requests + (implicitly or explicitly) that neither A nor B be packed during + computation. 
This optimization takes the form of a greatly reduced + code branch in bli_thrinfo_sup_create_for_cntl(), which avoids a + broadcast and two barriers, and results in higher performance when + obtaining two-way or higher parallelism within BLIS. Thanks to + Bhaskar Nallani of AMD for proposing this change via issue #605. + - Added an early return branch to bli_thrinfo_create_for_cntl() that + detects and quickly handles cases where no parallelism is being + obtained within BLIS (i.e., single-threaded execution). Note that + this special case handling was/is already present in + bli_thrinfo_sup_create_for_cntl(). + - CREDITS file update. + +commit cad10410b2305bc0e328c5f2517ab02593b53428 +Author: Ivan Korostelev +Date: Thu Mar 10 09:58:14 2022 -0600 + + POWER10: edge cases in microkernel (#620) + + Use new API for POWER10 gemm microkernel + +commit 71851a0549276b17db18a0a0c8ab4f54493bf033 +Author: Field G. Van Zee +Date: Tue Mar 8 17:38:09 2022 -0600 + + Fixed level-3 performance bug in haswell ukernels. + + Details: + - Fixed a performance regression affecting nearly all level-3 operations + that use the 'haswell' sgemm and dgemm microkernels. This regression + was introduced in 54fa28b, caused by an ill-formed conditional + expression in the assembly code that controls whether cache lines of C + should be prefetched as rows or as columns. Essentially, the two + branches were reversed, causing incomplete prefetching to occur for + both row- and column-stored instances of matrix C. Thanks to Devin + Matthews for his help finding and fixing this bug. + +commit 84732bf95634ac606c5f2661d9474318e366c386 +Author: Field G. Van Zee +Date: Mon Feb 28 12:19:31 2022 -0600 + + Revamp how tools are handled/checked by configure. + + Details: + - Consolidate handling of tools that are specifiable via CC, CXX, FC, + PYTHON, AR, and RANLIB into one bash function, select_tool_w_env(). + - If the user specifies a tool via an environment variable (e.g. 
+ CC=gcc) and that tool does not seem valid, print an error message + and abort configure, unless the tool is optional (e.g. CXX or FC), + in which case a warning message is printed instead. + - The definition of "seems valid" above amounts to: + - responding to at least one of a basic set of command line options + (e.g. --version, -V, -h) if the os_name is Linux (since GNU tools + tend to respond to flags such as --version) or if the tool in + question is CC, CXX, FC, or PYTHON (which tend to respond to the + expected flags regardless of OS) + - the binary merely existing for AR and RANLIB on Darwin/OSX/BSD. + (These OSes tend to have non-GNU versions of ar and ranlib, which + typically do not respond to --version and friends.) + - This PR addresses #584. Thanks to Devin Matthews for suggesting some + of the changes in this commit. + +commit d5146582b1f1bcdccefe23925d3b114d40cd7e31 +Author: RuQing Xu +Date: Wed Feb 23 03:35:46 2022 +0900 + + ArmSVE Ensure Non-zero Block Size (#615) + + Fixes #613. There are several macros/environment variables which need to be tuned to get good cache block sizes. It would be nice to have a way of getting values automatically. + +commit 4d8352309784403ed6719528968531ffb4483947 +Author: RuQing Xu +Date: Wed Feb 23 01:03:47 2022 +0900 + + Add armsve to arm64 Metaconfig (#614) + + Availability of the `armsve` subconfig is controlled by the compiler version (gcc/clang). Tested for SVE and non-SVE. Fixes #612. + +commit c9700f369aa84fc00f36c4b817ffb7dab72b865d +Author: Field G. Van Zee +Date: Tue Feb 15 15:36:52 2022 -0600 + + Renamed SIMD-related macro constants for clarity. + + Details: + - Renamed the following macros defined in bli_kernel_macro_defs.h: + + BLIS_SIMD_NUM_REGISTERS -> BLIS_SIMD_MAX_NUM_REGISTERS + BLIS_SIMD_SIZE -> BLIS_SIMD_MAX_SIZE + + Also updated all instances of these macros elsewhere, including + subconfigurations, source code, and documentation. Thanks to Devin + Matthews for suggesting this change. 
+ +commit ee9ff988c49f16696679d4c6cd3dcfcac7295be7 +Author: Field G. Van Zee +Date: Tue Feb 15 15:01:51 2022 -0600 + + Move edge cases to gemmtrsm ukrs; doc updates. + + Details: + - Moved edge-case handling into the gemmtrsm microkernel. This required + changing the microkernel API to take m and n dimension parameters as + well as updating all existing gemmtrsm microkernel function pointer + types, function signatures, and related definitions to take m and n + dimensions. Also updated all existing gemmtrsm kernels in the + 'kernels' directory (which for now is limited to haswell and penryn + kernel sets, plus native and 1m-based reference kernels in + 'ref_kernels') to take m and n dimensions, and implemented edge-case + handling within those microkernels via a collection of new C + preprocessor macros defined within bli_edge_case_macro_defs.h. Note + that the edge-case handling for gemm-like operations had already + been relocated into the gemm microkernel in 54fa28b. + - Added descriptive comments to GEMM_UKR_SETUP_CT() and related macros in + bli_edge_case_macro_defs.h to allow for easier reading. + - Updated docs/KernelsHowTo.md to reflect above changes. Also cleaned up + the bullet under "Implementation Notes for gemm" that covers alignment + issues. (Thanks to Ivan Korostelev for pointing out the confusing and + outdated language in issue #591.) + - Other minor tweaks to KernelsHowTo.md. + +commit 25061593460767221e1066f9d720fa6676bbed8f +Author: Devin Matthews +Date: Sun Feb 13 20:11:55 2022 -0600 + + Don't use `-Wl,-flat-namespace`. + + Flat namespaces can cause problems due to conflicting system libraries, + etc., so just mark `xerbla_` as a weak symbol on macOS instead. + +commit 5a4d3f5208d3d8cc1827f8cc90414c764b7ebab3 +Author: Devin Matthews +Date: Sun Feb 13 17:28:30 2022 -0600 + + Use -flat_namespace option to link on macOS + + Fixes #611.
+ +commit 26742910a087947780a089360e2baf82ea109e01 +Author: Devin Matthews +Date: Sun Feb 13 16:53:45 2022 -0600 + + Update CC_VENDOR logic + + Look for `GCC` in addition to `gcc` to handle weird conda version strings. [ci skip] + +commit 2f3872e01d51545c687ae2c8b2650e00552111a7 +Author: RuQing Xu +Date: Mon Feb 7 17:14:49 2022 +0900 + + ArmSVE Adopts Label Wrapper + + For clang (& armclang?) compilation. + + Hopefully solves #609 . + +commit 72089bb2917b78d99cf4f27c69125bf213ee54e6 +Author: RuQing Xu +Date: Sat Feb 5 16:56:04 2022 +0900 + + ArmSVE Use Predicate in M-Direction + + No need to query MR during kernel runtime. + +commit 9cc897f37455d52fbba752e3801f1a9d4a5bfdc1 +Author: Ruqing Xu +Date: Thu Feb 3 16:40:02 2022 +0000 + + Fix SVE Compil. + +commit b5df1811f1bc8212b2cda6bb97b79819afe236a8 +Author: RuQing Xu +Date: Thu Feb 3 02:31:29 2022 +0900 + + Armv8a, ArmSVE: Simplify Gen-C + +commit 35195bb5cea5d99eb3eaf41e3815137d14ceb52d +Author: Devin Matthews +Date: Mon Jan 31 10:29:50 2022 -0600 + + Add armclang detection to configure. + + armclang is treated as regular clang. Fixes #606. [ci skip] + +commit 0be9282cdccf73342d8571d3f7971a9b0af72363 +Author: Field G. Van Zee +Date: Wed Jan 26 17:46:24 2022 -0600 + + Updated zen3 macro constant names. + + Details: + - In config/zen3/bli_family_zen3.h, renamed: + BLIS_SMALL_MATRIX_A_THRES_M_GEMMT -> _M_SYRK + BLIS_SMALL_MATRIX_A_THRES_N_GEMMT -> _N_SYRK + Thanks to Jeff Diamond for helping spot the stale _SYRK naming. + +commit 0ab20c0e72402ba0b17fe2c3ed3e16bf2ace0fd3 +Author: Jeff Hammond +Date: Thu Jan 13 07:29:56 2022 -0800 + + the Apple local label thing is required by Clang in general + + @egaudry and I both saw this issue on Linux with Clang 10. 
+ + ``` + Compiling obj/thunderx2/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.o ('thunderx2' CFLAGS for kernels) + kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c:171:49: fatal error: invalid symbol redefinition + " \n\t" + ^ + :90:5: note: instantiated into assembly here + .SLOOPKITER: + ^ + 1 error generated. + ``` + + Signed-off-by: Jeff Hammond + +commit 81f93be0561c705ae6823d19e40849facc40bef7 +Author: Devin Matthews +Date: Mon Jan 10 10:19:47 2022 -0600 + + Fix row-/column-major pref. in 16x8 haswell sgemm ukr (unused) + +commit 268ce1f29a717d18304713ecc25a2eafe41838c7 +Author: Devin Matthews +Date: Mon Jan 10 10:17:17 2022 -0600 + + Relax alignment constraints + + Remove alignment of temporary AB buffer in edge case handling macros unless alignment is specifically requested (e.g. Core2, SDB/IVB). Fixes #595. + +commit 3f2440b0226d5e23a43d12105d74aa917cd6c610 +Author: Field G. Van Zee +Date: Thu Jan 6 14:57:36 2022 -0600 + + Added m, n dims to gemmd/gemmlike ukernel calls. + + Details: + - Updated the gemmd addon and the gemmlike sandbox code to use the new + microkernel calling sequence, which now includes m and n dimensions so + that the microkernel has all the information necessary to handle edge + cases. Thanks to Jeff Diamond for catching this, which ideally would + have been included in commit 54fa28b. + - Retired var2 of both gemmd and gemmlike to 'attic' directories and + removed their corresponding prototypes. In both cases, var2 was a + variant of the block-panel algorithm where edge-case handling was + abstracted away to a microkernel wrapper. (Since this is now the + official behavior of BLIS microkernels, I saw no need to have it + included as a separate code path.) + - Comment updates. + +commit 864bfab4486ac910ef9a366e9ade4b45a39747fc +Author: Field G. Van Zee +Date: Tue Jan 4 15:10:34 2022 -0600 + + CREDITS file update. 
+ +commit 466b68a3ad118342dc49a8130b7b02f5e7748521 +Author: Devin Matthews +Date: Sun Jan 2 14:59:41 2022 -0600 + + Add unique tag to branch labels for Apple ARM64. + + Add `%=` tag to branch labels, which expands to a unique identifier for each inline assembly block. This prevents duplicate symbol errors on Apple Silicon (#594). Fixes #594. [ci skip] since we can't test Apple Silicon anyways... + +commit 08174a2f6ebbd8ed5aa2bc4edc45da80962f06bb +Author: RuQing Xu +Date: Sat Jan 1 21:35:19 2022 +0900 + + Evict Requirement for SVE GEMM + + For 8<= GCC < 10 compatibility. + +commit 54fa28bd847b389215cffb57a83dc9b3dce79c86 +Author: Devin Matthews +Date: Fri Dec 24 08:00:33 2021 -0600 + + Move edge cases to gemm ukr; more user-custom mods. (#583) + + Details: + - Moved edge-case handling into the gemm microkernel. This required + changing the microkernel API to take m and n dimension parameters. + This required updating all existing gemm microkernel function pointer + types, function signatures, and related definitions to take m and n + dimensions. We also updated all existing kernels in the 'kernels' + directory to take m and n dimensions, and implemented edge-case + handling within those microkernels via a collection of new C + preprocessor macros defined within bli_edge_case_macro_defs.h. Also + removed the assembly code that formerly would handle general stride + IO on the microtile, since this can now be handled by the same code + that does edge cases. + - Pass the obj_t.ker_fn (of matrix C) into bli_gemm_cntl_create() and + bli_trsm_cntl_create(), where this function pointer is used in lieu of + the default macrokernel when it is non-NULL, and ignored when it is + NULL. + - Re-implemented macrokernel in bli_gemm_ker_var2.c to be a single + function using byte pointers rather than one function for each + floating-point datatype.
Also, obtain the microkernel function pointer + from the .ukr field of the params struct embedded within the obj_t + for matrix C (assuming params is non-NULL and contains a non-NULL + value in the .ukr field). Communicate both the gemm microkernel + pointer to use as well as the params struct to the microkernel via + the auxinfo_t struct. + - Defined gemm_ker_params_t type (for the aforementioned obj_t.params + struct) in bli_gemm_var.h. + - Retired the separate _md macrokernel for mixed datatype computation. + We now use the reimplemented bli_gemm_ker_var2() instead. + - Updated gemmt macrokernels to pass m and n dimensions into microkernel + calls. + - Removed edge-case handling from trmm and trsm macrokernels. + - Moved most of bli_packm_alloc() code into a new helper function, + bli_packm_alloc_ex(). + - Fixed a typo bug in bli_gemmtrsm_u_template_noopt_mxn.c. + - Added test/syrk_diagonal and test/tensor_contraction directories with + associated code to test those operations. + +commit 961d9d509dd94f3a66f7095057e3dc8eb6d89839 +Author: Kiran +Date: Wed Dec 8 03:00:38 2021 +0530 + + Re-add BLIS_ENABLE_ZEN_BLOCK_SIZES macro for 'zen'. + + Details: + - Added previously-deleted cpp macro block to bli_cntx_init_zen.c + targeting the Naples microarchitecture that enabled different cache + blocksizes when the number of threads exceeds 16. This commit + represents PR #573. + +commit cf7d616a2fd58e293b496770654040818bf5609c +Author: Devin Matthews +Date: Thu Dec 2 17:10:03 2021 -0600 + + Enable user-customized packm ukernel/variant. (#549) + + Details: + - Added four new fields to obj_t: .pack_fn, .pack_params, .ker_fn, and + .ker_params. These fields store pointers to functions and data that + will allow the user to more flexibly create custom operations while + recycling BLIS's existing partitioning infrastructure. 
+ - Updated typed API to packm variant and structure-aware kernels to + replace the diagonal offset with panel offsets, and changed strides + of both C and P to inc/ldim semantics. Updated object API to the packm + variant to include rntm_t*. + - Removed the packm variant function pointer from the packm cntl_t node + definition since it has been replaced by the .pack_fn pointer in the + obj_t. + - Updated bli_packm_int() to read the new packm variant function pointer + from the obj_t and call it instead of from the cntl_t node. + - Moved some of the logic of bli_l3_packm.c to a new file, + bli_packm_alloc.c. + - Rewrote bli_packm_blk_var1.c so that it uses byte (char*) pointers + instead of typed pointers, allowing a single function to be used + regardless of datatype. This obviated having a separate implementation + in bli_packm_blk_var1_md.c. Also relegated handling of scalars to a + new function, bli_packm_scalar(). + - Employed a new standard whereby right-hand matrix operands ("B") are + always packed as column-stored row panels -- that is, identically to + that of left-hand matrix operands ("A"). This means that while we pack + matrix A normally, we actually pack B in a transposed state. This + allowed us to simplify a lot of code throughout the framework, and + also affected some of the logic in bli_l3_packa() and _packb(). + - Simplified bli_packm_init.c in light of the new B^T convention + described above. bli_packm_init()--which is now called from within + bli_packm_blk_var1()--also now calls bli_packm_alloc() and returns + a bool that indicates whether packing should be performed (or + skipped). + - Consolidated bli_gemm_int() and bli_trsm_int() into a bli_l3_int(), + which, among other things, defaults the new .pack_fn field of the + obj_t to bli_packm_blk_var1() if the field is NULL. + - Defined a new function, bli_obj_reset_origin(), which permanently + refocuses the view of an object so that it "forgets" any offsets from + its original pointer. 
This function also sets the object's root field + to itself. Calls to bli_obj_reset_origin() for each matrix operand + appear in the _front() functions, after the obj_t's are aliased. This + resetting of the underlying matrices' origins is needed in preparation + for more advanced features from within custom packm kernels. + - Redefined bli_pba_rntm_set_pba() from a regular function to a static + inline function. + - Updated gemm_ukr, gemmtrsm_ukr, and trsm_ukr testsuite modules to use + libblis_test_pobj_create() to create local packed objects. Previously, + these packed objects were created by calling lower-level functions. + +commit e229e049ca08dfbd45794669df08a71dba892925 +Author: Field G. Van Zee +Date: Wed Dec 1 17:36:22 2021 -0600 + + Added recu-sed.sh script to 'build' directory. + + Details: + - Added a recursive sed script to the 'build' directory. + +commit 12c66a4acc77bf4927b01e2358e2ac10b61e0a53 +Author: Field G. Van Zee +Date: Fri Nov 19 14:43:53 2021 -0600 + + Minor updates to README.md, docs/Addons.md. + + Details: + - Add additional mentions of addons to README.md, including in the + "What's New" section. + - Removed mention of sandboxes from the long list of advantages + provided by BLIS. + - Very minor description update to opening line of Addons.md. + +commit a4bc03b990fe0572001eb6409efd12cd70677dcf +Author: Field G. Van Zee +Date: Fri Nov 19 13:29:00 2021 -0600 + + Brief mention/link to Addons.md in README.md. + + Details: + - Add a blurb about the new addons feature to the "Documentation for + BLIS developers" section of the README.md, which also links to the + Addons.md document. + +commit b727645eb7a8df39dee74068f734da66322fe0b3 +Merge: 9be97c15 7bde468c +Author: Field G. 
Van Zee +Date: Fri Nov 19 13:22:09 2021 -0600 + + Merge branch 'dev' + +commit 9be97c150e19fa58bca30cb993a6509ae21e2025 +Author: Madan mohan Manokar <86282872+madanm3@users.noreply.github.com> +Date: Thu Nov 18 00:46:46 2021 +0530 + + Support all four dts in test/test_her[2][k].c (#578) + + Details: + - Replaced the hard-coded calls to double-precision real syr, syr2, + syrk, and syrk in the corresponding standalone test drivers in the + 'test' directory with conditional branches that will call the + appropriate BLAS interface depending on which datatype is enabled. + Thanks to Madan mohan Manokar for this improvement. + - CREDITS file update. + +commit 26e4b6b29312b472c3cadf95ccdf5240764777f4 +Author: Dipal M Zambare <71366780+dzambare@users.noreply.github.com> +Date: Thu Nov 18 00:32:00 2021 +0530 + + Added support for AMD's Zen3 microarchitecture. + + Details: + - Added a new 'zen3' subconfiguration targeting support for the AMD Zen3 + microarchitecture (#561). Thanks to AMD for this contribution. + - Restructured clang and AOCC support for zen, zen2, and zen3 + make_defs.mk files. The clang and AOCC version detection now happens + in configure, not in the subconfigurations' makefile fragments. That + is, we've added logic to configure that detects the version of + clang/AOCC, outputs an appropriate variable to config.mk + (ie: CLANG_OT_*, AOCC_OT_*), and then checks for it within the + makefile fragment (as is currently done for the GCC_OT_* variables). + - Added configure support for a GCC_OT_10_1_0 variable (and associated + substitution anchor) to communicate whether the gcc version is older + than 10.1.0, and use this variable to check for recent enough versions + of gcc to use -march=znver3 in the zen3 subconfig. + - Inlined the contents of config/zen/amd_config.mk into the zen and zen2 + make_defs.mk so that the files are self-contained, harmonizing the + format of all three Zen-based subconfigurations' make_defs.mk files. 
+ - Added indenting (with spaces) of GNU make conditionals for easier + reading in zen, zen2, and zen3 make_defs.mk files. + - Adjusted the range of models checked by bli_cpuid_is_zen() (which was + previously 0x00 ~ 0xff and is now 0x00 ~ 0x2f) so that it is + completely disjoint from the models checked by bli_cpuid_is_zen2() + (0x30 ~ 0xff). This is normally necessary because Zen and Zen2 + microarchitectures share the same family (23, or 0x17), and so the + model code is the only way to differentiate the two. But in our case, + fixing the model range for zen *wasn't* actually necessary since we + checked for zen2 first, and therefore the wide zen range acted like + the 'else' of an 'if-else' statement. That said, the change helps + improve clarity for the reader by encoding useful knowledge, which + was obtained from https://en.wikichip.org/wiki/amd/cpuid . + - Added zen2.def and zen3.def files to the collection in travis/cpuid. + Note that support for zen, zen2, and zen3 is now present, and while + all the three microarchitectures have identical instruction sets from + the perspective of BLIS microkernels, they each correspond to + different subconfigurations and therefore merit separate testing. + Thanks to Devin Matthews for his guidance in hacking these files as + slight modifications of zen.def. + - Enabled testing of zen2 and zen3 via the SDE in travis/do_sde.sh. + Now, zen, zen2, and zen3 are tested through the SDE via Travis CI + builds. + - Updated travis/do_sde.sh to grab the SDE tarball from a new ci-utils + repository on GitHub rather than on Intel's website. This change was + made in an attempt to circumvent recent troubles with Travis CI not + being able to download the SDE directly from Intel's website via curl. + Thanks to Devin Matthews for suggesting the idea. + - Updated travis/do_sde.sh to grab the latest version (8.69.1) of the + Intel SDE from the flame/ci-utils repository. + - Updated .travis.yml to use gcc 9. 
The file was previously using gcc 8, + which did not support -march=znver2. + - Created amd64_legacy umbrella family in config_registry for targeting + older (bulldozer, piledriver, steamroller, and excavator) + microarchitectures and moved those same subconfigs out of the amd64 + umbrella family. However, x86_64 retains amd64_legacy as a constituent + member. + - Fixed a bug in configure related to the building of the so-called + config list. When processing the contents of config_registry, + configure creates a series of structures and lists that allow for + various mappings related to configuration families, subconfigs, and + kernel sets. Two of those lists are built via substitution of + umbrella families with their subconfig members, and one of those + lists was improperly performing the substitution in a way that would + erroneously match on partial umbrella family names. That code was + changed to match the code that was already doing the substitution + properly, via substitute_words(). Also added comments noting the + importance of using substitute_words() in both instances. + - Comment updates. + +commit 74c0c622216aba0c24aa2c3a923811366a160cf5 +Author: Field G. Van Zee +Date: Tue Nov 16 16:06:33 2021 -0600 + + Reverted cbc88fe. + + Details: + - Reverted the annotation of some markdown code blocks with 'bash' + after realizing that the in-browser syntax highlighting was not + worthwhile. + +commit cbc88feb51b949ce562d044cf9f99c4e46bb8a39 +Author: Field G. Van Zee +Date: Tue Nov 16 16:02:39 2021 -0600 + + Marked some markdown shell code blocks as 'bash'. + + Details: + - Annotated the code blocks that represent shell commands and output as + 'bash' in README.md and BuildSystem.md. + +commit 78cd1b045155ddf0b9ec6e2ab815f2b216ad9a9e +Author: Field G. Van Zee +Date: Tue Nov 16 15:53:40 2021 -0600 + + Added 'Example Code' section to README.md. 
+ + Details: + - Inserted a new 'Example Code' section into the README.md immediately + after the 'Getting Started' section. Thanks to Devin Matthews for + recommending this addition. + - Moved the 'Performance' section of the README down slightly so that it + appears after the 'Documentation' section. + +commit 7bde468c6f7ecc4b5322d2ade1ae9c0b88e6b9f3 +Author: Field G. Van Zee +Date: Sat Nov 13 16:39:37 2021 -0600 + + Added support for addons. + + Details: + - Implemented a new feature called addons, which are similar to + sandboxes except that there is no requirement to define gemm or any + other particular operation. + - Updated configure to accept --enable-addon= or -a syntax + for requesting an addon be included within a BLIS build. configure now + outputs the list of enabled addons into config.mk. It also outputs the + corresponding #include directives for the addons' headers to a new + companion to the bli_config.h header file named bli_addon.h. Because + addons may wish to make use of existing BLIS types within their own + definitions, the addons' headers must be included sometime after that + of bli_config.h (which currently is #included before bli_type_defs.h). + This is why the #include directives needed to go into a new top-level + header file rather than the existing bli_config.h file. + - Added a markdown document, docs/Addons.md, to explain addons, how to + build with them, and what assumptions their authors should keep in + mind as they create them. + - Added a gemmlike-like implementation of sandwich gemm called 'gemmd' + as an addon in addon/gemmd. The code uses a 'bao_' prefix for local + functions, including the user-level object and typed APIs. + - Updated .gitignore so that git ignores bli_addon.h files. + +commit 7bc8ab485e89cfc6032932e57929e208a28f4be5 +Author: Meghana-vankadari <74656386+Meghana-vankadari@users.noreply.github.com> +Date: Fri Nov 12 04:16:14 2021 +0530 + + Added BLAS/CBLAS APIs for axpby, gemm_batch. 
(#566) + + Details: + - Expanded the BLAS compatibility layer to include support for + ?axpby_() and ?gemm_batch_(). The former is a straightforward + BLAS-like interface into the axpbyv operation while the latter + implements a batched gemm via loops over bli_?gemm(). Also + expanded the CBLAS compatibility layer to include support for + cblas_?axpby() and cblas_?gemm_batch(), which serve as wrappers to + the corresponding (new) BLAS-like APIs. Thanks to Meghana Vankadari + for submitting these new APIs via #566. + - Fixed a long-standing bug in common.mk that for some reason never + manifested until now. Previously, CBLAS source files were compiled + *without* the location of cblas.h being specified via a -I flag. + I'm not sure why this worked, but it may be due to the fact that + the cblas.h file resided in the same directory as all of the CBLAS + source, and perhaps compilers implicitly add a -I flag for the + directory that corresponds to the location of the source file being + compiled. This bug only showed up because some CBLAS-like source code + was moved into an 'extra' subdirectory of that frame/compat/cblas/src + directory. After moving the code, compilation for those files failed + (because the cblas.h header file, presumably, could not be found in + the same location). This bug was fixed within common.mk by explicitly + adding the cblas.h directory to the list of -I flags passed to the + compiler. + - Added test_axpbyv.c and test_gemm_batch.c files to 'test' directory, + and updated test/Makefile to build those drivers. + - Fixed typo in error message string in cblas_sgemm.c. + +commit 28b0982ea70c21841fb23802d38f6b424f8200e1 +Author: Devin Matthews +Date: Wed Nov 10 12:34:50 2021 -0600 + + Refactored her[2]k/syr[2]k in terms of gemmt. (#531) + + Details: + - Renamed herk macrokernels and supporting files and functions to gemmt, + which is possible since at the macrokernel level they are identical. 
+ Then recast herk/her2k/syrk/syr2k in terms of gemmt within the expert + level-3 oapi (bli_l3_oapi_ex.c) while also redefining them as literal + functions rather than cpp macros that instantiate multiple functions. + Thanks to Devin Matthews for his efforts on this issue (#531). + - Check that the maximum stack buffer size is sufficiently large + relative to the register blocksizes for each datatype, and do so when + the context is initialized rather than when an operation is called. + Note that with this change, users who pass in their own contexts into + the expert interfaces currently will *not* have any checks performed. + Thanks to Devin Matthews for suggesting this change. + +commit cfa3db3f3465dc58dbbd842f4462e4b49e7768b4 +Author: Field G. Van Zee +Date: Wed Nov 3 18:13:56 2021 -0500 + + Fixed bug in mixed-dt gemm introduced in e9da642. + + Details: + - Fixed a bug that broke certain mixed-datatype gemm behavior. This + bug was introduced recently in e9da642 when the code that performs + the operation transposition (for microkernel IO preference purposes) + was moved up so that it occurred sooner. However, when I moved that + code, I failed to notice that there was a cpp-protected "if" + conditional that applied to the entire code block that was moved. Once + the code block was relocated, the orphaned if-statement was now + (erroneously) glomming on to the next thing that happened to be in the + function, which happened to be the call to bli_rntm_set_ways_for_op(), + causing a rather odd memory exhaustion error in the sba due to the + num_threads field of the rntm_t still being -1 (because the rntm_t + field were never processed as they should have been). Thanks to + @ArcadioN09 (Snehith) for reporting this error and helpfully including + relevant memory trace output. + +commit f065a8070f187739ec2b34417b8ab864a7de5d7e +Author: Field G. Van Zee +Date: Thu Oct 28 16:05:43 2021 -0500 + + Removed support for 3m, 4m induced methods. 
+ + Details: + - Removed support for all induced methods except for 1m. This included + removing code related to 3mh, 3m1, 4mh, 4m1a, and 4m1b as well as any + code that existed only to support those implementations. These + implementations were rarely used and posed code maintenance challenges + for BLIS's maintainers going forward. + - Removed reference kernels for packm that pack 3m and 4m micropanels, + and removed 3m/4m-related code from bli_cntx_ref.c. + - Removed support for 3m/4m from the code in frame/ind, then reorganized + and streamlined the remaining code in that directory. The *ind(), + *nat(), and *1m() APIs were all removed. (These additional API layers + no longer made as much sense with only one induced method (1m) being + supported.) The bli_ind.c file (and header) were moved to frame/base + and bli_l3_ind.c (and header) and bli_l3_ind_tapi.h were moved to + frame/3. + - Removed 3m/4m support from the code in frame/1m/packm. + - Removed 3m/4m support from trmm/trsm macrokernels and simplified some + pointer arithmetic that was previously expressed in terms of the + bli_ptr_inc_by_frac() static inline function (whose definition was + also removed). + - Removed the following subdirectories of level-0 macro headers from + frame/include/level0: ri3, rih, ri, ro, rpi. The level-0 scalar macros + defined in these directories were used exclusively for 3m and 4m + method codes. + - Simplified bli_cntx_set_blkszs() and bli_cntx_set_ind_blkszs() in + light of 1m being the only induced method left within BLIS. + - Removed dt_on_output field within auxinfo_t and its associated + accessor functions. + - Re-indexed the 1e/1r pack schemas after removing those associated with + variants of the 3m and 4m methods. This leaves two bits unused within + the pack format portion of the schema bitfield. (See bli_type_defs.h + for more info.) 
+ - Spun off the basic and expert interfaces to the object and typed APIs + into separate files: bli_l3_oapi.c and bli_l3_oapi_ex.c; bli_l3_tapi.c + and bli_l3_tapi_ex.c. + - Moved the level-3 operation-specific _check function calls from the + operations' _front() functions to the corresponding _ex() function of + the object API. (This change roughly maintains where the _check() + functions are called in the call stack but lays the groundwork for + future changes that may come to the level-3 object APIs.) Minor + modifications to bli_l3_check.c to allow the check() functions to be + called from the expert interface APIs. + - Removed support within the testsuite for testing the aforementioned + induced methods, and updated the standalone test drivers in the 'test' + directory to reflect the retirement of those induced methods. + - Modified the sandbox contract so that the user is obliged to define + bli_gemm_ex() instead of bli_gemmnat(). (This change was made in light + of the *nat() functions no longer existing.) Also updated the existing + 'power10' and 'gemmlike' sandboxes to come into compliance with the + new sandbox rules. + - Updated BLISObjectAPI.md, BLISTypedAPI.md, Testsuite.md documentation + to reflect the retirement of 3m/4m, and also modified Sandboxes.md to + bring the document into alignment with new conventions. + - Updated various comments; removed segments of commented-out code. + +commit e8caf200a908859fa5f5ea2049911a9bdaa3d270 +Author: Field G. Van Zee +Date: Mon Oct 18 13:04:15 2021 -0500 + + Updated do_sde.sh to get SDE from GitHub. + + Details: + - Updated travis/do_sde.sh so that the script downloads the SDE tarball + from a new ci-utils repository on GitHub rather than from Intel's + website. This change is being made in an attempt to circumvent Travis + CI's recent troubles with downloading the SDE from Intel's website via + curl. Thanks to Devin Matthews for suggesting the idea.
+ +commit 290ff4b1c26737b074d5abbf76966bc22af8c562 +Author: Field G. Van Zee +Date: Thu Oct 14 16:09:43 2021 -0500 + + Disable SDE testing of old AMD microarchitectures. + + Details: + - Skip testing on piledriver, steamroller, and excavator platforms + in travis/do_sde.sh. + +commit 514fd101742dee557e5eb43d0023a221ae8a7172 +Author: Field G. Van Zee +Date: Thu Oct 14 13:50:28 2021 -0500 + + Fixed substitution bug in configure. + + Details: + - Fixed a bug in configure related to the building of the so-called + config list. When processing the contents of config_registry, + configure creates a series of structures and lists that allow for + various mappings related to configuration families, subconfigs, + and kernel sets. Two of those lists are built via substitution + of umbrella families with their subconfig members, and one of + those lists was improperly performing the substitution in a way + that would erroneously match on partial umbrella family names. + That code was changed to match the code that was already doing + the substitution properly, via substitute_words(). + - Added comments noting the importance of using substitute_words() + in both instances. + +commit e9da6425e27a9d63c9fef92afc2dd750c601ccd7 +Author: Field G. Van Zee +Date: Wed Oct 13 14:15:38 2021 -0500 + + Allow use of 1m with mixing of row/col-pref ukrs. + + Details: + - Fixed a bug that broke the use of 1m for dcomplex when the single- + precision real and double-precision real ukernels had opposing I/O + preferences (row-preferential sgemm ukernel + column-preferential + dgemm ukernel, or vice versa). The fix involved adjusting the API + to bli_cntx_set_ind_blkszs() so that the induced method context init + function (e.g., bli_cntx_init__ind()) could call that + function for only one datatype at a time. This allowed the blocksize + scaling (which varies depending on whether we're doing 1m_r or 1m_c) + to happen on a per-datatype basis. This fixes issue #557.
Thanks to + Devin Matthews and RuQing Xu for helping discover and report this bug. + - The aforementioned 1m fix required moving the 1m_r/1m_c logic from + bli_cntx_ref.c into a new function, bli_l3_set_schemas(), which is + called from each level-3 _front() function. The pack_t schemas in the + cntx_t were also removed entirely, along with the associated accessor + functions. This in turn required updating the trsm1m-related virtual + ukernels to read the pack schema for B from the auxinfo_t struct + rather than the context. This also required slight tweaks to + bli_gemm_md.c. + - Repositioned the logic for transposing the operation to accommodate + the microkernel IO preference. This mostly only affects gemm. Thanks + to Devin Matthews for his help with this. + - Updated dpackm pack ukernels in the 'armsve' kernel set to avoid + querying pack_t schemas from the context. + - Removed the num_t dt argument from the ind_cntx_init_ft type defined + in bli_gks.c. The context initialization functions for induced methods + were previously passed a dt argument, but I can no longer figure out + *why* they were passed this value. To reduce confusion, I've removed + the dt argument (including also from the function definition + + prototype). + - Commented out setting of cntx_t schemas in bli_cntx_ind_stage.c. This + breaks high-level implementations of 3m and 4m, but this is okay since + those implementations will be removed very soon. + - Removed some older blocks of preprocessor-disabled code. + - Comment update to test_libblis.c. + +commit 81e103463214d589071ccbe2d90b8d7c19a186e4 +Author: Minh Quan Ho <1337056+hominhquan@users.noreply.github.com> +Date: Wed Oct 13 20:28:02 2021 +0200 + + Alloc at least 1 elem in pool_t block_ptrs. (#560) + + Details: + - Previously, the block_ptrs field of the pool_t was allowed to be + initialized as any unsigned integer, including 0.
However, a length of + 0 could be problematic given that malloc(0) is undefined and therefore + variable across implementations. As a safety measure, we check for + block_ptrs array lengths of 0 and, in that case, increase them to 1. + - Co-authored-by: Minh Quan Ho + +commit 327481a4b0acf485d0cbdd8635dd9b886ba3f2a7 +Author: Minh Quan Ho <1337056+hominhquan@users.noreply.github.com> +Date: Tue Oct 12 19:53:04 2021 +0200 + + Fix insufficient pool-growing logic in bli_pool.c. (#559) + + Details: + - The current mechanism for growing a pool_t doubles the length of the + block_ptrs array every time the array length needs to be increased + due to new blocks being added. However, that logic did not take in + account the new total number of blocks, and the fact that the caller + may be requesting more blocks that would fit even after doubling the + current length of block_ptrs. The code comments now contain two + illustrating examples that show why, even after doubling, we must + always have at least enough room to fit all of the old blocks plus + the newly requested blocks. + - This commit also happens to fix a memory corruption issue that stems + from growing any pool_t that is initialized with a block_ptrs length + of 0. (Previously, the memory pool for packed buffers of C was + initialized with a block_ptrs length of 0, but because it is unused + this bug did not manifest by default.) 
+ - Co-authored-by: Minh Quan Ho + +commit 32a6d93ef6e2af5e486dfd5e46f8272153d3d53d +Merge: 408906fd 2604f407 +Author: Devin Matthews +Date: Sat Oct 9 15:53:54 2021 -0500 + + Merge pull request #543 from xrq-phys/armsve-packm-fix + + ARMSVE Block SVE-Intrinsic Kernels for GCC 8-9 + +commit 408906fdd8892032aa11bd061b7971128f453bef +Merge: 4277fec0 ccf16289 +Author: Devin Matthews +Date: Sat Oct 9 15:50:25 2021 -0500 + + Merge pull request #542 from xrq-phys/armsve-zgemm + + Arm SVE CGEMM / ZGEMM Natural Kernels + +commit ccf16289d2e71fd9511ccf2d13dcebbfa29deabc +Author: RuQing Xu +Date: Fri Oct 8 12:34:14 2021 +0900 + + Arm SVE C/ZGEMM Fix FMOV 0 Mistake + + FMOV [hsd]M, #imm does not allow zero immediate. + Use wzr, xzr instead. + +commit 82b61283b2005f900101056e6df2a108258db602 +Author: RuQing Xu +Date: Fri Oct 8 12:17:29 2021 +0900 + + SH Kernel Unused Eigher + +commit 1749dfa493054abd2e4ddba7cb21278d337e4f74 +Author: RuQing Xu +Date: Fri Oct 8 12:11:53 2021 +0900 + + Arm SVE C/ZGEMM Support *beta==0 + +commit 4b648e47daad256ab8ab698173a97f71ab9f75eb +Author: RuQing Xu +Date: Wed Sep 22 16:42:09 2021 +0900 + + Arm SVE Config armsve Use ZGEMM/CGEMM + +commit f76ea905e216cf640975e6319c6d2f54aeafed2e +Author: RuQing Xu +Date: Tue Sep 21 20:38:44 2021 +0900 + + Arm SVE: Update Perf. Graph + + Pic. size seems a bit different from upstream. + Generaged w/ MATLAB. Open to any change. 
+ +commit 66a018e6ad00d9e8967b67e1aa3e23b20a7efdfe +Author: RuQing Xu +Date: Mon Sep 20 00:16:11 2021 +0900 + + Arm SVE CGEMM 2Vx10 Unindex Process Alpha=1.0 + +commit 9e1e781cb59f8fadb2a10a02376d3feac17ce38d +Author: RuQing Xu +Date: Sun Sep 19 23:30:42 2021 +0900 + + Arm SVE ZGEMM 2Vx10 Unindex Process Alpha=1.0 + +commit f7c6c2b119423e7ba7a24ae2156790e076071cba +Author: RuQing Xu +Date: Thu Sep 16 01:47:42 2021 +0900 + + A64FX Config Use ZGEMM/CGEMM + +commit e4cabb977d038688688aca39b366f98f9c36b7eb +Author: RuQing Xu +Date: Thu Sep 16 01:34:26 2021 +0900 + + Arm SVE Typo Fix ZGEMM/CGEMM C Prefetch Reg + +commit b677e0d61b23f26d9536e5c363fd6bbab6ee1540 +Author: RuQing Xu +Date: Thu Sep 16 01:18:54 2021 +0900 + + Arm SVE Add SGEMM 2Vx10 Unindexed + +commit 3f68e8309f2c5b31e25c0964395a180a80014d36 +Author: RuQing Xu +Date: Thu Sep 16 01:00:54 2021 +0900 + + Arm SVE ZGEMM Support Gather Load / Scatt. St. + +commit c19db2ff826e2ea6ac54569e8aa37e91bdf7cabe +Author: RuQing Xu +Date: Wed Sep 15 23:39:53 2021 +0900 + + Arm SVE Add ZGEMM 2Vx10 Unindexed + +commit e13abde30b9e0e381c730c496e74bc7ae062a674 +Author: RuQing Xu +Date: Wed Sep 15 04:19:45 2021 +0900 + + Arm SVE Add ZGEMM 2Vx7 Unindexed + +commit 49b9d7998eb86f340ae7b26af3e5a135d6a8feee +Author: RuQing Xu +Date: Tue Sep 14 04:02:47 2021 +0900 + + Arm SVE Add ZGEMM 2Vx8 Unindexed + +commit 4277fec0d0293400497ae8bcfc32be5e62319ae9 +Merge: 2329d990 f44149f7 +Author: Devin Matthews +Date: Thu Oct 7 13:47:22 2021 -0500 + + Merge pull request #533 from xrq-phys/arm64-hi-bw + + ARMv8 PACKM and GEMMSUP Kernels + Apple Firestorm Subconfig + +commit 2329d99016fe1aeb86da4552295f497543cea311 (origin/1m_row_col_problem) +Author: Devin Matthews +Date: Thu Oct 7 12:37:58 2021 -0500 + + Update Travis CI badge + + [ci skip] + +commit f44149f787ae3d4b53d9c4d8e6f23b2818b7770d +Author: RuQing Xu +Date: Fri Oct 8 02:35:58 2021 +0900 + + Armv8 Trash New Bulk Kernels + + - They didn't make much improvements. 
+ - Can't register row-preferral and column-preferral ukrs at the same time. + Will break 1m. + +commit 70b52cadc5ef4c16431e1876b407019e6286614e +Author: Devin Matthews +Date: Thu Oct 7 12:34:35 2021 -0500 + + Enable testing 1m in `make check`. + +commit 2604f4071300d109f28c8438be845aeaf3ec44e4 +Author: RuQing Xu +Date: Thu Oct 7 02:39:00 2021 +0900 + + Config ArmSVE Unregister 12xk. Move 12xk to Old + +commit 1e3200326be9109eb0f8c7b9e4f952e45700cbba +Author: RuQing Xu +Date: Thu Oct 7 02:37:14 2021 +0900 + + Revert __has_include(). Distinguish w/ BLIS_FAMILY_** + +commit a4066f278a5c06f73b16ded25f115ca4b7728ecb +Author: RuQing Xu +Date: Thu Oct 7 02:26:05 2021 +0900 + + Register firestorm into arm64 Metaconfig + +commit d7a3372247c37568d142110a1537632b34b8f2ff +Author: RuQing Xu +Date: Thu Oct 7 02:25:14 2021 +0900 + + Armv8 DGEMMSUP Fix Edge 6x4 Switch Case Typo + +commit 2920dde5ac52e09f84aa42990aab8340421522ce +Author: RuQing Xu +Date: Thu Oct 7 02:01:45 2021 +0900 + + Armv8 DGEMMSUP Fix 8x4m Store Inst. Typo + +commit 14b13583f1802c002e195b3b48874b3ebadbeb20 +Author: Devin Matthews +Date: Wed Oct 6 10:22:34 2021 -0500 + + Add test for Apple M1 (firestorm) + + This test will run on Linux, but all the kernels should run just fine. This does not test autodetection but then none of the other ARM tests do either. + +commit a024715065532400da6257b8b3124ca5aecda405 +Author: RuQing Xu +Date: Thu Oct 7 00:15:54 2021 +0900 + + Firestorm CPUID Dispatcher + + Commenting out due to possibly a Xcode bug. + +commit b9da6d55fec447d05c8b67f34ce83617123d8357 +Author: RuQing Xu +Date: Wed Oct 6 12:25:54 2021 +0900 + + Armv8 GEMMSUP Edge Cases Require Signed Ints + + Fix a bug in bli_gemmsup_rd_armv8a_asm_d6x8m.c. + For safety upon similar strategies in the future, + change all [mn]_[iter/left] into signed ints. + +commit 34919de3df5dda7a06fc09dcec12ca46dc8b26f4 +Author: Devin Matthews +Date: Sat Oct 2 18:48:50 2021 -0500 + + Make error checking level a thread-local variable. 
+ + Previously, this was a global variable. Setting the value was synchronized via a mutex but reading the value was not. Of course, these accesses are almost certainly atomic, but there is still the possibility of one thread attempting to set the value and then reading the value set by another thread. For correct operation under user threading (e.g. pthreads), this should probably be thread-local with no mutex. + +commit c3024993c3d50236fad112822215f066496c5831 +Author: Devin Matthews +Date: Tue Oct 5 15:20:27 2021 -0500 + + Fix data race in testsuite. + +commit 353a0d82572f26e78102cee25693130ce6e0ea5b +Author: Devin Matthews +Date: Tue Oct 5 14:24:17 2021 -0500 + + Update .appveyor.yml + + [ci skip] + +commit 4bfadf9b561d4ebe0bbaf8b6d332f07ff531d618 +Author: RuQing Xu +Date: Wed Oct 6 01:51:26 2021 +0900 + + Firestorm Block Size Fixes + +commit 40baf83f0ea2749199b93b5a8ac45c01794b008c +Author: RuQing Xu +Date: Wed Oct 6 01:00:52 2021 +0900 + + Armv8 Handle *beta == 0 for GEMMSUP ??r Case. + +commit 079fbd42ce8cf7ea67a939b0f80f488de5821319 +Merge: f5c03e9f 9905f443 +Author: Devin Matthews +Date: Mon Oct 4 17:21:48 2021 -0500 + + Merge branch 'master' into arm64-hi-bw + +commit 9905f44347eea4c57ef4927b81f1c63e76a92739 +Merge: 6d3036e3 64a421f6 +Author: Devin Matthews +Date: Mon Oct 4 15:58:59 2021 -0500 + + Merge pull request #553 from flame/rpath-fix + + Add an option to use an @rpath-dependent install_name on macOS + +commit 6d3036e31d8a2c1acbc1260489eeb8f535a8f97a +Merge: 53377fcc eaa554aa +Author: Devin Matthews +Date: Mon Oct 4 15:58:43 2021 -0500 + + Merge pull request #545 from hominhquan/clean_error + + bli_error: more cleanup on the error strings array + +commit 53377fcca91e595787b38e2a47780ac0c35a7e7c +Merge: d0a0b4b8 80c5366e +Author: Devin Matthews +Date: Mon Oct 4 15:45:53 2021 -0500 + + Merge pull request #554 from flame/armsve-cleanup + + Move unused ARM SVE kernels to "old" directory. 
+ +commit 80c5366e4a9b8b72d97fba1eab89bab8989c44f4 +Author: Devin Matthews +Date: Mon Oct 4 15:40:28 2021 -0500 + + Move unused ARM SVE kernels to "old" directory. + +commit 64a421f6983ab5bc0b55df30a2ddcfff5bfd73be +Author: Devin Matthews +Date: Mon Oct 4 13:40:43 2021 -0500 + + Add an option to control whether or not to use @rpath. + + Adds `--enable-rpath/--disable-rpath` (default disabled) to use an install_name starting with @rpath/. Otherwise, set the install_name to the absolute path of the install library, which was the previous behavior. + +commit c4a31683dd6f4da3065d86c11dd998da5192740a +Author: Devin Matthews +Date: Mon Oct 4 13:27:10 2021 -0500 + + Fix $ORIGIN usage on linux. + +commit d0a0b4b841fce56b7b2d3c03c5d93ad173ce2b97 +Author: Dave Love +Date: Mon Oct 4 18:03:04 2021 +0000 + + Arm micro-architecture dispatch (#344) + + Details: + - Reworked support for ARM hardware detection in bli_cpuid.c to parse + the result of a CPUID-like instruction. + - Added a64fx support to bli_gks.c. + - #include arm64 and arm32 family headers from bli_arch_config.h. + - Fix the ordering of the "armsve" and "a64fx" strings in the + config_name string array in bli_arch.c. The ordering did not match + the ordering of the corresponding arch_t values in bli_type_defs.h, + as it should have all along. + - Added clang support to make_defs.mk in arm64, cortexa53, cortexa57 + subconfigs. + - Updated arm64 and arm32 families in config_registry. + - Updated docs/HardwareSupport.md to reflect added ARM support. + - Thanks to Dave Love, RuQing Xu, and Devin Matthews for their + contributions in this PR (#344). + +commit 91408d161a2b80871463ffb6f34c455bdfb72492 +Author: Devin Matthews +Date: Mon Oct 4 11:37:48 2021 -0500 + + Use @rpath-based install name on MacOS and use relocatable RPATH entries for testsuite binaries. + + - RPATH entries (and DYLD_LIBRARY_PATH) do nothing on macOS unless the install_name of the library starts with @rpath/. 
While the install_name can be set to the absolute install path, this makes the installation non-relocatable. When using @rpath in the install_name, install paths within the normal DYLD_LIBRARY_PATH work with no changes on the user side, but for install paths off the beaten track, users must specify an RPATH entry when linking (or modify DYLD_LIBRARY_PATH at runtime). Perhaps this could be made into a configure-time option. + - Having relocatable testsuite binaries is not necessarily a priority but it is easy to do with @executable_path (macOS) or $ORIGIN (linux/BSD). + +commit f5c03e9fe808f9bd8a3e0c62786334e13c46b0fc +Author: RuQing Xu +Date: Sun Oct 3 16:51:51 2021 +0900 + + Armv8 Handle *beta == 0 for GEMMSUP ?rc Case. + +commit abc648352c591e26ceee436bd3a45400115b70c5 +Author: RuQing Xu +Date: Sun Oct 3 13:14:19 2021 +0900 + + Armv8 Fix 6x8 Row-Maj Ukr + + - Fixed for 6x8 only, 4x4 & 4x8 pending; + - Installed to config firestorm as benchmark seems to show better perf: + Old: + blis_dgemm_ukr_c 6 8 320 36.87 2.43e-17 PASS + blis_dgemm_ukr_c 6 8 352 40.55 1.04e-17 PASS + blis_dgemm_ukr_c 6 8 384 44.24 5.68e-17 PASS + blis_dgemm_ukr_c 6 8 416 41.67 3.51e-17 PASS + blis_dgemm_ukr_c 6 8 448 34.41 2.94e-17 PASS + blis_dgemm_ukr_c 6 8 480 42.53 2.35e-17 PASS + + New: + blis_dgemm_ukr_r 6 8 352 50.69 1.59e-17 PASS + blis_dgemm_ukr_r 6 8 384 49.15 5.55e-17 PASS + blis_dgemm_ukr_r 6 8 416 50.44 2.86e-17 PASS + blis_dgemm_ukr_r 6 8 448 46.92 3.12e-17 PASS + blis_dgemm_ukr_r 6 8 480 48.08 4.08e-17 PASS + +commit 0a45bc0fbc7aee3876c315ed567fc37f19cdc57f +Merge: 5013a6cb 13dbd5b5 +Author: Devin Matthews +Date: Sat Oct 2 18:59:43 2021 -0500 + + Merge pull request #552 from flame/armsve_beta_0 + + Add explicit handling for beta == 0 in armsve sd and armv7a d gemm ukrs. + +commit 13dbd5b5d3dbf27e33ecf0e98d43c97019a6339d +Author: Devin Matthews +Date: Sat Oct 2 20:40:25 2021 +0000 + + Apply patch from @xrq-phys. 
+ +commit ae0eeeaf77c77892db17027cef10b95ec97c904f +Author: Devin Matthews +Date: Wed Sep 29 16:42:33 2021 -0500 + + Add explicit handling for beta == 0 in armsve sd and armv7a d gemm ukrs. + +commit 5013a6cb7110746c417da96e4a1308ef681b0b88 +Author: Field G. Van Zee +Date: Wed Sep 29 10:38:50 2021 -0500 + + More edits and fixes to docs/FAQ.md. + +commit b36fb0fbc5fda13d9a52cc64953341d3d53067ee +Author: Field G. Van Zee +Date: Tue Sep 28 18:47:45 2021 -0500 + + Fixed newly broken link to CREDITS in FAQ.md. + +commit 3442d4002b3bfffd8848f72103b30691df2b19b1 +Author: Field G. Van Zee +Date: Tue Sep 28 18:43:23 2021 -0500 + + More minor fixes to FAQ.md and Sandboxes.md. + +commit 89aaf00650d6cc19b83af2aea6c8d04ddd3769cb +Author: Field G. Van Zee +Date: Tue Sep 28 18:34:33 2021 -0500 + + Updates to FAQ.md, Sandboxes.md, and README.md. + + Details: + - Updated FAQ.md to include two new questions, reordered an existing + question, and also removed an outdated and redundant question about + BLIS vs. AMD BLIS. + - Updated Sandboxes.md to use 'gemmlike' as its main example, along with + other smaller details. + - Added ARM as a funder to README.md. + +commit c52c43115ec2264fda9380c48d9e6bb1e1ea2ead +Merge: 1fc23d21 1f527a93 +Author: Field G. Van Zee +Date: Sun Sep 26 15:56:54 2021 -0500 + + Merge branch 'dev' + +commit 1fc23d2141189c7b583a5bff2cffd87fd5261444 +Author: Field G. Van Zee +Date: Tue Sep 21 14:54:20 2021 -0500 + + Safelist 'master', 'dev', 'amd' branches. + + Details: + - Modified .travis.yml so that only commits to 'master', 'dev', and + 'amd' branches get built by Travis CI. Thanks to Devin Matthews for + helping to track down the syntax for this change. + +commit 1f527a93b996093e06ef7a8e94fb47ee7e690ce0 +Author: Field G. Van Zee +Date: Mon Sep 20 17:56:36 2021 -0500 + + Re-enable and fix fb93d24. + + Details: + - Re-enabled the changes made in fb93d24. 
+ - Defined BLIS_ENABLE_SYSTEM in bli_arch.c, bli_cpuid.c, and bli_env.c, + all of which needed the definition (in addition to config_detect.c) in + order for the configure-time hardware detection binary to be compiled + properly. Thanks to Minh Quan Ho for helping identify these additional + files as needing to be updated. + - Added additional comments to all four source files, most notably to + prompt the reader to remember to update all of the files when updating + any of the files. Also made the cpp code in each of the files as + consistent/similar as possible. + - Refer to issues #532 and PR #546 for more history. + +commit 7b39c1492067de941f81b49a3b6c1583290336fd +Author: Field G. Van Zee +Date: Mon Sep 20 16:13:50 2021 -0500 + + Reverted fb93d24. + + Details: + - The latest changes in fb93d24 are still causing problems. Reverting + and preparing to move them to a branch. + +commit fb93d242a4fef4694ce2680436da23087bbdd5fe +Author: Field G. Van Zee +Date: Mon Sep 20 15:42:08 2021 -0500 + + Re-enable and fix 8e0c425 (BLIS_ENABLE_SYSTEM). + + Details: + - Re-enable the changes originally made in 8e0c425 but quickly reverted + in 2be78fc. + - Moved the #include of bli_config.h so that it occurs before the + #include of bli_system.h. This allows the #define BLIS_ENABLE_SYSTEM + or #define BLIS_DISABLE_SYSTEM in bli_config.h to be processed by the + time it is needed in bli_system.h. This change should have been + in the original 8e0c425, but was accidentally omitted. Thanks to Minh + Quan Ho for catching this. + - Add #define BLIS_ENABLE_SYSTEM to config_detect.c so that the proper + cpp conditional branch executes in bli_system.h when compiling the + hardware detection binary. The changes made in 8e0c425 were an attempt + to support the definition of BLIS_OS_NONE when configuring with + --disable-system (in issue #532). 
That commit failed because, aside + from the required but omitted header reordering (second bullet above), + AppVeyor was unable to compile the hardware detection binary as a + result of missing Windows headers. This commit, which builds on PR + #546, should help fix that issue. Thanks to Minh Quan Ho for his + assistance and patience on this matter. + +commit eaa554aa52b879d181fdc87ba0bfad3ab6131517 +Author: Minh Quan HO +Date: Wed Sep 15 15:39:36 2021 +0200 + + bli_error: more cleanup on the error strings array + + - There was redundance between the macro BLIS_MAX_NUM_ERR_MSGS (=200) and + the enum BLIS_ERROR_CODE_MAX (-170), while they both mean the same thing: + the maximal number of error codes/messages. + - The previous initialization of error messages at compile time ignored that + the 'bli_error_string' array still occupies useless memory due to 2D char[][] + declaration. Instead, it should be just an array of pointers, pointing at + strings in .rodata section. + - This commit does the two modifications: + * retired macros BLIS_MAX_NUM_ERR_MSGS and BLIS_MAX_ERR_MSG_LENGTH everywhere + * switch bli_error_string from char[][] to char *[] to reduce its footprint + from 40KB (200*200) to 1.3KB (170*sizeof(char*)). + (No problem to use the enum BLIS_ERROR_CODE_MAX at compile-time, + since compiler is smart enough to determine its value is 170.) + +commit 52f29f739dbbb878c4cde36dbe26b82847acd4e9 +Author: Field G. Van Zee +Date: Fri Sep 17 08:38:29 2021 -0500 + + Removed last vestige of #define BLIS_NUM_ARCHS. + + Details: + - Removed the commented-out #define BLIS_NUM_ARCHS in bli_type_defs.h + and its associated (now outdated) comments. BLIS_NUM_ARCHS has been + part of the arch_t enum for some time now, and so this change is + mostly about removing any opportunity for confusion for people who + may be reading the code. Thanks to Minh Quan Ho for leading me to + cleanup. + +commit 849aae09f4fbf8d7abf11f4df1471f1d057e874b +Author: Field G. 
Van Zee +Date: Thu Sep 16 14:47:45 2021 -0500 + + Added new packm var3 to 'gemmlike'. + + Details: + - Defined a new packm variant for the 'gemmlike' sandbox. This new + variant (bls_l3_packm_var3.c) parallelizes the packing operation over + the k dimension rather than the m or n dimensions. Note that the + gemmlike implementation still uses var1 by default, and use of the new + code would require changing bls_l3_packm_a.c and/or bls_l3_packm_b.c + so that var3 is called instead. Thanks to Jeff Diamond for proposing + this (perhaps NUMA-friendly) solution. + +commit b6f71fd378b7cd0cdc5c780e0b8c975a7abde998 +Merge: 9293a68e e3dc1954 +Author: Devin Matthews +Date: Thu Sep 16 12:24:33 2021 -0500 + + Merge pull request #544 from flame/haswell-gemmsup-fpe + + Fix more copy-paste errors in the haswell gemmsup code. + +commit e3dc1954ffb5eee2a8b41fce85ba589f75770eea +Author: Devin Matthews +Date: Thu Sep 16 10:59:37 2021 -0500 + + Fix problem where uninitialized registers are included in vhaddpd in the Mx1 gemmsup kernels for haswell. + + The fix is to use the same (valid) source register twice in the horizontal addition. + +commit 5191c43faccf45975f577c60b9089abee25722c9 +Author: Devin Matthews +Date: Thu Sep 16 10:16:17 2021 -0500 + + Fix more copy-paste errors in the haswell gemmsup code. + + Fixes #486. + +commit 30c29b256ef13f0141ca9e9169cbdc7a45ce3a61 +Author: RuQing Xu +Date: Thu Sep 16 05:01:03 2021 +0900 + + Arm SVE Exclude SVE-Intrinsic Kernels for GCC 8-9 + + Affected configs: a64fx. + +commit bffa85be59dece8e756b9444e762f18892c06ee1 +Author: RuQing Xu +Date: Thu Sep 16 04:31:45 2021 +0900 + + Arm SVE: Correct PACKM Ker Name: Intrinsic Kers + + SVE-Intrinsic-based kernels ought not to use asm in their names. 
+ +commit 9293a68eb6557a9ea43a846435908c3d52d4218b +Merge: ade10f42 98ce6e8b +Author: Devin Matthews +Date: Fri Sep 10 14:13:29 2021 -0500 + + Merge pull request #534 from flame/cxx_test + + Add test to Travis using C++ compiler to make sure blis.h is C++-compatible + +commit 98ce6e8bc916e952510872caa60d818d62a31e69 +Author: Devin Matthews +Date: Fri Sep 10 14:12:13 2021 -0500 + + Do a fast test on OSX. [ci skip] + +commit c76fcad0c2836e7140b6bef3942e0a632a5f2cda +Author: Devin Matthews +Date: Fri Sep 10 13:57:02 2021 -0500 + + Fix AArch64 tests and consolidate some other tests. + +commit e486d666ffefee790d5e39895222b575886ac1ea +Author: Devin Matthews +Date: Fri Sep 10 13:50:16 2021 -0500 + + Use C++ cross-compiler for ARM tests. + +commit fbb3560cb8e2aeab205c47c2b096d4fa306d93db +Author: Devin Matthews +Date: Fri Sep 10 13:38:27 2021 -0500 + + Attempt to fix cxx-test for OOT builds. + +commit 9c0064f3f67d59263c62d57ae19605562bb87cc2 +Author: Devin Matthews +Date: Fri Sep 10 10:39:04 2021 -0500 + + Fix config_name in bli_arch.c + +commit ade10f427835d5274411cafc9618ac12966eb1e7 +Author: Field G. Van Zee +Date: Fri Aug 27 12:47:12 2021 -0500 + + Updated travis-ci.org link in README.md to .com. + +commit 2be78fc97777148c83d20b8509e38aa1fc1b4540 +Author: Field G. Van Zee +Date: Fri Aug 27 12:17:26 2021 -0500 + + Disabled (at least temporarily) commit 8e0c425. + + Details: + - Reverted changes in 8e0c425 due to AppVeyor build failures that we do + not yet understand. + +commit 820f11a4694aee5f234e24277aecca40885ae9d4 +Author: RuQing Xu +Date: Fri Aug 27 13:40:26 2021 +0900 + + Arm Whole GEMMSUP Call Route is Asm/Int Optimized + + - `ref2` call in `bli_gemmsup_rv_armv8a_asm_d6x8m.c` is commented out. + - `bli_gemmsup_rv_armv8a_asm_d4x8m.c` contains a tail `ref2` call but + it's not called by any upper routine. + +commit 8e0c4255de52a0a5cffecbebf6314aa52120ebe4 +Author: Field G. 
Van Zee +Date: Thu Aug 26 15:29:18 2021 -0500 + + Define BLIS_OS_NONE when using --disable-system. + + Details: + - Modified bli_system.h so that the cpp macro BLIS_OS_NONE is defined + when BLIS_DISABLE_SYSTEM is defined. Otherwise, the previous OS- + detecting macro conditionals are considered. This change is to + accommodate a solution to a cross-compilation issue described in + #532. + +commit d6eb70fbc382ad7732dedb4afa01cf9f53e3e027 +Author: Field G. Van Zee +Date: Thu Aug 26 13:12:39 2021 -0500 + + Updated stale calls to malloc_intl() in gemmlike. + + Details: + - Updated two out-of-date calls to bli_malloc_intl() within the gemmlike + sandbox. These calls to malloc_intl(), which resided in + bls_l3_decor_pthreads.c, were missing the err_t argument that the + function uses to report errors. Thanks to Jeff Diamond for helping + isolate this issue. + +commit 2f7325b2b770a15ff8aaaecc087b22238f0c67b7 +Author: Field G. Van Zee +Date: Mon Aug 23 15:04:05 2021 -0500 + + Blacklist clang10/gcc9 and older for 'armsve'. + + Details: + - Prohibit use of clang 10.x and older or gcc 9.x and older for the + 'armsve' subconfiguration. Addresses issue #535. + +commit 7e2951e61fda1c325d6a76ca9956253482d84924 +Author: RuQing Xu +Date: Mon Aug 23 17:06:44 2021 +0900 + + Arm: DGEMMSUP `Macro' Edge Cases Stop Calling Ref + + Ref cannot handle panel strides (packed cases) thus cannot be called + from the beginning of `gemmsup` (i.e. cannot be dispatch target of + gemmsup to other sizes.) + +commit 4fd82b0e9348553d83e258bd4969e49a81f8fcf0 +Author: RuQing Xu +Date: Mon Aug 23 05:18:32 2021 +0900 + + Header Typo + +commit 35409ebe67557c0e7cf5ced138c8166c9c1c909f +Author: RuQing Xu +Date: Mon Aug 23 04:51:47 2021 +0900 + + Arm: DGEMMSUP ??r(rv) Invoke Edge Size + + Plus some fix at edges. + + TODO: Should ensure that no ref kernel appear in beginning of gemmsup + kernels. As ref does not recognise panel stride. 
+ +commit a361492c24fdd919ee037763fc6523e8d7d2967a +Author: RuQing Xu +Date: Mon Aug 23 01:13:39 2021 +0900 + + Arm: DGEMMSUP ?rc(rd) Invoke Edge Size + +commit eaea67401c2ab31f2e51eede59725f64c1a21785 +Merge: 5fc65cdd e320ec6d +Author: Devin Matthews +Date: Sat Aug 21 16:09:31 2021 -0500 + + Merge branch 'master' into cxx_test + +commit 5fc65cdd9e4134c5dcb16d21cd4a79ff426ca9f3 +Author: Devin Matthews +Date: Sat Aug 21 15:59:27 2021 -0500 + + Add test to Travis using C++ compiler to make sure blis.h is C++-compatible. + +commit e320ec6d5cd44e03cb2e2faa1d7625e84f76d668 +Author: Field G. Van Zee +Date: Fri Aug 20 17:15:20 2021 -0500 + + Moved lang defs from _macro_def.h to _lang_defs.h. + + Details: + - Moved miscellaneous language-related definitions, including defs + related to the handling of the 'restrict' keyword, from the top half + of bli_macro_defs.h into a new file, bli_lang_defs.h, which is now + #included immediately after "bli_system.h" in blis.h. This change is + an attempt to fix a report of recent breakage of C++ compilers due + to the recent introduction of 'restrict' in bli_type_defs.h (which + previously was being included *before* bli_macro_defs.h and its + restrict handling therein. Thanks to Ivan Korostelev for reporting + this issue in #527. + - CREDITS file update. + +commit e6799b26a6ecf1e80661a77d857d1c9e9adf50dc +Author: RuQing Xu +Date: Sat Aug 21 02:39:38 2021 +0900 + + Arm: Implement GEMMSUP Fallback Method + + bli_dgemmsup_rv_armv8a_int_6x4mn + +commit 7d5903d8d7570090eb37c592094424d1c64805d1 +Author: RuQing Xu +Date: Sat Aug 21 01:55:50 2021 +0900 + + Arm64 Fix: Support Alpha/Beta in GEMMSUP Intrin + + Forgot to support `alpha`/`beta` in gemmsup_armv8a_int. + +commit 3b275f810b2479eb5d6cf2296e97a658cf1bb769 +Author: Field G. Van Zee +Date: Thu Aug 19 16:06:46 2021 -0500 + + Minor tweaks to gemmlike sandbox. 
+ + Details: + - In the gemmlike sandbox, changed the loop index variable of inner + loop of packm_cxk() from 'd' to 'i' (and likewise for the + corresponding inlined code within packm_var2()). + - Pack matrices A and B using packm_var1() instead of packm_var2(). + +commit 3eccfd456e7e84052c9a429dcde1183a7ecfaa48 +Author: Field G. Van Zee +Date: Thu Aug 19 13:22:10 2021 -0500 + + Added local _check() code to gemmlike sandbox. + + Details: + - Added code to the gemmlike sandbox that handles parameter checking. + Previously, the gemmlike implementation called bli_gemm_check(), which + resides within the BLIS framework proper. Certain modifications that a + user may wish to perform on the sandbox, such as adding a new matrix + or vector operand, would have required additional checks, and so these + changes make it easier for such a person to implement those checks for + their custom gemm-like operation. + +commit 7144230cdb0653b70035ddd91f7f41e06ad8d011 +Author: Field G. Van Zee +Date: Wed Aug 18 13:25:39 2021 -0500 + + README.md citation updates (e.g. BLIS7 bibtex). + +commit 4a955e939044cfd2048cf9f3e33024e3ad1fbe00 +Author: Field G. Van Zee +Date: Mon Aug 16 13:49:27 2021 -0500 + + Tweaks to gemmlike to facilitate 3rd party mods. + + Details: + - Changed the implementation in the 'gemmlike' sandbox to more easily + allow others to provide custom implementations of packm. These changes + include: + - Calling a local version of packm_cxk() that can be modified. This + version of packm_cxk() uses inlined loops in packm_cxk() rather + than querying the context for packm kernels (or even using scal2m). + - Providing two variants of packm, one of which calls the + aforementioned packm_cxk(), the other of which inlines the contents + of packm_cxk() into the variant itself, making it self-contained. + To switch from one to the other, simply change which function gets + called within bls_packm_a() and bls_packm_b(). 
+ - Simplified and cleaned up some variant names in both variants of + packm, relative to their parent code. + +commit 2c0b4150e40c83ea814f69ca766da74c19ed0a58 +Merge: c99fae50 4b8ed99d +Author: Devin Matthews +Date: Sat Aug 14 18:41:35 2021 -0500 + + Merge pull request #527 from flame/obj_t_makeover + + Implement proposed new function pointer fields for obj_t. + +commit 4b8ed99d926876fbf54c15468feae4637268eb6b +Author: Field G. Van Zee +Date: Fri Aug 13 15:31:10 2021 -0500 + + Whitespace tweaks. + +commit c99fae50ac3de0b5380a085aeebebfe67a645407 +Merge: e6d68bc4 4f70eb79 +Author: Devin Matthews +Date: Fri Aug 13 14:48:00 2021 -0500 + + Merge pull request #530 from flame/fix_clang_warnings + + Clean up some warnings that show up on clang/OSX. + +commit e6d68bc4fd0981bea90d7f045779cacfe53f6ae8 +Merge: 20a1c401 ec06b6a5 +Author: Devin Matthews +Date: Fri Aug 13 14:47:46 2021 -0500 + + Merge pull request #529 from flame/fix_make_check_dependencies + + Add dependency on the "flat" blis.h file for the BLIS and BLAS testuite objects. + +commit 1772db029e10e0075b5a59d3fb098487b1ad542a +Author: Devin Matthews +Date: Fri Aug 13 14:46:35 2021 -0500 + + Add row- and column-strides for A/B in obj_ukr_fn_t. + +commit 4f70eb7913ad3ded193870361b6da62b20ec3823 +Author: Devin Matthews +Date: Fri Aug 13 11:12:43 2021 -0500 + + Clean up some warnings that show up on clang/OSX. + +commit 3cddce1e2a021be6064b90af30022b99cbfea986 +Author: Devin Matthews +Date: Thu Aug 12 22:32:34 2021 -0500 + + Remove schema field on obj_t (redundant) and add new API functions. + +commit ec06b6a503a203fa0cdb23273af3c0e3afeae7fa +Author: Devin Matthews +Date: Thu Aug 12 19:27:31 2021 -0500 + + Add dependency on the "flat" blis.h file for the BLIS and BLAS testsuite objects. + + This fixes a bug where "make -j check" may fail after a change to one or more header files, or where testsuite code doesn't get properly recompiled after internal changes. 
+ +commit 20a1c4014c999063e6bc1cfa605b152454c5cbf4 +Author: Field G. Van Zee +Date: Thu Aug 12 14:44:04 2021 -0500 + + Disabled sanity check in bli_pool_finalize(). + + Details: + - Disabled a sanity check in bli_pool_finalize() that was meant to alert + the user if a pool_t was being finalized while some blocks were still + checked out. However, this is exactly the situation that might happen + when a pool_t is re-initialized for a larger blocksize, and currently + bli_pool_reinit() is implemented as _finalize() followed by _init(). + So, this sanity check is not universally appropriate. Thanks to + AMD-India for reporting this issue. + +commit e366665cd2b5ae8d7683f5ba2de345df0a41096f +Author: Field G. Van Zee +Date: Thu Aug 12 14:06:53 2021 -0500 + + Fixed stale API calls to membrk API in gemmlike. + + Details: + - Updated stale calls to the bli_membrk API within the 'gemmlike' + sandbox. This API is now called bli_pba (packed block allocator). + Ideally, this forgotten update would have been included as part of + 21911d6, which is when the branch where the membrk->pba changes were + introduced was merged into 'master'. + - Comment updates. + +commit e38ca28689f31c5e5bd2347704dc33042e5ea176 +Author: RuQing Xu +Date: Fri Aug 13 03:21:19 2021 +0900 + + Added Apple Firestorm (A14/M1) Subconfig + + - Use the same bulk kernel as Cortex-A53 / ThunderX2; + - Larger block size; + - Use gemmsup kernels for double precision. 
+ +commit 3df0e9b653fbb1293cad93010273eea579e753d9 +Author: RuQing Xu +Date: Sat Jul 17 04:21:53 2021 +0900 + + Arm64 8x4 Kernel Use Less Regs + +commit 4e7e225057a05b9722ce65ddf75a9c31af9fbf36 +Author: RuQing Xu +Date: Wed Jun 9 15:46:36 2021 +0900 + + Armv8-A Supplimentary GEMMSUP Sizes for RD + +commit c792d506ba09530395c439051727631fd164f59a +Author: RuQing Xu +Date: Sat Jun 5 04:20:24 2021 +0900 + + Armv8-A Fix GEMMSUP-RD Kernels on GNU Asm + + Suffixed NEON opcode is not supported by GNU assembler + +commit ce4473520975c2c8790c82c65a69d75f8ad758ea +Author: RuQing Xu +Date: Sat Jun 5 04:08:14 2021 +0900 + + Armv8-A Adjust Types for PACKM Kernels + + GCC does not have full NEON intrinsics support. + +commit 8a32d19af85b61af92fcab1c316fb3be1a8d42ce +Author: RuQing Xu +Date: Sat Jun 5 03:31:30 2021 +0900 + + Armv8-A GEMMSUP-RD 6x8m + + Armv8-A now has a complete set of GEMMSUP kernels.. + +commit afd0fa6ad1889ed073f781c8aa8635f99e76b601 +Author: RuQing Xu +Date: Sat Jun 5 01:19:01 2021 +0900 + + Armv8-A GEMMSUP-RD 6x8n + +commit 3c5f7405148ab142dee565d00da331d95a7a07b9 +Author: RuQing Xu +Date: Fri Jun 4 21:50:51 2021 +0900 + + Armv8-A s/d Packing Kernels Fix Typo + + For GCC. + +commit 49b05df7929ec3abc0d27b475d2d406116fe2682 +Author: RuQing Xu +Date: Fri Jun 4 18:04:59 2021 +0900 + + Armv8-A Introduced s/d Packing Kernels + + Sizes according to the 2014 kernels. + +commit c3faf93168c3371ff48a2d40d597bdb27021cad4 +Author: RuQing Xu +Date: Thu Jun 3 23:09:05 2021 +0900 + + Armv8-A DGEMMSUP 6x8m Kernel + + Recommended kernels set: + ... + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, + ... 
+ bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1, + -1, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 ); + ... + +commit 3efe707b5500954941061d4c2363d6ed41d17233 +Author: RuQing Xu +Date: Thu Jun 3 17:20:57 2021 +0900 + + Armv8-A DGEMMSUP Adjustments + +commit 8ed8f5e625de9b77a0f14883283effe79af01771 +Author: RuQing Xu +Date: Thu Jun 3 16:37:37 2021 +0900 + + Armv8-A Add More DGEMMSUP + + - Add 6x8 GEMMSUP. + - Adjust prefetching. + - Workaround for Clang's disability to handle reg clobbering. + - Subproduct 6x8 row-major GEMM <- incomplete. + +commit a9ba79ea14de3b5a271e5970cb473d3c52e2fa5f +Author: RuQing Xu +Date: Wed Jun 2 15:04:29 2021 +0900 + + Armv8-A Add GEMMSUP 4x8n Kernel + + - Compile w/ both GCC & Clang. + - Edge cases use ref-kernels. + - Can give performance boost in some contexts. + +commit df40efe8fbfd399d76c6000ec03791a9b76ffbdf +Author: RuQing Xu +Date: Wed Jun 2 00:04:20 2021 +0900 + + Armv8-A Add Part of GEMMSUP 8x4m Kernel + + - Compile w/ both GCC & Clang + - Only block part is implement. Edge cases WIP + - Not Optimal kernel scheme. Should do 4x8 instead + +commit 66399992881316514f64d68ec9eb60a87d53f674 +Author: RuQing Xu +Date: Sat May 29 05:52:05 2021 +0900 + + Armv8A DGEMM 4x4 Kernel WIP. Slow + + Quite slow. + +commit a29c16394ccef02d29141c79b71fb408e20073e6 +Author: RuQing Xu +Date: Sat May 29 04:58:45 2021 +0900 + + Armv8-A Add 8x4 Kernel WIP + + Test result: a bit lower GFlOps than 6x8. + +commit 64a1f786d58001284aa4f7faf9fae17f0be7a018 +Author: Devin Matthews +Date: Wed Aug 11 17:53:12 2021 -0500 + + Implement proposed new function pointer fields for obj_t. + + The added fields: + 1. `pack_t schema`: storing the pack schema on the object allows the macrokernel to act accordingly without side-channel information from the rntm_t and cntx_t. The pack schema and "pack_[ab]" fields could be removed from those structs. + 2. 
`void* user_data`: this field can be used to store any sort of additional information provided by the user. The pointer is propagated to submatrix objects and copies, but is otherwise ignored by the framework and the default implementations of the following three fields. User-specified pack, kernel, or ukr functions can do whatever they want with the data, and the user is 100% responsible for allocating, assigning, and freeing this buffer. + 3. `obj_pack_fn_t pack`: the function called when a matrix is packed. This functions receives the expected arguments, as well as a mdim_t and mem_t* as memory must be allocated inside this function, and behavior may differ based on which matrix is being backed (i.e. transposition for B). This could also be achieved by passing a desired pack schema, but this would require additional information to travel down the control tree. + 4. `obj_ker_fn_t ker`: the function called when we get to the "second loop", or the macro-kernel. Behavior may depend on the pack schemas of the input matrices. The default implementation would perform the inner two loops around the ukr, and then call either the default ukr or a user-supplied one (next field). + 5. `obj_ukr_fn_t ukr`: the function called by the default macrokernel. This would replace the various current "virtual" microkernels, and could also be used to supply user-defined behavior. Users could supply both a custom kernel (above) and microkernel, although the user-specified kernel does **not** necessarily have to call the ukr function specified on the obj_t. + + Note that no macros or functions for accessing these new fields have been defined yet. That is next once these are finalized. Addresses https://github.com/flame/blis/projects/1#card-62357687. + +commit a32257eeab2e9946e71546a05a1847a39341ec6b +Author: Field G. Van Zee +Date: Thu Aug 5 16:23:02 2021 -0500 + + Fixed bli_init.c compile-time error on OSX clang. 
+ + Details: + - Fixed a compile-time error in bli_init.c when compiling with OSX's + clang. This error was introduced in 868b901, which introduced a + post-declaration struct assignment where the RHS was a struct + initialization expression (i.e. { ... }). This use of struct + initializer expressions apparently works with gcc despite it not + being strict C99. The fix included in this commit declares a temporary + variable for the purposes of being initialized to the desired value, + via the struct initializer, and then copies the temporary struct (via + '=' struct assignment) to the persistent struct. Thanks to Devin + Matthews for his help with this. + +commit c8728cfbd19ecde9d43af05829e00bcfe7d86eed +Author: Field G. Van Zee +Date: Thu Aug 5 15:17:09 2021 -0500 + + Fixed configure breakage on OSX clang. + + Details: + - Accept either 'clang' or 'LLVM' in vendor string when greping for + the version number (after determining that we're working with clang). + Thanks to Devin Matthews for this fix. + +commit 868b90138e64c873c780d9df14150d2a370a7a42 +Author: Field G. Van Zee +Date: Wed Aug 4 18:31:01 2021 -0500 + + Fixed one-time use property of bli_init() (#525). + + Details: + - Fixes a rather obvious bug that resulted in segmentation fault + whenever the calling application tried to re-initialize BLIS after + its first init/finalize cycle. The bug resulted from the fact that + the bli_init.c APIs made no effort to allow bli_init() to be called + subsequent times at all due to it, and bli_finalize(), being + implemented in terms of pthread_once(). This has been fixed by + resetting the pthread_once_t control variable for initialization + at the end of bli_finalize_apis(), and by resetting the control + variable for finalization at the end of bli_init_apis(). Thanks to + @lschork2 for reporting this issue (#525), and to Minh Quan Ho and + Devin Matthews for suggesting the chosen solution. + - CREDITS file update. 
+ +commit 8dba1e752c6846a85dea50907135bbc5cbc54ee5 +Author: Field G. Van Zee +Date: Tue Jul 27 12:38:24 2021 -0500 + + CREDITS file update. + +commit cc9206df667b7c710b57b190b8ad351176de53b8 +Author: Field G. Van Zee +Date: Fri Jul 16 15:48:37 2021 -0500 + + Added Graviton2 Neoverse N1 performance results. + + Details: + - Added single-threaded and multithreaded performance results to + docs/Performance.md. These results were gathered on a Graviton2 + Neoverse N1 server. Special thanks to Nicholai Tukanov for + collecting these results via the Arm-HPC/AWS hackaton. + - Corrected what was supposed to be a temporary tweak to the legend + labels in test/3/octave/plot_l3_perf.m. + +commit fab5c86d68137b59800715efb69214c0a7e458a7 +Merge: 84f9dcd4 d073fc9a +Author: Devin Matthews +Date: Tue Jul 13 16:46:21 2021 -0500 + + Merge pull request #516 from nicholaiTukanov/p10-sandbox-rework + + P10 sandbox rework + +commit 84f9dcd449fa7a4cf4087fca8ec4ca0d10e9b801 +Author: Devin Matthews +Date: Tue Jul 13 16:45:44 2021 -0500 + + Remove unnecesary windows/zen2 directory. + +commit 21911d6ed3438ca4ba942d05851ba5d7e9835586 +Merge: 17729cf4 689fa0f4 +Author: Field G. Van Zee +Date: Fri Jul 9 18:10:46 2021 -0500 + + Merge branch 'dev' + +commit 17729cf449919d1db9777cea5b65d2efc77e2692 +Author: Devin Matthews +Date: Fri Jul 9 14:59:48 2021 -0500 + + Add vzeroupper to Haswell microkernels. (#524) + + Details: + - Added vzeroupper instruction to the end of all 'gemm' and 'gemmtrsm' + microkernels so as to avoid a performance penalty when mixing AVX + and SSE instructions. These vzeroupper instructions were once part + of the haswell kernels, but were inadvertently removed during a source + code shuffle some time ago when we were managing duplicate 'haswell' + and 'zen' kernel sets. Thanks to Devin Matthews for tracking this down + and re-inserting the missing instructions. 
+ +commit c9a7f59aa84daa54d8f8c771f1f1ef2bd8730da2 +Merge: 75f03907 9a8e649c +Author: Devin Matthews +Date: Thu Jul 8 14:00:38 2021 -0500 + + Merge pull request #522 from flame/windows-avx512 + + Fix Win64 AVX512 bug. + +commit 9a8e649c5ac89eba951bbee7136ca28aeb24d731 +Author: Devin Matthews +Date: Wed Jul 7 15:23:57 2021 -0500 + + Fix Win64 AVX512 bug. + + Use `-march=haswell` for kernels. Fixes #514. + +commit 75f03907c58385b656c8bd35d111db245814a9f3 +Author: Devin Matthews +Date: Wed Jul 7 15:44:11 2021 -0500 + + Add comment about make checkblas on Windows + + [ci skip] + +commit 4651583b1204a965e4aa672c7ad6de60f3ab1600 +Merge: 69205ac2 174f7fc9 +Author: Devin Matthews +Date: Wed Jul 7 01:11:20 2021 -0500 + + Merge pull request #520 from flame/travis-ci-install + + Test installation in Travis CI + +commit 69205ac266947723ad4d7bb028b7521fe5c76991 +Author: Field G. Van Zee +Date: Tue Jul 6 20:39:22 2021 -0500 + + CREDITS file update. + + Details: + - Thanks to Chengguo Sun for submitting #515 (5ef7f68). + - Thanks to Andrew Wildman for submitting #519 (551c6b4). + - Whitespace update to configure (spaces to tabs). + +commit 174f7fc9a11712c7bd1a61510bdc5c262b3e8e1f +Author: Devin Matthews +Date: Tue Jul 6 19:35:55 2021 -0500 + + Test installation in Travis CI + +commit 551c6b4ee8cd9dd2e1d1b46c8dde09eb50b91b2c +Merge: 78eac6a0 f648df4e +Author: Devin Matthews +Date: Tue Jul 6 19:32:53 2021 -0500 + + Merge pull request #519 from awild82/oot_build_bugfix + + Fix installation from out-of-tree builds + +commit f648df4e5588f069b2db96f8be320ead0c1967ef +Author: Andrew Wildman +Date: Tue Jul 6 16:35:12 2021 -0700 + + Add symlink to blis.pc.in for out-of-tree builds + +commit 78eac6a0ab78c995c3f4e46a9e87388b5c3e1af6 +Author: Devin Matthews +Date: Tue Jul 6 11:05:43 2021 -0500 + + Revert "Always run `make check`." + + This reverts commit a201a53440c51244739aaee20e3309b50121cc68. 
+ +commit a201a53440c51244739aaee20e3309b50121cc68 +Author: Devin Matthews +Date: Mon Jul 5 21:39:18 2021 -0500 + + Always run `make check`. + + I'm concerned that problems may lurk for `x86_64` builds on Windows which may be uncovered by a fuller `make check`. + +commit 5ef7f684dc75fc707c82f919e0836615f90a2627 +Merge: aaa10c87 ad6231cc +Author: Devin Matthews +Date: Mon Jul 5 21:35:07 2021 -0500 + + Merge pull request #515 from chengguosun/bug-fix + + Fixed configure script bug. + +commit ad6231cca3fc1e477752ecd31b1ee2323398a642 +Author: sunchengguo +Date: Tue Jul 6 07:30:00 2021 -0400 + + Fixed configure script bug. + Details: + - Fixed kernel list string substitution error by adding function substitute_words in configure script. + if the string contains zen and zen2, and zen need to be replaced with another string, then zen2 + also be incorrectly replaced. + +commit d073fc9acac9d702556cab9fbbb3a253eeb1f998 +Author: nicholaiTukanov +Date: Fri Jul 2 19:54:33 2021 -0500 + + Update POWER10.md + +commit 907226c0af4afb6323b4e02be4f73f5fb89cddaf +Author: nicholaiTukanov +Date: Fri Jul 2 19:47:18 2021 -0500 + + Rework POWER10 sandbox + + - Add a testsuite for gathering performance (in GFLOPs) and measuring correctness for the POWER10 GEMM reduced precision/integer kernels. + - Reworked GENERIC_GEMM template to hardcode the cache parameters. + - Remove kernel wrapper that checked that only allowed matrices that weren't transposed or conjugated. However, the kernels still assume the matrices are not transposed. This wrapper was removed for performance reasons. + - Renamed and restructured files and functions for clarity. + - Editted the POWER10 document to reflect new changes. + +commit aaa10c87e19449674a4ca30fa3b6392bb22c3a66 +Author: Field G. Van Zee +Date: Mon Jun 21 17:53:52 2021 -0500 + + Skip clearing temp microtile in gemmlike sandbox. 
+ + Details: + - Removed code from gemmlike sandbox files bls_gemm_bp_var1.c and + bls_gemm_bp_var2.c that initializes the elements of the temporary + microtile to zero. This code, introduced recently in 7f7d726, did + not actually fix any bug (despite that commit's log entry). The + microtile does not need to be initialized because it is completely + overwritten by a "beta = 0" invocation of gemm prior to it being + read. Any NaNs or Infs present at the outset would have no impact + on the output matrix C. Thanks to Devin Matthews for reminding me + of this. + +commit bc10a3f2ff518360c32bea825b3eb62a9e4c8a77 +Merge: bf727636 6548ceba +Author: Devin Matthews +Date: Fri Jun 18 19:01:08 2021 -0500 + + Merge pull request #492 from flame/thunderx2-clang + + Allow clang for ThunderX2 config + +commit bf727636632a368f3247dc8ab1d4b6119e9c511a +Merge: e28f2a2d 5fc93e28 +Author: Devin Matthews +Date: Fri Jun 18 18:59:43 2021 -0500 + + Merge pull request #506 from xrq-phys/arm64-mac + + BLIS on Darwin_Aarch64 + +commit e28f2a2dfcff14e7094fce0b279b3a917b3ab98c +Merge: d10e05bb 56ffca6a +Author: Devin Matthews +Date: Tue Jun 15 19:35:07 2021 -0500 + + Merge pull request #513 from nicholaiTukanov/asm_warning_p9_fix + + Fix assembler warning in POWER9 DGEMM + +commit 56ffca6a9bc67432a7894298739895f406e5f467 +Author: nicholai +Date: Tue Jun 15 18:17:39 2021 -0500 + + Fix asm warning + +commit 689fa0f40399bde1acc5367d6dd4e8fc4eb6f3ea +Merge: b683d01b d10e05bb +Author: Field G. Van Zee +Date: Sun Jun 13 19:44:14 2021 -0500 + + Merge branch 'master' into dev + +commit d10e05bbd1ce45ce2c0dfe5c64daae2633357b3f +Author: Field G. Van Zee +Date: Sun Jun 13 19:36:16 2021 -0500 + + Sandbox header edits trigger full library rebuild. + + Details: + - Adjusted the top-level Makefile so that any change to a sandbox header + file will result in blis.h being regenerated along with a full + recompilation of the library. 
Previously, sandbox files were omitted + from the list of header files that, when touched, could trigger a full + rebuild. Why was it like that previously? Because originally we only + envisioned using sandboxes to *replace* gemm, not augment the library + with new functionality. When replacing gemm, blis.h does not need to + contain any local sandbox defintions in order for the user to be able + to (indirectly) use that sandbox. But if you are adding functions to + the library, those functions need to be prototyped so the compiler + can perform type checking against the user's invocation of those new + functions. Thanks to Jeff Diamond for helping us discover this + deficiency in the build system. + +commit 7c3eb44efaa762088c190bb820ef6a3c87db8f65 +Author: Devin Matthews +Date: Wed Jun 2 11:28:22 2021 -0500 + + Add vhsubpd/vhsubpd. + + Horizontal subtraction instructions added to bli_x86_asm_macros.h, currently unused [ci skip]. + +commit 7f7d72610c25f511ba8cd2a53be7b59bdb80f3f3 +Author: Field G. Van Zee +Date: Mon May 31 16:50:18 2021 -0500 + + Fixed bugs in cpackm kernels, gemmlike code. + + Details: + - Fixed intermittent bugs in bli_packm_haswell_asm_c3xk.c and + bli_packm_haswell_asm_c8xk.c whereby the imaginary component of the + kappa scalar was incorrectly loaded at an offset of 8 bytes (instead + of 4 bytes) from the real component. This was almost certainly a copy- + paste bug carried over from the corresonding zpackm kernels. Thanks to + Devin Matthews for bringing this to my attention. + - Added missing code to gemmlike sandbox files bls_gemm_bp_var1.c and + bls_gemm_bp_var2.c that initializes the elements of the temporary + microtile to zero. (This bug was never observed in output but rather + noticed analytically. It probably would have also manifested as + intermittent failures, this time involving edge cases.) + - Minor commented-out/disabled changes to testsuite/src/test_gemm.c + relating to debugging. 
+ +commit 5fc93e280614b4a21a9cff36cf873b4b9407285b +Author: RuQing Xu +Date: Sat May 29 18:44:47 2021 +0900 + + Armv8A Rename Regs for Safe Darwin Compile + + Avoid x18 use in FP32 kernel: + - C address lines x[18-26] renamed to x[19-27] (reg index +1) + - Original role of x27 fulfilled by x5 which is free after k-loop pert. + + FP64 does not require changing since x18 is not used there. + +commit 9f4a4a3cfb2244e4024445e127dafd2a11f39fc5 +Author: RuQing Xu +Date: Sat May 29 17:21:28 2021 +0900 + + Armv8A Rename Regs for Clang Compile: FP32 Part + + Roughly the same as 916e1fa , additionally with x15 clobbering removed. + - x15: Not used at all. + + Compilation w/ Clang shows warning about x18 reservation, but + compilation itself is OK and all tests got passed. + +commit 916e1fa8be3cea0e3e2a4a7e8b00027ac2ee7780 +Author: RuQing Xu +Date: Sat May 29 16:46:52 2021 +0900 + + Armv8A Rename Regs for Clang Compile: FP64 Part + + - x7, x8: Used to store address for Alpha and Beta. + As Alpha & Beta was not used in k-loops, use x0, x1 to load + Alpha & Beta's addresses after k-loops are completed, since A & B's + addresses are no longer needed there. + This "ldr [addr]; -> ldr val, [addr]" would not cause much performance + drawback since it is done outside k-loops and there are plenty of + instructions between Alpha & Beta's loading and usage. + - x9: Used to store cs_c. x9 is multiplied by 8 into x10 and not used + any longer. Directly loading cs_c and into x10 and scale by 8 spares + x9 straightforwardly. + - x11, x12: Not used at all. Simply remove from clobber list. + - x13: Alike x9, loaded and scaled by 8 into x14, except that x13 is + also used in a conditional branch so that "cmp x13, #1" needs to be + modified into "cmp x14, #8" to completely free x13. + - x3, x4: Used to store next_a & next_b. Untouched in k-loops. Load + these addresses into x0 and x1 after Alpha & Beta are both loaded, + since then neigher address of A/B nor address of Alpha/Beta is needed. 
+ +commit 7fabd896af773623ed01820a71bbff432e8a7d25 +Author: RuQing Xu +Date: Sat May 29 16:28:03 2021 +0900 + + Asm Flag Mingling for Darwin_Aarch64 + + Apple+Arm64 requires additional "tagging" of local symbols. + +commit 213dce32d2eed8b7a38c6a3f6112072b0a89ecd0 +Author: Field G. Van Zee +Date: Fri May 28 14:49:57 2021 -0500 + + Added a new 'gemmlike' sandbox. + + Details: + - Added a new sandbox called 'gemmlike', which implements sequential and + multithreaded gemm in the style of gemmsup but also unconditionally + employs packing. The purpose of this sandbox is to + (1) avoid select abstractions, such as objects and control trees, in + order to allow readers to better understand how a real-world + implementation of high-performance gemm can be constructed; + (2) provide a starting point for expert users who wish to build + something that is gemm-like without "reinventing the wheel." + Thanks to Jeff Diamond, Tze Meng Low, Nicholai Tukanov, and Devangi + Parikh for requesting and inspiring this work. + - The functions defined in this sandbox currently use the "bls_" prefix + instead of "bli_" in order to avoid any symbol collisions in the main + library. + - The sandbox contains two variants, each of which implements gemm via a + block-panel algorithm. The only difference between the two is that + variant 1 calls the microkernel directly while variant 2 calls the + microkernel indirectly, via a function wrapper, which allows the edge + case handling to be abstracted away from the classic five loops. + - This sandbox implementation utilizes the conventional gemm microkernel + (not the skinny/unpacked gemmsup kernels). + - Updated some typos in the comments of a few files in the main + framework. + +commit 82af05f54c34526a60fd2ec46656f13e1ac8f719 +Author: Field G. Van Zee +Date: Tue May 25 15:25:08 2021 -0500 + + Updated Fugaku (a64fx) performance results. 
+ + Details: + - Updated the performance graphs (pdfs and pngs) for the Fugaku/a64fx + entry within Performance.md, and also updated the experiment details + accordingly. Thanks to RuQing Xu for re-running the BLIS and SSL2 + experiments reflected in this commit. + - In Performance.md, added an English translation of the project name + under which the Fugaku results were gathered, courtesy of RuQing Xu. + +commit e5c85da3763f73854ecd739ba3008bb467ed77c3 +Merge: cbd8d393 5feb04e2 +Author: Devin Matthews +Date: Mon May 24 16:56:22 2021 -0500 + + Merge pull request #503 from flame/windows-compiler-check + + Add explicit compiler check for Windows. + +commit cbd8d3932599485727204479fded66ac19186db4 +Merge: 6d4ab022 932dfe6a +Author: Devin Matthews +Date: Mon May 24 16:32:42 2021 -0500 + + Merge pull request #500 from xrq-phys/armsve+travis + + Upgrade Travis CI for Arm SVE + +commit 5feb04e233e1e6f81c727578ad9eae1367a2562f +Author: Devin Matthews +Date: Sun May 23 18:46:56 2021 -0500 + + Add explicit compiler check for Windows. + + Check the C compiler for a predefined macro `_WIN32` to indicate (cross-)compilation for Windows. Fixes #463. + +commit 6d4ab0223d9014ac2a66d66759536aa305be5867 +Merge: 61584ded 859fb77a +Author: Devin Matthews +Date: Sun May 23 18:39:53 2021 -0500 + + Merge pull request #502 from flame/rm-rm-dupls + + Remove `rm-dupls` function in common.mk. + +commit 859fb77a320a3ace71d25a8885c23639b097a1b6 +Author: Devin Matthews +Date: Sun May 23 18:15:23 2021 -0500 + + Remove `rm-dupls` function in common.mk. + + AMD requested removal due to unclear licensing terms; original code was from stackoverflow. The function is unused but could easily be replaced by new implementation. 
+ +commit 932dfe6abb9617223bd26a249e53447169033f8c +Author: RuQing Xu +Date: Thu May 20 02:07:31 2021 +0900 + + Travis CI Revert Unnecessary Extras from 91d3636 + + - Removed `V=1` in make line + - Removed `CFLAGS` in configure line + - Restored `pwd` surrounding OOT line + +commit bd156a210d347a073a6939cc4adab3d9256c2e2b +Author: RuQing Xu +Date: Sun May 16 02:56:14 2021 +0900 + + Adjust TravisCI + + - ArmSVE don't test gemmt (seems Qemu-only problem); + - Clang use TravisCI-provided version instead of fixing to clang-8 + due to that clang-8 seems conflicting with TravisCI's clang-7. + +commit 91d3636031021af3712d14c9fcb1eb34b6fe2a31 +Author: RuQing Xu +Date: Sat May 15 17:05:16 2021 +0900 + + Travis Support Arm SVE + + - Updated distro to 20.04 focal aarch64-gcc-10. + This is minimal version required by aarch64-gcc-10. + SVE intrinsics would not compile without GCC >=10. + - x86 toolchains use official repo instead of ubuntu-toolchain-r/test. + 20.04 focal is not supported by that PPA at the moment. + - Add extra configuration-time options to .travis.yml. + - Add Arm SVE entry to .travis.yml. + +commit 61584deddf9b3af6d11a811e6e04328d22390202 +Author: RuQing Xu +Date: Wed May 19 23:52:29 2021 +0900 + + Added 512b SVE-based a64fx subconfig + SVE kernels. + + Details: + - Added 512-bit specific 'a64fx' subconfiguration that uses empirically + tuned block size by Stepan Nassyr. This subconfig also sets the sector + cache size and enables memory-tagging code in SVE gemm kernels. This + subconfig utilizes (16, k) and (10, k) DPACKM kernels. + - Added a vector-length agnostic 'armsve' subconfiguration that computes + blocksizes according to the analytical model. This part is ported from + Stepan Nassyr's repository. + - Implemented vector-length-agnostic [d/s/sh] gemm kernels for Arm SVE + at size (2*VL, 10). These kernels use unindexed FMLA instructions + because indexed FMLA takes 2 FMA units in many implementations. 
PS: There are indexed-FMLA kernels in Stepan Nassyr's repository.
+ + Details: + - Added frame/include/bli_xapi_undef.h, which explicitly undefines all + macros defined in bli_oapi_ba.h, bli_oapi_ex.h, bli_tapi_ba.h, and + bli_tapi_ex.h. (This is for safety and good cpp coding practice, not + because it fixes anything.) + - Added #include "bli_xapi_undef.h" to bli_l1v.h, bli_l1d.h, bli_l1f.h, + bli_l1m.h, bli_l2.h, bli_l3.h, and bli_util.h. + - Comment updates to bli_oapi_ba.h, bli_oapi_ex.h, bli_tapi_ba.h, and + bli_tapi_ex.h. + - Moved frame/3/bli_l3_ft_ex.h to local 'old' directory after realizing + that nothing in BLIS used those function pointer types. Also commented + out the "#include bli_l3_ft_ex.h" directive in frame/3/bli_l3.h. + +commit 5aa63cd927b22a04e581b07d0b68ef391f4f9b1f +Author: Field G. Van Zee +Date: Wed May 12 19:53:35 2021 -0500 + + Fixed typo in cpp guard in bli_util_ft.h. + + Details: + - Changed #ifdef BLIS_OAPI_BASIC to #ifdef BLIS_TAPI_BASIC in + bli_util_ft.h. This typo was causing some types to be redefined when + they weren't supposed to be. + +commit f0e8634775094584e89f1b03811ee192f2aaf67f +Author: Field G. Van Zee +Date: Wed May 12 18:45:32 2021 -0500 + + Defined eqsc, eqv, eqm to test object equality. + + Details: + - Defined eqsc, eqv, and eqm operations, which set a bool depending on + whether the two scalars, two vectors, or two matrix operands are equal + (element-wise). eqsc and eqv support implicit conjugation and eqm + supports diagonal offset, diag, uplo, and trans parameters (in a + manner consistent with other level-1m operations). These operations + are currently housed under frame/util, at least for now, because they + are not computational in nature. + - Redefined bli_obj_equals() in terms of eqsc, eqv, and eqm. + - Documented eqsc, eqv, and eqm in BLISObjectAPI.md and BLISTypedAPI.md. + Also: + - Documented getsc and setsc in both docs. + - Reordered entry for setijv in BLISTypedAPI.md, and added separator + bars to both docs. 
+ - Added missing "Observed object properties" clauses to various + levle-1v entries in BLISObjectAPI.md. + - Defined bli_apply_trans() in bli_param_macro_defs.h. + - Defined supporting _check() function, bli_l0_xxbsc_check(), in + bli_l0_check.c for eqsc. + - Programming style and whitespace updates to bli_l1m_unb_var1.c. + - Whitespace updates to bli_l0_oapi.c, bli_l1m_oapi.c + - Consolidated redundant macro redefinition for copym function pointer + type in bli_l1m_ft.h. + - Added macros to bli_oapi_ba.h, _ex.h, and bli_tapi_ba.h, _ex.h that + allow oapi and tapi source files to forego defining certain expert + functions. (Certain operations such as printv and printm do not need + to have both basic expert interfaces. This also includes eqsc, eqv, + and eqm.) + +commit 5d46dbee4a06ba5a422e19817836976f8574cb4f +Author: Devin Matthews +Date: Wed May 12 18:42:09 2021 -0500 + + Replace bli_dlamch with something less archaic (#498) + + Details: + - Added new implementations of bli_slamch() and bli_dlamch() that use + constants from the standard C library in lieu of dynamically-computed + values (via code inherited from netlib). The previous implementation + is still available when the cpp macro BLIS_ENABLE_LEGACY_LAMCH is + defined by the subconfiguration at compile-time. Thanks to Devin + Matthews for providing this patch, and to Stefano Zampini for + reporting the issue (#497) that prompted Devin to propose the patch. + +commit 6a89c7d8f9ac3f51b5b4d8ccb2630d908d951e6f +Author: Field G. Van Zee +Date: Sat May 1 18:54:48 2021 -0500 + + Defined setijv, getijv to set/get vector elements. + + Details: + - Defined getijv, setijv operations to get and set elements of a vector, + in bli_setgetijv.c and .h. + - Renamed bli_setgetij.c and .h to bli_setgetijm.c and .h, respectively. + - Added additional bounds checking to getijm and setijm to prevent + actions with negative indices. + - Added documentation to BLISObjectAPI.md and BLISTypedAPI.md for getijv + and setijv. 
+ - Added documentation to BLISTypedAPI.md for getijm and setijm, which + were inadvertently missing. + - Added a new entry to the FAQ titled "Why does BLIS have vector + (level-1v) and matrix (level-1m) variations of most level-1 + operations?" + - Comment updates. + +commit 4534daffd13ed7a8983c681d3f5e9de17c9f0b96 +Author: Field G. Van Zee +Date: Tue Apr 27 18:16:44 2021 -0500 + + Minor API breakage in bli_pack API. + + Details: + - Changed bli_pack_get_pack_a() and bli_pack_get_pack_b() so that + instead of returning a bool, they set a bool that is passed in by + address. This does break the public exported API, but I expect very + few users actually use this function. (This change is being made in + preparation for a much more extensive commit relating to error + checking.) + +commit 6a4aa986ffc060d3e64ed230afe318b82630f8b2 +Author: Field G. Van Zee +Date: Fri Apr 23 13:10:01 2021 -0500 + + Fixed typo in Table of Contents. + +commit f6424b5b82160d346a09a0fbb526981ecf66cdb3 +Author: Field G. Van Zee +Date: Fri Apr 23 13:08:06 2021 -0500 + + Added dedicated Performance section to README.md. + + Details: + - Spun off the Performance.md and PerformanceSmall.md links in the + Documentation section into a new Performance section dedicated to + those two links. (The previous entries remain redundantly listed + within Documentation section.) Thanks to Robert van de Geijn for + suggesting this change. + +commit 40ce5fd241b9ad140bf57278d440f0598d7f15d8 +Merge: 6280757b 1f3461a5 +Author: Devin Matthews +Date: Wed Apr 21 09:54:25 2021 -0500 + + Merge pull request #493 from cassiersg/patch-1 + + Fix typo in FAQ.md + +commit 1f3461a5a5a88510f913451a93e3190ec1556f39 +Author: Gaëtan Cassiers +Date: Wed Apr 21 16:49:05 2021 +0200 + + Fix typo in FAQ.md + +commit 6548cebaf55a1f9bdb8417cc89dd0444d8f9c2e4 +Author: Devin Matthews +Date: Wed Apr 14 13:00:42 2021 -0500 + + Allow clang for ThunderX2 config + + Needed for compiling on e.g. Mac M1. 
AFAIK clang supports the same -mcpu flag for ThunderX2 as gcc. + +commit 6280757be32f90fd77d8dd9357b07d9306e6f80d +Author: Field G. Van Zee +Date: Wed Apr 7 13:03:56 2021 -0500 + + Minor updates to a64fx section of Performance.md. + +commit 1e6ed823c6cd11f9b671779f3c8bdbd2bbb40f34 +Author: RuQing Xu +Date: Thu Apr 8 02:59:26 2021 +0900 + + Additional A64fx Comments (#490) + + * Performance.md Update A64fx Comments + + - Reason for ARMPL's missing data; + - Additional envs / flags for kernel selection; + - Update BLIS SRC commit. + + * Include Another Fix in armsve-cfg-vendor + + A prototype was forgotten, causing that void* pointer was not fully returned. + +commit 2688f21a5b073950f6f187c95917fdbb5aac234a +Author: Field G. Van Zee +Date: Tue Apr 6 19:02:37 2021 -0500 + + Added Fujitsu A64fx (512-bit SVE) perf results. + + Details: + - Added single-threaded and multithreaded performance results to + docs/Performance.md. These results were gathered on the "Fugaku" + Fujitsu A64fx supercomputer at the RIKEN Center for Computational + Science in Kobe, Japan. Special thanks to RuQing Xu and Stepan + Nassyr for their work in developing and optimizing A64fx support in + BLIS and RuQing for gathering the performance data that is reflected + in these new graphs. + +commit ba3ba8da83d48397162139e11337c036a631ba79 +Author: Field G. Van Zee +Date: Tue Apr 6 18:39:58 2021 -0500 + + Minor updates and fixes to test/3/octave scripts. + + Details: + - Fixed an issue where the wrong string was being passed in for the + vendor legend string. + - Changed the graph in which the legends appear. + - Updates to runthese.m. + +commit 09bd4f4f12311131938baa9f75d27e92b664d681 +Author: Field G. Van Zee +Date: Wed Mar 31 17:09:36 2021 -0500 + + Add err_t* "return" parameter to malloc functions. + + Details: + - Added an err_t* parameter to memory allocation functions including + bli_malloc_intl(), bli_calloc_intl(), bli_malloc_user(), + bli_fmalloc_align(), and bli_fmalloc_noalign(). 
Since these functions + already use the return value to return the allocated memory address, + they can't communicate errors to the caller through the return value. + This commit does not employ any error checking within these functions + or their callers, but this sets up BLIS for a more comprehensive + commit that moves in that direction. + - Moved the typedefs for malloc_ft and free_ft from bli_malloc.h to + bli_type_defs.h. This was done so that what remains of bli_malloc.h + can be included after the definition of the err_t enum. (This ordering + was needed because bli_malloc.h now contains function prototypes that + use err_t.) + - Defined bli_is_success() and bli_is_failure() static functions in + bli_param_macro_defs.h. These functions provide easy checks for error + codes and will be used more heavily in future commits. + - Unfortunately, the additional err_t* argument discussed above breaks + the API for bli_malloc_user(), which is an exported symbol in the + shared library. However, it's quite possible that the only application + that calls bli_malloc_user()--indeed, the reason it is was marked for + symbol exporting to begin with--is the BLIS testsuite. And if that's + the case, this breakage won't affect anyone. Nonetheless, the "major" + part of the so_version file has been updated accordingly to 4.0.0. + +commit f9ad55ce7e12f59930605753959fcfd41a218d8d +Merge: 04502492 90508192 +Author: Field G. Van Zee +Date: Wed Mar 31 14:20:19 2021 -0500 + + Merge branch 'master' into dev + +commit 90508192f2d6ae95adc2a3ba9f4e5bad2c8d6fd2 +Author: Devin Matthews +Date: Tue Mar 30 21:16:44 2021 -0500 + + Update do_sde.sh (#489) + + Update to a newer version of SDE, and do a direct download as it seems you don't have to click-through the license anymore. + +commit 22c6b5dc4c9cc21942f8ccc30891f9b4385a9504 +Author: Nicholai Tukanov +Date: Tue Mar 30 19:07:42 2021 -0500 + + Fixed bug in power10 microkernel I/O. 
(#488) + + Details: + - Fixed a bug in the POWER10 DGEMM kernel whereby the microkernel did + not store the microtile result correctly due to incorrect indices + calculations. (The error was introduced when I reorganized the + 'kernels/power10/3' directory.) + +commit 04502492671456b94bcdee60b9de347b6763a32d +Author: Field G. Van Zee +Date: Sun Mar 28 19:11:43 2021 -0500 + + Always stay initialized after BLAS compat calls. + + Details: + - Removed the option to finalize BLIS after every BLAS call, which also + means that BLIS would initialize at the beginning of every BLAS call. + This option never really made sense and wasn't even implemented + properly to begin with. (Because bli_init_auto() and _finalize_auto() + were implemented in terms of bli_init_once() and _finalize_once(), + respectively, the application would have only been able to call one + BLAS routine before BLIS would find itself in a unusable, permanently + uninitialized state.) Because this option was never meant for regular + use, it never made it into configure as an actual configure-time + option, and therefore this commit only removes parts of the code + affected by the cpp macro guard BLIS_ENABLE_STAY_AUTO_INITIALIZED. + +commit 3a6f41afb8197e831b6ce2f1ae7f63735685fa0a +Author: Field G. Van Zee +Date: Sat Mar 27 17:22:14 2021 -0500 + + Renamed membrk files/vars/functions to pba. + + Details: + - Renamed the files, variables, and functions relating to the packing + block allocator from its legacy name (membrk) to its current name + (pba). This more clearly contrasts the packing block allocator with + the small block allocator (sba). + - Fixed a typo in bli_pack_set_pack_b(), defined in bli_pack.c, that + caused the function to erroneously change the value of the pack_a + field of the global rntm_t instead of the pack_b field. (Apparently + nobody has used this API yet.) + - Comment updates. + +commit 36cb4116d15cfef2d42ec4a834efd4a958f261b5 +Author: Field G. 
Van Zee +Date: Sat Mar 27 15:15:09 2021 -0500 + + Switch allocator mutexes to static initialization. + + Details: + - Switched the small block allocator (sba), as defined in bli_sba.c and + bli_apool.c, to static initialization of its internal mutex. Did a + similar thing for the packing block allocator (pba), which appears as + global_membrk in bli_membrk.c. + - Commented out bli_membrk_init_mutex() and bli_membrk_finalize_mutex() + to ensure they won't be used in the future. + - In bli_thrcomm_pthreads.c and .h, removed old, commented-out cpp + blocks guarded by BLIS_USE_PTHREAD_MUTEX. + +commit 159ca6f01a5f91b93513134c9470b69ff78f5354 +Author: Field G. Van Zee +Date: Wed Mar 24 15:57:32 2021 -0500 + + Made test/3/octave scripts robust to missing data. + + Details: + - Modified the octave scripts in test/3 so that the script does not + choke when one or more of the expected OpenBLAS, Eigen, or vendor data + files is missing. (The BLIS data set, however, must be complete.) When + a file is missing, that data series is simply not included on that + particular graph. Also factored out a lot of the redundant logic from + plot_panel_4x5.m into a separate function in read_data.m. + +commit 545e6c2f6d09d023b353002a9a43b11aa0c1d701 +Author: Field G. Van Zee +Date: Mon Mar 22 17:42:33 2021 -0500 + + CHANGELOG update (0.8.1) + +commit 8535b3e11d2297854991c4272932ce4974dda629 (tag: 0.8.1) Author: Field G. Van Zee Date: Mon Mar 22 17:42:33 2021 -0500 Version file update (0.8.1) -commit e56d9f2d94ed247696dda2cbf94d2ca05c7fc089 (origin/master, origin/HEAD) +commit e56d9f2d94ed247696dda2cbf94d2ca05c7fc089 Author: Field G. Van Zee Date: Mon Mar 22 17:40:50 2021 -0500 @@ -163,7 +6051,7 @@ Date: Fri Mar 5 13:53:43 2021 -0600 information, refer to the POWER10.md document that is included in 'sandbox/power10'. 
-commit b8dcc5bc75a746807d6f8fa22dc2123c98396bf5 (origin/dev, origin/amd, dev, amd) +commit b8dcc5bc75a746807d6f8fa22dc2123c98396bf5 Author: RuQing Xu Date: Tue Mar 2 06:58:24 2021 +0800 @@ -6796,7 +12684,7 @@ Date: Mon Oct 15 16:37:39 2018 -0500 - Updated frame/include/bli_x86_asm_macros.h with additional macros (courtsey of Devin Matthews). -commit 3612ecac98a9d36c3fcd64154121d420bb69febd (origin/nested-omp-patch) +commit 3612ecac98a9d36c3fcd64154121d420bb69febd Author: Field G. Van Zee Date: Thu Oct 11 15:16:41 2018 -0500 diff --git a/CREDITS b/CREDITS index df088c7464..218d6fec71 100644 --- a/CREDITS +++ b/CREDITS @@ -3,115 +3,150 @@ BLIS framework Acknowledgements --- -The BLIS framework was primarily authored by +The BLIS framework was originally authored by - Field Van Zee @fgvanzee (The University of Texas at Austin) + Field Van Zee @fgvanzee (The University of Texas at Austin) -but many others have contributed code and feedback, including +but many others have contributed code, ideas, and feedback, including - Sameer Agarwal @sandwichmaker (Google) - Murtaza Ali (Texas Instruments) - Sajid Ali @s-sajid-ali (Northwestern University) + Sameer Agarwal @sandwichmaker (Google) + Murtaza Ali (Texas Instruments) + Sajid Ali @s-sajid-ali (Northwestern University) Erling Andersen @erling-d-andersen Alex Arslan @ararslan - Vernon Austel (IBM, T.J. Watson Research Center) - Satish Balay @balay (Argonne National Laboratory) - Matthew Brett @matthew-brett (University of Birmingham) + Vernon Austel (IBM, T.J. 
Watson Research Center) + Mohsen Aznaveh @Aznaveh (Texas A&M University) + Abhishek Bagusetty @abagusetty (Argonne National Laboratory) + Satish Balay @balay (Argonne National Laboratory) + Kihiro Bando @bandokihiro + Timo Betcke @tbetcke (University College London) + Matthew Brett @matthew-brett (University of Birmingham) Jérémie du Boisberranger @jeremiedbb - Jed Brown @jedbrown (Argonne National Laboratory) + Jed Brown @jedbrown (Argonne National Laboratory) + Alex Chiang @alexsifivetw (SiFive) Robin Christ @robinchrist Dilyn Corner @dilyn-corner - Mat Cross @matcross (NAG) + Mat Cross @matcross (NAG) + Harsh Dave @HarshDave12 (AMD) + Tim Davis @DrTimothyAldenDavis (Texas A&M University) @decandia50 - Kay Dewhurst @jkd2016 (Max Planck Institute, Halle, Germany) - Jeff Diamond (Oracle) + Daniël de Kok @danieldk (Explosion) + Kay Dewhurst @jkd2016 (Max Planck Institute, Halle, Germany) + Jeff Diamond (Oracle) Johannes Dieterich @iotamudelta Krzysztof Drewniak @krzysz00 - Marat Dukhan @Maratyszcza (Google) - Victor Eijkhout @VictorEijkhout (Texas Advanced Computing Center) - Evgeny Epifanovsky @epifanovsky (Q-Chem) + Marat Dukhan @Maratyszcza (Google) + Victor Eijkhout @VictorEijkhout (Texas Advanced Computing Center) + Evgeny Epifanovsky @epifanovsky (Q-Chem) Isuru Fernando @isuruf + James Foster @jd-foster (CSIRO) Roman Gareev @gareevroman Richard Goldschmidt @SuperFluffy Chris Goodyer - John Gunnels @jagunnels (IBM, T.J. Watson Research Center) + Alexander Grund @Flamefire + John Gunnels @jagunnels (IBM, T.J. 
Watson Research Center) Ali Emre Gülcü @Lephar - Jeff Hammond @jeffhammond (Intel) + @h-vetinari + Jeff Hammond @jeffhammond (Intel) Jacob Gorm Hansen @jacobgorm - Shivaprashanth H (Global Edge) + Shivaprashanth H (Global Edge) Jean-Michel Hautbois @jhautbois Ian Henriksen @insertinterestingnamehere (The University of Texas at Austin) - Greg Henry (Intel) + Greg Henry (Intel) Minh Quan Ho @hominhquan Matthew Honnibal @honnibal Stefan Husmann @stefanhusmann - Francisco Igual @figual (Universidad Complutense de Madrid) + Aaron Hutchinson @Aaron-Hutchinson (SiFive) + Francisco Igual @figual (Universidad Complutense de Madrid) + @j-bm + Madeesh Kannan @shadeMe Tony Kelman @tkelman - Lee Killough @leekillough (Cray) - Mike Kistler @mkistler (IBM, Austin Research Laboratory) - Ivan Korostelev @ivan23kor (University of Alberta) - Kyungmin Lee @kyungminlee (Ohio State University) + Lee Killough @leekillough (Tactical Computing Labs) + Mike Kistler @mkistler (IBM, Austin Research Laboratory) + Nick Knight @nick-knight (SiFive) + Ivan Korostelev @ivan23kor (University of Alberta) + Kyungmin Lee @kyungminlee (Ohio State University) Michael Lehn @michael-lehn + @leo4678 Shmuel Levine @ShmuelLevine @lschork2 Dave Love @loveshack - Tze Meng Low (The University of Texas at Austin) - Ye Luo @ye-luo (Argonne National Laboratory) - Ricardo Magana @magania (Hewlett Packard Enterprise) + Tze Meng Low (The University of Texas at Austin) + Ye Luo @ye-luo (Argonne National Laboratory) + Ricardo Magana @magania (Hewlett Packard Enterprise) + Madan mohan Manokar @madanm3 (AMD) Giorgos Margaritis - Bryan Marker @bamarker (The University of Texas at Austin) - Simon Lukas Märtens @ACSimon33 (RWTH Aachen University) - Devin Matthews @devinamatthews (The University of Texas at Austin) + Bryan Marker @bamarker (The University of Texas at Austin) + Simon Lukas Märtens @ACSimon33 (RWTH Aachen University) + John Mather @jmather-sesi (SideFX Software) + Devin Matthews @devinamatthews (The University 
of Texas at Austin) Stefanos Mavros @smavros + Mithun Mohan @MithunMohanKadavil (AMD) + @moon-chilled Ilknur Mustafazade @Runkli @nagsingh - Bhaskar Nallani @BhaskarNallani (AMD) - Stepan Nassyr @stepannassyr (Jülich Supercomputing Centre) - Nisanth Padinharepatt (AMD) + Bhaskar Nallani @BhaskarNallani (AMD) + Stepan Nassyr @stepannassyr (Jülich Supercomputing Centre) + Bart Oldeman @bartoldeman + Nisanth M P @nisanthmp + Nisanth Padinharepatt (AMD) Ajay Panyala @ajaypanyala - Devangi Parikh @dnparikh (The University of Texas at Austin) - Elmar Peise @elmar-peise (RWTH-Aachen) + Nick Papior @zerothi + Marc-Antoine Parent @maparent (Conversence) + Devangi Parikh @dnparikh (The University of Texas at Austin) + Elmar Peise @elmar-peise (RWTH-Aachen) Clément Pernet @ClementPernet Ilya Polkovnichenko - Jack Poulson @poulson (Stanford) + Jack Poulson @poulson (Stanford) Mathieu Poumeyrol @kali - Christos Psarras @ChrisPsa (RWTH Aachen University) + Christos Psarras @ChrisPsa (RWTH Aachen University) @pkubaj @qnerd Michael Rader @mrader1248 - Pradeep Rao @pradeeptrgit (AMD) + Pradeep Rao @pradeeptrgit (AMD) Aleksei Rechinskii + Leick Robinson @LeickR (Oracle) Karl Rupp @karlrupp - Martin Schatz (The University of Texas at Austin) + Martin Schatz (The University of Texas at Austin) Nico Schlömer @nschloe + Angelika Schwarz @angsch Rene Sitt - Tony Skjellum @tonyskjellum (The University of Tennessee at Chattanooga) - Mikhail Smelyanskiy (Intel, Parallel Computing Lab) + Tony Skjellum @tonyskjellum (The University of Tennessee at Chattanooga) + Mikhail Smelyanskiy (Intel, Parallel Computing Lab) + Barry Smith @BarrySmith (Argonne National Laboratory) Nathaniel Smith @njsmith Shaden Smith @ShadenSmith - Tyler Smith @tlrmchlsmth (The University of Texas at Austin) + Tyler Smith @tlrmchlsmth (The University of Texas at Austin) + Edward Smyth @edwsmyth (AMD) Snehith @ArcadioN09 - Paul Springer @springer13 (RWTH Aachen University) - Adam J. 
Stewart @adamjstewart (University of Illinois at Urbana-Champaign) + Paul Springer @springer13 (RWTH Aachen University) + Adam J. Stewart @adamjstewart (University of Illinois at Urbana-Champaign) Vladimir Sukarev + Harihara Sudhan S @ihariharasudhan (AMD) Chengguo Sun @chengguosun - Santanu Thangaraj (AMD) - Nicholai Tukanov @nicholaiTukanov (The University of Texas at Austin) - Rhys Ulerich @RhysU (The University of Texas at Austin) - Robert van de Geijn @rvdg (The University of Texas at Austin) - Meghana Vankadari @Meghana-vankadari (AMD) - Kiran Varaganti @kvaragan (AMD) - Natalia Vassilieva (Hewlett Packard Enterprise) - Andrew Wildman @awild82 (University of Washington) - Zhang Xianyi @xianyi (Chinese Academy of Sciences) + Atsushi Tatsuma @yoshoku + Christopher Taylor @ct-clmsn (Tactical Computing Labs) + Santanu Thangaraj (AMD) + Nicholai Tukanov @nicholaiTukanov (The University of Texas at Austin) + Rhys Ulerich @RhysU (The University of Texas at Austin) + Robert van de Geijn @rvdg (The University of Texas at Austin) + Meghana Vankadari @Meghana-vankadari (AMD) + Kiran Varaganti @kvaragan (AMD) + Natalia Vassilieva (Hewlett Packard Enterprise) + Andrew Wildman @awild82 (University of Washington) + Zhang Xianyi @xianyi (Chinese Academy of Sciences) Benda Xu @heroxbd - Guodong Xu @docularxu (Linaro.org) - RuQing Xu @xrq-phys (The University of Tokyo) + Guodong Xu @docularxu (Linaro.org) + RuQing Xu @xrq-phys (The University of Tokyo) + Srinivas Yadav @srinivasyadav18 Costas Yamin @cosstas - Chenhan Yu @ChenhanYu (The University of Texas at Austin) - Roman Yurchak @rth (Symerio) + Michael Yeh @myeh01 (SiFive) + Chenhan Yu @ChenhanYu (The University of Texas at Austin) + Roman Yurchak @rth (Symerio) Stefano Zampini @stefanozampini M. 
Zhou @cdluminate + Igor Zhuravlov @jip (Far Eastern Federal University) + @AngryLoki BLIS's development was partially funded by grants from industry partners, including diff --git a/INSTALL b/INSTALL index 9adc438674..75850a96ba 100644 --- a/INSTALL +++ b/INSTALL @@ -17,11 +17,14 @@ viewing the file over GitHub via a web browser: This document will always contain the most up-to-date information related to instantiating a BLIS library from the framework source code. If you have any further questions or wish to provide feedback, please contact the BLIS -community by posting your message to the BLIS developer's mailing list: +community by joining our Discord community! Instructions for +joining may be found in: - https://groups.google.com/d/forum/blis-devel + docs/Discord.md -Thanks for your interest in the BLIS framework! +or in rendered form at: + + https://github.com/flame/blis/blob/master/docs/Discord.md -Field Van Zee +Thanks for your interest in the BLIS framework! diff --git a/LICENSE b/LICENSE index b9cde54b85..8168814a9b 100644 --- a/LICENSE +++ b/LICENSE @@ -6,6 +6,7 @@ while other portions are copyrighted by Hewlett Packard Enterprise Development LP Advanced Micro Devices, Inc. + Oracle Corporation with some overlap. Please see file-level license headers for file-specific copyright info. All parties provide their portions of the code under the @@ -13,9 +14,10 @@ copyright info. All parties provide their portions of the code under the --- -Copyright (C) 2018, The University of Texas at Austin +Copyright (C) 2012 - 2022, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
+Copyright (C) 2022, Oracle Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/Makefile b/Makefile index b5e036744c..c686fd12c3 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2022, Advanced Micro Devices, Inc. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -56,11 +57,12 @@ testblis testblis-fast testblis-md testblis-salt \ check checkblas \ checkblis checkblis-fast checkblis-md checkblis-salt \ - install-headers install-libs install-lib-symlinks \ + install-headers install-helper-headers install-libs install-lib-symlinks \ showconfig \ clean cleanmk cleanh cleanlib distclean \ cleantest cleanblastest cleanblistest \ changelog \ + symbols \ install uninstall uninstall-old \ uninstall-libs uninstall-lib-symlinks uninstall-headers \ uninstall-old-libs uninstall-lib-symlinks uninstall-old-headers @@ -114,6 +116,7 @@ BASE_OBJ_CONFIG_PATH := $(BASE_OBJ_PATH)/$(CONFIG_DIR) BASE_OBJ_FRAME_PATH := $(BASE_OBJ_PATH)/$(FRAME_DIR) BASE_OBJ_REFKERN_PATH := $(BASE_OBJ_PATH)/$(REFKERN_DIR) BASE_OBJ_KERNELS_PATH := $(BASE_OBJ_PATH)/$(KERNELS_DIR) +BASE_OBJ_ADDON_PATH := $(BASE_OBJ_PATH)/$(ADDON_DIR) BASE_OBJ_SANDBOX_PATH := $(BASE_OBJ_PATH)/$(SANDBOX_DIR) # --- Define install target names for static libraries --- @@ -210,15 +213,53 @@ MK_REFKERN_OBJS := $(foreach arch, $(CONFIG_LIST), \ # Generate object file paths for all of the portable framework source code. MK_FRAME_OBJS := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH)) +# Generate object file paths for the addon source code. If one or more addons +# were not enabled at configure-time, these variables will be empty. 
+# NOTE: We separate the source and objects into kernel and non-kernel lists. +MK_ADDON_KERS_SRC := $(foreach addon, $(ADDON_LIST), \ + $(filter $(ADDON_PATH)/$(addon)/$(KERNELS_DIR)/%, \ + $(MK_ADDON_SRC)) \ + ) +MK_ADDON_OTHER_SRC := $(foreach addon, $(ADDON_LIST), \ + $(filter-out $(ADDON_PATH)/$(addon)/$(KERNELS_DIR)/%, \ + $(MK_ADDON_SRC)) \ + ) +MK_ADDON_KERS_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_KERS_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH)) +MK_ADDON_OTHER_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_OTHER_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH)) +MK_ADDON_OBJS := $(MK_ADDON_KERS_OBJS) $(MK_ADDON_OTHER_OBJS) + # Generate object file paths for the sandbox source code. If a sandbox was not # enabled a configure-time, this variable will we empty. MK_SANDBOX_OBJS := $(call gen-obj-paths-from-src,$(SANDBOX_SRC_SUFS),$(MK_SANDBOX_SRC),$(SANDBOX_PATH),$(BASE_OBJ_SANDBOX_PATH)) +# AMD has chosen to introduce AOCL-specific optimizations to certain BLIS +# framework files that are otherwise intended to remain generic. Upstream +# developers of vanilla BLIS have agreed to integrate some of these +# optimizations, but in a way that keeps the AOCL-specific code segregated +# in separate files containing the suffix '_amd'. For example, the BLAS +# compatibility layer in vanilla BLIS contains a generic file named +# 'bla_gemm.c'. AMD's version of this file is named 'bla_gemm_amd.c'. +# Only one or the other is ever built and included in libblis. Currently, +# these files are chosen automatically based on the target configuration. +ifeq ($(ENABLE_AMD_FRAME_TWEAKS),yes) +# Build is being done for AMD platforms; remove the objects which DO NOT have +# an "_amd" suffix. 
+MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS)) +FILES_TO_REMOVE := $(subst _amd.o,.o, $(MK_FRAME_AMD_OBJS)) +MK_FRAME_OBJS := $(filter-out $(FILES_TO_REMOVE), $(MK_FRAME_OBJS)) +else +# Build is being done for non-AMD platforms; remove the objects which DO have +# an "_amd" suffix. +MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS)) +MK_FRAME_OBJS := $(filter-out $(MK_FRAME_AMD_OBJS), $(MK_FRAME_OBJS)) +endif + # Combine all of the object files into some readily-accessible variables. MK_BLIS_OBJS := $(MK_CONFIG_OBJS) \ $(MK_KERNELS_OBJS) \ $(MK_REFKERN_OBJS) \ $(MK_FRAME_OBJS) \ + $(MK_ADDON_OBJS) \ $(MK_SANDBOX_OBJS) # Optionally filter out the BLAS and CBLAS compatibility layer object files. @@ -240,13 +281,17 @@ endif # --- Monolithic header definitions -------------------------------------------- # -# Define a list of headers to install. The default is to only install blis.h. -HEADERS_TO_INSTALL := $(BLIS_H_FLAT) +# Define lists of headers to create/install. The default is to only +# create/install blis.h. +HEADERS_TO_BUILD := $(BLIS_H_FLAT) +HEADERS_TO_INSTALL := $(BLIS_H_FLAT) -# If CBLAS is enabled, we also install cblas.h so the user does not need to -# change their source code to #include "blis.h" in order to access the CBLAS -# function prototypes and enums. +# If CBLAS is enabled, we also create/install cblas.h. This allows the user to +# continue using #include "cblas.h" in their application, if they wish. (NOTE: +# The user can also access CBLAS definitions and function prototypes by +# #include'ing "blis.h".) ifeq ($(MK_ENABLE_CBLAS),yes) +HEADERS_TO_BUILD += $(CBLAS_H_FLAT) HEADERS_TO_INSTALL += $(CBLAS_H_FLAT) endif @@ -256,6 +301,19 @@ ifeq ($(INSTALL_HH),yes) HEADERS_TO_INSTALL += $(wildcard $(VEND_CPP_PATH)/*.hh) endif +# Define a list of so-called helper headers to install. 
These helper headers +# are very simple headers that go one directory up from INCDIR/blis (which +# by default is PREFIX/include/blis, where PREFIX is the install prefix). The +# default is to only install the blis.h helper header. +HELP_HEADERS_TO_INSTALL := $(HELP_BLIS_H_PATH) +HELP_HEADERS_INSTALLED := $(INSTALL_INCDIR)/$(BLIS_H) + +# If CBLAS is enabled, we also install the cblas.h helper header. +ifeq ($(MK_ENABLE_CBLAS),yes) +HELP_HEADERS_TO_INSTALL += $(HELP_CBLAS_H_PATH) +HELP_HEADERS_INSTALLED += $(INSTALL_INCDIR)/$(CBLAS_H) +endif + # @@ -264,7 +322,31 @@ endif # Define a list of makefile fragments to install. FRAGS_TO_INSTALL := $(CONFIG_MK_FILE) \ - $(COMMON_MK_FILE) + $(COMMON_MK_FILE) \ + $(DIST_PATH)/build/gen-make-frags/gen-make-frag.sh \ + $(DIST_PATH)/build/gen-make-frags/fragment.mk \ + $(DIST_PATH)/build/gen-make-frags/ignore_list \ + $(DIST_PATH)/build/gen-make-frags/special_list \ + $(DIST_PATH)/build/gen-make-frags/suffix_list \ + $(DIST_PATH)/build/flatten-headers.py \ + $(DIST_PATH)/build/mirror-tree.sh \ + $(DIST_PATH)/config_registry \ + $(DIST_PATH)/build/detect/iset/avx.s \ + $(DIST_PATH)/build/detect/iset/avx512dq.s \ + $(DIST_PATH)/build/detect/iset/avx512f.s \ + $(DIST_PATH)/build/detect/iset/fma3.s \ + $(DIST_PATH)/build/detect/iset/fma4.s + +# Define a list of plugin makefile fragments to install. 
+PLUGIN_FRAGS_TO_INSTALL := $(DIST_PATH)/build/plugin/bli_plugin_init_ref.c \ + $(DIST_PATH)/build/plugin/bli_plugin_init_zen3.c \ + $(DIST_PATH)/build/plugin/bli_plugin_register.c \ + $(DIST_PATH)/build/plugin/my_kernel_1_ref.c \ + $(DIST_PATH)/build/plugin/my_kernel_2_ref.c \ + $(DIST_PATH)/build/plugin/my_kernel_1_zen3.c \ + $(DIST_PATH)/build/plugin/bli_plugin.h.in \ + $(DIST_PATH)/build/plugin/config.mk.in \ + $(DIST_PATH)/build/plugin/Makefile PC_IN_FILE := blis.pc.in PC_OUT_FILE := blis.pc @@ -457,16 +539,29 @@ ifeq ($(ALL_MAKE_DEFS_MK_PRESENT),no) endif +# --- Shared/dynamic libblis symbol file creation/refresh --- + +symbols: check-env $(SYM_FILE) + +$(SYM_FILE): $(HEADERS_TO_INSTALL) +ifeq ($(ENABLE_VERBOSE),yes) + $(GEN_SYMS) > $(SYM_FILE) +else + @echo "Updating $(SYM_FILE)" + @$(GEN_SYMS) > $(SYM_FILE) +endif + + # --- Consolidated blis.h header creation --- flat-header: check-env $(BLIS_H_FLAT) $(BLIS_H_FLAT): $(ALL_H99_FILES) ifeq ($(ENABLE_VERBOSE),yes) - $(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" + $(FLATTEN_H) -l -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" else @echo -n "Generating monolithic blis.h" - @$(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" + @$(FLATTEN_H) -l -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" @echo "Generated $@" endif @@ -474,12 +569,16 @@ endif flat-cblas-header: check-env $(CBLAS_H_FLAT) -$(CBLAS_H_FLAT): $(FRAME_H99_FILES) +# Note that the flattened blis.h is a prerequisite of flattening cblas.h. This +# is done so that the two headers are built sequentially even when using +# 'make -j[n]'. Otherwise, the output from the two processes can become +# interleaved, which looks awkward/confusing. 
+$(CBLAS_H_FLAT): $(FRAME_H99_FILES) $(BLIS_H_FLAT) ifeq ($(ENABLE_VERBOSE),yes) - $(FLATTEN_H) -c -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" + $(FLATTEN_H) -l -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" else @echo -n "Generating monolithic cblas.h" - @$(FLATTEN_H) -c -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" + @$(FLATTEN_H) -l -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" @echo "Generated $@" endif @@ -493,7 +592,7 @@ endif # first argument: a configuration name from config_list, used to look up the # CFLAGS to use during compilation. define make-config-rule -$(BASE_OBJ_CONFIG_PATH)/$(1)/%.o: $(CONFIG_PATH)/$(1)/%.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) +$(BASE_OBJ_CONFIG_PATH)/$(1)/%.o: $(CONFIG_PATH)/$(1)/%.c $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-config-cflags-for,$(1)) -c $$< -o $$@ else @@ -505,18 +604,28 @@ endef # first argument: a configuration name from the union of config_list and # config_name, used to look up the CFLAGS to use during compilation. define make-frame-rule -$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) +$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.c $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-frame-cflags-for,$(1)) -c $$< -o $$@ else @echo "Compiling $$@" $(call get-frame-text-for,$(1)) @$(CC) $(call get-frame-cflags-for,$(1)) -c $$< -o $$@ endif + +ifneq ($(findstring hpx,$(THREADING_MODEL)),) +$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.cpp $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-frame-cxxflags-for,$(1)) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-frame-cxxtext-for,$(1)) + @$(CXX) $(call get-frame-cxxflags-for,$(1)) -c $$< -o $$@ +endif +endif endef # first argument: a kernel set (name) being targeted (e.g. haswell). 
define make-refinit-rule -$(BASE_OBJ_REFKERN_PATH)/$(1)/bli_cntx_$(1)_ref.o: $(REFKERN_PATH)/bli_cntx_ref.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) +$(BASE_OBJ_REFKERN_PATH)/$(1)/bli_cntx_$(1)_ref.o: $(REFKERN_PATH)/bli_cntx_ref.c $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-refinit-cflags-for,$(1)) -c $$< -o $$@ else @@ -527,7 +636,7 @@ endef # first argument: a kernel set (name) being targeted (e.g. haswell). define make-refkern-rule -$(BASE_OBJ_REFKERN_PATH)/$(1)/%_$(1)_ref.o: $(REFKERN_PATH)/%_ref.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) +$(BASE_OBJ_REFKERN_PATH)/$(1)/%_$(1)_ref.o: $(REFKERN_PATH)/%_ref.c $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-refkern-cflags-for,$(1)) -c $$< -o $$@ else @@ -540,7 +649,7 @@ endef # second argument: the configuration whose CFLAGS we should use in compilation. # third argument: the kernel file suffix being considered. define make-kernels-rule -$(BASE_OBJ_KERNELS_PATH)/$(1)/%.o: $(KERNELS_PATH)/$(1)/%.$(3) $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) +$(BASE_OBJ_KERNELS_PATH)/$(1)/%.o: $(KERNELS_PATH)/$(1)/%.$(3) $(HEADERS_TO_BUILD) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-kernel-cflags-for,$(2)) -c $$< -o $$@ else @@ -551,8 +660,49 @@ endef # first argument: a configuration name from the union of config_list and # config_name, used to look up the CFLAGS to use during compilation. +# second argument: the C99 addon file suffix being considered. 
+define make-c99-addon-rule +$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(HEADERS_TO_BUILD) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS) +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-addon-c99text-for,$(1)) + @$(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@ +endif +endef + +# first argument: a configuration name from the union of config_list and +# config_name, used to look up the CFLAGS to use during compilation. +# second argument: the C99 addon file suffix being considered. +# third argument: the name of the addon being considered. +define make-c99-addon-kers-rule +$(BASE_OBJ_ADDON_PATH)/$(3)/$(KERNELS_DIR)/%.o: $(ADDON_PATH)/$(3)/$(KERNELS_DIR)/%.$(2) $(HEADERS_TO_BUILD) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS) +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(call get-addon-kernel-c99flags-for,$(1)) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-addon-kernel-text-for,$(1)) + @$(CC) $(call get-addon-kernel-c99flags-for,$(1)) -c $$< -o $$@ +endif +endef + +# first argument: a configuration name from the union of config_list and +# config_name, used to look up the CFLAGS to use during compilation. +# second argument: the C++ addon file suffix being considered. +define make-cxx-addon-rule +$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(HEADERS_TO_BUILD) $(ADDON_HXX_FILES) $(MAKE_DEFS_MK_PATHS) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-addon-cxxtext-for,$(1)) + @$(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@ +endif +endef + +# first argument: a configuration name from the union of config_list and +# config_name, used to look up the CFLAGS to use during compilation. +# second argument: the C99 sandbox file suffix being considered. 
define make-c99-sandbox-rule -$(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(BLIS_H_FLAT) $(SANDBOX_H99_FILES) $(MAKE_DEFS_MK_PATHS) +$(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(HEADERS_TO_BUILD) $(SANDBOX_H99_FILES) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-sandbox-c99flags-for,$(1)) -c $$< -o $$@ else @@ -561,8 +711,11 @@ else endif endef +# first argument: a configuration name from the union of config_list and +# config_name, used to look up the CFLAGS to use during compilation. +# second argument: the C++ sandbox file suffix being considered. define make-cxx-sandbox-rule -$(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(BLIS_H_FLAT) $(SANDBOX_HXX_FILES) $(MAKE_DEFS_MK_PATHS) +$(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(HEADERS_TO_BUILD) $(SANDBOX_HXX_FILES) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) $(CXX) $(call get-sandbox-cxxflags-for,$(1)) -c $$< -o $$@ else @@ -601,6 +754,22 @@ $(foreach conf, $(CONFIG_LIST), $(eval $(call make-refkern-rule,$(conf)))) $(foreach suf, $(KERNELS_SRC_SUFS), \ $(foreach kset, $(KERNEL_LIST), $(eval $(call make-kernels-rule,$(kset),$(call get-config-for-kset,$(kset)),$(suf))))) +# Instantiate the build rule for C addon files. Use the CFLAGS for the +# configuration family. +$(foreach suf, $(ADDON_C99_SUFS), \ +$(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-rule,$(conf),$(suf))))) + +# Instantiate the build rule for C addon/kernels files. Use the CFLAGS for the +# configuration family. +$(foreach addon, $(ADDON_LIST), \ +$(foreach suf, $(ADDON_C99_SUFS), \ +$(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-kers-rule,$(conf),$(suf),$(addon)))))) + +# Instantiate the build rule for C++ addon files. Use the CFLAGS for the +# configuration family. +$(foreach suf, $(ADDON_CXX_SUFS), \ +$(foreach conf, $(CONFIG_NAME), $(eval $(call make-cxx-addon-rule,$(conf),$(suf))))) + # Instantiate the build rule for C sandbox files. 
Use the CFLAGS for the # configuration family. $(foreach suf, $(SANDBOX_C99_SUFS), \ @@ -697,7 +866,7 @@ blastest-bin: check-env blastest-f2c $(BLASTEST_DRV_BIN_PATHS) blastest-run: $(BLASTEST_DRV_BINS_R) # f2c object file rule. -$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_F2C_SRC_PATH)/%.c $(BLIS_H_FLAT) +$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_F2C_SRC_PATH)/%.c $(HEADERS_TO_BUILD) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) $(BLAT_CFLAGS) -c $< -o $@ else @@ -706,7 +875,7 @@ else endif # driver object file rule. -$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_DRV_SRC_PATH)/%.c $(BLIS_H_FLAT) +$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_DRV_SRC_PATH)/%.c $(HEADERS_TO_BUILD) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) $(BLAT_CFLAGS) -c $< -o $@ else @@ -787,7 +956,7 @@ testsuite: testsuite-run testsuite-bin: check-env $(TESTSUITE_BIN) # Object file rule. -$(BASE_OBJ_TESTSUITE_PATH)/%.o: $(TESTSUITE_SRC_PATH)/%.c $(BLIS_H_FLAT) +$(BASE_OBJ_TESTSUITE_PATH)/%.o: $(TESTSUITE_SRC_PATH)/%.c $(HEADERS_TO_BUILD) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) -c $< -o $@ else @@ -910,8 +1079,9 @@ endif # --- Install header rules --- -install-headers: check-env $(MK_INCL_DIR_INST) +install-headers: check-env $(MK_INCL_DIR_INST) install-helper-headers +# Rule for installing main headers. $(MK_INCL_DIR_INST): $(HEADERS_TO_INSTALL) $(CONFIG_MK_FILE) ifeq ($(ENABLE_VERBOSE),yes) $(MKDIR) $(@) @@ -922,33 +1092,74 @@ else @$(INSTALL) -m 0644 $(HEADERS_TO_INSTALL) $(@) endif +install-helper-headers: check-env $(HELP_HEADERS_INSTALLED) + +# A rule to install a helper header file. 
+define make-helper-header-rule +$(INSTALL_INCDIR)/$(notdir $(1)): $(BUILD_PATH)/$(notdir $(1)) $(CONFIG_MK_FILE) +ifeq ($(ENABLE_VERBOSE),yes) + $(MKDIR) $(INSTALL_INCDIR) + $(INSTALL) -m 0644 $$(<) $$(@) +else + @$(MKDIR) $(INSTALL_INCDIR) + @echo "Installing $$(@F) helper header into $(INSTALL_INCDIR)/" + @$(INSTALL) -m 0644 $$(<) $$(@) +endif +endef + +# Instantiate the rule above for each helper header file to install. +$(foreach h, $(HELP_HEADERS_TO_INSTALL), $(eval $(call make-helper-header-rule,$(h)))) # --- Install share rules --- install-share: check-env $(MK_SHARE_DIR_INST) $(PC_SHARE_DIR_INST) -$(MK_SHARE_DIR_INST): $(FRAGS_TO_INSTALL) $(CONFIG_MK_FILE) +$(MK_SHARE_DIR_INST): $(CONFIGURE_FILE) $(FRAGS_TO_INSTALL) $(PLUGIN_FRAGS_TO_INSTALL) $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) ifeq ($(ENABLE_VERBOSE),yes) $(MKDIR) $(@) - $(INSTALL) -m 0644 $(FRAGS_TO_INSTALL) $(@) - $(MKDIR) -p $(@)/$(CONFIG_DIR)/$(CONFIG_NAME) - $(INSTALL) -m 0644 $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) \ - $(@)/$(CONFIG_DIR)/$(CONFIG_NAME) + $(MKDIR) $(@)/plugin + $(INSTALL) -m 0755 $(filter %.sh,$(FRAGS_TO_INSTALL)) $(@) + $(INSTALL) -m 0644 $(filter-out %.sh,$(FRAGS_TO_INSTALL)) $(@) + $(INSTALL) -m 0644 $(PLUGIN_FRAGS_TO_INSTALL) $(@)/plugin + $(INSTALL) -m 0755 $(CONFIGURE_FILE) $(@)/configure-plugin +# $(MKDIR) -p $(@)/$(CONFIG_DIR)/$(CONFIG_NAME) +# $(INSTALL) -m 0644 $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) \ +# $(@)/$(CONFIG_DIR)/$(CONFIG_NAME) + for THIS_CONFIG in $(FULL_CONFIG_LIST); do \ + $(MKDIR) -p $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \ + $(INSTALL) -m 0644 $(CONFIG_DIR)/$$THIS_CONFIG/$(MAKE_DEFS_FILE) \ + $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \ + $(INSTALL) -m 0644 $(CONFIG_DIR)/$$THIS_CONFIG/bli_kernel_defs_$$THIS_CONFIG.h \ + $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \ + done else @$(MKDIR) $(@) + @$(MKDIR) $(@)/plugin @echo "Installing $(notdir $(FRAGS_TO_INSTALL)) into $(@)/" - @$(INSTALL) -m 0644 $(FRAGS_TO_INSTALL) $(@) - @$(MKDIR) -p 
$(@)/$(CONFIG_DIR)/$(CONFIG_NAME) - @echo "Installing $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) into $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)" - @$(INSTALL) -m 0644 $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) \ - $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)/ -endif - -$(PC_SHARE_DIR_INST): $(PC_IN_FILE) + @$(INSTALL) -m 0755 $(filter %.sh,$(FRAGS_TO_INSTALL)) $(@) + @$(INSTALL) -m 0644 $(filter-out %.sh,$(FRAGS_TO_INSTALL)) $(@) + @echo "Installing $(notdir $(PLUGIN_FRAGS_TO_INSTALL)) into $(@)/plugin/" + @$(INSTALL) -m 0644 $(PLUGIN_FRAGS_TO_INSTALL) $(@)/plugin + @echo "Installing $(CONFIGURE_FILE) into $(@)/configure-plugin" + @$(INSTALL) -m 0755 $(CONFIGURE_FILE) $(@)/configure-plugin +# @$(MKDIR) -p $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)#\ +# @echo "Installing $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) into $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)" +# @$(INSTALL) -m 0644 $(CONFIG_DIR)/$(CONFIG_NAME)/$(MAKE_DEFS_FILE) \ +# $(@)/$(CONFIG_DIR)/$(CONFIG_NAME)/ + @for THIS_CONFIG in $(FULL_CONFIG_LIST); do \ + $(MKDIR) -p $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \ + echo "Installing $(CONFIG_DIR)/$$THIS_CONFIG/$(MAKE_DEFS_FILE) into $(@)/$(CONFIG_DIR)/$$THIS_CONFIG"; \ + $(INSTALL) -m 0644 $(CONFIG_DIR)/$$THIS_CONFIG/$(MAKE_DEFS_FILE) \ + $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \ + echo "Installing $(CONFIG_DIR)/$$THIS_CONFIG/bli_kernel_defs_$$THIS_CONFIG.h into $(@)/$(CONFIG_DIR)/$$THIS_CONFIG"; \ + $(INSTALL) -m 0644 $(CONFIG_DIR)/$$THIS_CONFIG/bli_kernel_defs_$$THIS_CONFIG.h \ + $(@)/$(CONFIG_DIR)/$$THIS_CONFIG; \ + done +endif + +$(PC_SHARE_DIR_INST): $(PC_IN_FILE) +ifeq ($(ENABLE_VERBOSE),yes) $(MKDIR) $(@) -ifeq ($(ENABLE_VERBOSE),no) - @echo "Installing $(PC_OUT_FILE) into $(@)/" -endif $(shell cat "$(PC_IN_FILE)" \ | sed -e "s#@PACKAGE_VERSION@#$(VERSION)#g" \ | sed -e "s#@prefix@#$(prefix)#g" \ @@ -958,6 +1169,19 @@ endif | sed -e "s#@LDFLAGS@#$(LDFLAGS)#g" \ > "$(PC_OUT_FILE)" ) $(INSTALL) -m 0644 $(PC_OUT_FILE) $(@) +else + @$(MKDIR) $(@) + @echo "Installing $(PC_OUT_FILE) into $(@)/" 
+ @$(shell cat "$(PC_IN_FILE)" \ + | sed -e "s#@PACKAGE_VERSION@#$(VERSION)#g" \ + | sed -e "s#@prefix@#$(prefix)#g" \ + | sed -e "s#@exec_prefix@#$(exec_prefix)#g" \ + | sed -e "s#@libdir@#$(libdir)#g" \ + | sed -e "s#@includedir@#$(includedir)#g" \ + | sed -e "s#@LDFLAGS@#$(LDFLAGS)#g" \ + > "$(PC_OUT_FILE)" ) + @$(INSTALL) -m 0644 $(PC_OUT_FILE) $(@) +endif # --- Install library rules --- @@ -1049,24 +1273,25 @@ endif # ifeq ($(IS_WIN),no) # --- Query current configuration --- showconfig: check-env - @echo "configuration family: $(CONFIG_NAME)" - @echo "sub-configurations: $(CONFIG_LIST)" - @echo "requisite kernels sets: $(KERNEL_LIST)" - @echo "kernel-to-config map: $(KCONFIG_MAP)" + @echo "configuration family: $(CONFIG_NAME)" + @echo "sub-configurations: $(CONFIG_LIST)" + @echo "requisite kernels sets: $(KERNEL_LIST)" + @echo "kernel-to-config map: $(KCONFIG_MAP)" @echo "-------------------------" - @echo "BLIS version string: $(VERSION)" - @echo ".so major version: $(SO_MAJOR)" - @echo ".so minor.build vers: $(SO_MINORB)" - @echo "install libdir: $(INSTALL_LIBDIR)" - @echo "install includedir: $(INSTALL_INCDIR)" - @echo "install sharedir: $(INSTALL_SHAREDIR)" - @echo "debugging status: $(DEBUG_TYPE)" - @echo "multithreading status: $(THREADING_MODEL)" - @echo "enable BLAS API? $(MK_ENABLE_BLAS)" - @echo "enable CBLAS API? $(MK_ENABLE_CBLAS)" - @echo "build static library? $(MK_ENABLE_STATIC)" - @echo "build shared library? $(MK_ENABLE_SHARED)" - @echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)" + @echo "BLIS version string: $(VERSION)" + @echo ".so major version: $(SO_MAJOR)" + @echo ".so minor.build vers: $(SO_MINORB)" + @echo "install libdir: $(INSTALL_LIBDIR)" + @echo "install includedir: $(INSTALL_INCDIR)" + @echo "install sharedir: $(INSTALL_SHAREDIR)" + @echo "debugging status: $(DEBUG_TYPE)" + @echo "enable AddressSanitizer? $(MK_ENABLE_ASAN)" + @echo "enabled threading model(s): $(THREADING_MODEL)" + @echo "enable BLAS API? 
$(MK_ENABLE_BLAS)" + @echo "enable CBLAS API? $(MK_ENABLE_CBLAS)" + @echo "build static library? $(MK_ENABLE_STATIC)" + @echo "build shared library? $(MK_ENABLE_SHARED)" + @echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)" # --- Clean rules --- @@ -1078,6 +1303,9 @@ ifeq ($(ENABLE_VERBOSE),yes) - $(FIND) $(FRAME_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) - $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) - $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +ifneq ($(ADDON_LIST),) + - $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +endif ifneq ($(SANDBOX),) - $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) endif @@ -1090,6 +1318,10 @@ else @- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) @echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)" @- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +ifneq ($(ADDON_LIST),) + @echo "Removing makefile fragments from $(ADDON_FRAG_PATH)" + @- $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +endif ifneq ($(SANDBOX),) @echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)" @- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) @@ -1210,6 +1442,7 @@ endif # IS_CONFIGURED distclean: cleanmk cleanh cleanlib cleantest ifeq ($(IS_CONFIGURED),yes) ifeq ($(ENABLE_VERBOSE),yes) + - $(RM_F) $(BLIS_ADDON_H) - $(RM_F) $(BLIS_CONFIG_H) - $(RM_F) $(CONFIG_MK_FILE) - $(RM_F) $(PC_OUT_FILE) @@ -1217,6 +1450,8 @@ ifeq ($(ENABLE_VERBOSE),yes) - $(RM_RF) $(LIB_DIR) - $(RM_RF) $(INCLUDE_DIR) else + @echo "Removing $(BLIS_ADDON_H)" + @$(RM_F) $(BLIS_ADDON_H) @echo "Removing $(BLIS_CONFIG_H)" @$(RM_F) $(BLIS_CONFIG_H) @echo "Removing $(CONFIG_MK_FILE)" @@ -1266,9 +1501,12 @@ endif uninstall-headers: check-env ifeq ($(ENABLE_VERBOSE),yes) - $(RM_RF) $(MK_INCL_DIR_INST) + - $(RM_RF) $(HELP_HEADERS_INSTALLED) else @echo "Uninstalling directory '$(notdir 
$(MK_INCL_DIR_INST))' from $(dir $(MK_INCL_DIR_INST))" @- $(RM_RF) $(MK_INCL_DIR_INST) + @echo "Uninstalling $(notdir $(HELP_HEADERS_INSTALLED)) from $(dir $(INSTALL_INCDIR))" + @- $(RM_RF) $(HELP_HEADERS_INSTALLED) endif uninstall-share: check-env diff --git a/README.md b/README.md index f4ec4acb30..27900ac06a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,15 @@ +_Recipient of the **[2023 James H. Wilkinson Prize for Numerical Software](https://www.siam.org/prizes-recognition/major-prizes-lectures/detail/james-h-wilkinson-prize-for-numerical-software)**_ + +_Recipient of the **[2020 SIAM Activity Group on Supercomputing Best Paper Prize](https://www.siam.org/prizes-recognition/activity-group-prizes/detail/siag-sc-best-paper-prize)**_ + + ![The BLIS cat is sleeping.](http://www.cs.utexas.edu/users/field/blis_cat.png) -[![Build Status](https://api.travis-ci.com/flame/blis.svg?branch=master)](https://app.travis-ci.com/github/flame/blis) -[![Build Status](https://ci.appveyor.com/api/projects/status/github/flame/blis?branch=master&svg=true)](https://ci.appveyor.com/project/shpc/blis/branch/master) +[![Build Status (CircleCI)](https://dl.circleci.com/status-badge/img/gh/flame/blis/tree/master.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/flame/blis/tree/master) +[![Build Status (TravisCI)](https://api.travis-ci.com/flame/blis.svg?branch=master)](https://app.travis-ci.com/github/flame/blis) +[![Build Status (Appveyor)](https://ci.appveyor.com/api/projects/status/github/flame/blis?branch=master&svg=true)](https://ci.appveyor.com/project/shpc/blis/branch/master) + +[Discord logo](docs/Discord.md) Contents -------- @@ -13,18 +21,21 @@ Contents * **[Key Features](#key-features)** * **[How to Download BLIS](#how-to-download-blis)** * **[Getting Started](#getting-started)** -* **[Performance](#performance)** +* **[Example Code](#example-code)** * **[Documentation](#documentation)** +* **[Performance](#performance)** * **[External 
Packages](#external-packages)** * **[Discussion](#discussion)** * **[Contributing](#contributing)** * **[Citations](#citations)** +* **[Awards](#awards)** * **[Funding](#funding)** Introduction ------------ -BLIS is a portable software framework for instantiating high-performance +BLIS is an [award-winning](#awards) +portable software framework for instantiating high-performance BLAS-like dense linear algebra libraries. The framework was designed to isolate essential kernels of computation that, when optimized, immediately enable optimized implementations of most of its commonly used and computationally @@ -71,7 +82,9 @@ The BLIS framework is primarily developed and maintained by individuals in the [Science of High-Performance Computing](http://shpc.ices.utexas.edu/) (SHPC) group in the [Oden Institute for Computational Engineering and Sciences](https://www.oden.utexas.edu/) -at [The University of Texas at Austin](https://www.utexas.edu/). +at [The University of Texas at Austin](https://www.utexas.edu/) +and in the [Matthews Research Group](https://matthewsresearchgroup.webstarts.com/) +at [Southern Methodist University](https://www.smu.edu/). Please visit the [SHPC](http://shpc.ices.utexas.edu/) website for more information about our research group, such as a list of [people](http://shpc.ices.utexas.edu/people.html) @@ -94,6 +107,53 @@ all of which are available for free via the [edX platform](http://www.edx.org/). What's New ---------- + * **Plugin feature now available!** BLIS addons (see below) provided a way to +quickly extend BLIS's operation support or define new custom BLIS APIs for your application. +BLIS plugins extend this support to completely external code, needing only an installed BLIS +package (no source required). BLIS plugins also allow users to define their own kernels +and blocksizes, combined with the cross-architecture support provided by the BLIS framework. 
+Finally, user plugins can utilize the new API for modifying the BLIS "control tree" which +defines the mathematical operation to be computed, as well as information controlling packing, +partitioning, etc. Users can now modify the control tree to implement new linear algebra +operations not already included in BLIS. See the [documentation](docs/PluginHowTo.md) for +an overview of these features and step-by-step guides for creating plugins and modifying +the control tree to implement an example operation "SYRKD". + + * **BLIS selected for the 2023 James H. Wilkinson Prize for Numerical Software!** We +are thrilled to announce that Field Van Zee and Devin Matthews were chosen to receive +the [2023 James H. Wilkinson Prize for Numerical Software](https://www.siam.org/prizes-recognition/major-prizes-lectures/detail/james-h-wilkinson-prize-for-numerical-software). +The selection committee sought to recognize the recipients "for the development of +BLIS, a portable open-source software framework that facilitates rapid instantiation +of high-performance BLAS and BLAS-like operations targeting modern CPUs." This prize +is awarded once every four years to the authors of an outstanding piece of numerical +software, or to individuals who have made an outstanding contribution to an existing +piece of numerical software. It is awarded to an entry that best addresses all phases +of the preparation of high-quality numerical software, and is intended to recognize +innovative software in scientific computing and to encourage researchers in the +earlier stages of their career. The prize will be awarded at the +[2023 SIAM Conference on Computational Science and Engineering](https://www.siam.org/conferences/cm/conference/cse23) in Amsterdam. + + * **Join us on Discord!** In 2021, we soft-launched our [Discord](https://discord.com/) +server by privately inviting current and former collaborators, attendees of our BLIS +Retreat, as well as other participants within the BLIS ecosystem.
We've been thrilled +by the results thus far, and are happy to announce that our new community is now open +to the broader public! If you'd like to hang out with other BLIS users and developers, +ask a question, discuss future features, or just say hello, please feel free to join +us! We've put together a [step-by-step guide](docs/Discord.md) for creating an account +and joining our cozy enclave. We even have a monthly "BLIS happy hour" event where +people can casually come together for a video chat, Q&A, brainstorm session, or +whatever it happens to unfold into! + + * **Addons feature now available!** Have you ever wanted to quickly extend BLIS's +operation support or define new custom BLIS APIs for your application, but were +unsure of how to add your source code to BLIS? Do you want to isolate your custom +code so that it only gets enabled when the user requests it? Do you like +[sandboxes](docs/Sandboxes.md), but wish you didn't have to provide an +implementation of `gemm`? If so, you should check out our new +[addons](docs/Addons.md) feature. Addons act like optional extensions that can be +created, enabled, and combined to suit your application's needs, all without +formally integrating your code into the core BLIS framework. + * **Multithreaded small/skinny matrix support for sgemm now available!** Thanks to funding and hardware support from Oracle, we have now accelerated `gemm` for single-precision real matrix problems where one or two dimensions is exceedingly @@ -239,7 +299,7 @@ writing complex kernels. * **Advanced multithreading support.** BLIS allows multiple levels of symmetric multithreading for nearly all level-3 operations. (Currently, users -may choose to obtain parallelism via either OpenMP or POSIX threads). This +may choose to obtain parallelism via OpenMP, POSIX threads, or HPX). 
This means that matrices may be partitioned in multiple dimensions simultaneously to attain scalable, high-performance parallelism on multicore and many-core architectures. The key to this innovation is a thread-specific control tree @@ -264,20 +324,13 @@ many will find BLIS's object-based APIs a delight to use when customizing or writing their own BLIS operations. (Objects are relatively lightweight `structs` and passed by address, which helps tame function calling overhead.) - * **Multilayered API, exposed kernels, and sandboxes.** The BLIS framework -exposes its + * **Multilayered API and exposed kernels.** The BLIS framework exposes its implementations in various layers, allowing expert developers to access exactly the functionality desired. This layered interface includes that of the lowest-level kernels, for those who wish to bypass the bulk of the framework. Optimizations can occur at various levels, in part thanks to exposed packing and unpacking facilities, which by default are highly parameterized and -flexible. And more recently, BLIS introduced sandboxes--a way to provide -alternative implementations of `gemm` that do not use any more of the BLIS -infrastructure than is desired. Sandboxes provide a convenient and -straightforward way of modifying the `gemm` implementation without disrupting -any other level-3 operation or any other part of the framework. This works -especially well when the developer wants to experiment with new optimizations -or try a different algorithm. +flexible. * **Functionality that grows with the community's needs.** As its name suggests, the BLIS framework is not a single library or static API, but rather @@ -285,7 +338,9 @@ a nearly-complete template for instantiating high-performance BLAS-like libraries. Furthermore, the framework is extensible, allowing developers to leverage existing components to support new operations as they are identified. 
If such operations require new kernels for optimal efficiency, the framework -and its APIs will be adjusted and extended accordingly. +and its APIs will be adjusted and extended accordingly. Community developers +who wish to experiment with creating new operations or APIs in BLIS can quickly +and easily do so via the [Addons](docs/Addons.md) feature. * **Code re-use.** Auto-generation approaches to achieving the aforementioned goals tend to quickly lead to code bloat due to the multiple dimensions of @@ -325,12 +380,12 @@ to your hardware. 1. **Download a source repository with `git clone`.** Generally speaking, we prefer using `git clone` to clone a `git` repository. -Having a repository allows the user to periodically pull in the latest changes -and quickly rebuild BLIS whenever they wish. Also, implicit in cloning a -repository is that the repository defaults to using the `master` branch, which -contains the latest "stable" commits since the most recent release. (This is -in contrast to Option 3 in which the user is opting for code that may be -slightly out of date.) +Having a repository allows the user to periodically pull in the latest changes, +try out release candidates when they become available, switch to older versions +easily, and quickly rebuild BLIS whenever they wish. +(Note that implicit in cloning a repository is that the repository defaults to +using the `master` branch, which, as of 1.0, is considered akin to a development +branch and likely contains improvements since the most recent release.) In order to clone a `git` repository of BLIS, please obtain a repository URL by clicking on the green button above the file/directory listing near the @@ -339,8 +394,34 @@ to executing the following command in your terminal shell: ``` git clone https://github.com/flame/blis.git ``` - -2. **Download a source repository via a zip file.** + At this point, you will have the latest commit of the `master` branch +checked out. 
If you wish to check out an official release version, say, +1.0, execute the following: + ``` + git checkout 1.0 + ``` + `git` will then transform your working copy to match the state of the +commit associated with version 1.0. You can view a list of official +version tags at any time by executing: + ``` + git tag --list + ``` + Note that pre-release versions, such as release candidates, are actually +branches rather than tags, and thus will not show up in the list of tagged +versions. + +2. **Download a source release via a tarball/zip file.** +If you would like to stick to the code that is included in official releases +and don't need the convenience of pulling in the latest changes via `git`, you +may download either a tarball or zip file of BLIS's latest +[release](https://github.com/flame/blis/releases). (NOTE: Some older releases +are only available as [tagged](https://github.com/flame/blis/tags) commits. +Also note that downloading release x.y.z is equivalent to downloading, or +checking out, the `git` tag `x.y.z`.) +We consider this option to be less than ideal for some people since you will +not be able to update your code with a simple `git pull` command. + +3. **Download a source repository via a zip file.** If you are uncomfortable with using `git` but would still like the latest stable commits, we recommend that you download BLIS as a zip file. @@ -348,15 +429,6 @@ stable commits, we recommend that you download BLIS as a zip file. click on the green button above the file listing near the top of this page. This should reveal a link for downloading the zip file. -3. **Download a source release via a tarball/zip file.** -Alternatively, if you would like to stick to the code that is included in -official releases, you may download either a tarball or zip file of any of -BLIS's previous [tagged releases](https://github.com/flame/blis/releases).
-We consider this option to be less than ideal for most people since it will -likely mean you miss out on the latest bugfix or feature commits (in contrast -to Options 1 or 2), and you also will not be able to update your code with a -simple `git pull` command (in contrast to Option 1). - 4. **Download a binary package specific to your OS.** While we don't recommend this as the first choice for most users, we provide links to community members who generously maintain BLIS packages for various @@ -394,23 +466,44 @@ If/when you have time, we *strongly* encourage you to read the detailed walkthrough of the build system found in our [Build System](docs/BuildSystem.md) guide. -Performance ------------ +If you are still having trouble, you are welcome to [join us on Discord](docs/Discord.md) +for further information and/or assistance. -We provide graphs that report performance of several implementations across a -range of hardware types, multithreading configurations, problem sizes, -operations, and datatypes. These pages also document most of the details needed -to reproduce these experiments. +Example Code +------------ - * **[Performance](docs/Performance.md).** This document reports empirically -measured performance of a representative set of level-3 operations on a variety -of hardware architectures, as implemented within BLIS and other BLAS libraries -for all four of the standard floating-point datatypes. +The BLIS source distribution provides example code in the `examples` directory. +Example code focuses on using BLIS APIs (not BLAS or CBLAS), and resides in +two subdirectories: [examples/oapi](examples/oapi) (which demonstrates the +[object API](docs/BLISObjectAPI.md)) and [examples/tapi](examples/tapi) (which +demonstrates the [typed API](docs/BLISTypedAPI.md)). + +Either directory contains several files, each containing various pieces of +code that exercise core functionality of the BLIS API in question (object or +typed). 
These example files should be thought of collectively like a tutorial, +and therefore it is recommended to start from the beginning (the file that +starts in `00`). + +You can build all of the examples by simply running `make` from either example +subdirectory (`examples/oapi` or `examples/tapi`). (You can also run +`make clean`.) The local `Makefile` assumes that you've already configured and +built (but not necessarily installed) BLIS two directories up, in `../..`. If +you have already installed BLIS to some permanent directory, you may refer to +that installation by setting the environment variable `BLIS_INSTALL_PATH` prior +to running make: +``` +export BLIS_INSTALL_PATH=/usr/local; make +``` +or by setting the same variable as part of the make command: +``` +make BLIS_INSTALL_PATH=/usr/local +``` +**Once the executable files have been built, we recommend reading the code and +the corresponding executable output side by side. This will help you see the +effects of each section of code.** - * **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports -empirically measured performance of `gemm` on select hardware architectures -within BLIS and other BLAS libraries when performing matrix problems where one -or two dimensions is exceedingly small. +This tutorial is not exhaustive or complete; several object API functions were +omitted (mostly for brevity's sake) and thus more examples could be written. Documentation ------------- @@ -432,16 +525,12 @@ included BLAS test drivers. * **[BLIS Typed API Reference](docs/BLISTypedAPI.md).** Here we document the so-called "typed" (or BLAS-like) API. This is the API that many users who are -already familiar with the BLAS will likely want to use. You can find lots of -example code for the typed API in the [examples/tapi](examples/tapi) directory -included in the BLIS source distribution. +already familiar with the BLAS will likely want to use. 
* **[BLIS Object API Reference](docs/BLISObjectAPI.md).** Here we document the object API. This is API abstracts away properties of vectors and matrices within `obj_t` structs that can be queried with accessor functions. Many -developers and experts prefer this API over the typed API. You can find lots of -example code for the object API in the [examples/oapi](examples/oapi) directory -included in the BLIS source distribution. +developers and experts prefer this API over the typed API. * **[Hardware Support](docs/HardwareSupport.md).** This document maintains a table of supported microarchitectures. @@ -453,6 +542,13 @@ use the multithreading features of BLIS. overview of BLIS's mixed-datatype functionality and provides a brief example of how to take advantage of this new code. + * **[Extending BLIS functionality](docs/PluginHowTo.md).** This document provides an +overview of BLIS's mechanisms for extending functionality through user-defined code. +BLIS has a plugin infrastructure which allows users to define their own kernels, +blocksizes, and kernel preferences which are compiled and managed by the BLIS framework. +BLIS also provides an API for modifying the "control tree" which can be used to +implement novel linear algebra operations. + * **[Performance](docs/Performance.md).** This document reports empirically measured performance of a representative set of level-3 operations on a variety of hardware architectures, as implemented within BLIS and other BLAS libraries @@ -463,6 +559,10 @@ empirically measured performance of `gemm` on select hardware architectures within BLIS and other BLAS libraries when performing matrix problems where one or two dimensions is exceedingly small. + * **[Discord](docs/Discord.md).** This document describes how to: create an +account on Discord (if you don't already have one); obtain a private invite +link; and use that invite link to join our BLIS server on Discord. 
+ * **[Release Notes](docs/ReleaseNotes.md).** This document tracks a summary of changes included with each new version of BLIS, along with contributor credits for key features. @@ -497,10 +597,33 @@ learn how to add new sub-configurations or configuration families, or are simply interested in learning how BLIS organizes its configurations and kernel sets, please read this thorough walkthrough of the configuration system. + * **[Addon Guide](docs/Addons.md).** If you are interested in learning +about using BLIS addons--that is, enabling existing (or creating new) bundles +of operation or API code that are built into a BLIS library--please read this +document. + * **[Sandbox Guide](docs/Sandboxes.md).** If you are interested in learning about using sandboxes in BLIS--that is, providing alternative implementations of the `gemm` operation--please read this document. +Performance +----------- + +We provide graphs that report performance of several implementations across a +range of hardware types, multithreading configurations, problem sizes, +operations, and datatypes. These pages also document most of the details needed +to reproduce these experiments. + + * **[Performance](docs/Performance.md).** This document reports empirically +measured performance of a representative set of level-3 operations on a variety +of hardware architectures, as implemented within BLIS and other BLAS libraries +for all four of the standard floating-point datatypes. + + * **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports +empirically measured performance of `gemm` on select hardware architectures +within BLIS and other BLAS libraries when performing matrix problems where one +or two dimensions is exceedingly small. + External Packages ----------------- @@ -542,24 +665,23 @@ releases. The source packages may build on other rpm-based distributions. the source rpms may build for others. * **GNU Guix**. 
Guix has BLIS packages, provides builds only for the generic -target and some specific x86_64 micro-architectures. +target and some specific `x86_64` micro-architectures. * **Conda**. conda channel [conda-forge](https://github.com/conda-forge/blis-feedstock) -has Linux, OSX and Windows binary packages for x86_64. +has Linux, OSX and Windows binary packages for `x86_64`. Discussion ---------- -You can keep in touch with developers and other users of the project by joining -one of the following mailing lists: +Most of the active discussions are now happening on our [Discord](https://discord.com/) +server. Users and developers alike are welcome! Please see the +[BLIS Discord guide](docs/Discord.md) for a walkthrough of how to join us. + +You can also still stay in touch by using either of the following mailing lists: * [blis-devel](https://groups.google.com/group/blis-devel): Please join and post to this mailing list if you are a BLIS developer, or if you are trying to use BLIS beyond simply linking to it as a BLAS library. -**Note:** Most of the interesting discussions happen here; don't be afraid to -join! If you would like to submit a bug report, or discuss a possible bug, -please consider opening a [new issue](https://github.com/flame/blis/issues) on -github. * [blis-discuss](https://groups.google.com/group/blis-discuss): Please join and post to this mailing list if you have general questions or feedback regarding @@ -716,6 +838,29 @@ within the BLIS Framework}, } ``` +Awards +------ + + * **[2023 James H. Wilkinson Prize for Numerical Software.](https://www.siam.org/prizes-recognition/major-prizes-lectures/detail/james-h-wilkinson-prize-for-numerical-software)** +This prize is awarded once every four years to the authors of an outstanding piece of +numerical software, or to individuals who have made an outstanding contribution to an +existing piece of numerical software. 
The selection committee sought to recognize the +recipients "for the development of [BLIS](https://github.com/flame/blis), a portable +open-source software framework that facilitates rapid instantiation of +high-performance BLAS and BLAS-like operations targeting modern CPUs." The prize will +be awarded at the +[2023 SIAM Conference on Computational Science and Engineering](https://www.siam.org/conferences/cm/conference/cse23) in Amsterdam. + + * **[2020 SIAM Activity Group on Supercomputing Best Paper Prize.](https://www.siam.org/prizes-recognition/activity-group-prizes/detail/siag-sc-best-paper-prize)** +This prize is awarded once every two years to the authors of the most outstanding +paper, as determined by the selection committee, in the field of parallel scientific +and engineering computing published within the four calendar years preceding the +award year. The prize was chosen for the paper ["The BLIS Framework: Experiments in +Portability."](#citations) and awarded at the [2020 SIAM Conference on Parallel Processing for Scientific Computing](https://www.siam.org/conferences/cm/conference/pp20) in Seattle where Robert van de Geijn delivered [a talk on BLIS](https://meetings.siam.org/sess/dsp_programsess.cfm?SESSIONCODE=68266) and accepted the prize alongside other coauthors. +See also: + * [SIAM News | January 2020 Prize Spotlight](https://sinews.siam.org/Details-Page/january-2020-prize-spotlight#Field&Robert) + * [Oden Institute's SHPC Group Win SIAM Best Paper Prize](https://www.oden.utexas.edu/about/news/ScienceHighPerfomanceComputingSIAMBestPaperPrize/) + Funding ------- diff --git a/RELEASING b/RELEASING deleted file mode 100644 index 351594c49d..0000000000 --- a/RELEASING +++ /dev/null @@ -1,44 +0,0 @@ -Here are the steps to follow to create a new release (version) of BLIS: - -1. Make sure there are no commits that have yet to be pulled into - local repository. - - $ git pull - - If there are any commits upstream, merge them as appropriate. - -2. 
Verify that the code builds properly. - - $ ./configure auto; make - -3. Verify that the code passes BLIS and BLAS tests: - - $ make check # BLIS testsuite (fast) + BLAS test drivers - $ make checkblis # BLIS testsuite (full ex. mixed-datatype) - $ make checkblis-md # BLIS testsuite (mixed-datatype only) - $ make checkblis-salt # BLIS testsuite (fast + salt) - -4. Draft a new announcement to blis-devel, crediting those who - contributed towards this version by browsing 'git log'. - -5. Update CREDITS file if 'git log' reveals any new contributors. - -6. Update docs/ReleaseNotes.md file with body of finalized announcement - and the date of the release. - -7. Commit changes from steps 5 and 6. - -8. Bump the version number: - - $ ./build/bump-version.sh "0.3.2" - - This will result in two new commits: a version file update and a CHANGELOG - file update. - -9. Push the new commits and new tag associated with the new version: - - $ git push - $ git push --tag - -10. Send finalized announcement to blis-devel. - diff --git a/RELEASING.md b/RELEASING.md new file mode 100644 index 0000000000..9496b396a4 --- /dev/null +++ b/RELEASING.md @@ -0,0 +1,346 @@ +## Contents + +* **[BLIS version numbering scheme and branching strategy](RELEASING.md#blis-version-numbering-scheme-and-branching-strategy)** +* **[Instructions for creating a new release candidate or point release of BLIS +](RELEASING.md#instructions-for-creating-a-new-release-candidate-or-point-release-of-blis +)** + * **[Creating a new release lineage branch +](RELEASING.md#creating-a-new-release-lineage-branch)** + * **[Creating a new release candidate (e.g. `1.x` -> `2.0-rc0` or `2.0-rc0` -> `2.0-rc1`)](RELEASING.md#creating-a-new-release-candidate-eg-1x---20-rc0-or-20-rc0---20-rc1)** + * **[Creating a new major release (e.g. 
`2.0-rc<n>` -> `2.0`)](RELEASING.md#creating-a-new-major-release-eg-20-rcn---20)** + * **[Back-porting fixes from `master` to releases](RELEASING.md#back-porting-fixes-from-master-to-releases)** + * **[Creating a new point release (e.g. `1.1` -> `1.2` or `2.0` -> `2.1`)](RELEASING.md#creating-a-new-point-release-eg-11---12-or-20---21)** + +## BLIS version numbering scheme and branching strategy + +BLIS uses a major.minor version numbering scheme. An increase in the +major version number (a "major release" or simply "new version") +indicates new (usually significant) functionality, and possible +incompatibility with previous major releases, although the ABI +version can be used to check for compatibility across major versions +in principle. + +Major releases have one or more "release candidates" which are +preliminary versions of the next release, publicly distributed for +comment and/or bug discovery. Subsequent release candidates (rcs) +correct problems found in the previous rc. Once a reasonable level +of stability is achieved, the full release is distributed. + +An increase in the minor version number (a "point release") indicates +the incorporation of one or more bugfixes or other minor changes since +the initial major version release or last point release. + +Essentially, point releases extend the rc cadence beyond the official +release by correcting additional problems discovered after release. + +All rcs, initial major release, and point releases are created along a +linear git branch, named for the major release lineage, e.g. `r1.x`. +Commits indicating rcs and releases are tagged (e.g. `1.0-rc0`, `1.0`, +`1.1`) and also have an associated non-tip branch (e.g. `r1.0-rc0`, +`r1.0`, `r1.1`). Using both tags and branches increases visibility of +important commits, but new commits should only be made on the `r1.x` +lineage branch. + +Release lineage branches diverge from `master` starting with the first +rc. 
Any new commits on the release lineage (except version maintenance +commits such as updating the version file, CHANGELOG, and release notes) +are cherry-picked from `master`. Exceptions may be made if, for example, +a backported bugfix cannot be cherry-picked and requires a more targeted +fix directly on a release branch. + +Here is an example illustration of the release branch structure: +``` +_________________________________________________________master + \ \ + \ \__r2.0-rc0_____r2.0-rc1_____r2.0,r2.x + \ (2.0-rc0) (2.0-rc1) (2.0) + \ + \__r1.0-rc0_____r1.0-rc1_____r1.0_____r1.1_____r1.2,r1.x + (1.0-rc0) (1.0-rc1) (1.0) (1.1) (1.2) + /\ + <- release candidates -- major release -- point releases -> +``` + +In each case, the version number (as encoded in the `version` file) +indicates the `x.y` prefix of the most recent tagged commit. The +exception is `master`, where the `version` file indicates `z.0-dev`, +where `z` is the major version number one higher than the latest major +release (e.g. `3.0-dev` in the example above). + +## Instructions for creating a new release candidate or point release of BLIS + +### Creating a new release lineage branch + +1. Consider whether the so_version should be updated (via the `build/so_version` + file) due to any ABI changes since the previous version. If so, commit that + change on `master` now. + +2. Create the new release lineage branch. + + ``` + $ git checkout master + $ git pull + $ git branch r2.x + ``` + + Note that the new release lineage branch should not be checked out at this point. + +3. Update the version on the `master` branch to reflect the next release in development. + + ``` + $ ./build/do-release.sh -b "3.0-dev" + $ git push + ``` + + Note the extra option `-b`. + +4. Check out the new release lineage branch. + + ``` + $ git checkout r2.x + ``` + +### Creating a new release candidate (e.g. `1.x` -> `2.0-rc0` or `2.0-rc0` -> `2.0-rc1`) + +1. 
Make sure that the release lineage branch is checked out and up-to-date. + + ``` + $ git checkout r2.x + $ git pull + ``` + +2. Draft a new announcement to the blis-devel mailing list, crediting those who + contributed towards this version by browsing `git log`. + +3. Update the CREDITS file if `git log` reveals any new contributors. + NOTE: This should have already been done prior to the rc cycle. + +4. Commit the updated CREDITS file if changed. + +5. Update `docs/ReleaseNotes.md` with the body of finalized announcement + and the date of the release. Developers are encouraged to update + the release notes on `master` as new changes are made, which simplifies + preparation of rc0. + +6. Commit the updated `docs/ReleaseNotes.md` file. + +7. Use the `build/do-release.sh` script to create a new rc branch and tag. + + ``` + $ ./build/do-release.sh "2.0-rc<n>" + ``` + + Where `<n>` is `0` for the first rc, or one higher than the last rc on this release + lineage branch. + +8. Make sure the `do-release` script and other commits did what they were + supposed to do by inspecting the output of `git log`. If everything looks good, + you can push the changes via: + + ``` + $ git push + $ git push --tags + $ git push -u <remote> 2.0-rc<n> + ``` + + Where `<remote>` is the name of the appropriate upstream git remote. + + At this point, the new release candidate branch is live at `<remote>`. + +9. Announce the rc release on blis-devel, Discord, and/or other appropriate + venues. + +10. Wait for bug reports. Typically an rc should stay live for at least a month + in order to give users time to try it out. + +11. After the trial period, cherry-pick any bugfixes or other updates: + + $ git cherry-pick [-nx] <commit> + + Be sure to include lines in the commit + log entry for each cherry-picked commit that note the commit hash + of the *original* commit that is being cherry-picked from. Example: + + ``` + Fixed a bug in blahblahblah. (#777) + + Details: + - Fixed a bug in blahblahblah that manifested as blahblahblah. 
This + bug was introduced in commit abc12345. Thanks to John Smith for + reporting this bug. + - (cherry picked from commit abc0123456789abc0123456789abc0123456789a) + ``` + + Note the final line, which was *not* present in the original commit + log entry (on `master`) but *should be* present in the commit log entry for the + cherry-picked commit (on the release lineage branch). + + 12. If no bugs are reported/found, or if the updated rc is otherwise ready + for promotion to full release, continue with the instructions below. + Otherwise, return to step 2, incrementing `<n>`. + +### Creating a new major release (e.g. `2.0-rc<n>` -> `2.0`) + +1. Make sure that the release lineage branch is checked out and up-to-date. + + ``` + $ git checkout r2.x + $ git pull + ``` + +2. Draft a new announcement to the blis-devel mailing list, crediting those who + contributed towards this version by browsing `git log`. + +3. Update the CREDITS file if `git log` reveals any new contributors. + NOTE: This should have already been done prior to the release cycle. + +4. Commit the updated CREDITS file if changed. + +5. Update `docs/ReleaseNotes.md` with the body of finalized announcement + and the date of the release. Developers are encouraged to update + the release notes on `master` as new changes are made, which simplifies + preparation of the release. + +6. Commit the updated `docs/ReleaseNotes.md` file. + +7. Use the `build/do-release.sh` script to create a new release branch and tag. + + ``` + $ ./build/do-release.sh "2.0" + ``` + +8. Make sure the `do-release` script and other commits did what they were + supposed to do by inspecting the output of `git log`. If everything looks good, + you can push the changes via: + + ``` + $ git push + $ git push --tags + $ git push -u <remote> 2.0 + ``` + + Where `<remote>` is the name of the appropriate upstream git remote. + + At this point, the new release branch is live at `<remote>`. + +9. Publish a new release via GitHub (https://github.com/flame/blis/releases). 
+ Identify the new version by the tag you just created and pushed. You can + also identify the previous release. + + Try to use formatting consistent with the prior release. (You can start to + edit the previous release, inspect/copy some of the markdown syntax, and + then abort the edit.) + +10. Announce the release on blis-devel, Discord, and/or other appropriate + venues. + +11. Update the Wikipedia entry for BLIS to reflect the new latest version. + +### Back-porting fixes from `master` to releases + +1. When a bug fix is developed on `master` which is applicable to a supported release, + and corrects a significant problem with correctness, usability, or performance + (e.g. not new functionality or cosmetic changes), it should be back-ported. + Bug fixes should be individually back-ported to all supported releases. + +2. Check out the relevant release lineage branch, e.g.: + + ``` + $ git checkout r2.x + $ git pull + ``` + +3. Verify that the bug affects this release lineage. If not, skip this release lineage. + +4. If possible, cherry-pick the bugfix commit from `master`: + + $ git cherry-pick [-nx] <commit> + + Be sure to include lines in the commit + log entry for each cherry-picked commit that note the commit hash + of the *original* commit that is being cherry-picked from. Example: + + ``` + Fixed a bug in blahblahblah. (#777) + + Details: + - Fixed a bug in blahblahblah that manifested as blahblahblah. This + bug was introduced in commit abc12345. Thanks to John Smith for + reporting this bug. + - (cherry picked from commit abc0123456789abc0123456789abc0123456789a) + ``` + + Note the final line, which was *not* present in the original commit + log entry (on `master`) but *should be* present in the commit log entry + for the cherry-picked commit (on the release lineage branch). + +5. If cherry-picking is not possible (e.g. 
the commit does not merge cleanly, + underlying implementation details or internal APIs have changed, etc.), + then craft a new bugfix on the release lineage branch. Make sure to test + the new bugfix against the reported bug, as well as the full BLIS testsuite! + +6. Push the changes via `git push`. Do not update any other release branches or tags + at this time. + +### Creating a new point release (e.g. `1.1` -> `1.2` or `2.0` -> `2.1`) + +1. Once enough bug fixes have accumulated, a bug fix of high enough urgency, or a + pre-determined period of time has elapsed, all bug fix commits since the last release + (major or point release) will be included in a new point release. + + Point releases can be made on either the most recent release lineage branch or on + a "historical" but still supported release lineage. + +2. Check out the relevant release lineage branch (which may not be the most recent) + + ``` + $ git checkout r2.x + $ git pull + ``` + +3. Draft a new announcement to the blis-devel mailing list, crediting those who + contributed towards this version by browsing `git log`. + +4. Update the CREDITS file if `git log` reveals any new contributors. + NOTE: This should have already been done prior to the release cycle. + +5. Commit the updated CREDITS file if changed. + +6. Update `docs/ReleaseNotes.md` with the body of finalized announcement + and the date of the release. + +7. Commit the updated `docs/ReleaseNotes.md` file. + +8. Use the `build/do-release.sh` script to create a new release branch and tag. + + ``` + $ ./build/do-release.sh "2.1" + ``` + +9. Make sure the `do-release` script and other commits did what they were + supposed to do by inspecting the output of `git log`. If everything looks good, + you can push the changes via: + + ``` + $ git push + $ git push --tags + $ git push -u <remote> 2.1 + ``` + + Where `<remote>` is the name of the appropriate upstream git remote. + + At this point, the new release branch is live at `<remote>`. + +10. 
Update the release target branch via GitHub (https://github.com/flame/blis/releases). + Identify the new version by the tag you just created and pushed. You can + also identify the previous release. + + Try to use formatting consistent with the prior release. (You can start to + edit the previous release, inspect/copy some of the markdown syntax, and + then abort the edit.) + +11. Announce the point release on blis-devel, Discord, and/or other appropriate + venues. + +12. If this point release is for the most recent major release lineage, + update the Wikipedia entry for BLIS to reflect the new latest version. diff --git a/addon/old/gemmd/attic/bao_gemmd_bp_var2.c b/addon/old/gemmd/attic/bao_gemmd_bp_var2.c new file mode 100644 index 0000000000..dbccedc358 --- /dev/null +++ b/addon/old/gemmd/attic/bao_gemmd_bp_var2.c @@ -0,0 +1,602 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmd_fp + +typedef void (*FUNCPTR_T) + ( + conj_t conja, + conj_t conjb, + dim_t m, + dim_t n, + dim_t k, + void* restrict alpha, + void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict d, inc_t incd, + void* restrict b, inc_t rs_b, inc_t cs_b, + void* restrict beta, + void* restrict c, inc_t rs_c, inc_t cs_c, + cntx_t* restrict cntx, + rntm_t* restrict rntm, + thrinfo_t* restrict thread + ); + +// +// -- gemmd-like block-panel algorithm (object interface) ---------------------- +// + +// Define a function pointer array named ftypes and initialize its contents with +// the addresses of the typed functions defined below, bao_?gemmd_bp_var2(). 
+static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var2); + +void bao_gemmd_bp_var2 + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + const num_t dt = bli_obj_dt( c ); + + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + void* restrict buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + + void* restrict buf_d = bli_obj_buffer_at_off( d ); + const inc_t incd = bli_obj_vector_inc( d ); + + void* restrict buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); + + void* restrict buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + + // Index into the function pointer array to extract the correct + // typed function pointer based on the chosen datatype. + FUNCPTR_T f = ftypes[dt]; + + // Invoke the function. 
+ f + ( + conja, + conjb, + m, + n, + k, + buf_alpha, + buf_a, rs_a, cs_a, + buf_d, incd, + buf_b, rs_b, cs_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread + ); +} + +// +// -- gemmd-like block-panel algorithm (typed interface) ----------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for various blocksizes. */ \ + const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* Query the context for the microkernel address and cast it to its + function pointer type. */ \ + /* + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + */ \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + /* + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ + */ \ +\ + /* Compute partitioning step values for each matrix of each loop. */ \ + const inc_t jcstep_c = cs_c; \ + const inc_t jcstep_b = cs_b; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_d = incd; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = rs_c; \ + const inc_t icstep_a = rs_a; \ +\ + const inc_t jrstep_c = cs_c * NR; \ +\ + const inc_t irstep_c = rs_c * MR; \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict d_00 = d; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of the scalars to prevent any unnecessary sharing of + cache lines between the cores' caches. */ \ + ctype alpha_local = *alpha_cast; \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ + /*ctype zero_local = *PASTEMAC(ch,0);*/ \ +\ + auxinfo_t aux; \ +\ + /* Initialize a mem_t entry for A and B. Strictly speaking, this is only + needed for the matrix we will be packing (if any), but we do it + unconditionally to be safe. */ \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. 
*/ \ + bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \ + BLIS_KC, /* 4th loop */ \ + BLIS_NO_PART, /* pack B */ \ + BLIS_MC, /* 3rd loop */ \ + BLIS_NO_PART, /* pack A */ \ + BLIS_NR, /* 2nd loop */ \ + BLIS_MR, /* 1st loop */ \ + BLIS_KR }; /* microkernel loop */ \ +\ + bszid_t* restrict bszids_jc = &bszids[0]; \ + bszid_t* restrict bszids_pc = &bszids[1]; \ + /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ + bszid_t* restrict bszids_ic = &bszids[3]; \ + /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ + bszid_t* restrict bszids_jr = &bszids[5]; \ + /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* restrict thread_ir = NULL; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ +\ + /* Loop over the n dimension (NC rows/columns at a time). */ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + ctype* restrict b_jc = b_00 + jj * jcstep_b; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + ctype* restrict a_pc = a_00 + pp * pcstep_a; \ + ctype* restrict d_pc = d_00 + pp * pcstep_d; \ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pb = bli_thrinfo_sub_node( thread_pc ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + B. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_b) \ + ( \ + conjb, \ + KC, NC, \ + kc_cur, nc_cur, NR, \ + &one_local, \ + d_pc, incd, \ + b_pc, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + /* Alias b_use so that it's clear this is our current block of + matrix B. */ \ + ctype* restrict b_pc_use = b_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ +\ + /* Loop over the m dimension (MC rows at a time). */ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pa = bli_thrinfo_sub_node( thread_ic ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + A. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_a) \ + ( \ + conja, \ + MC, KC, \ + mc_cur, kc_cur, MR, \ + &one_local, \ + d_pc, incd, \ + a_ic, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + ctype* restrict a_ic_use = a_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Query the number of threads and thread ids for the JR loop. + NOTE: These values are only needed when computing the next + micropanel of B. */ \ + const dim_t jr_nt = bli_thrinfo_n_way( thread_jr ); \ + const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ + dim_t jr_left = nc_cur % NR; \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur \ + = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ +\ + ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ +\ + /* Assume for now that our next panel of B to be the current panel + of B. */ \ + ctype* restrict b2 = b_jr; \ +\ + /* Identify the current thrinfo_t node. */ \ + thread_ir = bli_thrinfo_sub_node( thread_jr ); \ +\ + /* Query the number of threads and thread ids for the IR loop. + NOTE: These values are only needed when computing the next + micropanel of A. */ \ + const dim_t ir_nt = bli_thrinfo_n_way( thread_ir ); \ + const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \ +\ + /* Compute number of primary and leftover components of the IR loop. */ \ + dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ + dim_t ir_left = mc_cur % MR; \ +\ + /* Compute the IR loop thread range for the current thread. */ \ + dim_t ir_start, ir_end; \ + bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \ +\ + /* Loop over the m dimension (MR rows at a time). 
*/ \ + for ( dim_t i = ir_start; i < ir_end; i += 1 ) \ + { \ + const dim_t mr_cur \ + = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ +\ + ctype* restrict a_ir = a_ic_use + i * ps_a_use; \ + ctype* restrict c_ir = c_jr + i * irstep_c; \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next micropanels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_ic_use; \ + b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_pc_use; \ + } \ +\ + /* Save the addresses of next micropanels of A and B to the + auxinfo_t object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Call a wrapper to the kernel (which handles edge cases). */ \ + PASTECH2(bao_,ch,gemm_kernel) \ + ( \ + MR, \ + NR, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + &alpha_local, \ + a_ir, rs_a_use, cs_a_use, \ + b_jr, rs_b_use, cs_b_use, \ + beta_use, \ + c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* This barrier is needed to prevent threads from starting to pack + the next row panel of B before the current row panel is fully + computed upon. */ \ + bli_thrinfo_barrier( thread_pb ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. 
*/ \ + PASTECH2(bao_,ch,packm_finalize_mem_a) \ + ( \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTECH2(bao_,ch,packm_finalize_mem_b) \ + ( \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \ +*/ \ +} + +//INSERT_GENTFUNC_BASIC0( gemmd_bp_var2 ) +GENTFUNC( float, s, gemmd_bp_var2 ) +GENTFUNC( double, d, gemmd_bp_var2 ) +GENTFUNC( scomplex, c, gemmd_bp_var2 ) +GENTFUNC( dcomplex, z, gemmd_bp_var2 ) + +// +// -- gemm-like microkernel wrapper -------------------------------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + const dim_t MR, \ + const dim_t NR, \ + dim_t mr_cur, \ + dim_t nr_cur, \ + dim_t kc_cur, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict aux, \ + cntx_t* restrict cntx \ + ) \ +{ \ + /* Infer the datatype from the ctype. */ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for the microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype zero = *PASTEMAC(ch,0); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( mr_cur == MR && nr_cur == NR ) \ + { \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + kc_cur, \ + alpha, \ + a, \ + b, \ + beta, \ + c, rs_c, cs_c, \ + aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + kc_cur, \ + alpha, \ + a, \ + b, \ + &zero, \ + ct, rs_ct, cs_ct, \ + aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn) \ + ( \ + mr_cur, \ + nr_cur, \ + ct, rs_ct, cs_ct, \ + beta, \ + c, rs_c, cs_c \ + ); \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( gemm_kernel ) +GENTFUNC( float, s, gemm_kernel ) +GENTFUNC( double, d, gemm_kernel ) +GENTFUNC( scomplex, c, gemm_kernel ) +GENTFUNC( dcomplex, z, gemm_kernel ) + diff --git a/addon/old/gemmd/attic/bli_gemm_ex.c b/addon/old/gemmd/attic/bli_gemm_ex.c new file mode 100644 index 0000000000..8b7d11d819 --- /dev/null +++ b/addon/old/gemmd/attic/bli_gemm_ex.c @@ -0,0 +1,89 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_ex + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // A switch to easily toggle whether we use the addon implementation + // of bao_gemmd() as the implementation for bli_gemm(). (This allows for + // easy testing of bao_gemmd() via the testsuite.) + if ( 1 ) + { + const dim_t k = bli_obj_width_after_trans( a ); + const num_t dt = bli_obj_dt( c ); + obj_t d; + + bli_obj_create( dt, k, 1, 1, k, &d ); + bli_setv( &BLIS_ONE, &d ); + //bli_randv( &d ); + + bao_gemmd_ex( alpha, a, &d, b, beta, c, cntx, rntm ); + + bli_obj_free( &d ); + return; + } + + // Initialize a local runtime with global settings if necessary. 
Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Obtain a valid (native) context from the gks if necessary. + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_gemm_check( alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front end. + bli_gemm_front + ( + ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c, + ( cntx_t* )cntx, ( rntm_t* )rntm, NULL + ); +} + diff --git a/addon/old/gemmd/bao_gemmd.c b/addon/old/gemmd/bao_gemmd.c new file mode 100644 index 0000000000..8379ff6d4f --- /dev/null +++ b/addon/old/gemmd/bao_gemmd.c @@ -0,0 +1,299 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// -- Define the gemmd operation's object API ---------------------------------- +// + +void bao_gemmd + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bao_gemmd_ex + ( + alpha, + a, + d, + b, + beta, + c, + NULL, + NULL + ); +} + +void bao_gemmd_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Set the .pack_a and .pack_b fields to TRUE. This is only needed because + // this addon uses bli_thrinfo_sup_grow(), which calls + // bli_thrinfo_sup_create_for_cntl(), which employs an optimization if + // both fields are FALSE (as is often the case with sup). However, this + // addon implements the "large" code path, and so both A and B must + // always be packed. Setting the fields to TRUE will avoid the optimization + // while this addon implementation executes (and it also reinforces the + // fact that we *are* indeed packing A and B, albeit not in the sup context + // originally envisioned for the .pack_a and .pack_b fields). 
+ bli_rntm_set_pack_a( TRUE, rntm ); + bli_rntm_set_pack_b( TRUE, rntm ); + + // Obtain a valid (native) context from the gks if necessary. + // NOTE: This must be done before calling the _check() function, since + // that function assumes the context pointer is valid. + if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx(); + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bao_gemmd_check( alpha, a, d, b, beta, c, cntx ); + + // -- bao_gemmd_front() ---------------------------------------------------- + + obj_t a_local; + obj_t b_local; + obj_t c_local; + + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( c ) ) + { + return; + } + + // If alpha is zero, or if A or B has a zero dimension, scale C by beta + // and return early. + if ( bli_obj_equals( alpha, &BLIS_ZERO ) || + bli_obj_has_zero_dim( a ) || + bli_obj_has_zero_dim( b ) ) + { + bli_scalm( beta, c ); + return; + } + + // Alias A, B, and C in case we need to apply transformations. + bli_obj_alias_to( a, &a_local ); + bli_obj_alias_to( b, &b_local ); + bli_obj_alias_to( c, &c_local ); + + // Induce a transposition of A if it has its transposition property set. + // Then clear the transposition bit in the object. + if ( bli_obj_has_trans( &a_local ) ) + { + bli_obj_induce_trans( &a_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); + } + + // Induce a transposition of B if it has its transposition property set. + // Then clear the transposition bit in the object. + if ( bli_obj_has_trans( &b_local ) ) + { + bli_obj_induce_trans( &b_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &b_local ); + } + + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. 
+ if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) + { + bli_obj_swap( &a_local, &b_local ); + + bli_obj_induce_trans( &a_local ); + bli_obj_induce_trans( &b_local ); + bli_obj_induce_trans( &c_local ); + } + + // Parse and interpret the contents of the rntm_t object to properly + // set the ways of parallelism for each loop, and then make any + // additional modifications necessary for the current operation. + bli_rntm_set_ways_for_op + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + rntm + ); + + // Spawn threads (if applicable), where bao_gemmd_int() is the thread entry + // point function for each thread. This also begins the process of creating + // the thrinfo_t tree, which contains thread communicators. + bao_l3_thread_decorator + ( + bao_gemmd_int, + BLIS_GEMM, // operation family id + alpha, + &a_local, + d, + &b_local, + beta, + &c_local, + cntx, + rntm + ); +} + +// +// -- Define the gemmd operation's thread entry point -------------------------- +// + +void bao_gemmd_int + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + // In this function, we choose the gemmd implementation that is executed + // on each thread. + + // Call the block-panel algorithm. 
+ bao_gemmd_bp_var1 + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm, + thread + ); +} + +// +// -- Define the gemmd operation's typed API ----------------------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* d, inc_t incd, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + bli_init_once(); \ +\ + /* Determine the datatype (e.g. BLIS_FLOAT, BLIS_DOUBLE, etc.) based on + the macro parameter 'ch' (e.g. s, d, etc). */ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, dd, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + /* Adjust the dimensions of matrices A and B according to the transa and + transb parameters. */ \ + bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ + bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \ +\ + /* Create bufferless scalar objects and attach the provided scalar pointers + to those scalar objects. */ \ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + /* Create bufferless matrix objects and attach the provided matrix pointers + to those matrix objects. */ \ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, k, 1, d, incd, k, &dd ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + /* Set the transposition/conjugation properties of the objects for matrices + A and B. */ \ + bli_obj_set_conjtrans( transa, &ao ); \ + bli_obj_set_conjtrans( transb, &bo ); \ +\ + /* Call the object interface. 
*/ \ + PASTECH(bao_,opname) \ + ( \ + &alphao, \ + &ao, \ + &dd, \ + &bo, \ + &betao, \ + &co \ + ); \ +} + +//INSERT_GENTFUNC_BASIC0( gemmd ) +GENTFUNC( float, s, gemmd ) +GENTFUNC( double, d, gemmd ) +GENTFUNC( scomplex, c, gemmd ) +GENTFUNC( dcomplex, z, gemmd ) + diff --git a/frame/3/gemm/bli_gemm_packab.c b/addon/old/gemmd/bao_gemmd.h similarity index 59% rename from frame/3/gemm/bli_gemm_packab.c rename to addon/old/gemmd/bao_gemmd.h index a15192994e..7c7466494d 100644 --- a/frame/3/gemm/bli_gemm_packab.c +++ b/addon/old/gemmd/bao_gemmd.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,85 +32,74 @@ */ -#include "blis.h" +// +// -- Prototype the gemmd operation's object API ------------------------------- +// -void bli_gemm_packa +BLIS_EXPORT_ADDON void bao_gemmd ( + obj_t* alpha, obj_t* a, + obj_t* d, obj_t* b, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_ADDON void bao_gemmd_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, obj_t* c, cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - obj_t a_pack; - - // Pack matrix A according to the control tree node. - bli_l3_packm - ( - a, - &a_pack, - cntx, - rntm, - cntl, - thread - ); + rntm_t* rntm + ); - // Proceed with execution using packed matrix A. 
- bli_gemm_int - ( - &BLIS_ONE, - &a_pack, - b, - &BLIS_ONE, - c, - cntx, - rntm, - bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) - ); -} +// +// -- Prototype the gemmd operation's thread entry point ----------------------- +// -// ----------------------------------------------------------------------------- - -void bli_gemm_packb +void bao_gemmd_int ( + obj_t* alpha, obj_t* a, + obj_t* d, obj_t* b, + obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl, thrinfo_t* thread - ) -{ - obj_t b_pack; + ); + +// +// -- Prototype the gemmd operation's typed API -------------------------------- +// - // Pack matrix B according to the control tree node. - bli_l3_packm - ( - b, - &b_pack, - cntx, - rntm, - cntl, - thread - ); +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_ADDON void PASTECH2(bao_,ch,opname) \ + ( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* d, inc_t incd, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); - // Proceed with execution using packed matrix B. - bli_gemm_int - ( - &BLIS_ONE, - a, - &b_pack, - &BLIS_ONE, - c, - cntx, - rntm, - bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) - ); -} +//INSERT_GENTPROT_BASIC0( gemmd ) +GENTPROT( float, s, gemmd ) +GENTPROT( double, d, gemmd ) +GENTPROT( scomplex, c, gemmd ) +GENTPROT( dcomplex, z, gemmd ) diff --git a/addon/old/gemmd/bao_gemmd_bp_var1.c b/addon/old/gemmd/bao_gemmd_bp_var1.c new file mode 100644 index 0000000000..b475218e9e --- /dev/null +++ b/addon/old/gemmd/bao_gemmd_bp_var1.c @@ -0,0 +1,491 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmd_fp + +typedef void (*FUNCPTR_T) + ( + conj_t conja, + conj_t conjb, + dim_t m, + dim_t n, + dim_t k, + void* restrict alpha, + void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict d, inc_t incd, + void* restrict b, inc_t rs_b, inc_t cs_b, + void* restrict beta, + void* restrict c, inc_t rs_c, inc_t cs_c, + cntx_t* restrict cntx, + rntm_t* restrict rntm, + thrinfo_t* restrict thread + ); + +// +// -- gemmd-like block-panel algorithm (object interface) ---------------------- +// + +// Define a function pointer array named ftypes and initialize its contents with +// the addresses of the typed functions defined below, bao_?gemmd_bp_var1(). +static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var1); + +void bao_gemmd_bp_var1 + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + const num_t dt = bli_obj_dt( c ); + + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + void* restrict buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + + void* restrict buf_d = bli_obj_buffer_at_off( d ); + const inc_t incd = bli_obj_vector_inc( d ); + + void* restrict buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); + + void* restrict buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + + // Index into the function pointer array to extract the correct + // typed function pointer based on the chosen datatype. 
+ FUNCPTR_T f = ftypes[dt]; + + // Invoke the function. + f + ( + conja, + conjb, + m, + n, + k, + buf_alpha, + buf_a, rs_a, cs_a, + buf_d, incd, + buf_b, rs_b, cs_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread + ); +} + +// +// -- gemmd-like block-panel algorithm (typed interface) ----------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for various blocksizes. */ \ + const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* Query the context for the microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Compute partitioning step values for each matrix of each loop. 
*/ \ + const inc_t jcstep_c = cs_c; \ + const inc_t jcstep_b = cs_b; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_d = incd; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = rs_c; \ + const inc_t icstep_a = rs_a; \ +\ + const inc_t jrstep_c = cs_c * NR; \ +\ + const inc_t irstep_c = rs_c * MR; \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict d_00 = d; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of the scalars to prevent any unnecessary sharing of + cache lines between the cores' caches. */ \ + ctype alpha_local = *alpha_cast; \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ +\ + auxinfo_t aux; \ +\ + /* Initialize a mem_t entry for A and B. Strictly speaking, this is only + needed for the matrix we will be packing (if any), but we do it + unconditionally to be safe. */ \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. 
*/ \ + bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \ + BLIS_KC, /* 4th loop */ \ + BLIS_NO_PART, /* pack B */ \ + BLIS_MC, /* 3rd loop */ \ + BLIS_NO_PART, /* pack A */ \ + BLIS_NR, /* 2nd loop */ \ + BLIS_MR, /* 1st loop */ \ + BLIS_KR }; /* microkernel loop */ \ +\ + bszid_t* restrict bszids_jc = &bszids[0]; \ + bszid_t* restrict bszids_pc = &bszids[1]; \ + /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ + bszid_t* restrict bszids_ic = &bszids[3]; \ + /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ + bszid_t* restrict bszids_jr = &bszids[5]; \ + /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* restrict thread_ir = NULL; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ +\ + /* Loop over the n dimension (NC rows/columns at a time). */ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + ctype* restrict b_jc = b_00 + jj * jcstep_b; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + ctype* restrict a_pc = a_00 + pp * pcstep_a; \ + ctype* restrict d_pc = d_00 + pp * pcstep_d; \ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pb = bli_thrinfo_sub_node( thread_pc ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + B. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_b) \ + ( \ + conjb, \ + KC, NC, \ + kc_cur, nc_cur, NR, \ + &one_local, \ + d_pc, incd, \ + b_pc, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + /* Alias b_use so that it's clear this is our current block of + matrix B. */ \ + ctype* restrict b_pc_use = b_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ +\ + /* Loop over the m dimension (MC rows at a time). */ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pa = bli_thrinfo_sub_node( thread_ic ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + A. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_a) \ + ( \ + conja, \ + MC, KC, \ + mc_cur, kc_cur, MR, \ + &one_local, \ + d_pc, incd, \ + a_ic, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + ctype* restrict a_ic_use = a_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Query the number of threads and thread ids for the JR loop. + NOTE: These values are only needed when computing the next + micropanel of B. */ \ + const dim_t jr_nt = bli_thrinfo_n_way( thread_jr ); \ + const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ + dim_t jr_left = nc_cur % NR; \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur \ + = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ +\ + ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ +\ + /* Assume for now that our next panel of B to be the current panel + of B. */ \ + ctype* restrict b2 = b_jr; \ +\ + /* Identify the current thrinfo_t node. */ \ + thread_ir = bli_thrinfo_sub_node( thread_jr ); \ +\ + /* Query the number of threads and thread ids for the IR loop. + NOTE: These values are only needed when computing the next + micropanel of A. */ \ + const dim_t ir_nt = bli_thrinfo_n_way( thread_ir ); \ + const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \ +\ + /* Compute number of primary and leftover components of the IR loop. */ \ + dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ + dim_t ir_left = mc_cur % MR; \ +\ + /* Compute the IR loop thread range for the current thread. */ \ + dim_t ir_start, ir_end; \ + bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \ +\ + /* Loop over the m dimension (MR rows at a time). 
*/ \ + for ( dim_t i = ir_start; i < ir_end; i += 1 ) \ + { \ + const dim_t mr_cur \ + = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ +\ + ctype* restrict a_ir = a_ic_use + i * ps_a_use; \ + ctype* restrict c_ir = c_jr + i * irstep_c; \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next micropanels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_ic_use; \ + b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_pc_use; \ + } \ +\ + /* Save the addresses of next micropanels of A and B to the + auxinfo_t object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + &alpha_local, \ + a_ir, \ + b_jr, \ + beta_use, \ + c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* This barrier is needed to prevent threads from starting to pack + the next row panel of B before the current row panel is fully + computed upon. */ \ + bli_thrinfo_barrier( thread_pb ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. 
*/ \ + PASTECH2(bao_,ch,packm_finalize_mem_a) \ + ( \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTECH2(bao_,ch,packm_finalize_mem_b) \ + ( \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \ +*/ \ +} + +//INSERT_GENTFUNC_BASIC0( gemmd_bp_var1 ) +GENTFUNC( float, s, gemmd_bp_var1 ) +GENTFUNC( double, d, gemmd_bp_var1 ) +GENTFUNC( scomplex, c, gemmd_bp_var1 ) +GENTFUNC( dcomplex, z, gemmd_bp_var1 ) + diff --git a/addon/old/gemmd/bao_gemmd_check.c b/addon/old/gemmd/bao_gemmd_check.c new file mode 100644 index 0000000000..864e9a1acb --- /dev/null +++ b/addon/old/gemmd/bao_gemmd_check.c @@ -0,0 +1,131 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bao_gemmd_check + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_noninteger_object( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_noninteger_object( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( c ); + bli_check_error_code( e_val ); + + // Check scalar/vector/matrix type. + + e_val = bli_check_scalar_object( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_scalar_object( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( c ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). 
+ + e_val = bli_check_object_buffer( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( c ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_level3_dims( a, b, c ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) ); + bli_check_error_code( e_val ); + + // Check for consistent datatypes. + // NOTE: These checks are performed unconditionally; this function + // contains no mixed-datatype guard. + + e_val = bli_check_consistent_object_datatypes( c, a ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, d ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, b ); + bli_check_error_code( e_val ); +} + diff --git a/frame/3/trsm/bli_trsm_int.h b/addon/old/gemmd/bao_gemmd_check.h similarity index 93% rename from frame/3/trsm/bli_trsm_int.h rename to addon/old/gemmd/bao_gemmd_check.h index aabb2a8aa6..243ec70c8c 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/addon/old/gemmd/bao_gemmd_check.h @@ -32,16 +32,19 @@ */ -void bli_trsm_int + +// +// Prototype object-based check functions. +// + +void bao_gemmd_check ( obj_t* alpha, obj_t* a, + obj_t* d, obj_t* b, obj_t* beta, obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); + cntx_t* cntx + ); diff --git a/addon/old/gemmd/bao_gemmd_var.h b/addon/old/gemmd/bao_gemmd_var.h new file mode 100644 index 0000000000..05ec45e07e --- /dev/null +++ b/addon/old/gemmd/bao_gemmd_var.h @@ -0,0 +1,119 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype the object-based variant interfaces. +// + +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTECH(bao_,opname) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* d, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ); + +GENPROT( gemmd_bp_var1 ) + + +// +// Prototype the typed variant interfaces. 
+// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ); + +//INSERT_GENTPROT_BASIC0( gemmd_bp_var1 ) +GENTPROT( float, s, gemmd_bp_var1 ) +GENTPROT( double, d, gemmd_bp_var1 ) +GENTPROT( scomplex, c, gemmd_bp_var1 ) +GENTPROT( dcomplex, z, gemmd_bp_var1 ) + + +// +// Prototype the typed kernel interfaces. +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + const dim_t MR, \ + const dim_t NR, \ + dim_t mr_cur, \ + dim_t nr_cur, \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict aux, \ + cntx_t* restrict cntx \ + ); + +//INSERT_GENTPROT_BASIC0( gemm_kernel ) +GENTPROT( float, s, gemm_kernel ) +GENTPROT( double, d, gemm_kernel ) +GENTPROT( scomplex, c, gemm_kernel ) +GENTPROT( dcomplex, z, gemm_kernel ) + diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/addon/old/gemmd/bao_l3_packm_a.c similarity index 95% rename from sandbox/gemmlike/bls_l3_packm_a.c rename to addon/old/gemmd/bao_l3_packm_a.c index 0dcc531fdb..b33fd9089d 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.c +++ b/addon/old/gemmd/bao_l3_packm_a.c @@ -37,7 +37,7 @@ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ dim_t m, \ dim_t k, \ @@ -61,7 +61,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm 
stage. */ \ - bli_thread_barrier( thread ); \ + bli_thrinfo_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ @@ -90,7 +90,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -139,7 +139,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -168,7 +168,7 @@ GENTFUNC( dcomplex, z, packm_init_mem_a ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ @@ -201,7 +201,7 @@ GENTFUNC( dcomplex, z, packm_finalize_mem_a ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ pack_t* restrict schema, \ dim_t m, \ @@ -254,7 +254,7 @@ GENTFUNC( dcomplex, z, packm_init_a ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ conj_t conj, \ dim_t m_alloc, \ @@ -263,6 +263,7 @@ void PASTECH2(bls_,ch,opname) \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ @@ -278,7 +279,7 @@ void PASTECH2(bls_,ch,opname) \ dim_t pd_p; \ \ /* Prepare the packing 
destination buffer. */ \ - PASTECH2(bls_,ch,packm_init_mem_a) \ + PASTECH2(bao_,ch,packm_init_mem_a) \ ( \ m_alloc, k_alloc, mr, \ cntx, \ @@ -288,7 +289,7 @@ void PASTECH2(bls_,ch,opname) \ ); \ \ /* Determine the packing buffer and related parameters for matrix A. */ \ - PASTECH2(bls_,ch,packm_init_a) \ + PASTECH2(bao_,ch,packm_init_a) \ ( \ &schema, \ m, k, mr, \ @@ -300,7 +301,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Pack matrix A to the destination buffer chosen above. Here, the packed matrix is stored to column-stored MR x k micropanels. */ \ - PASTECH2(bls_,ch,packm_var1) \ + PASTECH2(bao_,ch,packm_var1) \ ( \ conj, \ schema, \ @@ -309,15 +310,16 @@ void PASTECH2(bls_,ch,opname) \ m_max, \ k_max, \ kappa, \ + d, incd, \ a, rs_a, cs_a, \ *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ + pd_p, *ps_p, \ cntx, \ thread \ ); \ \ /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( thread ); \ + bli_thrinfo_barrier( thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_a ) diff --git a/sandbox/gemmlike/bls_l3_packm_a.h b/addon/old/gemmd/bao_l3_packm_a.h similarity index 95% rename from sandbox/gemmlike/bls_l3_packm_a.h rename to addon/old/gemmd/bao_l3_packm_a.h index 201a24efae..b683b79d4a 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.h +++ b/addon/old/gemmd/bao_l3_packm_a.h @@ -35,7 +35,7 @@ #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ dim_t m, \ dim_t k, \ @@ -56,7 +56,7 @@ GENTPROT( dcomplex, z, packm_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ @@ -73,7 +73,7 @@ GENTPROT( dcomplex, z, packm_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ pack_t* restrict schema, \ dim_t m, \ @@ -96,7 +96,7 @@ GENTPROT( dcomplex, z, packm_init_a ) 
#undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ conj_t conj, \ dim_t m_alloc, \ @@ -105,6 +105,7 @@ void PASTECH2(bls_,ch,opname) \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/addon/old/gemmd/bao_l3_packm_b.c similarity index 95% rename from sandbox/gemmlike/bls_l3_packm_b.c rename to addon/old/gemmd/bao_l3_packm_b.c index 9d563109a6..76860c8ee8 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.c +++ b/addon/old/gemmd/bao_l3_packm_b.c @@ -37,7 +37,7 @@ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ dim_t k, \ dim_t n, \ @@ -61,7 +61,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( thread ); \ + bli_thrinfo_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ @@ -90,7 +90,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -139,7 +139,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. 
(The @@ -168,7 +168,7 @@ GENTFUNC( dcomplex, z, packm_init_mem_b ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ @@ -201,7 +201,7 @@ GENTFUNC( dcomplex, z, packm_finalize_mem_b ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ pack_t* restrict schema, \ dim_t k, \ @@ -254,7 +254,7 @@ GENTFUNC( dcomplex, z, packm_init_b ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ conj_t conj, \ dim_t k_alloc, \ @@ -263,6 +263,7 @@ void PASTECH2(bls_,ch,opname) \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ @@ -278,7 +279,7 @@ void PASTECH2(bls_,ch,opname) \ dim_t pd_p; \ \ /* Prepare the packing destination buffer. */ \ - PASTECH2(bls_,ch,packm_init_mem_b) \ + PASTECH2(bao_,ch,packm_init_mem_b) \ ( \ k_alloc, n_alloc, nr, \ cntx, \ @@ -288,7 +289,7 @@ void PASTECH2(bls_,ch,opname) \ ); \ \ /* Determine the packing buffer and related parameters for matrix B. */ \ - PASTECH2(bls_,ch,packm_init_b) \ + PASTECH2(bao_,ch,packm_init_b) \ ( \ &schema, \ k, n, nr, \ @@ -300,7 +301,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Pack matrix B to the destination buffer chosen above. Here, the packed matrix is stored to row-stored k x NR micropanels. */ \ - PASTECH2(bls_,ch,packm_var1) \ + PASTECH2(bao_,ch,packm_var1) \ ( \ conj, \ schema, \ @@ -309,15 +310,16 @@ void PASTECH2(bls_,ch,opname) \ k_max, \ n_max, \ kappa, \ + d, incd, \ b, rs_b, cs_b, \ *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ + pd_p, *ps_p, \ cntx, \ thread \ ); \ \ /* Barrier so that packing is done before computation. 
*/ \ - bli_thread_barrier( thread ); \ + bli_thrinfo_barrier( thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_b ) diff --git a/sandbox/gemmlike/bls_l3_packm_b.h b/addon/old/gemmd/bao_l3_packm_b.h similarity index 95% rename from sandbox/gemmlike/bls_l3_packm_b.h rename to addon/old/gemmd/bao_l3_packm_b.h index 728d21aed5..9161604ce9 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.h +++ b/addon/old/gemmd/bao_l3_packm_b.h @@ -35,7 +35,7 @@ #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ dim_t k, \ dim_t n, \ @@ -56,7 +56,7 @@ GENTPROT( dcomplex, z, packm_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ @@ -73,7 +73,7 @@ GENTPROT( dcomplex, z, packm_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ pack_t* restrict schema, \ dim_t k, \ @@ -96,7 +96,7 @@ GENTPROT( dcomplex, z, packm_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ conj_t conj, \ dim_t k_alloc, \ @@ -105,6 +105,7 @@ void PASTECH2(bls_,ch,opname) \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ diff --git a/addon/old/gemmd/bao_l3_packm_var.h b/addon/old/gemmd/bao_l3_packm_var.h new file mode 100644 index 0000000000..063e59e5f8 --- /dev/null +++ b/addon/old/gemmd/bao_l3_packm_var.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// +// Prototype BLAS-like interfaces to the variants. 
+// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ); + +//INSERT_GENTPROT_BASIC0( packm_var1 ) +GENTPROT( float, s, packm_var1 ) +GENTPROT( double, d, packm_var1 ) +GENTPROT( scomplex, c, packm_var1 ) +GENTPROT( dcomplex, z, packm_var1 ) + +//INSERT_GENTPROT_BASIC0( packm_var2 ) +GENTPROT( float, s, packm_var2 ) +GENTPROT( double, d, packm_var2 ) +GENTPROT( scomplex, c, packm_var2 ) +GENTPROT( dcomplex, z, packm_var2 ) diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/addon/old/gemmd/bao_l3_packm_var1.c similarity index 74% rename from sandbox/gemmlike/bls_l3_packm_var3.c rename to addon/old/gemmd/bao_l3_packm_var1.c index 5ea80ff424..d002dc6bf0 100644 --- a/sandbox/gemmlike/bls_l3_packm_var3.c +++ b/addon/old/gemmd/bao_l3_packm_var1.c @@ -35,15 +35,13 @@ #include "blis.h" // -// Variant 3 is similar to variant 1, except that it parallelizes packing -// along the k dimension. (Our current hypothesis is that this method of -// parallelizing the operation may perform better on some NUMA systems.) +// Variant 1 provides basic support for packing by calling packm_cxk(). 
// #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ -void PASTECH2(bls_,ch,varname) \ +void PASTECH2(bao_,ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ @@ -52,6 +50,7 @@ void PASTECH2(bls_,ch,varname) \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ @@ -123,38 +122,25 @@ void PASTECH2(bls_,ch,varname) \ ic0 = 0; \ ic_inc = panel_dim_max; \ } \ +\ + ctype* restrict p_begin = p_cast; \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ ( void )tid; \ \ - dim_t pr_start, pr_end; \ + dim_t it_start, it_end, it_inc; \ \ /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. */ \ - bli_thread_range_sub( thread, panel_len, 1, FALSE, &pr_start, &pr_end ); \ -\ - /* Define instances of panel_len and panel_len_max that are specific to - the local thread. */ \ - dim_t panel_len_loc = pr_end - pr_start; \ - dim_t panel_len_max_loc = panel_len_loc; \ -\ - /* If panel_len_max > panel_len, then there are some columns in p that - need to be zeroed. Of course, only the last thread will be responsible - for this edge region. */ \ - dim_t panel_len_zero = panel_len_max - panel_len; \ - if ( tid == nt - 1 ) panel_len_max_loc += panel_len_zero; \ -\ - /* Shift the pointer for c and p to the appropriate locations within the - first micropanel. */ \ - dim_t off_loc = pr_start; \ - ctype* restrict c_begin_loc = c_cast + off_loc * ldc; \ - ctype* restrict p_begin_loc = p_cast + off_loc * ldp; \ + packm thrinfo_t node. 
NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ \ /* Iterate over every logical micropanel in the source matrix. */ \ for ( ic = ic0, it = 0; it < n_iter; \ @@ -162,39 +148,48 @@ void PASTECH2(bls_,ch,varname) \ { \ panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \ \ - ctype* restrict c_use = c_begin_loc + (ic )*incc; \ - ctype* restrict p_use = p_begin_loc + (it )*ps_p; \ + ctype* restrict c_begin = c_cast + (ic )*incc; \ +\ + ctype* restrict c_use = c_begin; \ + ctype* restrict p_use = p_begin; \ \ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. (The + default is slab.) */ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ - PASTECH2(bls_,ch,packm_cxk) \ + PASTECH2(bao_,ch,packm_cxk) \ ( \ conjc, \ schema, \ panel_dim, \ panel_dim_max, \ - panel_len_loc, \ - panel_len_max_loc, \ + panel_len, \ + panel_len_max, \ kappa_cast, \ + d, incd, \ c_use, incc, ldc, \ p_use, ldp, \ cntx \ ); \ } \ - } \ -} - -//INSERT_GENTFUNC_BASIC0( packm_var3 ) -GENTFUNC( float, s, packm_var3 ) -GENTFUNC( double, d, packm_var3 ) -GENTFUNC( scomplex, c, packm_var3 ) -GENTFUNC( dcomplex, z, packm_var3 ) - +\ /* if ( !row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var3: a packed", panel_dim_max, panel_len_max, \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \ p_use, rs_p, cs_p, "%5.2f", "" ); \ else \ -PASTEMAC(ch,fprintm)( stdout, "packm_var3: b packed", panel_len_max, panel_dim_max, \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \ p_use, rs_p, cs_p, "%5.2f", "" ); \ -*/ +*/ \ +\ + p_begin += ps_p; \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_var1 ) +GENTFUNC( float, s, packm_var1 ) +GENTFUNC( double, d, packm_var1 ) 
+GENTFUNC( scomplex, c, packm_var1 ) +GENTFUNC( dcomplex, z, packm_var1 ) diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/addon/old/gemmd/bao_l3_packm_var2.c similarity index 92% rename from sandbox/gemmlike/bls_l3_packm_var2.c rename to addon/old/gemmd/bao_l3_packm_var2.c index 8d2b90cac1..49e9d19415 100644 --- a/sandbox/gemmlike/bls_l3_packm_var2.c +++ b/addon/old/gemmd/bao_l3_packm_var2.c @@ -41,7 +41,7 @@ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ -void PASTECH2(bls_,ch,varname) \ +void PASTECH2(bao_,ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ @@ -50,6 +50,7 @@ void PASTECH2(bls_,ch,varname) \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ @@ -126,8 +127,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). 
*/ \ ( void )nt; \ @@ -167,12 +168,12 @@ void PASTECH2(bls_,ch,varname) \ { \ for ( dim_t l = 0; l < panel_len; ++l ) \ { \ - for ( dim_t i = 0; i < panel_dim; ++i ) \ + for ( dim_t d = 0; d < panel_dim; ++d ) \ { \ - ctype* cli = c_use + (l )*ldc + (i )*incc; \ - ctype* pli = p_use + (l )*ldp + (i )*1; \ + ctype* cld = c_use + (l )*ldc + (d )*incc; \ + ctype* pld = p_use + (l )*ldp + (d )*1; \ \ - PASTEMAC(ch,copyjs)( *cli, *pli ); \ + PASTEMAC(ch,copyjs)( *cld, *pld ); \ } \ } \ } \ @@ -180,12 +181,12 @@ void PASTECH2(bls_,ch,varname) \ { \ for ( dim_t l = 0; l < panel_len; ++l ) \ { \ - for ( dim_t i = 0; i < panel_dim; ++i ) \ + for ( dim_t d = 0; d < panel_dim; ++d ) \ { \ - ctype* cli = c_use + (l )*ldc + (i )*incc; \ - ctype* pli = p_use + (l )*ldp + (i )*1; \ + ctype* cld = c_use + (l )*ldc + (d )*incc; \ + ctype* pld = p_use + (l )*ldp + (d )*1; \ \ - PASTEMAC(ch,copys)( *cli, *pli ); \ + PASTEMAC(ch,copys)( *cld, *pld ); \ } \ } \ } \ diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/addon/old/gemmd/bao_packm_cxk.c similarity index 70% rename from sandbox/gemmlike/bls_packm_cxk.c rename to addon/old/gemmd/bao_packm_cxk.c index ca11c207c0..8680c53321 100644 --- a/sandbox/gemmlike/bls_packm_cxk.c +++ b/addon/old/gemmd/bao_packm_cxk.c @@ -37,7 +37,7 @@ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTECH2(bls_,ch,opname) \ +void PASTECH2(bao_,ch,opname) \ ( \ conj_t conja, \ pack_t schema, \ @@ -46,6 +46,7 @@ void PASTECH2(bls_,ch,opname) \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ + ctype* d, inc_t incd, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ @@ -54,15 +55,15 @@ void PASTECH2(bls_,ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. 
*/ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ @@ -91,30 +92,67 @@ void PASTECH2(bls_,ch,opname) \ if ( !PASTEMAC(ch,eq1)( *kappa ) ) \ bli_abort(); \ \ - /* Perform the packing, taking conja into account. */ \ - if ( bli_is_conj( conja ) ) \ + if ( d == NULL ) \ { \ - for ( dim_t l = 0; l < panel_len; ++l ) \ + /* Perform the packing, taking conja into account. */ \ + if ( bli_is_conj( conja ) ) \ { \ - for ( dim_t i = 0; i < panel_dim; ++i ) \ + for ( dim_t l = 0; l < panel_len; ++l ) \ { \ - ctype* ali = a + (l )*lda + (i )*inca; \ - ctype* pli = p + (l )*ldp + (i )*1; \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* pli = p + (l )*ldp + (i )*1; \ \ - PASTEMAC(ch,copyjs)( *ali, *pli ); \ + PASTEMAC(ch,copyjs)( *ali, *pli ); \ + } \ + } \ + } \ + else \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* pli = p + (l )*ldp + (i )*1; \ +\ + PASTEMAC(ch,copys)( *ali, *pli ); \ + } \ } \ } \ } \ - else \ + else /* if ( d != NULL ) */ \ { \ - for ( dim_t l = 0; l < panel_len; ++l ) \ + /* Perform the packing, taking conja into account. 
*/ \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* dl = d + (l )*incd; \ + ctype* pli = p + (l )*ldp + (i )*1; \ +\ + /* Note that ali must be the second operand here since + that is what is conjugated by scal2js. */ \ + PASTEMAC(ch,scal2js)( *dl, *ali, *pli ); \ + } \ + } \ + } \ + else \ { \ - for ( dim_t i = 0; i < panel_dim; ++i ) \ + for ( dim_t l = 0; l < panel_len; ++l ) \ { \ - ctype* ali = a + (l )*lda + (i )*inca; \ - ctype* pli = p + (l )*ldp + (i )*1; \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* dl = d + (l )*incd; \ + ctype* pli = p + (l )*ldp + (i )*1; \ \ - PASTEMAC(ch,copys)( *ali, *pli ); \ + PASTEMAC(ch,scal2s)( *ali, *dl, *pli ); \ + } \ } \ } \ } \ diff --git a/sandbox/gemmlike/bls_packm_cxk.h b/addon/old/gemmd/bao_packm_cxk.h similarity index 97% rename from sandbox/gemmlike/bls_packm_cxk.h rename to addon/old/gemmd/bao_packm_cxk.h index f6582d64a7..3e977a7cc2 100644 --- a/sandbox/gemmlike/bls_packm_cxk.h +++ b/addon/old/gemmd/bao_packm_cxk.h @@ -36,7 +36,7 @@ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ -void PASTECH2(bls_,ch,varname) \ +void PASTECH2(bao_,ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ @@ -45,6 +45,7 @@ void PASTECH2(bls_,ch,varname) \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ + ctype* d, inc_t incd, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ diff --git a/addon/old/gemmd/gemmd.h b/addon/old/gemmd/gemmd.h new file mode 100644 index 0000000000..cab61bd181 --- /dev/null +++ b/addon/old/gemmd/gemmd.h @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of copyright holder(s) nor the names + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef GEMMD_H +#define GEMMD_H + +// This header should contain (or #include) any definitions that must be +// folded into blis.h. 
+ +#include "bao_gemmd.h" +#include "bao_gemmd_check.h" +#include "bao_gemmd_var.h" + +#include "bao_l3_packm_a.h" +#include "bao_l3_packm_b.h" +#include "bao_l3_packm_var.h" + +#include "bao_packm_cxk.h" + +#include "bao_l3_decor.h" + + +#endif diff --git a/addon/old/gemmd/thread/bao_l3_decor.c b/addon/old/gemmd/thread/bao_l3_decor.c new file mode 100644 index 0000000000..ff510b6f37 --- /dev/null +++ b/addon/old/gemmd/thread/bao_l3_decor.c @@ -0,0 +1,150 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// Initialize a function pointer array containing function addresses for +// each of the threading-specific level-3 thread decorators. + +static l3ao_decor_ft l3ao_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] = +{ + [BLIS_SINGLE] = bao_l3_thread_decorator_single, + [BLIS_OPENMP] = +#if defined(BLIS_ENABLE_OPENMP) + bao_l3_thread_decorator_openmp, +#elif defined(BLIS_ENABLE_PTHREADS) + NULL, +#else + NULL, +#endif + [BLIS_POSIX] = +#if defined(BLIS_ENABLE_PTHREADS) + bao_l3_thread_decorator_pthreads, +#elif defined(BLIS_ENABLE_OPENMP) + NULL, +#else + NULL, +#endif +}; + +// Define a dispatcher that chooses a threading-specific function from the +// above function pointer array. + +void bao_l3_thread_decorator + ( + l3aoint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + rntm_t rntm_l; + + // Query the threading implementation and the number of threads requested. + timpl_t ti = bli_rntm_thread_impl( rntm ); + dim_t nt = bli_rntm_num_threads( rntm ); + + if ( bli_error_checking_is_enabled() ) + bao_l3_thread_decorator_check( rntm ); + + if ( 1 < nt && ti == BLIS_SINGLE ) + { + // Here, we resolve conflicting information. The caller requested + // a sequential threading implementation, but also requested more + // than one thread. 
Here, we choose to favor the requested threading + // implementation over the number of threads, and so reset all + // parallelism parameters to 1. + rntm_l = *rntm; + nt = 1; + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l ); + bli_rntm_set_num_threads_only( 1, &rntm_l ); + rntm = &rntm_l; + } + + // Use the timpl_t value to index into the corresponding function address + // from the function pointer array. + const l3ao_decor_ft fp = l3ao_decor_fpa[ ti ]; + + // Call the threading-specific decorator function. + fp + ( + func, + family, + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm + ); +} + +void bao_l3_thread_decorator_check + ( + rntm_t* rntm + ) +{ + //err_t e_val; + + //e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) ); + //bli_check_error_code( e_val ); + + const timpl_t ti = bli_rntm_thread_impl( rntm ); + + if ( +#ifndef BLIS_ENABLE_OPENMP + ti == BLIS_OPENMP || +#endif +#ifndef BLIS_ENABLE_PTHREADS + ti == BLIS_POSIX || +#endif + FALSE + ) + { + fprintf( stderr, "\n" ); + fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ ); + bli_abort(); + } +} + diff --git a/frame/thread/bli_l3_decor.h b/addon/old/gemmd/thread/bao_l3_decor.h similarity index 71% rename from frame/thread/bli_l3_decor.h rename to addon/old/gemmd/thread/bao_l3_decor.h index 0b09189a69..4c087bdb6d 100644 --- a/frame/thread/bli_l3_decor.h +++ b/addon/old/gemmd/thread/bao_l3_decor.h @@ -33,45 +33,57 @@ */ -#ifndef BLIS_L3_DECOR_H -#define BLIS_L3_DECOR_H - -// -- conventional definitions ------------------------------------------------- - // Level-3 internal function type. 
-typedef void (*l3int_t) +typedef void (*l3aoint_ft) ( obj_t* alpha, obj_t* a, obj_t* b, + obj_t* d, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl, thrinfo_t* thread ); +// Level-3 thread decorator function type. +typedef void (*l3ao_decor_ft) + ( + l3aoint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* d, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + // Level-3 thread decorator prototype. -void bli_l3_thread_decorator +void bao_l3_thread_decorator ( - l3int_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3aoint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* d, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm ); -// Include definitions specific to the method of multithreading for the -// conventional code path. -#include "bli_l3_decor_single.h" -#include "bli_l3_decor_openmp.h" -#include "bli_l3_decor_pthreads.h" +void bao_l3_thread_decorator_check + ( + rntm_t* rntm + ); -#endif +// Include definitions specific to the method of multithreading. +#include "bao_l3_decor_single.h" +#include "bao_l3_decor_openmp.h" +#include "bao_l3_decor_pthreads.h" diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c b/addon/old/gemmd/thread/bao_l3_decor_openmp.c similarity index 89% rename from sandbox/gemmlike/thread/bls_l3_decor_openmp.c rename to addon/old/gemmd/thread/bao_l3_decor_openmp.c index bf0d4d8bcd..7deee95edc 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c +++ b/addon/old/gemmd/thread/bao_l3_decor_openmp.c @@ -36,19 +36,15 @@ #ifdef BLIS_ENABLE_OPENMP -// Define a dummy thread entry function, which is needed in the pthreads -// version, so that when building Windows DLLs (with OpenMP enabled or with -// no multithreading) we don't risk having an unresolved symbol. 
-void* bls_l3_thread_entry( void* data_void ) { return NULL; } - //#define PRINT_THRINFO -void bls_l3_thread_decorator +void bao_l3_thread_decorator_openmp ( - l3sbxint_t func, + l3aoint_ft func, opid_t family, obj_t* alpha, obj_t* a, + obj_t* d, obj_t* b, obj_t* beta, obj_t* c, @@ -65,7 +61,7 @@ void bls_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field @@ -78,7 +74,7 @@ void bls_l3_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) @@ -93,8 +89,6 @@ void bls_l3_thread_decorator const dim_t tid = omp_get_thread_num(); // Check for a somewhat obscure OpenMP thread-mistmatch issue. - // NOTE: This calls the same function used for the conventional/large - // code path. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); // Use the thread id to access the appropriate pool_t* within the @@ -112,6 +106,7 @@ void bls_l3_thread_decorator ( alpha, a, + d, b, beta, c, diff --git a/frame/thread/bli_l3_decor_openmp.h b/addon/old/gemmd/thread/bao_l3_decor_openmp.h similarity index 85% rename from frame/thread/bli_l3_decor_openmp.h rename to addon/old/gemmd/thread/bao_l3_decor_openmp.h index 80dbe5374e..4ed3e7efc6 100644 --- a/frame/thread/bli_l3_decor_openmp.h +++ b/addon/old/gemmd/thread/bao_l3_decor_openmp.h @@ -4,8 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. 
- Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. + Copyright (C) 2021, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -33,21 +32,22 @@ */ -#ifndef BLIS_L3_DECOR_OPENMP_H -#define BLIS_L3_DECOR_OPENMP_H - // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP -void bli_l3_thread_decorator_thread_check +void bao_l3_thread_decorator_openmp ( - dim_t n_threads, - dim_t tid, - thrcomm_t* gl_comm, + l3aoint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, rntm_t* rntm ); #endif -#endif - diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c b/addon/old/gemmd/thread/bao_l3_decor_pthreads.c similarity index 90% rename from sandbox/gemmlike/thread/bls_l3_decor_pthreads.c rename to addon/old/gemmd/thread/bao_l3_decor_pthreads.c index ff723a4ce4..dfbfbaa614 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c +++ b/addon/old/gemmd/thread/bao_l3_decor_pthreads.c @@ -39,10 +39,11 @@ // A data structure to assist in passing operands to additional threads. typedef struct thread_data { - l3sbxint_t func; + l3aoint_ft func; opid_t family; obj_t* alpha; obj_t* a; + obj_t* d; obj_t* b; obj_t* beta; obj_t* c; @@ -54,14 +55,15 @@ typedef struct thread_data } thread_data_t; // Entry point function for additional threads. 
-void* bls_l3_thread_entry( void* data_void ) +void* bao_l3_thread_entry( void* data_void ) { thread_data_t* data = data_void; - l3sbxint_t func = data->func; + l3aoint_ft func = data->func; opid_t family = data->family; obj_t* alpha = data->alpha; obj_t* a = data->a; + obj_t* d = data->d; obj_t* b = data->b; obj_t* beta = data->beta; obj_t* c = data->c; @@ -94,6 +96,7 @@ void* bls_l3_thread_entry( void* data_void ) ( alpha, a, + d, b, beta, c, @@ -108,12 +111,13 @@ void* bls_l3_thread_entry( void* data_void ) return NULL; } -void bls_l3_thread_decorator +void bao_l3_thread_decorator_pthreads ( - l3sbxint_t func, + l3aoint_ft func, opid_t family, obj_t* alpha, obj_t* a, + obj_t* d, obj_t* b, obj_t* beta, obj_t* c, @@ -169,6 +173,7 @@ void bls_l3_thread_decorator datas[tid].family = family; datas[tid].alpha = alpha; datas[tid].a = a; + datas[tid].d = d; datas[tid].b = b; datas[tid].beta = beta; datas[tid].c = c; @@ -180,9 +185,9 @@ void bls_l3_thread_decorator // Spawn additional threads for ids greater than 1. if ( tid != 0 ) - bli_pthread_create( &pthreads[tid], NULL, &bls_l3_thread_entry, &datas[tid] ); + bli_pthread_create( &pthreads[tid], NULL, &bao_l3_thread_entry, &datas[tid] ); else - bls_l3_thread_entry( ( void* )(&datas[0]) ); + bao_l3_thread_entry( ( void* )(&datas[0]) ); } // We shouldn't free the global communicator since it was already freed @@ -211,5 +216,12 @@ void bls_l3_thread_decorator bli_free_intl( datas ); } +#else + +// Define a dummy function bli_l3_thread_entry(), which is needed for +// consistent dynamic linking behavior when building shared objects in Linux +// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol. 
+void* bao_l3_thread_entry( void* data_void ) { return NULL; } + #endif diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h b/addon/old/gemmd/thread/bao_l3_decor_pthreads.h similarity index 84% rename from sandbox/gemmlike/thread/bls_l3_decor_pthreads.h rename to addon/old/gemmd/thread/bao_l3_decor_pthreads.h index ef5c3bad45..1c0b58900b 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h +++ b/addon/old/gemmd/thread/bao_l3_decor_pthreads.h @@ -32,16 +32,25 @@ */ -#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H -#define BLIS_SBX_L3_DECOR_PTHREADS_H - // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. -void* bls_l3_thread_entry( void* data_void ); - -#endif +void* bao_l3_thread_entry( void* data_void ); + +void bao_l3_thread_decorator_pthreads + ( + l3aoint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); #endif diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.c b/addon/old/gemmd/thread/bao_l3_decor_single.c similarity index 97% rename from sandbox/gemmlike/thread/bls_l3_decor_single.c rename to addon/old/gemmd/thread/bao_l3_decor_single.c index 8bb04817fb..362c1e68c7 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_single.c +++ b/addon/old/gemmd/thread/bao_l3_decor_single.c @@ -34,18 +34,17 @@ #include "blis.h" -#ifndef BLIS_ENABLE_MULTITHREADING - #define SKIP_THRINFO_TREE -void bls_l3_thread_decorator +void bao_l3_thread_decorator_single ( - l3sbxint_t func, + l3aoint_ft func, opid_t family, //pack_t schema_a, //pack_t schema_b, obj_t* alpha, obj_t* a, + obj_t* d, obj_t* b, obj_t* beta, obj_t* c, @@ -113,6 +112,7 @@ void bls_l3_thread_decorator ( alpha, a, + d, b, beta, c, @@ -137,5 +137,3 @@ void bls_l3_thread_decorator bli_sba_checkin_array( array ); } -#endif - diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.h b/addon/old/gemmd/thread/bao_l3_decor_single.h 
similarity index 83% rename from sandbox/gemmlike/thread/bls_l3_decor_single.h rename to addon/old/gemmd/thread/bao_l3_decor_single.h index 211a43a894..813bb6d75d 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_single.h +++ b/addon/old/gemmd/thread/bao_l3_decor_single.h @@ -32,13 +32,18 @@ */ -#ifndef BLIS_SBX_L3_DECOR_SINGLE_H -#define BLIS_SBX_L3_DECOR_SINGLE_H - -// Definitions specific to situations when multithreading is disabled. -#ifndef BLIS_ENABLE_MULTITHREADING - -#endif - -#endif - +void bao_l3_thread_decorator_single + ( + l3aoint_ft func, + opid_t family, + //pack_t schema_a, + //pack_t schema_b, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); diff --git a/blastest/f2c/open.c b/blastest/f2c/open.c index 2834fd9463..12e5f02b21 100644 --- a/blastest/f2c/open.c +++ b/blastest/f2c/open.c @@ -28,6 +28,7 @@ use or performance of this software. #include #endif #ifdef _MSC_VER +#include #define access _access #endif #include "f2c.h" diff --git a/blastest/src/cblat1.c b/blastest/src/cblat1.c index daccb2f6cd..6562946847 100644 --- a/blastest/src/cblat1.c +++ b/blastest/src/cblat1.c @@ -68,6 +68,11 @@ static real c_b52 = 0.f; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "cblat1"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ static real sfac = 9.765625e-4f; @@ -136,7 +141,12 @@ static real c_b52 = 0.f; } s_stop("", (ftnlen)0); - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ /* Subroutine */ int header_(void) @@ -230,7 +240,7 @@ static real c_b52 = 0.f; complex q__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -238,15 +248,15 @@ static real c_b52 = 0.f; integer i__; complex cx[8]; integer np1, len; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, - integer *), ctest_(integer *, complex *, complex *, complex *, + extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + integer *), ctest_(integer *, complex *, complex *, complex *, real *); complex mwpcs[5], mwpct[5]; extern real scnrm2_(integer *, complex *, integer *); extern /* Subroutine */ int itest1_(integer *, integer *), stest1_(real *, real *, real *, real *); extern integer icamax_(integer *, complex *, integer *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer *); extern real scasum_(integer *, complex *, integer *); @@ -465,7 +475,7 @@ static real c_b52 = 0.f; complex q__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -475,17 +485,29 @@ static real c_b52 = 0.f; integer mx, my; complex cdot[1]; integer lenx, leny; - extern /* Complex */ complex cdotc_(integer *, complex *, integer + extern /* Complex */ +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + void cdotc_(complex *, +#else +complex cdotc_( +#endif + integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, complex *, integer *); - extern /* Complex 
*/ complex cdotu_(integer *, complex *, integer + extern /* Complex */ +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + void cdotu_(complex *, +#else +complex cdotu_( +#endif + integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, - complex *, integer *), ctest_(integer *, complex *, complex *, + extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + complex *, integer *), ctest_(integer *, complex *, complex *, complex *, real *); integer ksize; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); /* Fortran I/O blocks */ @@ -526,14 +548,26 @@ static real c_b52 = 0.f; } if (combla_1.icase == 1) { /* .. CDOTC .. */ - q__1 = cdotc_(&combla_1.n, cx, &combla_1.incx, cy, & + +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + cdotc_(&q__1, +#else + q__1 = cdotc_( +#endif + &combla_1.n, cx, &combla_1.incx, cy, & combla_1.incy); cdot[0].r = q__1.r, cdot[0].i = q__1.i; ctest_(&c__1, cdot, &ct6[kn + (ki << 2) - 5], &csize1[kn - 1], sfac); } else if (combla_1.icase == 2) { /* .. CDOTU .. 
*/ - q__1 = cdotu_(&combla_1.n, cx, &combla_1.incx, cy, & + +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + cdotu_(&q__1, +#else + q__1 = cdotu_( +#endif + &combla_1.n, cx, &combla_1.incx, cy, & combla_1.incy); cdot[0].r = q__1.r, cdot[0].i = q__1.i; ctest_(&c__1, cdot, &ct7[kn + (ki << 2) - 5], &csize1[kn - 1], @@ -667,7 +701,7 @@ static real c_b52 = 0.f; sfac) { real scomp[1], strue[1]; - extern /* Subroutine */ int stest_(integer *, real *, real *, real *, + extern /* Subroutine */ int stest_(integer *, real *, real *, real *, real *); /* ************************* STEST1 ***************************** */ @@ -709,7 +743,7 @@ real sdiff_(real *sa, real *sb) return ret_val; } /* sdiff_ */ -/* Subroutine */ int ctest_(integer *len, complex *ccomp, complex *ctrue, +/* Subroutine */ int ctest_(integer *len, complex *ccomp, complex *ctrue, complex *csize, real *sfac) { /* System generated locals */ @@ -721,7 +755,7 @@ real sdiff_(real *sa, real *sb) /* Local variables */ integer i__; real scomp[20], ssize[20], strue[20]; - extern /* Subroutine */ int stest_(integer *, real *, real *, real *, + extern /* Subroutine */ int stest_(integer *, real *, real *, real *, real *); /* **************************** CTEST ***************************** */ diff --git a/blastest/src/cblat2.c b/blastest/src/cblat2.c index 2916a36a4e..08d215aee3 100644 --- a/blastest/src/cblat2.c +++ b/blastest/src/cblat2.c @@ -158,10 +158,15 @@ static logical c_false = FALSE_; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "cblat2"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*17] = "CGEMV " "CGBMV " "CHEMV " "CHBMV " "CHPMV " - "CTRMV " "CTBMV " "CTPMV " "CTRSV " "CTBSV " "CTPSV " "CGERC " + static char snames[6*17] = "CGEMV " "CGBMV " "CHEMV " "CHBMV " "CHPMV " + "CTRMV " "CTBMV " "CTPMV " "CTRSV " "CTBSV " "CTPSV " "CGERC " "CGERU " "CHER " "CHPR 
" "CHER2 " "CHPR2 "; /* Format strings */ @@ -209,10 +214,10 @@ static logical c_false = FALSE_; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -234,42 +239,42 @@ static logical c_false = FALSE_; integer ninc, nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, integer *, integer *, complex *, integer *, complex *, - integer *, integer *, integer *, integer *, complex *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, real *, ftnlen), cchk2_(char *, real *, - real *, integer *, integer *, logical *, logical *, logical *, - integer *, integer *, integer *, integer *, integer *, complex *, - integer *, complex *, integer *, integer *, integer *, integer *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, complex *, real *, ftnlen), - cchk3_(char *, real *, real *, integer *, integer *, logical *, - logical *, logical *, integer *, integer *, integer *, integer *, - integer *, integer *, integer *, integer *, complex *, complex *, - complex *, complex *, complex *, complex *, complex *, real *, - complex *, ftnlen), cchk4_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, complex *, integer *, 
integer *, integer *, integer *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, complex *, real *, complex *, - ftnlen), cchk5_(char *, real *, real *, integer *, integer *, - logical *, logical *, logical *, integer *, integer *, integer *, - complex *, integer *, integer *, integer *, integer *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, real *, complex *, ftnlen), - cchk6_(char *, real *, real *, integer *, integer *, logical *, - logical *, logical *, integer *, integer *, integer *, complex *, - integer *, integer *, integer *, integer *, complex *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, + extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, integer *, integer *, integer *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, real *, ftnlen), cchk2_(char *, real *, + real *, integer *, integer *, logical *, logical *, logical *, + integer *, integer *, integer *, integer *, integer *, complex *, + integer *, complex *, integer *, integer *, integer *, integer *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, real *, ftnlen), + cchk3_(char *, real *, real *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, integer *, + integer *, integer *, integer *, integer *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, real *, + complex *, ftnlen), cchk4_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, complex *, integer *, integer *, integer *, integer *, + complex *, complex *, 
complex *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, real *, complex *, + ftnlen), cchk5_(char *, real *, real *, integer *, integer *, + logical *, logical *, logical *, integer *, integer *, integer *, + complex *, integer *, integer *, integer *, integer *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, real *, complex *, ftnlen), + cchk6_(char *, real *, real *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, complex *, + integer *, integer *, integer *, integer *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, complex *, complex *, real *, complex *, ftnlen), cchke_(integer * , char *, integer *, ftnlen); logical fatal, trace; integer nidim; extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, + , integer *, complex *, real *, complex *, real *, real *, logical *, integer *, logical *, ftnlen); char snaps[32], trans[1]; integer isnum; @@ -618,7 +623,7 @@ static logical c_false = FALSE_; goto L80; } for (i__ = 1; i__ <= 17; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L70; } @@ -677,7 +682,7 @@ static logical c_false = FALSE_; /* YY holds the exact result. On exit from CMVCH YT holds */ /* the result computed by CMVCH. */ *(unsigned char *)trans = 'N'; - cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g, + cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g, yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1); same = lce_(yy, yt, &n); if (! 
same || err != 0.f) { @@ -690,7 +695,7 @@ static logical c_false = FALSE_; s_stop("", (ftnlen)0); } *(unsigned char *)trans = 'T'; - cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g, + cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g, yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1); same = lce_(yy, yt, &n); if (! same || err != 0.f) { @@ -751,44 +756,44 @@ static logical c_false = FALSE_; /* Test CGEMV, 01, and CGBMV, 02. */ L140: cchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test CHEMV, 03, CHBMV, 04, and CHPMV, 05. */ L150: cchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test CTRMV, 06, CTBMV, 07, CTPMV, 08, */ /* CTRSV, 09, CTBSV, 10, and CTPSV, 11. */ L160: cchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen) 6); goto L200; /* Test CGERC, 12, CGERU, 13. */ L170: cchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test CHER, 14, and CHPR, 15. 
*/ L180: cchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test CHER2, 16, and CHPR2, 17. */ L190: cchk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); L200: @@ -830,15 +835,20 @@ static logical c_false = FALSE_; /* End of CBLAT2. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ /* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * - nalf, complex *alf, integer *nbet, complex *bet, integer *ninc, + nalf, complex *alf, integer *nbet, complex *bet, integer *ninc, integer *inc, integer *nmax, integer *incmax, complex *a, complex *aa, - complex *as, complex *x, complex *xx, complex *xs, complex *y, + complex *as, complex *x, complex *xx, complex *xs, complex *y, complex *yy, complex *ys, complex *yt, real *g, ftnlen sname_len) { /* Initialized data */ @@ -867,7 +877,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9; alist al__1; @@ -887,26 +897,26 @@ static logical c_false = FALSE_; logical same; integer incx, incy; logical full, tran, 
null; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, integer *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; logical isame[13]; extern /* Subroutine */ int cgbmv_(char *, integer *, integer *, integer * , integer *, complex *, complex *, integer *, complex *, integer * - , complex *, complex *, integer *, ftnlen), cgemv_(char *, - integer *, integer *, complex *, complex *, integer *, complex *, + , complex *, complex *, integer *, ftnlen), cgemv_(char *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen), cmvch_(char * , integer *, integer *, complex *, complex *, integer *, complex * - , integer *, complex *, complex *, integer *, complex *, real *, - complex *, real *, real *, logical *, integer *, logical *, + , integer *, complex *, complex *, integer *, complex *, real *, + complex *, real *, real *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; integer incxs, incys; char trans[1]; logical banded; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -1089,9 +1099,9 @@ static logical c_false = FALSE_; transl.r = 0.f, transl.i = 0.f; i__7 = abs(incy); i__8 = ml - 1; - cmake_("GE", " ", " ", &c__1, &ml, &y[1], + cmake_("GE", " ", " ", &c__1, &ml, &y[1], &c__1, &yy[1], &i__7, &c__0, & - i__8, &reset, &transl, (ftnlen)2, + i__8, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1099,7 +1109,7 @@ static logical c_false = FALSE_; /* Save every datum before calling the */ /* subroutine. 
*/ - *(unsigned char *)transs = *(unsigned + *(unsigned char *)transs = *(unsigned char *)trans; ms = m; ns = n; @@ -1110,7 +1120,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - as[i__8].r = aa[i__9].r, as[i__8].i = + as[i__8].r = aa[i__9].r, as[i__8].i = aa[i__9].i; /* L10: */ } @@ -1119,7 +1129,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - xs[i__8].r = xx[i__9].r, xs[i__8].i = + xs[i__8].r = xx[i__9].r, xs[i__8].i = xx[i__9].i; /* L20: */ } @@ -1129,7 +1139,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - ys[i__8].r = yy[i__9].r, ys[i__8].i = + ys[i__8].r = yy[i__9].r, ys[i__8].i = yy[i__9].i; /* L30: */ } @@ -1166,7 +1176,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - cgemv_(trans, &m, &n, &alpha, &aa[1], + cgemv_(trans, &m, &n, &alpha, &aa[1], &lda, &xx[1], &incx, &beta, & yy[1], &incy, (ftnlen)1); } else if (banded) { @@ -1225,7 +1235,7 @@ static logical c_false = FALSE_; isame[1] = ms == m; isame[2] = ns == n; if (full) { - isame[3] = als.r == alpha.r && als.i + isame[3] = als.r == alpha.r && als.i == alpha.i; isame[4] = lce_(&as[1], &aa[1], &laa); isame[5] = ldas == lda; @@ -1247,13 +1257,13 @@ static logical c_false = FALSE_; } else if (banded) { isame[3] = kls == kl; isame[4] = kus == ku; - isame[5] = als.r == alpha.r && als.i + isame[5] = als.r == alpha.r && als.i == alpha.i; isame[6] = lce_(&as[1], &aa[1], &laa); isame[7] = ldas == lda; isame[8] = lce_(&xs[1], &xx[1], &lx); isame[9] = incxs == incx; - isame[10] = bls.r == beta.r && bls.i + isame[10] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[11] = lce_(&ys[1], &yy[1], & @@ -1295,8 +1305,8 @@ static logical c_false = FALSE_; cmvch_(trans, &m, &n, &alpha, &a[ a_offset], nmax, &x[1], &incx, - &beta, &y[1], &incy, &yt[1], - &g[1], &yy[1], eps, &err, + &beta, &y[1], &incy, &yt[1], + &g[1], &yy[1], eps, &err, 
fatal, nout, &c_true, (ftnlen) 1); errmax = max(errmax,err); @@ -1401,11 +1411,11 @@ static logical c_false = FALSE_; } /* cchk1_ */ /* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * - nalf, complex *alf, integer *nbet, complex *bet, integer *ninc, + nalf, complex *alf, integer *nbet, complex *bet, integer *ninc, integer *inc, integer *nmax, integer *incmax, complex *a, complex *aa, - complex *as, complex *x, complex *xx, complex *xs, complex *y, + complex *as, complex *x, complex *xx, complex *xs, complex *y, complex *yy, complex *ys, complex *yt, real *g, ftnlen sname_len) { /* Initialized data */ @@ -1438,7 +1448,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9; alist al__1; @@ -1447,7 +1457,7 @@ static logical c_false = FALSE_; f_rew(alist *); /* Local variables */ - integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, + integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, laa, lda; extern logical lce_(complex *, complex *, integer *); complex als, bls; @@ -1458,18 +1468,18 @@ static logical c_false = FALSE_; integer incx, incy; logical full, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, integer *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; logical isame[13]; extern /* Subroutine */ int chbmv_(char *, integer *, integer *, complex * , complex 
*, integer *, complex *, integer *, complex *, complex * - , integer *, ftnlen), chemv_(char *, integer *, complex *, - complex *, integer *, complex *, integer *, complex *, complex *, + , integer *, ftnlen), chemv_(char *, integer *, complex *, + complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen), cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, + , integer *, complex *, real *, complex *, real *, real *, logical *, integer *, logical *, ftnlen); integer nargs; extern /* Subroutine */ int chpmv_(char *, integer *, complex *, complex * @@ -1478,7 +1488,7 @@ static logical c_false = FALSE_; integer incxs, incys; char uplos[1]; logical banded, packed; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -1643,7 +1653,7 @@ static logical c_false = FALSE_; i__8 = n - 1; cmake_("GE", " ", " ", &c__1, &n, &y[1], & c__1, &yy[1], &i__7, &c__0, &i__8, & - reset, &transl, (ftnlen)2, (ftnlen)1, + reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1795,13 +1805,13 @@ static logical c_false = FALSE_; unsigned char *)uplos; isame[1] = ns == n; if (full) { - isame[2] = als.r == alpha.r && als.i == + isame[2] = als.r == alpha.r && als.i == alpha.i; isame[3] = lce_(&as[1], &aa[1], &laa); isame[4] = ldas == lda; isame[5] = lce_(&xs[1], &xx[1], &lx); isame[6] = incxs == incx; - isame[7] = bls.r == beta.r && bls.i == + isame[7] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[8] = lce_(&ys[1], &yy[1], &ly); @@ -1814,13 +1824,13 @@ static logical c_false = FALSE_; isame[9] = incys == incy; } else if (banded) { isame[2] = ks == k; - isame[3] = als.r == alpha.r && als.i == + isame[3] = als.r == alpha.r && als.i == alpha.i; isame[4] = lce_(&as[1], 
&aa[1], &laa); isame[5] = ldas == lda; isame[6] = lce_(&xs[1], &xx[1], &lx); isame[7] = incxs == incx; - isame[8] = bls.r == beta.r && bls.i == + isame[8] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[9] = lce_(&ys[1], &yy[1], &ly); @@ -1832,12 +1842,12 @@ static logical c_false = FALSE_; } isame[10] = incys == incy; } else if (packed) { - isame[2] = als.r == alpha.r && als.i == + isame[2] = als.r == alpha.r && als.i == alpha.i; isame[3] = lce_(&as[1], &aa[1], &laa); isame[4] = lce_(&xs[1], &xx[1], &lx); isame[5] = incxs == incx; - isame[6] = bls.r == beta.r && bls.i == + isame[6] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[7] = lce_(&ys[1], &yy[1], &ly); @@ -1875,8 +1885,8 @@ static logical c_false = FALSE_; /* Check the result. */ - cmvch_("N", &n, &n, &alpha, &a[a_offset], - nmax, &x[1], &incx, &beta, &y[1], + cmvch_("N", &n, &n, &alpha, &a[a_offset], + nmax, &x[1], &incx, &beta, &y[1], &incy, &yt[1], &g[1], &yy[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); @@ -1987,10 +1997,10 @@ static logical c_false = FALSE_; } /* cchk2_ */ /* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * - ninc, integer *inc, integer *nmax, integer *incmax, complex *a, - complex *aa, complex *as, complex *x, complex *xx, complex *xs, + ninc, integer *inc, integer *nmax, integer *incmax, complex *a, + complex *aa, complex *as, complex *x, complex *xx, complex *xs, complex *xt, real *g, complex *z__, ftnlen sname_len) { /* Initialized data */ @@ -2040,36 +2050,36 @@ static logical c_false = FALSE_; integer incx; logical full, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, integer *, + extern /* Subroutine */ int cmake_(char *, char *, 
char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); char diags[1]; logical isame[13]; extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, + , integer *, complex *, real *, complex *, real *, real *, logical *, integer *, logical *, ftnlen); integer nargs; - extern /* Subroutine */ int ctbmv_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, ftnlen, - ftnlen, ftnlen), ctbsv_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, ftnlen, + extern /* Subroutine */ int ctbmv_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, ftnlen, + ftnlen, ftnlen), ctbsv_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen); logical reset; integer incxs; char trans[1]; - extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *, complex *, complex *, integer *, ftnlen, ftnlen, ftnlen), ctrmv_( char *, char *, char *, integer *, complex *, integer *, complex * - , integer *, ftnlen, ftnlen, ftnlen), ctpsv_(char *, char *, char - *, integer *, complex *, complex *, integer *, ftnlen, ftnlen, + , integer *, ftnlen, ftnlen, ftnlen), ctpsv_(char *, char *, char + *, integer *, complex *, complex *, integer *, ftnlen, ftnlen, ftnlen); char uplos[1]; - extern /* Subroutine */ int ctrsv_(char *, char *, char *, integer *, - complex *, integer *, complex *, integer *, ftnlen, ftnlen, + extern /* Subroutine */ int ctrsv_(char *, char *, char *, integer *, + complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen); logical banded, packed; - extern logical lceres_(char *, char *, 
integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -2197,13 +2207,13 @@ static logical c_false = FALSE_; ; for (icd = 1; icd <= 2; ++icd) { - *(unsigned char *)diag = *(unsigned char *)&ichd[icd + *(unsigned char *)diag = *(unsigned char *)&ichd[icd - 1]; /* Generate the matrix A. */ transl.r = 0.f, transl.i = 0.f; - cmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], + cmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], nmax, &aa[1], &lda, &k, &k, &reset, &transl, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2258,7 +2268,7 @@ static logical c_false = FALSE_; /* Call the subroutine. */ - if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) + if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) == 0) { if (full) { if (*trace) { @@ -2311,7 +2321,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - ctbmv_(uplo, trans, diag, &n, &k, &aa[1], + ctbmv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2392,7 +2402,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - ctbsv_(uplo, trans, diag, &n, &k, &aa[1], + ctbsv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2434,11 +2444,11 @@ static logical c_false = FALSE_; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplo == *(unsigned + isame[0] = *(unsigned char *)uplo == *(unsigned char *)uplos; - isame[1] = *(unsigned char *)trans == *(unsigned + isame[1] = *(unsigned char *)trans == *(unsigned char *)transs; - isame[2] = *(unsigned char *)diag == *(unsigned + isame[2] = *(unsigned char *)diag == *(unsigned char *)diags; isame[3] = ns == n; if (full) { @@ -2508,7 +2518,7 @@ static logical c_false = FALSE_; cmvch_(trans, &n, &n, &c_b2, &a[a_offset], nmax, &x[1], &incx, &c_b1, &z__[ - 1], &incx, &xt[1], &g[1], &xx[1], + 1], &incx, &xt[1], &g[1], &xx[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); } else if (s_cmp(sname + 3, "SV", (ftnlen)2, ( @@ -2520,18 +2530,18 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__4; ++i__) { i__5 = i__; i__6 = (i__ - 1) * abs(incx) + 1; - z__[i__5].r = xx[i__6].r, z__[i__5].i + z__[i__5].r = xx[i__6].r, z__[i__5].i = xx[i__6].i; i__5 = (i__ - 1) * abs(incx) + 1; i__6 = i__; - xx[i__5].r = x[i__6].r, xx[i__5].i = + xx[i__5].r = x[i__6].r, xx[i__5].i = x[i__6].i; /* L50: */ } cmvch_(trans, &n, &n, &c_b2, &a[a_offset], nmax, &z__[1], &incx, &c_b1, &x[ - 1], &incx, &xt[1], &g[1], &xx[1], - eps, &err, fatal, nout, &c_false, + 1], &incx, &xt[1], &g[1], &xx[1], + eps, &err, fatal, nout, &c_false, (ftnlen)1); } errmax = max(errmax,err); @@ -2634,10 +2644,10 @@ static logical c_false = FALSE_; } /* cchk3_ */ /* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * - ninc, integer *inc, integer *nmax, integer *incmax, complex *a, - complex *aa, complex *as, complex *x, complex *xx, complex *xs, + ninc, integer *inc, integer *nmax, integer *incmax, complex *a, + complex *aa, complex *as, complex *x, complex *xx, complex *xs, complex *y, complex *yy, complex *ys, complex *yt, real *g, 
complex * z__, ftnlen sname_len) { @@ -2681,23 +2691,23 @@ static logical c_false = FALSE_; logical same, conj; integer incx, incy; logical null; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, integer *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen), cgerc_( - integer *, integer *, complex *, complex *, integer *, complex *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; logical isame[13]; extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, - logical *, integer *, logical *, ftnlen), cgeru_(integer *, - integer *, complex *, complex *, integer *, complex *, integer *, + , integer *, complex *, real *, complex *, real *, real *, + logical *, integer *, logical *, ftnlen), cgeru_(integer *, + integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); integer nargs; logical reset; integer incxs, incys; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -2801,7 +2811,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = m - 1; cmake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (m > 1) { i__3 = m / 2; @@ -2840,7 +2850,7 @@ static logical c_false = FALSE_; transl.r = 0.f, transl.i = 0.f; i__5 = m - 1; i__6 = n - 1; - cmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], + cmake_(sname + 1, " 
", " ", &m, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2999,9 +3009,9 @@ static logical c_false = FALSE_; r_cnjg(&q__1, w); w[0].r = q__1.r, w[0].i = q__1.i; } - cmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, + cmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, w, &c__1, &c_b2, &a[j * a_dim1 + 1], & - c__1, &yt[1], &g[1], &aa[(j - 1) * + c__1, &yt[1], &g[1], &aa[(j - 1) * lda + 1], eps, &err, fatal, nout, & c_true, (ftnlen)1); errmax = max(errmax,err); @@ -3082,10 +3092,10 @@ static logical c_false = FALSE_; } /* cchk4_ */ /* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * - ninc, integer *inc, integer *nmax, integer *incmax, complex *a, - complex *aa, complex *as, complex *x, complex *xx, complex *xs, + ninc, integer *inc, integer *nmax, integer *incmax, complex *a, + complex *aa, complex *as, complex *x, complex *xx, complex *xs, complex *y, complex *yy, complex *ys, complex *yt, real *g, complex * z__, ftnlen sname_len) { @@ -3130,24 +3140,24 @@ static logical c_false = FALSE_; integer ia, ja, ic, nc, jj, lj, in, ix, ns, lx, laa, lda; extern logical lce_(complex *, complex *, integer *); real err; - extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, + extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, integer *, complex *, integer *, ftnlen); integer ldas; logical same; - extern /* Subroutine */ int chpr_(char *, integer *, real *, complex *, + extern /* Subroutine */ int chpr_(char *, integer *, real *, complex *, integer *, complex *, ftnlen); real rals; integer incx; logical full, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, 
integer *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; logical isame[13]; extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, + , integer *, complex *, real *, complex *, real *, real *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -3156,7 +3166,7 @@ static logical c_false = FALSE_; char uplos[1]; logical packed; real ralpha; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -3261,7 +3271,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; cmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { i__3 = n / 2; @@ -3336,7 +3346,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - cher_(uplo, &n, &ralpha, &xx[1], &incx, &aa[1], &lda, + cher_(uplo, &n, &ralpha, &xx[1], &incx, &aa[1], &lda, (ftnlen)1); } else if (packed) { if (*trace) { @@ -3446,9 +3456,9 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - cmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, - &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, - &yt[1], &g[1], &aa[ja], eps, &err, fatal, + cmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, + &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, + &yt[1], &g[1], &aa[ja], eps, &err, fatal, nout, &c_true, (ftnlen)1); if (full) { if (upper) { @@ -3547,10 +3557,10 @@ static logical c_false = FALSE_; } /* cchk5_ */ /* Subroutine */ int cchk6_(char *sname, real *eps, real *thresh, 
integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * - ninc, integer *inc, integer *nmax, integer *incmax, complex *a, - complex *aa, complex *as, complex *x, complex *xx, complex *xs, + ninc, integer *inc, integer *nmax, integer *incmax, complex *a, + complex *aa, complex *as, complex *x, complex *xx, complex *xs, complex *y, complex *yy, complex *ys, complex *yt, real *g, complex * z__, ftnlen sname_len) { @@ -3580,7 +3590,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, + integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; complex q__1, q__2, q__3; alist al__1; @@ -3603,17 +3613,17 @@ static logical c_false = FALSE_; logical full, null; char uplo[1]; extern /* Subroutine */ int cher2_(char *, integer *, complex *, complex * - , integer *, complex *, integer *, complex *, integer *, ftnlen), - chpr2_(char *, integer *, complex *, complex *, integer *, - complex *, integer *, complex *, ftnlen), cmake_(char *, char *, - char *, integer *, integer *, complex *, integer *, complex *, - integer *, integer *, integer *, logical *, complex *, ftnlen, + , integer *, complex *, integer *, complex *, integer *, ftnlen), + chpr2_(char *, integer *, complex *, complex *, integer *, + complex *, integer *, complex *, ftnlen), cmake_(char *, char *, + char *, integer *, integer *, complex *, integer *, complex *, + integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; logical isame[13]; extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, + , integer *, complex *, real *, complex *, 
real *, real *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -3621,7 +3631,7 @@ static logical c_false = FALSE_; logical upper; char uplos[1]; logical packed; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -3728,7 +3738,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; cmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { i__3 = n / 2; @@ -3768,7 +3778,7 @@ static logical c_false = FALSE_; transl.r = 0.f, transl.i = 0.f; i__5 = n - 1; i__6 = n - 1; - cmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], + cmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -3956,14 +3966,14 @@ static logical c_false = FALSE_; i__5 = n; for (j = 1; j <= i__5; ++j) { r_cnjg(&q__2, &z__[j + (z_dim1 << 1)]); - q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, - q__1.i = alpha.r * q__2.i + alpha.i * + q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, + q__1.i = alpha.r * q__2.i + alpha.i * q__2.r; w[0].r = q__1.r, w[0].i = q__1.i; r_cnjg(&q__2, &alpha); r_cnjg(&q__3, &z__[j + z_dim1]); - q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, - q__1.i = q__2.r * q__3.i + q__2.i * + q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, + q__1.i = q__2.r * q__3.i + q__2.i * q__3.r; w[1].r = q__1.r, w[1].i = q__1.i; if (upper) { @@ -3973,8 +3983,8 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - cmvch_("N", &lj, &c__2, &c_b2, &z__[jj + - z_dim1], nmax, w, &c__1, &c_b2, &a[jj + cmvch_("N", &lj, &c__2, &c_b2, &z__[jj + + z_dim1], nmax, w, &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, &yt[1], &g[1], & aa[ja], eps, &err, fatal, nout, & c_true, (ftnlen)1); @@ -4079,7 
+4089,7 @@ static logical c_false = FALSE_; } /* cchk6_ */ -/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -4093,40 +4103,40 @@ static logical c_false = FALSE_; /* Local variables */ complex a[1] /* was [1][1] */, x[1], y[1], beta; - extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, + extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, integer *, complex *, integer *, ftnlen), chpr_(char *, integer *, - real *, complex *, integer *, complex *, ftnlen), cher2_(char *, - integer *, complex *, complex *, integer *, complex *, integer *, + real *, complex *, integer *, complex *, ftnlen), cher2_(char *, + integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, ftnlen), chpr2_(char *, integer *, complex * - , complex *, integer *, complex *, integer *, complex *, ftnlen), - cgerc_(integer *, integer *, complex *, complex *, integer *, + , complex *, integer *, complex *, integer *, complex *, ftnlen), + cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; extern /* Subroutine */ int cgbmv_(char *, integer *, integer *, integer * , integer *, complex *, complex *, integer *, complex *, integer * - , complex *, complex *, integer *, ftnlen), chbmv_(char *, - integer *, integer *, complex *, complex *, integer *, complex *, + , complex *, complex *, integer *, ftnlen), chbmv_(char *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen), cgemv_(char * , integer *, integer *, complex *, complex *, integer *, complex * , integer *, complex *, complex *, integer *, ftnlen), chemv_( - char *, integer *, complex *, complex *, integer *, complex *, + char *, integer *, complex *, complex *, integer *, complex *, integer *, complex 
*, complex *, integer *, ftnlen), cgeru_( - integer *, integer *, complex *, complex *, integer *, complex *, - integer *, complex *, integer *), ctbmv_(char *, char *, char *, - integer *, integer *, complex *, integer *, complex *, integer *, - ftnlen, ftnlen, ftnlen), chpmv_(char *, integer *, complex *, - complex *, complex *, integer *, complex *, complex *, integer *, - ftnlen), ctbsv_(char *, char *, char *, integer *, integer *, - complex *, integer *, complex *, integer *, ftnlen, ftnlen, - ftnlen), ctpmv_(char *, char *, char *, integer *, complex *, - complex *, integer *, ftnlen, ftnlen, ftnlen), ctrmv_(char *, - char *, char *, integer *, complex *, integer *, complex *, + integer *, integer *, complex *, complex *, integer *, complex *, + integer *, complex *, integer *), ctbmv_(char *, char *, char *, + integer *, integer *, complex *, integer *, complex *, integer *, + ftnlen, ftnlen, ftnlen), chpmv_(char *, integer *, complex *, + complex *, complex *, integer *, complex *, complex *, integer *, + ftnlen), ctbsv_(char *, char *, char *, integer *, integer *, + complex *, integer *, complex *, integer *, ftnlen, ftnlen, + ftnlen), ctpmv_(char *, char *, char *, integer *, complex *, + complex *, integer *, ftnlen, ftnlen, ftnlen), ctrmv_(char *, + char *, char *, integer *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen), ctpsv_(char *, char *, char *, - integer *, complex *, complex *, integer *, ftnlen, ftnlen, - ftnlen), ctrsv_(char *, char *, char *, integer *, complex *, + integer *, complex *, complex *, integer *, ftnlen, ftnlen, + ftnlen), ctrsv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen); real ralpha; - extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical + extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -4655,9 +4665,9 @@ static logical c_false = FALSE_; } /* 
cchke_ */ -/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, - integer *n, complex *a, integer *nmax, complex *aa, integer *lda, - integer *kl, integer *ku, logical *reset, complex *transl, ftnlen +/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, complex *a, integer *nmax, complex *aa, integer *lda, + integer *kl, integer *ku, logical *reset, complex *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4718,7 +4728,7 @@ static logical c_false = FALSE_; i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { if (gen || upper && i__ <= j || lower && i__ >= j) { - if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) + if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) { i__3 = i__ + j * a_dim1; cbeg_(&q__2, reset); @@ -4953,8 +4963,8 @@ static logical c_false = FALSE_; /* Subroutine */ int cmvch_(char *trans, integer *m, integer *n, complex * alpha, complex *a, integer *nmax, complex *x, integer *incx, complex * - beta, complex *y, integer *incy, complex *yt, real *g, complex *yy, - real *eps, real *err, logical *fatal, integer *nout, logical *mv, + beta, complex *y, integer *incy, complex *yt, real *g, complex *yy, + real *eps, real *err, logical *fatal, integer *nout, logical *mv, ftnlen trans_len) { /* Format strings */ @@ -5057,15 +5067,15 @@ static logical c_false = FALSE_; i__4 = iy; i__5 = j + i__ * a_dim1; i__6 = jx; - q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, + q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, q__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6] .r; q__1.r = yt[i__4].r + q__2.r, q__1.i = yt[i__4].i + q__2.i; yt[i__3].r = q__1.r, yt[i__3].i = q__1.i; i__3 = j + i__ * a_dim1; i__4 = jx; - g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j - + i__ * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, + g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j + + i__ * a_dim1]), abs(r__2))) * 
((r__3 = x[i__4].r, abs(r__3)) + (r__4 = r_imag(&x[jx]), abs(r__4))); jx += incxl; /* L10: */ @@ -5077,14 +5087,14 @@ static logical c_false = FALSE_; i__4 = iy; r_cnjg(&q__3, &a[j + i__ * a_dim1]); i__5 = jx; - q__2.r = q__3.r * x[i__5].r - q__3.i * x[i__5].i, q__2.i = + q__2.r = q__3.r * x[i__5].r - q__3.i * x[i__5].i, q__2.i = q__3.r * x[i__5].i + q__3.i * x[i__5].r; q__1.r = yt[i__4].r + q__2.r, q__1.i = yt[i__4].i + q__2.i; yt[i__3].r = q__1.r, yt[i__3].i = q__1.i; i__3 = j + i__ * a_dim1; i__4 = jx; - g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j - + i__ * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, + g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j + + i__ * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, abs(r__3)) + (r__4 = r_imag(&x[jx]), abs(r__4))); jx += incxl; /* L20: */ @@ -5096,7 +5106,7 @@ static logical c_false = FALSE_; i__4 = iy; i__5 = i__ + j * a_dim1; i__6 = jx; - q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, + q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, q__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6] .r; q__1.r = yt[i__4].r + q__2.r, q__1.i = yt[i__4].i + q__2.i; @@ -5104,7 +5114,7 @@ static logical c_false = FALSE_; i__3 = i__ + j * a_dim1; i__4 = jx; g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[ - i__ + j * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, + i__ + j * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, abs(r__3)) + (r__4 = r_imag(&x[jx]), abs(r__4))); jx += incxl; /* L30: */ @@ -5112,7 +5122,7 @@ static logical c_false = FALSE_; } i__2 = iy; i__3 = iy; - q__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, q__2.i = + q__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, q__2.i = alpha->r * yt[i__3].i + alpha->i * yt[i__3].r; i__4 = iy; q__3.r = beta->r * y[i__4].r - beta->i * y[i__4].i, q__3.i = beta->r * @@ -5121,7 +5131,7 @@ static logical c_false = FALSE_; yt[i__2].r = q__1.r, yt[i__2].i = q__1.i; i__2 = iy; g[iy] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = 
r_imag(alpha), abs( - r__2))) * g[iy] + ((r__3 = beta->r, abs(r__3)) + (r__4 = + r__2))) * g[iy] + ((r__3 = beta->r, abs(r__3)) + (r__4 = r_imag(beta), abs(r__4))) * ((r__5 = y[i__2].r, abs(r__5)) + ( r__6 = r_imag(&y[iy]), abs(r__6))); iy += incyl; @@ -5410,7 +5420,7 @@ real sdiff_(real *x, real *y) } /* sdiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/cblat3.c b/blastest/src/cblat3.c index a5b870f0f3..e3d5e32a3c 100644 --- a/blastest/src/cblat3.c +++ b/blastest/src/cblat3.c @@ -140,9 +140,14 @@ static integer c_n1 = -1; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "cblat3"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*9] = "CGEMM " "CHEMM " "CSYMM " "CTRMM " "CTRSM " + static char snames[6*9] = "CGEMM " "CHEMM " "CSYMM " "CTRMM " "CTRSM " "CHERK " "CSYRK " "CHER2K" "CSYR2K"; /* Format strings */ @@ -186,10 +191,10 @@ static integer c_n1 = -1; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -209,34 +214,34 @@ static integer c_n1 = -1; integer nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int cchk1_(char *, real *, real *, 
integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, complex *, integer *, complex *, integer *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, real *, ftnlen), cchk2_(char *, - real *, real *, integer *, integer *, logical *, logical *, - logical *, integer *, integer *, integer *, complex *, integer *, - complex *, integer *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, - real *, ftnlen), cchk3_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, complex *, integer *, complex *, complex *, complex *, - complex *, complex *, complex *, complex *, real *, complex *, - ftnlen), cchk4_(char *, real *, real *, integer *, integer *, - logical *, logical *, logical *, integer *, integer *, integer *, - complex *, integer *, complex *, integer *, complex *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, real *, ftnlen), cchk5_(char *, real *, - real *, integer *, integer *, logical *, logical *, logical *, - integer *, integer *, integer *, complex *, integer *, complex *, - integer *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, complex *, real *, complex *, + extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, real *, ftnlen), cchk2_(char *, + real *, real *, integer *, integer *, logical *, logical *, + logical *, integer *, integer *, integer *, complex *, integer *, + complex *, integer *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, 
complex *, complex *, + real *, ftnlen), cchk3_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, complex *, integer *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, real *, complex *, + ftnlen), cchk4_(char *, real *, real *, integer *, integer *, + logical *, logical *, logical *, integer *, integer *, integer *, + complex *, integer *, complex *, integer *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, real *, ftnlen), cchk5_(char *, real *, + real *, integer *, integer *, logical *, logical *, logical *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, real *, complex *, ftnlen), cchke_(integer *, char *, integer *, ftnlen); logical fatal; - extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, complex *, real *, complex *, - integer *, real *, real *, logical *, integer *, logical *, + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); logical trace; integer nidim; @@ -508,7 +513,7 @@ static integer c_n1 = -1; goto L60; } for (i__ = 1; i__ <= 9; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L50; } @@ -571,7 +576,7 @@ static integer c_n1 = -1; *(unsigned char *)transa = 'N'; *(unsigned char *)transb = 'N'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, 
ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lce_(cc, ct, &n); if (! same || err != 0.f) { @@ -586,7 +591,7 @@ static integer c_n1 = -1; } *(unsigned char *)transb = 'C'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lce_(cc, ct, &n); if (! same || err != 0.f) { @@ -619,7 +624,7 @@ static integer c_n1 = -1; *(unsigned char *)transa = 'C'; *(unsigned char *)transb = 'N'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lce_(cc, ct, &n); if (! same || err != 0.f) { @@ -634,7 +639,7 @@ static integer c_n1 = -1; } *(unsigned char *)transb = 'C'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lce_(cc, ct, &n); if (! same || err != 0.f) { @@ -688,34 +693,34 @@ static integer c_n1 = -1; /* Test CGEMM, 01. */ L140: cchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test CHEMM, 02, CSYMM, 03. 
*/ L150: cchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test CTRMM, 04, CTRSM, 05. */ L160: cchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6); goto L190; /* Test CHERK, 06, CSYRK, 07. */ L170: cchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test CHER2K, 08, CSYR2K, 09. */ L180: cchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, ( ftnlen)6); goto L190; @@ -759,14 +764,19 @@ static integer c_n1 = -1; /* End of CBLAT3. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ /* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * - as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, complex *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -791,7 +801,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; alist al__1; @@ -800,7 +810,7 @@ static integer c_n1 = -1; f_rew(alist *); /* Local variables */ - integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, + integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, ica, icb, laa, lbb, lda, lcc, ldb, ldc; extern logical lce_(complex *, complex *, integer *); complex als, bls; @@ -808,21 +818,21 @@ static integer c_n1 = -1; complex beta; integer ldas, ldbs, ldcs; logical same, null; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, logical *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, ftnlen, ftnlen), cmmch_(char *, - char *, integer *, integer *, integer *, complex *, complex *, - integer *, complex *, integer *, 
complex *, complex *, integer *, + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, ftnlen, ftnlen), cmmch_(char *, + char *, integer *, integer *, integer *, complex *, complex *, + integer *, complex *, integer *, complex *, complex *, integer *, complex *, real *, complex *, integer *, real *, real *, logical * , integer *, logical *, ftnlen, ftnlen); logical isame[13], trana, tranb; integer nargs; logical reset; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); char tranas[1], tranbs[1], transa[1], transb[1]; real errmax; @@ -915,7 +925,7 @@ static integer c_n1 = -1; for (ica = 1; ica <= 3; ++ica) { *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] ; - trana = *(unsigned char *)transa == 'T' || *(unsigned + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; if (trana) { @@ -943,9 +953,9 @@ static integer c_n1 = -1; ftnlen)1); for (icb = 1; icb <= 3; ++icb) { - *(unsigned char *)transb = *(unsigned char *)&ich[icb + *(unsigned char *)transb = *(unsigned char *)&ich[icb - 1]; - tranb = *(unsigned char *)transb == 'T' || *(unsigned + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; if (tranb) { @@ -1086,13 +1096,13 @@ static integer c_n1 = -1; isame[2] = ms == m; isame[3] = ns == n; isame[4] = ks == k; - isame[5] = als.r == alpha.r && als.i == + isame[5] = als.r == alpha.r && als.i == alpha.i; isame[6] = lce_(&as[1], &aa[1], &laa); isame[7] = ldas == lda; isame[8] = lce_(&bs[1], &bb[1], &lbb); isame[9] = ldbs == ldb; - isame[10] = bls.r == beta.r && bls.i == + isame[10] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[11] = lce_(&cs[1], &cc[1], &lcc); @@ -1130,9 +1140,9 @@ static integer c_n1 = -1; cmmch_(transa, transb, &m, &n, &k, 
&alpha, &a[a_offset], nmax, &b[b_offset], - nmax, &beta, &c__[c_offset], + nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, - eps, &err, fatal, nout, &c_true, + eps, &err, fatal, nout, &c_true, (ftnlen)1, (ftnlen)1); errmax = max(errmax,err); /* If got really bad answer, report and */ @@ -1214,10 +1224,10 @@ static integer c_n1 = -1; } /* cchk1_ */ /* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * - as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, complex *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -1243,7 +1253,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; alist al__1; @@ -1252,7 +1262,7 @@ static integer c_n1 = -1; integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *); /* Local variables */ - integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, + integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, ldb, ldc; extern logical lce_(complex *, complex *, integer *); integer ics; @@ -1265,26 +1275,26 @@ static integer c_n1 = -1; char side[1]; logical conj, left, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, logical *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); 
complex alpha; - extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, complex *, real *, complex *, - integer *, real *, real *, logical *, integer *, logical *, - ftnlen, ftnlen), chemm_(char *, char *, integer *, integer *, - complex *, complex *, integer *, complex *, integer *, complex *, + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *, + ftnlen, ftnlen), chemm_(char *, char *, integer *, integer *, + complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); logical isame[13]; char sides[1]; integer nargs; logical reset; - extern /* Subroutine */ int csymm_(char *, char *, integer *, integer *, - complex *, complex *, integer *, complex *, integer *, complex *, + extern /* Subroutine */ int csymm_(char *, char *, integer *, integer *, + complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); char uplos[1]; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; @@ -1426,7 +1436,7 @@ static integer c_n1 = -1; /* Generate the matrix C. */ - cmake_("GE", " ", " ", &m, &n, &c__[c_offset], + cmake_("GE", " ", " ", &m, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b1, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1522,9 +1532,9 @@ static integer c_n1 = -1; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)sides == *(unsigned + isame[0] = *(unsigned char *)sides == *(unsigned char *)side; - isame[1] = *(unsigned char *)uplos == *(unsigned + isame[1] = *(unsigned char *)uplos == *(unsigned char *)uplo; isame[2] = ms == m; isame[3] = ns == n; @@ -1569,14 +1579,14 @@ static integer c_n1 = -1; if (left) { cmmch_("N", "N", &m, &n, &m, &alpha, &a[ - a_offset], nmax, &b[b_offset], + a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { cmmch_("N", "N", &m, &n, &n, &alpha, &b[ - b_offset], nmax, &a[a_offset], + b_offset], nmax, &a[a_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( @@ -1657,9 +1667,9 @@ static integer c_n1 = -1; } /* cchk2_ */ /* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * - nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, + nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, complex *bs, complex *ct, real *g, complex *c__, ftnlen sname_len) { /* Initialized data */ @@ -1686,7 +1696,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; complex q__1; alist al__1; @@ -1708,27 +1718,27 @@ static integer c_n1 = -1; char side[1]; logical left, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, logical *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, 
integer *, complex *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; char diags[1]; - extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, complex *, real *, complex *, - integer *, real *, real *, logical *, integer *, logical *, + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); logical isame[13]; char sides[1]; integer nargs; logical reset; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, - integer *, integer *, complex *, complex *, integer *, complex *, + extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), ctrsm_(char *, char *, - char *, char *, integer *, integer *, complex *, complex *, + char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen); char uplos[1]; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); char tranas[1], transa[1]; real errmax; @@ -1867,7 +1877,7 @@ static integer c_n1 = -1; /* Generate the matrix B. 
*/ - cmake_("GE", " ", " ", &m, &n, &b[b_offset], + cmake_("GE", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &reset, &c_b1, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1939,7 +1949,7 @@ static integer c_n1 = -1; } ctrmm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } else if (s_cmp(sname + 3, "SM", (ftnlen)2, ( ftnlen)2) == 0) { @@ -1972,7 +1982,7 @@ static integer c_n1 = -1; } ctrsm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } @@ -1998,7 +2008,7 @@ static integer c_n1 = -1; unsigned char *)diag; isame[4] = ms == m; isame[5] = ns == n; - isame[6] = als.r == alpha.r && als.i == + isame[6] = als.r == alpha.r && als.i == alpha.i; isame[7] = lce_(&as[1], &aa[1], &laa); isame[8] = ldas == lda; @@ -2042,18 +2052,18 @@ static integer c_n1 = -1; cmmch_(transa, "N", &m, &n, &m, & alpha, &a[a_offset], nmax, &b[b_offset], nmax, & - c_b1, &c__[c_offset], + c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { cmmch_("N", transa, &m, &n, &n, & alpha, &b[b_offset], nmax, &a[a_offset], nmax, & - c_b1, &c__[c_offset], + c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } @@ -2066,14 +2076,14 @@ static integer c_n1 = -1; i__4 = n; for (j = 1; j <= i__4; ++j) { i__5 = m; - for (i__ = 1; i__ <= i__5; ++i__) + for (i__ = 1; i__ <= i__5; ++i__) { i__6 = i__ + j * c_dim1; i__7 = i__ + (j - 1) * ldb; c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; i__6 = i__ + (j - 1) * ldb; i__7 = i__ + j * b_dim1; - q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, q__1.i = alpha.r * b[i__7].i + alpha.i * b[ i__7].r; bb[i__6].r = q__1.r, 
bb[i__6].i = q__1.i; @@ -2084,20 +2094,20 @@ static integer c_n1 = -1; if (left) { cmmch_(transa, "N", &m, &n, &m, & - c_b2, &a[a_offset], nmax, + c_b2, &a[a_offset], nmax, &c__[c_offset], nmax, & - c_b1, &b[b_offset], nmax, + c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } else { cmmch_("N", transa, &m, &n, &n, & - c_b2, &c__[c_offset], - nmax, &a[a_offset], nmax, + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, &c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } @@ -2179,10 +2189,10 @@ static integer c_n1 = -1; } /* cchk3_ */ /* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * - as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, complex *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -2213,7 +2223,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; complex q__1; alist al__1; @@ -2236,16 +2246,16 @@ static integer c_n1 = -1; real rals; logical tran, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, logical *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, complex *, ftnlen, ftnlen, 
ftnlen); complex alpha; - extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, complex *, real *, complex *, - integer *, real *, real *, logical *, integer *, logical *, - ftnlen, ftnlen), cherk_(char *, char *, integer *, integer *, - real *, complex *, integer *, real *, complex *, integer *, + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *, + ftnlen, ftnlen), cherk_(char *, char *, integer *, integer *, + real *, complex *, integer *, real *, complex *, integer *, ftnlen, ftnlen); real rbeta; logical isame[13]; @@ -2254,12 +2264,12 @@ static integer c_n1 = -1; logical reset; char trans[1]; logical upper; - extern /* Subroutine */ int csyrk_(char *, char *, integer *, integer *, - complex *, complex *, integer *, complex *, complex *, integer *, + extern /* Subroutine */ int csyrk_(char *, char *, integer *, integer *, + complex *, complex *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); char uplos[1]; real ralpha; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; char transs[1], transt[1]; @@ -2402,7 +2412,7 @@ static integer c_n1 = -1; } null = n <= 0; if (conj) { - null = null || (k <= 0 || ralpha == 0.f) && + null = null || (k <= 0 || ralpha == 0.f) && rbeta == 1.f; } @@ -2481,7 +2491,7 @@ static integer c_n1 = -1; f_rew(&al__1); } cherk_(uplo, trans, &n, &k, &ralpha, &aa[1], & - lda, &rbeta, &cc[1], &ldc, (ftnlen)1, + lda, &rbeta, &cc[1], &ldc, (ftnlen)1, (ftnlen)1); } else { if (*trace) { @@ -2528,16 +2538,16 @@ static integer c_n1 = -1; /* See what data 
changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; if (conj) { isame[4] = rals == ralpha; } else { - isame[4] = als.r == alpha.r && als.i == + isame[4] = als.r == alpha.r && als.i == alpha.i; } isame[5] = lce_(&as[1], &aa[1], &laa); @@ -2545,7 +2555,7 @@ static integer c_n1 = -1; if (conj) { isame[7] = rbets == rbeta; } else { - isame[7] = bets.r == beta.r && bets.i == + isame[7] = bets.r == beta.r && bets.i == beta.i; } if (null) { @@ -2599,19 +2609,19 @@ static integer c_n1 = -1; } if (tran) { cmmch_(transt, "N", &lj, &c__1, &k, & - alpha, &a[jj * a_dim1 + 1], - nmax, &a[j * a_dim1 + 1], - nmax, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { cmmch_("N", transt, &lj, &c__1, &k, & - alpha, &a[jj + a_dim1], nmax, + alpha, &a[jj + a_dim1], nmax, &a[j + a_dim1], nmax, &beta, & c__[jj + j * c_dim1], nmax, & - ct[1], &g[1], &cc[jc], &ldc, + ct[1], &g[1], &cc[jc], &ldc, eps, &err, fatal, nout, & c_true, (ftnlen)1, (ftnlen)1); } @@ -2720,10 +2730,10 @@ static integer c_n1 = -1; } /* cchk4_ */ /* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * nbet, complex *bet, integer *nmax, complex *ab, complex *aa, complex * - as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, + as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, complex 
*ct, real *g, complex *w, ftnlen sname_len) { /* Initialized data */ @@ -2778,14 +2788,14 @@ static integer c_n1 = -1; complex bets; logical tran, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, logical *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; - extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, complex *, real *, complex *, - integer *, real *, real *, logical *, integer *, logical *, + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); real rbeta; logical isame[13]; @@ -2795,12 +2805,12 @@ static integer c_n1 = -1; char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int cher2k_(char *, char *, integer *, integer *, - complex *, complex *, integer *, complex *, integer *, real *, - complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, char *, - integer *, integer *, complex *, complex *, integer *, complex *, + extern /* Subroutine */ int cher2k_(char *, char *, integer *, integer *, + complex *, complex *, integer *, complex *, integer *, real *, + complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, char *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; char 
transs[1], transt[1]; @@ -2957,7 +2967,7 @@ static integer c_n1 = -1; } null = n <= 0; if (conj) { - null = null || (k <= 0 || alpha.r == 0.f && + null = null || (k <= 0 || alpha.r == 0.f && alpha.i == 0.f) && rbeta == 1.f; } @@ -3092,9 +3102,9 @@ static integer c_n1 = -1; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -3106,7 +3116,7 @@ static integer c_n1 = -1; if (conj) { isame[9] = rbets == rbeta; } else { - isame[9] = bets.r == beta.r && bets.i == + isame[9] = bets.r == beta.r && bets.i == beta.i; } if (null) { @@ -3162,20 +3172,20 @@ static integer c_n1 = -1; i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { i__7 = i__; - i__8 = (j - 1 << 1) * *nmax + k + + i__8 = (j - 1 << 1) * *nmax + k + i__; - q__1.r = alpha.r * ab[i__8].r - - alpha.i * ab[i__8].i, + q__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, q__1.i = alpha.r * ab[ i__8].i + alpha.i * ab[ i__8].r; - w[i__7].r = q__1.r, w[i__7].i = + w[i__7].r = q__1.r, w[i__7].i = q__1.i; if (conj) { i__7 = k + i__; r_cnjg(&q__2, &alpha); i__8 = (j - 1 << 1) * *nmax + i__; - q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, + q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[ i__8].r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; @@ -3183,7 +3193,7 @@ static integer c_n1 = -1; i__7 = k + i__; i__8 = (j - 1 << 1) * *nmax + i__; q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; } @@ -3194,9 +3204,9 @@ static integer c_n1 = -1; i__8 = *nmax << 1; cmmch_(transt, "N", &lj, &c__1, &i__6, &c_b2, &ab[jjab], &i__7, &w[ - 1], &i__8, &beta, &c__[jj + j + 1], &i__8, 
&beta, &c__[jj + j * c_dim1], nmax, &ct[1], &g[1] - , &cc[jc], &ldc, eps, &err, + , &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { @@ -3205,14 +3215,14 @@ static integer c_n1 = -1; if (conj) { i__7 = i__; r_cnjg(&q__2, &ab[(k + i__ - 1) * *nmax + j]); - q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, - q__1.i = alpha.r * q__2.i + alpha.i * + q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, + q__1.i = alpha.r * q__2.i + alpha.i * q__2.r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; i__7 = k + i__; i__8 = (i__ - 1) * *nmax + j; q__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, q__2.i = alpha.r * ab[i__8].i + alpha.i + .i, q__2.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; r_cnjg(&q__1, &q__2); w[i__7].r = q__1.r, w[i__7].i = q__1.i; @@ -3220,13 +3230,13 @@ static integer c_n1 = -1; i__7 = i__; i__8 = (k + i__ - 1) * *nmax + j; q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; i__7 = k + i__; i__8 = (i__ - 1) * *nmax + j; q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; } @@ -3236,9 +3246,9 @@ static integer c_n1 = -1; i__7 = *nmax << 1; cmmch_("N", "N", &lj, &c__1, &i__6, & c_b2, &ab[jj], nmax, &w[1], & - i__7, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } @@ -3351,7 +3361,7 @@ static integer c_n1 = -1; } /* cchk5_ */ -/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3364,34 +3374,34 @@ static integer c_n1 = -1; integer s_wsfe(cilist 
*), do_fio(integer *, char *, ftnlen), e_wsfe(void); /* Local variables */ - complex a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] + complex a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] /* was [2][1] */, beta, alpha; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, ftnlen, ftnlen), chemm_(char *, - char *, integer *, integer *, complex *, complex *, integer *, - complex *, integer *, complex *, complex *, integer *, ftnlen, - ftnlen), cherk_(char *, char *, integer *, integer *, real *, - complex *, integer *, real *, complex *, integer *, ftnlen, + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, ftnlen, ftnlen), chemm_(char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *, complex *, complex *, integer *, ftnlen, + ftnlen), cherk_(char *, char *, integer *, integer *, real *, + complex *, integer *, real *, complex *, integer *, ftnlen, ftnlen); real rbeta; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, - integer *, integer *, complex *, complex *, integer *, complex *, + extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), csymm_(char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, - integer *, complex *, complex *, integer *, ftnlen, ftnlen), - ctrsm_(char *, char *, char *, char *, integer *, integer *, - complex *, complex *, integer *, complex *, integer *, ftnlen, - ftnlen, ftnlen, ftnlen), csyrk_(char *, char *, integer *, - integer *, complex *, complex *, integer *, complex *, complex *, - integer *, ftnlen, ftnlen), cher2k_(char *, char *, integer *, - integer *, complex 
*, complex *, integer *, complex *, integer *, - real *, complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, - char *, integer *, integer *, complex *, complex *, integer *, - complex *, integer *, complex *, complex *, integer *, ftnlen, + integer *, complex *, complex *, integer *, ftnlen, ftnlen), + ctrsm_(char *, char *, char *, char *, integer *, integer *, + complex *, complex *, integer *, complex *, integer *, ftnlen, + ftnlen, ftnlen, ftnlen), csyrk_(char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, complex *, + integer *, ftnlen, ftnlen), cher2k_(char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + real *, complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real ralpha; - extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical + extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -3451,302 +3461,302 @@ static integer c_n1 = -1; } L10: infoc_1.infot = 1; - cgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - cgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - cgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, 
&infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - cgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - cgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - cgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, 
(ftnlen) 6); infoc_1.infot = 3; - cgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("T", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); 
infoc_1.infot = 4; - cgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; 
- cgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("T", "C", 
&c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("C", "T", &c__0, &c__0, 
&c__2, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, + cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, + cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, 
a, &c__1, b, &c__1, &beta, + cgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, 
b, &c__1, &beta, + cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, 
&beta, + cgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); @@ -4926,9 +4936,9 @@ static integer c_n1 = -1; } /* cchke_ */ -/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, - integer *n, complex *a, integer *nmax, complex *aa, integer *lda, - logical *reset, complex *transl, ftnlen type_len, ftnlen uplo_len, +/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, complex *a, integer *nmax, complex *aa, integer *lda, + logical *reset, complex *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -5114,10 +5124,10 @@ static integer c_n1 = -1; } /* cmake_ */ /* Subroutine */ int cmmch_(char *transa, char *transb, integer *m, integer * - n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, - integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, + n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, + integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, real *g, complex *cc, integer *ldcc, real *eps, real *err, logical * - fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen + fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen transb_len) { /* Format strings */ @@ -5131,7 +5141,7 @@ static integer c_n1 = -1; " \002,i3)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; real r__1, r__2, r__3, r__4, r__5, r__6; complex q__1, q__2, q__3, q__4; @@ -5190,9 +5200,9 @@ static integer c_n1 = -1; cc -= cc_offset; /* Function Body */ - trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; 
- tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; ctrana = *(unsigned char *)transa == 'C'; ctranb = *(unsigned char *)transb == 'C'; @@ -5220,17 +5230,17 @@ static integer c_n1 = -1; i__5 = i__; i__6 = i__ + k * a_dim1; i__7 = k + j * b_dim1; - q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ i__7].r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = i__ + k * a_dim1; i__5 = k + j * b_dim1; g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag( &a[i__ + k * a_dim1]), abs(r__2))) * ((r__3 = b[ - i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * + i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * b_dim1]), abs(r__4))); /* L20: */ } @@ -5246,15 +5256,15 @@ static integer c_n1 = -1; i__5 = i__; r_cnjg(&q__3, &a[k + i__ * a_dim1]); i__6 = k + j * b_dim1; - q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, q__2.i = q__3.r * b[i__6].i + q__3.i * b[i__6] .r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = k + i__ * a_dim1; i__5 = k + j * b_dim1; - g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( &b[k + j * b_dim1]), abs(r__4))); @@ -5274,12 +5284,12 @@ static integer c_n1 = -1; q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] .i * b[i__7].r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = 
q__1.i; i__4 = k + i__ * a_dim1; i__5 = k + j * b_dim1; - g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( &b[k + j * b_dim1]), abs(r__4))); @@ -5298,15 +5308,15 @@ static integer c_n1 = -1; i__5 = i__; i__6 = i__ + k * a_dim1; r_cnjg(&q__3, &b[j + k * b_dim1]); - q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, - q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * q__3.r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = i__ + k * a_dim1; i__5 = j + k * b_dim1; - g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( &b[j + k * b_dim1]), abs(r__4))); @@ -5326,12 +5336,12 @@ static integer c_n1 = -1; q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] .i * b[i__7].r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = i__ + k * a_dim1; i__5 = j + k * b_dim1; - g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( &b[j + k * b_dim1]), abs(r__4))); @@ -5351,17 +5361,17 @@ static integer c_n1 = -1; i__5 = i__; r_cnjg(&q__3, &a[k + i__ * a_dim1]); r_cnjg(&q__4, &b[j + k * b_dim1]); - q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, - q__2.i = q__3.r * q__4.i + q__3.i * + q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, + q__2.i = q__3.r * q__4.i + q__3.i * q__4.r; - q__1.r = ct[i__5].r + q__2.r, 
q__1.i = ct[i__5].i + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) - * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(&b[j + k * b_dim1]), abs(r__4))); /* L120: */ } @@ -5376,17 +5386,17 @@ static integer c_n1 = -1; i__5 = i__; r_cnjg(&q__3, &a[k + i__ * a_dim1]); i__6 = j + k * b_dim1; - q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, q__2.i = q__3.r * b[i__6].i + q__3.i * b[ i__6].r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) - * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(&b[j + k * b_dim1]), abs(r__4))); /* L140: */ } @@ -5403,17 +5413,17 @@ static integer c_n1 = -1; i__5 = i__; i__6 = k + i__ * a_dim1; r_cnjg(&q__3, &b[j + k * b_dim1]); - q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, - q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * q__3.r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) - * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(&b[j + k * b_dim1]), abs(r__4))); /* L160: */ } @@ -5429,16 +5439,16 @@ static integer c_n1 = -1; i__6 = k + i__ * a_dim1; i__7 = j + k * b_dim1; q__2.r = 
a[i__6].r * b[i__7].r - a[i__6].i * b[ - i__7].i, q__2.i = a[i__6].r * b[i__7].i + + i__7].i, q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[i__7].r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) - * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(&b[j + k * b_dim1]), abs(r__4))); /* L180: */ } @@ -5451,17 +5461,17 @@ static integer c_n1 = -1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; - q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = + q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; i__5 = i__ + j * c_dim1; - q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = + q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = beta->r * c__[i__5].i + beta->i * c__[i__5].r; q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; ct[i__3].r = q__1.r, ct[i__3].i = q__1.i; i__3 = i__ + j * c_dim1; - g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), + g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), abs(r__2))) * g[i__] + ((r__3 = beta->r, abs(r__3)) + ( - r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, + r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, abs(r__5)) + (r__6 = r_imag(&c__[i__ + j * c_dim1]), abs( r__6))); /* L200: */ @@ -5772,7 +5782,7 @@ real sdiff_(real *x, real *y) } /* sdiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/dblat1.c b/blastest/src/dblat1.c index 14665d844f..ccac12c88a 100644 --- 
a/blastest/src/dblat1.c +++ b/blastest/src/dblat1.c @@ -70,6 +70,11 @@ static real c_b81 = 0.f; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "dblat1"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ static doublereal sfac = 9.765625e-4; @@ -85,7 +90,7 @@ static real c_b81 = 0.f; /* Local variables */ integer ic; - extern /* Subroutine */ int check0_(doublereal *), check1_(doublereal *), + extern /* Subroutine */ int check0_(doublereal *), check1_(doublereal *), check2_(doublereal *), check3_(doublereal *), header_(void); /* Fortran I/O blocks */ @@ -124,11 +129,11 @@ static real c_b81 = 0.f; combla_1.incy = 9999; if (combla_1.icase == 3 || combla_1.icase == 11) { check0_(&sfac); - } else if (combla_1.icase == 7 || combla_1.icase == 8 || + } else if (combla_1.icase == 7 || combla_1.icase == 8 || combla_1.icase == 9 || combla_1.icase == 10) { check1_(&sfac); - } else if (combla_1.icase == 1 || combla_1.icase == 2 || - combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase + } else if (combla_1.icase == 1 || combla_1.icase == 2 || + combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase == 12 || combla_1.icase == 13) { check2_(&sfac); } else if (combla_1.icase == 4) { @@ -143,7 +148,12 @@ static real c_b81 = 0.f; } s_stop("", (ftnlen)0); - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ /* Subroutine */ int header_(void) @@ -201,17 +211,17 @@ static real c_b81 = 0.f; static doublereal dc1[8] = { .6,.8,-.6,.8,.6,1.,0.,1. 
}; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); /* Local variables */ integer i__, k; doublereal sa, sb, sc, ss, dtemp[9]; - extern /* Subroutine */ int drotg_(doublereal *, doublereal *, doublereal - *, doublereal *), stest_(integer *, doublereal *, doublereal *, - doublereal *, doublereal *), stest1_(doublereal *, doublereal *, - doublereal *, doublereal *), drotmg_(doublereal *, doublereal *, + extern /* Subroutine */ int drotg_(doublereal *, doublereal *, doublereal + *, doublereal *), stest_(integer *, doublereal *, doublereal *, + doublereal *, doublereal *), stest1_(doublereal *, doublereal *, + doublereal *, doublereal *), drotmg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); /* Fortran I/O blocks */ @@ -319,7 +329,7 @@ static real c_b81 = 0.f; doublereal d__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -328,12 +338,12 @@ static real c_b81 = 0.f; doublereal sx[8]; integer np1, len; extern doublereal dnrm2_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, integer *); extern doublereal dasum_(integer *, doublereal *, integer *); doublereal stemp[1], strue[8]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, - doublereal *, doublereal *), itest1_(integer *, integer *), + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + doublereal *, doublereal *), itest1_(integer *, integer *), stest1_(doublereal *, doublereal *, doublereal *, doublereal *); extern integer idamax_(integer *, 
doublereal *, integer *); @@ -375,11 +385,11 @@ static real c_b81 = 0.f; stest1_(&d__1, stemp, stemp, sfac); } else if (combla_1.icase == 9) { /* .. DSCAL .. */ - dscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1], + dscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1], sx, &combla_1.incx); i__1 = len; for (i__ = 1; i__ <= i__1; ++i__) { - strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 << + strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 << 3) - 49]; /* L40: */ } @@ -446,71 +456,71 @@ static real c_b81 = 0.f; -3.,-4.,5.,0.,0.,2.,-3.,0.,1.,5.,2.,0.,-4. }; static struct { doublereal e_1[448]; - } equiv_3 = {{ .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., - .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, - 0., 0., 0., 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., - 0., 0., 0., 0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, 0., - 0., 0., 0., 0., -.8, 3.8, 0., 0., 0., 0., 0., -.9, 2.8, 0., + } equiv_3 = {{ .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., + .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, + 0., 0., 0., 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., + 0., 0., 0., 0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, 0., + 0., 0., 0., 0., -.8, 3.8, 0., 0., 0., 0., 0., -.9, 2.8, 0., 0., 0., 0., 0., 3.5, -.4, 0., 0., 0., 0., 0., .6, .1, -.5, .8, 0., 0., 0., -.8, 3.8, -2.2, -1.2, 0., 0., 0., -.9, 2.8, -1.4, - -1.3, 0., 0., 0., 3.5, -.4, -2.2, 4.7, 0., 0., 0., .6, 0., - 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., - 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., - 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., - 0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, -.5, 0., 0., 0., + -1.3, 0., 0., 0., 3.5, -.4, -2.2, 4.7, 0., 0., 0., .6, 0., + 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., + 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., + 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., + 0., 0., 3.5, 0., 0., 0., 0., 0., 
0., .6, .1, -.5, 0., 0., 0., 0., 0., .1, -3., 0., 0., 0., 0., -.3, .1, -2., 0., 0., 0., 0., - 3.3, .1, -2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, - -2., .1, 1.4, .8, .6, -.3, -2.8, -1.8, .1, 1.3, .8, 0., -.3, - -1.9, 3.8, .1, -3.1, .8, 4.8, -.3, -1.5, .6, 0., 0., 0., 0., - 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., - 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., + 3.3, .1, -2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, + -2., .1, 1.4, .8, .6, -.3, -2.8, -1.8, .1, 1.3, .8, 0., -.3, + -1.9, 3.8, .1, -3.1, .8, 4.8, -.3, -1.5, .6, 0., 0., 0., 0., + 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., + 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., 0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, -.5, 0., 0., 0., 0., 4.8, .1, - -3., 0., 0., 0., 0., 3.3, .1, -2., 0., 0., 0., 0., 2.1, .1, - -2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, -1.6, .1, - -2.2, .8, 5.4, -.3, -2.8, -1.5, .1, -1.4, .8, 3.6, -.3, -1.9, + -3., 0., 0., 0., 0., 3.3, .1, -2., 0., 0., 0., 0., 2.1, .1, + -2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, -1.6, .1, + -2.2, .8, 5.4, -.3, -2.8, -1.5, .1, -1.4, .8, 3.6, -.3, -1.9, 3.7, .1, -2.2, .8, 3.6, -.3, -1.5, .6, 0., 0., 0., 0., 0., 0., - .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, - 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., -.8, 0., - 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., 0., 0., 3.5, 0., 0., - 0., 0., 0., 0., .6, .1, 0., 0., 0., 0., 0., -.8, -1., 0., 0., + .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, + 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., -.8, 0., + 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., 0., 0., 3.5, 0., 0., + 0., 0., 0., 0., .6, .1, 0., 0., 0., 0., 0., -.8, -1., 0., 0., 0., 0., 0., -.9, -.8, 0., 0., 0., 0., 0., 3.5, .8, 0., 0., 0., 0., 0., .6, .1, -.5, .8, 0., 0., 0., -.8, -1., 1.4, -1.6, 0., - 0., 0., -.9, -.8, 1.3, -1.6, 0., 0., 0., 3.5, .8, -3.1, 4.8, + 0., 0., -.9, -.8, 
1.3, -1.6, 0., 0., 0., 3.5, .8, -3.1, 4.8, 0., 0., 0. }}; static struct { doublereal e_1[448]; - } equiv_7 = {{ .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., - .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, - 0., 0., 0., 0., 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., + } equiv_7 = {{ .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., + .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, + 0., 0., 0., 0., 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., -2.6, 0., 0., 0., 0., 0., 0., .5, -.9, 0., - 0., 0., 0., 0., .7, -4.8, 0., 0., 0., 0., 0., 1.7, -.7, 0., - 0., 0., 0., 0., -2.6, 3.5, 0., 0., 0., 0., 0., .5, -.9, .3, - .7, 0., 0., 0., .7, -4.8, 3., 1.1, 0., 0., 0., 1.7, -.7, -.7, + 0., 0., 0., 0., .7, -4.8, 0., 0., 0., 0., 0., 1.7, -.7, 0., + 0., 0., 0., 0., -2.6, 3.5, 0., 0., 0., 0., 0., .5, -.9, .3, + .7, 0., 0., 0., .7, -4.8, 3., 1.1, 0., 0., 0., 1.7, -.7, -.7, 2.3, 0., 0., 0., -2.6, 3.5, -.7, -3.6, 0., 0., 0., .5, 0., 0., - 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., - 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., - 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., + 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., + 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., + 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., -2.6, 0., 0., 0., 0., 0., 0., .5, -.9, .3, 0., 0., 0., 0., - 4., -.9, -.3, 0., 0., 0., 0., -.5, -.9, 1.5, 0., 0., 0., 0., + 4., -.9, -.3, 0., 0., 0., 0., -.5, -.9, 1.5, 0., 0., 0., 0., -1.5, -.9, -1.8, 0., 0., 0., 0., .5, -.9, .3, .7, -.6, .2, .8, - 3.7, -.9, -1.2, .7, -1.5, .2, 2.2, -.3, -.9, 2.1, .7, -1.6, - .2, 2., -1.6, -.9, -2.1, .7, 2.9, .2, -3.8, .5, 0., 0., 0., - 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., - 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., - 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., + 3.7, -.9, -1.2, .7, -1.5, .2, 2.2, -.3, -.9, 2.1, .7, -1.6, + .2, 
2., -1.6, -.9, -2.1, .7, 2.9, .2, -3.8, .5, 0., 0., 0., + 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., + 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., + 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., -2.6, 0., 0., 0., 0., 0., 0., .5, -.9, 0., 0., 0., 0., 0., 4., - -6.3, 0., 0., 0., 0., 0., -.5, .3, 0., 0., 0., 0., 0., -1.5, - 3., 0., 0., 0., 0., 0., .5, -.9, .3, .7, 0., 0., 0., 3.7, - -7.2, 3., 1.7, 0., 0., 0., -.3, .9, -.7, 1.9, 0., 0., 0., - -1.6, 2.7, -.7, -3.4, 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., - .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, - 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .7, 0., + -6.3, 0., 0., 0., 0., 0., -.5, .3, 0., 0., 0., 0., 0., -1.5, + 3., 0., 0., 0., 0., 0., .5, -.9, .3, .7, 0., 0., 0., 3.7, + -7.2, 3., 1.7, 0., 0., 0., -.3, .9, -.7, 1.9, 0., 0., 0., + -1.6, 2.7, -.7, -3.4, 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., + .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, + 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., -2.6, 0., 0., - 0., 0., 0., 0., .5, -.9, .3, 0., 0., 0., 0., .7, -.9, 1.2, + 0., 0., 0., 0., .5, -.9, .3, 0., 0., 0., 0., .7, -.9, 1.2, 0., 0., 0., 0., 1.7, -.9, .5, 0., 0., 0., 0., -2.6, -.9, -1.3, - 0., 0., 0., 0., .5, -.9, .3, .7, -.6, .2, .8, .7, -.9, 1.2, + 0., 0., 0., 0., .5, -.9, .3, .7, -.6, .2, .8, .7, -.9, 1.2, .7, -1.5, .2, 1.6, 1.7, -.9, .5, .7, -1.6, .2, 2.4, -2.6, -.9, -1.3, .7, 2.9, .2, -4. 
}}; @@ -521,7 +531,7 @@ static real c_b81 = 0.f; doublereal d__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -532,7 +542,7 @@ static real c_b81 = 0.f; doublereal sx[7], sy[7]; integer kni; doublereal stx[7], sty[7]; - extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, + extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); integer kpar, lenx, leny; #define dt19x ((doublereal *)&equiv_3) @@ -547,16 +557,16 @@ static real c_b81 = 0.f; #define dt19yc ((doublereal *)&equiv_7 + 224) #define dt19yd ((doublereal *)&equiv_7 + 336) extern doublereal dsdot_(integer *, real *, integer *, real *, integer *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer ksize; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, - integer *, doublereal *, integer *), drotm_(integer *, doublereal + extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + integer *, doublereal *, integer *), drotm_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *), dswap_( integer *, doublereal *, integer *, doublereal *, integer *); doublereal ssize[7]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, - doublereal *, doublereal *), stest1_(doublereal *, doublereal *, + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + doublereal *, doublereal *), stest1_(doublereal *, doublereal *, doublereal *, doublereal *); /* Fortran I/O blocks */ @@ -616,7 +626,7 @@ static real c_b81 = 0.f; /* .. DDOT .. 
*/ d__1 = ddot_(&combla_1.n, sx, &combla_1.incx, sy, & combla_1.incy); - stest1_(&d__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1], + stest1_(&d__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1], sfac); } else if (combla_1.icase == 2) { /* .. DAXPY .. */ @@ -653,9 +663,9 @@ static real c_b81 = 0.f; for (i__ = 1; i__ <= 7; ++i__) { sx[i__ - 1] = dx1[i__ - 1]; sy[i__ - 1] = dy1[i__ - 1]; - stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 - + stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 - 36]; - sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 - + sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 - 36]; } @@ -746,7 +756,7 @@ static real c_b81 = 0.f; 1.17,1.17,1.17,1.17,1.17,1.17,1.17 }; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -755,13 +765,13 @@ static real c_b81 = 0.f; doublereal sx[7], sy[7], stx[7], sty[7]; integer lenx, leny; doublereal mwpc[11]; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer mwpn[11]; doublereal mwps[11], mwpx[5], mwpy[5]; integer ksize; doublereal copyx[5], copyy[5]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal mwptx[55] /* was [11][5] */, mwpty[55] /* was [11][5] */; @@ -1034,7 +1044,7 @@ static real c_b81 = 0.f; /* Local variables */ real sd; - extern real s_epsilon_(); + extern real s_epsilon_(real *); /* Fortran I/O blocks */ static cilist io___125 = { 0, 6, 0, fmt_99999, 0 }; @@ -1090,11 +1100,11 @@ static real c_b81 = 0.f; } /* testdsdot_ */ -/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1, +/* Subroutine */ int 
stest1_(doublereal *scomp1, doublereal *strue1, doublereal *ssize, doublereal *sfac) { doublereal scomp[1], strue[1]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, doublereal *, doublereal *); /* ************************* STEST1 ***************************** */ diff --git a/blastest/src/dblat2.c b/blastest/src/dblat2.c index 0cdc8f16f3..7982c67c50 100644 --- a/blastest/src/dblat2.c +++ b/blastest/src/dblat2.c @@ -155,10 +155,15 @@ static logical c_false = FALSE_; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "dblat2"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*16] = "DGEMV " "DGBMV " "DSYMV " "DSBMV " "DSPMV " - "DTRMV " "DTBMV " "DTPMV " "DTRSV " "DTBSV " "DTPSV " "DGER " + static char snames[6*16] = "DGEMV " "DGBMV " "DSYMV " "DSBMV " "DSPMV " + "DTRMV " "DTBMV " "DTPMV " "DTRSV " "DTBSV " "DTPSV " "DGER " "DSYR " "DSPR " "DSYR2 " "DSPR2 "; /* Format strings */ @@ -204,10 +209,10 @@ static logical c_false = FALSE_; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -227,50 +232,50 @@ static logical c_false = FALSE_; integer ninc, nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *, 
- integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, integer *, integer *, doublereal *, integer - *, doublereal *, integer *, integer *, integer *, integer *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *, - doublereal *, doublereal *, integer *, integer *, logical *, - logical *, logical *, integer *, integer *, integer *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, integer *, integer *, integer *, integer *, + extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, integer *, integer *, doublereal *, integer + *, doublereal *, integer *, integer *, integer *, integer *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *, + doublereal *, doublereal *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *, integer *, integer *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, integer *, integer *, integer *, integer *, 
integer *, doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchk4_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, doublereal *, integer *, integer *, integer - *, integer *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchk5_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, doublereal *, integer *, integer *, integer - *, integer *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchk6_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, doublereal *, integer *, integer *, integer - *, integer *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchke_(integer *, char *, integer *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchk4_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, doublereal *, integer *, integer *, integer + *, integer *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchk5_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, doublereal *, integer *, integer *, integer + *, integer *, doublereal *, 
doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchk6_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, doublereal *, integer *, integer *, integer + *, integer *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchke_(integer *, char *, integer *, ftnlen); logical fatal, trace; integer nidim; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); char snaps[32], trans[1]; integer isnum; @@ -621,7 +626,7 @@ static logical c_false = FALSE_; goto L80; } for (i__ = 1; i__ <= 16; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L70; } @@ -668,7 +673,7 @@ static logical c_false = FALSE_; } i__1 = n; for (j = 1; j <= i__1; ++j) { - yy[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - + yy[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3); /* L130: */ } @@ -748,44 +753,44 @@ static logical c_false = FALSE_; /* Test DGEMV, 01, and DGBMV, 02. 
*/ L140: dchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test DSYMV, 03, DSBMV, 04, and DSPMV, 05. */ L150: dchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test DTRMV, 06, DTBMV, 07, DTPMV, 08, */ /* DTRSV, 09, DTBSV, 10, and DTPSV, 11. */ L160: dchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen) 6); goto L200; /* Test DGER, 12. */ L170: dchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test DSYR, 13, and DSPR, 14. */ L180: dchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test DSYR2, 15, and DSPR2, 16. 
*/ L190: dchk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); L200: @@ -827,16 +832,21 @@ static logical c_false = FALSE_; /* End of DBLAT2. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ -/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *nalf, doublereal *alf, integer *nbet, doublereal *bet, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *nalf, doublereal *alf, integer *nbet, doublereal *bet, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal *aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, doublereal *ys, doublereal *yt, doublereal *g, ftnlen sname_len) { /* Initialized data */ @@ -881,21 +891,21 @@ static logical c_false = FALSE_; logical same; integer incx, incy; logical full, tran, null; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *, integer *, logical *, 
doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; logical isame[13]; extern /* Subroutine */ int dgbmv_(char *, integer *, integer *, integer * - , integer *, doublereal *, doublereal *, integer *, doublereal *, + , integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen), dgemv_( - char *, integer *, integer *, doublereal *, doublereal *, integer + char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, - ftnlen), dmvch_(char *, integer *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, + ftnlen), dmvch_(char *, integer *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, logical *, integer *, logical *, + doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -1079,9 +1089,9 @@ static logical c_false = FALSE_; transl = 0.; i__7 = abs(incy); i__8 = ml - 1; - dmake_("GE", " ", " ", &c__1, &ml, &y[1], + dmake_("GE", " ", " ", &c__1, &ml, &y[1], &c__1, &yy[1], &i__7, &c__0, & - i__8, &reset, &transl, (ftnlen)2, + i__8, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1089,7 +1099,7 @@ static logical c_false = FALSE_; /* Save every datum before calling the */ /* subroutine. 
*/ - *(unsigned char *)transs = *(unsigned + *(unsigned char *)transs = *(unsigned char *)trans; ms = m; ns = n; @@ -1149,7 +1159,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - dgemv_(trans, &m, &n, &alpha, &aa[1], + dgemv_(trans, &m, &n, &alpha, &aa[1], &lda, &xx[1], &incx, &beta, & yy[1], &incy, (ftnlen)1); } else if (banded) { @@ -1276,8 +1286,8 @@ static logical c_false = FALSE_; dmvch_(trans, &m, &n, &alpha, &a[ a_offset], nmax, &x[1], &incx, - &beta, &y[1], &incy, &yt[1], - &g[1], &yy[1], eps, &err, + &beta, &y[1], &incy, &yt[1], + &g[1], &yy[1], eps, &err, fatal, nout, &c_true, (ftnlen) 1); errmax = max(errmax,err); @@ -1381,13 +1391,13 @@ static logical c_false = FALSE_; } /* dchk1_ */ -/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *nalf, doublereal *alf, integer *nbet, doublereal *bet, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *nalf, doublereal *alf, integer *nbet, doublereal *bet, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal *aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, doublereal *ys, doublereal *yt, doublereal *g, ftnlen sname_len) { /* Initialized data */ @@ -1425,7 +1435,7 @@ static logical c_false = FALSE_; f_rew(alist *); /* Local variables */ - integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, + integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, laa, lda; extern logical lde_(doublereal *, doublereal *, 
integer *); doublereal als, bls, err, beta; @@ -1434,29 +1444,29 @@ static logical c_false = FALSE_; integer incx, incy; logical full, null; char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; logical isame[13]; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; - extern /* Subroutine */ int dsbmv_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dsbmv_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen); logical reset; integer incxs, incys; - extern /* Subroutine */ int dspmv_(char *, integer *, doublereal *, + extern /* Subroutine */ int dspmv_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen); char uplos[1]; - extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen); 
logical banded, packed; extern logical lderes_(char *, char *, integer *, integer *, doublereal *, @@ -1619,7 +1629,7 @@ static logical c_false = FALSE_; i__8 = n - 1; dmake_("GE", " ", " ", &c__1, &n, &y[1], & c__1, &yy[1], &i__7, &c__0, &i__8, & - reset, &transl, (ftnlen)2, (ftnlen)1, + reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1836,8 +1846,8 @@ static logical c_false = FALSE_; /* Check the result. */ - dmvch_("N", &n, &n, &alpha, &a[a_offset], - nmax, &x[1], &incx, &beta, &y[1], + dmvch_("N", &n, &n, &alpha, &a[a_offset], + nmax, &x[1], &incx, &beta, &y[1], &incy, &yt[1], &g[1], &yy[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); @@ -1947,12 +1957,12 @@ static logical c_false = FALSE_; } /* dchk2_ */ -/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *xt, doublereal *g, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal *aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *xt, doublereal *g, doublereal *z__, ftnlen sname_len) { /* Initialized data */ @@ -2002,36 +2012,36 @@ static logical c_false = FALSE_; integer incx; logical full, null; char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *, 
integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); char diags[1]; logical isame[13]; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; - extern /* Subroutine */ int dtbmv_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dtbmv_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen); logical reset; - extern /* Subroutine */ int dtbsv_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dtbsv_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen); integer incxs; char trans[1]; - extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, - doublereal *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), + extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, + doublereal *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, - doublereal *, integer *, ftnlen, ftnlen, ftnlen), dtpsv_(char *, - char *, char *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *, ftnlen, ftnlen, ftnlen), dtpsv_(char *, + char *, char *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen, ftnlen); char uplos[1]; - extern /* Subroutine */ int dtrsv_(char *, char *, char *, integer *, - doublereal *, integer *, doublereal *, 
integer *, ftnlen, ftnlen, + extern /* Subroutine */ int dtrsv_(char *, char *, char *, integer *, + doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen); logical banded, packed; extern logical lderes_(char *, char *, integer *, integer *, doublereal *, @@ -2160,13 +2170,13 @@ static logical c_false = FALSE_; ; for (icd = 1; icd <= 2; ++icd) { - *(unsigned char *)diag = *(unsigned char *)&ichd[icd + *(unsigned char *)diag = *(unsigned char *)&ichd[icd - 1]; /* Generate the matrix A. */ transl = 0.; - dmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], + dmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], nmax, &aa[1], &lda, &k, &k, &reset, &transl, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2213,7 +2223,7 @@ static logical c_false = FALSE_; /* Call the subroutine. */ - if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) + if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) == 0) { if (full) { if (*trace) { @@ -2266,7 +2276,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - dtbmv_(uplo, trans, diag, &n, &k, &aa[1], + dtbmv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2347,7 +2357,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - dtbsv_(uplo, trans, diag, &n, &k, &aa[1], + dtbsv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2389,11 +2399,11 @@ static logical c_false = FALSE_; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplo == *(unsigned + isame[0] = *(unsigned char *)uplo == *(unsigned char *)uplos; - isame[1] = *(unsigned char *)trans == *(unsigned + isame[1] = *(unsigned char *)trans == *(unsigned char *)transs; - isame[2] = *(unsigned char *)diag == *(unsigned + isame[2] = *(unsigned char *)diag == *(unsigned char *)diags; isame[3] = ns == n; if (full) { @@ -2464,7 +2474,7 @@ static logical c_false = FALSE_; dmvch_(trans, &n, &n, &c_b128, &a[ a_offset], nmax, &x[1], &incx, & c_b120, &z__[1], &incx, &xt[1], & - g[1], &xx[1], eps, &err, fatal, + g[1], &xx[1], eps, &err, fatal, nout, &c_true, (ftnlen)1); } else if (s_cmp(sname + 3, "SV", (ftnlen)2, ( ftnlen)2) == 0) { @@ -2473,7 +2483,7 @@ static logical c_false = FALSE_; i__4 = n; for (i__ = 1; i__ <= i__4; ++i__) { - z__[i__] = xx[(i__ - 1) * abs(incx) + + z__[i__] = xx[(i__ - 1) * abs(incx) + 1]; xx[(i__ - 1) * abs(incx) + 1] = x[i__] ; @@ -2482,7 +2492,7 @@ static logical c_false = FALSE_; dmvch_(trans, &n, &n, &c_b128, &a[ a_offset], nmax, &z__[1], &incx, & c_b120, &x[1], &incx, &xt[1], &g[ - 1], &xx[1], eps, &err, fatal, + 1], &xx[1], eps, &err, fatal, nout, &c_false, (ftnlen)1); } errmax = max(errmax,err); @@ -2584,13 +2594,13 @@ static logical c_false = FALSE_; } /* dchk3_ */ -/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, - doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal 
*aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, + doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, ftnlen sname_len) { /* Format strings */ @@ -2625,23 +2635,23 @@ static logical c_false = FALSE_; integer ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly, laa, lda; extern logical lde_(doublereal *, doublereal *, integer *); doublereal als, err; - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer ldas; logical same; integer incx, incy; logical null; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; logical isame[13]; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -2748,7 +2758,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = m - 1; dmake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (m > 
1) { x[m / 2] = 0.; @@ -2782,7 +2792,7 @@ static logical c_false = FALSE_; transl = 0.; i__5 = m - 1; i__6 = n - 1; - dmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], + dmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2913,9 +2923,9 @@ static logical c_false = FALSE_; } else { w[0] = y[n - j + 1]; } - dmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, + dmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, w, &c__1, &c_b128, &a[j * a_dim1 + 1], - &c__1, &yt[1], &g[1], &aa[(j - 1) * + &c__1, &yt[1], &g[1], &aa[(j - 1) * lda + 1], eps, &err, fatal, nout, & c_true, (ftnlen)1); errmax = max(errmax,err); @@ -2995,13 +3005,13 @@ static logical c_false = FALSE_; } /* dchk4_ */ -/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, - doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal *aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, + doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, ftnlen sname_len) { /* Initialized data */ @@ -3047,21 +3057,21 @@ static logical c_false = FALSE_; logical same; integer incx; logical full; - extern /* Subroutine */ int dspr_(char *, integer *, doublereal *, + extern /* Subroutine */ int dspr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, ftnlen); logical null; char 
uplo[1]; - extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *, + extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, ftnlen), dmake_( - char *, char *, char *, integer *, integer *, doublereal *, - integer *, doublereal *, integer *, integer *, integer *, logical + char *, char *, char *, integer *, integer *, doublereal *, + integer *, doublereal *, integer *, integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; logical isame[13]; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -3173,7 +3183,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; dmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { x[n / 2] = 0.; @@ -3342,9 +3352,9 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - dmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, + dmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, &c__1, &c_b128, &a[jj + j * a_dim1], & - c__1, &yt[1], &g[1], &aa[ja], eps, &err, + c__1, &yt[1], &g[1], &aa[ja], eps, &err, fatal, nout, &c_true, (ftnlen)1); if (full) { if (upper) { @@ -3442,13 +3452,13 @@ static logical c_false = FALSE_; } /* dchk5_ */ -/* Subroutine */ int dchk6_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk6_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, 
logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, - doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal *aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, + doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, ftnlen sname_len) { /* Initialized data */ @@ -3477,7 +3487,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, + integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6; alist al__1; @@ -3496,19 +3506,19 @@ static logical c_false = FALSE_; integer incx, incy; logical full, null; char uplo[1]; - extern /* Subroutine */ int dspr2_(char *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, - ftnlen), dsyr2_(char *, integer *, doublereal *, doublereal *, - integer *, doublereal *, integer *, doublereal *, integer *, - ftnlen), dmake_(char *, char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *, integer *, + extern /* Subroutine */ int dspr2_(char *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + ftnlen), dsyr2_(char *, integer *, doublereal *, doublereal *, + integer *, doublereal *, integer *, doublereal *, integer *, + ftnlen), dmake_(char *, char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *, integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; logical 
isame[13]; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -3622,7 +3632,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; dmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { x[n / 2] = 0.; @@ -3657,7 +3667,7 @@ static logical c_false = FALSE_; transl = 0.; i__5 = n - 1; i__6 = n - 1; - dmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], + dmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -3835,7 +3845,7 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - dmvch_("N", &lj, &c__2, &alpha, &z__[jj + + dmvch_("N", &lj, &c__2, &alpha, &z__[jj + z_dim1], nmax, w, &c__1, &c_b128, &a[ jj + j * a_dim1], &c__1, &yt[1], &g[1] , &aa[ja], eps, &err, fatal, nout, & @@ -3941,7 +3951,7 @@ static logical c_false = FALSE_; } /* dchk6_ */ -/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3955,39 +3965,39 @@ static logical c_false = FALSE_; /* Local variables */ doublereal a[1] /* was [1][1] */, x[1], y[1], beta; - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, - integer *), dspr_(char *, integer *, doublereal *, doublereal *, - 
integer *, doublereal *, ftnlen), dsyr_(char *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, - ftnlen), dspr2_(char *, integer *, doublereal *, doublereal *, + extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *), dspr_(char *, integer *, doublereal *, doublereal *, + integer *, doublereal *, ftnlen), dsyr_(char *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, + ftnlen), dspr2_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, ftnlen), dsyr2_( - char *, integer *, doublereal *, doublereal *, integer *, + char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, ftnlen); doublereal alpha; extern /* Subroutine */ int dgbmv_(char *, integer *, integer *, integer * - , integer *, doublereal *, doublereal *, integer *, doublereal *, + , integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen), dgemv_( - char *, integer *, integer *, doublereal *, doublereal *, integer + char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, - ftnlen), dsbmv_(char *, integer *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, - doublereal *, integer *, ftnlen), dtbmv_(char *, char *, char *, - integer *, integer *, doublereal *, integer *, doublereal *, + ftnlen), dsbmv_(char *, integer *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + doublereal *, integer *, ftnlen), dtbmv_(char *, char *, char *, + integer *, integer *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), dtbsv_(char *, char *, char *, - integer *, integer *, doublereal *, integer *, doublereal *, - integer *, ftnlen, 
ftnlen, ftnlen), dspmv_(char *, integer *, + integer *, integer *, doublereal *, integer *, doublereal *, + integer *, ftnlen, ftnlen, ftnlen), dspmv_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, - doublereal *, integer *, ftnlen), dtpmv_(char *, char *, char *, - integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen, - ftnlen), dtrmv_(char *, char *, char *, integer *, doublereal *, - integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), - dtpsv_(char *, char *, char *, integer *, doublereal *, - doublereal *, integer *, ftnlen, ftnlen, ftnlen), dsymv_(char *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + doublereal *, integer *, ftnlen), dtpmv_(char *, char *, char *, + integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen, + ftnlen), dtrmv_(char *, char *, char *, integer *, doublereal *, + integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), + dtpsv_(char *, char *, char *, integer *, doublereal *, + doublereal *, integer *, ftnlen, ftnlen, ftnlen), dsymv_(char *, + integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen), dtrsv_( - char *, char *, char *, integer *, doublereal *, integer *, - doublereal *, integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, + char *, char *, char *, integer *, doublereal *, integer *, + doublereal *, integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -4493,9 +4503,9 @@ static logical c_false = FALSE_; } /* dchke_ */ -/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m, +/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m, integer *n, doublereal *a, integer *nmax, doublereal *aa, integer * - lda, integer *kl, integer *ku, logical *reset, doublereal *transl, + lda, integer *kl, integer *ku, logical *reset, doublereal *transl, ftnlen 
type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4553,7 +4563,7 @@ static logical c_false = FALSE_; i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { if (gen || upper && i__ <= j || lower && i__ >= j) { - if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) + if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) { a[i__ + j * a_dim1] = dbeg_(reset) + *transl; } else { @@ -4728,9 +4738,9 @@ static logical c_false = FALSE_; } /* dmake_ */ /* Subroutine */ int dmvch_(char *trans, integer *m, integer *n, doublereal * - alpha, doublereal *a, integer *nmax, doublereal *x, integer *incx, - doublereal *beta, doublereal *y, integer *incy, doublereal *yt, - doublereal *g, doublereal *yy, doublereal *eps, doublereal *err, + alpha, doublereal *a, integer *nmax, doublereal *x, integer *incx, + doublereal *beta, doublereal *y, integer *incy, doublereal *yt, + doublereal *g, doublereal *yy, doublereal *eps, doublereal *err, logical *fatal, integer *nout, logical *mv, ftnlen trans_len) { /* Format strings */ @@ -4845,7 +4855,7 @@ static logical c_false = FALSE_; *err = 0.; i__1 = ml; for (i__ = 1; i__ <= i__1; ++i__) { - erri = (d__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(d__1)) / + erri = (d__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(d__1)) / *eps; if (g[i__] != 0.) 
{ erri /= g[i__]; @@ -5102,7 +5112,7 @@ doublereal ddiff_(doublereal *x, doublereal *y) } /* ddiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/dblat3.c b/blastest/src/dblat3.c index d7a85e29c1..b4698f56cb 100644 --- a/blastest/src/dblat3.c +++ b/blastest/src/dblat3.c @@ -135,9 +135,14 @@ static integer c__2 = 2; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "dblat3"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*6] = "DGEMM " "DSYMM " "DTRMM " "DTRSM " "DSYRK " + static char snames[6*6] = "DGEMM " "DSYMM " "DTRMM " "DTRSM " "DSYRK " "DSYR2K"; /* Format strings */ @@ -179,10 +184,10 @@ static integer c__2 = 2; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -200,38 +205,38 @@ static integer c__2 = 2; integer nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *, + integer *, integer *, 
logical *, logical *, logical *, integer *, + integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *, - doublereal *, doublereal *, integer *, integer *, logical *, - logical *, logical *, integer *, integer *, integer *, doublereal + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *, + doublereal *, doublereal *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, doublereal *, integer *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, ftnlen), - dchk4_(char *, doublereal *, doublereal *, integer *, integer *, - logical *, logical *, logical *, integer *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, ftnlen), dchk5_(char *, doublereal *, - doublereal *, integer *, integer *, logical *, logical *, logical - *, integer *, integer *, integer *, doublereal *, integer *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, doublereal 
*, integer *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, ftnlen), + dchk4_(char *, doublereal *, doublereal *, integer *, integer *, + logical *, logical *, logical *, integer *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, ftnlen), dchk5_(char *, doublereal *, + doublereal *, integer *, integer *, logical *, logical *, logical + *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, ftnlen), + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, ftnlen), dchke_(integer *, char *, integer *, ftnlen); logical fatal; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical trace; @@ -506,7 +511,7 @@ static integer c__2 = 2; goto L60; } for (i__ = 1; i__ <= 6; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L50; } @@ -554,7 +559,7 @@ static integer c__2 = 2; } i__1 = n; for (j = 1; j <= i__1; ++j) { - cc[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 
+ cc[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3); /* L110: */ } @@ -599,7 +604,7 @@ static integer c__2 = 2; } i__1 = n; for (j = 1; j <= i__1; ++j) { - cc[n - j] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - + cc[n - j] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3); /* L130: */ } @@ -672,34 +677,34 @@ static integer c__2 = 2; /* Test DGEMM, 01. */ L140: dchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test DSYMM, 02. */ L150: dchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test DTRMM, 03, DTRSM, 04. */ L160: dchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6); goto L190; /* Test DSYRK, 05. */ L170: dchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test DSYR2K, 06. 
*/ L180: dchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, ( ftnlen)6); goto L190; @@ -743,15 +748,20 @@ static integer c__2 = 2; /* End of DBLAT3. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ -/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *nbet, doublereal *bet, integer *nmax, doublereal *a, - doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, - doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *nbet, doublereal *bet, integer *nmax, doublereal *a, + doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, + doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, doublereal *g, ftnlen sname_len) { /* Initialized data */ @@ -775,7 +785,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6; alist al__1; @@ -784,22 +794,22 @@ static integer c__2 = 2; f_rew(alist *); /* Local variables */ - integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, + integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, ica, icb, laa, lbb, lda, lcc, ldb, ldc; extern logical lde_(doublereal *, doublereal *, integer *); doublereal als, bls, err, beta; integer ldas, ldbs, ldcs; logical same, 
null; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - logical *, integer *, logical *, ftnlen, ftnlen), dgemm_(char *, - char *, integer *, integer *, integer *, doublereal *, doublereal + logical *, integer *, logical *, ftnlen, ftnlen), dgemm_(char *, + char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen); logical isame[13], trana, tranb; @@ -898,7 +908,7 @@ static integer c__2 = 2; for (ica = 1; ica <= 3; ++ica) { *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] ; - trana = *(unsigned char *)transa == 'T' || *(unsigned + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; if (trana) { @@ -926,9 +936,9 @@ static integer c__2 = 2; ftnlen)1); for (icb = 1; icb <= 3; ++icb) { - *(unsigned char *)transb = *(unsigned char *)&ich[icb + *(unsigned char *)transb = *(unsigned char *)&ich[icb - 1]; - tranb = *(unsigned char *)transb == 'T' || *(unsigned + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; if (tranb) { @@ -1100,9 +1110,9 @@ static integer c__2 = 2; dmmch_(transa, transb, &m, &n, &k, &alpha, &a[a_offset], nmax, &b[b_offset], - nmax, 
&beta, &c__[c_offset], + nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, - eps, &err, fatal, nout, &c_true, + eps, &err, fatal, nout, &c_true, (ftnlen)1, (ftnlen)1); errmax = max(errmax,err); /* If got really bad answer, report and */ @@ -1183,12 +1193,12 @@ static integer c__2 = 2; } /* dchk1_ */ -/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *nbet, doublereal *bet, integer *nmax, doublereal *a, - doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, - doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *nbet, doublereal *bet, integer *nmax, doublereal *a, + doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, + doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, doublereal *g, ftnlen sname_len) { /* Initialized data */ @@ -1213,7 +1223,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -1222,7 +1232,7 @@ static integer c__2 = 2; f_rew(alist *); /* Local variables */ - integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, + integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, ldb, ldc; extern logical lde_(doublereal *, doublereal *, integer *); integer ics; @@ -1234,21 +1244,21 @@ static integer c__2 = 2; char side[1]; logical left, null; char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ 
int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical isame[13]; char sides[1]; integer nargs; logical reset; - extern /* Subroutine */ int dsymm_(char *, char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dsymm_(char *, char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen); char uplos[1]; extern logical lderes_(char *, char *, integer *, integer *, doublereal *, @@ -1391,7 +1401,7 @@ static integer c__2 = 2; /* Generate the matrix C. */ - dmake_("GE", " ", " ", &m, &n, &c__[c_offset], + dmake_("GE", " ", " ", &m, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b86, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1472,9 +1482,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)sides == *(unsigned + isame[0] = *(unsigned char *)sides == *(unsigned char *)side; - isame[1] = *(unsigned char *)uplos == *(unsigned + isame[1] = *(unsigned char *)uplos == *(unsigned char *)uplo; isame[2] = ms == m; isame[3] = ns == n; @@ -1519,14 +1529,14 @@ static integer c__2 = 2; if (left) { dmmch_("N", "N", &m, &n, &m, &alpha, &a[ - a_offset], nmax, &b[b_offset], + a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { dmmch_("N", "N", &m, &n, &n, &alpha, &b[ - b_offset], nmax, &a[a_offset], + b_offset], nmax, &a[a_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( @@ -1606,11 +1616,11 @@ static integer c__2 = 2; } /* dchk2_ */ -/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *nmax, doublereal *a, doublereal *aa, doublereal *as, - doublereal *b, doublereal *bb, doublereal *bs, doublereal *ct, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *nmax, doublereal *a, doublereal *aa, doublereal *as, + doublereal *b, doublereal *bb, doublereal *bs, doublereal *ct, doublereal *g, doublereal *c__, ftnlen sname_len) { /* Initialized data */ @@ -1637,7 +1647,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -1658,25 +1668,25 @@ static integer c__2 = 2; char side[1]; logical left, null; char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer 
*, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; char diags[1]; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical isame[13]; char sides[1]; integer nargs; logical reset; - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, - integer *, integer *, doublereal *, doublereal *, integer *, + extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), dtrsm_( char *, char *, char *, char *, integer *, integer *, doublereal * - , doublereal *, integer *, doublereal *, integer *, ftnlen, + , doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen, ftnlen); char uplos[1]; extern logical lderes_(char *, char *, integer *, integer *, doublereal *, @@ -1816,7 +1826,7 @@ static integer c__2 = 2; /* Generate the matrix B. 
*/ - dmake_("GE", " ", " ", &m, &n, &b[b_offset], + dmake_("GE", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &reset, &c_b86, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1882,7 +1892,7 @@ static integer c__2 = 2; } dtrmm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } else if (s_cmp(sname + 3, "SM", (ftnlen)2, ( ftnlen)2) == 0) { @@ -1915,7 +1925,7 @@ static integer c__2 = 2; } dtrsm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } @@ -1984,18 +1994,18 @@ static integer c__2 = 2; dmmch_(transa, "N", &m, &n, &m, & alpha, &a[a_offset], nmax, &b[b_offset], nmax, & - c_b86, &c__[c_offset], + c_b86, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { dmmch_("N", transa, &m, &n, &n, & alpha, &b[b_offset], nmax, &a[a_offset], nmax, & - c_b86, &c__[c_offset], + c_b86, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } @@ -2008,10 +2018,10 @@ static integer c__2 = 2; i__4 = n; for (j = 1; j <= i__4; ++j) { i__5 = m; - for (i__ = 1; i__ <= i__5; ++i__) + for (i__ = 1; i__ <= i__5; ++i__) { c__[i__ + j * c_dim1] = bb[i__ + (j - 1) * ldb]; - bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j * + bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j * b_dim1]; /* L60: */ } @@ -2024,16 +2034,16 @@ static integer c__2 = 2; &c__[c_offset], nmax, & c_b86, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } else { dmmch_("N", transa, &m, &n, &n, & - c_b96, &c__[c_offset], - nmax, &a[a_offset], nmax, - &c_b86, &b[b_offset], + c_b96, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b86, &b[b_offset], nmax, &ct[1], &g[1], &bb[ - 1], 
&ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_false, ( ftnlen)1, (ftnlen)1); } @@ -2114,12 +2124,12 @@ static integer c__2 = 2; } /* dchk3_ */ -/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *nbet, doublereal *bet, integer *nmax, doublereal *a, - doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, - doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *nbet, doublereal *bet, integer *nmax, doublereal *a, + doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, + doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, doublereal *g, ftnlen sname_len) { /* Initialized data */ @@ -2146,7 +2156,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -2166,13 +2176,13 @@ static integer c__2 = 2; doublereal bets; logical tran, null; char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, 
doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical isame[13]; @@ -2180,7 +2190,7 @@ static integer c__2 = 2; logical reset; char trans[1]; logical upper; - extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen); char uplos[1]; @@ -2312,7 +2322,7 @@ static integer c__2 = 2; /* Generate the matrix C. */ - dmake_("SY", uplo, " ", &n, &n, &c__[c_offset], + dmake_("SY", uplo, " ", &n, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b86, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2369,7 +2379,7 @@ static integer c__2 = 2; al__1.aunit = *ntra; f_rew(&al__1); } - dsyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda, + dsyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda, &beta, &cc[1], &ldc, (ftnlen)1, (ftnlen)1) ; @@ -2385,9 +2395,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -2440,19 +2450,19 @@ static integer c__2 = 2; } if (tran) { dmmch_("T", "N", &lj, &c__1, &k, & - alpha, &a[jj * a_dim1 + 1], - nmax, &a[j * a_dim1 + 1], - nmax, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { dmmch_("N", "T", &lj, &c__1, &k, & - alpha, &a[jj + a_dim1], nmax, + alpha, &a[jj + a_dim1], nmax, &a[j + a_dim1], nmax, &beta, & c__[jj + j * c_dim1], nmax, & - ct[1], &g[1], &cc[jc], &ldc, + ct[1], &g[1], &cc[jc], &ldc, eps, &err, fatal, nout, & c_true, (ftnlen)1, (ftnlen)1); } @@ -2544,12 +2554,12 @@ static integer c__2 = 2; } /* dchk4_ */ -/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *nbet, doublereal *bet, integer *nmax, doublereal *ab, - doublereal *aa, doublereal *as, doublereal *bb, doublereal *bs, - doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *nbet, doublereal *bet, integer *nmax, doublereal *ab, + doublereal *aa, doublereal *as, doublereal *bb, doublereal *bs, + doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, doublereal *g, doublereal *w, ftnlen sname_len) { /* Initialized data */ @@ -2597,13 +2607,13 @@ static integer c__2 = 2; doublereal bets; logical tran, null; 
char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical isame[13]; @@ -2612,8 +2622,8 @@ static integer c__2 = 2; char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int dsyr2k_(char *, char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dsyr2k_(char *, char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen); extern logical lderes_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen); @@ -2762,7 +2772,7 @@ static integer c__2 = 2; /* Generate the matrix C. */ - dmake_("SY", uplo, " ", &n, &n, &c__[c_offset], + dmake_("SY", uplo, " ", &n, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b86, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2843,9 +2853,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -2902,7 +2912,7 @@ static integer c__2 = 2; if (tran) { i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { - w[i__] = ab[(j - 1 << 1) * *nmax + w[i__] = ab[(j - 1 << 1) * *nmax + k + i__]; w[k + i__] = ab[(j - 1 << 1) * * nmax + i__]; @@ -2913,17 +2923,17 @@ static integer c__2 = 2; i__8 = *nmax << 1; dmmch_("T", "N", &lj, &c__1, &i__6, & alpha, &ab[jjab], &i__7, &w[1] - , &i__8, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + , &i__8, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { - w[i__] = ab[(k + i__ - 1) * *nmax + w[i__] = ab[(k + i__ - 1) * *nmax + j]; - w[k + i__] = ab[(i__ - 1) * *nmax + w[k + i__] = ab[(i__ - 1) * *nmax + j]; /* L60: */ } @@ -2931,9 +2941,9 @@ static integer c__2 = 2; i__7 = *nmax << 1; dmmch_("N", "N", &lj, &c__1, &i__6, & alpha, &ab[jj], nmax, &w[1], & - i__7, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } @@ -3029,7 +3039,7 @@ static integer c__2 = 2; } /* dchk5_ */ -/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3042,24 +3052,24 @@ static integer c__2 = 2; integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void); /* Local variables */ - doublereal a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] + doublereal a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] /* was 
[2][1] */, beta, alpha; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen), - dtrmm_(char *, char *, char *, char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + dtrmm_(char *, char *, char *, char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), dsymm_(char *, char *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen), - dtrsm_(char *, char *, char *, char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + dtrsm_(char *, char *, char *, char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), dsyrk_(char *, char *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - doublereal *, integer *, ftnlen, ftnlen), dsyr2k_(char *, char *, - integer *, integer *, doublereal *, doublereal *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *, - ftnlen, ftnlen), chkxer_(char *, integer *, integer *, logical *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + doublereal *, integer *, ftnlen, ftnlen), dsyr2k_(char *, char *, + integer *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *, + ftnlen, ftnlen), chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -3113,142 +3123,142 @@ static integer c__2 = 2; } L10: 
infoc_1.infot = 1; - dgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - dgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - dgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - dgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - dgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - dgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - dgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; 
- dgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - dgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - dgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - dgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - dgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - dgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - dgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - dgemm_("T", "N", 
&c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - dgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - dgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + dgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - dgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - dgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - dgemm_("T", "N", &c__0, &c__0, 
&c__2, &alpha, a, &c__2, b, &c__1, &beta, + dgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - dgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - dgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - dgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - dgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); @@ -3952,9 +3962,9 @@ static integer c__2 = 2; } /* dchke_ */ 
-/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m, +/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m, integer *n, doublereal *a, integer *nmax, doublereal *aa, integer * - lda, logical *reset, doublereal *transl, ftnlen type_len, ftnlen + lda, logical *reset, doublereal *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4097,8 +4107,8 @@ static integer c__2 = 2; } /* dmake_ */ /* Subroutine */ int dmmch_(char *transa, char *transb, integer *m, integer * - n, integer *kk, doublereal *alpha, doublereal *a, integer *lda, - doublereal *b, integer *ldb, doublereal *beta, doublereal *c__, + n, integer *kk, doublereal *alpha, doublereal *a, integer *lda, + doublereal *b, integer *ldb, doublereal *beta, doublereal *c__, integer *ldc, doublereal *ct, doublereal *g, doublereal *cc, integer * ldcc, doublereal *eps, doublereal *err, logical *fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen transb_len) @@ -4112,7 +4122,7 @@ static integer c__2 = 2; " \002,i3)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, cc_offset, i__1, i__2, i__3; doublereal d__1, d__2; @@ -4166,9 +4176,9 @@ static integer c__2 = 2; cc -= cc_offset; /* Function Body */ - trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; - tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; /* Compute expected result, one column at a time, in CT using data */ @@ -4190,7 +4200,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[i__ + k * a_dim1] * b[k + j * b_dim1]; - g[i__] += (d__1 = a[i__ + k * a_dim1], abs(d__1)) * (d__2 + g[i__] += (d__1 = a[i__ + k * 
a_dim1], abs(d__1)) * (d__2 = b[k + j * b_dim1], abs(d__2)); /* L20: */ } @@ -4202,7 +4212,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[k + i__ * a_dim1] * b[k + j * b_dim1]; - g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2 + g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2 = b[k + j * b_dim1], abs(d__2)); /* L40: */ } @@ -4214,7 +4224,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[i__ + k * a_dim1] * b[j + k * b_dim1]; - g[i__] += (d__1 = a[i__ + k * a_dim1], abs(d__1)) * (d__2 + g[i__] += (d__1 = a[i__ + k * a_dim1], abs(d__1)) * (d__2 = b[j + k * b_dim1], abs(d__2)); /* L60: */ } @@ -4226,7 +4236,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[k + i__ * a_dim1] * b[j + k * b_dim1]; - g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2 + g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2 = b[j + k * b_dim1], abs(d__2)); /* L80: */ } @@ -4520,7 +4530,7 @@ doublereal ddiff_(doublereal *x, doublereal *y) } /* ddiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/fortran/run-f2c.sh b/blastest/src/fortran/run-f2c.sh index fdad4fd34d..f0df2f5b83 100755 --- a/blastest/src/fortran/run-f2c.sh +++ b/blastest/src/fortran/run-f2c.sh @@ -50,13 +50,15 @@ recursive-sed.sh -c "s/-4.f };/-4.f }};/g" -p "s*1.c" # Convert from brain-dead f2c complex calling conventions to normal # return-based conventions. 
-recursive-sed.sh -c "s/void cdotc_(complex \*, /complex cdotc_(/g" -p "c*1.c" -recursive-sed.sh -c "s/void cdotu_(complex \*, /complex cdotu_(/g" -p "c*1.c" -recursive-sed.sh -c "s/cdotc_(&q__1, /q__1 = cdotc_(/g" -p "c*1.c" -recursive-sed.sh -c "s/cdotu_(&q__1, /q__1 = cdotu_(/g" -p "c*1.c" - -recursive-sed.sh -c "s/void zdotc_(doublecomplex \*, /doublecomplex zdotc_(/g" -p "z*1.c" -recursive-sed.sh -c "s/void zdotu_(doublecomplex \*, /doublecomplex zdotu_(/g" -p "z*1.c" -recursive-sed.sh -c "s/zdotc_(\&z__1, /z__1 = zdotc_(/g" -p "z*1.c" -recursive-sed.sh -c "s/zdotu_(\&z__1, /z__1 = zdotu_(/g" -p "z*1.c" +subst1='\n#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL\n&\n#else\n' +subst2='\n#endif\n' +recursive-sed.sh -c "s/ void cdotc_(complex \*,/${subst1}complex cdotc_(${subst2}/g" -p "c*1.c" +recursive-sed.sh -c "s/ void cdotu_(complex \*,/${subst1}complex cdotu_(${subst2}/g" -p "c*1.c" +recursive-sed.sh -c "s/\(.*\)cdotc_(&q__1,/${subst1}\1q__1 = cdotc_(${subst2}\1/g" -p "c*1.c" +recursive-sed.sh -c "s/\(.*\)cdotu_(&q__1,/${subst1}\1q__1 = cdotu_(${subst2}\1/g" -p "c*1.c" + +recursive-sed.sh -c "s/ void zdotc_(doublecomplex \*,/${subst1}doublecomplex zdotc_(${subst2}/g" -p "z*1.c" +recursive-sed.sh -c "s/ void zdotu_(doublecomplex \*,/${subst1}doublecomplex zdotu_(${subst2}/g" -p "z*1.c" +recursive-sed.sh -c "s/\(.*\)zdotc_(\&z__1,/${subst1}\1z__1 = zdotc_(${subst2}\1/g" -p "z*1.c" +recursive-sed.sh -c "s/\(.*\)zdotu_(\&z__1,/${subst1}\1z__1 = zdotu_(${subst2}\1/g" -p "z*1.c" diff --git a/blastest/src/sblat1.c b/blastest/src/sblat1.c index 6996666a55..7bde1b108f 100644 --- a/blastest/src/sblat1.c +++ b/blastest/src/sblat1.c @@ -69,6 +69,11 @@ static real c_b63 = 0.f; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "sblat1"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ static real sfac = 9.765625e-4f; @@ -123,11 +128,11 @@ static real 
c_b63 = 0.f; combla_1.incy = 9999; if (combla_1.icase == 3 || combla_1.icase == 11) { check0_(&sfac); - } else if (combla_1.icase == 7 || combla_1.icase == 8 || + } else if (combla_1.icase == 7 || combla_1.icase == 8 || combla_1.icase == 9 || combla_1.icase == 10) { check1_(&sfac); - } else if (combla_1.icase == 1 || combla_1.icase == 2 || - combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase + } else if (combla_1.icase == 1 || combla_1.icase == 2 || + combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase == 12 || combla_1.icase == 13) { check2_(&sfac); } else if (combla_1.icase == 4) { @@ -142,7 +147,12 @@ static real c_b63 = 0.f; } s_stop("", (ftnlen)0); - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ /* Subroutine */ int header_(void) @@ -202,16 +212,16 @@ static real c_b63 = 0.f; static real dc1[8] = { .6f,.8f,-.6f,.8f,.6f,1.f,0.f,1.f }; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); /* Local variables */ integer i__, k; real sa, sb, sc, ss, dtemp[9]; - extern /* Subroutine */ int srotg_(real *, real *, real *, real *), + extern /* Subroutine */ int srotg_(real *, real *, real *, real *), stest_(integer *, real *, real *, real *, real *), stest1_(real *, - real *, real *, real *), srotmg_(real *, real *, real *, real *, + real *, real *, real *), srotmg_(real *, real *, real *, real *, real *); /* Fortran I/O blocks */ @@ -322,7 +332,7 @@ static real c_b63 = 0.f; real r__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -335,8 +345,8 @@ static real c_b63 = 0.f; real stemp[1]; extern real 
sasum_(integer *, real *, integer *); real strue[8]; - extern /* Subroutine */ int stest_(integer *, real *, real *, real *, - real *), itest1_(integer *, integer *), stest1_(real *, real *, + extern /* Subroutine */ int stest_(integer *, real *, real *, real *, + real *), itest1_(integer *, integer *), stest1_(real *, real *, real *, real *); extern integer isamax_(integer *, real *, integer *); @@ -378,11 +388,11 @@ static real c_b63 = 0.f; stest1_(&r__1, stemp, stemp, sfac); } else if (combla_1.icase == 9) { /* .. SSCAL .. */ - sscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1], + sscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1], sx, &combla_1.incx); i__1 = len; for (i__ = 1; i__ <= i__1; ++i__) { - strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 << + strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 << 3) - 49]; /* L40: */ } @@ -455,87 +465,87 @@ static real c_b63 = 0.f; ; static struct { real e_1[448]; - } equiv_3 = {{ .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, - 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, 0.f, + } equiv_3 = {{ .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 3.8f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, - 2.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, -.4f, 0.f, 0.f, 0.f, - 0.f, 0.f, .6f, .1f, -.5f, .8f, 0.f, 0.f, 0.f, -.8f, 3.8f, - -2.2f, -1.2f, 0.f, 0.f, 0.f, -.9f, 2.8f, -1.4f, -1.3f, 0.f, - 0.f, 0.f, 3.5f, -.4f, -2.2f, 4.7f, 0.f, 0.f, 0.f, .6f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 
0.f, .6f, 0.f, 0.f, 0.f, 0.f, - 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, - 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, 0.f, 0.f, 0.f, + 2.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, -.4f, 0.f, 0.f, 0.f, + 0.f, 0.f, .6f, .1f, -.5f, .8f, 0.f, 0.f, 0.f, -.8f, 3.8f, + -2.2f, -1.2f, 0.f, 0.f, 0.f, -.9f, 2.8f, -1.4f, -1.3f, 0.f, + 0.f, 0.f, 3.5f, -.4f, -2.2f, 4.7f, 0.f, 0.f, 0.f, .6f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, 0.f, 0.f, 0.f, 0.f, 0.f, .1f, -3.f, 0.f, 0.f, 0.f, 0.f, -.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, 3.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, - -.5f, .8f, .9f, -.3f, -.4f, -2.f, .1f, 1.4f, .8f, .6f, -.3f, - -2.8f, -1.8f, .1f, 1.3f, .8f, 0.f, -.3f, -1.9f, 3.8f, .1f, - -3.1f, .8f, 4.8f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, - 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, + -.5f, .8f, .9f, -.3f, -.4f, -2.f, .1f, 1.4f, .8f, .6f, -.3f, + -2.8f, -1.8f, .1f, 1.3f, .8f, 0.f, -.3f, -1.9f, 3.8f, .1f, + -3.1f, .8f, 4.8f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, 0.f, 0.f, 0.f, 0.f, 4.8f, .1f, -3.f, - 0.f, 0.f, 0.f, 0.f, 3.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 3.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, 2.1f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, 
.6f, .1f, -.5f, .8f, .9f, -.3f, -.4f, -1.6f, .1f, -2.2f, .8f, 5.4f, -.3f, -2.8f, -1.5f, - .1f, -1.4f, .8f, 3.6f, -.3f, -1.9f, 3.7f, .1f, -2.2f, .8f, - 3.6f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, - 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - .6f, .1f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, -1.f, 0.f, 0.f, 0.f, + .1f, -1.4f, .8f, 3.6f, -.3f, -1.9f, 3.7f, .1f, -2.2f, .8f, + 3.6f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + .6f, .1f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, -1.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, .8f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, .8f, 0.f, 0.f, 0.f, -.8f, - -1.f, 1.4f, -1.6f, 0.f, 0.f, 0.f, -.9f, -.8f, 1.3f, -1.6f, + -1.f, 1.4f, -1.6f, 0.f, 0.f, 0.f, -.9f, -.8f, 1.3f, -1.6f, 0.f, 0.f, 0.f, 3.5f, .8f, -3.1f, 4.8f, 0.f, 0.f, 0.f }}; static struct { real e_1[448]; - } equiv_7 = {{ .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, + } equiv_7 = {{ .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, 0.f, - 0.f, 0.f, 0.f, 0.f, .7f, -4.8f, 0.f, 0.f, 0.f, 0.f, 0.f, - 1.7f, -.7f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 3.5f, 0.f, 0.f, 
+ 0.f, 0.f, 0.f, 0.f, .7f, -4.8f, 0.f, 0.f, 0.f, 0.f, 0.f, + 1.7f, -.7f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, 0.f, 0.f, 0.f, .7f, -4.8f, - 3.f, 1.1f, 0.f, 0.f, 0.f, 1.7f, -.7f, -.7f, 2.3f, 0.f, 0.f, - 0.f, -2.6f, 3.5f, -.7f, -3.6f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, - 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, - 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, + 3.f, 1.1f, 0.f, 0.f, 0.f, 1.7f, -.7f, -.7f, 2.3f, 0.f, 0.f, + 0.f, -2.6f, 3.5f, -.7f, -3.6f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, 0.f, 0.f, 0.f, 0.f, - 4.f, -.9f, -.3f, 0.f, 0.f, 0.f, 0.f, -.5f, -.9f, 1.5f, 0.f, - 0.f, 0.f, 0.f, -1.5f, -.9f, -1.8f, 0.f, 0.f, 0.f, 0.f, .5f, + 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, 0.f, 0.f, 0.f, 0.f, + 4.f, -.9f, -.3f, 0.f, 0.f, 0.f, 0.f, -.5f, -.9f, 1.5f, 0.f, + 0.f, 0.f, 0.f, -1.5f, -.9f, -1.8f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, -.6f, .2f, .8f, 3.7f, -.9f, -1.2f, .7f, -1.5f, - .2f, 2.2f, -.3f, -.9f, 2.1f, .7f, -1.6f, .2f, 2.f, -1.6f, - -.9f, -2.1f, .7f, 2.9f, .2f, -3.8f, .5f, 0.f, 0.f, 0.f, 0.f, - 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, - 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, + .2f, 2.2f, -.3f, -.9f, 2.1f, .7f, -1.6f, .2f, 2.f, -1.6f, + -.9f, -2.1f, .7f, 2.9f, .2f, -3.8f, .5f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, - 0.f, 0.f, 
0.f, .5f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 4.f, + 0.f, 0.f, 0.f, .5f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 4.f, -6.3f, 0.f, 0.f, 0.f, 0.f, 0.f, -.5f, .3f, 0.f, 0.f, 0.f, 0.f, - 0.f, -1.5f, 3.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, - .7f, 0.f, 0.f, 0.f, 3.7f, -7.2f, 3.f, 1.7f, 0.f, 0.f, 0.f, - -.3f, .9f, -.7f, 1.9f, 0.f, 0.f, 0.f, -1.6f, 2.7f, -.7f, - -3.4f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, - 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, -1.5f, 3.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, + .7f, 0.f, 0.f, 0.f, 3.7f, -7.2f, 3.f, 1.7f, 0.f, 0.f, 0.f, + -.3f, .9f, -.7f, 1.9f, 0.f, 0.f, 0.f, -1.6f, 2.7f, -.7f, + -3.4f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, 0.f, 0.f, 0.f, 0.f, .7f, -.9f, 1.2f, 0.f, 0.f, - 0.f, 0.f, 1.7f, -.9f, .5f, 0.f, 0.f, 0.f, 0.f, -2.6f, -.9f, - -1.3f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, -.6f, .2f, - .8f, .7f, -.9f, 1.2f, .7f, -1.5f, .2f, 1.6f, 1.7f, -.9f, .5f, - .7f, -1.6f, .2f, 2.4f, -2.6f, -.9f, -1.3f, .7f, 2.9f, .2f, + 0.f, 0.f, 1.7f, -.9f, .5f, 0.f, 0.f, 0.f, 0.f, -2.6f, -.9f, + -1.3f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, -.6f, .2f, + .8f, .7f, -.9f, 1.2f, .7f, -1.5f, .2f, 1.6f, 1.7f, -.9f, .5f, + .7f, -1.6f, .2f, 2.4f, -2.6f, -.9f, -1.3f, .7f, 2.9f, .2f, -4.f }}; @@ -544,7 +554,7 @@ static real c_b63 = 0.f; real r__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char 
*, ftnlen); @@ -568,13 +578,13 @@ static real c_b63 = 0.f; #define dt19yd ((real *)&equiv_7 + 336) integer ksize; real ssize[7]; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), stest_(integer *, real *, real *, real *, real *), saxpy_( integer *, real *, real *, integer *, real *, integer *), srotm_( integer *, real *, integer *, real *, integer *, real *), stest1_( real *, real *, real *, real *); - extern real sdsdot_(integer *, real *, real *, integer *, real *, integer + extern real sdsdot_(integer *, real *, real *, integer *, real *, integer *); /* Fortran I/O blocks */ @@ -627,7 +637,7 @@ static real c_b63 = 0.f; /* .. SDOT .. */ r__1 = sdot_(&combla_1.n, sx, &combla_1.incx, sy, & combla_1.incy); - stest1_(&r__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1], + stest1_(&r__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1], sfac); } else if (combla_1.icase == 2) { /* .. SAXPY .. */ @@ -664,9 +674,9 @@ static real c_b63 = 0.f; for (i__ = 1; i__ <= 7; ++i__) { sx[i__ - 1] = dx1[i__ - 1]; sy[i__ - 1] = dy1[i__ - 1]; - stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 - + stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 - 36]; - sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 - + sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 - 36]; } @@ -696,7 +706,7 @@ static real c_b63 = 0.f; /* .. SDSROT .. 
*/ r__1 = sdsdot_(&combla_1.n, &c_b39, sx, &combla_1.incx, sy, & combla_1.incy); - stest1_(&r__1, &st7b[kn + (ki << 2) - 5], &ssize3[kn - 1], + stest1_(&r__1, &st7b[kn + (ki << 2) - 5], &ssize3[kn - 1], sfac); } else { s_wsle(&io___80); @@ -759,7 +769,7 @@ static real c_b63 = 0.f; 1.17f,1.17f,1.17f,1.17f,1.17f,1.17f,1.17f,1.17f,1.17f }; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -770,12 +780,12 @@ static real c_b63 = 0.f; real mwpc[11]; integer mwpn[11]; real mwps[11]; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, integer *, real *, real *); real mwpx[5], mwpy[5]; integer ksize; real copyx[5], copyy[5]; - extern /* Subroutine */ int stest_(integer *, real *, real *, real *, + extern /* Subroutine */ int stest_(integer *, real *, real *, real *, real *); real mwptx[55] /* was [11][5] */, mwpty[55] /* was [11][5] */; integer mwpinx[11], mwpiny[11]; @@ -1032,7 +1042,7 @@ static real c_b63 = 0.f; sfac) { real scomp[1], strue[1]; - extern /* Subroutine */ int stest_(integer *, real *, real *, real *, + extern /* Subroutine */ int stest_(integer *, real *, real *, real *, real *); /* ************************* STEST1 ***************************** */ diff --git a/blastest/src/sblat2.c b/blastest/src/sblat2.c index 54d0a010af..a2ce310f65 100644 --- a/blastest/src/sblat2.c +++ b/blastest/src/sblat2.c @@ -155,10 +155,15 @@ static logical c_false = FALSE_; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "sblat2"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*16] = "SGEMV " "SGBMV " "SSYMV " "SSBMV " "SSPMV " - "STRMV " "STBMV " 
"STPMV " "STRSV " "STBSV " "STPSV " "SGER " + static char snames[6*16] = "SGEMV " "SGBMV " "SSYMV " "SSBMV " "SSPMV " + "STRMV " "STBMV " "STPMV " "STRSV " "STBSV " "STPSV " "SGER " "SSYR " "SSPR " "SSYR2 " "SSPR2 "; /* Format strings */ @@ -204,10 +209,10 @@ static logical c_false = FALSE_; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -228,40 +233,40 @@ static logical c_false = FALSE_; integer ninc, nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int schk1_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, integer *, integer *, real *, integer *, real *, + extern /* Subroutine */ int schk1_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, integer *, integer *, real *, integer *, real *, integer *, integer *, integer *, integer *, real *, real *, real * - , real *, real *, real *, real *, real *, real *, real *, real *, - ftnlen), schk2_(char *, real *, real *, integer *, integer *, - logical *, logical *, logical *, integer *, integer *, integer *, - integer *, integer *, real *, integer *, real *, integer *, - integer *, integer *, integer *, real *, real *, real *, real *, - real *, real *, real *, real *, real *, real *, real *, ftnlen), - schk3_(char *, real *, real *, integer *, integer *, logical *, - logical *, logical *, integer *, integer *, 
integer *, integer *, + , real *, real *, real *, real *, real *, real *, real *, real *, + ftnlen), schk2_(char *, real *, real *, integer *, integer *, + logical *, logical *, logical *, integer *, integer *, integer *, + integer *, integer *, real *, integer *, real *, integer *, + integer *, integer *, integer *, real *, real *, real *, real *, + real *, real *, real *, real *, real *, real *, real *, ftnlen), + schk3_(char *, real *, real *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, real *, real *, real * , real *, real *, real *, real *, real *, real *, ftnlen), schk4_( char *, real *, real *, integer *, integer *, logical *, logical * - , logical *, integer *, integer *, integer *, real *, integer *, - integer *, integer *, integer *, real *, real *, real *, real *, - real *, real *, real *, real *, real *, real *, real *, real *, - ftnlen), schk5_(char *, real *, real *, integer *, integer *, - logical *, logical *, logical *, integer *, integer *, integer *, + , logical *, integer *, integer *, integer *, real *, integer *, + integer *, integer *, integer *, real *, real *, real *, real *, + real *, real *, real *, real *, real *, real *, real *, real *, + ftnlen), schk5_(char *, real *, real *, integer *, integer *, + logical *, logical *, logical *, integer *, integer *, integer *, real *, integer *, integer *, integer *, integer *, real *, real * - , real *, real *, real *, real *, real *, real *, real *, real *, + , real *, real *, real *, real *, real *, real *, real *, real *, real *, real *, ftnlen), schk6_(char *, real *, real *, integer *, integer *, logical *, logical *, logical *, integer *, integer *, - integer *, real *, integer *, integer *, integer *, integer *, - real *, real *, real *, real *, real *, real *, real *, real *, + integer *, real *, integer *, integer *, integer *, integer *, + real *, real *, real *, real *, real *, real *, 
real *, real *, real *, real *, real *, real *, ftnlen); logical fatal; extern /* Subroutine */ int schke_(integer *, char *, integer *, ftnlen); logical trace; integer nidim; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen); char snaps[32], trans[1]; integer isnum; @@ -610,7 +615,7 @@ static logical c_false = FALSE_; goto L80; } for (i__ = 1; i__ <= 16; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L70; } @@ -737,44 +742,44 @@ static logical c_false = FALSE_; /* Test SGEMV, 01, and SGBMV, 02. */ L140: schk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test SSYMV, 03, SSBMV, 04, and SSPMV, 05. */ L150: schk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test STRMV, 06, STBMV, 07, STPMV, 08, */ /* STRSV, 09, STBSV, 10, and STPSV, 11. 
*/ L160: schk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen) 6); goto L200; /* Test SGER, 12. */ L170: schk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test SSYR, 13, and SSPR, 14. */ L180: schk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test SSYR2, 15, and SSPR2, 16. */ L190: schk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); L200: @@ -816,15 +821,20 @@ static logical c_false = FALSE_; /* End of SBLAT2. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ /* Subroutine */ int schk1_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * nalf, real *alf, integer *nbet, real *bet, integer *ninc, integer * - inc, integer *nmax, integer *incmax, real *a, real *aa, real *as, - real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, + inc, integer *nmax, integer *incmax, real *a, real *aa, real *as, + real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, real *g, ftnlen sname_len) { /* Initialized data */ @@ -872,24 +882,24 @@ static logical c_false = FALSE_; logical full, tran, null; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; extern /* Subroutine */ int sgbmv_(char *, integer *, integer *, integer * , integer *, real *, real *, integer *, real *, integer *, real *, - real *, integer *, ftnlen), smvch_(char *, integer *, integer *, - real *, real *, integer *, real *, integer *, real *, real *, - integer *, real *, real *, real *, real *, real *, logical *, + real *, integer *, ftnlen), smvch_(char *, integer *, integer *, + real *, real *, integer *, real *, integer *, real *, real *, + integer *, real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen), sgemv_(char *, integer *, integer * - , real *, real *, integer *, real *, integer *, real *, real *, + , real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen); logical reset; integer incxs, incys; char trans[1]; logical banded; real errmax; - 
extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; char transs[1]; @@ -1066,9 +1076,9 @@ static logical c_false = FALSE_; transl = 0.f; i__7 = abs(incy); i__8 = ml - 1; - smake_("GE", " ", " ", &c__1, &ml, &y[1], + smake_("GE", " ", " ", &c__1, &ml, &y[1], &c__1, &yy[1], &i__7, &c__0, & - i__8, &reset, &transl, (ftnlen)2, + i__8, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1076,7 +1086,7 @@ static logical c_false = FALSE_; /* Save every datum before calling the */ /* subroutine. */ - *(unsigned char *)transs = *(unsigned + *(unsigned char *)transs = *(unsigned char *)trans; ms = m; ns = n; @@ -1134,7 +1144,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - sgemv_(trans, &m, &n, &alpha, &aa[1], + sgemv_(trans, &m, &n, &alpha, &aa[1], &lda, &xx[1], &incx, &beta, & yy[1], &incy, (ftnlen)1); } else if (banded) { @@ -1259,8 +1269,8 @@ static logical c_false = FALSE_; smvch_(trans, &m, &n, &alpha, &a[ a_offset], nmax, &x[1], &incx, - &beta, &y[1], &incy, &yt[1], - &g[1], &yy[1], eps, &err, + &beta, &y[1], &incy, &yt[1], + &g[1], &yy[1], eps, &err, fatal, nout, &c_true, (ftnlen) 1); errmax = max(errmax,err); @@ -1365,11 +1375,11 @@ static logical c_false = FALSE_; } /* schk1_ */ /* Subroutine */ int schk2_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * nalf, real *alf, integer *nbet, real *bet, integer *ninc, integer * - inc, integer *nmax, integer *incmax, real *a, real *aa, real *as, - real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, + inc, integer *nmax, integer *incmax, real *a, real *aa, real *as, + real *x, real *xx, real *xs, real *y, real *yy, real *ys, 
real *yt, real *g, ftnlen sname_len) { /* Initialized data */ @@ -1407,7 +1417,7 @@ static logical c_false = FALSE_; f_rew(alist *); /* Local variables */ - integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, + integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, laa, lda; real als, bls; extern logical lse_(real *, real *, integer *); @@ -1419,27 +1429,27 @@ static logical c_false = FALSE_; char uplo[1]; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen); logical reset; integer incxs, incys; - extern /* Subroutine */ int ssbmv_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, + extern /* Subroutine */ int ssbmv_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, ftnlen); char uplos[1]; - extern /* Subroutine */ int sspmv_(char *, integer *, real *, real *, + extern /* Subroutine */ int sspmv_(char *, integer *, real *, real *, real *, integer *, real *, real *, integer *, ftnlen), ssymv_( - char *, integer *, real *, real *, integer *, real *, integer *, + char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen); logical banded, packed; real 
errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; @@ -1599,7 +1609,7 @@ static logical c_false = FALSE_; i__8 = n - 1; smake_("GE", " ", " ", &c__1, &n, &y[1], & c__1, &yy[1], &i__7, &c__0, &i__8, & - reset, &transl, (ftnlen)2, (ftnlen)1, + reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1816,8 +1826,8 @@ static logical c_false = FALSE_; /* Check the result. */ - smvch_("N", &n, &n, &alpha, &a[a_offset], - nmax, &x[1], &incx, &beta, &y[1], + smvch_("N", &n, &n, &alpha, &a[a_offset], + nmax, &x[1], &incx, &beta, &y[1], &incy, &yt[1], &g[1], &yy[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); @@ -1928,10 +1938,10 @@ static logical c_false = FALSE_; } /* schk2_ */ /* Subroutine */ int schk3_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa, - real *as, real *x, real *xx, real *xs, real *xt, real *g, real *z__, + real *as, real *x, real *xx, real *xs, real *xt, real *g, real *z__, ftnlen sname_len) { /* Initialized data */ @@ -1971,7 +1981,7 @@ static logical c_false = FALSE_; integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *); /* Local variables */ - integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict, + integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict, icu; extern logical lse_(real *, real *, integer *); real err; @@ -1982,32 +1992,32 @@ static logical c_false = FALSE_; logical full, null; char uplo[1], diags[1]; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine 
*/ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen); logical reset; integer incxs; char trans[1]; - extern /* Subroutine */ int stbmv_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, - ftnlen), stbsv_(char *, char *, char *, integer *, integer *, + extern /* Subroutine */ int stbmv_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, + ftnlen), stbsv_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen); char uplos[1]; - extern /* Subroutine */ int stpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ int stpmv_(char *, char *, char *, integer *, real *, real *, integer *, ftnlen, ftnlen, ftnlen), strmv_(char *, - char *, char *, integer *, real *, integer *, real *, integer *, + char *, char *, integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen), stpsv_(char *, char *, char *, integer *, real *, real *, integer *, ftnlen, ftnlen, ftnlen), strsv_(char * , char *, char *, integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen); logical banded, packed; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; char transs[1]; @@ -2133,13 +2143,13 @@ static logical 
c_false = FALSE_; ; for (icd = 1; icd <= 2; ++icd) { - *(unsigned char *)diag = *(unsigned char *)&ichd[icd + *(unsigned char *)diag = *(unsigned char *)&ichd[icd - 1]; /* Generate the matrix A. */ transl = 0.f; - smake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], + smake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], nmax, &aa[1], &lda, &k, &k, &reset, &transl, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2186,7 +2196,7 @@ static logical c_false = FALSE_; /* Call the subroutine. */ - if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) + if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) == 0) { if (full) { if (*trace) { @@ -2239,7 +2249,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - stbmv_(uplo, trans, diag, &n, &k, &aa[1], + stbmv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2320,7 +2330,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - stbsv_(uplo, trans, diag, &n, &k, &aa[1], + stbsv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2362,11 +2372,11 @@ static logical c_false = FALSE_; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplo == *(unsigned + isame[0] = *(unsigned char *)uplo == *(unsigned char *)uplos; - isame[1] = *(unsigned char *)trans == *(unsigned + isame[1] = *(unsigned char *)trans == *(unsigned char *)transs; - isame[2] = *(unsigned char *)diag == *(unsigned + isame[2] = *(unsigned char *)diag == *(unsigned char *)diags; isame[3] = ns == n; if (full) { @@ -2437,7 +2447,7 @@ static logical c_false = FALSE_; smvch_(trans, &n, &n, &c_b128, &a[ a_offset], nmax, &x[1], &incx, & c_b120, &z__[1], &incx, &xt[1], & - g[1], &xx[1], eps, &err, fatal, + g[1], &xx[1], eps, &err, fatal, nout, &c_true, (ftnlen)1); } else if (s_cmp(sname + 3, "SV", (ftnlen)2, ( ftnlen)2) == 0) { @@ -2446,7 +2456,7 @@ static logical c_false = FALSE_; i__4 = n; for (i__ = 1; i__ <= i__4; ++i__) { - z__[i__] = xx[(i__ - 1) * abs(incx) + + z__[i__] = xx[(i__ - 1) * abs(incx) + 1]; xx[(i__ - 1) * abs(incx) + 1] = x[i__] ; @@ -2455,7 +2465,7 @@ static logical c_false = FALSE_; smvch_(trans, &n, &n, &c_b128, &a[ a_offset], nmax, &z__[1], &incx, & c_b120, &x[1], &incx, &xt[1], &g[ - 1], &xx[1], eps, &err, fatal, + 1], &xx[1], eps, &err, fatal, nout, &c_false, (ftnlen)1); } errmax = max(errmax,err); @@ -2558,10 +2568,10 @@ static logical c_false = FALSE_; } /* schk3_ */ /* Subroutine */ int schk4_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa, - real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, + real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, real *g, real *z__, ftnlen sname_len) { /* Format strings */ @@ -2599,24 +2609,24 @@ static logical c_false = FALSE_; real err; integer ldas; logical same; - extern /* Subroutine */ int sger_(integer *, integer *, real *, 
real *, + extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer incx, incy; logical null; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen); logical reset; integer incxs, incys; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; @@ -2718,7 +2728,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = m - 1; smake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (m > 1) { x[m / 2] = 0.f; @@ -2752,7 +2762,7 @@ static logical c_false = FALSE_; transl = 0.f; i__5 = m - 1; i__6 = n - 1; - smake_(sname + 1, " ", " ", &m, &n, &a[a_offset], + smake_(sname + 1, " ", " ", &m, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2883,9 +2893,9 @@ static logical c_false = FALSE_; } else { w[0] = y[n - j + 1]; } - smvch_("N", &m, &c__1, &alpha, &z__[1], nmax, + smvch_("N", &m, &c__1, &alpha, &z__[1], nmax, w, &c__1, 
&c_b128, &a[j * a_dim1 + 1], - &c__1, &yt[1], &g[1], &aa[(j - 1) * + &c__1, &yt[1], &g[1], &aa[(j - 1) * lda + 1], eps, &err, fatal, nout, & c_true, (ftnlen)1); errmax = max(errmax,err); @@ -2966,10 +2976,10 @@ static logical c_false = FALSE_; } /* schk4_ */ /* Subroutine */ int schk5_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa, - real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, + real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, real *g, real *z__, ftnlen sname_len) { /* Initialized data */ @@ -3017,18 +3027,18 @@ static logical c_false = FALSE_; integer incx; logical full, null; char uplo[1]; - extern /* Subroutine */ int sspr_(char *, integer *, real *, real *, - integer *, real *, ftnlen), ssyr_(char *, integer *, real *, real + extern /* Subroutine */ int sspr_(char *, integer *, real *, real *, + integer *, real *, ftnlen), ssyr_(char *, integer *, real *, real *, integer *, real *, integer *, ftnlen); real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real 
*, real *, logical *, integer *, logical *, ftnlen); logical reset; integer incxs; @@ -3036,7 +3046,7 @@ static logical c_false = FALSE_; char uplos[1]; logical packed; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; @@ -3140,7 +3150,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; smake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { x[n / 2] = 0.f; @@ -3309,9 +3319,9 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - smvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, + smvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, &c__1, &c_b128, &a[jj + j * a_dim1], & - c__1, &yt[1], &g[1], &aa[ja], eps, &err, + c__1, &yt[1], &g[1], &aa[ja], eps, &err, fatal, nout, &c_true, (ftnlen)1); if (full) { if (upper) { @@ -3410,10 +3420,10 @@ static logical c_false = FALSE_; } /* schk5_ */ /* Subroutine */ int schk6_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa, - real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, + real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, real *g, real *z__, ftnlen sname_len) { /* Initialized data */ @@ -3442,7 +3452,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, + integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6; alist al__1; @@ -3462,19 +3472,19 @@ static logical 
c_false = FALSE_; integer incx, incy; logical full, null; char uplo[1]; - extern /* Subroutine */ int sspr2_(char *, integer *, real *, real *, - integer *, real *, integer *, real *, ftnlen), ssyr2_(char *, - integer *, real *, real *, integer *, real *, integer *, real *, + extern /* Subroutine */ int sspr2_(char *, integer *, real *, real *, + integer *, real *, integer *, real *, ftnlen), ssyr2_(char *, + integer *, real *, real *, integer *, real *, integer *, real *, integer *, ftnlen); real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen); logical reset; integer incxs, incys; @@ -3482,7 +3492,7 @@ static logical c_false = FALSE_; char uplos[1]; logical packed; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; @@ -3588,7 +3598,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; smake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { x[n / 2] = 0.f; @@ -3623,7 +3633,7 @@ static logical c_false = FALSE_; transl = 
0.f; i__5 = n - 1; i__6 = n - 1; - smake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], + smake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -3801,7 +3811,7 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - smvch_("N", &lj, &c__2, &alpha, &z__[jj + + smvch_("N", &lj, &c__2, &alpha, &z__[jj + z_dim1], nmax, w, &c__1, &c_b128, &a[ jj + j * a_dim1], &c__1, &yt[1], &g[1] , &aa[ja], eps, &err, fatal, nout, & @@ -3907,7 +3917,7 @@ static logical c_false = FALSE_; } /* schk6_ */ -/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3921,35 +3931,35 @@ static logical c_false = FALSE_; /* Local variables */ real a[1] /* was [1][1] */, x[1], y[1], beta; - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, - integer *, real *, integer *, real *, integer *), sspr_(char *, - integer *, real *, real *, integer *, real *, ftnlen), ssyr_(char - *, integer *, real *, real *, integer *, real *, integer *, - ftnlen), sspr2_(char *, integer *, real *, real *, integer *, - real *, integer *, real *, ftnlen), ssyr2_(char *, integer *, - real *, real *, integer *, real *, integer *, real *, integer *, + extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + integer *, real *, integer *, real *, integer *), sspr_(char *, + integer *, real *, real *, integer *, real *, ftnlen), ssyr_(char + *, integer *, real *, real *, integer *, real *, integer *, + ftnlen), sspr2_(char *, integer *, real *, real *, integer *, + real *, integer *, real *, ftnlen), ssyr2_(char *, integer *, + real *, real *, integer *, real *, integer *, real *, integer *, ftnlen); real alpha; extern /* Subroutine */ int sgbmv_(char *, integer *, integer *, integer * , integer *, real *, real *, integer *, real *, integer *, real *, - real *, 
integer *, ftnlen), sgemv_(char *, integer *, integer *, - real *, real *, integer *, real *, integer *, real *, real *, - integer *, ftnlen), ssbmv_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - ftnlen), stbmv_(char *, char *, char *, integer *, integer *, - real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen), - stbsv_(char *, char *, char *, integer *, integer *, real *, + real *, integer *, ftnlen), sgemv_(char *, integer *, integer *, + real *, real *, integer *, real *, integer *, real *, real *, + integer *, ftnlen), ssbmv_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + ftnlen), stbmv_(char *, char *, char *, integer *, integer *, + real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen), + stbsv_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen), sspmv_( - char *, integer *, real *, real *, real *, integer *, real *, - real *, integer *, ftnlen), stpmv_(char *, char *, char *, - integer *, real *, real *, integer *, ftnlen, ftnlen, ftnlen), - strmv_(char *, char *, char *, integer *, real *, integer *, real - *, integer *, ftnlen, ftnlen, ftnlen), stpsv_(char *, char *, - char *, integer *, real *, real *, integer *, ftnlen, ftnlen, - ftnlen), ssymv_(char *, integer *, real *, real *, integer *, + char *, integer *, real *, real *, real *, integer *, real *, + real *, integer *, ftnlen), stpmv_(char *, char *, char *, + integer *, real *, real *, integer *, ftnlen, ftnlen, ftnlen), + strmv_(char *, char *, char *, integer *, real *, integer *, real + *, integer *, ftnlen, ftnlen, ftnlen), stpsv_(char *, char *, + char *, integer *, real *, real *, integer *, ftnlen, ftnlen, + ftnlen), ssymv_(char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen), strsv_( - char *, char *, char *, integer *, real *, integer *, real *, - 
integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, integer *, + char *, char *, char *, integer *, real *, integer *, real *, + integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -4455,9 +4465,9 @@ static logical c_false = FALSE_; } /* schke_ */ -/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m, +/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m, integer *n, real *a, integer *nmax, real *aa, integer *lda, integer * - kl, integer *ku, logical *reset, real *transl, ftnlen type_len, + kl, integer *ku, logical *reset, real *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4516,7 +4526,7 @@ static logical c_false = FALSE_; i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { if (gen || upper && i__ <= j || lower && i__ >= j) { - if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) + if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) { a[i__ + j * a_dim1] = sbeg_(reset) + *transl; } else { @@ -4690,9 +4700,9 @@ static logical c_false = FALSE_; } /* smake_ */ -/* Subroutine */ int smvch_(char *trans, integer *m, integer *n, real *alpha, - real *a, integer *nmax, real *x, integer *incx, real *beta, real *y, - integer *incy, real *yt, real *g, real *yy, real *eps, real *err, +/* Subroutine */ int smvch_(char *trans, integer *m, integer *n, real *alpha, + real *a, integer *nmax, real *x, integer *incx, real *beta, real *y, + integer *incy, real *yt, real *g, real *yy, real *eps, real *err, logical *fatal, integer *nout, logical *mv, ftnlen trans_len) { /* Format strings */ @@ -4807,7 +4817,7 @@ static logical c_false = FALSE_; *err = 0.f; i__1 = ml; for (i__ = 1; i__ <= i__1; ++i__) { - erri = (r__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(r__1)) / + erri = (r__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(r__1)) / *eps; if (g[i__] != 0.f) { erri /= g[i__]; @@ -4903,7 
+4913,7 @@ logical lse_(real *ri, real *rj, integer *lr) } /* lse_ */ -logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa, +logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa, real *as, integer *lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ @@ -5064,7 +5074,7 @@ real sdiff_(real *x, real *y) } /* sdiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/sblat3.c b/blastest/src/sblat3.c index dc5ef5738b..01d4ca4b8b 100644 --- a/blastest/src/sblat3.c +++ b/blastest/src/sblat3.c @@ -135,9 +135,14 @@ static integer c__2 = 2; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "sblat3"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*6] = "SGEMM " "SSYMM " "STRMM " "STRSM " "SSYRK " + static char snames[6*6] = "SGEMM " "SSYMM " "STRMM " "STRSM " "SSYRK " "SSYR2K"; /* Format strings */ @@ -179,10 +184,10 @@ static integer c__2 = 2; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -200,33 +205,33 @@ static integer c__2 = 2; integer nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ 
int schk1_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, real *, integer *, real *, integer *, real *, real *, - real *, real *, real *, real *, real *, real *, real *, real *, - real *, ftnlen), schk2_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, real *, integer *, real *, integer *, real *, real *, - real *, real *, real *, real *, real *, real *, real *, real *, - real *, ftnlen), schk3_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, real *, integer *, real *, real *, real *, real *, - real *, real *, real *, real *, real *, ftnlen), schk4_(char *, - real *, real *, integer *, integer *, logical *, logical *, - logical *, integer *, integer *, integer *, real *, integer *, + extern /* Subroutine */ int schk1_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, real *, integer *, real *, integer *, real *, real *, + real *, real *, real *, real *, real *, real *, real *, real *, + real *, ftnlen), schk2_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, real *, integer *, real *, integer *, real *, real *, + real *, real *, real *, real *, real *, real *, real *, real *, + real *, ftnlen), schk3_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, real *, integer *, real *, real *, real *, real *, + real *, real *, real *, real *, real *, ftnlen), schk4_(char *, + real *, real *, integer *, integer *, logical *, logical *, + logical *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *, - real *, real *, real *, real *, real *, ftnlen), schk5_(char *, - real *, real *, integer *, integer *, logical *, 
logical *, - logical *, integer *, integer *, integer *, real *, integer *, + real *, real *, real *, real *, real *, ftnlen), schk5_(char *, + real *, real *, integer *, integer *, logical *, logical *, + logical *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *, real *, real *, real *, real *, real *, ftnlen); logical fatal; extern /* Subroutine */ int schke_(integer *, char *, integer *, ftnlen); logical trace; integer nidim; - extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, - integer *, real *, real *, integer *, real *, integer *, real *, - real *, integer *, real *, real *, real *, integer *, real *, + extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *, real *, real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); char snaps[32]; integer isnum; @@ -496,7 +501,7 @@ static integer c__2 = 2; goto L60; } for (i__ = 1; i__ <= 6; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L50; } @@ -662,34 +667,34 @@ static integer c__2 = 2; /* Test SGEMM, 01. */ L140: schk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test SSYMM, 02. 
*/ L150: schk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test STRMM, 03, STRSM, 04. */ L160: schk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6); goto L190; /* Test SSYRK, 05. */ L170: schk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test SSYR2K, 06. */ L180: schk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, ( ftnlen)6); goto L190; @@ -733,14 +738,19 @@ static integer c__2 = 2; /* End of SBLAT3. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ /* Subroutine */ int schk1_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * - nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, - real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, + nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, + real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -764,7 +774,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6; alist al__1; @@ -773,7 +783,7 @@ static integer c__2 = 2; f_rew(alist *); /* Local variables */ - integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, + integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, ica, icb, laa, lbb, lda, lcc, ldb, ldc; real als, bls; extern logical lse_(real *, real *, integer *); @@ -782,22 +792,22 @@ static integer c__2 = 2; logical same, null; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real * , ftnlen, ftnlen, ftnlen); logical trana, tranb; - extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, - integer *, real *, real *, integer *, real *, integer *, real *, - real *, integer *, real *, real *, real *, integer *, real *, + extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *, real *, 
real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen), sgemm_( - char *, char *, integer *, integer *, integer *, real *, real *, - integer *, real *, integer *, real *, real *, integer *, ftnlen, + char *, char *, integer *, integer *, integer *, real *, real *, + integer *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen); integer nargs; logical reset; char tranas[1], tranbs[1], transa[1], transb[1]; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -888,7 +898,7 @@ static integer c__2 = 2; for (ica = 1; ica <= 3; ++ica) { *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] ; - trana = *(unsigned char *)transa == 'T' || *(unsigned + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; if (trana) { @@ -916,9 +926,9 @@ static integer c__2 = 2; ftnlen)1); for (icb = 1; icb <= 3; ++icb) { - *(unsigned char *)transb = *(unsigned char *)&ich[icb + *(unsigned char *)transb = *(unsigned char *)&ich[icb - 1]; - tranb = *(unsigned char *)transb == 'T' || *(unsigned + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; if (tranb) { @@ -1090,9 +1100,9 @@ static integer c__2 = 2; smmch_(transa, transb, &m, &n, &k, &alpha, &a[a_offset], nmax, &b[b_offset], - nmax, &beta, &c__[c_offset], + nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, - eps, &err, fatal, nout, &c_true, + eps, &err, fatal, nout, &c_true, (ftnlen)1, (ftnlen)1); errmax = max(errmax,err); /* If got really bad answer, report and */ @@ -1174,10 +1184,10 @@ static integer c__2 = 2; } /* schk1_ */ /* Subroutine */ int schk2_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, 
integer *idim, integer *nalf, real *alf, integer * - nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, - real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, + nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, + real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -1202,7 +1212,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -1211,7 +1221,7 @@ static integer c__2 = 2; f_rew(alist *); /* Local variables */ - integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, + integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, ldb, ldc, ics; real als, bls; integer icu; @@ -1224,22 +1234,22 @@ static integer c__2 = 2; char uplo[1]; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real * , ftnlen, ftnlen, ftnlen); char sides[1]; - extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, - integer *, real *, real *, integer *, real *, integer *, real *, - real *, integer *, real *, real *, real *, integer *, real *, + extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *, real *, real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); integer nargs; logical reset; char uplos[1]; - extern /* Subroutine */ int ssymm_(char *, char *, integer *, integer *, - real *, real *, integer *, real *, integer *, real *, real *, + extern /* Subroutine */ int ssymm_(char *, char *, integer *, 
integer *, + real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen); real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -1378,7 +1388,7 @@ static integer c__2 = 2; /* Generate the matrix C. */ - smake_("GE", " ", " ", &m, &n, &c__[c_offset], + smake_("GE", " ", " ", &m, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b84, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1459,9 +1469,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)sides == *(unsigned + isame[0] = *(unsigned char *)sides == *(unsigned char *)side; - isame[1] = *(unsigned char *)uplos == *(unsigned + isame[1] = *(unsigned char *)uplos == *(unsigned char *)uplo; isame[2] = ms == m; isame[3] = ns == n; @@ -1506,14 +1516,14 @@ static integer c__2 = 2; if (left) { smmch_("N", "N", &m, &n, &m, &alpha, &a[ - a_offset], nmax, &b[b_offset], + a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { smmch_("N", "N", &m, &n, &n, &alpha, &b[ - b_offset], nmax, &a[a_offset], + b_offset], nmax, &a[a_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( @@ -1594,7 +1604,7 @@ static integer c__2 = 2; } /* schk2_ */ /* Subroutine */ int schk3_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * nmax, real *a, real *aa, real *as, real *b, real *bb, real *bs, real * ct, real *g, real *c__, ftnlen sname_len) @@ -1623,7 +1633,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer 
a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -1647,25 +1657,25 @@ static integer c__2 = 2; real alpha; char diags[1]; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real * , ftnlen, ftnlen, ftnlen); char sides[1]; - extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, - integer *, real *, real *, integer *, real *, integer *, real *, - real *, integer *, real *, real *, real *, integer *, real *, + extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *, real *, real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); integer nargs; logical reset; char uplos[1]; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ int strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * - , ftnlen, ftnlen, ftnlen, ftnlen), strsm_(char *, char *, char *, - char *, integer *, integer *, real *, real *, integer *, real *, + , ftnlen, ftnlen, ftnlen, ftnlen), strsm_(char *, char *, char *, + char *, integer *, integer *, real *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen, ftnlen); char tranas[1], transa[1]; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -1800,7 +1810,7 @@ static integer c__2 = 2; /* Generate the matrix B. 
*/ - smake_("GE", " ", " ", &m, &n, &b[b_offset], + smake_("GE", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &reset, &c_b84, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1866,7 +1876,7 @@ static integer c__2 = 2; } strmm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } else if (s_cmp(sname + 3, "SM", (ftnlen)2, ( ftnlen)2) == 0) { @@ -1899,7 +1909,7 @@ static integer c__2 = 2; } strsm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } @@ -1968,18 +1978,18 @@ static integer c__2 = 2; smmch_(transa, "N", &m, &n, &m, & alpha, &a[a_offset], nmax, &b[b_offset], nmax, & - c_b84, &c__[c_offset], + c_b84, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { smmch_("N", transa, &m, &n, &n, & alpha, &b[b_offset], nmax, &a[a_offset], nmax, & - c_b84, &c__[c_offset], + c_b84, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } @@ -1992,10 +2002,10 @@ static integer c__2 = 2; i__4 = n; for (j = 1; j <= i__4; ++j) { i__5 = m; - for (i__ = 1; i__ <= i__5; ++i__) + for (i__ = 1; i__ <= i__5; ++i__) { c__[i__ + j * c_dim1] = bb[i__ + (j - 1) * ldb]; - bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j * + bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j * b_dim1]; /* L60: */ } @@ -2008,16 +2018,16 @@ static integer c__2 = 2; &c__[c_offset], nmax, & c_b84, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } else { smmch_("N", transa, &m, &n, &n, & - c_b94, &c__[c_offset], - nmax, &a[a_offset], nmax, - &c_b84, &b[b_offset], + c_b94, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b84, &b[b_offset], nmax, &ct[1], &g[1], &bb[ - 1], 
&ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_false, ( ftnlen)1, (ftnlen)1); } @@ -2099,10 +2109,10 @@ static integer c__2 = 2; } /* schk3_ */ /* Subroutine */ int schk4_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * - nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, - real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, + nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, + real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -2129,7 +2139,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -2151,22 +2161,22 @@ static integer c__2 = 2; char uplo[1]; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real * - , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *, + , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer * - , real *, real *, integer *, real *, real *, real *, integer *, + , real *, real *, integer *, real *, real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); integer nargs; logical reset; char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int ssyrk_(char *, char *, integer *, integer *, - real *, real *, integer *, real *, real *, integer *, ftnlen, + extern /* Subroutine */ int ssyrk_(char *, char 
*, integer *, integer *, + real *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen); real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); char transs[1]; @@ -2293,7 +2303,7 @@ static integer c__2 = 2; /* Generate the matrix C. */ - smake_("SY", uplo, " ", &n, &n, &c__[c_offset], + smake_("SY", uplo, " ", &n, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b84, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2350,7 +2360,7 @@ static integer c__2 = 2; al__1.aunit = *ntra; f_rew(&al__1); } - ssyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda, + ssyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda, &beta, &cc[1], &ldc, (ftnlen)1, (ftnlen)1) ; @@ -2366,9 +2376,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -2421,19 +2431,19 @@ static integer c__2 = 2; } if (tran) { smmch_("T", "N", &lj, &c__1, &k, & - alpha, &a[jj * a_dim1 + 1], - nmax, &a[j * a_dim1 + 1], - nmax, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { smmch_("N", "T", &lj, &c__1, &k, & - alpha, &a[jj + a_dim1], nmax, + alpha, &a[jj + a_dim1], nmax, &a[j + a_dim1], nmax, &beta, & c__[jj + j * c_dim1], nmax, & - ct[1], &g[1], &cc[jc], &ldc, + ct[1], &g[1], &cc[jc], &ldc, eps, &err, fatal, nout, & c_true, (ftnlen)1, (ftnlen)1); } @@ -2526,7 +2536,7 @@ static integer c__2 = 2; } /* schk4_ */ /* Subroutine */ int schk5_(char *sname, real 
*eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * nbet, real *bet, integer *nmax, real *ab, real *aa, real *as, real * bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, real * @@ -2579,22 +2589,22 @@ static integer c__2 = 2; char uplo[1]; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real * - , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *, + , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer * - , real *, real *, integer *, real *, real *, real *, integer *, + , real *, real *, integer *, real *, real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); integer nargs; logical reset; char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int ssyr2k_(char *, char *, integer *, integer *, - real *, real *, integer *, real *, integer *, real *, real *, + extern /* Subroutine */ int ssyr2k_(char *, char *, integer *, integer *, + real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen); real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); char transs[1]; @@ -2740,7 +2750,7 @@ static integer c__2 = 2; /* Generate the matrix C. 
*/ - smake_("SY", uplo, " ", &n, &n, &c__[c_offset], + smake_("SY", uplo, " ", &n, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b84, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2821,9 +2831,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -2880,7 +2890,7 @@ static integer c__2 = 2; if (tran) { i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { - w[i__] = ab[(j - 1 << 1) * *nmax + w[i__] = ab[(j - 1 << 1) * *nmax + k + i__]; w[k + i__] = ab[(j - 1 << 1) * * nmax + i__]; @@ -2891,17 +2901,17 @@ static integer c__2 = 2; i__8 = *nmax << 1; smmch_("T", "N", &lj, &c__1, &i__6, & alpha, &ab[jjab], &i__7, &w[1] - , &i__8, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + , &i__8, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { - w[i__] = ab[(k + i__ - 1) * *nmax + w[i__] = ab[(k + i__ - 1) * *nmax + j]; - w[k + i__] = ab[(i__ - 1) * *nmax + w[k + i__] = ab[(i__ - 1) * *nmax + j]; /* L60: */ } @@ -2909,9 +2919,9 @@ static integer c__2 = 2; i__7 = *nmax << 1; smmch_("N", "N", &lj, &c__1, &i__6, & alpha, &ab[jj], nmax, &w[1], & - i__7, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } @@ -3007,7 +3017,7 @@ static integer c__2 = 2; } /* schk5_ */ -/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3020,22 +3030,22 
@@ static integer c__2 = 2; integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void); /* Local variables */ - real a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] /* + real a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] /* was [2][1] */, beta, alpha; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, - integer *, real *, real *, integer *, real *, integer *, real *, + extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen), strmm_(char *, char *, char *, - char *, integer *, integer *, real *, real *, integer *, real *, + char *, integer *, integer *, real *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), ssymm_(char *, char *, - integer *, integer *, real *, real *, integer *, real *, integer - *, real *, real *, integer *, ftnlen, ftnlen), strsm_(char *, - char *, char *, char *, integer *, integer *, real *, real *, - integer *, real *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), - ssyrk_(char *, char *, integer *, integer *, real *, real *, + integer *, integer *, real *, real *, integer *, real *, integer + *, real *, real *, integer *, ftnlen, ftnlen), strsm_(char *, + char *, char *, char *, integer *, integer *, real *, real *, + integer *, real *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), + ssyrk_(char *, char *, integer *, integer *, real *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen), ssyr2k_( - char *, char *, integer *, integer *, real *, real *, integer *, - real *, integer *, real *, real *, integer *, ftnlen, ftnlen), - chkxer_(char *, integer *, integer *, logical *, logical *, + char *, char *, integer *, integer *, real *, real *, integer *, + real *, integer *, real *, real *, integer *, ftnlen, ftnlen), + chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -3089,142 +3099,142 @@ static 
integer c__2 = 2; } L10: infoc_1.infot = 1; - sgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - sgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - sgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - sgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - sgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - sgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - sgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, 
(ftnlen) 6); infoc_1.infot = 3; - sgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - sgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - sgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - sgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - sgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - sgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - sgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); 
infoc_1.infot = 5; - sgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - sgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - sgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - sgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 
10; - sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, + sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - sgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - sgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - sgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - sgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); @@ -3928,9 +3938,9 @@ 
static integer c__2 = 2; } /* schke_ */ -/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m, +/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m, integer *n, real *a, integer *nmax, real *aa, integer *lda, logical * - reset, real *transl, ftnlen type_len, ftnlen uplo_len, ftnlen + reset, real *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4075,7 +4085,7 @@ static integer c__2 = 2; /* Subroutine */ int smmch_(char *transa, char *transb, integer *m, integer * n, integer *kk, real *alpha, real *a, integer *lda, real *b, integer * ldb, real *beta, real *c__, integer *ldc, real *ct, real *g, real *cc, - integer *ldcc, real *eps, real *err, logical *fatal, integer *nout, + integer *ldcc, real *eps, real *err, logical *fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen transb_len) { /* Format strings */ @@ -4087,7 +4097,7 @@ static integer c__2 = 2; " \002,i3)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, cc_offset, i__1, i__2, i__3; real r__1, r__2; @@ -4141,9 +4151,9 @@ static integer c__2 = 2; cc -= cc_offset; /* Function Body */ - trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; - tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; /* Compute expected result, one column at a time, in CT using data */ @@ -4165,7 +4175,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[i__ + k * a_dim1] * b[k + j * b_dim1]; - g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2 + g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2 = b[k + j * b_dim1], abs(r__2)); /* L20: */ } @@ -4177,7 +4187,7 @@ static 
integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[k + i__ * a_dim1] * b[k + j * b_dim1]; - g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2 + g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2 = b[k + j * b_dim1], abs(r__2)); /* L40: */ } @@ -4189,7 +4199,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[i__ + k * a_dim1] * b[j + k * b_dim1]; - g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2 + g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2 = b[j + k * b_dim1], abs(r__2)); /* L60: */ } @@ -4201,7 +4211,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[k + i__ * a_dim1] * b[j + k * b_dim1]; - g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2 + g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2 = b[j + k * b_dim1], abs(r__2)); /* L80: */ } @@ -4328,7 +4338,7 @@ logical lse_(real *ri, real *rj, integer *lr) } /* lse_ */ -logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa, +logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa, real *as, integer *lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ @@ -4495,7 +4505,7 @@ real sdiff_(real *x, real *y) } /* sdiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/zblat1.c b/blastest/src/zblat1.c index c34a572620..93a24f4c31 100644 --- a/blastest/src/zblat1.c +++ b/blastest/src/zblat1.c @@ -68,6 +68,11 @@ static doublereal c_b52 = 0.; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "zblat1"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ static doublereal sfac = 9.765625e-4; 
@@ -84,7 +89,7 @@ static doublereal c_b52 = 0.; /* Local variables */ integer ic; - extern /* Subroutine */ int check1_(doublereal *), check2_(doublereal *), + extern /* Subroutine */ int check1_(doublereal *), check2_(doublereal *), header_(void); /* Fortran I/O blocks */ @@ -136,7 +141,12 @@ static doublereal c_b52 = 0.; } s_stop("", (ftnlen)0); - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ /* Subroutine */ int header_(void) @@ -222,7 +232,7 @@ static doublereal c_b52 = 0.; doublecomplex z__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -230,14 +240,14 @@ static doublereal c_b52 = 0.; integer i__; doublecomplex cx[8]; integer np1, len; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, - doublecomplex *, integer *), ctest_(integer *, doublecomplex *, + extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + doublecomplex *, integer *), ctest_(integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublereal *); doublecomplex mwpcs[5], mwpct[5]; extern /* Subroutine */ int itest1_(integer *, integer *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int stest1_(doublereal *, doublereal *, - doublereal *, doublereal *), zdscal_(integer *, doublereal *, + extern /* Subroutine */ int stest1_(doublereal *, doublereal *, + doublereal *, doublereal *), zdscal_(integer *, doublereal *, doublecomplex *, integer *); extern integer izamax_(integer *, doublecomplex *, integer *); extern doublereal dzasum_(integer *, doublecomplex *, integer *); @@ -433,7 +443,7 @@ static doublereal c_b52 = 0.; 0.,0.},{0.,0.},{0.,0.},{0.,0.},{.7,-.8},{-.9,.5},{-.4,-.7},{0.,0.} ,{0.,0.},{0.,0.},{0.,0.},{.7,-.8},{-.9,.5},{-.4,-.7},{.1,-.5},{ 
-.1,-.9},{-.5,-.3},{.2,-.8} }; - static doublecomplex csize1[4] = { {0.,0.},{.9,.9},{1.63,1.73},{2.9,2.78} + static doublecomplex csize1[4] = { {0.,0.},{.9,.9},{1.63,1.73},{2.9,2.78} }; static doublecomplex csize3[14] = { {0.,0.},{0.,0.},{0.,0.},{0.,0.},{0., 0.},{0.,0.},{0.,0.},{1.17,1.17},{1.17,1.17},{1.17,1.17},{1.17, @@ -447,7 +457,7 @@ static doublereal c_b52 = 0.; doublecomplex z__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -457,17 +467,29 @@ static doublereal c_b52 = 0.; integer mx, my; doublecomplex cdot[1]; integer lenx, leny; - extern /* Subroutine */ int ctest_(integer *, doublecomplex *, + extern /* Subroutine */ int ctest_(integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublereal *); - extern /* Double Complex */ doublecomplex zdotc_(integer *, + extern /* Double Complex */ +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + void zdotc_(doublecomplex *, +#else +doublecomplex zdotc_( +#endif + integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ksize; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Double Complex */ doublecomplex zdotu_(integer *, + extern /* Double Complex */ +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + void zdotu_(doublecomplex *, +#else +doublecomplex zdotu_( +#endif + integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, - doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, + extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); /* 
Fortran I/O blocks */ @@ -508,14 +530,26 @@ static doublereal c_b52 = 0.; } if (combla_1.icase == 1) { /* .. ZDOTC .. */ - z__1 = zdotc_(&combla_1.n, cx, &combla_1.incx, cy, & + +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + zdotc_(&z__1, +#else + z__1 = zdotc_( +#endif + &combla_1.n, cx, &combla_1.incx, cy, & combla_1.incy); cdot[0].r = z__1.r, cdot[0].i = z__1.i; ctest_(&c__1, cdot, &ct6[kn + (ki << 2) - 5], &csize1[kn - 1], sfac); } else if (combla_1.icase == 2) { /* .. ZDOTU .. */ - z__1 = zdotu_(&combla_1.n, cx, &combla_1.incx, cy, & + +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + zdotu_(&z__1, +#else + z__1 = zdotu_( +#endif + &combla_1.n, cx, &combla_1.incx, cy, & combla_1.incy); cdot[0].r = z__1.r, cdot[0].i = z__1.i; ctest_(&c__1, cdot, &ct7[kn + (ki << 2) - 5], &csize1[kn - 1], @@ -645,11 +679,11 @@ static doublereal c_b52 = 0.; } /* stest_ */ -/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1, +/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1, doublereal *ssize, doublereal *sfac) { doublereal scomp[1], strue[1]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, doublereal *, doublereal *); /* ************************* STEST1 ***************************** */ @@ -691,7 +725,7 @@ doublereal sdiff_(doublereal *sa, doublereal *sb) return ret_val; } /* sdiff_ */ -/* Subroutine */ int ctest_(integer *len, doublecomplex *ccomp, doublecomplex +/* Subroutine */ int ctest_(integer *len, doublecomplex *ccomp, doublecomplex *ctrue, doublecomplex *csize, doublereal *sfac) { /* System generated locals */ @@ -703,7 +737,7 @@ doublereal sdiff_(doublereal *sa, doublereal *sb) /* Local variables */ integer i__; doublereal scomp[20], ssize[20], strue[20]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, doublereal *, doublereal *); /* 
**************************** CTEST ***************************** */ diff --git a/blastest/src/zblat2.c b/blastest/src/zblat2.c index 030f03b833..5550b413f6 100644 --- a/blastest/src/zblat2.c +++ b/blastest/src/zblat2.c @@ -157,10 +157,15 @@ static logical c_false = FALSE_; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "zblat2"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*17] = "ZGEMV " "ZGBMV " "ZHEMV " "ZHBMV " "ZHPMV " - "ZTRMV " "ZTBMV " "ZTPMV " "ZTRSV " "ZTBSV " "ZTPSV " "ZGERC " + static char snames[6*17] = "ZGEMV " "ZGBMV " "ZHEMV " "ZHBMV " "ZHPMV " + "ZTRMV " "ZTBMV " "ZTPMV " "ZTRSV " "ZTBSV " "ZTPSV " "ZGERC " "ZGERU " "ZHER " "ZHPR " "ZHER2 " "ZHPR2 "; /* Format strings */ @@ -208,10 +213,10 @@ static logical c_false = FALSE_; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -234,53 +239,53 @@ static logical c_false = FALSE_; integer ninc, nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, integer *, integer *, doublecomplex *, - integer *, doublecomplex *, integer *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, doublecomplex *, + extern /* Subroutine */ int 
zchk1_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, integer *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, integer *, integer *, doublecomplex *, integer *, - doublecomplex *, integer *, integer *, integer *, integer *, + ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, integer *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * - , doublecomplex *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - ftnlen), zchk3_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, integer *, integer *, integer *, integer *, integer *, + , doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, + ftnlen), zchk3_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, integer *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - doublecomplex *, ftnlen), zchk4_(char *, doublereal *, - doublereal *, integer *, integer *, logical *, logical *, logical - *, integer *, integer *, integer *, doublecomplex *, integer *, + 
doublecomplex *, ftnlen), zchk4_(char *, doublereal *, + doublereal *, integer *, integer *, logical *, logical *, logical + *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex - *, doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex + *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, ftnlen), zchk5_( - char *, doublereal *, doublereal *, integer *, integer *, logical - *, logical *, logical *, integer *, integer *, integer *, - doublecomplex *, integer *, integer *, integer *, integer *, + char *, doublereal *, doublereal *, integer *, integer *, logical + *, logical *, logical *, integer *, integer *, integer *, + doublecomplex *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * - , doublecomplex *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - doublecomplex *, ftnlen), zchk6_(char *, doublereal *, doublereal - *, integer *, integer *, logical *, logical *, logical *, integer - *, integer *, integer *, doublecomplex *, integer *, integer *, - integer *, integer *, doublecomplex *, doublecomplex *, + , doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, + doublecomplex *, ftnlen), zchk6_(char *, doublereal *, doublereal + *, integer *, integer *, logical *, logical *, logical *, integer + *, integer *, integer *, doublecomplex *, integer *, integer *, + integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * - , doublecomplex *, doublecomplex *, doublecomplex *, + , doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, 
doublecomplex *, ftnlen); logical fatal, trace; integer nidim; extern /* Subroutine */ int zchke_(integer *, char *, integer *, ftnlen); char snaps[32], trans[1]; - extern /* Subroutine */ int zmvch_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, doublereal *, + extern /* Subroutine */ int zmvch_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer isnum; logical ltest[17], sfatal; @@ -630,7 +635,7 @@ static logical c_false = FALSE_; goto L80; } for (i__ = 1; i__ <= 17; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L70; } @@ -689,7 +694,7 @@ static logical c_false = FALSE_; /* YY holds the exact result. On exit from ZMVCH YT holds */ /* the result computed by ZMVCH. */ *(unsigned char *)trans = 'N'; - zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g, + zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g, yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1); same = lze_(yy, yt, &n); if (! same || err != 0.) { @@ -702,7 +707,7 @@ static logical c_false = FALSE_; s_stop("", (ftnlen)0); } *(unsigned char *)trans = 'T'; - zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g, + zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g, yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1); same = lze_(yy, yt, &n); if (! same || err != 0.) { @@ -763,44 +768,44 @@ static logical c_false = FALSE_; /* Test ZGEMV, 01, and ZGBMV, 02. 
*/ L140: zchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05. */ L150: zchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08, */ /* ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11. */ L160: zchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen) 6); goto L200; /* Test ZGERC, 12, ZGERU, 13. */ L170: zchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test ZHER, 14, and ZHPR, 15. */ L180: zchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test ZHER2, 16, and ZHPR2, 17. 
*/ L190: zchk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); L200: @@ -842,16 +847,21 @@ static logical c_false = FALSE_; /* End of ZBLAT2. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ -/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal * g, ftnlen sname_len) { @@ -881,7 +891,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9; alist al__1; @@ -904,7 +914,7 @@ static logical c_false = FALSE_; logical full, tran, null; doublecomplex alpha; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + 
extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); @@ -912,22 +922,22 @@ static logical c_false = FALSE_; logical reset; integer incxs, incys; extern /* Subroutine */ int zgbmv_(char *, integer *, integer *, integer * - , integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, + , integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen); char trans[1]; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), - zmvch_(char *, integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - doublereal *, doublecomplex *, doublereal *, doublereal *, + extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), + zmvch_(char *, integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); logical banded; doublereal errmax; doublecomplex transl; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); char transs[1]; @@ -1108,9 +1118,9 @@ static logical c_false = FALSE_; transl.r = 0., transl.i = 0.; i__7 = abs(incy); i__8 = ml - 1; - zmake_("GE", " ", " ", &c__1, &ml, 
&y[1], + zmake_("GE", " ", " ", &c__1, &ml, &y[1], &c__1, &yy[1], &i__7, &c__0, & - i__8, &reset, &transl, (ftnlen)2, + i__8, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1118,7 +1128,7 @@ static logical c_false = FALSE_; /* Save every datum before calling the */ /* subroutine. */ - *(unsigned char *)transs = *(unsigned + *(unsigned char *)transs = *(unsigned char *)trans; ms = m; ns = n; @@ -1129,7 +1139,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - as[i__8].r = aa[i__9].r, as[i__8].i = + as[i__8].r = aa[i__9].r, as[i__8].i = aa[i__9].i; /* L10: */ } @@ -1138,7 +1148,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - xs[i__8].r = xx[i__9].r, xs[i__8].i = + xs[i__8].r = xx[i__9].r, xs[i__8].i = xx[i__9].i; /* L20: */ } @@ -1148,7 +1158,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - ys[i__8].r = yy[i__9].r, ys[i__8].i = + ys[i__8].r = yy[i__9].r, ys[i__8].i = yy[i__9].i; /* L30: */ } @@ -1187,7 +1197,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - zgemv_(trans, &m, &n, &alpha, &aa[1], + zgemv_(trans, &m, &n, &alpha, &aa[1], &lda, &xx[1], &incx, &beta, & yy[1], &incy, (ftnlen)1); } else if (banded) { @@ -1248,7 +1258,7 @@ static logical c_false = FALSE_; isame[1] = ms == m; isame[2] = ns == n; if (full) { - isame[3] = als.r == alpha.r && als.i + isame[3] = als.r == alpha.r && als.i == alpha.i; isame[4] = lze_(&as[1], &aa[1], &laa); isame[5] = ldas == lda; @@ -1270,13 +1280,13 @@ static logical c_false = FALSE_; } else if (banded) { isame[3] = kls == kl; isame[4] = kus == ku; - isame[5] = als.r == alpha.r && als.i + isame[5] = als.r == alpha.r && als.i == alpha.i; isame[6] = lze_(&as[1], &aa[1], &laa); isame[7] = ldas == lda; isame[8] = lze_(&xs[1], &xx[1], &lx); isame[9] = incxs == incx; - isame[10] = bls.r == beta.r && bls.i + isame[10] = bls.r == beta.r && bls.i == 
beta.i; if (null) { isame[11] = lze_(&ys[1], &yy[1], & @@ -1318,8 +1328,8 @@ static logical c_false = FALSE_; zmvch_(trans, &m, &n, &alpha, &a[ a_offset], nmax, &x[1], &incx, - &beta, &y[1], &incy, &yt[1], - &g[1], &yy[1], eps, &err, + &beta, &y[1], &incy, &yt[1], + &g[1], &yy[1], eps, &err, fatal, nout, &c_true, (ftnlen) 1); errmax = max(errmax,err); @@ -1423,13 +1433,13 @@ static logical c_false = FALSE_; } /* zchk1_ */ -/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal * g, ftnlen sname_len) { @@ -1463,7 +1473,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9; alist al__1; @@ -1472,7 +1482,7 @@ static logical c_false = FALSE_; f_rew(alist *); /* Local variables */ - integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, + integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, laa, lda; doublecomplex als, bls; doublereal err; @@ -1485,31 +1495,31 
@@ static logical c_false = FALSE_; char uplo[1]; doublecomplex alpha; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; logical reset; integer incxs, incys; - extern /* Subroutine */ int zhbmv_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), - zmvch_(char *, integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - doublereal *, doublecomplex *, doublereal *, doublereal *, + extern /* Subroutine */ int zhbmv_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), + zmvch_(char *, integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen), zhemv_(char *, integer * - , doublecomplex *, doublecomplex *, integer *, doublecomplex *, + , doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen); char uplos[1]; - extern /* Subroutine */ int zhpmv_(char *, integer *, doublecomplex *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, + extern /* Subroutine */ int zhpmv_(char *, integer *, doublecomplex *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen); logical banded, packed; doublereal errmax; doublecomplex transl; - extern logical lzeres_(char *, char *, 
integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -1672,7 +1682,7 @@ static logical c_false = FALSE_; i__8 = n - 1; zmake_("GE", " ", " ", &c__1, &n, &y[1], & c__1, &yy[1], &i__7, &c__0, &i__8, & - reset, &transl, (ftnlen)2, (ftnlen)1, + reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1824,13 +1834,13 @@ static logical c_false = FALSE_; unsigned char *)uplos; isame[1] = ns == n; if (full) { - isame[2] = als.r == alpha.r && als.i == + isame[2] = als.r == alpha.r && als.i == alpha.i; isame[3] = lze_(&as[1], &aa[1], &laa); isame[4] = ldas == lda; isame[5] = lze_(&xs[1], &xx[1], &lx); isame[6] = incxs == incx; - isame[7] = bls.r == beta.r && bls.i == + isame[7] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[8] = lze_(&ys[1], &yy[1], &ly); @@ -1843,13 +1853,13 @@ static logical c_false = FALSE_; isame[9] = incys == incy; } else if (banded) { isame[2] = ks == k; - isame[3] = als.r == alpha.r && als.i == + isame[3] = als.r == alpha.r && als.i == alpha.i; isame[4] = lze_(&as[1], &aa[1], &laa); isame[5] = ldas == lda; isame[6] = lze_(&xs[1], &xx[1], &lx); isame[7] = incxs == incx; - isame[8] = bls.r == beta.r && bls.i == + isame[8] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[9] = lze_(&ys[1], &yy[1], &ly); @@ -1861,12 +1871,12 @@ static logical c_false = FALSE_; } isame[10] = incys == incy; } else if (packed) { - isame[2] = als.r == alpha.r && als.i == + isame[2] = als.r == alpha.r && als.i == alpha.i; isame[3] = lze_(&as[1], &aa[1], &laa); isame[4] = lze_(&xs[1], &xx[1], &lx); isame[5] = incxs == incx; - isame[6] = bls.r == beta.r && bls.i == + isame[6] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[7] = lze_(&ys[1], &yy[1], &ly); @@ -1904,8 +1914,8 @@ static logical c_false = FALSE_; /* Check the result. 
*/ - zmvch_("N", &n, &n, &alpha, &a[a_offset], - nmax, &x[1], &incx, &beta, &y[1], + zmvch_("N", &n, &n, &alpha, &a[a_offset], + nmax, &x[1], &incx, &beta, &y[1], &incy, &yt[1], &g[1], &yy[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); @@ -2015,12 +2025,12 @@ static logical c_false = FALSE_; } /* zchk2_ */ -/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *xt, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *xt, doublereal *g, doublecomplex *z__, ftnlen sname_len) { /* Initialized data */ @@ -2060,7 +2070,7 @@ static logical c_false = FALSE_; integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *); /* Local variables */ - integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict, + integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict, icu; doublereal err; extern logical lze_(doublecomplex *, doublecomplex *, integer *); @@ -2071,7 +2081,7 @@ static logical c_false = FALSE_; logical full, null; char uplo[1], diags[1]; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); @@ -2079,28 +2089,28 @@ static logical c_false = FALSE_; logical reset; integer 
incxs; char trans[1]; - extern /* Subroutine */ int zmvch_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, doublereal *, + extern /* Subroutine */ int zmvch_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); char uplos[1]; - extern /* Subroutine */ int ztbmv_(char *, char *, char *, integer *, + extern /* Subroutine */ int ztbmv_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen), ztbsv_(char *, char *, char *, integer * - , integer *, doublecomplex *, integer *, doublecomplex *, integer - *, ftnlen, ftnlen, ftnlen), ztpmv_(char *, char *, char *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, - ftnlen, ftnlen), ztrmv_(char *, char *, char *, integer *, - doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, - ftnlen, ftnlen), ztpsv_(char *, char *, char *, integer *, - doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen, + , integer *, doublecomplex *, integer *, doublecomplex *, integer + *, ftnlen, ftnlen, ftnlen), ztpmv_(char *, char *, char *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, + ftnlen, ftnlen), ztrmv_(char *, char *, char *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, + ftnlen, ftnlen), ztpsv_(char *, char *, char *, integer *, + doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen), ztrsv_(char *, char *, char *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen); logical banded, packed; doublereal errmax; doublecomplex transl; - extern logical lzeres_(char *, 
char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); char transs[1]; @@ -2226,13 +2236,13 @@ static logical c_false = FALSE_; ; for (icd = 1; icd <= 2; ++icd) { - *(unsigned char *)diag = *(unsigned char *)&ichd[icd + *(unsigned char *)diag = *(unsigned char *)&ichd[icd - 1]; /* Generate the matrix A. */ transl.r = 0., transl.i = 0.; - zmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], + zmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], nmax, &aa[1], &lda, &k, &k, &reset, &transl, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2287,7 +2297,7 @@ static logical c_false = FALSE_; /* Call the subroutine. */ - if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) + if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) == 0) { if (full) { if (*trace) { @@ -2340,7 +2350,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - ztbmv_(uplo, trans, diag, &n, &k, &aa[1], + ztbmv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2421,7 +2431,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - ztbsv_(uplo, trans, diag, &n, &k, &aa[1], + ztbsv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2463,11 +2473,11 @@ static logical c_false = FALSE_; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplo == *(unsigned + isame[0] = *(unsigned char *)uplo == *(unsigned char *)uplos; - isame[1] = *(unsigned char *)trans == *(unsigned + isame[1] = *(unsigned char *)trans == *(unsigned char *)transs; - isame[2] = *(unsigned char *)diag == *(unsigned + isame[2] = *(unsigned char *)diag == *(unsigned char *)diags; isame[3] = ns == n; if (full) { @@ -2537,7 +2547,7 @@ static logical c_false = FALSE_; zmvch_(trans, &n, &n, &c_b2, &a[a_offset], nmax, &x[1], &incx, &c_b1, &z__[ - 1], &incx, &xt[1], &g[1], &xx[1], + 1], &incx, &xt[1], &g[1], &xx[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); } else if (s_cmp(sname + 3, "SV", (ftnlen)2, ( @@ -2549,18 +2559,18 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__4; ++i__) { i__5 = i__; i__6 = (i__ - 1) * abs(incx) + 1; - z__[i__5].r = xx[i__6].r, z__[i__5].i + z__[i__5].r = xx[i__6].r, z__[i__5].i = xx[i__6].i; i__5 = (i__ - 1) * abs(incx) + 1; i__6 = i__; - xx[i__5].r = x[i__6].r, xx[i__5].i = + xx[i__5].r = x[i__6].r, xx[i__5].i = x[i__6].i; /* L50: */ } zmvch_(trans, &n, &n, &c_b2, &a[a_offset], nmax, &z__[1], &incx, &c_b1, &x[ - 1], &incx, &xt[1], &g[1], &xx[1], - eps, &err, fatal, nout, &c_false, + 1], &incx, &xt[1], &g[1], &xx[1], + eps, &err, fatal, nout, &c_false, (ftnlen)1); } errmax = max(errmax,err); @@ -2662,12 +2672,12 @@ static logical c_false = FALSE_; } /* zchk3_ */ -/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, + alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, 
doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal * g, doublecomplex *z__, ftnlen sname_len) { @@ -2713,26 +2723,26 @@ static logical c_false = FALSE_; logical null; doublecomplex alpha; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, + extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical reset; integer incxs, incys; - extern /* Subroutine */ int zmvch_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, doublereal *, + extern /* Subroutine */ int zmvch_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen), zgeru_( integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal errmax; doublecomplex transl; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -2834,7 +2844,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = m - 1; 
zmake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (m > 1) { i__3 = m / 2; @@ -2873,7 +2883,7 @@ static logical c_false = FALSE_; transl.r = 0., transl.i = 0.; i__5 = m - 1; i__6 = n - 1; - zmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], + zmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -3032,9 +3042,9 @@ static logical c_false = FALSE_; d_cnjg(&z__1, w); w[0].r = z__1.r, w[0].i = z__1.i; } - zmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, + zmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, w, &c__1, &c_b2, &a[j * a_dim1 + 1], & - c__1, &yt[1], &g[1], &aa[(j - 1) * + c__1, &yt[1], &g[1], &aa[(j - 1) * lda + 1], eps, &err, fatal, nout, & c_true, (ftnlen)1); errmax = max(errmax,err); @@ -3114,12 +3124,12 @@ static logical c_false = FALSE_; } /* zchk4_ */ -/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, + alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal * g, doublecomplex *z__, ftnlen sname_len) { @@ -3169,32 +3179,32 @@ static logical c_false = FALSE_; doublereal rals; integer incx; logical full; - extern /* Subroutine */ int zher_(char *, integer *, doublereal *, + extern /* Subroutine */ 
int zher_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, ftnlen); logical null; char uplo[1]; - extern /* Subroutine */ int zhpr_(char *, integer *, doublereal *, + extern /* Subroutine */ int zhpr_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *, ftnlen); doublecomplex alpha; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; logical reset; integer incxs; - extern /* Subroutine */ int zmvch_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, doublereal *, + extern /* Subroutine */ int zmvch_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); logical upper; char uplos[1]; logical packed; doublereal ralpha, errmax; doublecomplex transl; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -3297,7 +3307,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; zmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { i__3 = n / 2; @@ -3372,7 +3382,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - zher_(uplo, &n, &ralpha, &xx[1], 
&incx, &aa[1], &lda, + zher_(uplo, &n, &ralpha, &xx[1], &incx, &aa[1], &lda, (ftnlen)1); } else if (packed) { if (*trace) { @@ -3482,9 +3492,9 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - zmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, - &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, - &yt[1], &g[1], &aa[ja], eps, &err, fatal, + zmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, + &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, + &yt[1], &g[1], &aa[ja], eps, &err, fatal, nout, &c_true, (ftnlen)1); if (full) { if (upper) { @@ -3582,12 +3592,12 @@ static logical c_false = FALSE_; } /* zchk5_ */ -/* Subroutine */ int zchk6_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk6_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, + alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal * g, doublecomplex *z__, ftnlen sname_len) { @@ -3617,7 +3627,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, + integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1, z__2, z__3; alist al__1; @@ -3639,31 +3649,31 @@ static logical c_false = FALSE_; integer incx, incy; logical full, null; char uplo[1]; - extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, integer 
*, ftnlen), zhpr2_(char *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, + extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *, ftnlen), zhpr2_(char *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, ftnlen); doublecomplex alpha; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; logical reset; integer incxs, incys; - extern /* Subroutine */ int zmvch_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, doublereal *, + extern /* Subroutine */ int zmvch_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); logical upper; char uplos[1]; logical packed; doublereal errmax; doublecomplex transl; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -3768,7 +3778,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; zmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { i__3 = n / 2; @@ -3808,7 +3818,7 @@ static logical c_false = FALSE_; 
transl.r = 0., transl.i = 0.; i__5 = n - 1; i__6 = n - 1; - zmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], + zmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -3996,14 +4006,14 @@ static logical c_false = FALSE_; i__5 = n; for (j = 1; j <= i__5; ++j) { d_cnjg(&z__2, &z__[j + (z_dim1 << 1)]); - z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, - z__1.i = alpha.r * z__2.i + alpha.i * + z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, + z__1.i = alpha.r * z__2.i + alpha.i * z__2.r; w[0].r = z__1.r, w[0].i = z__1.i; d_cnjg(&z__2, &alpha); d_cnjg(&z__3, &z__[j + z_dim1]); - z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, - z__1.i = z__2.r * z__3.i + z__2.i * + z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, + z__1.i = z__2.r * z__3.i + z__2.i * z__3.r; w[1].r = z__1.r, w[1].i = z__1.i; if (upper) { @@ -4013,8 +4023,8 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - zmvch_("N", &lj, &c__2, &c_b2, &z__[jj + - z_dim1], nmax, w, &c__1, &c_b2, &a[jj + zmvch_("N", &lj, &c__2, &c_b2, &z__[jj + + z_dim1], nmax, w, &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, &yt[1], &g[1], & aa[ja], eps, &err, fatal, nout, & c_true, (ftnlen)1); @@ -4119,7 +4129,7 @@ static logical c_false = FALSE_; } /* zchk6_ */ -/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -4133,47 +4143,47 @@ static logical c_false = FALSE_; /* Local variables */ doublecomplex a[1] /* was [1][1] */, x[1], y[1], beta; - extern /* Subroutine */ int zher_(char *, integer *, doublereal *, - doublecomplex *, integer *, doublecomplex *, integer *, ftnlen), + extern /* Subroutine */ int zher_(char *, integer *, doublereal *, + doublecomplex *, integer *, doublecomplex *, integer *, ftnlen), zhpr_(char *, integer *, doublereal *, doublecomplex *, integer *, - doublecomplex *, ftnlen), zher2_(char *, 
integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, integer *, ftnlen), zhpr2_(char *, - integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, ftnlen), zher2_(char *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *, ftnlen), zhpr2_(char *, + integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, ftnlen); doublecomplex alpha; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, integer *), zgbmv_(char *, integer *, integer *, + extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *), zgbmv_(char *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, ftnlen), zhbmv_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), - zgemv_(char *, integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, integer *, ftnlen), zhemv_(char - *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, ftnlen), zgeru_(integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, integer *), ztbmv_(char *, char *, char *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, ftnlen), zhbmv_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), + 
zgemv_(char *, integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *, ftnlen), zhemv_(char + *, integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, ftnlen), zgeru_(integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *), ztbmv_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, - integer *, ftnlen, ftnlen, ftnlen), zhpmv_(char *, integer *, - doublecomplex *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, integer *, ftnlen), ztbsv_(char - *, char *, char *, integer *, integer *, doublecomplex *, integer + integer *, ftnlen, ftnlen, ftnlen), zhpmv_(char *, integer *, + doublecomplex *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *, ftnlen), ztbsv_(char + *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen), ztpmv_( - char *, char *, char *, integer *, doublecomplex *, doublecomplex - *, integer *, ftnlen, ftnlen, ftnlen), ztrmv_(char *, char *, - char *, integer *, doublecomplex *, integer *, doublecomplex *, + char *, char *, char *, integer *, doublecomplex *, doublecomplex + *, integer *, ftnlen, ftnlen, ftnlen), ztrmv_(char *, char *, + char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen), ztpsv_(char *, char *, char *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, - ftnlen, ftnlen), ztrsv_(char *, char *, char *, integer *, - doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, + ftnlen, ftnlen), ztrsv_(char *, char *, char *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, 
ftnlen, ftnlen, ftnlen); doublereal ralpha; - extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical + extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -4702,9 +4712,9 @@ static logical c_false = FALSE_; } /* zchke_ */ -/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, - integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, - integer *lda, integer *kl, integer *ku, logical *reset, doublecomplex +/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, + integer *lda, integer *kl, integer *ku, logical *reset, doublecomplex *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4765,7 +4775,7 @@ static logical c_false = FALSE_; i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { if (gen || upper && i__ <= j || lower && i__ >= j) { - if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) + if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) { i__3 = i__ + j * a_dim1; zbeg_(&z__2, reset); @@ -4998,11 +5008,11 @@ static logical c_false = FALSE_; } /* zmake_ */ -/* Subroutine */ int zmvch_(char *trans, integer *m, integer *n, +/* Subroutine */ int zmvch_(char *trans, integer *m, integer *n, doublecomplex *alpha, doublecomplex *a, integer *nmax, doublecomplex * x, integer *incx, doublecomplex *beta, doublecomplex *y, integer * - incy, doublecomplex *yt, doublereal *g, doublecomplex *yy, doublereal - *eps, doublereal *err, logical *fatal, integer *nout, logical *mv, + incy, doublecomplex *yt, doublereal *g, doublecomplex *yy, doublereal + *eps, doublereal *err, logical *fatal, integer *nout, logical *mv, ftnlen trans_len) { /* Format strings */ @@ -5105,15 +5115,15 @@ static logical c_false = FALSE_; i__4 = iy; i__5 = j + i__ * a_dim1; i__6 = jx; - z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, + 
z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, z__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6] .r; z__1.r = yt[i__4].r + z__2.r, z__1.i = yt[i__4].i + z__2.i; yt[i__3].r = z__1.r, yt[i__3].i = z__1.i; i__3 = j + i__ * a_dim1; i__4 = jx; - g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j - + i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, + g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j + + i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, abs(d__3)) + (d__4 = d_imag(&x[jx]), abs(d__4))); jx += incxl; /* L10: */ @@ -5125,14 +5135,14 @@ static logical c_false = FALSE_; i__4 = iy; d_cnjg(&z__3, &a[j + i__ * a_dim1]); i__5 = jx; - z__2.r = z__3.r * x[i__5].r - z__3.i * x[i__5].i, z__2.i = + z__2.r = z__3.r * x[i__5].r - z__3.i * x[i__5].i, z__2.i = z__3.r * x[i__5].i + z__3.i * x[i__5].r; z__1.r = yt[i__4].r + z__2.r, z__1.i = yt[i__4].i + z__2.i; yt[i__3].r = z__1.r, yt[i__3].i = z__1.i; i__3 = j + i__ * a_dim1; i__4 = jx; - g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j - + i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, + g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j + + i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, abs(d__3)) + (d__4 = d_imag(&x[jx]), abs(d__4))); jx += incxl; /* L20: */ @@ -5144,7 +5154,7 @@ static logical c_false = FALSE_; i__4 = iy; i__5 = i__ + j * a_dim1; i__6 = jx; - z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, + z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, z__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6] .r; z__1.r = yt[i__4].r + z__2.r, z__1.i = yt[i__4].i + z__2.i; @@ -5152,7 +5162,7 @@ static logical c_false = FALSE_; i__3 = i__ + j * a_dim1; i__4 = jx; g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[ - i__ + j * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, + i__ + j * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, abs(d__3)) + (d__4 = d_imag(&x[jx]), abs(d__4))); jx += incxl; /* L30: */ @@ -5160,7 +5170,7 @@ static logical 
c_false = FALSE_; } i__2 = iy; i__3 = iy; - z__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, z__2.i = + z__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, z__2.i = alpha->r * yt[i__3].i + alpha->i * yt[i__3].r; i__4 = iy; z__3.r = beta->r * y[i__4].r - beta->i * y[i__4].i, z__3.i = beta->r * @@ -5169,7 +5179,7 @@ static logical c_false = FALSE_; yt[i__2].r = z__1.r, yt[i__2].i = z__1.i; i__2 = iy; g[iy] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), abs( - d__2))) * g[iy] + ((d__3 = beta->r, abs(d__3)) + (d__4 = + d__2))) * g[iy] + ((d__3 = beta->r, abs(d__3)) + (d__4 = d_imag(beta), abs(d__4))) * ((d__5 = y[i__2].r, abs(d__5)) + ( d__6 = d_imag(&y[iy]), abs(d__6))); iy += incyl; @@ -5281,8 +5291,8 @@ logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr) } /* lze_ */ -logical lzeres_(char *type__, char *uplo, integer *m, integer *n, - doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len, +logical lzeres_(char *type__, char *uplo, integer *m, integer *n, + doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ @@ -5459,7 +5469,7 @@ doublereal ddiff_(doublereal *x, doublereal *y) } /* ddiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/zblat3.c b/blastest/src/zblat3.c index 3ff3634b68..045eeba420 100644 --- a/blastest/src/zblat3.c +++ b/blastest/src/zblat3.c @@ -140,9 +140,14 @@ static integer c_n1 = -1; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "zblat3"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*9] = "ZGEMM " "ZHEMM " "ZSYMM " "ZTRMM " "ZTRSM " + static char snames[6*9] = "ZGEMM " "ZHEMM " 
"ZSYMM " "ZTRMM " "ZTRSM " "ZHERK " "ZSYRK " "ZHER2K" "ZSYR2K"; /* Format strings */ @@ -186,10 +191,10 @@ static integer c_n1 = -1; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -208,44 +213,44 @@ static integer c_n1 = -1; integer nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, + extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, + ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex - *, doublecomplex *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - ftnlen), 
zchk3_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex + *, doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, + ftnlen), zchk3_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, - ftnlen), zchk4_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, + ftnlen), zchk4_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex - *, doublecomplex *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - ftnlen), zchk5_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, + doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex + *, doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, + ftnlen), zchk5_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex - *, doublecomplex *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex + 
*, doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, ftnlen); logical fatal, trace; integer nidim; - extern /* Subroutine */ int zchke_(integer *, char *, integer *, ftnlen), - zmmch_(char *, char *, integer *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, integer *, - doublereal *, doublereal *, logical *, integer *, logical *, + extern /* Subroutine */ int zchke_(integer *, char *, integer *, ftnlen), + zmmch_(char *, char *, integer *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, integer *, + doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); char snaps[32]; integer isnum; @@ -517,7 +522,7 @@ static integer c_n1 = -1; goto L60; } for (i__ = 1; i__ <= 9; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L50; } @@ -580,7 +585,7 @@ static integer c_n1 = -1; *(unsigned char *)transa = 'N'; *(unsigned char *)transb = 'N'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { @@ -595,7 +600,7 @@ static integer c_n1 = -1; } *(unsigned char *)transb = 'C'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! 
same || err != 0.) { @@ -628,7 +633,7 @@ static integer c_n1 = -1; *(unsigned char *)transa = 'C'; *(unsigned char *)transb = 'N'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { @@ -643,7 +648,7 @@ static integer c_n1 = -1; } *(unsigned char *)transb = 'C'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { @@ -697,34 +702,34 @@ static integer c_n1 = -1; /* Test ZGEMM, 01. */ L140: zchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test ZHEMM, 02, ZSYMM, 03. */ L150: zchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test ZTRMM, 04, ZTRSM, 05. */ L160: zchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6); goto L190; /* Test ZHERK, 06, ZSYRK, 07. 
*/ L170: zchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test ZHER2K, 08, ZSYR2K, 09. */ L180: zchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, ( ftnlen)6); goto L190; @@ -768,15 +773,20 @@ static integer c_n1 = -1; /* End of ZBLAT3. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ -/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * g, ftnlen sname_len) { @@ -802,7 +812,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; alist al__1; @@ -811,7 +821,7 @@ static integer c_n1 = -1; f_rew(alist *); /* Local variables */ - integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, + 
integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, ica, icb, laa, lbb, lda, lcc, ldb, ldc; doublecomplex als, bls; doublereal err; @@ -821,23 +831,23 @@ static integer c_n1 = -1; logical same, null; doublecomplex alpha; logical isame[13], trana, tranb; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *, ftnlen, ftnlen), zgemm_(char *, char *, integer *, + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *, ftnlen, ftnlen), zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); logical reset; char tranas[1], tranbs[1], transa[1], transb[1]; doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -928,7 +938,7 @@ static integer c_n1 = -1; for (ica = 1; ica <= 3; ++ica) { *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] ; - trana = 
*(unsigned char *)transa == 'T' || *(unsigned + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; if (trana) { @@ -956,9 +966,9 @@ static integer c_n1 = -1; ftnlen)1); for (icb = 1; icb <= 3; ++icb) { - *(unsigned char *)transb = *(unsigned char *)&ich[icb + *(unsigned char *)transb = *(unsigned char *)&ich[icb - 1]; - tranb = *(unsigned char *)transb == 'T' || *(unsigned + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; if (tranb) { @@ -1099,13 +1109,13 @@ static integer c_n1 = -1; isame[2] = ms == m; isame[3] = ns == n; isame[4] = ks == k; - isame[5] = als.r == alpha.r && als.i == + isame[5] = als.r == alpha.r && als.i == alpha.i; isame[6] = lze_(&as[1], &aa[1], &laa); isame[7] = ldas == lda; isame[8] = lze_(&bs[1], &bb[1], &lbb); isame[9] = ldbs == ldb; - isame[10] = bls.r == beta.r && bls.i == + isame[10] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[11] = lze_(&cs[1], &cc[1], &lcc); @@ -1143,9 +1153,9 @@ static integer c_n1 = -1; zmmch_(transa, transb, &m, &n, &k, &alpha, &a[a_offset], nmax, &b[b_offset], - nmax, &beta, &c__[c_offset], + nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, - eps, &err, fatal, nout, &c_true, + eps, &err, fatal, nout, &c_true, (ftnlen)1, (ftnlen)1); errmax = max(errmax,err); /* If got really bad answer, report and */ @@ -1226,12 +1236,12 @@ static integer c_n1 = -1; } /* zchk1_ */ -/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex 
*bs, doublecomplex *c__, doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * g, ftnlen sname_len) { @@ -1258,7 +1268,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; alist al__1; @@ -1267,7 +1277,7 @@ static integer c_n1 = -1; integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *); /* Local variables */ - integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, + integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, ldb, ldc, ics; doublecomplex als, bls; integer icu; @@ -1282,27 +1292,27 @@ static integer c_n1 = -1; doublecomplex alpha; logical isame[13]; char sides[1]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *, ftnlen, ftnlen), zhemm_(char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *, ftnlen, ftnlen), zhemm_(char *, char *, 
integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); logical reset; char uplos[1]; - extern /* Subroutine */ int zsymm_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, + extern /* Subroutine */ int zsymm_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -1443,7 +1453,7 @@ static integer c_n1 = -1; /* Generate the matrix C. */ - zmake_("GE", " ", " ", &m, &n, &c__[c_offset], + zmake_("GE", " ", " ", &m, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b1, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1539,9 +1549,9 @@ static integer c_n1 = -1; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)sides == *(unsigned + isame[0] = *(unsigned char *)sides == *(unsigned char *)side; - isame[1] = *(unsigned char *)uplos == *(unsigned + isame[1] = *(unsigned char *)uplos == *(unsigned char *)uplo; isame[2] = ms == m; isame[3] = ns == n; @@ -1586,14 +1596,14 @@ static integer c_n1 = -1; if (left) { zmmch_("N", "N", &m, &n, &m, &alpha, &a[ - a_offset], nmax, &b[b_offset], + a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { zmmch_("N", "N", &m, &n, &n, &alpha, &b[ - b_offset], nmax, &a[a_offset], + b_offset], nmax, &a[a_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( @@ -1673,12 +1683,12 @@ static integer c_n1 = -1; } /* zchk2_ */ -/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nmax, doublecomplex *a, doublecomplex *aa, - doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex - *bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, ftnlen + alf, integer *nmax, doublecomplex *a, doublecomplex *aa, + doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex + *bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, ftnlen sname_len) { /* Initialized data */ @@ -1705,7 +1715,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1; alist al__1; @@ -1731,27 +1741,27 @@ static integer c_n1 = -1; char diags[1]; logical isame[13]; char sides[1]; - extern /* 
Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical reset; char uplos[1]; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), - ztrsm_(char *, char *, char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, + doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), + ztrsm_(char *, char *, char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen); char tranas[1], transa[1]; doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -1888,7 +1898,7 @@ static integer c_n1 = -1; /* Generate the matrix B. 
*/ - zmake_("GE", " ", " ", &m, &n, &b[b_offset], + zmake_("GE", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &reset, &c_b1, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1960,7 +1970,7 @@ static integer c_n1 = -1; } ztrmm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } else if (s_cmp(sname + 3, "SM", (ftnlen)2, ( ftnlen)2) == 0) { @@ -1993,7 +2003,7 @@ static integer c_n1 = -1; } ztrsm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } @@ -2019,7 +2029,7 @@ static integer c_n1 = -1; unsigned char *)diag; isame[4] = ms == m; isame[5] = ns == n; - isame[6] = als.r == alpha.r && als.i == + isame[6] = als.r == alpha.r && als.i == alpha.i; isame[7] = lze_(&as[1], &aa[1], &laa); isame[8] = ldas == lda; @@ -2063,18 +2073,18 @@ static integer c_n1 = -1; zmmch_(transa, "N", &m, &n, &m, & alpha, &a[a_offset], nmax, &b[b_offset], nmax, & - c_b1, &c__[c_offset], + c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { zmmch_("N", transa, &m, &n, &n, & alpha, &b[b_offset], nmax, &a[a_offset], nmax, & - c_b1, &c__[c_offset], + c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } @@ -2087,14 +2097,14 @@ static integer c_n1 = -1; i__4 = n; for (j = 1; j <= i__4; ++j) { i__5 = m; - for (i__ = 1; i__ <= i__5; ++i__) + for (i__ = 1; i__ <= i__5; ++i__) { i__6 = i__ + j * c_dim1; i__7 = i__ + (j - 1) * ldb; c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; i__6 = i__ + (j - 1) * ldb; i__7 = i__ + j * b_dim1; - z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, z__1.i = alpha.r * b[i__7].i + alpha.i * b[ i__7].r; bb[i__6].r = z__1.r, 
bb[i__6].i = z__1.i; @@ -2105,20 +2115,20 @@ static integer c_n1 = -1; if (left) { zmmch_(transa, "N", &m, &n, &m, & - c_b2, &a[a_offset], nmax, + c_b2, &a[a_offset], nmax, &c__[c_offset], nmax, & - c_b1, &b[b_offset], nmax, + c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } else { zmmch_("N", transa, &m, &n, &n, & - c_b2, &c__[c_offset], - nmax, &a[a_offset], nmax, + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, &c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } @@ -2199,12 +2209,12 @@ static integer c_n1 = -1; } /* zchk3_ */ -/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * g, ftnlen sname_len) { @@ -2236,7 +2246,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1; alist al__1; @@ -2262,29 +2272,29 @@ static integer c_n1 = -1; doublecomplex alpha; doublereal rbeta; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, 
doublecomplex *, integer *, doublecomplex *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); doublereal rbets; logical reset; - extern /* Subroutine */ int zherk_(char *, char *, integer *, integer *, - doublereal *, doublecomplex *, integer *, doublereal *, + extern /* Subroutine */ int zherk_(char *, char *, integer *, integer *, + doublereal *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *, ftnlen, ftnlen); char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int zsyrk_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, + extern /* Subroutine */ int zsyrk_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); doublereal ralpha, errmax; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); char transs[1], transt[1]; @@ -2426,7 +2436,7 @@ static integer c_n1 = -1; } null = n <= 0; if (conj) { - null = null || (k <= 0 || ralpha == 0.) && + null = null || (k <= 0 || ralpha == 0.) 
&& rbeta == 1.; } @@ -2505,7 +2515,7 @@ static integer c_n1 = -1; f_rew(&al__1); } zherk_(uplo, trans, &n, &k, &ralpha, &aa[1], & - lda, &rbeta, &cc[1], &ldc, (ftnlen)1, + lda, &rbeta, &cc[1], &ldc, (ftnlen)1, (ftnlen)1); } else { if (*trace) { @@ -2552,16 +2562,16 @@ static integer c_n1 = -1; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; if (conj) { isame[4] = rals == ralpha; } else { - isame[4] = als.r == alpha.r && als.i == + isame[4] = als.r == alpha.r && als.i == alpha.i; } isame[5] = lze_(&as[1], &aa[1], &laa); @@ -2569,7 +2579,7 @@ static integer c_n1 = -1; if (conj) { isame[7] = rbets == rbeta; } else { - isame[7] = bets.r == beta.r && bets.i == + isame[7] = bets.r == beta.r && bets.i == beta.i; } if (null) { @@ -2623,19 +2633,19 @@ static integer c_n1 = -1; } if (tran) { zmmch_(transt, "N", &lj, &c__1, &k, & - alpha, &a[jj * a_dim1 + 1], - nmax, &a[j * a_dim1 + 1], - nmax, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { zmmch_("N", transt, &lj, &c__1, &k, & - alpha, &a[jj + a_dim1], nmax, + alpha, &a[jj + a_dim1], nmax, &a[j + a_dim1], nmax, &beta, & c__[jj + j * c_dim1], nmax, & - ct[1], &g[1], &cc[jc], &ldc, + ct[1], &g[1], &cc[jc], &ldc, eps, &err, fatal, nout, & c_true, (ftnlen)1, (ftnlen)1); } @@ -2743,12 +2753,12 @@ static integer c_n1 = -1; } /* zchk4_ */ -/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical 
*trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, - doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, + ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, + doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *g, doublecomplex *w, ftnlen sname_len) { @@ -2807,30 +2817,30 @@ static integer c_n1 = -1; doublecomplex alpha; doublereal rbeta; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); doublereal rbets; logical reset; char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int zher2k_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublereal *, doublecomplex *, integer *, ftnlen, - ftnlen), zsyr2k_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, 
doublecomplex *, integer *, ftnlen, + extern /* Subroutine */ int zher2k_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublereal *, doublecomplex *, integer *, ftnlen, + ftnlen), zsyr2k_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); char transs[1], transt[1]; @@ -2986,7 +2996,7 @@ static integer c_n1 = -1; } null = n <= 0; if (conj) { - null = null || (k <= 0 || alpha.r == 0. && + null = null || (k <= 0 || alpha.r == 0. && alpha.i == 0.) && rbeta == 1.; } @@ -3121,9 +3131,9 @@ static integer c_n1 = -1; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -3135,7 +3145,7 @@ static integer c_n1 = -1; if (conj) { isame[9] = rbets == rbeta; } else { - isame[9] = bets.r == beta.r && bets.i == + isame[9] = bets.r == beta.r && bets.i == beta.i; } if (null) { @@ -3191,20 +3201,20 @@ static integer c_n1 = -1; i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { i__7 = i__; - i__8 = (j - 1 << 1) * *nmax + k + + i__8 = (j - 1 << 1) * *nmax + k + i__; - z__1.r = alpha.r * ab[i__8].r - - alpha.i * ab[i__8].i, + z__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, z__1.i = alpha.r * ab[ i__8].i + alpha.i * ab[ i__8].r; - w[i__7].r = z__1.r, w[i__7].i = + w[i__7].r = z__1.r, w[i__7].i = z__1.i; if (conj) { i__7 = k + i__; d_cnjg(&z__2, &alpha); i__8 = (j - 1 << 1) * *nmax + i__; - z__1.r = z__2.r * ab[i__8].r - 
z__2.i * ab[i__8].i, + z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[ i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; @@ -3212,7 +3222,7 @@ static integer c_n1 = -1; i__7 = k + i__; i__8 = (j - 1 << 1) * *nmax + i__; z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; } @@ -3223,9 +3233,9 @@ static integer c_n1 = -1; i__8 = *nmax << 1; zmmch_(transt, "N", &lj, &c__1, &i__6, &c_b2, &ab[jjab], &i__7, &w[ - 1], &i__8, &beta, &c__[jj + j + 1], &i__8, &beta, &c__[jj + j * c_dim1], nmax, &ct[1], &g[1] - , &cc[jc], &ldc, eps, &err, + , &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { @@ -3234,14 +3244,14 @@ static integer c_n1 = -1; if (conj) { i__7 = i__; d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]); - z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, - z__1.i = alpha.r * z__2.i + alpha.i * + z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, + z__1.i = alpha.r * z__2.i + alpha.i * z__2.r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; i__7 = k + i__; i__8 = (i__ - 1) * *nmax + j; z__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, z__2.i = alpha.r * ab[i__8].i + alpha.i + .i, z__2.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; d_cnjg(&z__1, &z__2); w[i__7].r = z__1.r, w[i__7].i = z__1.i; @@ -3249,13 +3259,13 @@ static integer c_n1 = -1; i__7 = i__; i__8 = (k + i__ - 1) * *nmax + j; z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; i__7 = k + i__; i__8 = (i__ - 1) * *nmax + j; z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; } @@ -3265,9 +3275,9 @@ static integer 
c_n1 = -1; i__7 = *nmax << 1; zmmch_("N", "N", &lj, &c__1, &i__6, & c_b2, &ab[jj], nmax, &w[1], & - i__7, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } @@ -3380,7 +3390,7 @@ static integer c_n1 = -1; } /* zchk5_ */ -/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3393,37 +3403,37 @@ static integer c_n1 = -1; integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void); /* Local variables */ - doublecomplex a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] + doublecomplex a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] /* was [2][1] */, beta, alpha; doublereal rbeta; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, ftnlen, ftnlen), zhemm_(char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, ftnlen, ftnlen), zherk_(char *, char *, integer *, + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, ftnlen, ftnlen), zhemm_(char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, ftnlen, ftnlen), zherk_(char *, char *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, - doublecomplex *, integer *, ftnlen, ftnlen), ztrmm_(char *, char - *, char *, char *, integer *, integer *, doublecomplex *, 
- doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, - ftnlen, ftnlen, ftnlen), zsymm_(char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, + doublecomplex *, integer *, ftnlen, ftnlen), ztrmm_(char *, char + *, char *, char *, integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, + ftnlen, ftnlen, ftnlen), zsymm_(char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer * - , doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), - zsyrk_(char *, char *, integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, ftnlen, ftnlen), zher2k_(char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublereal *, doublecomplex *, - integer *, ftnlen, ftnlen), zsyr2k_(char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, + , doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), + zsyrk_(char *, char *, integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, ftnlen, ftnlen), zher2k_(char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublereal *, doublecomplex *, + integer *, ftnlen, ftnlen), zsyr2k_(char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); doublereal ralpha; - extern /* Subroutine */ int chkxer_(char *, integer *, integer 
*, logical + extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -3485,302 +3495,302 @@ static integer c_n1 = -1; } L10: infoc_1.infot = 1; - zgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - zgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - zgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - zgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - zgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - zgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", 
"N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("T", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c_n1, &c__0, 
&c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, 
&c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, 
&beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("T", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, 
(ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("C", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, 
(ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, + zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, + zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); 
chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); 
chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); @@ -4960,9 +4970,9 @@ static integer c_n1 = -1; } /* zchke_ */ -/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, - integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, - integer *lda, logical *reset, doublecomplex *transl, ftnlen type_len, +/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, + integer *lda, logical *reset, doublecomplex *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -5148,10 +5158,10 @@ static integer c_n1 = -1; } /* zmake_ */ /* Subroutine */ int zmmch_(char *transa, char *transb, integer *m, integer * - n, integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, + n, 
integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex * c__, integer *ldc, doublecomplex *ct, doublereal *g, doublecomplex * - cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, + cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen transb_len) { /* Format strings */ @@ -5165,7 +5175,7 @@ static integer c_n1 = -1; " \002,i3)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublereal d__1, d__2, d__3, d__4, d__5, d__6; doublecomplex z__1, z__2, z__3, z__4; @@ -5224,9 +5234,9 @@ static integer c_n1 = -1; cc -= cc_offset; /* Function Body */ - trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; - tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; ctrana = *(unsigned char *)transa == 'C'; ctranb = *(unsigned char *)transb == 'C'; @@ -5254,17 +5264,17 @@ static integer c_n1 = -1; i__5 = i__; i__6 = i__ + k * a_dim1; i__7 = k + j * b_dim1; - z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ i__7].r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = i__ + k * a_dim1; i__5 = k + j * b_dim1; g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag( &a[i__ + k * a_dim1]), abs(d__2))) * ((d__3 = b[ - i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * + i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * b_dim1]), 
abs(d__4))); /* L20: */ } @@ -5280,15 +5290,15 @@ static integer c_n1 = -1; i__5 = i__; d_cnjg(&z__3, &a[k + i__ * a_dim1]); i__6 = k + j * b_dim1; - z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, z__2.i = z__3.r * b[i__6].i + z__3.i * b[i__6] .r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = k + j * b_dim1; - g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( &b[k + j * b_dim1]), abs(d__4))); @@ -5308,12 +5318,12 @@ static integer c_n1 = -1; z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] .i * b[i__7].r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = k + j * b_dim1; - g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( &b[k + j * b_dim1]), abs(d__4))); @@ -5332,15 +5342,15 @@ static integer c_n1 = -1; i__5 = i__; i__6 = i__ + k * a_dim1; d_cnjg(&z__3, &b[j + k * b_dim1]); - z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, - z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * z__3.r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = i__ + k * a_dim1; i__5 = j + k * b_dim1; - g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[i__ + k 
* a_dim1]), abs(d__2))) * (( d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( &b[j + k * b_dim1]), abs(d__4))); @@ -5360,12 +5370,12 @@ static integer c_n1 = -1; z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] .i * b[i__7].r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = i__ + k * a_dim1; i__5 = j + k * b_dim1; - g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * (( d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( &b[j + k * b_dim1]), abs(d__4))); @@ -5385,17 +5395,17 @@ static integer c_n1 = -1; i__5 = i__; d_cnjg(&z__3, &a[k + i__ * a_dim1]); d_cnjg(&z__4, &b[j + k * b_dim1]); - z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, - z__2.i = z__3.r * z__4.i + z__3.i * + z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, + z__2.i = z__3.r * z__4.i + z__3.i * z__4.r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[k + i__ * a_dim1]), abs(d__2))) - * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(&b[j + k * b_dim1]), abs(d__4))); /* L120: */ } @@ -5410,17 +5420,17 @@ static integer c_n1 = -1; i__5 = i__; d_cnjg(&z__3, &a[k + i__ * a_dim1]); i__6 = j + k * b_dim1; - z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, z__2.i = z__3.r * b[i__6].i + z__3.i * b[ i__6].r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
d_imag(&a[k + i__ * a_dim1]), abs(d__2))) - * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(&b[j + k * b_dim1]), abs(d__4))); /* L140: */ } @@ -5437,17 +5447,17 @@ static integer c_n1 = -1; i__5 = i__; i__6 = k + i__ * a_dim1; d_cnjg(&z__3, &b[j + k * b_dim1]); - z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, - z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * z__3.r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[k + i__ * a_dim1]), abs(d__2))) - * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(&b[j + k * b_dim1]), abs(d__4))); /* L160: */ } @@ -5463,16 +5473,16 @@ static integer c_n1 = -1; i__6 = k + i__ * a_dim1; i__7 = j + k * b_dim1; z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[ - i__7].i, z__2.i = a[i__6].r * b[i__7].i + + i__7].i, z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[i__7].r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[k + i__ * a_dim1]), abs(d__2))) - * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(&b[j + k * b_dim1]), abs(d__4))); /* L180: */ } @@ -5485,17 +5495,17 @@ static integer c_n1 = -1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; - z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = + z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; i__5 = i__ + j * c_dim1; - z__3.r = beta->r * c__[i__5].r - 
beta->i * c__[i__5].i, z__3.i = + z__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, z__3.i = beta->r * c__[i__5].i + beta->i * c__[i__5].r; z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; ct[i__3].r = z__1.r, ct[i__3].i = z__1.i; i__3 = i__ + j * c_dim1; - g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), + g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), abs(d__2))) * g[i__] + ((d__3 = beta->r, abs(d__3)) + ( - d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, + d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, abs(d__5)) + (d__6 = d_imag(&c__[i__ + j * c_dim1]), abs( d__6))); /* L200: */ @@ -5621,8 +5631,8 @@ logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr) } /* lze_ */ -logical lzeres_(char *type__, char *uplo, integer *m, integer *n, - doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len, +logical lzeres_(char *type__, char *uplo, integer *m, integer *n, + doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ @@ -5807,7 +5817,7 @@ doublereal ddiff_(doublereal *x, doublereal *y) } /* ddiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/build/add-copyright.py b/build/add-copyright.py index e22ebd16c2..ae331c94e2 100755 --- a/build/add-copyright.py +++ b/build/add-copyright.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # @@ -200,7 +200,7 @@ def main(): file_string = "".join( file_lines ) # Search for an existing copyright line. - has_cr = re.search( 'Copyright \(C\)', file_string ) + has_cr = re.search( r'Copyright \(C\)', file_string ) # If the file does not have any copyright notice in it already, we # assume we don't need to update it. 
@@ -210,7 +210,7 @@ def main(): # Check whether the file already has a copyright for the_org. We may # need to use this information later. - has_org_cr = re.search( 'Copyright \(C\) ([0-9][0-9][0-9][0-9]), %s' % the_org, file_string ) + has_org_cr = re.search( r'Copyright \(C\) ([0-9][0-9][0-9][0-9]), %s' % the_org, file_string ) # Initialize the list of processed (potentially modified) file lines. mod_file_lines = [] @@ -225,7 +225,7 @@ def main(): # Iterate through the lines in the current file. for line in file_lines: - result = re.search( 'Copyright \(C\) ([0-9][0-9][0-9][0-9]), %s' % the_org, line ) + result = re.search( r'Copyright \(C\) ([0-9][0-9][0-9][0-9]), %s' % the_org, line ) # If the current line matches a copyright line for the_org... if result: @@ -253,7 +253,7 @@ def main(): # Add the unchanged line to the running list. mod_file_lines += line - + else: # Add the unchanged line to the running list. mod_file_lines += line @@ -284,8 +284,8 @@ def main(): line_next = file_lines[i] # Try to match both the current line and the next line. - result = re.search( 'Copyright \(C\) ([0-9][0-9][0-9][0-9]), (.*)', line ) - resnext = re.search( 'Copyright \(C\) ([0-9][0-9][0-9][0-9]), (.*)', line_next ) + result = re.search( r'Copyright \(C\) ([0-9][0-9][0-9][0-9]), (.*)', line ) + resnext = re.search( r'Copyright \(C\) ([0-9][0-9][0-9][0-9]), (.*)', line_next ) # Parse the results. if result: @@ -301,7 +301,7 @@ def main(): # The current line matches but the next does not. Thus, # this branch only executes for the *last* copyright line # in the file. - + # Extract the year and organization from the matched # string. 
old_year = result.group(1) diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h b/build/bli_addon.h.in similarity index 90% rename from sandbox/gemmlike/thread/bls_l3_decor_openmp.h rename to build/bli_addon.h.in index 9c956d7c36..36a8e29bd1 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h +++ b/build/bli_addon.h.in @@ -32,13 +32,16 @@ */ -#ifndef BLIS_SBX_L3_DECOR_OPENMP_H -#define BLIS_SBX_L3_DECOR_OPENMP_H - -// Definitions specific to situations when OpenMP multithreading is enabled. -#ifdef BLIS_ENABLE_OPENMP +#ifndef BLIS_ADDON_H +#define BLIS_ADDON_H +#if @enable_addons@ +#define BLIS_ENABLE_ADDONS +#else +#define BLIS_DISABLE_ADDONS #endif -#endif +// Enabled addons +@addon_list_includes@ +#endif diff --git a/build/bli_config.h.in b/build/bli_config.h.in index fa6bbbe12e..f883c492b0 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -45,18 +45,42 @@ // Enabled kernel sets (kernel_list) @kernel_list_defines@ +// Disabled symbols (symbol_omit_list) +@omit_symbol_list_defines@ + +#define BLIS_VERSION_STRING "@version@" + #if @enable_system@ #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif +#if @enable_tls@ +#define BLIS_ENABLE_TLS +#else +#define BLIS_DISABLE_TLS +#endif + #if @enable_openmp@ #define BLIS_ENABLE_OPENMP +#if @enable_openmp_as_def@ +#define BLIS_ENABLE_OPENMP_AS_DEFAULT +#endif #endif #if @enable_pthreads@ #define BLIS_ENABLE_PTHREADS +#if @enable_pthreads_as_def@ +#define BLIS_ENABLE_PTHREADS_AS_DEFAULT +#endif +#endif + +#if @enable_hpx@ +#define BLIS_ENABLE_HPX +#if @enable_hpx_as_def@ +#define BLIS_ENABLE_HPX_AS_DEFAULT +#endif #endif #if @enable_jrir_slab@ @@ -67,6 +91,10 @@ #define BLIS_ENABLE_JRIR_RR #endif +#if @enable_jrir_tlb@ +#define BLIS_ENABLE_JRIR_TLB +#endif + #if @enable_pba_pools@ #define BLIS_ENABLE_PBA_POOLS #else @@ -85,6 +113,12 @@ #define BLIS_DISABLE_MEM_TRACING #endif +#if @enable_scalapack_compat@ +#define BLIS_ENABLE_SCALAPACK_COMPAT +#else +#define 
BLIS_DISABLE_SCALAPACK_COMPAT +#endif + #if @int_type_size@ == 64 #define BLIS_INT_TYPE_SIZE 64 #elif @int_type_size@ == 32 @@ -121,26 +155,6 @@ #endif #endif -#ifndef BLIS_ENABLE_MIXED_DT -#ifndef BLIS_DISABLE_MIXED_DT -#if @enable_mixed_dt@ -#define BLIS_ENABLE_MIXED_DT -#else -#define BLIS_DISABLE_MIXED_DT -#endif -#endif -#endif - -#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM -#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM -#if @enable_mixed_dt_extra_mem@ -#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM -#else -#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM -#endif -#endif -#endif - #if @enable_sup_handling@ #define BLIS_ENABLE_SUP_HANDLING #else diff --git a/build/blis.h b/build/blis.h new file mode 100644 index 0000000000..999edb6a5e --- /dev/null +++ b/build/blis.h @@ -0,0 +1 @@ +#include diff --git a/build/bump-version.sh b/build/bump-version.sh index 65e1a2988f..b72a09a40e 100755 --- a/build/bump-version.sh +++ b/build/bump-version.sh @@ -98,10 +98,10 @@ main() # The name of the CHANGELOG file. changelog_file='CHANGELOG' - # The name of the default version file. - version_file_def='version' + # The name and location of the default version file. + version_file_def='build/version' - # The name of the specified version file. + # The name and location of the specified version file. version_file='' # Strings used during version query. diff --git a/build/cblas.h b/build/cblas.h new file mode 100644 index 0000000000..f9ab368727 --- /dev/null +++ b/build/cblas.h @@ -0,0 +1 @@ +#include diff --git a/build/config.mk.in b/build/config.mk.in index 7533d1acbb..dcc6f1b55d 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -5,6 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2022, Advanced Micro Devices, Inc. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -52,7 +53,9 @@ CONFIG_NAME := @config_name@ # sub-configuration in CONFIG_LIST corresponds to a configuration # sub-directory in the 'config' directory. See the 'config_registry' # file for the full list of registered configurations. -CONFIG_LIST := @config_list@ +CONFIG_LIST := @config_list@ +FULL_CONFIG_LIST := @full_config_list@ +FULL_SUBCONFIG_LIST := @full_subconfig_list@ # This list of kernels needed for the configurations in CONFIG_LIST. # Each item in this list corresponds to a sub-directory in the top-level @@ -61,6 +64,7 @@ CONFIG_LIST := @config_list@ # kernel set X, and configuration W uses kernel set Q, and the CONFIG_LIST # might contained "X Y Z W", then the KERNEL_LIST would contain "X Z Q". KERNEL_LIST := @kernel_list@ +FULL_KERNEL_LIST := @full_kernel_list@ # This list contains some number of "kernel:config" pairs, where "config" # specifies which configuration's compilation flags (CFLAGS) should be @@ -72,6 +76,7 @@ OS_NAME := @os_name@ # Check for whether the operating system is Windows. IS_WIN := @is_win@ +IS_MSVC := @is_msvc@ # The directory path to the top level of the source distribution. When # building in-tree, this path is ".". When building out-of-tree, this path @@ -93,10 +98,18 @@ CC := @CC@ GCC_OT_4_9_0 := @gcc_older_than_4_9_0@ GCC_OT_6_1_0 := @gcc_older_than_6_1_0@ GCC_OT_9_1_0 := @gcc_older_than_9_1_0@ +GCC_OT_10_3_0 := @gcc_older_than_10_3_0@ +CLANG_OT_9_0_0 := @clang_older_than_9_0_0@ +CLANG_OT_12_0_0 := @clang_older_than_12_0_0@ +AOCC_OT_2_0_0 := @aocc_older_than_2_0_0@ +AOCC_OT_3_0_0 := @aocc_older_than_3_0_0@ -# The C++ compiler. NOTE: A C++ is typically not needed. +# The C++ compiler. NOTE: A C++ compiler is typically not needed. CXX := @CXX@ +# The Fortran compiler. NOTE: A Fortran compiler is typically not needed. +FC := @FC@ + # Static library indexer. 
RANLIB := @RANLIB@ @@ -106,21 +119,26 @@ AR := @AR@ # Python Interpreter PYTHON := @PYTHON@ -# Preset (required) CFLAGS and LDFLAGS. These variables capture the value -# of the CFLAGS and LDFLAGS environment variables at configure-time (and/or -# the value of CFLAGS/LDFLAGS if either was specified on the command line). +# Preset (required) CFLAGS, CXXFLAGS, and LDFLAGS. These variables capture the value +# of the CFLAGS, CXXFLAGS, and LDFLAGS environment variables at configure-time (and/or +# the value of CFLAGS/CXXFLAGS/LDFLAGS if any was specified on the command line). # These flags are used in addition to the flags automatically determined # by the build system. CFLAGS_PRESET := @cflags_preset@ +CXXFLAGS_PRESET := @cxxflags_preset@ LDFLAGS_PRESET := @ldflags_preset@ # The level of debugging info to generate. DEBUG_TYPE := @debug_type@ +ENABLE_DEBUG := @enable_debug@ + +# Whether to compile and link the AddressSanitizer library. +MK_ENABLE_ASAN := @enable_asan@ # Whether operating system support was requested via --enable-system. -ENABLE_SYSTEM := @enable_system@ +ENABLE_SYSTEM := @mk_enable_system@ -# The requested threading model. +# The requested threading model(s). THREADING_MODEL := @threading_model@ # Whether the compiler supports "#pragma omp simd" via the -fopenmp-simd option. @@ -168,8 +186,8 @@ ARG_MAX_HACK := @enable_arg_max_hack@ # Whether to build the static and shared libraries. # NOTE: The "MK_" prefix, which helps differentiate these variables from # their corresonding cpp macros that use the BLIS_ prefix. -MK_ENABLE_STATIC := @enable_static@ -MK_ENABLE_SHARED := @enable_shared@ +MK_ENABLE_STATIC := @mk_enable_static@ +MK_ENABLE_SHARED := @mk_enable_shared@ # Whether to use an install_name based on @rpath. MK_ENABLE_RPATH := @enable_rpath@ @@ -179,11 +197,15 @@ MK_ENABLE_RPATH := @enable_rpath@ EXPORT_SHARED := @export_shared@ # Whether to enable either the BLAS or CBLAS compatibility layers. 
-MK_ENABLE_BLAS := @enable_blas@ -MK_ENABLE_CBLAS := @enable_cblas@ +MK_ENABLE_BLAS := @mk_enable_blas@ +MK_ENABLE_CBLAS := @mk_enable_cblas@ # Whether libblis will depend on libmemkind for certain memory allocations. -MK_ENABLE_MEMKIND := @enable_memkind@ +MK_ENABLE_MEMKIND := @mk_enable_memkind@ + +# The names of the addons to include when building BLIS. If empty, no addons +# will be included. +ADDON_LIST := @addon_list@ # The name of a sandbox defining an alternative gemm implementation. If empty, # no sandbox will be used and the conventional gemm implementation will remain @@ -194,5 +216,8 @@ SANDBOX := @sandbox@ # variable is set to the empty value. LIBPTHREAD := @libpthread@ +# Whether we should use AMD-customized versions of certain framework files. +ENABLE_AMD_FRAME_TWEAKS := @enable_amd_frame_tweaks@ + # end of ifndef CONFIG_MK_INCLUDED conditional block endif diff --git a/build/detect/config/config_detect.c b/build/detect/config/config_detect.c index 5e29defe15..5f1ea0f420 100644 --- a/build/detect/config/config_detect.c +++ b/build/detect/config/config_detect.c @@ -69,8 +69,8 @@ int main( int argc, char** argv ) { - arch_t id = bli_cpuid_query_id(); - char* s = bli_arch_string( id ); + arch_t id = bli_cpuid_query_id(); + const char* s = bli_arch_string( id ); printf( "%s\n", s ); diff --git a/build/detect/riscv/bli_riscv_cpuid.h b/build/detect/riscv/bli_riscv_cpuid.h new file mode 100644 index 0000000000..4f0c25a339 --- /dev/null +++ b/build/detect/riscv/bli_riscv_cpuid.h @@ -0,0 +1,67 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +/* RISC-V autodetection code which works with native or cross-compilers. + Compile with $CC -E and ignore all output lines starting with #. On RISC-V + it may return rv32i (base 32-bit integer RISC-V), rv32iv (rv32i plus vector + extensions), rv64i (base 64-bit integer RISC-V), or rv64iv (rv64i plus + vector extensions). On 128-bit integer RISC-V, it falls back to generic + for now. For toolchains which do not yet support RISC-V feature-detection + macros, it will fall back on generic, so the BLIS configure script may need + the RISC-V configuration to be explicitly specified. 
*/ + +// false if !defined(__riscv) || !defined(__riscv_xlen) +#if __riscv && __riscv_xlen == 64 + +#if __riscv_vector // false if !defined(__riscv_vector) +rv64iv +#else +rv64i +#endif + +// false if !defined(__riscv) || !defined(__riscv_xlen) || __riscv_e32 != 0 +#elif __riscv && __riscv_xlen == 32 && !__riscv_e32 + +#if __riscv_vector // false if !defined(__riscv_vector) +rv32iv +#else +rv32i +#endif + +#else + +generic // fall back on BLIS runtime CPUID autodetection algorithm + +#endif diff --git a/build/detect/riscv/bli_riscv_detect_abi.h b/build/detect/riscv/bli_riscv_detect_abi.h new file mode 100644 index 0000000000..a5a3739268 --- /dev/null +++ b/build/detect/riscv/bli_riscv_detect_abi.h @@ -0,0 +1,63 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +/* Construct a RISC-V ABI string based on available features. */ + +#if __riscv + +#define CAT2(a,b) a##b +#define CAT(a,b) CAT2(a,b) + +#if __riscv_xlen == 32 +#define RISCV_INT_ABI ilp32 +#else +#define RISCV_INT_ABI lp64 +#endif + +#if __riscv_abi_rve +CAT(RISCV_INT_ABI, e) +#elif __riscv_float_abi_soft +RISCV_INT_ABI +#elif __riscv_float_abi_single +CAT(RISCV_INT_ABI, f) +#elif __riscv_float_abi_double +CAT(RISCV_INT_ABI, d) +#elif __riscv_float_abi_quad +CAT(RISCV_INT_ABI, q) +#else +#error "Unknown RISC-V ABI" +#endif + +#endif /* __riscv */ diff --git a/build/detect/riscv/bli_riscv_detect_arch.h b/build/detect/riscv/bli_riscv_detect_arch.h new file mode 100644 index 0000000000..55542f5086 --- /dev/null +++ b/build/detect/riscv/bli_riscv_detect_arch.h @@ -0,0 +1,225 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +/* Construct a RISC-V architecture string based on available features. 
*/ + +#if __riscv + +#if __riscv_arch_test + +#if __riscv_i +#define RISCV_I i +#else +#define RISCV_I +#endif + +#if __riscv_e +#define RISCV_E e +#else +#define RISCV_E +#endif + +#if __riscv_m +#define RISCV_M m +#else +#define RISCV_M +#endif + +#if __riscv_a +#define RISCV_A a +#else +#define RISCV_A +#endif + +#if __riscv_f +#define RISCV_F f +#else +#define RISCV_F +#endif + +#if __riscv_d +#define RISCV_D d +#else +#define RISCV_D +#endif + +#if __riscv_flen >= 128 +#define RISCV_Q q +#else +#define RISCV_Q +#endif + +#if __riscv_c +#define RISCV_C c +#else +#define RISCV_C +#endif + +#if __riscv_p +#define RISCV_P p +#else +#define RISCV_P +#endif + +/* FORCE_RISCV_VECTOR is a Clang workaround */ +#if __riscv_v || FORCE_RISCV_VECTOR +#define RISCV_V v +#else +#define RISCV_V +#endif + +/* No test currently for Zicsr, which was removed from the base ISA, + but F implies Zicsr */ +#if __riscv_f +#define RISCV_ZICSR _zicsr +#else +#define RISCV_ZICSR +#endif + +/* No test currently for Zifencei, which was removed from the base ISA */ +#define RISCV_ZIFENCEI + +#if __riscv_zba +#define RISCV_ZBA _zba +#else +#define RISCV_ZBA +#endif + +#if __riscv_zbb +#define RISCV_ZBB _zbb +#else +#define RISCV_ZBB +#endif + +#if __riscv_zbc +#define RISCV_ZBC _zbc +#else +#define RISCV_ZBC +#endif + +#if __riscv_zbs +#define RISCV_ZBS _zbs +#else +#define RISCV_ZBS +#endif + +#if __riscv_zfh +#define RISCV_ZFH _zfh +#else +#define RISCV_ZFH +#endif + +#else /* __riscv_arch_test */ + +/* We assume I and E are exclusive when __riscv_arch_test isn't defined */ +#if __riscv_32e +#define RISCV_I +#define RISCV_E e +#else +#define RISCV_I i +#define RISCV_E +#endif + +#if __riscv_mul +#define RISCV_M m +#else +#define RISCV_M +#endif + +#if __riscv_atomic +#define RISCV_A a +#else +#define RISCV_A +#endif + +#if __riscv_flen >= 32 +#define RISCV_F f +#else +#define RISCV_F +#endif + +#if __riscv_flen >= 64 +#define RISCV_D d +#else +#define RISCV_D +#endif + +#if __riscv_flen >= 
128 +#define RISCV_Q q +#else +#define RISCV_Q +#endif + +#if __riscv_compressed +#define RISCV_C c +#else +#define RISCV_C +#endif + +#define RISCV_P + +/* FORCE_RISCV_VECTOR is a Clang workaround */ +#if __riscv_vector || FORCE_RISCV_VECTOR +#define RISCV_V v +#else +#define RISCV_V +#endif + +/* No test currently for Zicsr, which was removed from the base ISA, but + F implies Zicsr */ +#if __riscv_flen >= 32 +#define RISCV_ZICSR _zicsr +#else +#define RISCV_ZICSR +#endif + +#define RISCV_ZIFENCEI +#define RISCV_ZBA +#define RISCV_ZBB +#define RISCV_ZBC +#define RISCV_ZBS +#define RISCV_ZFH + +#endif /* __riscv_arch_test */ + +#define CAT2(a,b) a##b +#define CAT(a,b) CAT2(a,b) + +CAT(rv, CAT(__riscv_xlen, CAT(RISCV_I, CAT(RISCV_E, CAT(RISCV_M, CAT(RISCV_A, +CAT(RISCV_F, CAT(RISCV_D, CAT(RISCV_Q, CAT(RISCV_C, CAT(RISCV_P, CAT(RISCV_V, +CAT(RISCV_ZICSR, CAT(RISCV_ZIFENCEI, CAT(RISCV_ZBA, CAT(RISCV_ZBB, +CAT(RISCV_ZBC, CAT(RISCV_ZBS, RISCV_ZFH)))))))))))))))))) + +#endif /* __riscv */ diff --git a/build/do-release.sh b/build/do-release.sh new file mode 100755 index 0000000000..97c906695f --- /dev/null +++ b/build/do-release.sh @@ -0,0 +1,240 @@ +#!/bin/sh +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# +# bump-version.sh +# +# Field G. Van Zee +# + + +print_usage() +{ + echo <<- EOF + + $script_name + + Field G. Van Zee + + Performs a series of actions needed when creating a new release + candidate branch for BLIS: + 1. Overwrite the version file with the version string passed + into this script (). + 2. Commit the updated version file. + 3. Update the CHANGELOG file. + 4. Commit the updated CHANGELOG file. + 5. Create a new branch (named 'r') which refers to + the commit created in (4). + 6. Tag the commit created in (4) with a tage ''. + + Usage: + ${script_name} [options] new_vers" + + Arguments: + + new_vers The new version string. + + Options: + + -b bare update + Update the version and CHANGELOG files but do not create + a release branch or tag. + -d dry-run + Go through all the motions, but don't actually make any + changes to files or perform any git commits. 
Note that + this will result in the commits for (2) and (4) above + being equal to the initial commit in the script output. + -f VERSFILE version file name + Update VERSFILE with new version string instead of default + 'version' file. + EOF + + exit 1 +} + + +main() +{ + # -- BEGIN GLOBAL VARIABLE DECLARATIONS -- + + # The name of the script, stripped of any preceeding path. + script_name=${0##*/} + + # The name of the config.mk file. + configmk_file='config.mk' + + # The name of the CHANGELOG file. + changelog_file='CHANGELOG' + + # The name and location of the default version file. + version_file_def='build/version' + + # The name and location of the specified version file. + version_file='' + + # Strings used during version query. + git_commit_str='' + new_version_str='' + new_rc_str='' + + # Master branch name. + master_br=master + + # The script name to use instead of the $0 when outputting messages. + output_name='' + + # The git directory. + gitdir='.git' + + # Whether we are performing a dry run or not. + dry_run_flag="" + + # Whether we are doing a bare update or not. + bare_flag="" + + # -- END GLOBAL VARIABLE DECLARATIONS -- + + + # Process our command line options. + while getopts ":dhbf:" opt; do + case $opt in + d ) dry_run_flag="1" ;; + b ) bare_flag="1" ;; + f ) version_file=$OPTARG ;; + h ) print_usage ;; + \? ) print_usage + esac + done + shift $(($OPTIND - 1)) + + + # If a version file name was not given, set version_file to the default + # value. + if [ -n "${version_file}" ]; then + + echo "${script_name}: version file specified: '${version_file}'." + else + + echo "${script_name}: no version file specified; defaulting to '${version_file_def}'." + version_file="${version_file_def}" + fi + + + # Check the number of arguments after command line option processing. + if [ $# = "1" ]; then + + new_version_str=$1 + new_rc_str="r${new_version_str}" + + echo "${script_name}: new version string: '${new_version_str}'." 
+ if [ -z "${bare_flag}" ]; then + echo "${script_name}: preparing to create release (candidate) branch '${new_rc_str}'." + fi + + else + print_usage + fi + + + # Check if the .git dir exists; if it does not, we do nothing. + if [ -d "${gitdir}" ]; then + + echo "${script_name}: found '${gitdir}' directory; assuming git clone." + + git_commit_str=$(git describe --always) + echo "${script_name}: initial commit: ${git_commit_str}." + + echo "${script_name}: updating version file '${version_file}'." + if [ -z "$dry_run_flag" ]; then + echo "${new_version_str}" > ${version_file} + fi + + echo "${script_name}: executing: git commit -m \"Version file update (${new_version_str})\" ${version_file}." + if [ -z "$dry_run_flag" ]; then + git commit -m "Version file update (${new_version_str})" ${version_file} + fi + + git_commit_str=$(git describe --always) + echo "${script_name}: new commit containing version file update: ${git_commit_str}." + + echo "${script_name}: updating '${changelog_file}'." + if [ -z "$dry_run_flag" ]; then + git log --no-decorate > ${changelog_file} + fi + + echo "${script_name}: executing: git commit -m \"CHANGELOG update (${new_version_str})\" ${changelog_file}." + if [ -z "$dry_run_flag" ]; then + git commit -m "CHANGELOG update (${new_version_str})" ${changelog_file} + fi + + git_commit_str=$(git describe --always) + echo "${script_name}: new commit containing CHANGELOG update: ${git_commit_str}." + + if [ -z "${bare_flag}" ]; then + + echo "${script_name}: Creating branch ${new_rc_str}." + if [ -z "$dry_run_flag" ]; then + git branch "${new_rc_str}" + fi + + echo "${script_name}: Tagging branch ${new_rc_str} with tag ${new_version_str}." + if [ -z "$dry_run_flag" ]; then + git tag "${new_version_str}" "${new_rc_str}" + fi + + echo "${script_name}: " + echo "${script_name}: FINAL STEPS: Check the output of 'git log'. 
If everything" + echo "${script_name}: looks okay, push the new branch manually:" + echo "${script_name}: " + echo "${script_name}: git push" + echo "${script_name}: git push --tags" + echo "${script_name}: git push -u origin ${new_rc_str}" + echo "${script_name}: " + + fi + + else + + echo "${script_name}: could not find '${gitdir}' directory; bailing out." + + fi + + + # Exit peacefully. + return 0 +} + + +# The script's main entry point, passing all parameters given. +main "$@" diff --git a/build/flatten-headers.py b/build/flatten-headers.py index 563725a7e9..2ce5b6c0c8 100755 --- a/build/flatten-headers.py +++ b/build/flatten-headers.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # @@ -110,7 +110,7 @@ def print_usage(): def canonicalize_ws( s ): - return re.sub( '\s+', ' ', s ).strip() + return re.sub( r'\s+', ' ', s ).strip() # --- @@ -166,7 +166,7 @@ def list_contains_header( items ): rval = False for item in items: - is_h = re.search( "\.h", item ) + is_h = re.search( r"\.h", item ) if is_h: rval = True @@ -198,7 +198,7 @@ def get_header_path( filename, header_dirpaths ): def strip_cstyle_comments( string ): - return re.sub( "/\*.*?\*/", "", string, flags=re.S ) + return re.sub( r"/\*.*?\*/", "", string, flags=re.S ) # ------------------------------------------------------------------------------ @@ -215,9 +215,19 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # Open the input file to process. ifile = open( inputfile, "r" ) + # A counter to track the line number being parsed within the current file. + # This counter, when selectively encoded into the flattened header via #line + # directives, facilitates easier debugging. (When the compiler finds an + # issue, it will be able to refer to the line number within the constituent + # header file rather than the flattened one.) + lineno = 0 + # Iterate over the lines in the file. 
while True: + # Increment the line number. + lineno += 1 + # Read a line in the file. line = ifile.readline() @@ -268,12 +278,16 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # Mark the beginning of the header being inserted. ostring += "%s%s%c" % ( beginstr, header, '\n' ) + if line_numbers: + ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' ) # Recurse on the header, accumulating the string. ostring += flatten_header( header_path, header_dirpaths, cursp + " " ) # Mark the end of the header being inserted. ostring += "%s%s%c" % ( endstr, header, '\n' ) + if line_numbers: + ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' ) echov2( "%sheader file '%s' fully processed." \ % ( cursp, header_path ) ) @@ -300,7 +314,7 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # endif # endwhile - + # Close the input file. ifile.close() @@ -330,7 +344,6 @@ def find_header_dirs( dirpath ): #endfor return header_dirpaths - # ------------------------------------------------------------------------------ @@ -339,6 +352,7 @@ def find_header_dirs( dirpath ): output_name = None strip_comments = None recursive_flag = None +line_numbers = None verbose_flag = None regex = None root_inputfile = None @@ -349,6 +363,7 @@ def main(): global output_name global strip_comments global recursive_flag + global line_numbers global verbose_flag global regex global root_inputfile @@ -360,13 +375,14 @@ def main(): strip_comments = False recursive_flag = False + line_numbers = False verbose_flag = "1" nestsp = " " # Process our command line options. 
try: - opts, args = getopt.getopt( sys.argv[1:], "o:rchv:" ) + opts, args = getopt.getopt( sys.argv[1:], "o:rclhv:" ) except getopt.GetoptError as err: # print help information and exit: @@ -379,6 +395,8 @@ def main(): output_name = optarg elif opt == "-r": recursive_flag = True + elif opt == "-l": + line_numbers = True elif opt == "-c": strip_comments = True elif opt == "-v": @@ -390,6 +408,9 @@ def main(): print_usage() sys.exit() + if line_numbers and strip_comments: + my_print( "WARNING: stripping comments will result in inaccurate line numbers" ) + # Make sure that the verboseness level is valid. if ( verbose_flag != "0" and verbose_flag != "1" and @@ -506,7 +527,7 @@ def main(): # Precompile the main regular expression used to isolate #include # directives and the headers they reference. This regex object will # get reused over and over again in flatten_header(). - regex = re.compile( '^[\s]*#include (["<])([\w\.\-/]*)([">])' ) + regex = re.compile( r'^[\s]*#include (["<])([\w\.\-/]*)([">])' ) # Recursively substitute headers for occurrences of #include directives. final_string = flatten_header( inputfile, header_dirpaths, nestsp ) diff --git a/build/gen-libblis-symbols.sh b/build/gen-libblis-symbols.sh new file mode 100755 index 0000000000..0ffa3458fc --- /dev/null +++ b/build/gen-libblis-symbols.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +get_config_var() +{ + # Parse the compiler assigned to the CC variable within the config.mk file. + echo "$(grep "^ *$1 *:=" config.mk | sed 's/'$1' *:= *//')" +} + +main() +{ + if [ ! -e config.mk ]; then + echo "No config.mk file detected; have you configured BLIS?" + exit 1 + fi + + CC=$(get_config_var CC) + CONFIG_NAME=$(get_config_var CONFIG_NAME) + BLIS_H_FLAT="include/${CONFIG_NAME}/blis.h" + + if [ ! -e ${BLIS_H_FLAT} ]; then + echo "No monolithic blis.h file detected at ${BLIS_H_FLAT}; have you run 'make'?" + exit 1 + fi + + # + # Header line + # + echo "EXPORTS" + + # + # Breakdown of commands: + # $(CC) ... 
# Pre-process blis.h, making sure to include all BLAS and CBLAS symbols + # | tr ... # Make sure to split lines at ';' so that each declaration is on its own line + # | grep ... # Find exported symbols + # | sed -E + # -e ... # 1. Remove all __attribute__ clauses + # -e ... # 2. Select only the portion before an opening '(' (if any) + # -e ... # 3. Pull out the last word, which is the function name. + # | grep ... # Remove constants + # | grep ... # Remove blank lines + # | sed ... # Remove trailing spaces + # | sort + # | uniq + # + ${CC} -DBLIS_ENABLE_CBLAS=1 -DBLIS_ENABLE_BLAS=1 -E ${BLIS_H_FLAT} \ + | tr ';' '\n' \ + | grep visibility \ + | sed -E \ + -e 's/__attribute__ *\( *\([^\)]+(\([^\)]+\) *)\) *\)//g' \ + -e 's/(.*) *\(.*/\1/' \ + -e 's/.* ([^ ].*)/\1/' \ + | grep -v BLIS \ + | grep -E '[^ ]' \ + | sed -e 's/[[:space:]]*$//g' \ + | sort \ + | uniq +} + +main "$@" + diff --git a/build/gen-make-frags/gen-make-frag.sh b/build/gen-make-frags/gen-make-frag.sh index e411fa8d95..348b3c68f9 100755 --- a/build/gen-make-frags/gen-make-frag.sh +++ b/build/gen-make-frags/gen-make-frag.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # @@ -42,10 +42,10 @@ print_usage() { #local script_name - + # Get the script name #script_name=${0##*/} - + # Echo usage info echo " " echo " "$script_name @@ -86,6 +86,9 @@ print_usage() echo " root_dir." echo " -h hide" echo " Hide the makefile fragments by prepending filenames with '.'." + echo " -i LIST ignore" + echo " Augment the list of directory names contained in ign_list" + echo " with the directory names in LIST." echo " -p PREFIX prefix name" echo " Use PREFIX instead of uppercased root_dir in the makefile" echo " variable name. If the root_dir were 'stuff' and -p was not" @@ -100,7 +103,7 @@ print_usage() echo " level 1: default (one line per directory)" echo " level 2: verbose (several lines per directory)." 
echo " " - + # Exit with non-zero exit status exit 1 } @@ -123,24 +126,28 @@ gen_mkfile() local mkfile_frag_var_name local this_dir local this_frag_dir - local mkfile_frag_tmpl_name - local mkfile_name + local mkfile_frag_tmpl_name + local mkfile_name local mkfile_frag_path - local cur_frag_dir + local cur_frag_dir local cur_frag_path local local_src_files local sub_items local item_path local item_suffix local cur_frag_sub_dirs - - + + # Extract our arguments to local variables mkfile_frag_var_name=$1 this_dir=$2 this_frag_dir=$3 - - + + + # Make sure the target directory exists + mkdir -p $this_frag_dir + + # Strip the leading path from the template makefile path to get its # simple filename. Hide the output makefile fragment filename, if # requested. @@ -150,62 +157,62 @@ gen_mkfile() else mkfile_frag_path=$this_frag_dir/$mkfile_frag_tmpl_name fi - - + + # Determine the directory in which the fragment will reside. cur_frag_path=$this_dir cur_frag_dir=${this_dir##*/} - - + + # Initialize the local source list to empty local_src_files="" - + # Get a listing of the items in $this_dir sub_items=$(ls $this_dir) - + # Generate a list of the source files we've chosen for item in $sub_items; do - + # Prepend the directory to the item to get a relative path item_path=$this_dir/$item - + # Acquire the item's suffix, if it has one item_suffix=${item_path##*.} - + # If the suffix matches, then add it to our list if is_in_list $item_suffix "$src_file_suffixes" then local_src_files="$local_src_files $item" fi done - + # Delete the leading " " space character in the local source files list. local_src_files=${local_src_files##" "} - - + + # Initialize the fragment subdirectory list to empty cur_frag_sub_dirs="" - + # Capture the relative path listing of items in $this_dir. 
sub_items=$(ls $this_dir) - + # Determine the fragment's subdirectory names, if any exist for item in $sub_items; do - + # Prepend the directory to the item to get a relative path item_path=$this_dir/$item - + # If item is a directory, and it's not in the ignore list, descend into it. #if [ -d $item_path ] && ! should_ignore $item; then - if [ -d $item_path ] && ! is_in_list $item "$ignore_dirs" ; then + if [ "$recursive_flag" = "1" ] && [ -d $item_path ] && ! is_in_list $item "$ignore_dirs" ; then cur_frag_sub_dirs=$cur_frag_sub_dirs" "$item fi done - + # Delete the leading " " space character in fragment's subdirectory list. cur_frag_sub_dirs=${cur_frag_sub_dirs##" "} - - + + # Be verbose, if level 2 was requested. if [ "$verbose_flag" = "2" ]; then echo "mkf frag tmpl path: $mkfile_frag_tmpl_path" @@ -218,8 +225,8 @@ gen_mkfile() echo "mkf frag var name: $mkfile_frag_var_name" echo "--------------------------------------------------" fi - - + + # Copy the template makefile to the directory given, using the new # makefile name we just created above. if [ -z "$dry_run_flag" ]; then @@ -229,8 +236,8 @@ gen_mkfile() | sed -e s/"$mkfile_fragment_src_var_name_anchor"/"$mkfile_frag_var_name"/g \ > $mkfile_frag_path fi - - + + # Return peacefully. return 0 } @@ -239,59 +246,59 @@ gen_mkfile() # # gen_mkfiles # -# Recursively generates makefile fragments for a directory and all +# Recursively generates makefile fragments for a directory and all # subdirectories. All of the actual work happens in gen_mkfile(). # gen_mkfiles() { # Local variable declarations local item sub_items cur_dir this_frag_dir this_dir - - + + # Extract our argument cur_dir=$1 this_frag_dir=$2 - - + + # Append a relevant suffix to the makefile variable name, if necesary # NOTE: This step is disabled because special directories are presently # ignored when generating makefile variable names. 
#all_add_src_var_name "$cur_dir" - - + + # Be verbose if level 2 was requested if [ "$verbose_flag" = "2" ]; then echo ">>>" $script_name ${src_var_name}_$SRC $cur_dir $this_frag_dir elif [ "$verbose_flag" = "1" ]; then echo "$script_name: creating makefile fragment in $this_frag_dir from $cur_dir" fi - - + + # Call our function to generate a makefile in the directory given. gen_mkfile "${src_var_name}_$SRC" $cur_dir $this_frag_dir - - + + # Get a listing of the directories in $directory sub_items=$(ls $cur_dir) - + # Descend into the contents of root_dir to generate the subdirectories' # makefile fragments. for item in $sub_items; do - + # If item is a directory, and it's not in the ignore list, descend into it. #if [ -d "$cur_dir/$item" ] && ! should_ignore $item; then if [ -d "$cur_dir/$item" ] && ! is_in_list $item "$ignore_dirs" ; then gen_mkfiles $cur_dir/$item $this_frag_dir/$item fi done - - + + # Remove a relevant suffix from the makefile variable name, if necesary # NOTE: This step is disabled because special directories are presently # ignored when generating makefile variable names. #all_del_src_var_name "$cur_dir" - - + + # Return peacefully return 0 } @@ -301,28 +308,28 @@ gen_mkfiles() #update_src_var_name_special() #{ # local dir act i name var_suffix -# +# # # Extract arguments. # act="$1" # dir="$2" -# +# # # Strip / from end of directory path, if there is one, and then strip # # path from directory name. # dir=${dir%/} # dir=${dir##*/} -# +# # # Run through our list. # # NOTE: CURRENTLY, SPECIAL DIRECTORY NAMES ARE IGNORED. In order to # # re-enable them, remove the quotes from "${special_dirs}". # for specdir in "${special_dirs}"; do -# +# # # If the current item matches sdir, then we'll have # # to make a modification of some form. # if [ "$dir" = "$specdir" ]; then -# +# # # Convert the directory name to uppercase. 
# var_suffix=$(echo "$dir" | tr '[:lower:]' '[:upper:]') -# +# # # Either add or remove the suffix, and also update the # # source file suffix variable. # if [ "$act" == "+" ]; then @@ -330,7 +337,7 @@ gen_mkfiles() # else # src_var_name=${src_var_name%_$var_suffix} # fi -# +# # # No need to continue iterating. # break; # fi @@ -340,17 +347,17 @@ gen_mkfiles() #init_src_var_name() #{ # local dir="$1" -# +# # # Strip off the leading / if there is one # dir=${dir%%/} -# -# # Convert the / directory separators into spaces to make a list of +# +# # Convert the / directory separators into spaces to make a list of # # directories. # list=${dir//\// } -# +# # # Inspect each item in $list # for item in $list; do -# +# # # Try to initialize the source variable name # all_add_src_var_name $item # done @@ -359,7 +366,7 @@ gen_mkfiles() #all_add_src_var_name() #{ # local dir="$1" -# +# # update_src_var_name_special "+" "$dir" # #} @@ -367,7 +374,7 @@ gen_mkfiles() #all_del_src_var_name() #{ # local dir="$1" -# +# # update_src_var_name_special "-" "$dir" #} @@ -384,7 +391,7 @@ read_mkfile_config() src_file_suffixes=$(echo ${src_file_suffixes} | sed "s/\n/ /g") ignore_dirs=$(echo ${ignore_dirs} | sed "s/\n/ /g") -} +} main() { @@ -395,26 +402,26 @@ main() mkfile_fragment_sub_dir_names_anchor="_mkfile_fragment_sub_dir_names_" mkfile_fragment_local_src_files_anchor="_mkfile_fragment_local_src_files_" mkfile_fragment_src_var_name_anchor="_mkfile_fragment_src_var_name_" - + # The name of the script, stripped of any preceeding path. script_name=${0##*/} - + # The prefix for all makefile variables. src_var_name_prefix='MK' # The variable that always holds the string that will be passed to # gen_mkfile() as the source variable to insert into the fragment.mk. src_var_name='' - + # The suffix appended to all makefile fragment source variables. SRC='SRC' - + # The list of source file suffixes to add to the makefile variables. src_file_suffixes='' # The lists of directories to ignore. 
ignore_dirs='' - + # The arguments to this function. They'll get assigned meaningful # values after getopts. root_dir="" @@ -422,28 +429,30 @@ main() mkfile_frag_tmpl_path="" suffix_file="" ignore_file="" - + # Flags set by getopts. - dry_run_flag="" + dry_run_flag="" hide_flag="" + ignore_list="" recursive_flag="" output_name="" prefix_flag="" verbose_flag="" - + # -- END GLOBAL VARIABLE DECLARATIONS -- # Local variable declarations. local item sub_items this_dir - - + + # Process our command line options. - while getopts ":dho:p:rv:" opt; do + while getopts ":dhi:o:p:rv:" opt; do case $opt in d ) dry_run_flag="1" ;; h ) hide_flag="1" ;; r ) recursive_flag="1" ;; + i ) ignore_list="${ignore_list} $OPTARG" ;; o ) output_name=$OPTARG ;; p ) prefix_flag=$OPTARG ;; v ) verbose_flag=$OPTARG ;; @@ -451,15 +460,15 @@ main() esac done shift $(($OPTIND - 1)) - - + + # Make sure that verboseness level is valid. - if [ "$verbose_flag" != "0" ] && - [ "$verbose_flag" != "1" ] && + if [ "$verbose_flag" != "0" ] && + [ "$verbose_flag" != "1" ] && [ "$verbose_flag" != "2" ]; then verbose_flag="1" fi - + # Check the number of arguments after command line option processing. if [ $# != "5" ]; then print_usage @@ -469,25 +478,29 @@ main() if [ -n "${output_name}" ]; then script_name="${output_name}" fi - - + + # Extract our arguments. root_dir=$1 frag_dir=$2 mkfile_frag_tmpl_path=$3 suffix_file=$4 ignore_file=$5 - - + + # Read the makefile config files to be used in the makefile fragment # generation. read_mkfile_config - - + + # Append the command line ignore_list to the ignore_dirs variable read from + # ignore_file by read_mkfile_config(). + ignore_dirs="${ignore_dirs} ${ignore_list}" + + # Strip / from end of directory path, if there is one. root_dir=${root_dir%/} frag_dir=${frag_dir%/} - + # Initialize the name of the makefile source variable. 
if [ -n "$prefix_flag" ]; then @@ -509,41 +522,41 @@ main() root_dir_upper=$(echo "$root_dir_upper" | tr '/' '_') src_var_name="${src_var_name_prefix}_${root_dir_upper}" fi - - + + # Be verbose if level 2 was requested. if [ "$verbose_flag" = "2" ]; then echo ">>>" $script_name ${src_var_name}_$SRC $root_dir $frag_dir elif [ "$verbose_flag" = "1" ]; then echo "$script_name: creating makefile fragment in $frag_dir from $root_dir" fi - - + + # Call our function to generate a makefile in the root directory given. gen_mkfile "${src_var_name}_$SRC" $root_dir $frag_dir - - + + # If we were asked to act recursively, then continue processing # root_dir's contents. if [ -n "$recursive_flag" ]; then - + # Get a listing of the directories in $directory. sub_items=$(ls $root_dir) - + # Descend into the contents of root_dir to generate the makefile # fragments. for item in $sub_items; do - + # If item is a directory, and it's not in the ignore list, descend into it. #if [ -d "$root_dir/$item" ] && ! should_ignore $item ; then if [ -d "$root_dir/$item" ] && ! is_in_list $item "$ignore_dirs" ; then - + gen_mkfiles $root_dir/$item $frag_dir/$item fi done fi - - + + # Exit peacefully. return 0 } @@ -551,22 +564,22 @@ main() is_in_list() { local cur_item the_item item_list - + # Extract argument. the_item="$1" item_list="$2" - + # Check each item in the list against the item of interest. for cur_item in ${item_list}; do - + # If the current item in the list matches the one of interest. if [ "${cur_item}" = "${the_item}" ]; then - + # Return success (ie: item was found). return 0 fi done - + # If we made it this far, return failure (ie: item not found). 
return 1 } diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index 97146a7861..ec68592d7a 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -1,122 +1,69 @@ EXPORTS bli_abort bli_absqsc -bli_absqsc_check -bli_absqsc_qfp bli_acquire_mij bli_acquire_mpart bli_acquire_mpart_b2t bli_acquire_mpart_br2tl bli_acquire_mpart_l2r -bli_acquire_mpart_l2r_check bli_acquire_mpart_mdim bli_acquire_mpart_mndim bli_acquire_mpart_ndim bli_acquire_mpart_r2l bli_acquire_mpart_t2b -bli_acquire_mpart_t2b_check bli_acquire_mpart_tl2br -bli_acquire_mpart_tl2br_check bli_acquire_vi bli_acquire_vpart_b2f bli_acquire_vpart_f2b bli_addd -bli_addd_check bli_addd_ex -bli_addd_ex_qfp bli_addm -bli_addm_check bli_addm_ex -bli_addm_ex_qfp bli_addsc -bli_addsc_check -bli_addsc_qfp bli_addv -bli_addv_check bli_addv_ex -bli_addv_ex_qfp -bli_adjust_strides bli_align_dim_to_mult bli_align_dim_to_size bli_align_ptr_to_size bli_amaxv -bli_amaxv_check bli_amaxv_ex -bli_amaxv_ex_qfp -bli_apool_alloc_block -bli_apool_array_elem -bli_apool_checkin_array -bli_apool_checkout_array -bli_apool_finalize -bli_apool_free_block -bli_apool_grow -bli_apool_init bli_arch_query_id -bli_arch_set_id -bli_arch_set_id_once bli_arch_string -bli_array_elem -bli_array_finalize -bli_array_init -bli_array_resize -bli_array_set_elem bli_asumv -bli_asumv_check bli_asumv_ex -bli_asumv_ex_qfp bli_axpbyv -bli_axpbyv_check bli_axpbyv_ex -bli_axpbyv_ex_qfp bli_axpy2v -bli_axpy2v_check bli_axpy2v_ex -bli_axpy2v_ex_qfp bli_axpyd -bli_axpyd_check bli_axpyd_ex -bli_axpyd_ex_qfp bli_axpyf -bli_axpyf_check bli_axpyf_ex -bli_axpyf_ex_qfp bli_axpym -bli_axpym_check bli_axpym_ex -bli_axpym_ex_qfp bli_axpyv -bli_axpyv_check bli_axpyv_ex -bli_axpyv_ex_qfp bli_blksz_create bli_blksz_create_ed bli_blksz_free bli_blksz_init bli_blksz_init_easy bli_blksz_init_ed -bli_blksz_reduce_def_to -bli_blksz_reduce_max_to bli_cabsqsc bli_caddd bli_caddd_ex bli_caddm bli_caddm_ex -bli_caddm_unb_var1 bli_caddsc bli_caddv 
bli_caddv_ex -bli_calloc_intl bli_camaxv bli_camaxv_ex bli_castm -bli_castm_check bli_castnzm -bli_castnzm_check bli_castv -bli_castv_check bli_casumv bli_casumv_ex -bli_casumv_unb_var1 bli_caxpbyv bli_caxpbyv_ex bli_caxpy2v @@ -127,33 +74,24 @@ bli_caxpyf bli_caxpyf_ex bli_caxpym bli_caxpym_ex -bli_caxpym_unb_var1 bli_caxpyv bli_caxpyv_ex bli_cccastm bli_cccastnzm bli_cccastv bli_cccopysc -bli_ccgemm_ker_var2_md bli_ccopyd bli_ccopyd_ex bli_ccopym bli_ccopym_ex -bli_ccopym_unb_var1 bli_ccopyv bli_ccopyv_ex -bli_ccpackm_blk_var1_md -bli_ccpackm_cxk_1e_md -bli_ccpackm_cxk_1r_md -bli_ccpackm_struc_cxk_md bli_ccxpbym_md bli_ccxpbym_md_ex -bli_ccxpbym_md_unb_var1 bli_cdcastm bli_cdcastnzm bli_cdcastv bli_cdcopysc -bli_cdgemm_ker_var2_md bli_cdivsc bli_cdotaxpyv bli_cdotaxpyv_ex @@ -165,288 +103,115 @@ bli_cdotxf bli_cdotxf_ex bli_cdotxv bli_cdotxv_ex -bli_cdpackm_blk_var1_md -bli_cdpackm_cxk_1e_md -bli_cdpackm_cxk_1r_md -bli_cdpackm_struc_cxk_md bli_cdxpbym_md bli_cdxpbym_md_ex -bli_cdxpbym_md_unb_var1 +bli_ceqm +bli_ceqsc +bli_ceqv bli_cfprintm bli_cfprintv bli_cgemm -bli_cgemm1m -bli_cgemm3m1 -bli_cgemm3mh -bli_cgemm4m1 -bli_cgemm4mb -bli_cgemm4mb_ker_var2 -bli_cgemm4mh bli_cgemm_ex -bli_cgemm_ker_var2 -bli_cgemm_md_c2r_ref -bli_cgemmtrsm_l_ukernel -bli_cgemmtrsm_u_ukernel -bli_cgemm_ukernel +bli_cgemmt +bli_cgemmt_ex bli_cgemv bli_cgemv_ex -bli_cgemv_unb_var1 -bli_cgemv_unb_var2 -bli_cgemv_unf_var1 -bli_cgemv_unf_var2 bli_cger bli_cger_ex -bli_cger_unb_var1 -bli_cger_unb_var2 bli_cgetijm +bli_cgetijv bli_cgetsc -bli_check_alignment_is_mult_of_ptr_size -bli_check_alignment_is_power_of_two -bli_check_conformal_dims -bli_check_consistent_datatypes -bli_check_consistent_object_datatypes -bli_check_consistent_object_precisions -bli_check_consistent_precisions -bli_check_datatype_real_proj_of -bli_check_equal_vector_lengths +bli_cgesc +bli_cgtsc bli_check_error_code_helper -bli_check_floating_datatype -bli_check_floating_object -bli_check_general_object 
-bli_check_hermitian_object -bli_check_if_exhausted_pool -bli_check_integer_datatype -bli_check_integer_object -bli_check_level3_dims -bli_check_matrix_object -bli_check_matrix_strides -bli_check_nonconstant_datatype -bli_check_nonconstant_object -bli_check_noninteger_datatype -bli_check_noninteger_object -bli_check_nonunit_diag -bli_check_null_pointer -bli_check_object_alias_of -bli_check_object_buffer -bli_check_object_diag_offset_equals -bli_check_object_length_equals -bli_check_object_real_proj_of -bli_check_object_struc -bli_check_object_valid_datatype -bli_check_object_width_equals -bli_check_packm_schema_on_unpack -bli_check_packv_schema_on_unpack -bli_check_real_datatype -bli_check_real_object -bli_check_real_valued_object -bli_check_scalar_object -bli_check_square_object -bli_check_sufficient_stack_buf_size -bli_check_symmetric_object -bli_check_triangular_object -bli_check_upper_or_lower_object -bli_check_valid_1x3_subpart -bli_check_valid_3x1_subpart -bli_check_valid_3x3_subpart -bli_check_valid_arch_id -bli_check_valid_cntl -bli_check_valid_datatype -bli_check_valid_diag -bli_check_valid_error_level -bli_check_valid_kc_mod_mult -bli_check_valid_malloc_buf -bli_check_valid_mc_mod_mult -bli_check_valid_nc_mod_mult -bli_check_valid_packbuf -bli_check_valid_side -bli_check_valid_trans -bli_check_valid_uplo -bli_check_vector_dim_equals -bli_check_vector_object bli_chemm -bli_chemm1m -bli_chemm3m1 -bli_chemm3mh -bli_chemm4m1 -bli_chemm4mh bli_chemm_ex bli_chemv bli_chemv_ex -bli_chemv_unb_var1 -bli_chemv_unb_var2 -bli_chemv_unb_var3 -bli_chemv_unb_var4 -bli_chemv_unf_var1 -bli_chemv_unf_var1a -bli_chemv_unf_var3 -bli_chemv_unf_var3a bli_cher bli_cher2 bli_cher2_ex bli_cher2k -bli_cher2k1m -bli_cher2k3m1 -bli_cher2k3mh -bli_cher2k4m1 -bli_cher2k4mh bli_cher2k_ex -bli_cher2_unb_var1 -bli_cher2_unb_var2 -bli_cher2_unb_var3 -bli_cher2_unb_var4 -bli_cher2_unf_var1 -bli_cher2_unf_var4 bli_cher_ex bli_cherk -bli_cherk1m -bli_cherk3m1 -bli_cherk3mh -bli_cherk4m1 
-bli_cherk4mh bli_cherk_ex -bli_cherk_l_ker_var2 -bli_cherk_u_ker_var2 -bli_cher_unb_var1 -bli_cher_unb_var2 bli_cinvertd bli_cinvertd_ex bli_cinvertsc bli_cinvertv bli_cinvertv_ex +bli_cinvscald +bli_cinvscald_ex +bli_cinvscalm +bli_cinvscalm_ex +bli_cinvscalv +bli_cinvscalv_ex bli_clock -bli_clock_helper bli_clock_min_diff +bli_clesc +bli_cltsc bli_cmachval bli_cmkherm bli_cmkherm_ex -bli_cmkherm_unb_var1 bli_cmksymm bli_cmksymm_ex -bli_cmksymm_unb_var1 bli_cmktrim bli_cmktrim_ex -bli_cmktrim_unb_var1 bli_cmulsc bli_cnorm1m bli_cnorm1m_ex -bli_cnorm1m_unb_var1 bli_cnorm1v bli_cnorm1v_ex -bli_cnorm1v_unb_var1 bli_cnormfm bli_cnormfm_ex -bli_cnormfm_unb_var1 bli_cnormfsc bli_cnormfv bli_cnormfv_ex -bli_cnormfv_unb_var1 bli_cnormim bli_cnormim_ex -bli_cnormim_unb_var1 bli_cnormiv bli_cnormiv_ex -bli_cnormiv_unb_var1 -bli_cntl_calc_num_threads_in bli_cntl_clear_node bli_cntl_copy bli_cntl_create_node bli_cntl_free bli_cntl_free_node -bli_cntl_free_wo_thrinfo -bli_cntl_free_w_thrinfo bli_cntl_mark_family -bli_cntx_1m_stage -bli_cntx_3m1_stage -bli_cntx_3mh_stage -bli_cntx_4m1_stage -bli_cntx_4mb_stage -bli_cntx_4mh_stage bli_cntx_clear -bli_cntx_ind_stage -bli_cntx_nat_stage bli_cntx_print bli_cntx_set_blkszs bli_cntx_set_ind_blkszs -bli_cntx_set_l1f_kers -bli_cntx_set_l1v_kers -bli_cntx_set_l3_nat_ukrs -bli_cntx_set_packm_kers +bli_cntx_set_l3_sup_handlers +bli_cntx_set_ukr_prefs +bli_cntx_set_ukrs bli_copyd -bli_copyd_check bli_copyd_ex -bli_copyd_ex_qfp bli_copym -bli_copym_check bli_copym_ex -bli_copym_ex_qfp bli_copysc -bli_copysc_check bli_copyv -bli_copyv_check bli_copyv_ex -bli_copyv_ex_qfp -bli_cpackm_blk_var1 -bli_cpackm_cxk -bli_cpackm_cxk_1er -bli_cpackm_cxk_3mis -bli_cpackm_cxk_4mi -bli_cpackm_cxk_rih -bli_cpackm_herm_cxk -bli_cpackm_herm_cxk_1er -bli_cpackm_herm_cxk_3mis -bli_cpackm_herm_cxk_4mi -bli_cpackm_herm_cxk_rih -bli_cpackm_struc_cxk -bli_cpackm_struc_cxk_1er -bli_cpackm_struc_cxk_3mis -bli_cpackm_struc_cxk_4mi -bli_cpackm_struc_cxk_rih 
-bli_cpackm_tri_cxk -bli_cpackm_tri_cxk_1er -bli_cpackm_tri_cxk_3mis -bli_cpackm_tri_cxk_4mi -bli_cpackm_tri_cxk_rih -bli_cpackm_unb_var1 bli_cprintm -bli_cprintm_ex bli_cprintv -bli_cprintv_ex -bli_cpuid_is_bulldozer -bli_cpuid_is_excavator -bli_cpuid_is_haswell -bli_cpuid_is_knl -bli_cpuid_is_penryn -bli_cpuid_is_piledriver -bli_cpuid_is_sandybridge -bli_cpuid_is_skx -bli_cpuid_is_steamroller -bli_cpuid_is_zen -bli_cpuid_query -bli_cpuid_query_id bli_crandm bli_crandm_ex -bli_crandm_unb_var1 bli_crandnm bli_crandnm_ex -bli_crandnm_unb_var1 bli_crandnv bli_crandnv_ex -bli_crandnv_unb_var1 bli_crandv bli_crandv_ex -bli_crandv_unb_var1 bli_cscal2d bli_cscal2d_ex bli_cscal2m bli_cscal2m_ex -bli_cscal2m_unb_var1 bli_cscal2v bli_cscal2v_ex bli_cscald bli_cscald_ex bli_cscalm bli_cscalm_ex -bli_cscalm_unb_var1 bli_cscalv bli_cscalv_ex bli_cscastm @@ -458,42 +223,30 @@ bli_csetd_ex bli_csetid bli_csetid_ex bli_csetijm +bli_csetijv bli_csetm bli_csetm_ex -bli_csetm_unb_var1 bli_csetsc bli_csetv bli_csetv_ex -bli_csgemm_ker_var2_md bli_cshiftd bli_cshiftd_ex -bli_cspackm_blk_var1_md -bli_cspackm_cxk_1e_md -bli_cspackm_cxk_1r_md -bli_cspackm_struc_cxk_md +bli_csqrtrsc bli_csqrtsc bli_csubd bli_csubd_ex bli_csubm bli_csubm_ex -bli_csubm_unb_var1 bli_csubsc bli_csubv bli_csubv_ex bli_csumsqv bli_csumsqv_ex -bli_csumsqv_unb_var1 bli_cswapv bli_cswapv_ex bli_csxpbym_md bli_csxpbym_md_ex -bli_csxpbym_md_unb_var1 bli_csymm -bli_csymm1m -bli_csymm3m1 -bli_csymm3mh -bli_csymm4m1 -bli_csymm4mh bli_csymm_ex bli_csymv bli_csymv_ex @@ -501,89 +254,39 @@ bli_csyr bli_csyr2 bli_csyr2_ex bli_csyr2k -bli_csyr2k1m -bli_csyr2k3m1 -bli_csyr2k3mh -bli_csyr2k4m1 -bli_csyr2k4mh bli_csyr2k_ex bli_csyr_ex bli_csyrk -bli_csyrk1m -bli_csyrk3m1 -bli_csyrk3mh -bli_csyrk4m1 -bli_csyrk4mh bli_csyrk_ex bli_ctrmm -bli_ctrmm1m bli_ctrmm3 -bli_ctrmm31m -bli_ctrmm33m1 -bli_ctrmm33mh -bli_ctrmm34m1 -bli_ctrmm34mh bli_ctrmm3_ex -bli_ctrmm3m1 -bli_ctrmm4m1 bli_ctrmm_ex -bli_ctrmm_ll_ker_var2 
-bli_ctrmm_lu_ker_var2 -bli_ctrmm_rl_ker_var2 -bli_ctrmm_ru_ker_var2 bli_ctrmv bli_ctrmv_ex -bli_ctrmv_unb_var1 -bli_ctrmv_unb_var2 -bli_ctrmv_unf_var1 -bli_ctrmv_unf_var2 bli_ctrsm -bli_ctrsm1m -bli_ctrsm3m1 -bli_ctrsm4m1 bli_ctrsm_ex -bli_ctrsm_ll_ker_var2 -bli_ctrsm_l_ukernel -bli_ctrsm_lu_ker_var2 -bli_ctrsm_rl_ker_var2 -bli_ctrsm_ru_ker_var2 -bli_ctrsm_u_ukernel bli_ctrsv bli_ctrsv_ex -bli_ctrsv_unb_var1 -bli_ctrsv_unb_var2 -bli_ctrsv_unf_var1 -bli_ctrsv_unf_var2 -bli_cunpackm_blk_var1 -bli_cunpackm_cxk -bli_cunpackm_unb_var1 bli_cunzipsc bli_cxpbyd bli_cxpbyd_ex bli_cxpbym bli_cxpbym_ex -bli_cxpbym_unb_var1 bli_cxpbyv bli_cxpbyv_ex bli_czcastm bli_czcastnzm bli_czcastv bli_czcopysc -bli_czgemm_ker_var2_md bli_czipsc -bli_czpackm_blk_var1_md -bli_czpackm_cxk_1e_md -bli_czpackm_cxk_1r_md -bli_czpackm_struc_cxk_md bli_czxpbym_md bli_czxpbym_md_ex -bli_czxpbym_md_unb_var1 bli_dabsqsc bli_daddd bli_daddd_ex bli_daddm bli_daddm_ex -bli_daddm_unb_var1 bli_daddsc bli_daddv bli_daddv_ex @@ -591,7 +294,6 @@ bli_damaxv bli_damaxv_ex bli_dasumv bli_dasumv_ex -bli_dasumv_unb_var1 bli_daxpbyv bli_daxpbyv_ex bli_daxpy2v @@ -602,33 +304,24 @@ bli_daxpyf bli_daxpyf_ex bli_daxpym bli_daxpym_ex -bli_daxpym_unb_var1 bli_daxpyv bli_daxpyv_ex bli_dccastm bli_dccastnzm bli_dccastv bli_dccopysc -bli_dcgemm_ker_var2_md bli_dcopyd bli_dcopyd_ex bli_dcopym bli_dcopym_ex -bli_dcopym_unb_var1 bli_dcopyv bli_dcopyv_ex -bli_dcpackm_blk_var1_md -bli_dcpackm_cxk_1e_md -bli_dcpackm_cxk_1r_md -bli_dcpackm_struc_cxk_md bli_dcxpbym_md bli_dcxpbym_md_ex -bli_dcxpbym_md_unb_var1 bli_ddcastm bli_ddcastnzm bli_ddcastv bli_ddcopysc -bli_ddgemm_ker_var2_md bli_ddivsc bli_ddotaxpyv bli_ddotaxpyv_ex @@ -640,183 +333,103 @@ bli_ddotxf bli_ddotxf_ex bli_ddotxv bli_ddotxv_ex -bli_ddpackm_blk_var1_md -bli_ddpackm_cxk_1e_md -bli_ddpackm_cxk_1r_md -bli_ddpackm_struc_cxk_md bli_ddxpbym_md bli_ddxpbym_md_ex -bli_ddxpbym_md_unb_var1 -bli_determine_blocksize -bli_determine_blocksize_b 
-bli_determine_blocksize_b_sub -bli_determine_blocksize_f -bli_determine_blocksize_f_sub +bli_deqm +bli_deqsc +bli_deqv bli_dfprintm bli_dfprintv bli_dgemm -bli_dgemm1m -bli_dgemm3m1 -bli_dgemm3mh -bli_dgemm4m1 -bli_dgemm4mb -bli_dgemm4mb_ker_var2 -bli_dgemm4mh bli_dgemm_ex -bli_dgemm_ker_var2 -bli_dgemmtrsm_l_ukernel -bli_dgemmtrsm_u_ukernel -bli_dgemm_ukernel +bli_dgemmt +bli_dgemmt_ex bli_dgemv bli_dgemv_ex -bli_dgemv_unb_var1 -bli_dgemv_unb_var2 -bli_dgemv_unf_var1 -bli_dgemv_unf_var2 bli_dger bli_dger_ex -bli_dger_unb_var1 -bli_dger_unb_var2 bli_dgetijm +bli_dgetijv bli_dgetsc +bli_dgesc +bli_dgtsc bli_dhemm -bli_dhemm1m -bli_dhemm3m1 -bli_dhemm3mh -bli_dhemm4m1 -bli_dhemm4mh bli_dhemm_ex bli_dhemv bli_dhemv_ex -bli_dhemv_unb_var1 -bli_dhemv_unb_var2 -bli_dhemv_unb_var3 -bli_dhemv_unb_var4 -bli_dhemv_unf_var1 -bli_dhemv_unf_var1a -bli_dhemv_unf_var3 -bli_dhemv_unf_var3a bli_dher bli_dher2 bli_dher2_ex bli_dher2k -bli_dher2k1m -bli_dher2k3m1 -bli_dher2k3mh -bli_dher2k4m1 -bli_dher2k4mh bli_dher2k_ex -bli_dher2_unb_var1 -bli_dher2_unb_var2 -bli_dher2_unb_var3 -bli_dher2_unb_var4 -bli_dher2_unf_var1 -bli_dher2_unf_var4 bli_dher_ex bli_dherk -bli_dherk1m -bli_dherk3m1 -bli_dherk3mh -bli_dherk4m1 -bli_dherk4mh bli_dherk_ex -bli_dherk_l_ker_var2 -bli_dherk_u_ker_var2 -bli_dher_unb_var1 -bli_dher_unb_var2 bli_dinvertd bli_dinvertd_ex bli_dinvertsc bli_dinvertv bli_dinvertv_ex +bli_dinvscald +bli_dinvscald_ex +bli_dinvscalm +bli_dinvscalm_ex +bli_dinvscalv +bli_dinvscalv_ex bli_divsc -bli_divsc_check -bli_divsc_qfp -bli_dlamch +bli_dlesc +bli_dltsc bli_dmachval bli_dmkherm bli_dmkherm_ex -bli_dmkherm_unb_var1 bli_dmksymm bli_dmksymm_ex -bli_dmksymm_unb_var1 bli_dmktrim bli_dmktrim_ex -bli_dmktrim_unb_var1 bli_dmulsc bli_dnorm1m bli_dnorm1m_ex -bli_dnorm1m_unb_var1 bli_dnorm1v bli_dnorm1v_ex -bli_dnorm1v_unb_var1 bli_dnormfm bli_dnormfm_ex -bli_dnormfm_unb_var1 bli_dnormfsc bli_dnormfv bli_dnormfv_ex -bli_dnormfv_unb_var1 bli_dnormim bli_dnormim_ex 
-bli_dnormim_unb_var1 bli_dnormiv bli_dnormiv_ex -bli_dnormiv_unb_var1 bli_dotaxpyv -bli_dotaxpyv_check bli_dotaxpyv_ex -bli_dotaxpyv_ex_qfp bli_dotv -bli_dotv_check bli_dotv_ex -bli_dotv_ex_qfp bli_dotxaxpyf -bli_dotxaxpyf_check bli_dotxaxpyf_ex -bli_dotxaxpyf_ex_qfp bli_dotxf -bli_dotxf_check bli_dotxf_ex -bli_dotxf_ex_qfp bli_dotxv -bli_dotxv_check bli_dotxv_ex -bli_dotxv_ex_qfp -bli_dpackm_blk_var1 -bli_dpackm_cxk -bli_dpackm_herm_cxk -bli_dpackm_struc_cxk -bli_dpackm_tri_cxk -bli_dpackm_unb_var1 bli_dprintm -bli_dprintm_ex bli_dprintv -bli_dprintv_ex bli_drandm bli_drandm_ex -bli_drandm_unb_var1 bli_drandnm bli_drandnm_ex -bli_drandnm_unb_var1 bli_drandnv bli_drandnv_ex -bli_drandnv_unb_var1 bli_drandv bli_drandv_ex -bli_drandv_unb_var1 bli_dscal2d bli_dscal2d_ex bli_dscal2m bli_dscal2m_ex -bli_dscal2m_unb_var1 bli_dscal2v bli_dscal2v_ex bli_dscald bli_dscald_ex bli_dscalm bli_dscalm_ex -bli_dscalm_unb_var1 bli_dscalv bli_dscalv_ex bli_dscastm @@ -828,42 +441,30 @@ bli_dsetd_ex bli_dsetid bli_dsetid_ex bli_dsetijm +bli_dsetijv bli_dsetm bli_dsetm_ex -bli_dsetm_unb_var1 bli_dsetsc bli_dsetv bli_dsetv_ex -bli_dsgemm_ker_var2_md bli_dshiftd bli_dshiftd_ex -bli_dspackm_blk_var1_md -bli_dspackm_cxk_1e_md -bli_dspackm_cxk_1r_md -bli_dspackm_struc_cxk_md +bli_dsqrtrsc bli_dsqrtsc bli_dsubd bli_dsubd_ex bli_dsubm bli_dsubm_ex -bli_dsubm_unb_var1 bli_dsubsc bli_dsubv bli_dsubv_ex bli_dsumsqv bli_dsumsqv_ex -bli_dsumsqv_unb_var1 bli_dswapv bli_dswapv_ex bli_dsxpbym_md bli_dsxpbym_md_ex -bli_dsxpbym_md_unb_var1 bli_dsymm -bli_dsymm1m -bli_dsymm3m1 -bli_dsymm3mh -bli_dsymm4m1 -bli_dsymm4mh bli_dsymm_ex bli_dsymv bli_dsymv_ex @@ -871,301 +472,80 @@ bli_dsyr bli_dsyr2 bli_dsyr2_ex bli_dsyr2k -bli_dsyr2k1m -bli_dsyr2k3m1 -bli_dsyr2k3mh -bli_dsyr2k4m1 -bli_dsyr2k4mh bli_dsyr2k_ex bli_dsyr_ex bli_dsyrk -bli_dsyrk1m -bli_dsyrk3m1 -bli_dsyrk3mh -bli_dsyrk4m1 -bli_dsyrk4mh bli_dsyrk_ex +bli_dt_size +bli_dt_string bli_dtrmm -bli_dtrmm1m bli_dtrmm3 -bli_dtrmm31m -bli_dtrmm33m1 
-bli_dtrmm33mh -bli_dtrmm34m1 -bli_dtrmm34mh bli_dtrmm3_ex -bli_dtrmm3m1 -bli_dtrmm4m1 bli_dtrmm_ex -bli_dtrmm_ll_ker_var2 -bli_dtrmm_lu_ker_var2 -bli_dtrmm_rl_ker_var2 -bli_dtrmm_ru_ker_var2 bli_dtrmv bli_dtrmv_ex -bli_dtrmv_unb_var1 -bli_dtrmv_unb_var2 -bli_dtrmv_unf_var1 -bli_dtrmv_unf_var2 bli_dtrsm -bli_dtrsm1m -bli_dtrsm3m1 -bli_dtrsm4m1 bli_dtrsm_ex -bli_dtrsm_ll_ker_var2 -bli_dtrsm_l_ukernel -bli_dtrsm_lu_ker_var2 -bli_dtrsm_rl_ker_var2 -bli_dtrsm_ru_ker_var2 -bli_dtrsm_u_ukernel bli_dtrsv bli_dtrsv_ex -bli_dtrsv_unb_var1 -bli_dtrsv_unb_var2 -bli_dtrsv_unf_var1 -bli_dtrsv_unf_var2 -bli_dt_size -bli_dt_size_check -bli_dt_string -bli_dt_string_check -bli_dt_union_check -bli_dunpackm_blk_var1 -bli_dunpackm_cxk -bli_dunpackm_unb_var1 bli_dunzipsc bli_dxpbyd bli_dxpbyd_ex bli_dxpbym bli_dxpbym_ex -bli_dxpbym_unb_var1 bli_dxpbyv bli_dxpbyv_ex bli_dzcastm bli_dzcastnzm bli_dzcastv bli_dzcopysc -bli_dzgemm_ker_var2_md bli_dzipsc -bli_dzpackm_blk_var1_md -bli_dzpackm_cxk_1e_md -bli_dzpackm_cxk_1r_md -bli_dzpackm_struc_cxk_md bli_dzxpbym_md bli_dzxpbym_md_ex -bli_dzxpbym_md_unb_var1 -bli_error_checking_is_enabled +bli_eqm +bli_eqsc +bli_eqv bli_error_checking_level bli_error_checking_level_set -bli_error_string_for_code -bli_ffree_align -bli_ffree_noalign bli_finalize -bli_finalize_apis -bli_finalize_auto -bli_finalize_once -bli_find_area_trap_l -bli_fmalloc_align -bli_fmalloc_align_check -bli_fmalloc_noalign -bli_fmalloc_post_check bli_fprintm -bli_fprintm_check -bli_fprintm_ex -bli_fprintm_qfp bli_fprintv -bli_fprintv_check -bli_fprintv_ex -bli_fprintv_qfp -bli_free_intl bli_free_user -bli_func_create -bli_func_free -bli_func_init -bli_func_init_null -bli_func_is_null -bli_func_is_null_dt -bli_gcd bli_gemm -bli_gemm1m -bli_gemm3m1 -bli_gemm3mh -bli_gemm4m1 -bli_gemm4mb -bli_gemm4mb_ker_var2 -bli_gemm4mh -bli_gemm_basic_check -bli_gemm_blk_var1 -bli_gemm_blk_var2 -bli_gemm_blk_var3 -bli_gemmbp_cntl_create -bli_gemm_check -bli_gemm_cntl_create 
-bli_gemm_cntl_create_node -bli_gemm_cntl_free -bli_gemm_determine_kc -bli_gemm_determine_kc_b -bli_gemm_determine_kc_f -bli_gemm_direct bli_gemm_ex -bli_gemm_front -bli_gemmind -bli_gemmind_get_avail -bli_gemm_int -bli_gemm_ker_var2 -bli_gemm_ker_var2_md -bli_gemm_md -bli_gemm_md_ccc -bli_gemm_md_ccr -bli_gemm_md_crc -bli_gemm_md_crr -bli_gemm_md_rcc -bli_gemm_md_rcr -bli_gemm_md_rrc -bli_gemm_md_rrr -bli_gemmnat -bli_gemm_packa -bli_gemm_packb -bli_gemm_prune_unref_mparts_k -bli_gemm_prune_unref_mparts_m -bli_gemm_prune_unref_mparts_n -bli_gemmtrsm_l_ukernel_qfp -bli_gemmtrsm_ukernel -bli_gemmtrsm_u_ukernel_qfp bli_gemm_ukernel -bli_gemm_ukernel_qfp +bli_gemmt +bli_gemmt_ex +bli_gemmtrsm_ukernel bli_gemv -bli_gemv_check bli_gemv_ex -bli_gemv_ex_qfp -bli_gemv_unb_var1 -bli_gemv_unb_var1_qfp -bli_gemv_unb_var2 -bli_gemv_unb_var2_qfp -bli_gemv_unf_var1 -bli_gemv_unf_var1_qfp -bli_gemv_unf_var2 -bli_gemv_unf_var2_qfp bli_ger -bli_ger_check bli_ger_ex -bli_ger_ex_qfp -bli_ger_unb_var1 -bli_ger_unb_var1_qfp -bli_ger_unb_var2 -bli_ger_unb_var2_qfp bli_getijm +bli_getijv bli_getopt bli_getopt_init_state bli_getsc -bli_getsc_check -bli_getsc_qfp -bli_gks_cntx_l3_nat_ukr_is_ref -bli_gks_finalize -bli_gks_init -bli_gks_init_index bli_gks_init_ref_cntx bli_gks_l3_ukr_impl_string bli_gks_l3_ukr_impl_type -bli_gks_lookup_ind_cntx -bli_gks_lookup_nat_cntx bli_gks_query_cntx -bli_gks_query_cntx_noinit bli_gks_query_ind_cntx bli_gks_query_nat_cntx -bli_gks_register_cntx +bli_gesc +bli_gtsc bli_hemm -bli_hemm1m -bli_hemm3m1 -bli_hemm3mh -bli_hemm4m1 -bli_hemm4mh -bli_hemm_basic_check -bli_hemm_check bli_hemm_ex -bli_hemm_front -bli_hemmind -bli_hemmind_get_avail -bli_hemmnat bli_hemv -bli_hemv_check bli_hemv_ex -bli_hemv_ex_qfp -bli_hemv_unb_var1 -bli_hemv_unb_var1_qfp -bli_hemv_unb_var2 -bli_hemv_unb_var2_qfp -bli_hemv_unb_var3 -bli_hemv_unb_var3_qfp -bli_hemv_unb_var4 -bli_hemv_unb_var4_qfp -bli_hemv_unf_var1 -bli_hemv_unf_var1a -bli_hemv_unf_var1a_qfp -bli_hemv_unf_var1_qfp 
-bli_hemv_unf_var3 -bli_hemv_unf_var3a -bli_hemv_unf_var3a_qfp -bli_hemv_unf_var3_qfp bli_her bli_her2 -bli_her2_check bli_her2_ex -bli_her2_ex_qfp bli_her2k -bli_her2k1m -bli_her2k3m1 -bli_her2k3mh -bli_her2k4m1 -bli_her2k4mh -bli_her2k_basic_check -bli_her2k_check bli_her2k_ex -bli_her2k_front -bli_her2kind -bli_her2kind_get_avail -bli_her2knat -bli_her2_unb_var1 -bli_her2_unb_var1_qfp -bli_her2_unb_var2 -bli_her2_unb_var2_qfp -bli_her2_unb_var3 -bli_her2_unb_var3_qfp -bli_her2_unb_var4 -bli_her2_unb_var4_qfp -bli_her2_unf_var1 -bli_her2_unf_var1_qfp -bli_her2_unf_var4 -bli_her2_unf_var4_qfp -bli_her_check bli_her_ex -bli_her_ex_qfp bli_herk -bli_herk1m -bli_herk3m1 -bli_herk3mh -bli_herk4m1 -bli_herk4mh -bli_herk_basic_check -bli_herk_check -bli_herk_determine_kc -bli_herk_determine_kc_b -bli_herk_determine_kc_f -bli_herk_direct bli_herk_ex -bli_herk_front -bli_herkind -bli_herkind_get_avail -bli_herk_l_ker_var2 -bli_herknat -bli_herk_prune_unref_mparts_k -bli_herk_prune_unref_mparts_m -bli_herk_prune_unref_mparts_n -bli_herk_u_ker_var2 -bli_herk_x_ker_var2 -bli_her_unb_var1 -bli_her_unb_var1_qfp -bli_her_unb_var2 -bli_her_unb_var2_qfp bli_ifprintm bli_ifprintv bli_igetsc @@ -1175,30 +555,30 @@ bli_ind_disable_all_dt bli_ind_disable_dt bli_ind_enable bli_ind_enable_dt -bli_ind_finalize -bli_ind_get_impl_string -bli_ind_init -bli_ind_map_cdt_to_index bli_ind_oper_enable_only bli_ind_oper_find_avail -bli_ind_oper_get_avail bli_ind_oper_get_avail_impl_string -bli_ind_oper_is_impl bli_info_get_blas_int_type_size bli_info_get_enable_blas bli_info_get_enable_cblas +bli_info_get_enable_hpx +bli_info_get_enable_hpx_as_default bli_info_get_enable_memkind bli_info_get_enable_openmp +bli_info_get_enable_openmp_as_default bli_info_get_enable_pba_pools bli_info_get_enable_pthreads +bli_info_get_enable_pthreads_as_default bli_info_get_enable_sandbox bli_info_get_enable_sba_pools bli_info_get_enable_stay_auto_init bli_info_get_enable_threading +bli_info_get_enable_tls 
bli_info_get_gemm_impl_string +bli_info_get_gemm_ukr_impl_string +bli_info_get_gemmt_impl_string bli_info_get_gemmtrsm_l_ukr_impl_string bli_info_get_gemmtrsm_u_ukr_impl_string -bli_info_get_gemm_ukr_impl_string bli_info_get_heap_addr_align_size bli_info_get_heap_stride_align_size bli_info_get_hemm_impl_string @@ -1209,7 +589,14 @@ bli_info_get_int_type_size_str bli_info_get_max_type_size bli_info_get_num_fp_types bli_info_get_page_size -bli_info_get_pool_addr_align_size +bli_info_get_pool_addr_align_size_a +bli_info_get_pool_addr_align_size_b +bli_info_get_pool_addr_align_size_c +bli_info_get_pool_addr_align_size_gen +bli_info_get_pool_addr_offset_size_a +bli_info_get_pool_addr_offset_size_b +bli_info_get_pool_addr_offset_size_c +bli_info_get_pool_addr_offset_size_gen bli_info_get_simd_align_size bli_info_get_simd_num_registers bli_info_get_simd_size @@ -1218,8 +605,9 @@ bli_info_get_stack_buf_max_size bli_info_get_symm_impl_string bli_info_get_syr2k_impl_string bli_info_get_syrk_impl_string -bli_info_get_thread_part_jrir_rr -bli_info_get_thread_part_jrir_slab +bli_info_get_thread_jrir_rr +bli_info_get_thread_jrir_slab +bli_info_get_thread_jrir_tlb bli_info_get_trmm3_impl_string bli_info_get_trmm_impl_string bli_info_get_trsm_impl_string @@ -1227,177 +615,70 @@ bli_info_get_trsm_l_ukr_impl_string bli_info_get_trsm_u_ukr_impl_string bli_info_get_version_str bli_init -bli_init_apis -bli_init_auto -bli_init_once bli_invertd -bli_invertd_check bli_invertd_ex -bli_invertd_ex_qfp bli_invertsc -bli_invertsc_check -bli_invertsc_qfp bli_invertv -bli_invertv_check bli_invertv_ex -bli_invertv_ex_qfp -bli_ipow +bli_invscald +bli_invscald_ex +bli_invscalm +bli_invscalm_ex +bli_invscalv +bli_invscalv_ex bli_iprintm -bli_iprintm_ex bli_iprintv -bli_iprintv_ex bli_isetsc -bli_l0_xsc_check -bli_l0_xx2sc_check -bli_l0_xxsc_check -bli_l1d_ax_check -bli_l1d_axy_check -bli_l1d_x_check -bli_l1d_xy_check -bli_l1m_ax_check -bli_l1m_axy_check -bli_l1m_xy_check -bli_l1v_axby_check 
-bli_l1v_ax_check -bli_l1v_axy_check -bli_l1v_dot_check -bli_l1v_xby_check -bli_l1v_x_check -bli_l1v_xi_check -bli_l1v_xy_check -bli_l3_basic_check -bli_l3_cntl_create_if -bli_l3_cntl_free -bli_l3_determine_kc -bli_l3_direct -bli_l3_ind_oper_enable_only -bli_l3_ind_oper_find_avail -bli_l3_ind_oper_get_enable -bli_l3_ind_oper_get_func -bli_l3_ind_oper_set_enable -bli_l3_ind_oper_set_enable_all -bli_l3_ind_set_enable_dt -bli_l3_packm -bli_l3_prune_unref_mparts_k -bli_l3_prune_unref_mparts_m -bli_l3_prune_unref_mparts_n -bli_l3_thread_decorator -bli_l3_thread_entry -bli_l3_thrinfo_create_root -bli_l3_thrinfo_free -bli_l3_thrinfo_free_paths -bli_l3_thrinfo_init_single -bli_l3_thrinfo_print_gemm_paths -bli_l3_thrinfo_print_trsm_paths -bli_lcm -bli_lsame +bli_l3_thrinfo_create +bli_lesc +bli_ltsc bli_machval -bli_malloc_intl bli_malloc_user -bli_mbool_create -bli_mbool_free -bli_mbool_init -bli_pba_acquire_m -bli_pba_compute_pool_block_sizes -bli_pba_compute_pool_block_sizes_dt -bli_pba_finalize -bli_pba_finalize_pools -bli_pba_init -bli_pba_init_pools -bli_pba_pool_size -bli_pba_query -bli_pba_release -bli_pba_rntm_set_pba -bli_memsys_finalize -bli_memsys_init bli_mkherm -bli_mkherm_check bli_mkherm_ex -bli_mkherm_ex_qfp bli_mksymm -bli_mksymm_check bli_mksymm_ex -bli_mksymm_ex_qfp bli_mktrim -bli_mktrim_check bli_mktrim_ex -bli_mktrim_ex_qfp bli_mulsc -bli_mulsc_check -bli_mulsc_qfp -bli_next_prime_factor bli_norm1m -bli_norm1m_check bli_norm1m_ex -bli_norm1m_ex_qfp bli_norm1v -bli_norm1v_check bli_norm1v_ex -bli_norm1v_ex_qfp bli_normfm -bli_normfm_check bli_normfm_ex -bli_normfm_ex_qfp bli_normfsc -bli_normfsc_check -bli_normfsc_qfp bli_normfv -bli_normfv_check bli_normfv_ex -bli_normfv_ex_qfp bli_normim -bli_normim_check bli_normim_ex -bli_normim_ex_qfp bli_normiv -bli_normiv_check bli_normiv_ex -bli_normiv_ex_qfp bli_obj_alloc_buffer -bli_obj_alloc_buffer_check bli_obj_attach_buffer -bli_obj_attach_buffer_check bli_obj_create bli_obj_create_1x1 
bli_obj_create_1x1_with_attached_buffer -bli_obj_create_check bli_obj_create_conf_to -bli_obj_create_const_check -bli_obj_create_scalar_check bli_obj_create_with_attached_buffer bli_obj_create_without_buffer -bli_obj_create_without_buffer_check -bli_obj_equals bli_obj_free -bli_obj_free_check -bli_obj_imag_equals -bli_obj_imag_is_zero bli_obj_print -bli_obj_print_check bli_obj_scalar_apply_scalar bli_obj_scalar_attach bli_obj_scalar_cast_to bli_obj_scalar_detach -bli_obj_scalar_equals -bli_obj_scalar_has_nonzero_imag bli_obj_scalar_init_detached bli_obj_scalar_init_detached_copy_of bli_obj_scalar_reset -bli_packm_acquire_mpart_l2r -bli_packm_acquire_mpart_t2b -bli_packm_acquire_mpart_tl2br +bli_pack_get_pack_a +bli_pack_get_pack_b +bli_pack_set_pack_a +bli_pack_set_pack_b +bli_packm_alloc +bli_packm_alloc_ex bli_packm_blk_var1 -bli_packm_blk_var1_md -bli_packm_cntl_create_node -bli_packm_init -bli_packm_init_check -bli_packm_init_pack -bli_packm_int -bli_packm_int_check -bli_packm_offset_to_panel_for -bli_packm_thrinfo_init -bli_packm_thrinfo_init_single -bli_packm_unb_var1 +bli_packm_scalar bli_param_map_blis_to_char_conj bli_param_map_blis_to_char_diag bli_param_map_blis_to_char_dt @@ -1415,33 +696,11 @@ bli_param_map_char_to_blis_dt bli_param_map_char_to_blis_side bli_param_map_char_to_blis_trans bli_param_map_char_to_blis_uplo -bli_param_map_netlib_to_blis_diag -bli_param_map_netlib_to_blis_side -bli_param_map_netlib_to_blis_trans -bli_param_map_netlib_to_blis_uplo -bli_partition_2x2 -bli_pblk_print -bli_pool_alloc_block -bli_pool_checkin_block -bli_pool_checkout_block -bli_pool_finalize -bli_pool_free_block -bli_pool_grow -bli_pool_init -bli_pool_print -bli_pool_reinit -bli_pool_shrink -bli_prime_factorization +bli_pba_query bli_printm -bli_printm_ex -bli_print_msg bli_printv -bli_printv_ex bli_projm -bli_projm_check bli_projv -bli_projv_check -bli_prune_unref_mparts bli_pthread_barrier_destroy bli_pthread_barrier_init bli_pthread_barrier_wait @@ -1458,30 
+717,22 @@ bli_pthread_mutex_trylock bli_pthread_mutex_unlock bli_pthread_once bli_randm -bli_randm_check bli_randm_ex -bli_randm_ex_qfp bli_randnm -bli_randnm_check bli_randnm_ex -bli_randnm_ex_qfp bli_randnv -bli_randnv_check bli_randnv_ex -bli_randnv_ex_qfp bli_randv -bli_randv_check bli_randv_ex -bli_randv_ex_qfp -bli_rntm_print +bli_rntm_init_from_global +bli_rntm_set_num_threads +bli_rntm_set_ways bli_rntm_set_ways_for_op -bli_rntm_set_ways_from_rntm bli_sabsqsc bli_saddd bli_saddd_ex bli_saddm bli_saddm_ex -bli_saddm_unb_var1 bli_saddsc bli_saddv bli_saddv_ex @@ -1489,7 +740,6 @@ bli_samaxv bli_samaxv_ex bli_sasumv bli_sasumv_ex -bli_sasumv_unb_var1 bli_saxpbyv bli_saxpbyv_ex bli_saxpy2v @@ -1500,65 +750,36 @@ bli_saxpyf bli_saxpyf_ex bli_saxpym bli_saxpym_ex -bli_saxpym_unb_var1 bli_saxpyv bli_saxpyv_ex -bli_sba_acquire -bli_sba_checkin_array -bli_sba_checkout_array -bli_sba_finalize -bli_sba_init -bli_sba_query -bli_sba_release -bli_sba_rntm_set_pool bli_scal2d -bli_scal2d_check bli_scal2d_ex -bli_scal2d_ex_qfp bli_scal2m -bli_scal2m_check bli_scal2m_ex -bli_scal2m_ex_qfp bli_scal2v -bli_scal2v_check bli_scal2v_ex -bli_scal2v_ex_qfp bli_scald -bli_scald_check bli_scald_ex -bli_scald_ex_qfp bli_scalm -bli_scalm_check bli_scalm_ex -bli_scalm_ex_qfp bli_scalv -bli_scalv_check bli_scalv_ex -bli_scalv_ex_qfp bli_sccastm bli_sccastnzm bli_sccastv bli_sccopysc -bli_scgemm_ker_var2_md bli_scopyd bli_scopyd_ex bli_scopym bli_scopym_ex -bli_scopym_unb_var1 bli_scopyv bli_scopyv_ex -bli_scpackm_blk_var1_md -bli_scpackm_cxk_1e_md -bli_scpackm_cxk_1r_md -bli_scpackm_struc_cxk_md bli_scxpbym_md bli_scxpbym_md_ex -bli_scxpbym_md_unb_var1 bli_sdcastm bli_sdcastnzm bli_sdcastv bli_sdcopysc -bli_sdgemm_ker_var2_md bli_sdivsc bli_sdotaxpyv bli_sdotaxpyv_ex @@ -1570,187 +791,112 @@ bli_sdotxf bli_sdotxf_ex bli_sdotxv bli_sdotxv_ex -bli_sdpackm_blk_var1_md -bli_sdpackm_cxk_1e_md -bli_sdpackm_cxk_1r_md -bli_sdpackm_struc_cxk_md bli_sdxpbym_md bli_sdxpbym_md_ex 
-bli_sdxpbym_md_unb_var1 +bli_seqm +bli_seqsc +bli_seqv bli_setd -bli_setd_check bli_setd_ex -bli_setd_ex_qfp bli_setid -bli_setid_check bli_setid_ex -bli_setid_ex_qfp bli_setijm +bli_setijv bli_setim bli_setiv bli_setm -bli_setm_check bli_setm_ex -bli_setm_ex_qfp bli_setrm bli_setrv bli_setsc -bli_setsc_check -bli_setsc_qfp bli_setv -bli_setv_check bli_setv_ex -bli_setv_ex_qfp bli_sfprintm bli_sfprintv bli_sgemm -bli_sgemm1m -bli_sgemm3m1 -bli_sgemm3mh -bli_sgemm4m1 -bli_sgemm4mb -bli_sgemm4mb_ker_var2 -bli_sgemm4mh bli_sgemm_ex -bli_sgemm_ker_var2 -bli_sgemmtrsm_l_ukernel -bli_sgemmtrsm_u_ukernel -bli_sgemm_ukernel +bli_sgemmt +bli_sgemmt_ex bli_sgemv bli_sgemv_ex -bli_sgemv_unb_var1 -bli_sgemv_unb_var2 -bli_sgemv_unf_var1 -bli_sgemv_unf_var2 bli_sger bli_sger_ex -bli_sger_unb_var1 -bli_sger_unb_var2 bli_sgetijm +bli_sgetijv bli_sgetsc +bli_sgesc +bli_sgtsc bli_shemm -bli_shemm1m -bli_shemm3m1 -bli_shemm3mh -bli_shemm4m1 -bli_shemm4mh bli_shemm_ex bli_shemv bli_shemv_ex -bli_shemv_unb_var1 -bli_shemv_unb_var2 -bli_shemv_unb_var3 -bli_shemv_unb_var4 -bli_shemv_unf_var1 -bli_shemv_unf_var1a -bli_shemv_unf_var3 -bli_shemv_unf_var3a bli_sher bli_sher2 bli_sher2_ex bli_sher2k -bli_sher2k1m -bli_sher2k3m1 -bli_sher2k3mh -bli_sher2k4m1 -bli_sher2k4mh bli_sher2k_ex -bli_sher2_unb_var1 -bli_sher2_unb_var2 -bli_sher2_unb_var3 -bli_sher2_unb_var4 -bli_sher2_unf_var1 -bli_sher2_unf_var4 bli_sher_ex bli_sherk -bli_sherk1m -bli_sherk3m1 -bli_sherk3mh -bli_sherk4m1 -bli_sherk4mh bli_sherk_ex -bli_sherk_l_ker_var2 -bli_sherk_u_ker_var2 -bli_sher_unb_var1 -bli_sher_unb_var2 bli_shiftd -bli_shiftd_check bli_shiftd_ex -bli_shiftd_ex_qfp bli_sinvertd bli_sinvertd_ex bli_sinvertsc bli_sinvertv bli_sinvertv_ex -bli_slamch +bli_sinvscald +bli_sinvscald_ex +bli_sinvscalm +bli_sinvscalm_ex +bli_sinvscalv +bli_sinvscalv_ex bli_sleep +bli_slesc +bli_sltsc bli_smachval bli_smkherm bli_smkherm_ex -bli_smkherm_unb_var1 bli_smksymm bli_smksymm_ex -bli_smksymm_unb_var1 bli_smktrim 
bli_smktrim_ex -bli_smktrim_unb_var1 bli_smulsc bli_snorm1m bli_snorm1m_ex -bli_snorm1m_unb_var1 bli_snorm1v bli_snorm1v_ex -bli_snorm1v_unb_var1 bli_snormfm bli_snormfm_ex -bli_snormfm_unb_var1 bli_snormfsc bli_snormfv bli_snormfv_ex -bli_snormfv_unb_var1 bli_snormim bli_snormim_ex -bli_snormim_unb_var1 bli_snormiv bli_snormiv_ex -bli_snormiv_unb_var1 -bli_spackm_blk_var1 -bli_spackm_cxk -bli_spackm_herm_cxk -bli_spackm_struc_cxk -bli_spackm_tri_cxk -bli_spackm_unb_var1 bli_sprintm -bli_sprintm_ex bli_sprintv -bli_sprintv_ex +bli_sqrtrsc bli_sqrtsc -bli_sqrtsc_check -bli_sqrtsc_qfp bli_srandm bli_srandm_ex -bli_srandm_unb_var1 bli_srandnm bli_srandnm_ex -bli_srandnm_unb_var1 bli_srandnv bli_srandnv_ex -bli_srandnv_unb_var1 bli_srandv bli_srandv_ex -bli_srandv_unb_var1 bli_sscal2d bli_sscal2d_ex bli_sscal2m bli_sscal2m_ex -bli_sscal2m_unb_var1 bli_sscal2v bli_sscal2v_ex bli_sscald bli_sscald_ex bli_sscalm bli_sscalm_ex -bli_sscalm_unb_var1 bli_sscalv bli_sscalv_ex bli_sscastm @@ -1762,42 +908,30 @@ bli_ssetd_ex bli_ssetid bli_ssetid_ex bli_ssetijm +bli_ssetijv bli_ssetm bli_ssetm_ex -bli_ssetm_unb_var1 bli_ssetsc bli_ssetv bli_ssetv_ex -bli_ssgemm_ker_var2_md bli_sshiftd bli_sshiftd_ex -bli_sspackm_blk_var1_md -bli_sspackm_cxk_1e_md -bli_sspackm_cxk_1r_md -bli_sspackm_struc_cxk_md +bli_ssqrtrsc bli_ssqrtsc bli_ssubd bli_ssubd_ex bli_ssubm bli_ssubm_ex -bli_ssubm_unb_var1 bli_ssubsc bli_ssubv bli_ssubv_ex bli_ssumsqv bli_ssumsqv_ex -bli_ssumsqv_unb_var1 bli_sswapv bli_sswapv_ex bli_ssxpbym_md bli_ssxpbym_md_ex -bli_ssxpbym_md_unb_var1 bli_ssymm -bli_ssymm1m -bli_ssymm3m1 -bli_ssymm3mh -bli_ssymm4m1 -bli_ssymm4mh bli_ssymm_ex bli_ssymv bli_ssymv_ex @@ -1805,330 +939,100 @@ bli_ssyr bli_ssyr2 bli_ssyr2_ex bli_ssyr2k -bli_ssyr2k1m -bli_ssyr2k3m1 -bli_ssyr2k3mh -bli_ssyr2k4m1 -bli_ssyr2k4mh bli_ssyr2k_ex bli_ssyr_ex bli_ssyrk -bli_ssyrk1m -bli_ssyrk3m1 -bli_ssyrk3mh -bli_ssyrk4m1 -bli_ssyrk4mh bli_ssyrk_ex -bli_string_mkupper bli_strmm -bli_strmm1m bli_strmm3 
-bli_strmm31m -bli_strmm33m1 -bli_strmm33mh -bli_strmm34m1 -bli_strmm34mh bli_strmm3_ex -bli_strmm3m1 -bli_strmm4m1 bli_strmm_ex -bli_strmm_ll_ker_var2 -bli_strmm_lu_ker_var2 -bli_strmm_rl_ker_var2 -bli_strmm_ru_ker_var2 bli_strmv bli_strmv_ex -bli_strmv_unb_var1 -bli_strmv_unb_var2 -bli_strmv_unf_var1 -bli_strmv_unf_var2 bli_strsm -bli_strsm1m -bli_strsm3m1 -bli_strsm4m1 bli_strsm_ex -bli_strsm_ll_ker_var2 -bli_strsm_l_ukernel -bli_strsm_lu_ker_var2 -bli_strsm_rl_ker_var2 -bli_strsm_ru_ker_var2 -bli_strsm_u_ukernel bli_strsv bli_strsv_ex -bli_strsv_unb_var1 -bli_strsv_unb_var2 -bli_strsv_unf_var1 -bli_strsv_unf_var2 bli_subd -bli_subd_check bli_subd_ex -bli_subd_ex_qfp bli_subm -bli_subm_check bli_subm_ex -bli_subm_ex_qfp bli_subsc -bli_subsc_check -bli_subsc_qfp bli_subv -bli_subv_check bli_subv_ex -bli_subv_ex_qfp bli_sumsqv -bli_sumsqv_check bli_sumsqv_ex -bli_sumsqv_ex_qfp -bli_sunpackm_blk_var1 -bli_sunpackm_cxk -bli_sunpackm_unb_var1 bli_sunzipsc bli_swapv -bli_swapv_check bli_swapv_ex -bli_swapv_ex_qfp bli_sxpbyd bli_sxpbyd_ex bli_sxpbym bli_sxpbym_ex -bli_sxpbym_unb_var1 bli_sxpbyv bli_sxpbyv_ex bli_symm -bli_symm1m -bli_symm3m1 -bli_symm3mh -bli_symm4m1 -bli_symm4mh -bli_symm_check bli_symm_ex -bli_symm_front -bli_symmind -bli_symmind_get_avail -bli_symmnat bli_symv -bli_symv_check bli_symv_ex -bli_symv_ex_qfp bli_syr bli_syr2 -bli_syr2_check bli_syr2_ex -bli_syr2_ex_qfp bli_syr2k -bli_syr2k1m -bli_syr2k3m1 -bli_syr2k3mh -bli_syr2k4m1 -bli_syr2k4mh -bli_syr2k_check bli_syr2k_ex -bli_syr2k_front -bli_syr2kind -bli_syr2kind_get_avail -bli_syr2knat -bli_syr_check bli_syr_ex -bli_syr_ex_qfp bli_syrk -bli_syrk1m -bli_syrk3m1 -bli_syrk3mh -bli_syrk4m1 -bli_syrk4mh -bli_syrk_check bli_syrk_ex -bli_syrk_front -bli_syrkind -bli_syrkind_get_avail -bli_syrknat bli_szcastm bli_szcastnzm bli_szcastv bli_szcopysc -bli_szgemm_ker_var2_md bli_szipsc -bli_szpackm_blk_var1_md -bli_szpackm_cxk_1e_md -bli_szpackm_cxk_1r_md -bli_szpackm_struc_cxk_md bli_szxpbym_md 
bli_szxpbym_md_ex -bli_szxpbym_md_unb_var1 bli_thrcomm_barrier -bli_thrcomm_barrier_atomic bli_thrcomm_bcast -bli_thrcomm_cleanup -bli_thrcomm_create -bli_thrcomm_free -bli_thrcomm_init -bli_thread_finalize -bli_thread_get_env bli_thread_get_ic_nt bli_thread_get_ir_nt bli_thread_get_jc_nt bli_thread_get_jr_nt bli_thread_get_num_threads bli_thread_get_pc_nt -bli_thread_init -bli_thread_init_rntm -bli_thread_init_rntm_from_env -bli_thread_range_b2t -bli_thread_range_l2r -bli_thread_range_mdim -bli_thread_range_ndim -bli_thread_range_r2l +bli_thread_get_thread_impl +bli_thread_get_thread_impl_str +bli_thread_launch bli_thread_range_sub -bli_thread_range_t2b -bli_thread_range_weighted_b2t -bli_thread_range_weighted_l2r -bli_thread_range_weighted_r2l -bli_thread_range_weighted_sub -bli_thread_range_weighted_t2b -bli_thread_range_width_l bli_thread_set_num_threads bli_thread_set_num_threads_ +bli_thread_set_thread_impl bli_thread_set_ways bli_thread_set_ways_ -bli_thrinfo_create -bli_thrinfo_create_for_cntl -bli_thrinfo_create_for_cntl_prenode bli_thrinfo_free -bli_thrinfo_grow -bli_thrinfo_init -bli_thrinfo_init_single -bli_thrinfo_rgrow -bli_thrinfo_rgrow_prenode bli_trmm -bli_trmm1m bli_trmm3 -bli_trmm31m -bli_trmm33m1 -bli_trmm33mh -bli_trmm34m1 -bli_trmm34mh bli_trmm3_ex -bli_trmm3_front -bli_trmm3ind -bli_trmm3ind_get_avail -bli_trmm3m1 -bli_trmm3nat -bli_trmm4m1 -bli_trmm_check -bli_trmm_determine_kc -bli_trmm_determine_kc_b -bli_trmm_determine_kc_f -bli_trmm_direct bli_trmm_ex -bli_trmm_front -bli_trmmind -bli_trmmind_get_avail -bli_trmm_ll_ker_var2 -bli_trmm_lu_ker_var2 -bli_trmmnat -bli_trmm_prune_unref_mparts_k -bli_trmm_prune_unref_mparts_m -bli_trmm_prune_unref_mparts_n -bli_trmm_rl_ker_var2 -bli_trmm_ru_ker_var2 -bli_trmm_xx_ker_var2 bli_trmv -bli_trmv_check bli_trmv_ex -bli_trmv_ex_qfp -bli_trmv_unb_var1 -bli_trmv_unb_var1_qfp -bli_trmv_unb_var2 -bli_trmv_unb_var2_qfp -bli_trmv_unf_var1 -bli_trmv_unf_var1_qfp -bli_trmv_unf_var2 -bli_trmv_unf_var2_qfp 
bli_trsm -bli_trsm1m -bli_trsm3m1 -bli_trsm4m1 -bli_trsm_blk_var1 -bli_trsm_blk_var2 -bli_trsm_blk_var3 -bli_trsm_check -bli_trsm_cntl_create -bli_trsm_cntl_create_node -bli_trsm_cntl_free -bli_trsm_determine_kc -bli_trsm_determine_kc_b -bli_trsm_determine_kc_f -bli_trsm_direct bli_trsm_ex -bli_trsm_front -bli_trsmind -bli_trsmind_get_avail -bli_trsm_int -bli_trsm_l_cntl_create -bli_trsm_ll_ker_var2 -bli_trsm_l_ukernel_qfp -bli_trsm_lu_ker_var2 -bli_trsmnat -bli_trsm_packa -bli_trsm_packb -bli_trsm_prune_unref_mparts_k -bli_trsm_prune_unref_mparts_m -bli_trsm_prune_unref_mparts_n -bli_trsm_r_cntl_create -bli_trsm_rl_ker_var2 -bli_trsm_ru_ker_var2 bli_trsm_ukernel -bli_trsm_u_ukernel_qfp -bli_trsm_xx_ker_var2 bli_trsv -bli_trsv_check bli_trsv_ex -bli_trsv_ex_qfp -bli_trsv_unb_var1 -bli_trsv_unb_var1_qfp -bli_trsv_unb_var2 -bli_trsv_unb_var2_qfp -bli_trsv_unf_var1 -bli_trsv_unf_var1_qfp -bli_trsv_unf_var2 -bli_trsv_unf_var2_qfp -bli_unpackm_blk_var1 -bli_unpackm_cntl_create_node -bli_unpackm_int -bli_unpackm_int_check -bli_unpackm_unb_var1 bli_unzipsc -bli_unzipsc_check -bli_unzipsc_qfp -bli_utilm_fprint_check -bli_utilm_mkhst_check -bli_utilm_norm_check -bli_utilm_rand_check -bli_utilv_norm_check -bli_utilv_sumsqv_check -bli_utilv_xa_check bli_xpbyd -bli_xpbyd_check bli_xpbyd_ex -bli_xpbyd_ex_qfp bli_xpbym -bli_xpbym_check bli_xpbym_ex -bli_xpbym_ex_qfp bli_xpbym_md bli_xpbym_md_ex -bli_xpbym_md_ex_qfp2 bli_xpbyv -bli_xpbyv_check bli_xpbyv_ex -bli_xpbyv_ex_qfp -bli_xxmv_check -bli_xxr_check bli_zabsqsc bli_zaddd bli_zaddd_ex bli_zaddm bli_zaddm_ex -bli_zaddm_unb_var1 bli_zaddsc bli_zaddv bli_zaddv_ex @@ -2136,7 +1040,6 @@ bli_zamaxv bli_zamaxv_ex bli_zasumv bli_zasumv_ex -bli_zasumv_unb_var1 bli_zaxpbyv bli_zaxpbyv_ex bli_zaxpy2v @@ -2147,33 +1050,24 @@ bli_zaxpyf bli_zaxpyf_ex bli_zaxpym bli_zaxpym_ex -bli_zaxpym_unb_var1 bli_zaxpyv bli_zaxpyv_ex bli_zccastm bli_zccastnzm bli_zccastv bli_zccopysc -bli_zcgemm_ker_var2_md bli_zcopyd bli_zcopyd_ex bli_zcopym 
bli_zcopym_ex -bli_zcopym_unb_var1 bli_zcopyv bli_zcopyv_ex -bli_zcpackm_blk_var1_md -bli_zcpackm_cxk_1e_md -bli_zcpackm_cxk_1r_md -bli_zcpackm_struc_cxk_md bli_zcxpbym_md bli_zcxpbym_md_ex -bli_zcxpbym_md_unb_var1 bli_zdcastm bli_zdcastnzm bli_zdcastv bli_zdcopysc -bli_zdgemm_ker_var2_md bli_zdivsc bli_zdotaxpyv bli_zdotaxpyv_ex @@ -2185,174 +1079,93 @@ bli_zdotxf bli_zdotxf_ex bli_zdotxv bli_zdotxv_ex -bli_zdpackm_blk_var1_md -bli_zdpackm_cxk_1e_md -bli_zdpackm_cxk_1r_md -bli_zdpackm_struc_cxk_md bli_zdxpbym_md bli_zdxpbym_md_ex -bli_zdxpbym_md_unb_var1 +bli_zeqm +bli_zeqsc +bli_zeqv bli_zfprintm bli_zfprintv bli_zgemm -bli_zgemm1m -bli_zgemm3m1 -bli_zgemm3mh -bli_zgemm4m1 -bli_zgemm4mb -bli_zgemm4mb_ker_var2 -bli_zgemm4mh bli_zgemm_ex -bli_zgemm_ker_var2 -bli_zgemm_md_c2r_ref -bli_zgemmtrsm_l_ukernel -bli_zgemmtrsm_u_ukernel -bli_zgemm_ukernel +bli_zgemmt +bli_zgemmt_ex bli_zgemv bli_zgemv_ex -bli_zgemv_unb_var1 -bli_zgemv_unb_var2 -bli_zgemv_unf_var1 -bli_zgemv_unf_var2 bli_zger bli_zger_ex -bli_zger_unb_var1 -bli_zger_unb_var2 bli_zgetijm +bli_zgetijv bli_zgetsc +bli_zgesc +bli_zgtsc bli_zhemm -bli_zhemm1m -bli_zhemm3m1 -bli_zhemm3mh -bli_zhemm4m1 -bli_zhemm4mh bli_zhemm_ex bli_zhemv bli_zhemv_ex -bli_zhemv_unb_var1 -bli_zhemv_unb_var2 -bli_zhemv_unb_var3 -bli_zhemv_unb_var4 -bli_zhemv_unf_var1 -bli_zhemv_unf_var1a -bli_zhemv_unf_var3 -bli_zhemv_unf_var3a bli_zher bli_zher2 bli_zher2_ex bli_zher2k -bli_zher2k1m -bli_zher2k3m1 -bli_zher2k3mh -bli_zher2k4m1 -bli_zher2k4mh bli_zher2k_ex -bli_zher2_unb_var1 -bli_zher2_unb_var2 -bli_zher2_unb_var3 -bli_zher2_unb_var4 -bli_zher2_unf_var1 -bli_zher2_unf_var4 bli_zher_ex bli_zherk -bli_zherk1m -bli_zherk3m1 -bli_zherk3mh -bli_zherk4m1 -bli_zherk4mh bli_zherk_ex -bli_zherk_l_ker_var2 -bli_zherk_u_ker_var2 -bli_zher_unb_var1 -bli_zher_unb_var2 bli_zinvertd bli_zinvertd_ex bli_zinvertsc bli_zinvertv bli_zinvertv_ex +bli_zinvscald +bli_zinvscald_ex +bli_zinvscalm +bli_zinvscalm_ex +bli_zinvscalv +bli_zinvscalv_ex 
bli_zipsc -bli_zipsc_check -bli_zipsc_qfp +bli_zlesc +bli_zltsc bli_zmachval bli_zmkherm bli_zmkherm_ex -bli_zmkherm_unb_var1 bli_zmksymm bli_zmksymm_ex -bli_zmksymm_unb_var1 bli_zmktrim bli_zmktrim_ex -bli_zmktrim_unb_var1 bli_zmulsc bli_znorm1m bli_znorm1m_ex -bli_znorm1m_unb_var1 bli_znorm1v bli_znorm1v_ex -bli_znorm1v_unb_var1 bli_znormfm bli_znormfm_ex -bli_znormfm_unb_var1 bli_znormfsc bli_znormfv bli_znormfv_ex -bli_znormfv_unb_var1 bli_znormim bli_znormim_ex -bli_znormim_unb_var1 bli_znormiv bli_znormiv_ex -bli_znormiv_unb_var1 -bli_zpackm_blk_var1 -bli_zpackm_cxk -bli_zpackm_cxk_1er -bli_zpackm_cxk_3mis -bli_zpackm_cxk_4mi -bli_zpackm_cxk_rih -bli_zpackm_herm_cxk -bli_zpackm_herm_cxk_1er -bli_zpackm_herm_cxk_3mis -bli_zpackm_herm_cxk_4mi -bli_zpackm_herm_cxk_rih -bli_zpackm_struc_cxk -bli_zpackm_struc_cxk_1er -bli_zpackm_struc_cxk_3mis -bli_zpackm_struc_cxk_4mi -bli_zpackm_struc_cxk_rih -bli_zpackm_tri_cxk -bli_zpackm_tri_cxk_1er -bli_zpackm_tri_cxk_3mis -bli_zpackm_tri_cxk_4mi -bli_zpackm_tri_cxk_rih -bli_zpackm_unb_var1 bli_zprintm -bli_zprintm_ex bli_zprintv -bli_zprintv_ex bli_zrandm bli_zrandm_ex -bli_zrandm_unb_var1 bli_zrandnm bli_zrandnm_ex -bli_zrandnm_unb_var1 bli_zrandnv bli_zrandnv_ex -bli_zrandnv_unb_var1 bli_zrandv bli_zrandv_ex -bli_zrandv_unb_var1 bli_zscal2d bli_zscal2d_ex bli_zscal2m bli_zscal2m_ex -bli_zscal2m_unb_var1 bli_zscal2v bli_zscal2v_ex bli_zscald bli_zscald_ex bli_zscalm bli_zscalm_ex -bli_zscalm_unb_var1 bli_zscalv bli_zscalv_ex bli_zscastm @@ -2364,42 +1177,30 @@ bli_zsetd_ex bli_zsetid bli_zsetid_ex bli_zsetijm +bli_zsetijv bli_zsetm bli_zsetm_ex -bli_zsetm_unb_var1 bli_zsetsc bli_zsetv bli_zsetv_ex -bli_zsgemm_ker_var2_md bli_zshiftd bli_zshiftd_ex -bli_zspackm_blk_var1_md -bli_zspackm_cxk_1e_md -bli_zspackm_cxk_1r_md -bli_zspackm_struc_cxk_md +bli_zsqrtrsc bli_zsqrtsc bli_zsubd bli_zsubd_ex bli_zsubm bli_zsubm_ex -bli_zsubm_unb_var1 bli_zsubsc bli_zsubv bli_zsubv_ex bli_zsumsqv bli_zsumsqv_ex -bli_zsumsqv_unb_var1 
bli_zswapv bli_zswapv_ex bli_zsxpbym_md bli_zsxpbym_md_ex -bli_zsxpbym_md_unb_var1 bli_zsymm -bli_zsymm1m -bli_zsymm3m1 -bli_zsymm3mh -bli_zsymm4m1 -bli_zsymm4mh bli_zsymm_ex bli_zsymv bli_zsymv_ex @@ -2407,257 +1208,47 @@ bli_zsyr bli_zsyr2 bli_zsyr2_ex bli_zsyr2k -bli_zsyr2k1m -bli_zsyr2k3m1 -bli_zsyr2k3mh -bli_zsyr2k4m1 -bli_zsyr2k4mh bli_zsyr2k_ex bli_zsyr_ex bli_zsyrk -bli_zsyrk1m -bli_zsyrk3m1 -bli_zsyrk3mh -bli_zsyrk4m1 -bli_zsyrk4mh bli_zsyrk_ex bli_ztrmm -bli_ztrmm1m bli_ztrmm3 -bli_ztrmm31m -bli_ztrmm33m1 -bli_ztrmm33mh -bli_ztrmm34m1 -bli_ztrmm34mh bli_ztrmm3_ex -bli_ztrmm3m1 -bli_ztrmm4m1 bli_ztrmm_ex -bli_ztrmm_ll_ker_var2 -bli_ztrmm_lu_ker_var2 -bli_ztrmm_rl_ker_var2 -bli_ztrmm_ru_ker_var2 bli_ztrmv bli_ztrmv_ex -bli_ztrmv_unb_var1 -bli_ztrmv_unb_var2 -bli_ztrmv_unf_var1 -bli_ztrmv_unf_var2 bli_ztrsm -bli_ztrsm1m -bli_ztrsm3m1 -bli_ztrsm4m1 bli_ztrsm_ex -bli_ztrsm_ll_ker_var2 -bli_ztrsm_l_ukernel -bli_ztrsm_lu_ker_var2 -bli_ztrsm_rl_ker_var2 -bli_ztrsm_ru_ker_var2 -bli_ztrsm_u_ukernel bli_ztrsv bli_ztrsv_ex -bli_ztrsv_unb_var1 -bli_ztrsv_unb_var2 -bli_ztrsv_unf_var1 -bli_ztrsv_unf_var2 -bli_zunpackm_blk_var1 -bli_zunpackm_cxk -bli_zunpackm_unb_var1 bli_zunzipsc bli_zxpbyd bli_zxpbyd_ex bli_zxpbym bli_zxpbym_ex -bli_zxpbym_unb_var1 bli_zxpbyv bli_zxpbyv_ex bli_zzcastm bli_zzcastnzm bli_zzcastv bli_zzcopysc -bli_zzgemm_ker_var2_md bli_zzipsc -bli_zzpackm_blk_var1_md -bli_zzpackm_cxk_1e_md -bli_zzpackm_cxk_1r_md -bli_zzpackm_struc_cxk_md bli_zzxpbym_md bli_zzxpbym_md_ex -bli_zzxpbym_md_unb_var1 -sasum_ -sasumsub_ -saxpy_ -scabs1_ -scasum_ -scasumsub_ -scnrm2_ -scnrm2sub_ -scopy_ -sdot_ -sdotsub_ -sdsdot_ -sdsdotsub_ -sgbmv_ -sgemm_ -sgemv_ -sger_ -snrm2_ -snrm2sub_ -srot_ -srotg_ -srotm_ -srotmg_ -ssbmv_ -sscal_ -sspmv_ -sspr_ -sspr2_ -sswap_ -ssymm_ -ssymv_ -ssyr_ -ssyr2_ -ssyr2k_ -ssyrk_ -stbmv_ -stbsv_ -stpmv_ -stpsv_ -strmm_ -strmv_ -strsm_ -strsv_ -dasum_ -dasumsub_ -daxpy_ -dcabs1_ -dcopy_ -ddot_ -ddotsub_ -dgbmv_ -dgemm_ -dgemv_ -dger_ -dnrm2_ 
-dnrm2sub_ -drot_ -drotg_ -drotm_ -drotmg_ -dsbmv_ -dscal_ -dsdot_ -dsdotsub_ -dspmv_ -dspr_ -dspr2_ -dswap_ -dsymm_ -dsymv_ -dsyr_ -dsyr2_ -dsyr2k_ -dsyrk_ -dtbmv_ -dtbsv_ -dtpmv_ -dtpsv_ -dtrmm_ -dtrmv_ -dtrsm_ -dtrsv_ -dzasum_ -dzasumsub_ -dznrm2_ -dznrm2sub_ +caxpby_ caxpy_ -ccopy_ -cdotc_ -cdotcsub_ -cdotu_ -cdotusub_ -cgbmv_ -cgemm_ -cgemv_ -cgerc_ -cgeru_ -chbmv_ -chemm_ -chemv_ -cher_ -cher2_ -cher2k_ -cherk_ -chpmv_ -chpr_ -chpr2_ -crotg_ -cscal_ -csrot_ -csscal_ -cswap_ -csymm_ -csyr2k_ -csyrk_ -ctbmv_ -ctbsv_ -ctpmv_ -ctpsv_ -ctrmm_ -ctrmv_ -ctrsm_ -ctrsv_ -zaxpy_ -zcopy_ -zdotc_ -zdotcsub_ -zdotu_ -zdotusub_ -zdrot_ -zdscal_ -zgbmv_ -zgemm_ -zgemv_ -zgerc_ -zgeru_ -zhbmv_ -zhemm_ -zhemv_ -zher_ -zher2_ -zher2k_ -zherk_ -zhpmv_ -zhpr_ -zhpr2_ -zrotg_ -zscal_ -zswap_ -zsymm_ -zsyr2k_ -zsyrk_ -ztbmv_ -ztbsv_ -ztpmv_ -ztpsv_ -ztrmm_ -ztrmv_ -ztrsm_ -ztrsv_ -icamax_ -icamaxsub_ -idamax_ -idamaxsub_ -isamax_ -isamaxsub_ -izamax_ -izamaxsub_ +cblas_caxpby cblas_caxpy cblas_ccopy cblas_cdotc_sub cblas_cdotu_sub cblas_cgbmv cblas_cgemm +cblas_cgemm3m +cblas_cgemm_batch +cblas_cgemmt +cblas_cgemmtr cblas_cgemv cblas_cgerc cblas_cgeru @@ -2686,11 +1277,15 @@ cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_dasum +cblas_daxpby cblas_daxpy cblas_dcopy cblas_ddot cblas_dgbmv cblas_dgemm +cblas_dgemm_batch +cblas_dgemmt +cblas_dgemmtr cblas_dgemv cblas_dger cblas_dnrm2 @@ -2726,6 +1321,7 @@ cblas_idamax cblas_isamax cblas_izamax cblas_sasum +cblas_saxpby cblas_saxpy cblas_scasum cblas_scnrm2 @@ -2734,6 +1330,9 @@ cblas_sdot cblas_sdsdot cblas_sgbmv cblas_sgemm +cblas_sgemm_batch +cblas_sgemmt +cblas_sgemmtr cblas_sgemv cblas_sger cblas_snrm2 @@ -2762,6 +1361,7 @@ cblas_strmv cblas_strsm cblas_strsv cblas_xerbla +cblas_zaxpby cblas_zaxpy cblas_zcopy cblas_zdotc_sub @@ -2769,6 +1369,10 @@ cblas_zdotu_sub cblas_zdscal cblas_zgbmv cblas_zgemm +cblas_zgemm3m +cblas_zgemm_batch +cblas_zgemmt +cblas_zgemmtr cblas_zgemv cblas_zgerc cblas_zgeru @@ -2795,3 +1399,190 @@ cblas_ztrmm 
cblas_ztrmv cblas_ztrsm cblas_ztrsv +ccopy_ +cdotc_ +cdotcsub_ +cdotu_ +cdotusub_ +cgbmv_ +cgemm3m_ +cgemm_ +cgemm_batch_ +cgemmt_ +cgemmtr_ +cgemv_ +cgerc_ +cgeru_ +chbmv_ +chemm_ +chemv_ +cher2_ +cher2k_ +cher_ +cherk_ +chpmv_ +chpr2_ +chpr_ +crotg_ +cscal_ +csrot_ +csscal_ +cswap_ +csymm_ +csyr2k_ +csyrk_ +ctbmv_ +ctbsv_ +ctpmv_ +ctpsv_ +ctrmm_ +ctrmv_ +ctrsm_ +ctrsv_ +dasum_ +dasumsub_ +daxpby_ +daxpy_ +dcabs1_ +dcopy_ +ddot_ +ddotsub_ +dgbmv_ +dgemm_ +dgemm_batch_ +dgemmt_ +dgemmtr_ +dgemv_ +dger_ +dnrm2_ +dnrm2sub_ +drot_ +drotg_ +drotm_ +drotmg_ +dsbmv_ +dscal_ +dsdot_ +dsdotsub_ +dspmv_ +dspr2_ +dspr_ +dswap_ +dsymm_ +dsymv_ +dsyr2_ +dsyr2k_ +dsyr_ +dsyrk_ +dtbmv_ +dtbsv_ +dtpmv_ +dtpsv_ +dtrmm_ +dtrmv_ +dtrsm_ +dtrsv_ +dzasum_ +dzasumsub_ +dznrm2_ +dznrm2sub_ +icamax_ +icamaxsub_ +idamax_ +idamaxsub_ +isamax_ +isamaxsub_ +izamax_ +izamaxsub_ +lsame_ +sasum_ +sasumsub_ +saxpby_ +saxpy_ +scabs1_ +scasum_ +scasumsub_ +scnrm2_ +scnrm2sub_ +scopy_ +sdot_ +sdotsub_ +sdsdot_ +sdsdotsub_ +sgbmv_ +sgemm_ +sgemm_batch_ +sgemmt_ +sgemmtr_ +sgemv_ +sger_ +snrm2_ +snrm2sub_ +srot_ +srotg_ +srotm_ +srotmg_ +ssbmv_ +sscal_ +sspmv_ +sspr2_ +sspr_ +sswap_ +ssymm_ +ssymv_ +ssyr2_ +ssyr2k_ +ssyr_ +ssyrk_ +stbmv_ +stbsv_ +stpmv_ +stpsv_ +strmm_ +strmv_ +strsm_ +strsv_ +xerbla_ +xerbla_array_ +zaxpby_ +zaxpy_ +zcopy_ +zdotc_ +zdotcsub_ +zdotu_ +zdotusub_ +zdrot_ +zdscal_ +zgbmv_ +zgemm3m_ +zgemm_ +zgemm_batch_ +zgemmt_ +zgemmtr_ +zgemv_ +zgerc_ +zgeru_ +zhbmv_ +zhemm_ +zhemv_ +zher2_ +zher2k_ +zher_ +zherk_ +zhpmv_ +zhpr2_ +zhpr_ +zrotg_ +zscal_ +zswap_ +zsymm_ +zsyr2k_ +zsyrk_ +ztbmv_ +ztbsv_ +ztpmv_ +ztpsv_ +ztrmm_ +ztrmv_ +ztrsm_ +ztrsv_ diff --git a/build/regen-symbols.sh b/build/old/regen-symbols.sh similarity index 100% rename from build/regen-symbols.sh rename to build/old/regen-symbols.sh diff --git a/build/plugin/Makefile b/build/plugin/Makefile new file mode 100644 index 0000000000..92554e3e80 --- /dev/null +++ b/build/plugin/Makefile @@ -0,0 +1,524 @@ +# +# +# BLIS 
+# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2023, Southern Methodist University +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# + +# +# --- Makefile PHONY target definitions ---------------------------------------- +# + +.PHONY: all \ + plugin \ + showconfig \ + clean cleanmk cleanlib distclean \ + check-env check-env-make-defs check-env-fragments check-env-mk + + +# +# --- Include config makefile definitions -------------------------------------- +# + +# Define the name of the config makefile. +CONFIG_MK_FILE := config.mk + +# Include the configuration file. +-include $(CONFIG_MK_FILE) + + +# +# --- Include common makefile definitions -------------------------------------- +# + +INC_PATH := $(includedir)/blis + +# Define the name of the common makefile. +COMMON_MK_FILE := $(sharedir)/blis/common.mk + +# Include the configuration file. +include $(COMMON_MK_FILE) + +# Detect whether we actually got the configuration file. If we didn't, then +# it is likely that the user has not yet generated it (via configure). +ifeq ($(strip $(COMMON_MK_INCLUDED)),yes) +COMMON_MK_PRESENT := yes +else +COMMON_MK_PRESENT := no +endif + +# Source suffixes. +CONFIG_SRC_SUFS := c cxx cpp +KERNELS_SRC_SUFS := c cxx cpp s S +REFKERN_SRC_SUFS := c cxx cpp +FRAME_SRC_SUFS := c cxx cpp + +# Make sure the plugin path is included when searching for headers (e.g. bli_plugin_.h). +CINCFLAGS += -I$(DIST_PATH) + +PLUGIN_A_PATH := $(BASE_LIB_PATH)/libblis_$(PLUGIN_NAME).a +PLUGIN_SO_PATH := $(BASE_LIB_PATH)/libblis_$(PLUGIN_NAME).$(SHLIB_EXT) + +# Specify the shared library's 'soname' field. +# NOTE: The flag for creating shared objects is different for Linux and OS X. +LDFLAGS += -L$(libdir) -lblis +ifeq ($(OS_NAME),Darwin) +# OS X shared library link flags. +SOFLAGS := -dynamiclib +else +SOFLAGS := -shared +endif + +# +# --- Main target variable definitions ----------------------------------------- +# + +# --- Object file paths --- + +# Construct the base object file path for the current configuration. 
+BASE_OBJ_PATH := ./$(OBJ_DIR)/$(CONFIG_NAME) + +# Construct base object file paths corresponding to the four locations +# of source code. +BASE_OBJ_CONFIG_PATH := $(BASE_OBJ_PATH)/$(CONFIG_DIR) +BASE_OBJ_FRAME_PATH := $(BASE_OBJ_PATH)/$(FRAME_DIR) +BASE_OBJ_REFKERN_PATH := $(BASE_OBJ_PATH)/$(REFKERN_DIR) +BASE_OBJ_KERNELS_PATH := $(BASE_OBJ_PATH)/$(KERNELS_DIR) + +# --- Determine which libraries to build --- + +MK_LIBS := + +ifeq ($(MK_ENABLE_STATIC),yes) +MK_LIBS += $(PLUGIN_A_PATH) +endif +ifeq ($(MK_ENABLE_SHARED),yes) +MK_LIBS += $(PLUGIN_SO_PATH) +endif + +# +# --- Library object definitions ----------------------------------------------- +# + +# In this section, we will isolate the relevant source code filepaths and +# convert them to lists of object filepaths. Relevant source code falls into +# four categories: configuration source; architecture-specific kernel source; +# reference kernel source; and general framework source. + +# $(call gen-obj-paths-from-src file_exts, src_files, base_src_path, base_obj_path) +gen-obj-paths-from-src = $(foreach ch, $(1), \ + $(patsubst $(3)/%.$(ch), \ + $(4)/%.o, \ + $(filter %.$(ch), $(2)) ) ) + +# Generate object file paths for source code found in the sub-configuration +# directories. +MK_CONFIG_OBJS := $(call gen-obj-paths-from-src,$(CONFIG_SRC_SUFS),$(MK_CONFIG_SRC),$(CONFIG_PATH),$(BASE_OBJ_CONFIG_PATH)) + +# Generate object file paths for architecture-specific kernel source code. +# We target only .c, .s, and .S files. Note that MK_KERNELS_SRC is already +# limited to the kernel source corresponding to the kernel sets in +# KERNEL_LIST. This is because the configure script only propogated makefile +# fragments into those specific kernel subdirectories. +MK_KERNELS_OBJS := $(call gen-obj-paths-from-src,$(KERNELS_SRC_SUFS),$(MK_KERNELS_SRC),$(KERNELS_PATH),$(BASE_OBJ_KERNELS_PATH)) + +# Generate object file paths for reference kernels, with one set of object +# files for each sub-configuration in CONFIG_LIST. 
Note that due to the +# nuances of naming the reference kernel files, we can't use the function +# gen-obj-paths-from-src as we do above and below. +MK_REFKERN_OBJS := $(foreach suf, $(REFKERN_SRC_SUFS), \ + $(foreach arch, $(CONFIG_LIST), \ + $(patsubst $(REFKERN_PATH)/%_$(REFNM).$(suf), \ + $(BASE_OBJ_REFKERN_PATH)/$(arch)/%_$(arch)_$(REFNM).o, \ + $(filter %.$(suf), $(MK_REFKERN_SRC)) \ + ) \ + ) \ + ) + +# Generate object file paths for all of the portable framework source code. +MK_FRAME_OBJS := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH)) + +# Combine all of the object files into some readily-accessible variables. +MK_PLUGIN_OBJS := $(MK_CONFIG_OBJS) \ + $(MK_KERNELS_OBJS) \ + $(MK_REFKERN_OBJS) \ + $(MK_FRAME_OBJS) + + +# +# --- Targets/rules ------------------------------------------------------------ +# + +# --- Primary targets --- + +all: libs + +libs: plugin + +clean: cleanlib + + +# --- Environment check rules --- + +check-env: check-env-make-defs check-env-fragments check-env-mk + +check-env-mk: +ifeq ($(CONFIG_MK_PRESENT),no) + $(error Cannot proceed: config.mk not detected! Run configure first) +endif + +check-env-fragments: check-env-mk +ifeq ($(MAKEFILE_FRAGMENTS_PRESENT),no) + $(error Cannot proceed: makefile fragments not detected! Run configure first) +endif + +check-env-make-defs: check-env-fragments +ifeq ($(ALL_MAKE_DEFS_MK_PRESENT),no) + $(error Cannot proceed: Some make_defs.mk files not found or mislabeled!) +endif + + +# --- General source code / object code rules --- + +# FGVZ: Add support for compiling .s and .S files in 'config'/'kernels' +# directories. +# - May want to add an extra foreach loop around function eval/call. + +# first argument: a configuration name from config_list, used to look up the +# CFLAGS to use during compilation. 
+define make-config-rule +$(BASE_OBJ_CONFIG_PATH)/$(1)/%.o: $(CONFIG_PATH)/$(1)/%.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(call get-config-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-config-text-for,$(1)) + @$(CC) $(call get-config-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif + +$(BASE_OBJ_CONFIG_PATH)/$(1)/%.o: $(CONFIG_PATH)/$(1)/%.cxx $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-config-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-config-cxxtext-for,$(1)) + @$(CXX) $(call get-config-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif + +$(BASE_OBJ_CONFIG_PATH)/$(1)/%.o: $(CONFIG_PATH)/$(1)/%.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-config-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-config-cxxtext-for,$(1)) + @$(CXX) $(call get-config-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif +endef + +# first argument: a kernel set (name) being targeted (e.g. haswell). +# The 'trailing' % is important so that these are technically pattern rules and the appropriate one can be +# selected based on the suffix of bli_cntx_ref. 
+define make-refinit-rule +$(BASE_OBJ_REFKERN_PATH)/$(1)/bli_cntx_$(1)_ref%.o: $(REFKERN_PATH)/bli_cntx_ref%.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(call get-refinit-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-refinit-text-for,$(1)) + @$(CC) $(call get-refinit-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif + +$(BASE_OBJ_REFKERN_PATH)/$(1)/bli_cntx_$(1)_ref%.o: $(REFKERN_PATH)/bli_cntx_ref%.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-refinit-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-refinit-cxxtext-for,$(1)) + @$(CXX) $(call get-refinit-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif + +$(BASE_OBJ_REFKERN_PATH)/$(1)/bli_cntx_$(1)_ref%.o: $(REFKERN_PATH)/bli_cntx_ref%.cxx $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-refinit-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-refinit-cxxtext-for,$(1)) + @$(CXX) $(call get-refinit-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif +endef + +# first argument: a kernel set (name) being targeted (e.g. haswell). 
+define make-refkern-rule +$(BASE_OBJ_REFKERN_PATH)/$(1)/%_$(1)_ref.o: $(REFKERN_PATH)/%_ref.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(call get-refkern-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-refkern-text-for,$(1)) + @$(CC) $(call get-refkern-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif + +$(BASE_OBJ_REFKERN_PATH)/$(1)/%_$(1)_ref.o: $(REFKERN_PATH)/%_ref.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-refkern-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-refkern-cxxtext-for,$(1)) + @$(CXX) $(call get-refkern-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif + +$(BASE_OBJ_REFKERN_PATH)/$(1)/%_$(1)_ref.o: $(REFKERN_PATH)/%_ref.cxx $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-refkern-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-refkern-cxxtext-for,$(1)) + @$(CXX) $(call get-refkern-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif +endef + +# first argument: a configuration name from the union of config_list and +# config_name, used to look up the CFLAGS to use during compilation. 
+define make-frame-rule +$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.c $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(call get-frame-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-frame-text-for,$(1)) + @$(CC) $(call get-frame-cflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif + +$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.cxx $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-frame-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-frame-cxxtext-for,$(1)) + @$(CXX) $(call get-frame-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif + +$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-frame-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-frame-cxxtext-for,$(1)) + @$(CXX) $(call get-frame-cxxflags-for,$(1)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif +endef + +# first argument: a kernel set (name) being targeted (e.g. haswell). +# second argument: the configuration whose CFLAGS we should use in compilation. +# third argument: the kernel file suffix being considered. 
+define make-kernels-rule +$(BASE_OBJ_KERNELS_PATH)/$(1)/%.o: $(KERNELS_PATH)/$(1)/%.$(3) $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) + @mkdir -p $$(dir $$@) +ifeq ($(3),$(filter cxx cpp,$(3))) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-kernel-cxxflags-for,$(2)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-kernel-cxxtext-for,$(2)) + @$(CXX) $(call get-kernel-cxxflags-for,$(2)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif +else +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(call get-kernel-cflags-for,$(2)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-kernel-text-for,$(2)) + @$(CC) $(call get-kernel-cflags-for,$(2)) -DBLIS_PNAME=$(PLUGIN_NAME) -c $$< -o $$@ +endif +endif +endef + +# Define functions to choose the correct sub-configuration name for the +# given kernel set. This function is called when instantiating the +# make-kernels-rule. +get-config-for-kset = $(lastword $(subst :, ,$(filter $(1):%,$(KCONFIG_MAP)))) + +# Instantiate the build rule for files in the configuration directory for +# each of the sub-configurations in CONFIG_LIST with the CFLAGS designated +# for that sub-configuration. +$(foreach conf, $(CONFIG_LIST), $(eval $(call make-config-rule,$(conf)))) + +# Instantiate the build rule for reference kernel initialization and +# reference kernels for each of the sub-configurations in CONFIG_LIST with +# the CFLAGS designated for that sub-configuration. +$(foreach conf, $(CONFIG_LIST), $(eval $(call make-refinit-rule,$(conf)))) +$(foreach conf, $(CONFIG_LIST), $(eval $(call make-refkern-rule,$(conf)))) + +# Instantiate the build rule for framework files. Use the CFLAGS for the +# configuration family, which exists in the directory whose name is equal to +# CONFIG_NAME. Note that this doesn't need to be in a loop since we expect +# CONFIG_NAME to only ever contain a single name. 
(BTW: If CONFIG_NAME refers +# to a singleton family, then CONFIG_LIST contains CONFIG_NAME as its only +# item.) +$(foreach conf, $(CONFIG_NAME), $(eval $(call make-frame-rule,$(conf)))) + +# Instantiate the build rule for optimized kernels for each of the kernel +# sets in KERNEL_LIST with the CFLAGS designated for the sub-configuration +# specified by the KCONFIG_MAP. +$(foreach suf, $(KERNELS_SRC_SUFS), \ +$(foreach kset, $(KERNEL_LIST), $(eval $(call make-kernels-rule,$(kset),$(call get-config-for-kset,$(kset)),$(suf))))) + + +# --- All-purpose library rule (static and shared) --- + +plugin: check-env $(MK_LIBS) + + +# --- Static library archiver rules --- + +$(PLUGIN_A_PATH): $(MK_PLUGIN_OBJS) + @mkdir -p $(dir $@) +ifeq ($(ENABLE_VERBOSE),yes) +ifeq ($(ARG_MAX_HACK),yes) + $(file > $@.in,$^) + $(AR) $(ARFLAGS) $@ @$@.in + $(RM_F) $@.in + $(RANLIB) $@ +else + $(AR) $(ARFLAGS) $@ $? + $(RANLIB) $@ +endif +else # ifeq ($(ENABLE_VERBOSE),no) +ifeq ($(ARG_MAX_HACK),yes) + @echo "Archiving $@" + @$(file > $@.in,$^) + @$(AR) $(ARFLAGS) $@ @$@.in + @$(RM_F) $@.in + @$(RANLIB) $@ +else + @echo "Archiving $@" + @$(AR) $(ARFLAGS) $@ $? 
+ @$(RANLIB) $@ +endif +endif + + +# --- Shared library linker rules --- + +$(PLUGIN_SO_PATH): $(MK_PLUGIN_OBJS) + @mkdir -p $(dir $@) +ifeq ($(ENABLE_VERBOSE),yes) +ifeq ($(ARG_MAX_HACK),yes) + $(file > $@.in,$^) + $(LINKER) $(SOFLAGS) -o $(LIBBLIS_SO_OUTPUT_NAME) @$@.in $(LDFLAGS) + $(RM_F) $@.in +else + $(LINKER) $(SOFLAGS) -o $(LIBBLIS_SO_OUTPUT_NAME) $^ $(LDFLAGS) +endif +else # ifeq ($(ENABLE_VERBOSE),no) +ifeq ($(ARG_MAX_HACK),yes) + @echo "Dynamically linking $@" + @$(file > $@.in,$^) + @$(LINKER) $(SOFLAGS) -o $(LIBBLIS_SO_OUTPUT_NAME) @$@.in $(LDFLAGS) + @$(RM_F) $@.in +else + @echo "Dynamically linking $@" + @$(LINKER) $(SOFLAGS) -o $(LIBBLIS_SO_OUTPUT_NAME) $^ $(LDFLAGS) +endif +endif + +# --- Query current configuration --- + +showconfig: check-env + @echo "configuration family: $(CONFIG_NAME)" + @echo "sub-configurations: $(CONFIG_LIST)" + @echo "requisite kernels sets: $(KERNEL_LIST)" + @echo "kernel-to-config map: $(KCONFIG_MAP)" + @echo "-------------------------" + @echo "BLIS version string: $(VERSION)" + @echo ".so major version: $(SO_MAJOR)" + @echo ".so minor.build vers: $(SO_MINORB)" + @echo "install libdir: $(INSTALL_LIBDIR)" + @echo "install includedir: $(INSTALL_INCDIR)" + @echo "install sharedir: $(INSTALL_SHAREDIR)" + @echo "debugging status: $(DEBUG_TYPE)" + @echo "enable AddressSanitizer? $(MK_ENABLE_ASAN)" + @echo "enabled threading model(s): $(THREADING_MODEL)" + @echo "enable BLAS API? $(MK_ENABLE_BLAS)" + @echo "enable CBLAS API? $(MK_ENABLE_CBLAS)" + @echo "build static library? $(MK_ENABLE_STATIC)" + @echo "build shared library? $(MK_ENABLE_SHARED)" + @echo "ARG_MAX hack enabled? 
$(ARG_MAX_HACK)" + + +# --- Clean rules --- + +cleanmk: +ifeq ($(IS_CONFIGURED),yes) +ifeq ($(ENABLE_VERBOSE),yes) + - $(FIND) $(CONFIG_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) + - $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) + - $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +else + @echo "Removing makefile fragments from $(CONFIG_FRAG_PATH)" + @- $(FIND) $(CONFIG_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) + @echo "Removing makefile fragments from $(REFKERN_FRAG_PATH)" + @- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) + @echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)" + @- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +endif +endif + +cleanlib: +ifeq ($(IS_CONFIGURED),yes) +ifeq ($(ENABLE_VERBOSE),yes) + - $(FIND) $(BASE_OBJ_PATH) -name "*.o" | $(XARGS) $(RM_F) + - $(RM_F) $(LIBBLIS_A_PATH) + - $(RM_F) $(LIBBLIS_SO_PATH) +else + @echo "Removing object files from $(BASE_OBJ_PATH)" + @- $(FIND) $(BASE_OBJ_PATH) -name "*.o" | $(XARGS) $(RM_F) + @echo "Removing libraries from $(BASE_LIB_PATH)" + @- $(RM_F) $(LIBBLIS_A_PATH) + @- $(RM_F) $(LIBBLIS_SO_PATH) +endif +endif + +distclean: cleanmk cleanlib +ifeq ($(IS_CONFIGURED),yes) +ifeq ($(ENABLE_VERBOSE),yes) + - $(RM_F) $(CONFIG_MK_FILE) + - $(RM_RF) $(OBJ_DIR) + - $(RM_RF) $(LIB_DIR) +else + @echo "Removing $(CONFIG_MK_FILE)" + @- $(RM_F) $(CONFIG_MK_FILE) + @echo "Removing $(OBJ_DIR)" + @- $(RM_RF) $(OBJ_DIR) + @echo "Removing $(LIB_DIR)" + @- $(RM_RF) $(LIB_DIR) +endif +endif + diff --git a/frame/include/level0/1e/bli_scal1es.h b/build/plugin/bli_kernel_defs_zen3.h similarity index 71% rename from frame/include/level0/1e/bli_scal1es.h rename to build/plugin/bli_kernel_defs_zen3.h index 485a8ae645..adda5af9a8 100644 --- a/frame/include/level0/1e/bli_scal1es.h +++ b/build/plugin/bli_kernel_defs_zen3.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like 
libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Southern Methodist University Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,22 +32,18 @@ */ -#ifndef BLIS_SCAL1ES_H -#define BLIS_SCAL1ES_H +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H -// scal1es +// ----------------------------------------------------------------------------> +// -- Example macros to be used in reference kernels compiled for zen3 --------> +// ----------------------------------------------------------------------------> -#define bli_cscal1es( a, yri, yir ) \ -{ \ - bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \ - bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \ -} +#define MY_KERNEL_2_ROW_MAJOR 1 -#define bli_zscal1es( a, yri, yir ) \ -{ \ - bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \ - bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \ -} +// <---------------------------------------------------------------------------- +// <---------------------------------------------------------------------------- +// <---------------------------------------------------------------------------- -#endif +//#endif diff --git a/build/plugin/bli_plugin.h.in b/build/plugin/bli_plugin.h.in new file mode 100644 index 0000000000..ad87e8c73e --- /dev/null +++ b/build/plugin/bli_plugin.h.in @@ -0,0 +1,146 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Southern Methodist University + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Parameters passed to the plugin registration and initialization +// functions. +// + +#define plugin_@plugin_name@_params \ +\ + kerid_t* bszids, /* <----- Example arguments */ \ + kerid_t* kerids, /* <----- for plugin registration */ \ + kerid_t* prefids /* <----- and initialization. */ + +#define plugin_@plugin_name@_params_only \ +\ + bszids, /* <----- We also sometimes need */ \ + kerids, /* <----- the names of the */ \ + prefids /* <----- arguments without types. */ + +// ----------------------------------------------------------------------------> +// -- Example blocksize, micro-kernel, and preference enumerations. 
-----------> +// ----------------------------------------------------------------------------> + +enum +{ + MY_BLKSZ_1, + MY_BLKSZ_2, + + MY_NUM_BLOCK_SIZES +}; + +enum +{ + MY_KERNEL_1, + MY_KERNEL_2, + + MY_NUM_KERNELS +}; + +enum +{ + MY_PREF_1, + MY_PREF_2, + + MY_NUM_KERNEL_PREFS +}; + +// <---------------------------------------------------------------------------- +// <---------------------------------------------------------------------------- +// <---------------------------------------------------------------------------- + +// ----------------------------------------------------------------------------> +// -- Example prototypes for kernel functions. --------------------------------> +// ----------------------------------------------------------------------------> + +// Reference kernels for all data types +#undef GENTPROT +#define GENTPROT( ctype, ch, config_infix ) \ +\ +void PASTEMAC(ch,my_kernel_1,config_infix,BLIS_REF_SUFFIX) \ + ( \ + int n, \ + const ctype* a, \ + ctype* x \ + ); + +// Reference kernels for only complex data types +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctyper, ch, chr, config_infix ) \ +\ +void PASTEMAC(ch,my_kernel_2,config_infix,BLIS_REF_SUFFIX) \ + ( \ + int m, \ + int n, \ + ctype* a \ + ); + +// Optimized kernels +void bli_dmy_kernel_1_zen3 + ( + int n, + const double* a, + double* x + ); + +// Generate reference kernel prototypes for each configuration AND data type +#undef GENTCONF +#define GENTCONF( CONFIG, config ) \ +\ +INSERT_GENTPROT_BASIC( PASTECH(_,config) ) \ +INSERT_GENTPROTCO_BASIC( PASTECH(_,config) ) + +INSERT_GENTCONF + +// <---------------------------------------------------------------------------- +// <---------------------------------------------------------------------------- +// <---------------------------------------------------------------------------- + +// +// Registration and intialization function prototypes. 
+// + +#undef GENTCONF +#define GENTCONF( CONFIG, config ) \ +\ +void PASTEMAC(plugin_init_@plugin_name@_,config)( plugin_@plugin_name@_params ); \ +void PASTEMAC(plugin_init_@plugin_name@_,config,BLIS_REF_SUFFIX)( plugin_@plugin_name@_params ); + +INSERT_GENTCONF + +BLIS_EXPORT_BLIS err_t bli_plugin_register_@plugin_name@( plugin_@plugin_name@_params ); + diff --git a/build/plugin/bli_plugin_init_ref.c b/build/plugin/bli_plugin_init_ref.c new file mode 100644 index 0000000000..48767e8a01 --- /dev/null +++ b/build/plugin/bli_plugin_init_ref.c @@ -0,0 +1,108 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Southern Methodist University + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include @PLUGIN_HEADER@ + +// -- Macros to help concisely instantiate bli_func_init() --------------------- + +#define gen_func_init_ro( func_p, opname ) \ +do { \ + bli_func_init( func_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \ + NULL, NULL ); \ +} while (0) + +#define gen_func_init_co( func_p, opname ) \ +do { \ + bli_func_init( func_p, NULL, NULL, \ + PASTEMAC(c,opname), PASTEMAC(z,opname) ); \ +} while (0) + +#define gen_func_init( func_p, opname ) \ +do { \ + bli_func_init( func_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \ + PASTEMAC(c,opname), PASTEMAC(z,opname) ); \ +} while (0) + +// ----------------------------------------------------------------------------- + +void PASTEMAC(plugin_init_@plugin_name@,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX) + ( + plugin_@plugin_name@_params + ) +{ + cntx_t* cntx = ( cntx_t* )bli_gks_lookup_id( PASTECH(BLIS_ARCH,BLIS_CNAME_UPPER_INFIX) ); + ( void )cntx; + + // ------------------------------------------------------------------------> + // -- Example Initialization ----------------------------------------------> + // ------------------------------------------------------------------------> + + blksz_t blkszs[ MY_NUM_BLOCK_SIZES ]; + kerid_t bmults[ MY_NUM_BLOCK_SIZES ]; + func_t funcs[ MY_NUM_KERNELS ]; + mbool_t mbools[ MY_NUM_KERNEL_PREFS ]; + + // -- Set blocksizes ------------------------------------------------------- + // s d c z + bli_blksz_init_easy( &blkszs[ MY_BLKSZ_1 ], 256, 128, 128, 64 
); + bli_blksz_init_easy( &blkszs[ MY_BLKSZ_2 ], 256, 256, 256, 256 ); + bmults[ MY_BLKSZ_1 ] = bszids[ MY_BLKSZ_1 ]; + bmults[ MY_BLKSZ_2 ] = bszids[ MY_BLKSZ_2 ]; + + // -- Set micro-kernels ---------------------------------------------------- + + gen_func_init ( &funcs[ MY_KERNEL_1 ], PASTECH(my_kernel_1,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX) ); + gen_func_init_co( &funcs[ MY_KERNEL_2 ], PASTECH(my_kernel_2,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX) ); + + // -- Set preferences ------------------------------------------------------ + // s d c z + bli_mbool_init( &mbools[ MY_PREF_1 ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ MY_PREF_2 ], FALSE, FALSE, FALSE, FALSE ); + + // -- Put block sizes, kernels, and preferences into the context ----------- + + for ( dim_t i = 0; i < MY_NUM_BLOCK_SIZES; i++ ) + bli_cntx_set_blksz( bszids[ i ], &blkszs[ i ], bmults[ i ], cntx ); + + for ( dim_t i = 0; i < MY_NUM_KERNELS; i++ ) + bli_cntx_set_ukr( kerids[ i ], &funcs[ i ], cntx ); + + for ( dim_t i = 0; i < MY_NUM_KERNEL_PREFS; i++ ) + bli_cntx_set_ukr_pref( prefids[ i ], &mbools[ i ], cntx ); + + // <------------------------------------------------------------------------ + // <------------------------------------------------------------------------ + // <------------------------------------------------------------------------ +} + diff --git a/frame/include/level0/ri/bli_copyjris.h b/build/plugin/bli_plugin_init_zen3.c similarity index 51% rename from frame/include/level0/ri/bli_copyjris.h rename to build/plugin/bli_plugin_init_zen3.c index 86fd705423..f5cd2c8bd1 100644 --- a/frame/include/level0/ri/bli_copyjris.h +++ b/build/plugin/bli_plugin_init_zen3.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. 
- Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Southern Methodist University Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,35 +32,65 @@ */ -#ifndef BLIS_COPYJRIS_H -#define BLIS_COPYJRIS_H +#include @PLUGIN_HEADER@ -// copyjris +void PASTEMAC(plugin_init_@plugin_name@,BLIS_CNAME_INFIX) + ( + plugin_@plugin_name@_params + ) +{ + cntx_t* cntx = ( cntx_t* )bli_gks_lookup_id( PASTECH(BLIS_ARCH,BLIS_CNAME_UPPER_INFIX) ); + ( void )cntx; -#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) ) -#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) ) -#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) ) -#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) ) + PASTEMAC(plugin_init_@plugin_name@,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX) + ( + plugin_@plugin_name@_params_only + ); -#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi ) -#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi ) -#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) -#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) + // ------------------------------------------------------------------------> + // -- Example Initialization ----------------------------------------------> + // ------------------------------------------------------------------------> -#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi ) -#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi ) -#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) -#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) + // Update the context with optimized native micro-kernels. 
+ bli_cntx_set_ukrs + ( + cntx, -#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi ) -#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi ) -#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) -#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) + kerids[ MY_KERNEL_1 ], BLIS_DOUBLE, bli_dmy_kernel_1_zen3, -#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi ) -#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi ) -#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) -#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) + BLIS_VA_END + ); -#endif + // Update the context with preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + prefids[ MY_PREF_1 ], BLIS_DOUBLE, TRUE, + prefids[ MY_PREF_2 ], BLIS_DOUBLE, TRUE, + + BLIS_VA_END + ); + + blksz_t blkszs[ MY_NUM_BLOCK_SIZES ]; + bszid_t bmults[ MY_NUM_BLOCK_SIZES ]; + + // Update block sizes + // s d c z + bli_blksz_init_easy( &blkszs[ MY_BLKSZ_1 ], 320, 240, 182, 96 ); + bmults[ MY_BLKSZ_1 ] = bszids[ MY_BLKSZ_1 ]; + + bli_cntx_set_blkszs + ( + cntx, + + bszids[ MY_BLKSZ_1 ], &blkszs[ MY_BLKSZ_1 ], bmults[ MY_BLKSZ_1 ], + + BLIS_VA_END + ); + + // <------------------------------------------------------------------------ + // <------------------------------------------------------------------------ + // <------------------------------------------------------------------------ +} diff --git a/build/plugin/bli_plugin_register.c b/build/plugin/bli_plugin_register.c new file mode 100644 index 0000000000..f711e39401 --- /dev/null +++ b/build/plugin/bli_plugin_register.c @@ -0,0 +1,81 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2023, Southern Methodist University + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include @PLUGIN_HEADER@ + +err_t bli_plugin_register_@plugin_name@ + ( + plugin_@plugin_name@_params + ) +{ + // ------------------------------------------------------------------------> + // -- Example Plugin Registration ----------------------------------------> + // ------------------------------------------------------------------------> + + // + // Register slots for new microkernels, preferences, and block sizes.
+ // + + err_t err = BLIS_SUCCESS; // Sticky status: the first failure skips all later registrations. + + if ( err == BLIS_SUCCESS ) err = bli_gks_register_blksz( &bszids[ MY_BLKSZ_1 ] ); + if ( err == BLIS_SUCCESS ) err = bli_gks_register_blksz( &bszids[ MY_BLKSZ_2 ] ); + if ( err == BLIS_SUCCESS ) err = bli_gks_register_ukr( &kerids[ MY_KERNEL_1 ] ); + if ( err == BLIS_SUCCESS ) err = bli_gks_register_ukr( &kerids[ MY_KERNEL_2 ] ); + if ( err == BLIS_SUCCESS ) err = bli_gks_register_ukr_pref( &prefids[ MY_PREF_1 ] ); + if ( err == BLIS_SUCCESS ) err = bli_gks_register_ukr_pref( &prefids[ MY_PREF_2 ] ); + + if ( err != BLIS_SUCCESS ) + return err; + + // <------------------------------------------------------------------------ + // <------------------------------------------------------------------------ + // <------------------------------------------------------------------------ + + // + // Initialize the context for each enabled sub-configuration. + // + + #undef GENTCONF + #define GENTCONF( CONFIG, config ) \ + PASTEMAC(plugin_init_@plugin_name@_,config) \ + ( \ + plugin_@plugin_name@_params_only \ + ); + + INSERT_GENTCONF + + return BLIS_SUCCESS; +} + diff --git a/build/plugin/config.mk.in b/build/plugin/config.mk.in new file mode 100644 index 0000000000..0d5989cbf7 --- /dev/null +++ b/build/plugin/config.mk.in @@ -0,0 +1,145 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2023, Southern Methodist University +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution.
+# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +ifndef CONFIG_MK_PLUGIN_INCLUDED +CONFIG_MK_PLUGIN_INCLUDED := yes + +# The installation prefix, exec_prefix, libdir, includedir, and sharedir +# values from configure tell us where to install the libraries, header files, +# and public makefile fragments. We must first assign each substituted +# @anchor@ to its own variable. Why? Because the substitutions may contain +# unevaluated variable expressions. For example, '@libdir@' may be replaced +# with '${exec_prefix}/lib'. By assigning the anchors to variables first, and +# then assigning them to their final INSTALL_* variables, we allow prefix and +# exec_prefix to be used in the definitions of exec_prefix, libdir, +# includedir, and sharedir. +prefix := @prefix@ +exec_prefix := @exec_prefix@ +libdir := @libdir@ +includedir := @includedir@ +sharedir := @sharedir@ + +# Override SHARE_PATH from common.mk so that e.g. make_defs.mk files from +# configurations are loaded from the installed share directory.
+SHARE_PATH := @sharedir@/blis + +# Override the source path locations to point to the plugin source, rather +# than the default values which assume the BLIS builtin source tree. +FRAME_DIR := . +DIST_PATH := @plugin_dir@ + +# Define the name of the global config.mk makefile. +GLOB_CONFIG_MK_FILE := $(sharedir)/blis/config.mk + +# Include the configuration file. +include $(GLOB_CONFIG_MK_FILE) + +# The name of the plugin. +PLUGIN_NAME := @plugin_name@ + +# This list contains some number of "kernel:config" pairs, where "config" +# specifies which configuration's compilation flags (CFLAGS) should be +# used to compile the source code for the kernel set named "kernel". +KCONFIG_MAP := @kconfig_map@ + +# The C compiler. +CC_VENDOR := @CC_VENDOR@ +CC := @CC@ + +# Important C compiler ranges. +GCC_OT_4_9_0 := @gcc_older_than_4_9_0@ +GCC_OT_6_1_0 := @gcc_older_than_6_1_0@ +GCC_OT_9_1_0 := @gcc_older_than_9_1_0@ +GCC_OT_10_3_0 := @gcc_older_than_10_3_0@ +CLANG_OT_9_0_0 := @clang_older_than_9_0_0@ +CLANG_OT_12_0_0 := @clang_older_than_12_0_0@ +AOCC_OT_2_0_0 := @aocc_older_than_2_0_0@ +AOCC_OT_3_0_0 := @aocc_older_than_3_0_0@ + +# The C++ compiler. +CXX := @CXX@ + +# The Fortran compiler. +FC := @FC@ + +# Static library indexer. +RANLIB := @RANLIB@ + +# Archiver. +AR := @AR@ + +# Preset (required) CFLAGS, CXXFLAGS, and LDFLAGS. These variables capture the value +# of the CFLAGS, CXXFLAGS, and LDFLAGS environment variables at configure-time (and/or +# the value of CFLAGS/CXXFLAGS/LDFLAGS if any was specified on the command line). +# These flags are used in addition to the flags automatically determined +# by the build system. +CFLAGS_PRESET := @cflags_preset@ +CXXFLAGS_PRESET := @cxxflags_preset@ +LDFLAGS_PRESET := @ldflags_preset@ + +# The level of debugging info to generate. +DEBUG_TYPE := @debug_type@ +ENABLE_DEBUG := @enable_debug@ + +# Whether to compile and link the AddressSanitizer library. 
+MK_ENABLE_ASAN := @enable_asan@ + +# Whether the compiler supports "#pragma omp simd" via the -fopenmp-simd option. +PRAGMA_OMP_SIMD := @pragma_omp_simd@ + +# Whether to output verbose command-line feedback as the Makefile is +# processed. +ENABLE_VERBOSE := @enable_verbose@ + +# Whether we need to employ an alternate method for passing object files to +# ar and/or the linker to work around a small value of ARG_MAX. +ARG_MAX_HACK := @enable_arg_max_hack@ + +# Whether to build the static and shared libraries. +# NOTE: The "MK_" prefix helps differentiate these variables from +# their corresponding cpp macros that use the BLIS_ prefix. +MK_ENABLE_STATIC := @mk_enable_static@ +MK_ENABLE_SHARED := @mk_enable_shared@ + +# Whether to use an install_name based on @rpath. +MK_ENABLE_RPATH := @enable_rpath@ + +# Whether to export all symbols within the shared library, even those symbols +# that are considered to be for internal use only. +EXPORT_SHARED := @export_shared@ + +# end of ifndef CONFIG_MK_PLUGIN_INCLUDED conditional block +endif diff --git a/frame/include/level0/ri/bli_absq2ris.h b/build/plugin/my_kernel_1_ref.c similarity index 77% rename from frame/include/level0/ri/bli_absq2ris.h rename to build/plugin/my_kernel_1_ref.c index 6698a51a1b..42fa593c10 100644 --- a/frame/include/level0/ri/bli_absq2ris.h +++ b/build/plugin/my_kernel_1_ref.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries.
- Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Southern Methodist University Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,32 +32,25 @@ */ -#ifndef BLIS_ABSQ2RIS_H -#define BLIS_ABSQ2RIS_H - -// absq2ris - -#define bli_sabsq2ris( ar, ai, br, bi ) \ -{ \ - (br) = (ar) * (ar); \ -} - -#define bli_dabsq2ris( ar, ai, br, bi ) \ -{ \ - (br) = (ar) * (ar); \ -} - -#define bli_cabsq2ris( ar, ai, br, bi ) \ -{ \ - (br) = (ar) * (ar) + (ai) * (ai); \ - (bi) = 0.0F; \ -} - -#define bli_zabsq2ris( ar, ai, br, bi ) \ +#include @PLUGIN_HEADER@ + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +\ +void PASTEMAC(ch,opname,arch,suf) \ + ( \ + int n, \ + const ctype* a, \ + ctype* x \ + ) \ { \ - (br) = (ar) * (ar) + (ai) * (ai); \ - (bi) = 0.0; \ + if ( bli_zero_dim1( n ) ) return; \ +\ + for ( dim_t i = 0; i < n; ++i ) \ + { \ + bli_tcopys( ch,ch, *a, x[ i ] ); \ + } \ } -#endif +INSERT_GENTFUNC_BASIC( my_kernel_1, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/build/plugin/my_kernel_1_zen3.c b/build/plugin/my_kernel_1_zen3.c new file mode 100644 index 0000000000..00c8163388 --- /dev/null +++ b/build/plugin/my_kernel_1_zen3.c @@ -0,0 +1,63 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Southern Methodist University + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include @PLUGIN_HEADER@ + +void bli_dmy_kernel_1_zen3 + ( + int n, + const double* a, + double* x + ) +{ + if ( bli_zero_dim1( n ) ) return; + + double a_local = *a; + dim_t i = 0; + + while ( i <= n-4 ) + { + x[ i+0 ] = a_local; + x[ i+1 ] = a_local; + x[ i+2 ] = a_local; + x[ i+3 ] = a_local; + i += 4; + } + + while ( i < n ) + { + x[ i ] = a_local; + i++; + } +} diff --git a/frame/include/level0/bb/bli_set0bbs_mxn.h b/build/plugin/my_kernel_2_ref.c similarity index 69% rename from frame/include/level0/bb/bli_set0bbs_mxn.h rename to build/plugin/my_kernel_2_ref.c index 3a44883f42..0d241b5c1a 100644 --- a/frame/include/level0/bb/bli_set0bbs_mxn.h +++ b/build/plugin/my_kernel_2_ref.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. 
- Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Southern Methodist University Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,43 +32,45 @@ */ -#ifndef BLIS_SET0BBS_MXN_H -#define BLIS_SET0BBS_MXN_H +#include @PLUGIN_HEADER@ -// set0bbs_mxn +#ifndef MY_KERNEL_2_ROW_MAJOR +#define MY_KERNEL_2_ROW_MAJOR 0 +#endif -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ch, opname, arch, suf ) \ \ -BLIS_INLINE void PASTEMAC(ch,opname) \ +void PASTEMAC(ch,opname,arch,suf) \ ( \ - const dim_t m, \ - const dim_t n, \ - ctype* restrict y, const inc_t incy, const inc_t ldy \ + int m, \ + int n, \ + ctype* a \ ) \ { \ - /* Assume that the duplication factor is the row stride of y. */ \ - const dim_t d = incy; \ - const dim_t ds_y = 1; \ + if ( bli_zero_dim1( m ) || bli_zero_dim1( n ) ) return; \ \ - for ( dim_t j = 0; j < n; ++j ) \ + if ( MY_KERNEL_2_ROW_MAJOR ) \ + { \ + for ( dim_t j = 0; j < n; ++j ) \ + { \ + for ( dim_t i = 0; i < m; ++i ) \ + { \ + bli_tseti0s( ch, a[ i*n + j ] ); \ + } \ + } \ + } \ + else \ { \ - ctype* restrict yj = y + j*ldy; \ -\ for ( dim_t i = 0; i < m; ++i ) \ { \ - ctype* restrict yij = yj + i*incy; \ -\ - for ( dim_t p = 0; p < d; ++p ) \ + for ( dim_t j = 0; j < n; ++j ) \ { \ - ctype* restrict yijd = yij + p*ds_y; \ -\ - PASTEMAC(ch,set0s)( *yijd ); \ + bli_tseti0s( ch, a[ i + j*m ] ); \ } \ } \ } \ } -INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) +INSERT_GENTFUNCCO_BASIC( my_kernel_2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) -#endif diff --git a/build/recu-sed.sh b/build/recu-sed.sh new file mode 100755 index 0000000000..e7a1d43db3 --- /dev/null +++ b/build/recu-sed.sh @@ -0,0 +1,488 @@ +#!/bin/bash + +# +# recursive-sed.sh +# +# Field G. Van Zee +# + +print_usage() +{ + # Echo usage info + echo " " + echo " "$script_name + echo " " + echo " Field G.
Van Zee" + echo " " + echo " Recursively descend a directory tree and perform sed commands, either on" + echo " the filename or the file contents, or both." + echo " " + echo " Usage:" + echo " ${script_name} [options]" + echo " " + echo " The following options are accepted:" + echo " " + echo " -d " + echo " Dry run. Go through all the motions, but don't actually" + echo " apply any of the sed expressions to file names or contents." + echo " -N " + echo " Do not proceed recursively into subdirectories; consider" + echo " only the files within the current directory. Default" + echo " behavior is to act recursively." + echo " -h " + echo " Consider hidden files and directories. Default behavior is" + echo " to ignore them." + echo " -n " + echo " Use svn mv instead of mv when renaming the file." + echo " Notice that this only applies if the filename changes." + echo " -p pattern " + echo " Specifies the filename pattern, as would be given to the" + echo " ls utility, to limit which files are affected. Default is" + echo " to consider all files present." + echo " -r dir" + echo " The root directory for the recursive action to be performed." + echo " Default is to use the current working directory." + echo " -v [0|1|2]" + echo " verboseness level" + echo " level 0: silent (no output)" + echo " level 1: default (one line per directory; suppress ls stderr)" + echo " level 2: verbose (one line per directory; show ls stderr)" + echo " " + echo " At least one of the following option-argument pairs is required:" + echo " " + echo " -f sed_expr " + echo " Specifies the sed expression that will be applied to the" + echo " filenames of the files touched by the script. This expression" + echo " must be a search-and-replace pattern." + echo " -c sed_expr " + echo " Specifies the sed expression that will be applied to the" + echo " contents of the files touched by the script. This expression" + echo " should be a search-and-replace pattern."
+ echo " -s sed_script" + echo " Specifies an arbitrary sed script that will be applied to the" + echo " file contents of the files touched by the script." + echo " " + echo " Note: -c and -s options are mutually exclusive." + echo " " + + # Exit with non-zero exit status + exit 1 +} + + + + +perform_sed() +{ + # Variables set by getopts. + local exist_dir="$1" + + #echo "exist_dir: $exist_dir" + + # The suffix used to create temporary files + local temp_file_suffix="sed_temp" + + # Check that exist_dir actually exists and is a directory + if [ ! -d "${exist_dir}" ]; then + echo "${script_name}: ${exist_dir} does not seem to be a valid directory." + exit 1 + fi + + # Check that the filename sed expression, if given, begins with an 's'. + if [ -n "$filename_sed_expr" ]; then + + # If it's a valid search-and-replace expression, this should return an 's'. + filename_sed_char=${filename_sed_expr%%/*} + + if [ "$filename_sed_char" != "s" ]; then + echo "${script_name}: sed expression given with -f must be search-and-replace." + exit 1 + fi + fi + + # Check that the sed script, if given, exists. + if [ -n "$contents_sed_script" ]; then + + if [ ! -f ${contents_sed_script} ]; then + echo "${script_name}: ${contents_sed_script} is not a regular file or does not exist." + exit 1 + fi + fi + + # Assume that the sed expression is a search-and-replace. Extract the patterns + # to match on. (Arbitrary sed expressions should be applied through a sed script.) + if [ "$filename_sed_expr" != "" ]; then + filename_sed_match=${filename_sed_expr#s/} + filename_sed_match=${filename_sed_match%%/*} + fi + + + # Get the list of source files in the directory given. Supress stderr if + # level 0 or 1 verbosity was requested.
+ #if [ "$verbose_level" != "2" ]; then + # old_filepaths=$(ls -d -b ${exist_dir}/${filename_pattern} 2> /dev/null) + #else + # old_filepaths="$(ls -d -b ${exist_dir}/${filename_pattern})" + #fi + + #echo $old_filepaths + #echo "$exist_dir/$filename_pattern" + + #for old_filepath in $old_filepaths; do + #echo "exist_dir: $exist_dir" + + # Find all files that match the pattern in the current directory. + find "${exist_dir}" -maxdepth 1 -name "${filename_pattern}" -print | while read old_filepath + do + #echo "old_filepath: $old_filepath" + + # Skip the current directory. + if [ "${old_filepath}" == "${exist_dir}" ]; then + continue + fi + + # Skip any non-regular files. + if [ ! -f "$old_filepath" ]; then + + # And say we are doing so if verboseness was requested. + if [ "$verbose_level" = "2" ]; then + echo "${script_name}: Ignoring $old_filepath" + fi + continue + fi + + # Strip exist_dir from filename. + old_filename=${old_filepath##*/} + + # Strip the filename from old_filepath to leave the directory path. + old_dirpath=${old_filepath%/*} + + # Create a new filename from the old one. If a filename sed expression was given, + # it will be applied now. + if [ "$filename_sed_expr" != "" ]; then + new_filename=$(echo "${old_filename}" | sed "${filename_sed_expr}") + else + new_filename="${old_filename}" + fi + + #echo "new_filename: $new_filename" + + # Create the filepath to the new file location. + new_filepath="${old_dirpath}/${new_filename}" + #echo "new_filepath: $new_filepath" + + # Grep for the filename pattern within the filename of the current file. + if [ "$filename_sed_expr" != "" ]; then + grep_filename=$(echo "${old_filename}" | grep "${filename_sed_match}") + fi + + + # If we are not performing a dry run, proceed. + if [ -z "$dry_run_flag" ]; then + + # Save the old file permissions so we can re-apply them to the + # new file if its contents change (ie: if it's not just a 'mv', + # which inherently preserves file permissions). 
+ old_perms=$(stat -c %a "${old_filepath}") + + # If the old and new filepaths are different, then we start off by + # renaming the file. (Otherwise, if the old and new filepaths are + # identical, then we don't need to do anything to the file.) If + # the user requested that we use svn mv, then do that, otherwise we + # use regular mv. + if [ "${old_filepath}" != "${new_filepath}" ]; then + + if [ -n "$use_svn_mv_flag" ]; then + + svn mv "${old_filepath}" "${new_filepath}" + else + + mv -f "${old_filepath}" "${new_filepath}" + fi + fi + #else + + # A dry run still needs the act upon the "new" file, so if the + # filepaths are different, simply set the new filepath to the + # old one. (We won't need the previous value of new_filepath + # anymore.) + #if [ "${old_filepath}" != "${new_filepath}" ]; then + # new_filepath="${old_filepath}" + #fi + fi + + # Handle the cases that might change the contents of the file. + if [ "$contents_sed_expr" != "" ] || + [ "$contents_sed_script" != "" ]; then + + # Execute the sed command based on whether the sed action was given + # as a command line expression or a script residing in a file. + if [ "$contents_sed_script" != "" ]; then + + # Perform the action, saving the result to a temporary file. + cat "${new_filepath}" | sed -f ${contents_sed_script} \ + > ${new_filepath}.${temp_file_suffix} + + elif [ "$contents_sed_expr" != "" ]; then + + # Perform the action, saving the result to a temporary file. + cat "${new_filepath}" | sed -e "${contents_sed_expr}" \ + > ${new_filepath}.${temp_file_suffix} + fi + + # Check the difference. + file_diff=$(diff "${new_filepath}" "${new_filepath}.${temp_file_suffix}") + + + # If we are not performing a dry run, proceed. + if [ -z "$dry_run_flag" ]; then + + # If the file contents change. + if [ -n "$file_diff" ]; then + + # Apply the old file permissions to the new file (before we + # potentially overwrite the old file with the new one). 
+ chmod ${old_perms} "${new_filepath}.${temp_file_suffix}" + + # Apply the file contents changes to the new filepath (which may + # or may not be the same as the old filepath). + mv -f "${new_filepath}.${temp_file_suffix}" "${new_filepath}" + + else + # Otherwise remove the new temporary file since it is identical + # to the original. + rm -f "${new_filepath}.${temp_file_suffix}" + fi + else + # Simply remove the file since we are only performing a dry run. + rm -f "${new_filepath}.${temp_file_suffix}" + fi + + fi + + # Check for dos2unix. If it's not here, we'll just substitute cat. + #type_dos2unix=$(type -path dos2unix) + #if [ -n "$type_dos2unix" ]; then + # dos2unix -q ${new_filepath} + #fi + + # Create a string that indicates what we are changing. We'll use this in + # the verbose progress echo to indicate how the file is or would be changed. + if [ -n "$grep_filename" ] && [ -n "$file_diff" ]; then + which_matches="filename/contents" + file_touched="yes" + elif [ -n "$grep_filename" ] && [ -z "$file_diff" ]; then + which_matches="filename " + file_touched="yes" + elif [ -z "$grep_filename" ] && [ -n "$file_diff" ]; then + which_matches=" contents" + file_touched="yes" + else + which_matches="" + file_touched="no" + fi + + # Be verbose, if requested, about which file we're looking at. + if [ "$verbose_level" != "0" ]; then + + # But we only need to output a line if the file was touched. + if [ "$file_touched" != "no" ]; then + + # Construct a relative filepath by stripping the initial root + # directory so that the output does not span as many columns on + # the terminal. + rel_old_filepath=${old_filepath#${initial_root_dir}/} + + # Add a "dry run" condition to the output if we're doing a dry-run + # so that the user knows we didn't really change anything. 
+ if [ -z "$dry_run_flag" ]; then + echo "$script_name: Changing [${which_matches}] of ${rel_old_filepath}" + else + echo "$script_name: Changing (dry run) [${which_matches}] of ${rel_old_filepath}" + fi + fi + fi + + done + + # Exit peacefully. + return 0 +} + + + + +recursive_sed() +{ + # Local variable declarations + local item sub_items curr_dir this_dir + + + # Extract our argument + curr_dir="$1" + + + # Call our function to perform the sed operations on the files in the + # directory given. + perform_sed "${curr_dir}" + + + # If we were asked to act recursively, then continue processing + # curr_dir's contents. + if [ "$recursive_flag" = "1" ]; then + + # Get a listing of items in the directory according to the hidden + # files/directories flag. + if [ -n "$hidden_files_dirs_flag" ]; then + + # Get a listing of the directories in curr_dir (including hidden + # files and directories). + sub_items=$(ls -a "$curr_dir") + + else + + # Get a listing of the directories in curr_dir. + sub_items=$(ls "$curr_dir") + fi + + #echo "sub_items: $sub_items" + + # Descend into the contents of curr_dir, calling recursive_sed on + # any items that are directories. + find "${curr_dir}" -maxdepth 1 -name "*" -print | while read item + do + + #echo "conisdering item: $item" + + # Skip the current directory. + if [ "${item}" == "${curr_dir}" ]; then + continue + fi + + # If item is a directory, descend into it. + if [ -d "$item" ]; then + + #echo "item is dir: $item" + + recursive_sed "$item" + fi + done + + fi + + + # Return peacefully + return 0 +} + + + + +main() +{ + # Variables set by getopts. + dry_run_flag="" + hidden_files_dirs_flag="" + use_svn_mv_flag="" + filename_pattern="" + root_dir="" + initial_root_dir="" + verbose_level="" + filename_sed_expr="" + contents_sed_expr="" + contents_sed_script="" + + recursive_flag="1" + + + # Get the script name + script_name=${0##*/} + + + # Local variable declarations. 
+ local item sub_items this_dir + + + # Process our command line options. + while getopts ":c:df:hp:r:s:nNv:" opt; do + case $opt in + d ) dry_run_flag="1" ;; + h ) hidden_files_dirs_flag="1" ;; + n ) use_svn_mv_flag="1" ;; + N ) recursive_flag="0" ;; + v ) verbose_level="$OPTARG" ;; + p ) filename_pattern="$OPTARG" ;; + r ) root_dir="$OPTARG" ;; + f ) filename_sed_expr="$OPTARG" ;; + c ) contents_sed_expr="$OPTARG" ;; + s ) contents_sed_script="$OPTARG" ;; + \? ) print_usage + esac + done + shift $(($OPTIND - 1)) + + + # Make sure we've parsed all command line arguments by now. + if [ $# != "0" ]; then + echo "${script_name}: Unparsed command line arguments! Try running with no arguments for help." + exit 1 + fi + + + # Make sure we received at least one of the required options. + if [ -z "$filename_sed_expr" ] && + [ -z "$contents_sed_expr" ] && + [ -z "$contents_sed_script" ]; then + print_usage + fi + + + # Make sure that both a file contents sed expression and sed script were + # not given. + if [ "$contents_sed_expr" != "" ] && + [ "$contents_sed_script" != "" ] ; then + echo "${script_name}: The -c and -s options may not be used at the same time." + exit 1 + fi + + + # Make sure that verboseness level is valid. + if [ "$verbose_level" != "0" ] && + [ "$verbose_level" != "1" ] && + [ "$verbose_level" != "2" ]; then + verbose_level="1" + fi + + # Prepare the filename pattern arguments to perform_sed(). + if [ "$filename_pattern" = "" ] ; then + filename_pattern='*' + fi + + # Prepare the directory arguments to perform_sed(). + if [ "$root_dir" != "" ] ; then + + # Strip / from end of directory paths, if there is one. + root_dir=${root_dir%/} + else + root_dir=$PWD + fi + initial_root_dir=${root_dir} + + + #echo "root_dir: $root_dir" + + + # Begin recursing on the root directory. + recursive_sed "$root_dir" + + + # Exit peacefully + return 0 +} + + + + +# The script's main entry point, passing all parameters given. 
+main "$@" + diff --git a/so_version b/build/so_version similarity index 100% rename from so_version rename to build/so_version diff --git a/build/version b/build/version new file mode 100644 index 0000000000..7a1511416b --- /dev/null +++ b/build/version @@ -0,0 +1 @@ +3.0-dev diff --git a/travis/cpuid/excavator.def b/ci/cpuid/excavator.def similarity index 100% rename from travis/cpuid/excavator.def rename to ci/cpuid/excavator.def diff --git a/travis/cpuid/haswell.def b/ci/cpuid/haswell.def similarity index 100% rename from travis/cpuid/haswell.def rename to ci/cpuid/haswell.def diff --git a/travis/cpuid/penryn.def b/ci/cpuid/penryn.def similarity index 100% rename from travis/cpuid/penryn.def rename to ci/cpuid/penryn.def diff --git a/travis/cpuid/piledriver.def b/ci/cpuid/piledriver.def similarity index 100% rename from travis/cpuid/piledriver.def rename to ci/cpuid/piledriver.def diff --git a/travis/cpuid/sandybridge.def b/ci/cpuid/sandybridge.def similarity index 100% rename from travis/cpuid/sandybridge.def rename to ci/cpuid/sandybridge.def diff --git a/travis/cpuid/skx.def b/ci/cpuid/skx.def similarity index 100% rename from travis/cpuid/skx.def rename to ci/cpuid/skx.def diff --git a/travis/cpuid/skx1.def b/ci/cpuid/skx1.def similarity index 100% rename from travis/cpuid/skx1.def rename to ci/cpuid/skx1.def diff --git a/travis/cpuid/steamroller.def b/ci/cpuid/steamroller.def similarity index 100% rename from travis/cpuid/steamroller.def rename to ci/cpuid/steamroller.def diff --git a/travis/cpuid/zen.def b/ci/cpuid/zen.def similarity index 100% rename from travis/cpuid/zen.def rename to ci/cpuid/zen.def diff --git a/ci/cpuid/zen2.def b/ci/cpuid/zen2.def new file mode 100644 index 0000000000..1e2cc63906 --- /dev/null +++ b/ci/cpuid/zen2.def @@ -0,0 +1,87 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. 
+# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CPU: AMD EPYC 7742 +# NOTE: This file was copied from zen.def and then the appropriate bits +# in the first field (eax) of leaf 1 were updated to reflect the Zen2 +# "Rome" processor. See [1] for details. 
+# [1] https://en.wikichip.org/wiki/amd/cpuid +# +00000000 ******** => 0000000D 68747541 444D4163 69746E65 +00000001 ******** => 00830F12 00400800 7ED8320B 178BFBFF +00000002 ******** => 00000000 00000000 00000000 00000000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00000011 +00000006 ******** => 00000004 00000000 00000001 00000000 +00000007 ******** => 00000000 209C01A9 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 00000000 00000000 00000000 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 00000007 00000340 00000340 00000000 +0000000D 00000001 => 0000000F 00000340 00000000 00000000 +0000000D 00000002 => 00000100 00000240 00000000 00000000 +80000000 ******** => 8000001F 68747541 444D4163 69746E65 +80000001 ******** => 00800F12 40000000 35C233FF 2FD3FBFF +80000002 ******** => 20444D41 43595045 35353720 33205031 +80000003 ******** => 6F432D32 50206572 65636F72 726F7373 +80000004 ******** => 20202020 20202020 20202020 00202020 +80000005 ******** => FF40FF40 FF40FF40 20080140 40040140 +80000006 ******** => 36006400 56006400 02006140 0200C140 +80000007 ******** => 00000000 0000001B 00000000 00006799 +80000008 ******** => 00003030 00000007 0000603F 00000000 +80000009 ******** => 00000000 00000000 00000000 00000000 +8000000A ******** => 00000001 00008000 00000000 0001BCFF +8000000B ******** => 00000000 00000000 00000000 00000000 +8000000C ******** => 00000000 00000000 00000000 00000000 +8000000D ******** => 00000000 00000000 00000000 00000000 +8000000E ******** => 00000000 00000000 00000000 00000000 +8000000F ******** => 00000000 00000000 00000000 00000000 +80000010 ******** => 00000000 00000000 00000000 00000000 +80000011 ******** => 00000000 00000000 00000000 00000000 +80000012 ******** => 00000000 00000000 00000000 00000000 +80000013 ******** => 00000000 
00000000 00000000 00000000 +80000014 ******** => 00000000 00000000 00000000 00000000 +80000015 ******** => 00000000 00000000 00000000 00000000 +80000016 ******** => 00000000 00000000 00000000 00000000 +80000017 ******** => 00000000 00000000 00000000 00000000 +80000018 ******** => 00000000 00000000 00000000 00000000 +80000019 ******** => F040F040 00000000 00000000 00000000 +8000001A ******** => 00000003 00000000 00000000 00000000 +8000001B ******** => 000003FF 00000000 00000000 00000000 +8000001C ******** => 00000000 00000000 00000000 00000000 +8000001D 00000000 => 00004121 01C0003F 0000003F 00000000 +8000001D 00000001 => 00004122 00C0003F 000000FF 00000000 +8000001D 00000002 => 00004143 01C0003F 000003FF 00000002 +8000001D 00000003 => 0001C163 03C0003F 00001FFF 00000001 +8000001E ******** => 00000000 00000100 00000300 00000000 +8000001F ******** => 0000000F 0000016F 0000000F 00000001 +8FFFFFFF ******** => 00000000 00000000 00000000 00000000 diff --git a/ci/cpuid/zen3.def b/ci/cpuid/zen3.def new file mode 100644 index 0000000000..ed791813ea --- /dev/null +++ b/ci/cpuid/zen3.def @@ -0,0 +1,87 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CPU: AMD EPYC 7xxx +# NOTE: This file was copied from zen.def and then the appropriate bits +# in the first field (eax) of leaf 1 were updated to reflect the Zen3 +# "Milan" processor. See [1] for details. 
+# [1] https://en.wikichip.org/wiki/amd/cpuid +# +00000000 ******** => 0000000D 68747541 444D4163 69746E65 +00000001 ******** => 00A00F12 00400800 7ED8320B 178BFBFF +00000002 ******** => 00000000 00000000 00000000 00000000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00000011 +00000006 ******** => 00000004 00000000 00000001 00000000 +00000007 ******** => 00000000 209C01A9 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 00000000 00000000 00000000 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 00000007 00000340 00000340 00000000 +0000000D 00000001 => 0000000F 00000340 00000000 00000000 +0000000D 00000002 => 00000100 00000240 00000000 00000000 +80000000 ******** => 8000001F 68747541 444D4163 69746E65 +80000001 ******** => 00800F12 40000000 35C233FF 2FD3FBFF +80000002 ******** => 20444D41 43595045 35353720 33205031 +80000003 ******** => 6F432D32 50206572 65636F72 726F7373 +80000004 ******** => 20202020 20202020 20202020 00202020 +80000005 ******** => FF40FF40 FF40FF40 20080140 40040140 +80000006 ******** => 36006400 56006400 02006140 0200C140 +80000007 ******** => 00000000 0000001B 00000000 00006799 +80000008 ******** => 00003030 00000007 0000603F 00000000 +80000009 ******** => 00000000 00000000 00000000 00000000 +8000000A ******** => 00000001 00008000 00000000 0001BCFF +8000000B ******** => 00000000 00000000 00000000 00000000 +8000000C ******** => 00000000 00000000 00000000 00000000 +8000000D ******** => 00000000 00000000 00000000 00000000 +8000000E ******** => 00000000 00000000 00000000 00000000 +8000000F ******** => 00000000 00000000 00000000 00000000 +80000010 ******** => 00000000 00000000 00000000 00000000 +80000011 ******** => 00000000 00000000 00000000 00000000 +80000012 ******** => 00000000 00000000 00000000 00000000 +80000013 ******** => 00000000 
00000000 00000000 00000000 +80000014 ******** => 00000000 00000000 00000000 00000000 +80000015 ******** => 00000000 00000000 00000000 00000000 +80000016 ******** => 00000000 00000000 00000000 00000000 +80000017 ******** => 00000000 00000000 00000000 00000000 +80000018 ******** => 00000000 00000000 00000000 00000000 +80000019 ******** => F040F040 00000000 00000000 00000000 +8000001A ******** => 00000003 00000000 00000000 00000000 +8000001B ******** => 000003FF 00000000 00000000 00000000 +8000001C ******** => 00000000 00000000 00000000 00000000 +8000001D 00000000 => 00004121 01C0003F 0000003F 00000000 +8000001D 00000001 => 00004122 00C0003F 000000FF 00000000 +8000001D 00000002 => 00004143 01C0003F 000003FF 00000002 +8000001D 00000003 => 0001C163 03C0003F 00001FFF 00000001 +8000001E ******** => 00000000 00000100 00000300 00000000 +8000001F ******** => 0000000F 0000016F 0000000F 00000001 +8FFFFFFF ******** => 00000000 00000000 00000000 00000000 diff --git a/travis/cxx/Makefile b/ci/cxx/Makefile similarity index 100% rename from travis/cxx/Makefile rename to ci/cxx/Makefile diff --git a/travis/cxx/cxx-test.cxx b/ci/cxx/cxx-test.cxx similarity index 100% rename from travis/cxx/cxx-test.cxx rename to ci/cxx/cxx-test.cxx diff --git a/travis/cxx/cxx-test.sh b/ci/cxx/cxx-test.sh similarity index 93% rename from travis/cxx/cxx-test.sh rename to ci/cxx/cxx-test.sh index c0036611f4..52402867d7 100755 --- a/travis/cxx/cxx-test.sh +++ b/ci/cxx/cxx-test.sh @@ -50,9 +50,9 @@ if [ ! -e $INCLUDE_DIR/blis.h ]; then exit 1 fi -if [ ! -e $SOURCE_DIR/travis/cxx/Makefile ]; then +if [ ! 
-e $SOURCE_DIR/ci/cxx/Makefile ]; then echo "could not find cxx-test Makefile" exit 1 fi -make -C $SOURCE_DIR/travis/cxx INCLUDE_DIR=$INCLUDE_DIR LIB_DIR=$LIB_DIR BUILD_DIR=$BUILD_DIR +make -C $SOURCE_DIR/ci/cxx INCLUDE_DIR=$INCLUDE_DIR LIB_DIR=$LIB_DIR BUILD_DIR=$BUILD_DIR diff --git a/ci/do_level0.sh b/ci/do_level0.sh new file mode 100755 index 0000000000..792e075904 --- /dev/null +++ b/ci/do_level0.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -e +set -x + +: ${SRCDIR:=../..} + +if ! [ -d test/level0 ]; then + mkdir -p test/level0 + ln -s $SRCDIR/test/level0/* test/level0/ +fi + +cd test/level0 +make -j2 + +./test_l0.x diff --git a/ci/do_riscv.sh b/ci/do_riscv.sh new file mode 100755 index 0000000000..82b6afee62 --- /dev/null +++ b/ci/do_riscv.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +set -e +set -x + +TAG=2024.08.03 + +# The prebuilt toolchains only support hardfloat, so we only +# test these for now. +case $1 in + "rv32iv") + TARBALL=riscv32-glibc-ubuntu-20.04-gcc-nightly-${TAG}-nightly.tar.gz + ;; + "rv64iv") + TARBALL=riscv64-glibc-ubuntu-20.04-gcc-nightly-${TAG}-nightly.tar.gz + ;; + "sifive_x280") + TARBALL=riscv64-glibc-ubuntu-20.04-llvm-nightly-${TAG}-nightly.tar.gz + ;; + *) + exit 1 + ;; +esac + +TOOLCHAIN_PATH=$DIST_PATH/../toolchain +TOOLCHAIN_URL=https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/${TAG}/${TARBALL} + +mkdir -p $TOOLCHAIN_PATH +cd $TOOLCHAIN_PATH + +wget $TOOLCHAIN_URL +tar -xf $TARBALL + +# Once CI upgrades to jammy, the next three lines can be removed. +# The qemu version installed via packages (qemu-user qemu-user-binfmt) +# is sufficient. 
+TARBALL_QEMU=qemu-riscv-2023.02.25-ubuntu-20.04.tar.gz +wget https://github.com/flame/ci-utils/raw/master/riscv/${TARBALL_QEMU} +tar -xf $TARBALL_QEMU diff --git a/ci/do_sde.sh b/ci/do_sde.sh new file mode 100755 index 0000000000..05a664b666 --- /dev/null +++ b/ci/do_sde.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -e +set -x + +SDE_VERSION=sde-external-8.69.1-2021-07-18-lin +SDE_TARBALL=$SDE_VERSION.tar.bz2 +SDE=$SDE_VERSION/sde64 + +# +# This doesn't seem to be necessary anymore +# +#curl --verbose --form accept_license=1 --form form_id=intel_licensed_dls_step_1 \ +# --output /dev/null --cookie-jar jar.txt \ +# --location https://software.intel.com/protected-download/267266/144917 +#curl --verbose --cookie jar.txt --output $SDE_TARBALL \ +# https://software.intel.com/system/files/managed/2a/1a/$SDE_TARBALL + +#curl --verbose --output $SDE_TARBALL \ +# https://software.intel.com/content/dam/develop/external/us/en/documents/downloads/$SDE_TARBALL + +CI_UTILS=ci-utils +CI_UTILS_URL=https://github.com/flame/${CI_UTILS}.git +CI_UTILS_SDE_DIR=sde +SDE_DIRPATH=$CI_UTILS/$CI_UTILS_SDE_DIR + +git clone $CI_UTILS_URL +mv $SDE_DIRPATH/$SDE_TARBALL . + +tar xvf $SDE_TARBALL + +make -j2 testsuite-bin blastest-bin + +for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator zen generic; do + export BLIS_ARCH_TYPE=-1 + + if [ "$ARCH" = "knl" ]; then + TESTSUITE_WRAPPER="$SDE -knl --" + elif [ "$ARCH" = "sandybridge" ]; then + # The sandybridge.def file causes a segfault in SDE on some systems. + # Instead, use the CPUID values for haswell, but force BLIS to use the + # sandybridge configuration. + TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --" + export BLIS_ARCH_TYPE=4 + elif [ "$ARCH" = "piledriver" ]; then + # We used to "patch" ld.so and libm to remove CPUID checks so that glibc + # wouldn't try to use instructions not supported by SDE (FMA4). 
That no + # longer works, so test Piledriver/Steamroller/Excavator as haswell + # but with the configuration forced via environment variable. + TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --" + export BLIS_ARCH_TYPE=11 + elif [ "$ARCH" = "steamroller" ]; then + TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --" + export BLIS_ARCH_TYPE=10 + elif [ "$ARCH" = "excavator" ]; then + TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --" + export BLIS_ARCH_TYPE=9 + elif [ "$ARCH" = "generic" ]; then + TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/haswell.def --" + export BLIS_ARCH_TYPE=33 + else + TESTSUITE_WRAPPER="$SDE -cpuid_in $DIST_PATH/ci/cpuid/$ARCH.def --" + fi + + make TESTSUITE_WRAPPER="$TESTSUITE_WRAPPER" check + + TMP=`grep "active sub-configuration" output.testsuite` + CONFIG=${TMP##* } + if [ "$CONFIG" != "$ARCH" ]; then + echo "Wrong configuration chosen:" + echo " Expected: $ARCH" + echo " Got: $CONFIG" + exit 1 + fi +done + diff --git a/ci/do_testsuite.sh b/ci/do_testsuite.sh new file mode 100755 index 0000000000..9ecd092367 --- /dev/null +++ b/ci/do_testsuite.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +set -e +set -x + +export BLIS_JC_NT=1 +export BLIS_IC_NT=2 +export BLIS_JR_NT=1 +export BLIS_IR_NT=1 +export BLIS_THREAD_IMPL="single" + +if [ "$TEST" = "FAST" -o "$TEST" = "ALL" ]; then + make testblis-fast + cat ./output.testsuite + $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite + + for impl in $(echo $THR | sed 's/none//' | tr , ' '); do + export BLIS_THREAD_IMPL="$impl" + make testblis-fast + cat ./output.testsuite + $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite + done +fi + +if [ "$TEST" = "MD" -o "$TEST" = "ALL" ]; then + make testblis-md + cat ./output.testsuite + $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite +fi + +if [ "$TEST" = "SALT" -o "$TEST" = "ALL" ]; then + # Disable multithreading within BLIS. 
+ export BLIS_JC_NT=1 BLIS_IC_NT=1 BLIS_JR_NT=1 BLIS_IR_NT=1 + make testblis-salt + cat ./output.testsuite + $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite +fi + +if [ "$TEST" = "1" -o "$TEST" = "ALL" ]; then + make testblis + cat ./output.testsuite + $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite +fi + +export BLIS_THREAD_IMPL="single" +make testblas +cat ./output.testsuite +$DIST_PATH/blastest/check-blastest.sh + diff --git a/common.mk b/common.mk index 2da306d792..584eb20f43 100644 --- a/common.mk +++ b/common.mk @@ -63,7 +63,6 @@ $(eval $(call store-var-for,CC, $(1))) $(eval $(call store-var-for,CC_VENDOR, $(1))) $(eval $(call store-var-for,CPPROCFLAGS,$(1))) $(eval $(call store-var-for,CLANGFLAGS, $(1))) -$(eval $(call store-var-for,CXXLANGFLAGS,$(1))) $(eval $(call store-var-for,CMISCFLAGS, $(1))) $(eval $(call store-var-for,CPICFLAGS, $(1))) $(eval $(call store-var-for,CWARNFLAGS, $(1))) @@ -101,25 +100,41 @@ get-noopt-cflags-for = $(strip $(CFLAGS_PRESET) \ $(call load-var-for,CLANGFLAGS,$(1)) \ $(call load-var-for,CPPROCFLAGS,$(1)) \ $(CTHREADFLAGS) \ - $(CINCFLAGS) $(VERS_DEF) \ + $(CINCFLAGS) \ ) -get-noopt-cxxflags-for = $(strip $(CFLAGS_PRESET) \ +get-noopt-cxxflags-for = $(strip $(CXXFLAGS_PRESET) \ $(call load-var-for,CDBGFLAGS,$(1)) \ $(call load-var-for,CWARNFLAGS,$(1)) \ $(call load-var-for,CPICFLAGS,$(1)) \ $(call load-var-for,CMISCFLAGS,$(1)) \ - $(call load-var-for,CXXLANGFLAGS,$(1)) \ $(call load-var-for,CPPROCFLAGS,$(1)) \ + $(CXXLANGFLAGS) \ $(CTHREADFLAGS) \ - $(CINCFLAGS) $(VERS_DEF) \ + $(CXXTHREADFLAGS) \ + $(CINCFLAGS) \ ) get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ -DBLIS_CNAME=$(1) \ + -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \ + $(BUILD_ASANFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ + -DBLIS_IN_REF_KERNEL=1 \ + -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ + ) + +get-refinit-cxxflags-for = $(strip $(call 
load-var-for,COPTFLAGS,$(1)) \ + $(call get-noopt-cxxflags-for,$(1)) \ + -DBLIS_CNAME=$(1) \ + -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \ + $(BUILD_ASANFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + -DBLIS_IN_REF_KERNEL=1 \ + -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ @@ -127,18 +142,55 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ $(COMPSIMDFLAGS) \ -DBLIS_CNAME=$(1) \ + -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \ + $(BUILD_ASANFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ + -DBLIS_IN_REF_KERNEL=1 \ + -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ + ) + +get-refkern-cxxflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ + $(call load-var-for,CRVECFLAGS,$(1)) \ + $(call get-noopt-cxxflags-for,$(1)) \ + $(COMPSIMDFLAGS) \ + -DBLIS_CNAME=$(1) \ + -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \ + $(BUILD_ASANFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + -DBLIS_IN_REF_KERNEL=1 \ + -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ + -DBLIS_CNAME=$(1) \ + -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \ + $(BUILD_ASANFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) + +get-config-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ + $(call get-noopt-cxxflags-for,$(1)) \ + -DBLIS_CNAME=$(1) \ + -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \ + $(BUILD_ASANFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) get-frame-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ + $(BUILD_ASANFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) + +get-frame-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ + $(call get-noopt-cxxflags-for,$(1)) \ + $(BUILD_ASANFLAGS) \ $(BUILD_CPPFLAGS) \ 
$(BUILD_SYMFLAGS) \ ) @@ -146,24 +198,61 @@ get-frame-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \ $(call load-var-for,CKVECFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ + -DBLIS_CNAME=$(1) \ + -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) + +get-kernel-cxxflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \ + $(call load-var-for,CKVECFLAGS,$(1)) \ + $(call get-noopt-cxxflags-for,$(1)) \ + -DBLIS_CNAME=$(1) \ + -DBLIS_CNAME_UPPER=$(shell echo $(1) | tr a-z A-Z) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) + +# When compiling addons, we use flags similar to those of general framework +# source. This ensures that the same code can be linked and run across various +# sub-configurations. +get-addon-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ + $(call get-noopt-cflags-for,$(1)) \ + $(CADDONINCFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) +get-addon-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ + $(call get-noopt-cxxflags-for,$(1)) \ + $(CADDONINCFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) +# When compiling addon kernels, we use flags similar to those of kernels +# flags, except we also include the addon header paths. +get-addon-kernel-c99flags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \ + $(call load-var-for,CKVECFLAGS,$(1)) \ + $(call get-noopt-cflags-for,$(1)) \ + $(CADDONINCFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) # When compiling sandboxes, we use flags similar to those of general framework # source. This ensures that the same code can be linked and run across various -# sub-configurations. (If we switch to using refkern/kernel flags, we should -# prevent enabling sandboxes for umbrella families by verifying that -# config_list == config_name if --enable-sandbox is given.) +# sub-configurations. 
(NOTE: If we ever switch to using refkernel or kernel +# flags, we should prevent enabling sandboxes for umbrella families by verifying +# that config_list == config_name if --enable-sandbox is given. THIS ALSO +# APPLIES TO ADDONS ABOVE.) get-sandbox-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ - $(CSBOXINCFLAGS) \ + $(CSANDINCFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cxxflags-for,$(1)) \ - $(CSBOXINCFLAGS) \ + $(CSANDINCFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) @@ -171,25 +260,36 @@ get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ # Define a separate function that will return appropriate flags for use by # applications that want to use the same basic flags as those used when BLIS # was compiled. (NOTE: This is the same as the $(get-frame-cflags-for ...) -# function, except that it omits two variables that contain flags exclusively -# for use when BLIS is being compiled/built: BUILD_CPPFLAGS, which contains a -# cpp macro that confirms that BLIS is being built; and BUILD_SYMFLAGS, which -# contains symbol export flags that are only needed when a shared library is -# being compiled/linked.) +# function, except that it omits a few variables that contain flags exclusively +# for use when BLIS is being compiled/built: +# - BUILD_CPPFLAGS, which contains a cpp macro that confirms that BLIS +# is being built; +# - BUILD_SYMFLAGS, which contains symbol export flags that are only +# needed when a shared library is being compiled/linked; and +# - BUILD_ASANFLAGS, which contains a flag that causes the compiler to +# insert instrumentation for memory error detection. get-user-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ ) # Define functions that return messages appropriate for each non-verbose line # of compilation output. 
-get-noopt-text = "(CFLAGS for no optimization)" -get-refinit-text-for = "('$(1)' CFLAGS for ref. kernel init)" -get-refkern-text-for = "('$(1)' CFLAGS for ref. kernels)" -get-config-text-for = "('$(1)' CFLAGS for config code)" -get-frame-text-for = "('$(1)' CFLAGS for framework code)" -get-kernel-text-for = "('$(1)' CFLAGS for kernels)" -get-sandbox-c99text-for = "('$(1)' CFLAGS for sandboxes)" -get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" +get-noopt-text = "(CFLAGS for no optimization)" +get-refinit-text-for = "('$(1)' CFLAGS for ref. kernel init)" +get-refinit-cxxtext-for = "('$(1)' CXXFLAGS for ref. kernel init)" +get-refkern-text-for = "('$(1)' CFLAGS for ref. kernels)" +get-refkern-cxxtext-for = "('$(1)' CXXFLAGS for ref. kernels)" +get-config-text-for = "('$(1)' CFLAGS for config code)" +get-config-cxxtext-for = "('$(1)' CXXFLAGS for config code)" +get-frame-text-for = "('$(1)' CFLAGS for framework code)" +get-frame-cxxtext-for = "('$(1)' CXXFLAGS for framework code)" +get-kernel-text-for = "('$(1)' CFLAGS for kernels)" +get-kernel-cxxtext-for = "('$(1)' CXXFLAGS for kernels)" +get-addon-c99text-for = "('$(1)' CFLAGS for addons)" +get-addon-cxxtext-for = "('$(1)' CXXFLAGS for addons)" +get-addon-kernel-text-for = "('$(1)' CFLAGS for addon kernels)" +get-sandbox-c99text-for = "('$(1)' CFLAGS for sandboxes)" +get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" @@ -202,6 +302,11 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" files-that-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),))) files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f)))) +# Define a function that removes duplicate strings *without* using the sort +# function. 
+rm-dups = $(if $1,$(firstword $1) $(call rm-dups,$(filter-out $(firstword $1),$1))) + + # # --- Include makefile configuration file -------------------------------------- @@ -283,9 +388,12 @@ FRAGMENT_MK := .fragment.mk # Locations of important files. BUILD_DIR := build CONFIG_DIR := config +ifeq ($(FRAME_DIR),) FRAME_DIR := frame +endif REFKERN_DIR := ref_kernels KERNELS_DIR := kernels +ADDON_DIR := addon SANDBOX_DIR := sandbox OBJ_DIR := obj LIB_DIR := lib @@ -302,17 +410,28 @@ REFNM := ref # Source suffixes. CONFIG_SRC_SUFS := c - KERNELS_SRC_SUFS := c s S - +ifneq ($(findstring hpx,$(THREADING_MODEL)),) +FRAME_SRC_SUFS := c cpp +else FRAME_SRC_SUFS := c +endif + +ADDON_C99_SUFS := c +ADDON_CXX_SUFS := cc cpp cxx +ADDON_SRC_SUFS := $(ADDON_C99_SUFS) $(ADDON_CXX_SUFS) SANDBOX_C99_SUFS := c SANDBOX_CXX_SUFS := cc cpp cxx SANDBOX_SRC_SUFS := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS) # Header suffixes. -FRAME_HDR_SUFS := h +FRAME_H99_SUFS := h +FRAME_HDR_SUFS := $(FRAME_H99_SUFS) + +ADDON_H99_SUFS := h +ADDON_HXX_SUFS := hh hpp hxx +ADDON_HDR_SUFS := $(ADDON_H99_SUFS) $(ADDON_HXX_SUFS) SANDBOX_H99_SUFS := h SANDBOX_HXX_SUFS := hh hpp hxx @@ -320,9 +439,11 @@ SANDBOX_HDR_SUFS := $(SANDBOX_H99_SUFS) $(SANDBOX_HXX_SUFS) # Combine all header suffixes and remove duplicates via sort(). ALL_HDR_SUFS := $(sort $(FRAME_HDR_SUFS) \ + $(ADDON_HDR_SUFS) \ $(SANDBOX_HDR_SUFS) ) -ALL_H99_SUFS := $(sort $(FRAME_HDR_SUFS) \ +ALL_H99_SUFS := $(sort $(FRAME_H99_SUFS) \ + $(ADDON_H99_SUFS) \ $(SANDBOX_H99_SUFS) ) # The names of scripts that check output from the BLAS test drivers and @@ -349,12 +470,15 @@ SHELL := bash # Construct paths to the four primary directories of source code: # the config directory, general framework code, reference kernel code, -# and optimized kernel code. +# and optimized kernel code. Also process paths for addon and sandbox +# directories. 
CONFIG_PATH := $(DIST_PATH)/$(CONFIG_DIR) FRAME_PATH := $(DIST_PATH)/$(FRAME_DIR) REFKERN_PATH := $(DIST_PATH)/$(REFKERN_DIR) KERNELS_PATH := $(DIST_PATH)/$(KERNELS_DIR) +ADDON_PATH := $(DIST_PATH)/$(ADDON_DIR) SANDBOX_PATH := $(DIST_PATH)/$(SANDBOX_DIR) +BUILD_PATH := $(DIST_PATH)/$(BUILD_DIR) # Construct paths to some optional C++ template headers contributed by AMD. VEND_CPP_PATH := $(DIST_PATH)/$(VEND_CPP_DIR) @@ -367,6 +491,7 @@ CONFIG_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(CONFIG_DIR) FRAME_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(FRAME_DIR) REFKERN_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(REFKERN_DIR) KERNELS_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(KERNELS_DIR) +ADDON_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(ADDON_DIR) SANDBOX_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(SANDBOX_DIR) @@ -397,7 +522,7 @@ LIBBLIS := libblis ifeq ($(OS_NAME),Darwin) SHLIB_EXT := dylib else ifeq ($(IS_WIN),yes) -ifeq ($(CC_VENDOR),gcc) +ifeq ($(IS_MSVC),no) SHLIB_EXT := dll.a else SHLIB_EXT := lib @@ -446,6 +571,8 @@ else LIBBLIS_SO_OUTPUT_NAME := $(LIBBLIS_SO_PATH) endif + + # # --- Utility program definitions ---------------------------------------------- # @@ -461,6 +588,7 @@ GREP := grep EGREP := grep -E XARGS := xargs INSTALL := install -c +DEVNULL := /dev/null # Script for creating a monolithic header file. #FLATTEN_H := $(DIST_PATH)/build/flatten-headers.sh @@ -473,6 +601,11 @@ ARFLAGS := cr GIT := git GIT_LOG := $(GIT) log --decorate +# Define the locations of a script to generate a list of shared library symbols +# within BLIS as well as the symbol file itself. +GEN_SYMS := $(BUILD_PATH)/gen-libblis-symbols.sh +SYM_FILE := $(BUILD_PATH)/libblis-symbols.def + # @@ -485,7 +618,7 @@ GIT_LOG := $(GIT) log --decorate # manually override whatever they need. # Define the external libraries we may potentially need at link-time. 
-ifeq ($(IS_WIN),yes) +ifeq ($(IS_MSVC),yes) LIBM := else LIBM := -lm @@ -513,6 +646,11 @@ ifeq ($(DEBUG_TYPE),sde) LDFLAGS := $(filter-out $(LIBMEMKIND),$(LDFLAGS)) endif +# If AddressSanitizer is enabled, add the compiler flag to LDFLAGS. +ifeq ($(MK_ENABLE_ASAN),yes) +LDFLAGS += -fsanitize=address +endif + # Specify the shared library's 'soname' field. # NOTE: The flag for creating shared objects is different for Linux and OS X. ifeq ($(OS_NAME),Darwin) @@ -527,7 +665,7 @@ else SOFLAGS := -shared ifeq ($(IS_WIN),yes) # Windows shared library link flags. -ifeq ($(CC_VENDOR),clang) +ifeq ($(IS_MSVC),yes) SOFLAGS += -Wl,-implib:$(BASE_LIB_PATH)/$(LIBBLIS).lib else SOFLAGS += -Wl,--out-implib,$(BASE_LIB_PATH)/$(LIBBLIS).dll.a @@ -570,6 +708,7 @@ endif endif + # # --- Include makefile definitions file ---------------------------------------- # @@ -625,17 +764,30 @@ endif # --- Linker program --- -# Use whatever compiler was chosen. +# Use whatever compiler was chosen. A C++ compiler must be used if HPX is enabled. +ifneq ($(findstring hpx,$(THREADING_MODEL)),) +LINKER := $(CXX) +else LINKER := $(CC) +endif # --- Warning flags --- CWARNFLAGS := +# Do not allow functions with implicit definitions to be called +ifneq ($(CC_VENDOR),ibm) +CWARNFLAGS += -Werror=implicit-function-declaration +endif + # Disable unused function warnings and stop compiling on first error for # all compilers that accept such options: gcc, clang, and icc. ifneq ($(CC_VENDOR),ibm) +ifneq ($(CC_VENDOR),nvc) CWARNFLAGS += -Wall -Wno-unused-function -Wfatal-errors +else +CWARNFLAGS += -Wall -Wno-unused-function +endif endif # Disable tautological comparision warnings in clang. @@ -643,22 +795,33 @@ ifeq ($(CC_VENDOR),clang) CWARNFLAGS += -Wno-tautological-compare -Wno-pass-failed endif +# Disable other annoying warnings. +ifeq ($(CC_VENDOR),clang) +CWARNFLAGS += +else +ifeq ($(CC_VENDOR),gcc) +# The '-Wno-maybe-uninitialized' option makes me nervous. Let's temporarily +# disable for now. 
-FGVZ +#CWARNFLAGS += -Wno-maybe-uninitialized -Wno-comment +CWARNFLAGS += -Wno-comment +endif +endif + $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CWARNFLAGS,$(c)))) # --- Position-independent code flags (shared libraries only) --- -# Emit position-independent code for dynamic linking. -ifeq ($(IS_WIN),yes) -# Note: Don't use any fPIC flags for Windows builds since all code is position- +# Note: Avoid -fPIC flags for Windows builds since all code is position- # independent. +ifeq ($(IS_MSVC),yes) CPICFLAGS := -else -CPICFLAGS := -fPIC endif -$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPICFLAGS,$(c)))) +$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call store-var-for,CPICFLAGS,$(c)))) # --- Symbol exporting flags (shared libraries only) --- +ifeq ($(MK_ENABLE_SHARED),yes) + # NOTE: These flags are only applied when building BLIS and not used by # applications that import BLIS compilation flags via the # $(get-user-cflags-for ...) function. @@ -700,6 +863,14 @@ endif # Determine default export behavior / visibility of symbols for clang. ifeq ($(CC_VENDOR),clang) ifeq ($(IS_WIN),yes) +ifeq ($(IS_MSVC),no) +# This is a clang build targeting MinGW-w64 env +ifeq ($(EXPORT_SHARED),all) +BUILD_SYMFLAGS := -Wl,--export-all-symbols, -Wl,--enable-auto-import +else # ifeq ($(EXPORT_SHARED),all) +BUILD_SYMFLAGS := -Wl,--exclude-all-symbols +endif +endif # ifeq ($(IS_MSVC),no) ifeq ($(EXPORT_SHARED),all) # NOTE: clang on Windows does not appear to support exporting all symbols # by default, and therefore we ignore the value of EXPORT_SHARED. @@ -722,70 +893,113 @@ endif endif endif +else #ifeq ($(MK_ENABLE_SHARED),no) + +# Don't modify CPICFLAGS for the various configuration family members. +# Don't use any special symbol export flags. +BUILD_SYMFLAGS := + +endif + # --- Language flags --- # Enable C99. CLANGFLAGS := -std=c99 $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CLANGFLAGS,$(c)))) -# Enable C++11.
+# Enable C++11, or C++17 if HPX threading is enabled. +# If building a plugin, do not set any default C++ standard. +ifeq ($(PLUGIN_NAME),) +ifneq ($(findstring hpx,$(THREADING_MODEL)),) +CXXLANGFLAGS := -std=c++17 +else CXXLANGFLAGS := -std=c++11 -$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c)))) +endif +else +CXXLANGFLAGS := +endif # --- C Preprocessor flags --- # Enable clock_gettime() in time.h. CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L +# Enable ip_mreq on macOS which is needed for ASIO which is needed for HPX. +ifeq ($(OS_NAME),Darwin) +CPPROCFLAGS += -D_DARWIN_C_SOURCE +endif $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPPROCFLAGS,$(c)))) +# --- AddressSanitizer flags --- + +ifeq ($(MK_ENABLE_ASAN),yes) +BUILD_ASANFLAGS := -fsanitize=address +else +BUILD_ASANFLAGS := +endif + # --- Threading flags --- # NOTE: We don't have to explicitly omit -pthread when --disable-system is given -# since that option forces --enable-threading=none, and thus -pthread never gets -# added to begin with. +# since that option forces --enable-threading=single, and thus -pthread never +# gets added to begin with. 
+ +CTHREADFLAGS := +CXXTHREADFLAGS := ifeq ($(CC_VENDOR),gcc) -ifeq ($(THREADING_MODEL),auto) -THREADING_MODEL := openmp -endif -ifeq ($(THREADING_MODEL),openmp) -CTHREADFLAGS := -fopenmp +#ifneq ($(findstring auto,$(THREADING_MODEL)),) +#THREADING_MODEL := openmp +#endif +ifneq ($(findstring openmp,$(THREADING_MODEL)),) +CTHREADFLAGS += -fopenmp LDFLAGS += -fopenmp endif -ifeq ($(THREADING_MODEL),pthreads) -CTHREADFLAGS := -pthread +ifneq ($(findstring pthreads,$(THREADING_MODEL)),) +CTHREADFLAGS += -pthread LDFLAGS += $(LIBPTHREAD) endif endif ifeq ($(CC_VENDOR),icc) -ifeq ($(THREADING_MODEL),auto) -THREADING_MODEL := openmp -endif -ifeq ($(THREADING_MODEL),openmp) -CTHREADFLAGS := -fopenmp +#ifneq ($(findstring auto,$(THREADING_MODEL)),) +#THREADING_MODEL := openmp +#endif +ifneq ($(findstring openmp,$(THREADING_MODEL)),) +CTHREADFLAGS += -fopenmp LDFLAGS += -fopenmp endif -ifeq ($(THREADING_MODEL),pthreads) -CTHREADFLAGS := -pthread +ifneq ($(findstring pthreads,$(THREADING_MODEL)),) +CTHREADFLAGS += -pthread LDFLAGS += $(LIBPTHREAD) endif endif ifeq ($(CC_VENDOR),clang) -ifeq ($(THREADING_MODEL),auto) -THREADING_MODEL := pthreads -endif -ifeq ($(THREADING_MODEL),openmp) -CTHREADFLAGS := -fopenmp +#ifneq ($(findstring auto,$(THREADING_MODEL)),) +#THREADING_MODEL := pthreads +#endif +ifneq ($(findstring openmp,$(THREADING_MODEL)),) +CTHREADFLAGS += -fopenmp LDFLAGS += -fopenmp endif -ifeq ($(THREADING_MODEL),pthreads) -CTHREADFLAGS := -pthread +ifneq ($(findstring pthreads,$(THREADING_MODEL)),) +CTHREADFLAGS += -pthread LDFLAGS += $(LIBPTHREAD) endif endif +# Threading flags for HPX. 
+ifneq ($(findstring hpx,$(THREADING_MODEL)),) +HPX_CXXFLAGS := $(shell pkg-config --cflags hpx_component) +HPX_LDFLAGS := $(filter-out -shared,$(shell pkg-config --libs hpx_component)) +CTHREADFLAGS += $(filter-out -std=%,$(HPX_CXXFLAGS)) +LDFLAGS += $(HPX_LDFLAGS) +ifeq ($(OS_NAME),Darwin) +RPATH_PREFIX := -Wl,-rpath, +LDFLAGS += $(patsubst -L%,$(RPATH_PREFIX)%,$(filter -L%,$(HPX_LDFLAGS))) +endif +endif + # --- #pragma omp simd flags (used for reference kernels only) --- ifeq ($(PRAGMA_OMP_SIMD),yes) @@ -827,8 +1041,13 @@ endif # ifeq ($(OS_NAME),Linux) +# Exclude -lrt on Android by detecting Bionic. +# printf *must* be used here rather than echo -e +BIONIC := $(findstring bionic,$(shell printf "\#ifdef __BIONIC__\nbionic\n\#endif" | $(CC) -E -)) +ifeq (,$(BIONIC)) LDFLAGS += -lrt endif +endif @@ -855,6 +1074,7 @@ MK_CONFIG_SRC := MK_KERNELS_SRC := MK_REFKERN_SRC := MK_FRAME_SRC := +MK_ADDON_SRC := MK_SANDBOX_SRC := # -- config -- @@ -905,6 +1125,24 @@ PARENT_PATH := $(OBJ_DIR)/$(CONFIG_NAME) -include $(addsuffix /$(FRAGMENT_MK), $(REFKERN_FRAG_PATH)) -include $(addsuffix /$(FRAGMENT_MK), $(FRAME_FRAG_PATH)) +# -- addon -- + +# Construct paths to each addon. +# NOTE: If $(ADDON_LIST) is empty (because no addon was enabled at configure- +# time) then $(ADDON_PATHS) will also be empty, which will cause no fragments +# to be included. +ADDON_PATHS := $(addprefix $(ADDON_FRAG_PATH)/, $(ADDON_LIST)) + +# This variable is used by the include statements as they recursively include +# one another. For the 'addons' directory, we initialize it to that directory +# in preparation to include the fragments in the configuration sub-directory. +PARENT_SRC_PATH := $(ADDON_PATH) +PARENT_PATH := $(ADDON_FRAG_PATH) + +# Recursively include the makefile fragments in each of the addons sub- +# directories. +-include $(addsuffix /$(FRAGMENT_MK), $(ADDON_PATHS)) + # -- sandbox -- # Construct paths to each sandbox. (At present, there can be only one.) 
@@ -922,6 +1160,8 @@ PARENT_PATH := $(SANDBOX_FRAG_PATH) # Recursively include the makefile fragments in the sandbox sub-directory. -include $(addsuffix /$(FRAGMENT_MK), $(SANDBOX_PATHS)) +# -- post-processing -- + # Create a list of the makefile fragments using the variable into which each # of the above include statements accumulated their directory paths. MAKEFILE_FRAGMENTS := $(addsuffix /$(FRAGMENT_MK), $(FRAGMENT_DIR_PATHS)) @@ -940,14 +1180,14 @@ endif # # Define a function that will expand all of the directory paths given in $(1) -# to actual filepaths using the list of suffixes provided $(2). +# to actual filepaths using the list of suffixes provided in $(2). get-filepaths = $(strip $(foreach path, $(1), \ $(foreach suf, $(2), \ $(wildcard $(path)/*.$(suf)) \ ) ) ) # Define a function that will expand all of the directory paths given in $(1) -# to actual filepaths using the list of suffixes provided $(2), taking only +# to actual filepaths using the list of suffixes provided in $(2), taking only # the first expansion from each directory with at least one file matching # the current suffix. Finally, strip the filenames from all resulting files, # returning only the directory paths. @@ -957,20 +1197,29 @@ get-dirpaths = $(dir $(foreach path, $(1), \ $(wildcard $(path)/*.$(suf)) \ ) ) ) ) -# We'll use two directory lists. The first is a list of all of the directories -# in which makefile fragments were generated (plus the current directory). The -# second is the subset of the first that begins with the sandbox root path. +# We'll use three directory lists. The first is a list of all of the directories +# in which makefile fragments were generated, plus the current directory. (The +# current directory is needed so we include bli_config.h and bli_addon.h in the +# processing of header files.) The second and third are subsets of the first +# that begin with the addon and sandbox root paths, respectively. ALLFRAG_DIR_PATHS := .
$(FRAGMENT_DIR_PATHS) +ADDON_DIR_PATHS := $(filter $(ADDON_PATH)/%,$(ALLFRAG_DIR_PATHS)) SANDBOX_DIR_PATHS := $(filter $(SANDBOX_PATH)/%,$(ALLFRAG_DIR_PATHS)) ALL_H99_FILES := $(call get-filepaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS)) -FRAME_H99_FILES := $(filter-out $(SANDBOX_PATH)/%,$(ALL_H99_FILES)) +FRAME_H99_FILES := $(filter-out $(ADDON_PATH)/%, \ + $(filter-out $(SANDBOX_PATH)/%, \ + $(ALL_H99_FILES) \ + ) ) -ALL_H99_DIRPATHS := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS)) +ALL_H99_DIRPATHS := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS)) -SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS)) -SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS)) +ADDON_H99_FILES := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_H99_SUFS)) +ADDON_HXX_FILES := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_HXX_SUFS)) +ADDON_HDR_DIRPATHS := $(call get-dirpaths,$(ADDON_DIR_PATHS),$(ALL_HDR_SUFS)) +SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS)) +SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS)) SANDBOX_HDR_DIRPATHS := $(call get-dirpaths,$(SANDBOX_DIR_PATHS),$(ALL_HDR_SUFS)) @@ -1003,19 +1252,29 @@ BLIS_H_SRC_PATH := $(filter %/$(BLIS_H), $(FRAME_H99_FILES)) # blis.h file. BLIS_H_FLAT := $(BASE_INC_PATH)/$(BLIS_H) +# Construct the path to the helper blis.h file that will reside one directory +# up from the installed copy of blis.h. +HELP_BLIS_H_PATH := $(BUILD_DIR)/$(BLIS_H) + # # --- cblas.h header definitions ----------------------------------------------- # # Isolate the path to cblas.h by filtering the file from the list of framework -# header files. +# header files, and then strip the filename to obtain the directory in which +# cblas.h resides. 
CBLAS_H := cblas.h CBLAS_H_SRC_PATH := $(filter %/$(CBLAS_H), $(FRAME_H99_FILES)) +CBLAS_H_DIRPATH := $(dir $(CBLAS_H_SRC_PATH)) # Construct the path to what will be the intermediate flattened/monolithic # cblas.h file. -CBLAS_H_FLAT := $(BASE_INC_PATH)/$(CBLAS_H) +CBLAS_H_FLAT := $(BASE_INC_PATH)/$(CBLAS_H) + +# Construct the path to the helper cblas.h file that will reside one directory +# up from the installed copy of cblas.h. +HELP_CBLAS_H_PATH := $(BUILD_DIR)/$(CBLAS_H) # @@ -1023,21 +1282,34 @@ CBLAS_H_FLAT := $(BASE_INC_PATH)/$(CBLAS_H) # # Obtain a list of header files #included inside of the bli_cntx_ref.c file. -# Paths to these files will be needed when compiling with the monolithic -# header. +# Due to the way that bli_cntx_ref.c uses headers and macros, paths to these +# files will be needed when compiling bli_cntx_ref.c with the monolithic header. ifeq ($(strip $(SHARE_PATH)),.) REF_KER_SRC := $(DIST_PATH)/$(REFKERN_DIR)/bli_cntx_ref.c -REF_KER_HEADERS := $(shell $(GREP) "\#include" $(REF_KER_SRC) | sed -e "s/\#include [\"<]\([a-zA-Z0-9\_\.\/\-]*\)[\">].*/\1/g" | $(GREP) -v $(BLIS_H)) +# +# NOTE: A redirect to /dev/null has been added to the grep command below because +# as of version 3.8, grep outputs warnings when encountering stray backslashes +# in regular expressions [1]. Versions older than 3.8 not only do not complain, +# but actually seem to *require* the backslash, perhaps because of the way we +# are invoking grep via GNU make's shell command. WHEN DEBUGGING ANYTHING +# INVOLVING THE MAKE VARIABLE BELOW, PLEASE CONSIDER TEMPORARILY REMOVING THE +# REDIRECT TO /dev/null SO THAT YOU SEE ANY MESSAGES SENT TO STANDARD ERROR. 
+# +# [1] https://lists.gnu.org/archive/html/info-gnu/2022-09/msg00001.html +# +REF_KER_HEADERS := $(shell $(GREP) "\#include" $(REF_KER_SRC) 2> $(DEVNULL) | sed -e "s/\#include [\"<]\([a-zA-Z0-9\_\.\/\-]*\)[\">].*/\1/g" | $(GREP) -v $(BLIS_H)) endif # Match each header found above with the path to that header, and then strip # leading, trailing, and internal whitespace. -REF_KER_H_PATHS := $(strip $(foreach header, $(REF_KER_HEADERS), \ - $(dir $(filter %/$(header), \ - $(FRAME_H99_FILES))))) +REF_KER_H_PATHS := $(call rm-dups,$(strip \ + $(foreach header, $(REF_KER_HEADERS), \ + $(dir $(filter %/$(header), \ + $(FRAME_H99_FILES)))))) # Add -I to each header path so we can specify our include search paths to the -# C compiler. Then add frame/include since it's needed for bli_oapi_w[o]_cntx.h. +# C compiler. Then add frame/include since it's needed when compiling source +# files that #include bli_oapi_ba.h or bli_oapi_ex.h. REF_KER_I_PATHS := $(strip $(patsubst %, -I%, $(REF_KER_H_PATHS))) REF_KER_I_PATHS += -I$(DIST_PATH)/frame/include @@ -1046,17 +1318,29 @@ REF_KER_I_PATHS += -I$(DIST_PATH)/frame/include # now #include the monolithic/flattened blis.h instead. CINCFLAGS := -I$(BASE_INC_PATH) $(REF_KER_I_PATHS) +# If CBLAS is enabled, we also include the path to the cblas.h directory so +# that the compiler will be able to find cblas.h as the CBLAS source code is +# being compiled. +ifeq ($(MK_ENABLE_CBLAS),yes) +CINCFLAGS += -I$(CBLAS_H_DIRPATH) +endif + +# Obtain a list of header paths in the configured addons. Then add -I to each +# header path. +CADDONINCFLAGS := $(strip $(patsubst %, -I%, $(ADDON_HDR_DIRPATHS))) + # Obtain a list of header paths in the configured sandbox. Then add -I to each # header path. 
-CSBOXINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS))) +CSANDINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS))) # # --- BLIS configuration header definitions ------------------------------------ # -# This file was created by configure, but we need to define it here so we can -# remove it as part of the clean targets. +# These files were created by configure, but we need to define them here so we +# can remove them as part of the clean targets. +BLIS_ADDON_H := ./bli_addon.h BLIS_CONFIG_H := ./bli_config.h @@ -1064,17 +1348,18 @@ BLIS_CONFIG_H := ./bli_config.h # --- Special preprocessor macro definitions ----------------------------------- # -# Define a C preprocessor macro to communicate the current version so that it -# can be embedded into the library and queried later. -VERS_DEF := -DBLIS_VERSION_STRING=\"$(VERSION)\" - # Define a C preprocessor flag that is *only* defined when BLIS is being # compiled. (In other words, an application that #includes blis.h will not # get this cpp macro.) BUILD_CPPFLAGS := -DBLIS_IS_BUILDING_LIBRARY +# +# --- configure file location -------------------------------------------------- +# + +CONFIGURE_FILE := $(DIST_PATH)/configure + # end of ifndef COMMON_MK_INCLUDED conditional block endif - diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c index 5132b2824c..81b4eb8f99 100644 --- a/config/a64fx/bli_cntx_init_a64fx.c +++ b/config/a64fx/bli_cntx_init_a64fx.c @@ -38,34 +38,41 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_a64fx_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, + + // packm + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16x10, + + BLIS_VA_END ); - // Set SVE-512 packing routine. - bli_cntx_set_packm_kers + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs ( - 2, - BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, - // 12xk is not used and disabled for GCC 8-9 compatibility. - // BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk, - BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -80,66 +87,18 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); - -#if 0 - // Initialize sup thresholds with architecture-appropriate values. 
- // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 ); - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx + BLIS_VA_END ); - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -#endif - // Set A64FX cache sector sizes for each PE/CMG // SC Fugaku might disable users' setting cache sizes. 
#if !defined(CACHE_SECTOR_SIZE_READONLY) diff --git a/config/a64fx/bli_family_a64fx.h b/config/a64fx/bli_family_a64fx.h index 5e3f29fd4b..f2837459d1 100644 --- a/config/a64fx/bli_family_a64fx.h +++ b/config/a64fx/bli_family_a64fx.h @@ -38,9 +38,19 @@ // -- MEMORY ALLOCATION -------------------------------------------------------- -#define BLIS_SIMD_ALIGN_SIZE 256 -#define BLIS_SIMD_NUM_REGISTERS 32 - +#define BLIS_SIMD_ALIGN_SIZE 256 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 + +// SVE-specific configs. +#define N_L1_SVE_DEFAULT 64 +#define W_L1_SVE_DEFAULT 4 +#define C_L1_SVE_DEFAULT 256 +#define N_L2_SVE_DEFAULT 2048 +#define W_L2_SVE_DEFAULT 16 +#define C_L2_SVE_DEFAULT 256 +#define N_L3_SVE_DEFAULT 8192 +#define W_L3_SVE_DEFAULT 16 +#define C_L3_SVE_DEFAULT 256 //#endif diff --git a/config/a64fx/bli_kernel_defs_a64fx.h b/config/a64fx/bli_kernel_defs_a64fx.h new file mode 100644 index 0000000000..2c5c972049 --- /dev/null +++ b/config/a64fx/bli_kernel_defs_a64fx.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 32 +#define BLIS_MR_d 16 +#define BLIS_MR_c 16 +#define BLIS_MR_z 8 + +#define BLIS_NR_s 10 +#define BLIS_NR_d 10 +#define BLIS_NR_c 10 +#define BLIS_NR_z 10 + +//#endif + diff --git a/config/a64fx/make_defs.mk b/config/a64fx/make_defs.mk index d6871fac31..5cc8162ba8 100644 --- a/config/a64fx/make_defs.mk +++ b/config/a64fx/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := a64fx # may specify additional flags here as needed. 
CPPROCFLAGS := -D_GNU_SOURCE -D_A64FX CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/altra/QuickStart/TimeDGEMM.cfile b/config/altra/QuickStart/TimeDGEMM.cfile new file mode 100755 index 0000000000..172edc6596 --- /dev/null +++ b/config/altra/QuickStart/TimeDGEMM.cfile @@ -0,0 +1,143 @@ +#include +#include +#include +#include +#include +#include +#include +#include "blis.h" + +/*################################################### +// To build with openmp: +// Note: Don't need the -lomp on Linux +gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x + +// To build with pThreads +source ./enable_blis.sh +gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x + +// To run with QuickStart Macros... +for N_CORES, S_SOCKETS + +blis_set_cores_and_sockets N S; $BLIS_NUMA time_gemm.x + +###################################################*/ + +#include // for Linux stdarg + +//################################################### +// Handy blis functions +//################################################### + +// Returns 0.0 if out ofmatrix +double GetReal(obj_t *m, int row, int col) + { + double im = 0, re = 0; // Imaginary component + if (!m) return 0.0; + + bli_getijm(row, col, m, &re, &im); + return re; + } + +bool SetReal(obj_t *m, int row, int col, double dVal) + { + if (!m) return 0.0; + bli_setijm(dVal, 0.0, row, col, m); + + return true; + } + +//################################################### +// The basic meat - a one shot +//################################################### + +bool TimeBlis(long size) + { + int repeat = 3; // Best Of! + double dAlpha = 1.0, dBeta = 0.0; // simplest case! 
+ + //============== Allocate matrices ============= + obj_t* alpha = (obj_t*) calloc(1, sizeof(obj_t)); + obj_t* beta = (obj_t*) calloc(1, sizeof(obj_t)); + + bli_obj_create(BLIS_DOUBLE, 1, 1, 0, 0, alpha); + bli_obj_create(BLIS_DOUBLE, 1, 1, 0, 0, beta); + + // Full gemm is alpha * A * B + beta * C + bli_setsc(dAlpha, 0.0, alpha); // alpha is one + bli_setsc(dBeta, 0.0, beta); // beta is zero + //============================================== + printf("Initializing %g GB of Matrices...\n", 8.0 * size * size * 3.0 / 1024.0 / 1024.0 / 1024.0); + + obj_t* a = (obj_t*) calloc(1, sizeof(obj_t)); + obj_t* b = (obj_t*) calloc(1, sizeof(obj_t)); + obj_t* c = (obj_t*) calloc(1, sizeof(obj_t)); + + bli_obj_create(BLIS_DOUBLE, size, size, size, 1, c); + bli_obj_create(BLIS_DOUBLE, size, size, size, 1, a); + bli_obj_create(BLIS_DOUBLE, size, size, size, 1, b); + + // Create Random matrices + // that are well conditioned and invertible + // (Note: this can be slow) + // + bli_randm(c); + bli_randm(a); + bli_randm(b); + + //============================================== + // DO the timing, blis style... + //============================================== + + double dBestTime = DBL_MAX; + + for (int i = 0; i < repeat; i++) + { + printf("Performing DGEMM %d of %d\n", i + 1, repeat); fflush(stdout); + double dStartTime = bli_clock(); + + bli_gemm(alpha, a, b, beta, c); + + // Always look at best of N for timing! 
+ dBestTime = bli_clock_min_diff( dBestTime, dStartTime ); + } + + double gflops = ( 2.0 * size * size * size ) / ( dBestTime * 1.0e9 ); + + printf("Best DGEMM run completed in %g seconds @ size= \t %ld \t %g \t gigaflops\n", + dBestTime, size, gflops); fflush(stdout); + + return true; + } + + +int main( int argc, char** argv ) + { + long size = 0; + int cores = 1, sweep_inc = 0; + + printf("Details of parallelism are set by environment variables.\n"); + printf("Arg1 = size=M=N=K for DGEMM\n" + "optional arg2 = size step for sweep.\n"); + + if (argc < 2) return 0; + + if (argc > 1) { + size = atol(argv[1]); + printf("User set size to %ld\n", size); + } + + if (argc > 2) { + sweep_inc = atoi(argv[2]); /* optional arg2 is argv[2]; argv[3] is NULL when argc == 3 */ + printf("User set sweep size inc to %d\n", sweep_inc); + } + + if (sweep_inc <= 0) TimeBlis(size); /* non-positive step: single run (a negative step would never terminate the sweep) */ + else + { + for (int i = size; i >= sweep_inc; i -= sweep_inc) + TimeBlis(i); + } + + return 0; + } diff --git a/config/altra/QuickStart/blis_build_altra.sh b/config/altra/QuickStart/blis_build_altra.sh new file mode 100755 index 0000000000..9208aac371 --- /dev/null +++ b/config/altra/QuickStart/blis_build_altra.sh @@ -0,0 +1,20 @@ +#!/bin/bash +echo "#######################################################" +echo "Building standard OpenMP BLIS..." +echo "#######################################################" +. ./blis_setenv.sh quiet +echo "##########################################################" +echo "Configuring BLIS for Altra using OpenMP for parallelism..." +echo "##########################################################" +. ./blis_configure_altra.sh quiet +echo "Switching to directory $BLIS_HOME" +pushd $BLIS_HOME > /dev/null +make -j +popd > /dev/null +if [ "$1" != "notest" ]; then + . ./blis_test.sh quiet +fi +. 
./blis_setenv.sh +echo "##########################################################" +echo "...done" +echo "##########################################################" diff --git a/config/altra/QuickStart/blis_build_altra_pthreads.sh b/config/altra/QuickStart/blis_build_altra_pthreads.sh new file mode 100755 index 0000000000..3285258306 --- /dev/null +++ b/config/altra/QuickStart/blis_build_altra_pthreads.sh @@ -0,0 +1,20 @@ +#!/bin/bash +echo "#######################################################" +echo "Building pThreads version of BLIS..." +echo "#######################################################" +. ./blis_setenv.sh quiet +echo "##########################################################" +echo "Configuring BLIS for Altra using pThreads for parallelism..." +echo "##########################################################" +. ./blis_configure_altra_pthreads.sh quiet +echo "Switching to directory $BLIS_HOME" +pushd $BLIS_HOME > /dev/null +make -j +popd > /dev/null +if [ "$1" != "notest" ]; then + . ./blis_test.sh quiet +fi +. ./blis_setenv.sh +echo "##########################################################" +echo "...done" +echo "##########################################################" diff --git a/config/altra/QuickStart/blis_build_both_libraries.sh b/config/altra/QuickStart/blis_build_both_libraries.sh new file mode 100755 index 0000000000..2bcf186f81 --- /dev/null +++ b/config/altra/QuickStart/blis_build_both_libraries.sh @@ -0,0 +1,58 @@ +#!/bin/bash +echo "##########################################################" +echo "Creating both OpenMP and pThread BLIS libraries..." +echo "##########################################################" +echo "First, Creating pThread library..." +echo "##########################################################" +. ./blis_build_altra_pthreads.sh notest + +echo "##########################################################" +echo "Saving the pThreads build..." 
+echo "##########################################################" +# Temporarily move the pthreads build +mkdir $BLIS_HOME/.tempinc +mkdir $BLIS_HOME/.templib +mv $BLIS_INC/* $BLIS_HOME/.tempinc/ +mv $BLIS_LIB/* $BLIS_HOME/.templib/ +# And rename the pthread versions of the include and library files +#echo "##########################################################" +pushd $BLIS_HOME/.tempinc/ > /dev/null +echo "Renaming pThread-enabled blis.h -> blisP.h" +mv blis.h blisP.h +popd > /dev/null +pushd $BLIS_HOME/.templib/ > /dev/null +for f in $(ls -1); do + destf=${f/blis/blisP} + echo "Renaming pThread library $f -> $destf" + mv "$f" "$destf" + + # Fix the symbolic links + if [[ -L "$destf" ]]; then + target=$(readlink $destf) + target=${target/blis/blisP} + \rm "$destf" + ln -s "$target" "$destf" + fi +done +popd > /dev/null +echo "##########################################################" + +echo "##########################################################" +echo "Second, Creating OpenMP library..." +echo "##########################################################" +. ./blis_build_altra.sh notest + +echo "##########################################################" +echo "Restoring the pThreads build..." +echo "##########################################################" +# And move the pthread versions back +mv $BLIS_HOME/.tempinc/* $BLIS_INC/ +mv $BLIS_HOME/.templib/* $BLIS_LIB/ +rmdir $BLIS_HOME/.tempinc +rmdir $BLIS_HOME/.templib + +. ./blis_test.sh quiet +. ./blis_setenv.sh +echo "##########################################################" +echo "Done creating BLIS libraries..." 
+echo "##########################################################" diff --git a/config/altra/QuickStart/blis_configure_altra.sh b/config/altra/QuickStart/blis_configure_altra.sh new file mode 100755 index 0000000000..206384eca6 --- /dev/null +++ b/config/altra/QuickStart/blis_configure_altra.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +if [ "$1" = "quiet" ]; then + quiet_confopenmp="quiet" +else + quiet_confopenmp="" +fi + +if [ "$quiet_confopenmp" = "" ]; then + echo "##########################################################" + echo "Configuring BLIS for Altra using OpenMP for parallelism..." + echo "##########################################################" +fi + +. ./blis_setenv.sh $quiet_confopenmp +pushd $BLIS_HOME > /dev/null +make distclean +./configure -t openmp --disable-pba-pools altra +popd > /dev/null + diff --git a/config/altra/QuickStart/blis_configure_altra_pthreads.sh b/config/altra/QuickStart/blis_configure_altra_pthreads.sh new file mode 100755 index 0000000000..7293fb664a --- /dev/null +++ b/config/altra/QuickStart/blis_configure_altra_pthreads.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +if [ "$1" = "quiet" ]; then + quiet_confpthreads="quiet" +else + quiet_confpthreads="" +fi + +if [ "$quiet_confpthreads" = "" ]; then + echo "##########################################################" + echo "Configuring BLIS for Altra using pThreads for parallelism..." + echo "##########################################################" +fi + +. ./blis_setenv.sh $quiet_confpthreads +pushd $BLIS_HOME > /dev/null +make distclean +./configure -t pthreads --disable-pba-pools altra +popd > /dev/null + diff --git a/config/altra/QuickStart/blis_quick_start_altra.txt b/config/altra/QuickStart/blis_quick_start_altra.txt new file mode 100755 index 0000000000..bf7988144e --- /dev/null +++ b/config/altra/QuickStart/blis_quick_start_altra.txt @@ -0,0 +1,202 @@ +Welcome to the Altra Platform! 
We've made some scripts to help you build and use blis, +but feel free to look at them for your own inspiration. +Note that all the provided scripts must be SOURCED, NOT executed! This is because they +set up environment variables needed for the steps below. + +Using BLIS requires a few steps: + +1) Configuring the library +2) Building the library & validating it +3) Linking your program with BLIS +4) Setting the environment parameters for an optimized blis to run your program + +Let's briefly touch on these points, and how the scripts provided can help +But first, let's make sure your configuration is correct... + +Open blis_setenv.sh +In the Platform Specific: section, around line 50 or so, you will see: +firmware=107 +or +firmware=108 + +If your firmware is version 1.08 or greater, make sure this is set to 108, else make sure +it's set to 107. Ampere entirely changed the CoreID mappings between these versions. + +The Altra Platform updated their firmware to 1.08 in May 2021, so if your firmware was +updated later than that, odds are good that you have 2.04 or later. + +Note: the scripts referenced here modify environment variables, so they must be sourced. +E.g., with + source +or + . + +=================================================== +1) Configuring the library +2) Building the library & validating it +=================================================== + +There are custom configuration options for Altra, but, as a user, your main decision is +whether you want BLIS to use OpenMP or pthreads for parallelism? OpenMP is the default +option, since OpenMP allows thread pinning and thus results in better performance. +To build with OpenMP use: + +. ./blis_build_altra.sh + +However, some platforms (like MacOS) cannot use OpenMP at all. In this case, you want +to build the pThreads version of BLIS: + +. 
./blis_build_altra_pthreads.sh + +In both cases, it will create libblis.a in $BLIS_HOME/lib/$BLIS_ARCH + +Try doing that in the root blis directory, depending on your OS. + +LINUX: +. ./blis_build_altra.sh + +MacOS Apple Silicon: +. ./blis_build_altra_pthreads.sh + +---------------------------------------------------------------------------- +HOWEVER, there is a tricky case: If you link BLIS with a program that uses pThreads, you +MUST use the pthreads version of BLIS, even though it will be slower. This is because +there is a bug in which attempting to use both pthreads AND OpenMP will pin all threads to +a single core and essentially freeze your program. + +If this is a possibility, you may want to have both libraries available and switch between +them for each application. The script: + +. ./blis_build_both_libraries.sh + +will build both versions, with the pThreads version being called libblisP.a, and a second +header blisP.h +This is a little inconvenient, and we're working on improving the situation in the near +future. +---------------------------------------------------------------------------- + +The build will additionally check the library, but if you would like to check a la carte, do + +. ./blis_test.sh + +You should see near the bottom: +check-blastest.sh: All BLAS tests passed! +check-blistest.sh: All BLIS tests passed! + +-------------------------------- +Finally, here's a script that will be important when you are doing testing. +This performs the important step of unsetting any parameters affecting blis parallelism. + +. ./blis_unset_par.sh + +=================================================== +3) Building and Linking your program with BLIS +=================================================== + +This depends on whether you are using the pThreads version of BLIS or the OpenMP version... +Note this uses the BLIS locations automatically defined when sourcing blis_setenv.sh + +. 
./blis_setenv.sh + +(This will display your environment variable settings, your blis libraries and headers (if +built), and also unset blis parallelism parameters for safety.) + +// BUILDING your app with the OpenMP version of BLIS: +// Note: Don't need -lomp on Linux + +gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o MyExe + +// To build with pThreads +gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o MyExe + +// NOTE: If you used the scripts to build BOTH versions of blis, then use the renamed blis lib: +gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblisP.a -lpthread -lm -o MyExe + +Let's try building a sample program that we've included to test BLIS: +TimeDGEMM.c + +If this is a new terminal session, make sure to: +. ./blis_setenv.sh +(there's no harm in running it again.) + +Linux: +gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x + +Apple Silicon: +gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x + +But don't try a timed run, yet - there's some runtime setup that needs to be done... + +=================================================== +4) Setting the environment parameters for an optimized blis to run your program +=================================================== + +The performance of some BLAS libraries is very sensitive to the compiler or the page size. +BLIS is not sensitive to either of these things, but it IS extremely dependent on pinning +the right threads to the right cores. We have scripts to help... + +. ./blis_setenv.sh + +This not only tells you where blis is, but it also creates shell functions to set +affinity, threading, and NUMA control for each run. 
There is a shell function created +that you can call to set up how your threads will be pinned and used: + +blis_set_cores_and_sockets N_CORES S_SOCKETS + +Specifying the number of sockets is important because BLIS is configured very differently +for one vs two sockets. + +Example: +# Set up for a run with 128 total cores, half on each of 2 sockets. +blis_set_cores_and_sockets 128 2 + +You can also use the following aliases: +blis_set_cores_1S 80 # Run 80 cores on 1 socket +blis_set_cores_2S 160 # Run 160 cores across 2 sockets, 80 on each + +NOTE that at the moment, for multi-threaded BLIS, we only support active thread counts +that are a multiple of 8. +If you want to test single threaded performance, you can set + +export BLIS_NUM_THREADS=1 + +Launching your executable: + +If your application is MyExe, your commands to perform an optimized BLIS run might look +like this: + +blis_set_cores_2S 160 +$BLIS_NUMA MyExe + +This will set cpu affinity correctly, set BLIS parallelism correctly, set the NUMA +mode correctly, and launch your EXE. + +--------------------------------------------------- + +Let's try an example using the executable that you created in section 3, remembering that +if you're on an Apple Silicon Mac, make sure that you don't use more cores than you have. +(For example, 8 on an M1 Max.) + +Apple Silicon: (No NUMA is needed for Apple Mac) + +blis_set_cores_1S 8; ./time_gemm.x 8000 +(in tests, we obtained about 95% of peak with Neon64 - about 366 Gigaflops) + +Altra Dual Socket: +blis_set_cores_2S 160; $BLIS_NUMA ./time_gemm.x 12000 +(in tests, we obtained about 3.2 TF, or 82% of peak) + +CONGRATULATIONS! You're ready to use BLIS! + +=================================================== +Performance Note: +=================================================== +We continue to enhance BLIS performance on the Altra. +One current issue is that not all variants of triangular operations obtain full +performance. 
+ +For TRSM, best performance is with left triangular operations. +For TRMM, DUAL SOCKET, best performance is with left triangular operations. +For TRMM, SINGLE SOCKET, best performance is with right triangular operations. + + diff --git a/config/altra/QuickStart/blis_quick_start_uninstall_altra.sh b/config/altra/QuickStart/blis_quick_start_uninstall_altra.sh new file mode 100755 index 0000000000..8ad4378883 --- /dev/null +++ b/config/altra/QuickStart/blis_quick_start_uninstall_altra.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# This utility will remove all the configuration +# Specific QuickStart files from the blis directory. +# This is very useful when switching configurations! +# +if [[ -n "$BLIS_HOME" ]]; then + echo "REMOVING ALL ALTRA QUICKSTART FILES FROM $BLIS_HOME" + + rm $BLIS_HOME/blis_build_altra_pthreads.sh + rm $BLIS_HOME/blis_build_altra.sh + rm $BLIS_HOME/blis_build_both_libraries.sh + + rm $BLIS_HOME/blis_configure_altra_pthreads.sh + rm $BLIS_HOME/blis_configure_altra.sh + + rm $BLIS_HOME/blis_quick_start_altra.txt + rm $BLIS_HOME/blis_setenv.sh + + rm $BLIS_HOME/blis_unset_par.sh + rm $BLIS_HOME/blis_test.sh + + rm $BLIS_HOME/TimeDGEMM.c + rm $BLIS_HOME/time_gemm.x + + rm $BLIS_HOME/blis_quick_start_uninstall_altra.sh + +else + echo "ONLY USE THIS SCRIPT FROM THE BLIS HOME DIRECTORY!" + echo "BLIS_HOME is not set!" +fi diff --git a/config/altra/QuickStart/blis_setenv.sh b/config/altra/QuickStart/blis_setenv.sh new file mode 100755 index 0000000000..1f2b281502 --- /dev/null +++ b/config/altra/QuickStart/blis_setenv.sh @@ -0,0 +1,195 @@ +#!/bin/bash +####################################################################### +# Brought to you by Oracle Labs +####################################################################### +# Tested in bash and zsh +####################################################################### +# Sets up all the environment variables needed for running blis. +# For this reason, the script MUST be sourced, NOT executed! 
+# Needs to be run from BLIS directory to have a portable definition of +# BLIS_HOME. If this setup doesn't work for you, you may hard code +# the path to BLIS_HOME, but then be careful if you copy or move it! +####################################################################### +# This is the top level blis directory - it is recommended to set to an absolute path +# Can be overridden by user to be called anywhere, but then less portable +# export BLIS_HOME=. +# PORTABLE - Set BLIS_HOME to the blis directory containing this script +# We need to get the full path to the file in case this is called from another directory + +if [ "$1" = "quiet" ]; then + quiet_setenv="quiet" +else + quiet_setenv="" +fi + +if [[ -n "$BASH_VERSION" ]] ; then + file_path_and_name="$( dirname "${BASH_SOURCE[0]}" )/blis_set_home_dir.sh" +else + file_path_and_name="$( dirname "$0" )/blis_set_home_dir.sh" +fi + +if [ -f "$file_path_and_name" ] ; then + . "$file_path_and_name" quiet +else + echo "ERROR - this file is not being executed from a blis home directory." + echo "If you cannot use this script in a home directory, you can hardcode" + echo "the absolute location of BLIS_HOME in blis_setenv.sh, but this" + echo "is then less portable and more error prone with multiple blis" + echo "directories." + return +fi + +####################################################################### +# Platform Specific: +####################################################################### +# Important! Set the firmware number to 107 for firmware version 1.07 or earlier, +# and 108 for 1.08 or later. We were unable to test 1.08 at this time. 
+# +firmware=108 + +qualifier="or later" +if (( firmware == 107 )); then + qualifier="or earlier" +fi + +# Use altra for both single and double socket - this might change +export BLIS_ARCH="altra" +export BLIS_LIB=$BLIS_HOME/lib/$BLIS_ARCH +export BLIS_INC=$BLIS_HOME/include/$BLIS_ARCH + +# Verify: +if [ "$quiet_setenv" = "" ]; then + echo "#################################################################" + echo "CoreID affinity assumes firmware version on this machine is $firmware $qualifier" + echo "BLIS_HOME set to $BLIS_HOME" + echo "BLIS_INC set to $BLIS_INC" + echo "=================================================================" + ls -l $BLIS_INC + echo "-----------------------------------------------------------------" + echo "BLIS_LIB set to $BLIS_LIB" + echo "-----------------------------------------------------------------" + ls -l $BLIS_LIB + echo "#################################################################" +fi + +# Affinity Macros, etc +export BLIS_NUMA="numactl --localalloc" + +# Use with firmware versions 1.07 and earlier. + +export BLIS_AFFINITY_2S_1_07="0 40 20 60 4 44 24 64 8 48 28 68 12 52 32 72 2 42 22 62 6 46 26 66 10 50 30 70 14 54 34 74 1 41 21 61 5 45 25 65 9 49 29 69 13 53 33 73 3 43 23 63 7 47 27 67 11 51 31 71 15 55 35 75 16 56 36 76 18 58 38 78 17 57 37 77 19 59 39 79 80 120 100 140 84 124 104 144 88 128 108 148 92 132 112 152 82 122 102 142 86 126 106 146 90 130 110 150 94 134 114 154 81 121 101 141 85 125 105 145 89 129 109 149 93 133 113 153 83 123 103 143 87 127 107 147 91 131 111 151 95 135 115 155 96 136 116 156 98 138 118 158 97 137 117 157 99 139 119 159" + +export BLIS_AFFINITY_1S_1_07="0 40 20 60 4 44 24 64 8 48 28 68 12 52 32 72 2 42 22 62 6 46 26 66 10 50 30 70 14 54 34 74 1 41 21 61 5 45 25 65 9 49 29 69 13 53 33 73 3 43 23 63 7 47 27 67 11 51 31 71 15 55 35 75 16 56 36 76 18 58 38 78 17 57 37 77 19 59 39 79" + +# Use with firmware versions 1.08+ +# Warning - this has not been tested. 
+# +export BLIS_AFFINITY_2S_1_08="28, 29, 38, 39, 2, 3, 12, 13, 6, 7, 16, 17, 0, 1, 10, 11, 68, 69, 78, 79, 42, 43, 52, 53, 46, 47, 56, 57, 40, 41, 50, 51, 24, 25, 34, 35, 20, 21, 30, 31, 26, 27, 36, 37, 22, 23, 32, 33, 64, 65, 74, 75, 60, 61, 70, 71, 66, 67, 76, 77, 62, 63, 72, 73, 8, 9, 18, 19, 4, 5, 14, 15, 48, 49, 58, 59, 44, 45, 54, 55, 108, 109, 118, 119, 82, 83, 92, 93, 86, 87, 96, 97, 80, 81, 90, 91, 148, 149, 158, 159, 122, 123, 132, 133, 126, 127, 136, 137, 120, 121, 130, 131, 104, 105, 114, 115, 100, 101, 110, 111, 106, 107, 116, 117, 102, 103, 112, 113, 144, 145, 154, 155, 140, 141, 150, 151, 146, 147, 156, 157, 142, 143, 152, 153, 88, 89, 98, 99, 84, 85, 94, 95, 128, 129, 138, 139, 124, 125, 134, 135" + +export BLIS_AFFINITY_1S_1_08="28, 29, 38, 39, 2, 3, 12, 13, 6, 7, 16, 17, 0, 1, 10, 11, 68, 69, 78, 79, 42, 43, 52, 53, 46, 47, 56, 57, 40, 41, 50, 51, 24, 25, 34, 35, 20, 21, 30, 31, 26, 27, 36, 37, 22, 23, 32, 33, 64, 65, 74, 75, 60, 61, 70, 71, 66, 67, 76, 77, 62, 63, 72, 73, 8, 9, 18, 19, 4, 5, 14, 15, 48, 49, 58, 59, 44, 45, 54, 55" + +# Parallelism on the Altra is very flat: + +# Set JC to number of sockets: +export BLIS_JC_NT=2 + +# Set JR to groups of 8: +export BLIS_JR_NT=8 + +# Set IC to the number of cores per socket / 8: +export BLIS_IC_NT=10 + +# Experimental: Allow you to set threading and +# Core affinity on single or dual sockets for +# N threads. 
Currently, we only support N as +# a multple of 8 + +# Max Altra cores per socket +CPS=80 + +# Use Bash Arrays: + +# Choose which CoreID mapping to go with based on the firmware ID +if (($firmware == 107)); then +arrayCoreIDs=(0 40 20 60 4 44 24 64 8 48 28 68 12 52 32 72 2 42 22 62 6 46 26 66 10 50 30 70 14 54 34 74 1 41 21 61 5 45 25 65 9 49 29 69 13 53 33 73 3 43 23 63 7 47 27 67 11 51 31 71 15 55 35 75 16 56 36 76 18 58 38 78 17 57 37 77 19 59 39 79 80 120 100 140 84 124 104 144 88 128 108 148 92 132 112 152 82 122 102 142 86 126 106 146 90 130 110 150 94 134 114 154 81 121 101 141 85 125 105 145 89 129 109 149 93 133 113 153 83 123 103 143 87 127 107 147 91 131 111 151 95 135 115 155 96 136 116 156 98 138 118 158 97 137 117 157 99 139 119 159) +elif (($firmware == 108)); then +arrayCoreIDs=(28 29 38 39 2 3 12 13 6 7 16 17 0 1 10 11 68 69 78 79 42 43 52 53 46 47 56 57 40 41 50 51 24 25 34 35 20 21 30 31 26 27 36 37 22 23 32 33 64 65 74 75 60 61 70 71 66 67 76 77 62 63 72 73 8 9 18 19 4 5 14 15 48 49 58 59 44 45 54 55 108 109 118 119 82 83 92 93 86 87 96 97 80 81 90 91 148 149 158 159 122 123 132 133 126 127 136 137 120 121 130 131 104 105 114 115 100 101 110 111 106 107 116 117 102 103 112 113 144 145 154 155 140 141 150 151 146 147 156 157 142 143 152 153 88 89 98 99 84 85 94 95 128 129 138 139 124 125 134 135) +else + echo "ERROR - UNSUPPORTED FIRMWARE $firmware" + exit -1 +fi + +# Brief check: @ = list all numbers, loop for i in ${}; do ... 
done +# for Array Size, do ${#arr[@]} +# echo "CoreID array has ${#arrayCoreIDs[@]} elements" +# echo "CoreID array set to: ${arrayCoreIDs[@]}" + +# Give the TOTAL core count: +# Single socket runs +blis_set_cores_and_sockets() { + cores=$1 + sockets=$2 + # echo "Cores = $cores, sockets=$sockets" + + # Round up to nearest 8 cores per socket: + cores_per_group=8 + if (( $sockets == 2 )); then + cores_per_group=16; + fi + core_round_inc=$(($cores_per_group-1)) + + cores_per_socket=$(($cores)) + cores=$(($cores + $core_round_inc)) + groups_per_socket=$(($cores / $cores_per_group)) + rounded_cores=$(( $groups_per_socket * $cores_per_group )) + + # echo "Rounded Cores = $rounded_cores" + # echo "Groups Per Socket = $groups_per_socket" + + # set the parallelism for one socket with N cores: + # Set JC to number of sockets: + export BLIS_JC_NT=$sockets + + # Set JR to groups of 8: + export BLIS_JR_NT=8 + + # Set IC to the number of cores per socket / 8: + export BLIS_IC_NT=$groups_per_socket + + # Using an old version of zsh syntax that's compatible with bash + + if (( $sockets == 1 )); then + + # Simple single socket case + # quotes + # export GOMP_CPU_AFFINITY="\"${arrayCoreIDs[@]:0:$rounded_cores}\"" + # No quotes... + export GOMP_CPU_AFFINITY="${arrayCoreIDs[@]:0:$rounded_cores}" + + else + + # Dual socket case + half_cores=$(( $rounded_cores / 2 )) + # echo "Half cores are $half_cores" + # quotes + # export GOMP_CPU_AFFINITY="\"${arrayCoreIDs[@]:0:$half_cores} ${arrayCoreIDs[@]:$CPS:$half_cores}\"" + # No quotes + export GOMP_CPU_AFFINITY="${arrayCoreIDs[@]:0:$half_cores} ${arrayCoreIDs[@]:$CPS:$half_cores}" + fi + + echo "Activating $rounded_cores cores across $sockets sockets..." + echo "GOMP_CPU_AFFINITY set to $GOMP_CPU_AFFINITY" + echo "JC/IC/JR = $BLIS_JC_NT/$BLIS_IC_NT/$BLIS_JR_NT" + } + +# Convenience functions: +blis_set_cores_1S() { blis_set_cores_and_sockets $1 1 ; } +blis_set_cores_2S() { blis_set_cores_and_sockets $1 2 ; } + +# For safety: +. 
./blis_unset_par.sh + + + + + diff --git a/config/altra/QuickStart/blis_test.sh b/config/altra/QuickStart/blis_test.sh new file mode 100755 index 0000000000..c3a25e1e1e --- /dev/null +++ b/config/altra/QuickStart/blis_test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +if [ "$1" = "quiet" ]; then + quiet_blistest="quiet" +else + quiet_blistest="" +fi + +# We don't want to quiet this part: +echo "#################################################################" +echo "Simple testing of BLIS - use testsuite for more extensive tests." +echo "#################################################################" + +. ./blis_setenv.sh $quiet_blistest +# It's critical to unset parallelism parameters before +# running the test code! +. ./blis_unset_par.sh quiet +echo "Switching to directory $BLIS_HOME" +pushd $BLIS_HOME > /dev/null +make check -j +popd > /dev/null + + diff --git a/config/altra/QuickStart/blis_unset_par.sh b/config/altra/QuickStart/blis_unset_par.sh new file mode 100755 index 0000000000..6310a6f8e8 --- /dev/null +++ b/config/altra/QuickStart/blis_unset_par.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +if [ "$1" = "quiet" ]; then + quiet_unsetpar="quiet" +else + quiet_unsetpar="" +fi + +if [ "$quiet_unsetpar" = "" ]; then + echo "#########################################################" + echo " UNSETTING BLIS ENVIRONMENT VARIABLES THAT SET THREADING" + echo " AND AFFINITY." + echo "#########################################################" +fi + +unset BLIS_JC_NT +unset BLIS_JR_NT +unset BLIS_IC_NT +unset BLIS_NUM_THREADS +unset OMP_NUM_THREADS +unset GOMP_CPU_AFFINITY + diff --git a/config/altra/bli_cntx_init_altra.c b/config/altra/bli_cntx_init_altra.c new file mode 100644 index 0000000000..53facbd476 --- /dev/null +++ b/config/altra/bli_cntx_init_altra.c @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, Oracle Labs, Oracle Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_cntx_init_altra( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_altra_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs + ( + cntx, + + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 120, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 480, -1, -1 ); // Changed d to 480 - LDR +// bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 6144, -1, -1 ); // Doubled NC + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 12288, 8192, -1, -1 ); // Increased NC slightly more + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. 
+ bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + BLIS_VA_END + ); +} + diff --git a/frame/include/level0/bli_conjs.h b/config/altra/bli_family_altra.h similarity index 73% rename from frame/include/level0/bli_conjs.h rename to config/altra/bli_family_altra.h index 241148825f..9c7844bd8e 100644 --- a/frame/include/level0/bli_conjs.h +++ b/config/altra/bli_family_altra.h @@ -32,26 +32,24 @@ */ -#ifndef BLIS_CONJS_H -#define BLIS_CONJS_H +//#ifndef BLIS_FAMILY_H +//#define BLIS_FAMILY_H -// conjs +// Version with 16 byte alignment and jr=8 -#define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) -#define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) +#define BLIS_THREAD_MAX_JR 8 -#ifndef BLIS_ENABLE_C99_COMPLEX +// -- MEMORY ALLOCATION -------------------------------------------------------- -#define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) -#define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) +#define BLIS_SIMD_ALIGN_SIZE 16 -#else // ifdef BLIS_ENABLE_C99_COMPLEX +#define BLIS_FORCE_ROLL_PACKM_REF_KERNEL -#define bli_cconjs( x ) { (x) = conjf(x); } -#define bli_zconjs( x ) { (x) = conj (x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#endif +// Temporary microtile of for each supported datatype: +// - s: 8 * 12 * sizeof(float) +// - d: 6 * 8 * sizeof(double) +// Thus, 384 bytes should be sufficient. 
+#define BLIS_STACK_BUF_MAX_SIZE 384 +// Empirical best choices for TRMM +#define BLIS_DISABLE_TRMM_RIGHT_IF_JC_GT_1_ELSE_DISABLE_LEFT_IF_DP diff --git a/config/altra/bli_kernel_defs_altra.h b/config/altra/bli_kernel_defs_altra.h new file mode 100644 index 0000000000..815c593993 --- /dev/null +++ b/config/altra/bli_kernel_defs_altra.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/altra/make_defs.mk b/config/altra/make_defs.mk new file mode 100644 index 0000000000..ef1e337db6 --- /dev/null +++ b/config/altra/make_defs.mk @@ -0,0 +1,90 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := altra +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CPPROCFLAGS := -D_GNU_SOURCE +CMISCFLAGS := +CPICFLAGS := +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O2 -mcpu=neoverse-n1 +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize +ifeq ($(CC_VENDOR),gcc) +CKVECFLAGS := -mcpu=neoverse-n1 +else +ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := -mcpu=neoverse-n1 +else +$(error gcc or clang is required for this configuration.) +endif +endif + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) +else +CRVECFLAGS := $(CKVECFLAGS) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. 
+$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/altramax/QuickStart/TimeDGEMM.cfile b/config/altramax/QuickStart/TimeDGEMM.cfile new file mode 100755 index 0000000000..172edc6596 --- /dev/null +++ b/config/altramax/QuickStart/TimeDGEMM.cfile @@ -0,0 +1,143 @@ +#include +#include +#include +#include +#include +#include +#include +#include "blis.h" + +/*################################################### +// To build with openmp: +// Note: Don't need the -lomp on Linux +gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x + +// To build with pThreads +source ./enable_blis.sh +gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x + +// To run with QuickStart Macros... +for N_CORES, S_SOCKETS + +blis_set_cores_and_sockets N S; $BLIS_NUMA time_gemm.x + +###################################################*/ + +#include <stdarg.h> // for Linux stdarg + +//################################################### +// Handy blis functions +//################################################### + +// Returns 0.0 if out of matrix +double GetReal(obj_t *m, int row, int col) + { + double im = 0, re = 0; // Imaginary component + if (!m) return 0.0; + + bli_getijm(row, col, m, &re, &im); + return re; + } + +bool SetReal(obj_t *m, int row, int col, double dVal) + { + if (!m) return false; // no matrix to write into + bli_setijm(dVal, 0.0, row, col, m); + + return true; + } + +//################################################### +// The basic meat - a one shot +//################################################### + +bool TimeBlis(long size) + { + int repeat = 3; // Best Of! + double dAlpha = 1.0, dBeta = 0.0; // simplest case! 
+ + //============== Allocate matrices ============= + obj_t* alpha = (obj_t*) calloc(1, sizeof(obj_t)); + obj_t* beta = (obj_t*) calloc(1, sizeof(obj_t)); + + bli_obj_create(BLIS_DOUBLE, 1, 1, 0, 0, alpha); + bli_obj_create(BLIS_DOUBLE, 1, 1, 0, 0, beta); + + // Full gemm is alpha * A * B + beta * C + bli_setsc(dAlpha, 0.0, alpha); // alpha is one + bli_setsc(dBeta, 0.0, beta); // beta is zero + //============================================== + printf("Initializing %g GB of Matrices...\n", 8.0 * size * size * 3.0 / 1024.0 / 1024.0 / 1024.0); + + obj_t* a = (obj_t*) calloc(1, sizeof(obj_t)); + obj_t* b = (obj_t*) calloc(1, sizeof(obj_t)); + obj_t* c = (obj_t*) calloc(1, sizeof(obj_t)); + + bli_obj_create(BLIS_DOUBLE, size, size, size, 1, c); + bli_obj_create(BLIS_DOUBLE, size, size, size, 1, a); + bli_obj_create(BLIS_DOUBLE, size, size, size, 1, b); + + // Create Random matrices + // that are well conditioned and invertible + // (Note: this can be slow) + // + bli_randm(c); + bli_randm(a); + bli_randm(b); + + //============================================== + // DO the timing, blis style... + //============================================== + + double dBestTime = DBL_MAX; + + for (int i = 0; i < repeat; i++) + { + printf("Performing DGEMM %d of %d\n", i + 1, repeat); fflush(stdout); + double dStartTime = bli_clock(); + + bli_gemm(alpha, a, b, beta, c); + + // Always look at best of N for timing! 
+ dBestTime = bli_clock_min_diff( dBestTime, dStartTime ); + } + + double gflops = ( 2.0 * size * size * size ) / ( dBestTime * 1.0e9 ); + + printf("Best DGEMM run completed in %g seconds @ size= \t %ld \t %g \t gigaflops\n", + dBestTime, size, gflops); fflush(stdout); + + return true; + } + + +int main( int argc, char** argv ) + { + long size = 0; + int sweep_inc = 0; + + printf("Details of parallelism are set by environment variables.\n"); + printf("Arg1 = size=M=N=K for DGEMM\n" + "optional arg2 = size step for sweep.\n"); + + if (argc < 2) return 0; + + if (argc > 1) { + size = atol(argv[1]); + printf("User set size to %ld\n", size); + } + + if (argc > 2) { + sweep_inc = atoi(argv[2]); // arg2 is the optional sweep step; argv[3] may not exist + printf("User set sweep size inc to %d\n", sweep_inc); + } + + if (sweep_inc <= 0) TimeBlis(size); // no (or invalid) step: single timed run + else + { + for (int i = size; i >= sweep_inc; i -= sweep_inc) + TimeBlis(i); + } + + return 0; + } diff --git a/config/altramax/QuickStart/blis_build_altramax.sh b/config/altramax/QuickStart/blis_build_altramax.sh new file mode 100755 index 0000000000..99a1c39485 --- /dev/null +++ b/config/altramax/QuickStart/blis_build_altramax.sh @@ -0,0 +1,20 @@ +#!/bin/bash +echo "#######################################################" +echo "Building standard OpenMP BLIS..." +echo "#######################################################" +. ./blis_setenv.sh quiet +echo "#############################################################" +echo "Configuring BLIS for Altramax using OpenMP for parallelism..." +echo "#############################################################" +. ./blis_configure_altramax.sh quiet +echo "Switching to directory $BLIS_HOME" +pushd $BLIS_HOME > /dev/null +make -j +popd > /dev/null +if [ "$1" != "notest" ]; then + . ./blis_test.sh quiet +fi +. 
./blis_setenv.sh +echo "##########################################################" +echo "...done" +echo "##########################################################" diff --git a/config/altramax/QuickStart/blis_build_altramax_pthreads.sh b/config/altramax/QuickStart/blis_build_altramax_pthreads.sh new file mode 100755 index 0000000000..052a682f44 --- /dev/null +++ b/config/altramax/QuickStart/blis_build_altramax_pthreads.sh @@ -0,0 +1,20 @@ +#!/bin/bash +echo "#######################################################" +echo "Building pThreads version of BLIS..." +echo "#######################################################" +. ./blis_setenv.sh quiet +echo "###############################################################" +echo "Configuring BLIS for Altramax using pThreads for parallelism..." +echo "###############################################################" +. ./blis_configure_altramax_pthreads.sh quiet +echo "Switching to directory $BLIS_HOME" +pushd $BLIS_HOME > /dev/null +make -j +popd > /dev/null +if [ "$1" != "notest" ]; then + . ./blis_test.sh quiet +fi +. ./blis_setenv.sh +echo "##########################################################" +echo "...done" +echo "##########################################################" diff --git a/config/altramax/QuickStart/blis_build_both_libraries.sh b/config/altramax/QuickStart/blis_build_both_libraries.sh new file mode 100755 index 0000000000..73f2b9679f --- /dev/null +++ b/config/altramax/QuickStart/blis_build_both_libraries.sh @@ -0,0 +1,58 @@ +#!/bin/bash +echo "##########################################################" +echo "Creating both OpenMP and pThread BLIS libraries..." +echo "##########################################################" +echo "First, Creating pThread library..." +echo "##########################################################" +. ./blis_build_altramax_pthreads.sh notest + +echo "##########################################################" +echo "Saving the pThreads build..." 
+echo "##########################################################" +# Temporarily move the pthreads build +mkdir $BLIS_HOME/.tempinc +mkdir $BLIS_HOME/.templib +mv $BLIS_INC/* $BLIS_HOME/.tempinc/ +mv $BLIS_LIB/* $BLIS_HOME/.templib/ +# And rename the pthread versions of the include and library files +#echo "##########################################################" +pushd $BLIS_HOME/.tempinc/ > /dev/null +echo "Renaming pThread-enabled blis.h -> blisP.h" +mv blis.h blisP.h +popd > /dev/null +pushd $BLIS_HOME/.templib/ > /dev/null +for f in $(ls -1); do + destf=${f/blis/blisP} + echo "Renaming pThread library $f -> $destf" + mv "$f" "$destf" + + # Fix the symbolic links + if [[ -L "$destf" ]]; then + target=$(readlink $destf) + target=${target/blis/blisP} + \rm "$destf" + ln -s "$target" "$destf" + fi +done +popd > /dev/null +echo "##########################################################" + +echo "##########################################################" +echo "Second, Creating OpenMP library..." +echo "##########################################################" +. ./blis_build_altramax.sh notest + +echo "##########################################################" +echo "Restoring the pThreads build..." +echo "##########################################################" +# And move the pthread versions back +mv $BLIS_HOME/.tempinc/* $BLIS_INC/ +mv $BLIS_HOME/.templib/* $BLIS_LIB/ +rmdir $BLIS_HOME/.tempinc +rmdir $BLIS_HOME/.templib + +. ./blis_test.sh quiet +. ./blis_setenv.sh +echo "##########################################################" +echo "Done creating BLIS libraries..." 
+echo "##########################################################" diff --git a/config/altramax/QuickStart/blis_configure_altramax.sh b/config/altramax/QuickStart/blis_configure_altramax.sh new file mode 100755 index 0000000000..4cd02c6845 --- /dev/null +++ b/config/altramax/QuickStart/blis_configure_altramax.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +if [ "$1" = "quiet" ]; then + quiet_confopenmp="quiet" +else + quiet_confopenmp="" +fi + +if [ "$quiet_confopenmp" = "" ]; then + echo "#############################################################" + echo "Configuring BLIS for Altramax using OpenMP for parallelism..." + echo "#############################################################" +fi + +. ./blis_setenv.sh $quiet_confopenmp +pushd $BLIS_HOME > /dev/null +make distclean +./configure -t openmp --disable-pba-pools altramax +popd > /dev/null + diff --git a/config/altramax/QuickStart/blis_configure_altramax_pthreads.sh b/config/altramax/QuickStart/blis_configure_altramax_pthreads.sh new file mode 100755 index 0000000000..69d9ecc2fe --- /dev/null +++ b/config/altramax/QuickStart/blis_configure_altramax_pthreads.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +if [ "$1" = "quiet" ]; then + quiet_confpthreads="quiet" +else + quiet_confpthreads="" +fi + +if [ "$quiet_confpthreads" = "" ]; then + echo "###############################################################" + echo "Configuring BLIS for Altramax using pThreads for parallelism..." + echo "###############################################################" +fi + +. ./blis_setenv.sh $quiet_confpthreads +pushd $BLIS_HOME > /dev/null +make distclean +./configure -t pthreads --disable-pba-pools altramax +popd > /dev/null + diff --git a/config/altramax/QuickStart/blis_quick_start_altramax.txt b/config/altramax/QuickStart/blis_quick_start_altramax.txt new file mode 100755 index 0000000000..efccf28a1b --- /dev/null +++ b/config/altramax/QuickStart/blis_quick_start_altramax.txt @@ -0,0 +1,200 @@ +Welcome to the Altramax Platform! 
We've made some scripts to help you build and use blis, +but feel free to look at them for your own inspiration. +Note that all the provided scripts must be SOURCED, NOT executed! This is because they +set up environment variables needed for the steps below. + +Using BLIS requires a few steps: + +1) Configuring the library +2) Building the library & validating it +3) Linking your program with BLIS +4) Setting the environment parameters for an optimized blis to run your program + +Let's briefly touch on these points, and how the scripts provided can help +But first, let's make sure your configuration is correct... + +Open blis_setenv.sh +In the Platform Specific: section, around line 50 or so, you will see: +firmware=205 +or +firmware=204 + +If your firmware is version 2.05 or greater (most likely), make sure this is set to 205, +else make sure it's set to 204. Ampere changed the CoreID mappings between these +versions around May 2022. + +Note: the scripts referenced here modify environment variables, so they must be sourced. +E.g., with + source +or + . + +=================================================== +1) Configuring the library +2) Building the library & validating it +=================================================== + +There are custom configuration options for Altramax, but, as a user, your main decision is +whether you want BLIS to use OpenMP or pthreads for parallelism? OpenMP is the default +option, since OpenMP allows thread pinning and thus results in better performance. +To build with OpenMP use: + +. ./blis_build_altramax.sh + +However, some platforms (like MacOS) cannot use OpenMP at all. In this case, you want +to build the pThreads version of BLIS: + +. ./blis_build_altramax_pthreads.sh + +In both cases, it will create libblis.a in $BLIS_HOME/lib/$BLIS_ARCH + +Try doing that in the root blis directory, depending on your OS. + +LINUX: +. ./blis_build_altramax.sh + +MacOS Apple Silicon: +. 
./blis_build_altramax_pthreads.sh + +---------------------------------------------------------------------------- +HOWEVER, there is a tricky case: If you link BLIS with a program that uses pThreads, you +MUST use the pthreads version of BLIS, even though it will be slower. This is because +there is a bug in which attempting to use both pthreads AND OpenMP will pin all threads to +a single core and essentially freeze your program. + +If this is a possibility, you may want to have both libraries available and switch between +them for each application. The script: + +. ./blis_build_both_libraries.sh + +will build both versions, with the pThreads version being called libblisP.a, and a second +header blisP.h +This is a little inconvenient, and we're working on improving the situation in the near +future. +---------------------------------------------------------------------------- + +The build will additionally check the library, but if you would like to check a la carte, do + +. ./blis_test.sh + +You should see near the bottom: +check-blastest.sh: All BLAS tests passed! +check-blistest.sh: All BLIS tests passed! + +-------------------------------- +Finally, here's a script that will be important when you are doing testing. +This performs the important step of unsetting any parameters affecting blis parallelism. + +. ./blis_unset_par.sh + +=================================================== +3) Building and Linking your program with BLIS +=================================================== + +This depends on whether you are using the pThreads version of BLIS or the OpenMP version... +Note this uses the BLIS locations automatically defined when sourcing blis_setenv.sh + +. ./blis_setenv.sh + +(This will display your environment variable settings, your blis libraries and headers (if +built), and also unset blis parallelism parameters for safety.) 
+ +// BUILDING your app with the OpenMP version of BLIS: +// Note: Don't need -lomp on Linux + +gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o MyExe + +// To build with pThreads +gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o MyExe + +// NOTE: If you used the scripts to build BOTH versions of blis, then use the renamed blis lib: +gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH MyFiles.c $BLIS_HOME/lib/$BLIS_ARCH/libblisP.a -lpthread -lm -o MyExe + +Let's try building a sample program that we've included to test BLIS: +TimeDGEMM.c + +If this is a new terminal session, make sure to: +. ./blis_setenv.sh +(there's no harm in running it again.) + +Linux: +gcc -fopenmp -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x + +Apple Silicon: +gcc -O2 -g -I$BLIS_HOME/include/$BLIS_ARCH TimeDGEMM.c $BLIS_HOME/lib/$BLIS_ARCH/libblis.a -lpthread -lm -o time_gemm.x + +But don't try a timed run, yet - there's some runtime setup that needs to be done... + +=================================================== +4) Setting the environment parameters for an optimized blis to run your program +=================================================== + +The performance of some BLAS libraries is very sensitive to the compiler or the page size. +BLIS is not sensitive to either of these things, but it IS extremely dependent on pinning +the right threads to the right cores. We have scripts to help... + +. ./blis_setenv.sh + +This not only tells you where blis is, but it also creates shell functions to set +affinity, threading, and NUMA control for each run. There is a shell function created +that you can call to set up how your threads will be pinned and used: + +blis_set_cores_and_sockets TOTAL_CORES SOCKETS + +Specifying the number of sockets is important because BLIS is configured very differently +for one vs two sockets. 
+ +Example: +# Set up for a run with 128 total cores, half on each of 2 sockets. +blis_set_cores_and_sockets 128 2 + +You can also use the following aliases: +blis_set_cores_1S 128 # Run 128 cores on 1 socket +blis_set_cores_2S 256 # Run 256 cores across 2 sockets, 128 on each + +NOTE that at the moment, for multi-threaded BLIS, we only support active thread counts +that are a multiple of 8. +If you want to test single threaded performance, you can set + +export BLIS_NUM_THREADS=1 + +Launching your executable: + +If your application is MyExe, your commands to perform an optimized BLIS run might look +like this: + +blis_set_cores_1S 128 +$BLIS_NUMA MyExe + +This will set cpu affinity correctly, set BLIS parallelism correctly, set the NUMA +mode correctly, and launch your EXE. + +--------------------------------------------------- + +Let's try an example using the executable that you created in section 3, remembering that +if you're on an Apple Silicon Mac, make sure that you don't use more cores than you have. +(For example, 8 on an M1 Max.) + +Apple Silicon: (No NUMA is needed for Apple platforms.) + +blis_set_cores_1S 8; ./time_gemm.x 8000 +(in tests, we obtained about 95% of peak with Neon64 - about 366 Gigaflops) + +AltraMax Single Socket: +blis_set_cores_1S 128; $BLIS_NUMA ./time_gemm.x 12000 +(in tests, we obtained about 2.6 TF, or 85% of peak) + +CONGRATULATIONS! You're ready to use BLIS! + +=================================================== +Performance Note: +=================================================== +We continue to enhance BLIS performance on the Altramax. +One current issue is that not all variants of triangular operations obtain full +performance. + +For TRSM, best performance is with left triangular operations. +For TRMM, DUAL SOCKET, best performance is with left triangular operations. +For TRMM, SINGLE SOCKET, best performance is with right triangular operations. 
+ + diff --git a/config/altramax/QuickStart/blis_quick_start_uninstall_altramax.sh b/config/altramax/QuickStart/blis_quick_start_uninstall_altramax.sh new file mode 100755 index 0000000000..a36be40fda --- /dev/null +++ b/config/altramax/QuickStart/blis_quick_start_uninstall_altramax.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# This utility will remove all the configuration +# Specific QuickStart files from the blis directory. +# This is very useful when switching configurations! +# +if [[ -n "$BLIS_HOME" ]]; then + echo "REMOVING ALL ALTRAMAX QUICKSTART FILES FROM $BLIS_HOME" + + rm $BLIS_HOME/blis_build_altramax_pthreads.sh + rm $BLIS_HOME/blis_build_altramax.sh + rm $BLIS_HOME/blis_build_both_libraries.sh + + rm $BLIS_HOME/blis_configure_altramax_pthreads.sh + rm $BLIS_HOME/blis_configure_altramax.sh + + rm $BLIS_HOME/blis_quick_start_altramax.txt + rm $BLIS_HOME/blis_setenv.sh + + rm $BLIS_HOME/blis_unset_par.sh + rm $BLIS_HOME/blis_test.sh + + rm $BLIS_HOME/TimeDGEMM.c + rm $BLIS_HOME/time_gemm.x + + rm $BLIS_HOME/blis_quick_start_uninstall_altramax.sh + +else + echo "ONLY USE THIS SCRIPT FROM THE BLIS HOME DIRECTORY!" + echo "BLIS_HOME is not set!" +fi diff --git a/config/altramax/QuickStart/blis_setenv.sh b/config/altramax/QuickStart/blis_setenv.sh new file mode 100755 index 0000000000..0b2cbbe06d --- /dev/null +++ b/config/altramax/QuickStart/blis_setenv.sh @@ -0,0 +1,183 @@ +#!/bin/bash +####################################################################### +# Brought to you by Oracle Labs +####################################################################### +# Tested in bash and zsh +####################################################################### +# Sets up all the environment variables needed for running blis. +# For this reason, the script MUST be sourced, NOT executed! +# Needs to be run from BLIS directory to have a portable definition of +# BLIS_HOME. 
If this setup doesn't work for you, you may hard code +# the path to BLIS_HOME, but then be careful if you copy or move it! +####################################################################### +# This is the top level blis directory - it is recommended to set to an absolute path +# Can be overridden by user to be called anywhere, but then less portable +# export BLIS_HOME=. +# PORTABLE - Set BLIS_HOME to the blis directory containing this script +# We need to get the full path to the file in case this is called from another directory + +if [ "$1" = "quiet" ]; then + quiet_setenv="quiet" +else + quiet_setenv="" +fi + +if [[ -n "$BASH_VERSION" ]] ; then + file_path_and_name="$( dirname "${BASH_SOURCE[0]}" )/blis_set_home_dir.sh" +else + file_path_and_name="$( dirname "$0" )/blis_set_home_dir.sh" +fi + +if [ -f "$file_path_and_name" ] ; then + . "$file_path_and_name" quiet +else + echo "ERROR - this file is not being executed from a blis home directory." + echo "If you cannot use this script in a home directory, you can hardcode" + echo "the absolute location of BLIS_HOME in blis_setenv.sh, but this" + echo "is then less portable and more error prone with multiple blis" + echo "directories." + return +fi + +####################################################################### +# Platform Specific: +# Important! Set the firmware flag to 204 for 2.04 or earlier, +# and 205 for 2.05 or later. 
+firmware=205 +# Use altramax for both single and double socket - this might change +export BLIS_ARCH="altramax" +export BLIS_LIB=$BLIS_HOME/lib/$BLIS_ARCH +export BLIS_INC=$BLIS_HOME/include/$BLIS_ARCH + + +# Verify: +if [ "$quiet_setenv" = "" ]; then + echo "BLIS_HOME set to $BLIS_HOME" + echo "BLIS_INC set to $BLIS_INC" + echo "-----------------------------------------------------------------" + ls -l $BLIS_INC + echo "-----------------------------------------------------------------" + echo "BLIS_LIB set to $BLIS_LIB" + echo "-----------------------------------------------------------------" + ls -l $BLIS_LIB + echo "-----------------------------------------------------------------" +fi + +# Affinity Macros, etc +export BLIS_NUMA="numactl --localalloc" + +# Use with firmware versions 2.04 and earlier. +# You can check the firmware version using dmidecode + +export BLIS_AFFINITY_2S_2_04="0 64 32 96 4 68 36 100 1 65 33 97 5 69 37 101 2 66 34 98 6 70 38 102 3 67 35 99 7 71 39 103 8 72 40 104 12 76 44 108 9 73 41 105 13 77 45 109 10 74 42 106 14 78 46 110 11 75 43 107 15 79 47 111 16 80 48 112 20 84 52 116 17 81 49 113 21 85 53 117 18 82 50 114 22 86 54 118 19 83 51 115 23 87 55 119 24 88 56 120 26 90 58 122 25 89 57 121 27 91 59 123 28 92 60 124 30 94 62 126 29 93 61 125 31 95 63 127 128 192 160 224 132 196 164 228 129 193 161 225 133 197 165 229 130 194 162 226 134 198 166 230 131 195 163 227 135 199 167 231 136 200 168 232 140 204 172 236 137 201 169 233 141 205 173 237 138 202 170 234 142 206 174 238 139 203 171 235 143 207 175 239 144 208 176 240 148 212 180 244 145 209 177 241 149 213 181 245 146 210 178 242 150 214 182 246 147 211 179 243 151 215 183 247 152 216 184 248 154 218 186 250 153 217 185 249 155 219 187 251 156 220 188 252 158 222 190 254 157 221 189 253 159 223 191 255" + +export BLIS_AFFINITY_1S_2_04="0 64 32 96 4 68 36 100 1 65 33 97 5 69 37 101 2 66 34 98 6 70 38 102 3 67 35 99 7 71 39 103 8 72 40 104 12 76 44 108 9 73 41 105 13 77 45 109 10 74 
42 106 14 78 46 110 11 75 43 107 15 79 47 111 16 80 48 112 20 84 52 116 17 81 49 113 21 85 53 117 18 82 50 114 22 86 54 118 19 83 51 115 23 87 55 119 24 88 56 120 26 90 58 122 25 89 57 121 27 91 59 123 28 92 60 124 30 94 62 126 29 93 61 125 31 95 63 127" + +# Use with firmware versions 2.05 and later +# You can check the firmware version using dmidecode + +export BLIS_AFFINITY_2S_2_05="0 1 64 65 8 9 72 73 2 3 66 67 10 11 74 75 4 5 68 69 12 13 76 77 6 7 70 71 14 15 78 79 16 17 80 81 24 25 88 89 18 19 82 83 26 27 90 91 20 21 84 85 28 29 92 93 22 23 86 87 30 31 94 95 32 33 96 97 40 41 104 105 34 35 98 99 42 43 106 107 36 37 100 101 44 45 108 109 38 39 102 103 46 47 110 111 48 49 112 113 52 53 116 117 50 51 114 115 54 55 118 119 56 57 120 121 60 61 124 125 58 59 122 123 62 63 126 127 128 129 192 193 136 137 200 201 130 131 194 195 138 139 202 203 132 133 196 197 140 141 204 205 134 135 198 199 142 143 206 207 144 145 208 209 152 153 216 217 146 147 210 211 154 155 218 219 148 149 212 213 156 157 220 221 150 151 214 215 158 159 222 223 160 161 224 225 168 169 232 233 162 163 226 227 170 171 234 235 164 165 228 229 172 173 236 237 166 167 230 231 174 175 238 239 176 177 240 241 180 181 244 245 178 179 242 243 182 183 246 247 184 185 248 249 188 189 252 253 186 187 250 251 190 191 254 255" + +export BLIS_AFFINITY_1S_2_05="0 1 64 65 8 9 72 73 2 3 66 67 10 11 74 75 4 5 68 69 12 13 76 77 6 7 70 71 14 15 78 79 16 17 80 81 24 25 88 89 18 19 82 83 26 27 90 91 20 21 84 85 28 29 92 93 22 23 86 87 30 31 94 95 32 33 96 97 40 41 104 105 34 35 98 99 42 43 106 107 36 37 100 101 44 45 108 109 38 39 102 103 46 47 110 111 48 49 112 113 52 53 116 117 50 51 114 115 54 55 118 119 56 57 120 121 60 61 124 125 58 59 122 123 62 63 126 127" + +# Parallelism on the Altramax is very flat: + +# Set JC to number of sockets: +export BLIS_JC_NT=2 + +# Set JR to groups of 8: +export BLIS_JR_NT=8 + +# Set IC to the number of cores per socket / 8: +export BLIS_IC_NT=16 + +# Experimental: Allow you to set 
threading and +# Core affinity on single or dual sockets for +# N threads. Currently, we only support N as +# a multple of 8 + +# Maximum Altramax cores per socket +CPS=128 + +# Use Bash Arrays: + + +if (($firmware == 204)); then + arrayCoreIDs=(0 64 32 96 4 68 36 100 1 65 33 97 5 69 37 101 2 66 34 98 6 70 38 102 3 67 35 99 7 71 39 103 8 72 40 104 12 76 44 108 9 73 41 105 13 77 45 109 10 74 42 106 14 78 46 110 11 75 43 107 15 79 47 111 16 80 48 112 20 84 52 116 17 81 49 113 21 85 53 117 18 82 50 114 22 86 54 118 19 83 51 115 23 87 55 119 24 88 56 120 26 90 58 122 25 89 57 121 27 91 59 123 28 92 60 124 30 94 62 126 29 93 61 125 31 95 63 127 128 192 160 224 132 196 164 228 129 193 161 225 133 197 165 229 130 194 162 226 134 198 166 230 131 195 163 227 135 199 167 231 136 200 168 232 140 204 172 236 137 201 169 233 141 205 173 237 138 202 170 234 142 206 174 238 139 203 171 235 143 207 175 239 144 208 176 240 148 212 180 244 145 209 177 241 149 213 181 245 146 210 178 242 150 214 182 246 147 211 179 243 151 215 183 247 152 216 184 248 154 218 186 250 153 217 185 249 155 219 187 251 156 220 188 252 158 222 190 254 157 221 189 253 159 223 191 255) +elif (($firmware == 205)); then + arrayCoreIDs=(0 1 64 65 8 9 72 73 2 3 66 67 10 11 74 75 4 5 68 69 12 13 76 77 6 7 70 71 14 15 78 79 16 17 80 81 24 25 88 89 18 19 82 83 26 27 90 91 20 21 84 85 28 29 92 93 22 23 86 87 30 31 94 95 32 33 96 97 40 41 104 105 34 35 98 99 42 43 106 107 36 37 100 101 44 45 108 109 38 39 102 103 46 47 110 111 48 49 112 113 52 53 116 117 50 51 114 115 54 55 118 119 56 57 120 121 60 61 124 125 58 59 122 123 62 63 126 127 128 129 192 193 136 137 200 201 130 131 194 195 138 139 202 203 132 133 196 197 140 141 204 205 134 135 198 199 142 143 206 207 144 145 208 209 152 153 216 217 146 147 210 211 154 155 218 219 148 149 212 213 156 157 220 221 150 151 214 215 158 159 222 223 160 161 224 225 168 169 232 233 162 163 226 227 170 171 234 235 164 165 228 229 172 173 236 237 166 167 230 231 174 175 238 239 176 
177 240 241 180 181 244 245 178 179 242 243 182 183 246 247 184 185 248 249 188 189 252 253 186 187 250 251 190 191 254 255) +else + echo "ERROR - UNSUPPORTED FIRMWARE $firmware" + return 1 2>/dev/null || exit 1 +fi + +# Brief check: @ = list all numbers, loop for i in ${}; do ... done +# for Array Size, do ${#arr[@]} +# echo "CoreID array has ${#arrayCoreIDs[@]} elements" +# echo "CoreID array set to: ${arrayCoreIDs[@]}" + +# Give the TOTAL core count: +# Single socket runs +blis_set_cores_and_sockets() { + cores=$1 + sockets=$2 + # echo "Cores = $cores, sockets=$sockets" + + # Round up to nearest 8 cores per socket: + cores_per_group=8 + if (( $sockets == 2 )); then + cores_per_group=16; + fi + core_round_inc=$(($cores_per_group-1)) + + cores_per_socket=$(($cores)) + cores=$(($cores + $core_round_inc)) + groups_per_socket=$(($cores / $cores_per_group)) + rounded_cores=$(( $groups_per_socket * $cores_per_group )) + + # echo "Rounded Cores = $rounded_cores" + # echo "Groups Per Socket = $groups_per_socket" + + # set the parallelism for one socket with N cores: + # Set JC to number of sockets: + export BLIS_JC_NT=$sockets + + # Set JR to groups of 8: + export BLIS_JR_NT=8 + + # Set IC to the number of cores per socket / 8: + export BLIS_IC_NT=$groups_per_socket + + # Using an old version of zsh syntax that's compatible with bash + + if (( $sockets == 1 )); then + + # Simple single socket case + # quotes + # export GOMP_CPU_AFFINITY="\"${arrayCoreIDs[@]:0:$rounded_cores}\"" + # No quotes... + export GOMP_CPU_AFFINITY="${arrayCoreIDs[@]:0:$rounded_cores}" + + else + + # Dual socket case + half_cores=$(( $rounded_cores / 2 )) + # echo "Half cores are $half_cores" + # quotes + # export GOMP_CPU_AFFINITY="\"${arrayCoreIDs[@]:0:$half_cores} ${arrayCoreIDs[@]:$CPS:$half_cores}\"" + # No quotes + export GOMP_CPU_AFFINITY="${arrayCoreIDs[@]:0:$half_cores} ${arrayCoreIDs[@]:$CPS:$half_cores}" + fi + + echo "Activating $rounded_cores cores across $sockets sockets..." 
+ echo "GOMP_CPU_AFFINITY set to $GOMP_CPU_AFFINITY" + echo "JC/IC/JR = $BLIS_JC_NT/$BLIS_IC_NT/$BLIS_JR_NT" + } + +# Convenience functions: +blis_set_cores_1S() { blis_set_cores_and_sockets $1 1 ; } +blis_set_cores_2S() { blis_set_cores_and_sockets $1 2 ; } + +# For safety: +. ./blis_unset_par.sh + diff --git a/config/altramax/QuickStart/blis_test.sh b/config/altramax/QuickStart/blis_test.sh new file mode 100755 index 0000000000..b6153ea604 --- /dev/null +++ b/config/altramax/QuickStart/blis_test.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +if [ "$1" = "quiet" ]; then + quiet_blistest="quiet" +else + quiet_blistest="" +fi + +# We don't want to quiet this part: +echo "#################################################################" +echo "Simple testing of BLIS - use testsuite for more extensive tests." +echo "#################################################################" + +. ./blis_setenv.sh $quiet_blistest +# It's critical to unset parallelism parameters before +# running the test code! +. ./blis_unset_par.sh quiet +echo "Switching to directory $BLIS_HOME" +pushd $BLIS_HOME > /dev/null +make check -j +popd > /dev/null + diff --git a/config/altramax/QuickStart/blis_unset_par.sh b/config/altramax/QuickStart/blis_unset_par.sh new file mode 100755 index 0000000000..6310a6f8e8 --- /dev/null +++ b/config/altramax/QuickStart/blis_unset_par.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +if [ "$1" = "quiet" ]; then + quiet_unsetpar="quiet" +else + quiet_unsetpar="" +fi + +if [ "$quiet_unsetpar" = "" ]; then + echo "#########################################################" + echo " UNSETTING BLIS ENVIRONMENT VARIABLES THAT SET THREADING" + echo " AND AFFINITY." 
+ echo "#########################################################" +fi + +unset BLIS_JC_NT +unset BLIS_JR_NT +unset BLIS_IC_NT +unset BLIS_NUM_THREADS +unset OMP_NUM_THREADS +unset GOMP_CPU_AFFINITY + diff --git a/config/altramax/bli_cntx_init_altramax.c b/config/altramax/bli_cntx_init_altramax.c new file mode 100644 index 0000000000..1219468740 --- /dev/null +++ b/config/altramax/bli_cntx_init_altramax.c @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, Oracle Labs, Oracle Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_cntx_init_altramax( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_altramax_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs + ( + cntx, + + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 120, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 480, -1, -1 ); // Changed d to 480 - LDR +// bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 6144, -1, -1 ); // Doubled NC + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 12288, 8192, -1, -1 ); // Increased NC slightly more + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. 
+ bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + BLIS_VA_END + ); +} + diff --git a/frame/include/level0/ri/bli_neg2ris.h b/config/altramax/bli_family_altramax.h similarity index 82% rename from frame/include/level0/ri/bli_neg2ris.h rename to config/altramax/bli_family_altramax.h index 860b144cff..2594ed73ab 100644 --- a/frame/include/level0/ri/bli_neg2ris.h +++ b/config/altramax/bli_family_altramax.h @@ -32,32 +32,17 @@ */ -#ifndef BLIS_NEG2RIS_H -#define BLIS_NEG2RIS_H - -// neg2ris - -#define bli_sneg2ris( ar, ai, br, bi ) \ -{ \ - (br) = -(ar); \ -} - -#define bli_dneg2ris( ar, ai, br, bi ) \ -{ \ - (br) = -(ar); \ -} - -#define bli_cneg2ris( ar, ai, br, bi ) \ -{ \ - (br) = -(ar); \ - (bi) = -(ai); \ -} - -#define bli_zneg2ris( ar, ai, br, bi ) \ -{ \ - (br) = -(ar); \ - (bi) = -(ai); \ -} - -#endif +//#ifndef BLIS_FAMILY_H +//#define BLIS_FAMILY_H + +// Version with 16 byte alignment and jr=8 + +#define BLIS_THREAD_MAX_JR 8 + +// -- MEMORY ALLOCATION -------------------------------------------------------- + +#define BLIS_SIMD_ALIGN_SIZE 16 + +#define BLIS_FORCE_ROLL_PACKM_REF_KERNEL +#define BLIS_DISABLE_TRMM_RIGHT_IF_JC_GT_1_ELSE_DISABLE_LEFT_IF_DP diff --git a/config/altramax/bli_kernel_defs_altramax.h b/config/altramax/bli_kernel_defs_altramax.h new file mode 100644 index 0000000000..815c593993 --- /dev/null +++ b/config/altramax/bli_kernel_defs_altramax.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2023, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/altramax/make_defs.mk b/config/altramax/make_defs.mk new file mode 100644 index 0000000000..35bd7de489 --- /dev/null +++ b/config/altramax/make_defs.mk @@ -0,0 +1,90 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := altramax +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CPPROCFLAGS := -D_GNU_SOURCE +CMISCFLAGS := +CPICFLAGS := +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O2 -mcpu=neoverse-n1 +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize +ifeq ($(CC_VENDOR),gcc) +CKVECFLAGS := -mcpu=neoverse-n1 +else +ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := -mcpu=neoverse-n1 +else +$(error gcc or clang is required for this configuration.) +endif +endif + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) +else +CRVECFLAGS := $(CKVECFLAGS) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. 
+$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/amd64/bli_family_amd64.h b/config/amd64/bli_family_amd64.h index 278c228182..4791cceeb5 100644 --- a/config/amd64/bli_family_amd64.h +++ b/config/amd64/bli_family_amd64.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,15 +32,8 @@ */ -//#ifndef BLIS_FAMILY_H -//#define BLIS_FAMILY_H +#ifndef BLIS_FAMILY_AMD64_H +#define BLIS_FAMILY_AMD64_H - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -#define BLIS_SIMD_ALIGN_SIZE 16 - - - -//#endif +#endif diff --git a/config/amd64/make_defs.mk b/config/amd64/make_defs.mk index b9232ac6c8..bbe4d8d5f6 100644 --- a/config/amd64/make_defs.mk +++ b/config/amd64/make_defs.mk @@ -1,10 +1,10 @@ # # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # -# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -47,7 +47,7 @@ THIS_CONFIG := amd64 # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) @@ -60,29 +60,8 @@ else COPTFLAGS := -O2 endif -# Flags specific to optimized kernels. -CKOPTFLAGS := $(COPTFLAGS) -O3 -ifeq ($(CC_VENDOR),gcc) -CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -else -ifeq ($(CC_VENDOR),clang) -CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -else -$(error gcc or clang are required for this configuration.) 
-endif -endif - -# Flags specific to reference kernels. -CROPTFLAGS := $(CKOPTFLAGS) -ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast -else -ifeq ($(CC_VENDOR),clang) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast -else -CRVECFLAGS := $(CKVECFLAGS) -endif -endif +# Setting for reference and optimized kernels are taken from individual +# subconfiguration makefile fragments in this family. # Store all of the variables here to new variables containing the # configuration name. diff --git a/frame/include/level0/1r/bli_invert1rs.h b/config/amd64_legacy/bli_family_amd64_legacy.h similarity index 90% rename from frame/include/level0/1r/bli_invert1rs.h rename to config/amd64_legacy/bli_family_amd64_legacy.h index 16f7283fd7..c4f84885f7 100644 --- a/frame/include/level0/1r/bli_invert1rs.h +++ b/config/amd64_legacy/bli_family_amd64_legacy.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,12 +33,10 @@ */ -#ifndef BLIS_INVERT1RS_H -#define BLIS_INVERT1RS_H +#ifndef BLIS_FAMILY_AMD64_LEGACY_H +#define BLIS_FAMILY_AMD64_LEGACY_H -// invert1rs - -#define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) -#define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) +// Placeholder for bundle configuration. #endif + diff --git a/config/amd64_legacy/make_defs.mk b/config/amd64_legacy/make_defs.mk new file mode 100644 index 0000000000..914f533ae0 --- /dev/null +++ b/config/amd64_legacy/make_defs.mk @@ -0,0 +1,70 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := amd64_legacy +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. 
+CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := -fPIC +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O2 +endif + +# Setting for reference and optimized kernels are taken from individual +# subconfiguration makefile fragments in this family. + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/arm32/make_defs.mk b/config/arm32/make_defs.mk index e6818a19d7..ee95296386 100644 --- a/config/arm32/make_defs.mk +++ b/config/arm32/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := arm32 # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -mfloat-abi=hard -mfpu=neon -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/arm64/bli_family_arm64.h b/config/arm64/bli_family_arm64.h index 278c228182..3fb08fc422 100644 --- a/config/arm64/bli_family_arm64.h +++ b/config/arm64/bli_family_arm64.h @@ -40,6 +40,19 @@ #define BLIS_SIMD_ALIGN_SIZE 16 +#define BLIS_SIMD_MAX_SIZE 128 // Note: The default is 64. +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 + +// SVE-specific configs. +#define N_L1_SVE_DEFAULT 64 +#define W_L1_SVE_DEFAULT 4 +#define C_L1_SVE_DEFAULT 256 +#define N_L2_SVE_DEFAULT 2048 +#define W_L2_SVE_DEFAULT 16 +#define C_L2_SVE_DEFAULT 256 +#define N_L3_SVE_DEFAULT 8192 +#define W_L3_SVE_DEFAULT 16 +#define C_L3_SVE_DEFAULT 256 //#endif diff --git a/config/arm64/make_defs.mk b/config/arm64/make_defs.mk index fc1a062e68..1f8c2e84b5 100644 --- a/config/arm64/make_defs.mk +++ b/config/arm64/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := arm64 # may specify additional flags here as needed. 
CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index fafed2229b..b1246bcd71 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -33,18 +33,25 @@ */ #include "blis.h" -#include "bli_armsve_config_utils.h" void bli_cntx_init_armsve( cntx_t* cntx ) { - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; -#if 0 - blksz_t thresh[ BLIS_NUM_THRESH ]; -#endif - // Set default kernel blocksizes and functions. bli_cntx_init_armsve_ref( cntx ); + // If we are autodetecting the correct aarch64 config, then we have to make sure + // that SVE instructions are actually available since these are used in determining + // the register blocksizes. + #ifdef BLIS_FAMILY_ARM64 + uint32_t family, model, features = 0; + bli_cpuid_query( &family, &model, &features ); + + if ( ! bli_cpuid_has_features( features, FEATURE_SVE ) ) + return; + #endif + + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + // ------------------------------------------------------------------------- // Block size. @@ -57,35 +64,54 @@ void bli_cntx_init_armsve( cntx_t* cntx ) bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c); bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z); - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, + cntx, + + // level-3 // These are vector-length agnostic kernels. Yet knowing mr is required at runtime. 
- BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, - cntx + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Set VL-specific packing routines if applicable. - if (m_r_d==16) - bli_cntx_set_packm_kers + if ( m_r_d == 16 ) + { + bli_cntx_set_ukrs ( - 2, - BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, - BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, - cntx + cntx, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16x10, + BLIS_VA_END ); - else if (m_r_d==8) - bli_cntx_set_packm_kers + } + else if ( m_r_d == 8 ) + { + bli_cntx_set_ukrs ( - 1, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk, - cntx + cntx, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8x10, + BLIS_VA_END ); + } // Initialize level-3 blocksize objects with architecture-specific values. // s d c z @@ -99,64 +125,16 @@ void bli_cntx_init_armsve( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); -#if 0 - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. 
- bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + BLIS_VA_END ); -#endif } diff --git a/config/armsve/bli_family_armsve.h b/config/armsve/bli_family_armsve.h index b67ae7c606..f2837459d1 100644 --- a/config/armsve/bli_family_armsve.h +++ b/config/armsve/bli_family_armsve.h @@ -38,8 +38,8 @@ // -- MEMORY ALLOCATION -------------------------------------------------------- -#define BLIS_SIMD_ALIGN_SIZE 256 -#define BLIS_SIMD_NUM_REGISTERS 32 +#define BLIS_SIMD_ALIGN_SIZE 256 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 // SVE-specific configs. #define N_L1_SVE_DEFAULT 64 diff --git a/frame/include/level0/old/ri3/bli_copyri3s.h b/config/armsve/bli_kernel_defs_armsve.h similarity index 73% rename from frame/include/level0/old/ri3/bli_copyri3s.h rename to config/armsve/bli_kernel_defs_armsve.h index 86ec79b0a8..8c9c0b0dd6 100644 --- a/frame/include/level0/old/ri3/bli_copyri3s.h +++ b/config/armsve/bli_kernel_defs_armsve.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,34 +32,27 @@ */ -#ifndef BLIS_COPYRI3S_H -#define BLIS_COPYRI3S_H +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H -// copyri3s -#define bli_scopyri3s( ar, ai, br, bi, bri ) \ -{ \ - (br) = (ar); \ -} +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define bli_dcopyri3s( ar, ai, br, bi, bri ) \ -{ \ - (br) = (ar); \ -} +// +// The armsve configuration handles both 256-bit and 512-bit SVE vectors, +// so it is not possible to define specific register block sizes. Thus, +// armsve can't use reference kernels! 
+// -#define bli_ccopyri3s( ar, ai, br, bi, bri ) \ -{ \ - (br) = (ar); \ - (bi) = (ai); \ - (bri) = (ar) + (ai); \ -} +#define BLIS_MR_s -1 +#define BLIS_MR_d -1 +#define BLIS_MR_c -1 +#define BLIS_MR_z -1 -#define bli_zcopyri3s( ar, ai, br, bi, bri ) \ -{ \ - (br) = (ar); \ - (bi) = (ai); \ - (bri) = (ar) + (ai); \ -} +#define BLIS_NR_s 10 +#define BLIS_NR_d 10 +#define BLIS_NR_c 10 +#define BLIS_NR_z 10 -#endif +//#endif diff --git a/config/armsve/make_defs.mk b/config/armsve/make_defs.mk index d3495efbb8..340b52f316 100644 --- a/config/armsve/make_defs.mk +++ b/config/armsve/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := armsve # may specify additional flags here as needed. CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/bgq/bli_cntx_init_bgq.c b/config/bgq/bli_cntx_init_bgq.c index 782c441b97..a61d1b95d4 100644 --- a/config/bgq/bli_cntx_init_bgq.c +++ b/config/bgq/bli_cntx_init_bgq.c @@ -43,35 +43,52 @@ void bli_cntx_init_bgq( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
// s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 1024, 0, 768 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 2048, 0, 1536 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 10240, 0, 10240 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 8, -1, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 1024, -1, 768 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 2048, -1, 1536 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 10240, -1, 10240 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/bgq/bli_kernel_defs_bgq.h b/config/bgq/bli_kernel_defs_bgq.h new file mode 100644 index 0000000000..bd3962e45a --- /dev/null +++ b/config/bgq/bli_kernel_defs_bgq.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 8 +#define BLIS_MR_z 4 + +#define BLIS_NR_d 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 0cbbf439d5..fa4479956b 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -58,7 +58,7 @@ CMISCFLAGS := -fopenmp else $(error xlc or bgclang is required for this configuration.) 
endif -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := -w ifneq ($(DEBUG_TYPE),off) diff --git a/config/bulldozer/bli_cntx_init_bulldozer.c b/config/bulldozer/bli_cntx_init_bulldozer.c index 9f6e83d6ba..5b056f591f 100644 --- a/config/bulldozer/bli_cntx_init_bulldozer.c +++ b/config/bulldozer/bli_cntx_init_bulldozer.c @@ -43,16 +43,32 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/frame/include/level0/old/ro/bli_scal2jros.h b/config/bulldozer/bli_kernel_defs_bulldozer.h similarity index 80% rename from frame/include/level0/old/ro/bli_scal2jros.h rename to config/bulldozer/bli_kernel_defs_bulldozer.h index be7b43fb05..ea1e58e66b 100644 --- a/frame/include/level0/old/ro/bli_scal2jros.h +++ b/config/bulldozer/bli_kernel_defs_bulldozer.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,20 +32,21 @@ */ -#ifndef BLIS_SCAL2JROS_H -#define BLIS_SCAL2JROS_H +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H -// scal2jros -#define bli_cscal2jros( a, x, yr ) \ -{ \ - (yr) = bli_creal(a) * bli_creal(x) + bli_cimag(a) * bli_cimag(x); \ -} +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define bli_zscal2jros( a, x, yr ) \ -{ \ - (yr) = bli_zreal(a) * bli_zreal(x) + bli_zimag(a) * bli_zimag(x); \ -} +#define BLIS_MR_s 8 +#define BLIS_MR_d 4 +#define BLIS_MR_c 8 +#define BLIS_MR_z 4 -#endif +#define BLIS_NR_s 8 +#define BLIS_NR_d 6 +#define BLIS_NR_c 4 +#define BLIS_NR_z 4 + +//#endif diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 1f80f2ab65..e3e2088622 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := bulldozer # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/cortexa15/bli_cntx_init_cortexa15.c b/config/cortexa15/bli_cntx_init_cortexa15.c index 7c6134ff01..28ebdef71b 100644 --- a/config/cortexa15/bli_cntx_init_cortexa15.c +++ b/config/cortexa15/bli_cntx_init_cortexa15.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -73,13 +87,16 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa15/bli_kernel_defs_cortexa15.h b/config/cortexa15/bli_kernel_defs_cortexa15.h new file mode 100644 index 0000000000..9c413f7f84 --- /dev/null +++ b/config/cortexa15/bli_kernel_defs_cortexa15.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 4 +#define BLIS_MR_d 4 + +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/cortexa15/make_defs.mk b/config/cortexa15/make_defs.mk index abbee599de..3a9a83b39d 100644 --- a/config/cortexa15/make_defs.mk +++ b/config/cortexa15/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := cortexa15 # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -mfloat-abi=hard -mfpu=neon -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/cortexa53/bli_cntx_init_cortexa53.c b/config/cortexa53/bli_cntx_init_cortexa53.c index d7d786f8c6..4957de04e5 100644 --- a/config/cortexa53/bli_cntx_init_cortexa53.c +++ b/config/cortexa53/bli_cntx_init_cortexa53.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa53/bli_kernel_defs_cortexa53.h b/config/cortexa53/bli_kernel_defs_cortexa53.h new file mode 100644 index 0000000000..60292099cc --- /dev/null +++ b/config/cortexa53/bli_kernel_defs_cortexa53.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/cortexa53/make_defs.mk b/config/cortexa53/make_defs.mk index b5b2220a67..6036ea55a4 100644 --- a/config/cortexa53/make_defs.mk +++ b/config/cortexa53/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := cortexa53 # may specify additional flags here as needed. 
CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/cortexa57/bli_cntx_init_cortexa57.c b/config/cortexa57/bli_cntx_init_cortexa57.c index 57d18792de..28558bc522 100644 --- a/config/cortexa57/bli_cntx_init_cortexa57.c +++ b/config/cortexa57/bli_cntx_init_cortexa57.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa57/bli_kernel_defs_cortexa57.h b/config/cortexa57/bli_kernel_defs_cortexa57.h new file mode 100644 index 0000000000..60292099cc --- /dev/null +++ b/config/cortexa57/bli_kernel_defs_cortexa57.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/cortexa57/make_defs.mk b/config/cortexa57/make_defs.mk index 83565b8a79..d84f8538a5 100644 --- a/config/cortexa57/make_defs.mk +++ b/config/cortexa57/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := cortexa57 # may specify additional flags here as needed. CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/cortexa9/bli_cntx_init_cortexa9.c b/config/cortexa9/bli_cntx_init_cortexa9.c index d38e12ebbf..55a8000e74 100644 --- a/config/cortexa9/bli_cntx_init_cortexa9.c +++ b/config/cortexa9/bli_cntx_init_cortexa9.c @@ -43,35 +43,52 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 432, 176, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 352, 368, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 432, 176, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 352, 368, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa9/bli_kernel_defs_cortexa9.h b/config/cortexa9/bli_kernel_defs_cortexa9.h new file mode 100644 index 0000000000..9c413f7f84 --- /dev/null +++ b/config/cortexa9/bli_kernel_defs_cortexa9.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 4 +#define BLIS_MR_d 4 + +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/cortexa9/make_defs.mk b/config/cortexa9/make_defs.mk index ea9dc29ac6..f5f19e5309 100644 --- a/config/cortexa9/make_defs.mk +++ b/config/cortexa9/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := cortexa9 # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -mfloat-abi=hard -mfpu=neon -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/excavator/bli_cntx_init_excavator.c b/config/excavator/bli_cntx_init_excavator.c index adae152d50..d36865b216 100644 --- a/config/excavator/bli_cntx_init_excavator.c +++ b/config/excavator/bli_cntx_init_excavator.c @@ -43,16 +43,32 @@ void bli_cntx_init_excavator( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_excavator( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/excavator/bli_kernel_defs_excavator.h b/config/excavator/bli_kernel_defs_excavator.h new file mode 100644 index 0000000000..df4a8c4118 --- /dev/null +++ b/config/excavator/bli_kernel_defs_excavator.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 + +//#endif + diff --git a/config/excavator/make_defs.mk b/config/excavator/make_defs.mk index 6e73e60584..7977806b22 100644 --- a/config/excavator/make_defs.mk +++ b/config/excavator/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := excavator # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c index a15ce03448..f1e59e27ee 100644 --- a/config/firestorm/bli_cntx_init_firestorm.c +++ b/config/firestorm/bli_cntx_init_firestorm.c @@ -37,108 +37,111 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. 
bli_cntx_init_firestorm_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_12x8r, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_8x6r, + + // packm + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8x12, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6x8, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + + BLIS_VA_END ); - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs ( - 4, - BLIS_PACKM_8XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, - BLIS_PACKM_12XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 120, 252, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 12, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 8, 6, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 4096, 3072, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 9600, 8184, -1, -1 ); + + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], -1, 99, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], -1, 99, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], -1, 99, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. 
+ // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1, + -1, 9, -1, -1 ); + bli_blksz_init ( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1, + -1, 13, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], -1, 240, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], -1, 1024, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], -1, 3072, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); - // ------------------------------------------------------------------------- + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 99, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 99, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 99, -1, -1 ); + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 8, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 6, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 240, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1024, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 3072, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + BLIS_VA_END ); } diff --git a/config/firestorm/bli_kernel_defs_firestorm.h b/config/firestorm/bli_kernel_defs_firestorm.h new file mode 100644 index 0000000000..60292099cc --- /dev/null +++ b/config/firestorm/bli_kernel_defs_firestorm.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/firestorm/make_defs.mk b/config/firestorm/make_defs.mk index dc4286e6a8..2353e0040e 100644 --- a/config/firestorm/make_defs.mk +++ b/config/firestorm/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := firestorm # may specify additional flags here as needed. 
CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/frame/3/her2k/bli_her2k.h b/config/generic/bli_kernel_defs_generic.h similarity index 88% rename from frame/3/her2k/bli_her2k.h rename to config/generic/bli_kernel_defs_generic.h index 02975c2b51..db2f32947b 100644 --- a/frame/3/her2k/bli_her2k.h +++ b/config/generic/bli_kernel_defs_generic.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,5 +32,11 @@ */ -#include "bli_her2k_front.h" +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +//#endif diff --git a/config/generic/make_defs.mk b/config/generic/make_defs.mk index ee77b6cf0e..cbe4fb86f7 100644 --- a/config/generic/make_defs.mk +++ b/config/generic/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := generic # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) @@ -71,7 +71,11 @@ else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := else -$(error gcc, icc, or clang is required for this configuration.) +ifeq ($(CC_VENDOR),nvc) +CKVECFLAGS := +else +$(error gcc, icc, nvc, or clang is required for this configuration.) 
+endif endif endif endif diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index f2dc900ead..e211513b26 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -35,79 +35,54 @@ #include "blis.h" -//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) - void bli_cntx_init_haswell( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_haswell_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 8, + cntx, + // gemm #if 1 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, #else - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, FALSE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, #endif // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, 
bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, #if 1 - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); + // packm + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6x16, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6x8, + BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3x8, + BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3x4, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 10, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -137,7 +112,74 @@ void bli_cntx_init_haswell( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm +#if 1 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, +#else + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, +#endif + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -161,97 +203,54 @@ void bli_cntx_init_haswell( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 ); + // ------------------------------------------------------------------------- + + // Initialize sup thresholds with architecture-appropriate values. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 201, 201, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 201, 201, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 201, 201, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - // ------------------------------------------------------------------------- + // gemmsup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 201, 201, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 201, 201, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 201, 201, -1, -1 ); - - // Initialize the context with the sup thresholds. 
- bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, -#if 0 - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 1, - BLIS_GEMM, bli_gemmsup_ref, - cntx - ); -#endif - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. 
- // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + BLIS_VA_END ); } diff --git a/config/haswell/bli_kernel_defs_haswell.h b/config/haswell/bli_kernel_defs_haswell.h new file mode 100644 index 0000000000..c5bc8d63f3 --- /dev/null +++ b/config/haswell/bli_kernel_defs_haswell.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index a8135c1070..6f7b5b49a9 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := haswell # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/intel64/make_defs.mk b/config/intel64/make_defs.mk index 95f21f6f9c..3f62cef572 100644 --- a/config/intel64/make_defs.mk +++ b/config/intel64/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := intel64 # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/knc/bli_cntx_init_knc.c b/config/knc/bli_cntx_init_knc.c index 198f08827a..bbaf37541b 100644 --- a/config/knc/bli_cntx_init_knc.c +++ b/config/knc/bli_cntx_init_knc.c @@ -43,36 +43,52 @@ void bli_cntx_init_knc( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 1, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, TRUE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 30, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 120, 0, 0, - 0, 160, 0, 0 ); - bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0, - 0, 300, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 14400, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 30, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1, + -1, 160, -1, -1 ); + bli_blksz_init ( &blkszs[ BLIS_KC ], -1, 240, -1, -1, + -1, 300, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 14400, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/knc/bli_family_knc.h b/config/knc/bli_family_knc.h index 6f9e03e8fa..b968b0c9a1 100644 --- a/config/knc/bli_family_knc.h +++ b/config/knc/bli_family_knc.h @@ -46,8 +46,8 @@ #define BLIS_SIMD_ALIGN_SIZE 64 -#define BLIS_SIMD_SIZE 64 -#define BLIS_SIMD_NUM_REGISTERS 32 +#define BLIS_SIMD_MAX_SIZE 64 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 #if 0 diff --git a/config/knc/bli_kernel_defs_knc.h b/config/knc/bli_kernel_defs_knc.h new file mode 100644 index 0000000000..0ae6d1b75c --- /dev/null +++ b/config/knc/bli_kernel_defs_knc.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 30 + +#define BLIS_NR_d 8 + +#define BLIS_PACKMR_d 32 + +//#endif + diff --git a/config/knc/make_defs.mk b/config/knc/make_defs.mk index 0a1d43a645..243eb8f19f 100644 --- a/config/knc/make_defs.mk +++ b/config/knc/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := knc # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -mmic -fasm-blocks -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c index 6da3b7a3a9..548aba177c 100644 --- a/config/knl/bli_cntx_init_knl.c +++ b/config/knl/bli_cntx_init_knl.c @@ -43,47 +43,32 @@ void bli_cntx_init_knl( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE, - cntx - ); + cntx, - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 2, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk, - BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk, - cntx - ); + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, + + // packm + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24x8, - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 10, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif + // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, @@ -92,12 +77,15 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif + // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, @@ -106,7 +94,20 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -125,17 +126,20 @@ void bli_cntx_init_knl( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + BLIS_VA_END ); } diff --git a/config/knl/bli_family_knl.h b/config/knl/bli_family_knl.h index 64994cd9dd..98d3fe8d72 100644 --- a/config/knl/bli_family_knl.h +++ b/config/knl/bli_family_knl.h @@ -52,8 +52,8 @@ #define BLIS_SIMD_ALIGN_SIZE 64 -#define BLIS_SIMD_SIZE 64 -#define BLIS_SIMD_NUM_REGISTERS 32 +#define BLIS_SIMD_MAX_SIZE 64 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 /* #ifdef BLIS_NO_HBWMALLOC diff --git a/config/knl/bli_kernel_defs_knl.h b/config/knl/bli_kernel_defs_knl.h new file mode 100644 index 0000000000..ce514bb21a --- /dev/null +++ b/config/knl/bli_kernel_defs_knl.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 24 +#define BLIS_MR_d 24 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index d4b0da4aa0..5458745b9c 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := knl # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/old/armv7a/bli_cntx_init_armv7a.c b/config/old/armv7a/bli_cntx_init_armv7a.c index d4cc9e91d4..acd8e6c182 100644 --- a/config/old/armv7a/bli_cntx_init_armv7a.c +++ b/config/old/armv7a/bli_cntx_init_armv7a.c @@ -66,7 +66,7 @@ void bli_cntx_init_armv7a( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, diff --git a/config/old/haswellbb/bli_cntx_init_haswell.c b/config/old/haswellbb/bli_cntx_init_haswell.c index 9e1d03503a..88bd14a071 100644 --- a/config/old/haswellbb/bli_cntx_init_haswell.c +++ b/config/old/haswellbb/bli_cntx_init_haswell.c @@ -203,7 +203,7 @@ void bli_cntx_init_haswell( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + 7, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, diff --git a/config/old/haswellbb/make_defs.mk b/config/old/haswellbb/make_defs.mk index 6752dde295..3e4868a1f6 100644 --- a/config/old/haswellbb/make_defs.mk +++ b/config/old/haswellbb/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := haswell # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/old/newarch/make_defs.mk b/config/old/newarch/make_defs.mk index 523e0b13bc..59393c56fa 100644 --- a/config/old/newarch/make_defs.mk +++ b/config/old/newarch/make_defs.mk @@ -1,6 +1,6 @@ -#!/bin/bash # -# BLIS +# +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # @@ -47,7 +47,7 @@ CC := gcc CC_VENDOR := gcc endif -# Enable IEEE Standard 1003.1-2004 (POSIX.1d). +# Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L CMISCFLAGS := -std=c99 @@ -67,13 +67,13 @@ endif CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CKVECFLAGS := +CKVECFLAGS := else ifeq ($(CC_VENDOR),icc) -CKVECFLAGS := +CKVECFLAGS := else ifeq ($(CC_VENDOR),clang) -CKVECFLAGS := +CKVECFLAGS := else $(error gcc, icc, or clang is required for this configuration.) 
endif @@ -83,4 +83,3 @@ endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) - diff --git a/config/old/pnacl/make_defs.mk b/config/old/pnacl/make_defs.mk index f82493f8b2..28f5e2a2e0 100644 --- a/config/old/pnacl/make_defs.mk +++ b/config/old/pnacl/make_defs.mk @@ -49,7 +49,7 @@ CC_VENDOR := pnacl-clang # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L CMISCFLAGS := -std=gnu11 -I$(NACL_SDK_ROOT)/include -CPICFLAGS := +CPICFLAGS := -fPIC CDBGFLAGS := -g CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors COPTFLAGS := -O3 diff --git a/config/penryn/bli_cntx_init_penryn.c b/config/penryn/bli_cntx_init_penryn.c index 1576bf9448..30b3ac9fa4 100644 --- a/config/penryn/bli_cntx_init_penryn.c +++ b/config/penryn/bli_cntx_init_penryn.c @@ -43,39 +43,60 @@ void bli_cntx_init_penryn( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, FALSE, - //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, FALSE, - //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, FALSE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, FALSE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, FALSE, - cntx + cntx, + + //level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, + //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, + //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + //level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + //BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + //BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
// s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 4, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 768, 384, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 384, 384, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 4, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 768, 384, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 384, 384, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/penryn/bli_kernel_defs_penryn.h b/config/penryn/bli_kernel_defs_penryn.h new file mode 100644 index 0000000000..f1e483646a --- /dev/null +++ b/config/penryn/bli_kernel_defs_penryn.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 4 + +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/penryn/make_defs.mk b/config/penryn/make_defs.mk index a3474e9ce7..d070b7f1ae 100644 --- a/config/penryn/make_defs.mk +++ b/config/penryn/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := penryn # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/piledriver/bli_cntx_init_piledriver.c b/config/piledriver/bli_cntx_init_piledriver.c index 4ed15e322b..1c9a96fd9e 100644 --- a/config/piledriver/bli_cntx_init_piledriver.c +++ b/config/piledriver/bli_cntx_init_piledriver.c @@ -43,16 +43,32 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/piledriver/bli_kernel_defs_piledriver.h b/config/piledriver/bli_kernel_defs_piledriver.h new file mode 100644 index 0000000000..df4a8c4118 --- /dev/null +++ b/config/piledriver/bli_kernel_defs_piledriver.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 + +//#endif + diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index ab42872fb3..56b7d0fc51 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := piledriver # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/frame/3/herk/bli_herk.h b/config/power/bli_family_power.h similarity index 96% rename from frame/3/herk/bli_herk.h rename to config/power/bli_family_power.h index c437289688..21b44db870 100644 --- a/frame/3/herk/bli_herk.h +++ b/config/power/bli_family_power.h @@ -32,7 +32,10 @@ */ -#include "bli_herk_front.h" +//#ifndef BLIS_FAMILY_H +//#define BLIS_FAMILY_H -#include "bli_herk_var.h" + + +//#endif diff --git a/config/power/make_defs.mk b/config/power/make_defs.mk new file mode 100644 index 0000000000..8350a0a5c0 --- /dev/null +++ b/config/power/make_defs.mk @@ -0,0 +1,82 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. 
+# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := power +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. 
+CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := -fPIC +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O2 +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) -O3 +CKVECFLAGS := + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +CRVECFLAGS := $(CKVECFLAGS) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c index 14c940f995..f662d5791f 100644 --- a/config/power10/bli_cntx_init_power10.c +++ b/config/power10/bli_cntx_init_power10.c @@ -34,35 +34,6 @@ #include "blis.h" -// Instantiate prototypes for packm kernels. -PACKM_KER_PROT( float, s, packm_6xk_bb4_power10_ref ) -PACKM_KER_PROT( double, d, packm_6xk_bb2_power10_ref ) - -// Instantiate prototypes for level-3 kernels. 
-GEMM_UKR_PROT( float, s, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( float, s, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( float, s, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( double, d, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( double, d, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( double, d, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( scomplex, c, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( dcomplex, z, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power10_ref ) - void bli_cntx_init_power10( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; @@ -72,51 +43,28 @@ void bli_cntx_init_power10( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 12, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, TRUE, - - BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power10_ref, FALSE, - - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, TRUE, - - BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power10_ref, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power10_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power10_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref, FALSE, - cntx - ); + cntx, - // Update the context with customized virtual [gemm]trsm micro-kernels. - bli_cntx_set_l3_vir_ukrs - ( - 8, - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power10_ref, - cntx + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, + + BLIS_VA_END ); - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs ( - 2, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power10_ref, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power10_ref, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END ); // s d c z @@ -131,14 +79,16 @@ void bli_cntx_init_power10( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/power10/bli_kernel_defs_power10.h b/config/power10/bli_kernel_defs_power10.h new file mode 100644 index 0000000000..9b47a77c0a --- /dev/null +++ b/config/power10/bli_kernel_defs_power10.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 8 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 + + +//#endif + diff --git a/config/power10/make_defs.mk b/config/power10/make_defs.mk index 2c3f7cd7b9..191a3e42a8 100644 --- a/config/power10/make_defs.mk +++ b/config/power10/make_defs.mk @@ -48,7 +48,7 @@ THIS_CONFIG := power10 # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/power7/bli_cntx_init_power7.c b/config/power7/bli_cntx_init_power7.c index c9caf62a6d..9d1de3da5c 100644 --- a/config/power7/bli_cntx_init_power7.c +++ b/config/power7/bli_cntx_init_power7.c @@ -43,34 +43,50 @@ void bli_cntx_init_power7( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 1, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 4, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 64, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 256, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 4096, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 4, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 64, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4096, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/power7/bli_kernel_defs_power7.h b/config/power7/bli_kernel_defs_power7.h new file mode 100644 index 0000000000..ceec01df3c --- /dev/null +++ b/config/power7/bli_kernel_defs_power7.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 8 + +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index f80774e48b..a732cfe9c1 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := power7 # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -mcpu=power7 -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/power9/bli_cntx_init_power9.c b/config/power9/bli_cntx_init_power9.c index 4370ce26c1..9f2d67632e 100644 --- a/config/power9/bli_cntx_init_power9.c +++ b/config/power9/bli_cntx_init_power9.c @@ -34,35 +34,6 @@ #include "blis.h" -// Instantiate prototypes for packm kernels. -PACKM_KER_PROT( float, s, packm_6xk_bb4_power9_ref ) -PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref ) - -// Instantiate prototypes for level-3 kernels. 
-GEMM_UKR_PROT( float, s, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( float, s, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( float, s, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( double, d, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( double, d, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( double, d, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( scomplex, c, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( dcomplex, z, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power9_ref ) - void bli_cntx_init_power9( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; @@ -72,50 +43,37 @@ void bli_cntx_init_power9( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 12, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power9_ref, FALSE, - - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE, - - BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power9_ref, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref, FALSE, - cntx - ); + cntx, - // Update the context with customized virtual [gemm]trsm micro-kernels. - bli_cntx_set_l3_vir_ukrs - ( - 8, - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref, - cntx + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, + + BLIS_VA_END ); - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs ( - 2, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power9_ref, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); @@ -131,14 +89,15 @@ void bli_cntx_init_power9( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); + BLIS_VA_END + ); } diff --git a/config/power9/bli_kernel_defs_power9.h b/config/power9/bli_kernel_defs_power9.h new file mode 100644 index 0000000000..debfeac5fc --- /dev/null +++ b/config/power9/bli_kernel_defs_power9.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 12 + +#define BLIS_NR_d 6 + +#define BLIS_BBN_s 4 +#define BLIS_BBN_d 2 + +//#endif + diff --git a/config/power9/make_defs.mk b/config/power9/make_defs.mk index 85fa592d84..9f604a6074 100644 --- a/config/power9/make_defs.mk +++ b/config/power9/make_defs.mk @@ -48,7 +48,7 @@ THIS_CONFIG := power9 # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/frame/thread/bli_l3_sup_decor_single.h b/config/rv32i/bli_cntx_init_rv32i.c similarity index 88% rename from frame/thread/bli_l3_sup_decor_single.h rename to config/rv32i/bli_cntx_init_rv32i.c index 418c3814c3..84fd2dca63 100644 --- a/frame/thread/bli_l3_sup_decor_single.h +++ b/config/rv32i/bli_cntx_init_rv32i.c @@ -32,13 +32,13 @@ */ -#ifndef BLIS_L3_SUP_DECOR_SINGLE_H -#define BLIS_L3_SUP_DECOR_SINGLE_H +#include "blis.h" -// Definitions specific to situations when multithreading is disabled. -#ifndef BLIS_ENABLE_MULTITHREADING -#endif - -#endif +void bli_cntx_init_rv32i( cntx_t* cntx ) +{ + // Set default kernel blocksizes and functions. + bli_cntx_init_rv32i_ref( cntx ); + // ------------------------------------------------------------------------- +} diff --git a/config/rv32i/bli_kernel_defs_rv32i.h b/config/rv32i/bli_kernel_defs_rv32i.h new file mode 100644 index 0000000000..fe51f998da --- /dev/null +++ b/config/rv32i/bli_kernel_defs_rv32i.h @@ -0,0 +1,43 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +// Fall through to generic sizes + +//#endif diff --git a/config/rv32i/make_defs.mk b/config/rv32i/make_defs.mk new file mode 100644 index 0000000000..21128717f3 --- /dev/null +++ b/config/rv32i/make_defs.mk @@ -0,0 +1,102 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := rv32i +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. 
+CPPROCFLAGS := -DRISCV_SIZE=32 + +RISCV_ARCH := $(shell $(CC) -E build/detect/riscv/bli_riscv_detect_arch.h | grep '^[^\#]') +RISCV_ABI := $(shell $(CC) -E build/detect/riscv/bli_riscv_detect_abi.h | grep '^[^\#]') + +ifeq (,$(findstring 32,$(RISCV_ARCH))) +$(error The RISC-V compiler architecture $(RISCV_ARCH) is not compatible with $(THIS_CONFIG)) +else ifeq (,$(findstring 32,$(RISCV_ABI))) +$(error The RISC-V compiler ABI $(RISCV_ABI) is not compatible with $(THIS_CONFIG)) +endif + +CMISCFLAGS := -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) +CPICFLAGS := -fPIC +CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors + +# In case the A extension is not available +LDFLAGS += -latomic + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O2 +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) -O3 +ifeq ($(CC_VENDOR),gcc) +CKVECFLAGS := +else +ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := +else +$(error gcc or clang is required for this configuration.) +endif +endif + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +CRVECFLAGS := $(CKVECFLAGS) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) diff --git a/config/rv32iv/bli_cntx_init_rv32iv.c b/config/rv32iv/bli_cntx_init_rv32iv.c new file mode 100644 index 0000000000..dd10a36555 --- /dev/null +++ b/config/rv32iv/bli_cntx_init_rv32iv.c @@ -0,0 +1,109 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "../../kernels/rviv/3/bli_rviv_utils.h" + +void bli_cntx_init_rv32iv( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_rv32iv_ref( cntx ); + + // ------------------------------------------------------------------------- + + // A reasonable assumption for application cores is VLEN >= 128 bits, i.e., + // v >= 4.
Embedded cores, however, may implement the minimal configuration, + // which allows VLEN = 32 bits. Here, we assume VLEN >= 128 and otherwise + // fall back to the reference kernels. + const uint32_t v = get_vlenb() / sizeof(float); + + if ( v >= 4 ) + { + const uint32_t mr_s = 4 * v; + const uint32_t mr_d = 2 * v; + const uint32_t mr_c = 2 * v; + const uint32_t mr_z = v; + + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_rviv_4vx4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_rviv_4vx4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_rviv_4vx4, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_rviv_4vx4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], mr_s, mr_d, mr_c, mr_z ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 4, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 20*mr_s, 20*mr_d, 60*mr_c, 30*mr_z ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 320, 320, 160 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, 3072, 3072 ); + + bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + BLIS_VA_END + ); + } +} diff --git a/config/rv32iv/bli_kernel_defs_rv32iv.h b/config/rv32iv/bli_kernel_defs_rv32iv.h new file mode 100644 index 0000000000..b179892085 --- /dev/null +++ b/config/rv32iv/bli_kernel_defs_rv32iv.h @@ -0,0 +1,43 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + + + +//#endif diff --git a/config/rv32iv/make_defs.mk b/config/rv32iv/make_defs.mk new file mode 100644 index 0000000000..9daaee3d68 --- /dev/null +++ b/config/rv32iv/make_defs.mk @@ -0,0 +1,104 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := rv32iv +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. 
+CPPROCFLAGS := -DRISCV_SIZE=32 + +RISCV_ARCH := $(shell $(CC) -DFORCE_RISCV_VECTOR -E build/detect/riscv/bli_riscv_detect_arch.h | grep '^[^\#]') +RISCV_ABI := $(shell $(CC) -DFORCE_RISCV_VECTOR -E build/detect/riscv/bli_riscv_detect_abi.h | grep '^[^\#]') + +ifeq (,$(findstring 32,$(RISCV_ARCH))) +$(error The RISC-V compiler architecture $(RISCV_ARCH) is not compatible with $(THIS_CONFIG)) +else ifeq (,$(findstring 32,$(RISCV_ABI))) +$(error The RISC-V compiler ABI $(RISCV_ABI) is not compatible with $(THIS_CONFIG)) +endif + +CMISCFLAGS := -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) +CPICFLAGS := -fPIC +CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors + +# In case the A extension is not available +LDFLAGS += -latomic + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O0 +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) -O3 +ifeq ($(CC_VENDOR),gcc) +CKVECFLAGS := +else +ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := +else +$(error gcc or clang is required for this configuration.) +endif +endif + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +# Lower compiler optimization to -O1. At -O3, gcc version 12.0.1 20220505 +# computes offsets for the matrix ab in the ref gemm kernel incorrectly. +CRVECFLAGS := $(CKVECFLAGS) -O1 +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +CRVECFLAGS := $(CKVECFLAGS) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. 
+$(eval $(call store-make-defs,$(THIS_CONFIG))) diff --git a/frame/thread/bli_l3_decor_single.h b/config/rv64i/bli_cntx_init_rv64i.c similarity index 88% rename from frame/thread/bli_l3_decor_single.h rename to config/rv64i/bli_cntx_init_rv64i.c index 481763a908..f670e4a570 100644 --- a/frame/thread/bli_l3_decor_single.h +++ b/config/rv64i/bli_cntx_init_rv64i.c @@ -32,13 +32,13 @@ */ -#ifndef BLIS_L3_DECOR_SINGLE_H -#define BLIS_L3_DECOR_SINGLE_H +#include "blis.h" -// Definitions specific to situations when multithreading is disabled. -#ifndef BLIS_ENABLE_MULTITHREADING -#endif - -#endif +void bli_cntx_init_rv64i( cntx_t* cntx ) +{ + // Set default kernel blocksizes and functions. + bli_cntx_init_rv64i_ref( cntx ); + // ------------------------------------------------------------------------- +} diff --git a/config/rv64i/bli_kernel_defs_rv64i.h b/config/rv64i/bli_kernel_defs_rv64i.h new file mode 100644 index 0000000000..fe51f998da --- /dev/null +++ b/config/rv64i/bli_kernel_defs_rv64i.h @@ -0,0 +1,43 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +// Fall through to generic sizes + +//#endif diff --git a/config/rv64i/make_defs.mk b/config/rv64i/make_defs.mk new file mode 100644 index 0000000000..7c055f0128 --- /dev/null +++ b/config/rv64i/make_defs.mk @@ -0,0 +1,102 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := rv64i +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. 
+CPPROCFLAGS := -DRISCV_SIZE=64 + +RISCV_ARCH := $(shell $(CC) -E build/detect/riscv/bli_riscv_detect_arch.h | grep '^[^\#]') +RISCV_ABI := $(shell $(CC) -E build/detect/riscv/bli_riscv_detect_abi.h | grep '^[^\#]') + +ifeq (,$(findstring 64,$(RISCV_ARCH))) +$(error The RISC-V compiler architecture $(RISCV_ARCH) is not compatible with $(THIS_CONFIG)) +else ifeq (,$(findstring 64,$(RISCV_ABI))) +$(error The RISC-V compiler ABI $(RISCV_ABI) is not compatible with $(THIS_CONFIG)) +endif + +CMISCFLAGS := -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) +CPICFLAGS := -fPIC +CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors + +# In case the A extension is not available +LDFLAGS += -latomic + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O2 +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) -O3 +ifeq ($(CC_VENDOR),gcc) +CKVECFLAGS := +else +ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := +else +$(error gcc or clang is required for this configuration.) +endif +endif + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +CRVECFLAGS := $(CKVECFLAGS) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) diff --git a/config/rv64iv/bli_cntx_init_rv64iv.c b/config/rv64iv/bli_cntx_init_rv64iv.c new file mode 100644 index 0000000000..eb1f79ebc9 --- /dev/null +++ b/config/rv64iv/bli_cntx_init_rv64iv.c @@ -0,0 +1,114 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "../../kernels/rviv/3/bli_rviv_utils.h" + +void bli_cntx_init_rv64iv( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_rv64iv_ref( cntx ); + + // ------------------------------------------------------------------------- + + // A reasonable assumption for application cores is VLEN >= 128 bits, i.e., + // v >= 4.
Embedded cores, however, may implement the minimal configuration, + // which allows VLEN = 32 bits. Here, we assume VLEN >= 128 and otherwise + // fall back to the reference kernels. + const uint32_t v = get_vlenb() / sizeof(float); + + if ( v >= 4 ) + { + const uint32_t mr_s = 4 * v; + const uint32_t mr_d = 2 * v; + const uint32_t mr_c = 2 * v; + const uint32_t mr_z = v; + + // TODO: Register different kernels based on the value + // of v to avoid MC becoming too big. (e.g. 2vx8) + + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_rviv_4vx4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_rviv_4vx4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_rviv_4vx4, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_rviv_4vx4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], mr_s, mr_d, mr_c, mr_z ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 4, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 20*mr_s, 20*mr_d, 60*mr_c, 30*mr_z ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 320, 320, 160 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, 3072, 3072 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. 
+ bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + BLIS_VA_END + ); + } +} diff --git a/frame/1m/packm/bli_packm_md.h b/config/rv64iv/bli_kernel_defs_rv64iv.h similarity index 88% rename from frame/1m/packm/bli_packm_md.h rename to config/rv64iv/bli_kernel_defs_rv64iv.h index bb9d6d6135..18ca4030e0 100644 --- a/frame/1m/packm/bli_packm_md.h +++ b/config/rv64iv/bli_kernel_defs_rv64iv.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,6 +32,11 @@ */ -#include "bli_packm_blk_var1_md.h" -#include "bli_packm_struc_cxk_md.h" +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + + +//#endif diff --git a/config/rv64iv/make_defs.mk b/config/rv64iv/make_defs.mk new file mode 100644 index 0000000000..9ec5a889af --- /dev/null +++ b/config/rv64iv/make_defs.mk @@ -0,0 +1,103 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := rv64iv +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. 
+CPPROCFLAGS := -DRISCV_SIZE=64 + +RISCV_ARCH := $(shell $(CC) -DFORCE_RISCV_VECTOR -E build/detect/riscv/bli_riscv_detect_arch.h | grep '^[^\#]') +RISCV_ABI := $(shell $(CC) -DFORCE_RISCV_VECTOR -E build/detect/riscv/bli_riscv_detect_abi.h | grep '^[^\#]') + +ifeq (,$(findstring 64,$(RISCV_ARCH))) +$(error The RISC-V compiler architecture $(RISCV_ARCH) is not compatible with $(THIS_CONFIG)) +else ifeq (,$(findstring 64,$(RISCV_ABI))) +$(error The RISC-V compiler ABI $(RISCV_ABI) is not compatible with $(THIS_CONFIG)) +endif + +CMISCFLAGS := -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) +CPICFLAGS := -fPIC +CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors + +# In case the A extension is not available +LDFLAGS += -latomic + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O2 -ftree-vectorize +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) -O3 +ifeq ($(CC_VENDOR),gcc) +CKVECFLAGS := +else +ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := +else +$(error gcc or clang is required for this configuration.) +endif +endif + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +# Lower compiler optimization. cinvscalv fails at -O1 +CRVECFLAGS := $(CKVECFLAGS) -O0 +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +CRVECFLAGS := $(CKVECFLAGS) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. 
+$(eval $(call store-make-defs,$(THIS_CONFIG))) diff --git a/config/sandybridge/bli_cntx_init_sandybridge.c b/config/sandybridge/bli_cntx_init_sandybridge.c index 1ffa5bf8b6..0697a3351c 100644 --- a/config/sandybridge/bli_cntx_init_sandybridge.c +++ b/config/sandybridge/bli_cntx_init_sandybridge.c @@ -43,16 +43,32 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/frame/include/level0/1r/bli_copyj1rs.h b/config/sandybridge/bli_kernel_defs_sandybridge.h similarity index 80% rename from frame/include/level0/1r/bli_copyj1rs.h rename to config/sandybridge/bli_kernel_defs_sandybridge.h index d7cdff3051..dc1b843f60 100644 --- a/frame/include/level0/1r/bli_copyj1rs.h +++ b/config/sandybridge/bli_kernel_defs_sandybridge.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,20 +32,21 @@ */ -#ifndef BLIS_COPYJ1RS_H -#define BLIS_COPYJ1RS_H +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H -// copyj1rs -#define bli_ccopyj1rs( a, br, bi ) \ -{ \ - bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ -} +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define bli_zcopyj1rs( a, br, bi ) \ -{ \ - bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ -} +#define BLIS_MR_s 8 +#define BLIS_MR_d 8 +#define BLIS_MR_c 8 +#define BLIS_MR_z 4 -#endif +#define BLIS_NR_s 8 +#define BLIS_NR_d 4 +#define BLIS_NR_c 4 +#define BLIS_NR_z 4 + +//#endif diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index d3ceb34837..6047787cda 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := sandybridge # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/sifive_rvv/bli_cntx_init_sifive_rvv.c b/config/sifive_rvv/bli_cntx_init_sifive_rvv.c new file mode 100644 index 0000000000..54f17303fd --- /dev/null +++ b/config/sifive_rvv/bli_cntx_init_sifive_rvv.c @@ -0,0 +1,225 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include <riscv_vector.h> + +void bli_cntx_init_sifive_rvv( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_sifive_rvv_ref( cntx ); + + // ------------------------------------------------------------------------- + + unsigned vlenb = __riscv_vlenb(); + + // Update the context with optimized native kernels. + bli_cntx_set_ukrs + ( + cntx, + + // Level 1 + BLIS_ADDV_KER, BLIS_FLOAT, bli_saddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_SCOMPLEX, bli_caddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_DCOMPLEX, bli_zaddv_sifive_rvv_intr, + + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_rvv_intr, + + BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_sifive_rvv_intr, + + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_sifive_rvv_intr, + + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_rvv_intr, + + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_sifive_rvv_intr, + + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_sifive_rvv_intr, +
BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_sifive_rvv_intr, + + BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_rvv_intr, + + BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_rvv_intr, + + BLIS_SCAL2V_KER, BLIS_FLOAT, bli_sscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_SCOMPLEX, bli_cscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_sifive_rvv_intr, + + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_sifive_rvv_intr, + + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_rvv_intr, + + BLIS_SUBV_KER, BLIS_FLOAT, bli_ssubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_DOUBLE, bli_dsubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_SCOMPLEX, bli_csubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_DCOMPLEX, bli_zsubv_sifive_rvv_intr, + + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_rvv_intr, + + BLIS_XPBYV_KER, BLIS_FLOAT, bli_sxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_DOUBLE, bli_dxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_SCOMPLEX, 
bli_cxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_DCOMPLEX, bli_zxpbyv_sifive_rvv_intr, + + // Level 1f + BLIS_AXPY2V_KER, BLIS_FLOAT, bli_saxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_SCOMPLEX, bli_caxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_sifive_rvv_intr, + + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_rvv_intr, + + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_rvv_intr, + + BLIS_DOTAXPYV_KER, BLIS_FLOAT, bli_sdotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_DOUBLE, bli_ddotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_rvv_intr, + + BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_rvv_intr, + + // Level 1m + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_rvv_intr, + + // Level 3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_rvv_intr, + + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, 
bli_dgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_rvv_intr, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 7, 7, 6, 6, + 8, 8, 8, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4 * vlenb / 4, 4 * vlenb / 8, 2 * vlenb / 4, 2 * vlenb / 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 7, 7, 6, 6 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4 * vlenb / 4, 4 * vlenb / 8, 2 * vlenb / 4, 2 * vlenb / 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 64, 64, 64, 64 ); + // Default BLIS_BBM_s = 1, but set here to ensure it's correct + bli_blksz_init_easy( &blkszs[ BLIS_BBM ], 1, 1, 1, 1 ); + bli_blksz_init_easy( &blkszs[ BLIS_BBN ], 1, 1, 1, 1 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. 
+ bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + // level-1m + BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM, + BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN, + + BLIS_VA_END + ); +} + diff --git a/frame/include/level0/old/bli_castto.h b/config/sifive_rvv/bli_family_sifive_rvv.h similarity index 96% rename from frame/include/level0/old/bli_castto.h rename to config/sifive_rvv/bli_family_sifive_rvv.h index 52e6a98b8e..708c1960fd 100644 --- a/frame/include/level0/old/bli_castto.h +++ b/config/sifive_rvv/bli_family_sifive_rvv.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2024, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -31,3 +31,4 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + diff --git a/config/sifive_rvv/bli_kernel_defs_sifive_rvv.h b/config/sifive_rvv/bli_kernel_defs_sifive_rvv.h new file mode 100644 index 0000000000..33543db50f --- /dev/null +++ b/config/sifive_rvv/bli_kernel_defs_sifive_rvv.h @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- +#define BLIS_MR_s 7 +#define BLIS_MR_d 7 +#define BLIS_MR_c 6 +#define BLIS_MR_z 6 + +#define BLIS_PACKMR_s 8 +#define BLIS_PACKMR_d 8 +#define BLIS_PACKMR_c 8 +#define BLIS_PACKMR_z 8 + +#define BLIS_NR_s -1 +#define BLIS_NR_d -1 +#define BLIS_NR_c -1 +#define BLIS_NR_z -1 +//#endif + diff --git a/config/sifive_rvv/make_defs.mk b/config/sifive_rvv/make_defs.mk new file mode 100644 index 0000000000..a4b3675e15 --- /dev/null +++ b/config/sifive_rvv/make_defs.mk @@ -0,0 +1,80 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2024, SiFive, Inc. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := sifive_rvv +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. 
+CMISCFLAGS_SIFIVE := -mcmodel=medany -march=rv64gcv_zba_zbb -mabi=lp64d +CMISCFLAGS_SIFIVE_OTHER := +CPPROCFLAGS := +CMISCFLAGS := $(CMISCFLAGS_SIFIVE) $(CMISCFLAGS_SIFIVE_OTHER) \ + -fdata-sections -ffunction-sections \ + -fdiagnostics-color=always -fno-rtti -fno-exceptions +CPICFLAGS := -fPIC +CWARNFLAGS := -Wall -Wextra -Wno-unused-function -Wno-unused-parameter \ + -Wno-sign-compare -Wno-unused-variable + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O3 +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) +CKVECFLAGS := + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +CRVECFLAGS := $(CKVECFLAGS) + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c new file mode 100644 index 0000000000..142ca19278 --- /dev/null +++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c @@ -0,0 +1,222 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_cntx_init_sifive_x280( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_sifive_x280_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native kernels. 
+ bli_cntx_set_ukrs + ( + cntx, + + // Level 1 + BLIS_ADDV_KER, BLIS_FLOAT, bli_saddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_SCOMPLEX, bli_caddv_sifive_rvv_intr, + BLIS_ADDV_KER, BLIS_DCOMPLEX, bli_zaddv_sifive_rvv_intr, + + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_rvv_intr, + BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_rvv_intr, + + BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_sifive_rvv_intr, + BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_sifive_rvv_intr, + + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_sifive_rvv_intr, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_sifive_rvv_intr, + + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_rvv_intr, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_rvv_intr, + + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_sifive_rvv_intr, + BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_sifive_rvv_intr, + + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_sifive_rvv_intr, + BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_sifive_rvv_intr, + + BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_rvv_intr, + BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_rvv_intr, + + 
BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_rvv_intr, + BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_rvv_intr, + + BLIS_SCAL2V_KER, BLIS_FLOAT, bli_sscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_SCOMPLEX, bli_cscal2v_sifive_rvv_intr, + BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_sifive_rvv_intr, + + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_sifive_rvv_intr, + BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_sifive_rvv_intr, + + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_rvv_intr, + BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_rvv_intr, + + BLIS_SUBV_KER, BLIS_FLOAT, bli_ssubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_DOUBLE, bli_dsubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_SCOMPLEX, bli_csubv_sifive_rvv_intr, + BLIS_SUBV_KER, BLIS_DCOMPLEX, bli_zsubv_sifive_rvv_intr, + + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_rvv_intr, + BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_rvv_intr, + + BLIS_XPBYV_KER, BLIS_FLOAT, bli_sxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_DOUBLE, bli_dxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_SCOMPLEX, bli_cxpbyv_sifive_rvv_intr, + BLIS_XPBYV_KER, BLIS_DCOMPLEX, bli_zxpbyv_sifive_rvv_intr, + + // Level 1f + BLIS_AXPY2V_KER, BLIS_FLOAT, bli_saxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_SCOMPLEX, bli_caxpy2v_sifive_rvv_intr, + BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_sifive_rvv_intr, + + BLIS_AXPYF_KER, BLIS_FLOAT, 
bli_saxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_rvv_intr, + BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_rvv_intr, + + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_rvv_intr, + BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_rvv_intr, + + BLIS_DOTAXPYV_KER, BLIS_FLOAT, bli_sdotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_DOUBLE, bli_ddotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_rvv_intr, + BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_rvv_intr, + + BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_rvv_intr, + BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_rvv_intr, + + // Level 1m + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_rvv_intr, + BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_rvv_intr, + + // Level 3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_rvv_intr, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_rvv_intr, + + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_sifive_rvv_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_rvv_intr, + 
BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_rvv_intr, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 7, 7, 6, 6, + 8, 8, 8, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 64, 32, 32, 16 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 28, 28, 24, 24 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 1024, 1024, 1024, 1024 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 128, 256, 128 ); + // Default BLIS_BBM_s = 1, but set here to ensure it's correct + bli_blksz_init_easy( &blkszs[ BLIS_BBM ], 1, 1, 1, 1 ); + bli_blksz_init_easy( &blkszs[ BLIS_BBN ], 1, 1, 1, 1 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + // level-1m + BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM, + BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN, + + BLIS_VA_END + ); +} + diff --git a/frame/include/level0/old/bli_castfrom.h b/config/sifive_x280/bli_family_sifive_x280.h similarity index 96% rename from frame/include/level0/old/bli_castfrom.h rename to config/sifive_x280/bli_family_sifive_x280.h index 52e6a98b8e..4f02c048fa 100644 --- a/frame/include/level0/old/bli_castfrom.h +++ b/config/sifive_x280/bli_family_sifive_x280.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. 
- Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, SiFive, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -31,3 +31,4 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + diff --git a/config/sifive_x280/bli_kernel_defs_sifive_x280.h b/config/sifive_x280/bli_kernel_defs_sifive_x280.h new file mode 100644 index 0000000000..bb6865a669 --- /dev/null +++ b/config/sifive_x280/bli_kernel_defs_sifive_x280.h @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- +#define BLIS_MR_s 7 +#define BLIS_MR_d 7 +#define BLIS_MR_c 6 +#define BLIS_MR_z 6 + +#define BLIS_PACKMR_s 8 +#define BLIS_PACKMR_d 8 +#define BLIS_PACKMR_c 8 +#define BLIS_PACKMR_z 8 + +#define BLIS_NR_s 64 +#define BLIS_NR_d 32 +#define BLIS_NR_c 32 +#define BLIS_NR_z 16 +//#endif + diff --git a/config/sifive_x280/make_defs.mk b/config/sifive_x280/make_defs.mk new file mode 100644 index 0000000000..5f19e4e442 --- /dev/null +++ b/config/sifive_x280/make_defs.mk @@ -0,0 +1,80 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2023, SiFive, Inc. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := sifive_x280 +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. 
+CMISCFLAGS_SIFIVE := -mcmodel=medany -march=rv64gcv_zba_zbb_zvl512b -mabi=lp64d +CMISCFLAGS_SIFIVE_OTHER := +CPPROCFLAGS := +CMISCFLAGS := $(CMISCFLAGS_SIFIVE) $(CMISCFLAGS_SIFIVE_OTHER) \ + -fdata-sections -ffunction-sections \ + -fdiagnostics-color=always -fno-rtti -fno-exceptions +CPICFLAGS := -fPIC +CWARNFLAGS := -Wall -Wextra -Wno-unused-function -Wno-unused-parameter \ + -Wno-sign-compare -Wno-unused-variable + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O3 +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) +CKVECFLAGS := + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +CRVECFLAGS := $(CKVECFLAGS) + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c index f18503a7a7..3af58b38d2 100644 --- a/config/skx/bli_cntx_init_skx.c +++ b/config/skx/bli_cntx_init_skx.c @@ -43,39 +43,29 @@ void bli_cntx_init_skx( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - // gemm - BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE, - cntx - ); + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, - // Update the context with optimized level-1f kernels. 
- bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 10, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif + // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, @@ -84,12 +74,15 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif + // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, @@ -98,7 +91,20 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT , FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -116,17 +122,20 @@ void bli_cntx_init_skx( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + BLIS_VA_END ); } diff --git a/config/skx/bli_family_skx.h b/config/skx/bli_family_skx.h index ac9478f8ba..d698f12b4d 100644 --- a/config/skx/bli_family_skx.h +++ b/config/skx/bli_family_skx.h @@ -47,8 +47,8 @@ #define BLIS_SIMD_ALIGN_SIZE 64 -#define BLIS_SIMD_SIZE 64 -#define BLIS_SIMD_NUM_REGISTERS 32 +#define BLIS_SIMD_MAX_SIZE 64 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#include diff --git a/config/skx/bli_kernel_defs_skx.h b/config/skx/bli_kernel_defs_skx.h new file mode 100644 index 0000000000..2aaf477ad5 --- /dev/null +++ b/config/skx/bli_kernel_defs_skx.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 32 +#define BLIS_MR_d 16 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 14 + +//#endif + diff --git a/config/skx/make_defs.mk b/config/skx/make_defs.mk index 00ae94a364..589e73dda0 100644 --- a/config/skx/make_defs.mk +++ b/config/skx/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := skx # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/steamroller/bli_cntx_init_steamroller.c b/config/steamroller/bli_cntx_init_steamroller.c index 13e7f6495b..4b4ecdf4e6 100644 --- a/config/steamroller/bli_cntx_init_steamroller.c +++ b/config/steamroller/bli_cntx_init_steamroller.c @@ -43,16 +43,32 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/steamroller/bli_kernel_defs_steamroller.h b/config/steamroller/bli_kernel_defs_steamroller.h new file mode 100644 index 0000000000..df4a8c4118 --- /dev/null +++ b/config/steamroller/bli_kernel_defs_steamroller.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 + +//#endif + diff --git a/config/steamroller/make_defs.mk b/config/steamroller/make_defs.mk index 5220c3540b..122472c85d 100644 --- a/config/steamroller/make_defs.mk +++ b/config/steamroller/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := steamroller # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/template/bli_cntx_init_template.c b/config/template/bli_cntx_init_template.c index f2b1c8d175..8e5a57d6cf 100644 --- a/config/template/bli_cntx_init_template.c +++ b/config/template/bli_cntx_init_template.c @@ -45,55 +45,68 @@ void bli_cntx_init_template( cntx_t* cntx ) // Update the context with optimized native gemm micro-kernels and // their storage preferences. - bli_cntx_set_l3_nat_ukrs + bli_cntx_set_ukrs ( - 5, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, FALSE, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, FALSE, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, FALSE, - cntx - ); + cntx, - // Update the context with optimized level-1f kernels. 
- bli_cntx_set_l1f_kers - ( + // level-3 + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, + BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, + BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, + + // level-1f BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt, BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt, BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_template_noopt, BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_template_noopt, BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt, - cntx - ); - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( + // level-1v BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt, BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_template_noopt, - cntx + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
// s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 0, 0, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 0, 0, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 0, 0, 128 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 0, 0, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 0, 0, 4096 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, -1, -1, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, -1, -1, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, -1, -1, 128 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, -1, -1, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, -1, -1, 4096 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/frame/include/level0/old/ro/bli_scal2ros.h b/config/template/bli_kernel_defs_template.h similarity index 75% rename from frame/include/level0/old/ro/bli_scal2ros.h rename to config/template/bli_kernel_defs_template.h index 5f68de5ab3..86a33d8d8e 100644 --- a/frame/include/level0/old/ro/bli_scal2ros.h +++ b/config/template/bli_kernel_defs_template.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. 
- Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,31 +32,29 @@ */ -#ifndef BLIS_SCAL2ROS_H -#define BLIS_SCAL2ROS_H +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H -// scal2ros -#define bli_cscal2ros( a, x, yr ) \ -{ \ - (yr) = bli_creal(a) * bli_creal(x) - bli_cimag(a) * bli_cimag(x); \ -} +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define bli_zscal2ros( a, x, yr ) \ -{ \ - (yr) = bli_zreal(a) * bli_zreal(x) - bli_zimag(a) * bli_zimag(x); \ -} +// +// Only defined for block sizes which are not taken as the default (i.e. when +// an optimized kernel is provided). +// -#define bli_scscal2ros( a, x, yr ) \ -{ \ - (yr) = bli_creal(a) * bli_creal(x); \ -} +#define BLIS_MR_z 4 -#define bli_dzscal2ros( a, x, yr ) \ -{ \ - (yr) = bli_zreal(a) * bli_zreal(x); \ -} +#define BLIS_NR_z 4 +// +// PACKMR/PACKNR do not need to be defined unless they are different from the +// "normal" MR/NR. 
+// -#endif +//#define BLIS_PACKMR_z 4 + +//#define BLIS_PACKNR_z 4 + +//#endif diff --git a/config/template/kernels/1/bli_axpyv_template_noopt_var1.c b/config/template/kernels/1/bli_axpyv_template_noopt_var1.c index d1918466f7..cc3c078c12 100644 --- a/config/template/kernels/1/bli_axpyv_template_noopt_var1.c +++ b/config/template/kernels/1/bli_axpyv_template_noopt_var1.c @@ -42,7 +42,7 @@ void bli_zaxpyv_template_noopt dcomplex* restrict alpha, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { /* @@ -117,7 +117,7 @@ void bli_zaxpyv_template_noopt if ( bli_zero_dim1( n ) ) return; - if ( bli_zeq0( *alpha ) ) return; + if ( bli_teq0s( z, *alpha ) ) return; // If there is anything that would interfere with our use of aligned @@ -179,7 +179,7 @@ void bli_zaxpyv_template_noopt // Compute front edge cases if x and y were unaligned. for ( i = 0; i < n_pre; ++i ) { - bli_zaxpys( *alpha, *xp, *yp ); + bli_taxpys( z,z,z,z, *alpha, *xp, *yp ); xp += 1; yp += 1; } @@ -188,7 +188,7 @@ void bli_zaxpyv_template_noopt // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { - bli_zaxpys( *alpha, *xp, *yp ); + bli_taxpys( z,z,z,z, *alpha, *xp, *yp ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -197,7 +197,7 @@ void bli_zaxpyv_template_noopt // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { - bli_zaxpys( *alpha, *xp, *yp ); + bli_taxpys( z,z,z,z, *alpha, *xp, *yp ); xp += 1; yp += 1; } @@ -207,7 +207,7 @@ void bli_zaxpyv_template_noopt // Compute front edge cases if x and y were unaligned. for ( i = 0; i < n_pre; ++i ) { - bli_zaxpyjs( *alpha, *xp, *yp ); + bli_taxpyjs( z,z,z,z, *alpha, *xp, *yp ); xp += 1; yp += 1; } @@ -216,7 +216,7 @@ void bli_zaxpyv_template_noopt // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. 
for ( i = 0; i < n_iter; ++i ) { - bli_zaxpyjs( *alpha, *xp, *yp ); + bli_taxpyjs( z,z,z,z, *alpha, *xp, *yp ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -225,7 +225,7 @@ void bli_zaxpyv_template_noopt // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { - bli_zaxpyjs( *alpha, *xp, *yp ); + bli_taxpyjs( z,z,z,z, *alpha, *xp, *yp ); xp += 1; yp += 1; } diff --git a/config/template/kernels/1/bli_dotv_template_noopt_var1.c b/config/template/kernels/1/bli_dotv_template_noopt_var1.c index 3761d2e764..c59c0d6209 100644 --- a/config/template/kernels/1/bli_dotv_template_noopt_var1.c +++ b/config/template/kernels/1/bli_dotv_template_noopt_var1.c @@ -43,7 +43,7 @@ void bli_zdotv_template_noopt dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, dcomplex* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { /* @@ -127,7 +127,7 @@ void bli_zdotv_template_noopt // If the vector lengths are zero, set rho to zero and return. if ( bli_zero_dim1( n ) ) { - bli_zset0s( *rho ); + bli_tset0s( z, *rho ); return; } @@ -185,9 +185,9 @@ void bli_zdotv_template_noopt // Initialize accumulator to zero. - bli_zset0s( dotxy ); + bli_tset0s( z, dotxy ); + - conjx_use = conjx; // If y must be conjugated, we compute the result indirectly by first @@ -204,7 +204,7 @@ void bli_zdotv_template_noopt // Compute front edge cases if x and y were unaligned. for ( i = 0; i < n_pre; ++i ) { - bli_zdots( *xp, *yp, dotxy ); + bli_tdots( z,z,z,z, *xp, *yp, dotxy ); xp += 1; yp += 1; } @@ -213,7 +213,7 @@ void bli_zdotv_template_noopt // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { - bli_zdots( *xp, *yp, dotxy ); + bli_tdots( z,z,z,z, *xp, *yp, dotxy ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -222,7 +222,7 @@ void bli_zdotv_template_noopt // Compute tail edge cases, if applicable. 
for ( i = 0; i < n_left; ++i ) { - bli_zdots( *xp, *yp, dotxy ); + bli_tdots( z,z,z,z, *xp, *yp, dotxy ); xp += 1; yp += 1; } @@ -232,7 +232,7 @@ void bli_zdotv_template_noopt // Compute front edge cases if x and y were unaligned. for ( i = 0; i < n_pre; ++i ) { - bli_zdotjs( *xp, *yp, dotxy ); + bli_tdotjs( z,z,z,z, *xp, *yp, dotxy ); xp += 1; yp += 1; } @@ -241,7 +241,7 @@ void bli_zdotv_template_noopt // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { - bli_zdotjs( *xp, *yp, dotxy ); + bli_tdotjs( z,z,z,z, *xp, *yp, dotxy ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -250,7 +250,7 @@ void bli_zdotv_template_noopt // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { - bli_zdotjs( *xp, *yp, dotxy ); + bli_tdotjs( z,z,z,z, *xp, *yp, dotxy ); xp += 1; yp += 1; } @@ -259,8 +259,8 @@ void bli_zdotv_template_noopt // If conjugation on y was requested, we induce it by conjugating // the contents of dotxy. if ( bli_is_conj( conjy ) ) - bli_zconjs( dotxy ); + bli_tconjs( z, dotxy ); - bli_zcopys( dotxy, *rho ); + bli_tcopys( z,z, dotxy, *rho ); } diff --git a/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c b/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c index 7080abce06..649511cf03 100644 --- a/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c @@ -45,7 +45,7 @@ void bli_zaxpy2v_template_noopt dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, dcomplex* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { /* @@ -194,8 +194,8 @@ void bli_zaxpy2v_template_noopt // Compute front edge cases if x, y, and z were unaligned. 
for ( i = 0; i < n_pre; ++i ) { - bli_zaxpys( *alpha1, *xp, *zp ); - bli_zaxpys( *alpha2, *yp, *zp ); + bli_taxpys( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpys( z,z,z,z, *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -207,8 +207,8 @@ void bli_zaxpy2v_template_noopt // to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { - bli_zaxpys( *alpha1, *xp, *zp ); - bli_zaxpys( *alpha2, *yp, *zp ); + bli_taxpys( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpys( z,z,z,z, *alpha2, *yp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -218,8 +218,8 @@ void bli_zaxpy2v_template_noopt // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { - bli_zaxpys( *alpha1, *xp, *zp ); - bli_zaxpys( *alpha2, *yp, *zp ); + bli_taxpys( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpys( z,z,z,z, *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -229,8 +229,8 @@ void bli_zaxpy2v_template_noopt // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { - bli_zaxpys( *alpha1, *xp, *zp ); - bli_zaxpyjs( *alpha2, *yp, *zp ); + bli_taxpys( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -242,8 +242,8 @@ void bli_zaxpy2v_template_noopt // to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { - bli_zaxpys( *alpha1, *xp, *zp ); - bli_zaxpyjs( *alpha2, *yp, *zp ); + bli_taxpys( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -253,8 +253,8 @@ void bli_zaxpy2v_template_noopt // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { - bli_zaxpys( *alpha1, *xp, *zp ); - bli_zaxpyjs( *alpha2, *yp, *zp ); + bli_taxpys( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -264,8 +264,8 @@ void bli_zaxpy2v_template_noopt // Compute front edge cases if x, y, and z were unaligned. 
for ( i = 0; i < n_pre; ++i ) { - bli_zaxpyjs( *alpha1, *xp, *zp ); - bli_zaxpys( *alpha2, *yp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpys( z,z,z,z, *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -277,8 +277,8 @@ void bli_zaxpy2v_template_noopt // to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { - bli_zaxpyjs( *alpha1, *xp, *zp ); - bli_zaxpys( *alpha2, *yp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpys( z,z,z,z, *alpha2, *yp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -288,8 +288,8 @@ void bli_zaxpy2v_template_noopt // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { - bli_zaxpyjs( *alpha1, *xp, *zp ); - bli_zaxpys( *alpha2, *yp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpys( z,z,z,z, *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -299,8 +299,8 @@ void bli_zaxpy2v_template_noopt // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { - bli_zaxpyjs( *alpha1, *xp, *zp ); - bli_zaxpyjs( *alpha2, *yp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -312,8 +312,8 @@ void bli_zaxpy2v_template_noopt // to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { - bli_zaxpyjs( *alpha1, *xp, *zp ); - bli_zaxpyjs( *alpha2, *yp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -323,8 +323,8 @@ void bli_zaxpy2v_template_noopt // Compute tail edge cases, if applicable. 
for ( i = 0; i < n_left; ++i ) { - bli_zaxpyjs( *alpha1, *xp, *zp ); - bli_zaxpyjs( *alpha2, *yp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha1, *xp, *zp ); + bli_taxpyjs( z,z,z,z, *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } diff --git a/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c index a0afedfcaf..834c2fc242 100644 --- a/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c @@ -45,7 +45,7 @@ void bli_zaxpyf_template_noopt dcomplex* restrict a, inc_t inca, inc_t lda, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { /* @@ -209,16 +209,16 @@ void bli_zaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zcopys( *xp[ j ], alpha_x[ j ] ); - bli_zscals( *alpha, alpha_x[ j ] ); + bli_tcopys( z,z, *xp[ j ], alpha_x[ j ] ); + bli_tscals( z,z,z, *alpha, alpha_x[ j ] ); } } else // if ( bli_is_conj( conjx ) ) { for ( j = 0; j < b_n; ++j ) { - bli_zcopyjs( *xp[ j ], alpha_x[ j ] ); - bli_zscals( *alpha, alpha_x[ j ] ); + bli_tcopyjs( z,z, *xp[ j ], alpha_x[ j ] ); + bli_tscals( z,z,z, *alpha, alpha_x[ j ] ); } } @@ -231,7 +231,7 @@ void bli_zaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp ); + bli_taxpys( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += 1; } @@ -247,7 +247,7 @@ void bli_zaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp ); + bli_taxpys( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += n_elem_per_iter; } @@ -259,7 +259,7 @@ void bli_zaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp ); + bli_taxpys( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += 1; } @@ -273,7 +273,7 @@ void bli_zaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp ); + bli_taxpyjs( z,z,z,z, alpha_x[ 
j ], *ap[ j ], *yp ); ap[ j ] += 1; } @@ -289,7 +289,7 @@ void bli_zaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp ); + bli_taxpyjs( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += n_elem_per_iter; } @@ -301,7 +301,7 @@ void bli_zaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp ); + bli_taxpyjs( z,z,z,z, alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += 1; } diff --git a/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c b/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c index 275c399982..ae806d50d9 100644 --- a/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c @@ -46,7 +46,7 @@ void bli_zdotaxpyv_template_noopt dcomplex* restrict y, inc_t incy, dcomplex* restrict rho, dcomplex* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { /* @@ -138,7 +138,7 @@ void bli_zdotaxpyv_template_noopt // If the vector lengths are zero, set rho to zero and return. if ( bli_zero_dim1( n ) ) { - bli_zset0s( *rho ); + bli_tset0s( z, *rho ); return; } @@ -202,7 +202,7 @@ void bli_zdotaxpyv_template_noopt // Initialize accumulator to zero. - bli_zset0s( dotxy ); + bli_tset0s( z, dotxy ); conjxt_use = conjxt; @@ -222,8 +222,8 @@ void bli_zdotaxpyv_template_noopt // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { - bli_zdots( *xp, *yp, dotxy ); - bli_zaxpys( *alpha, *xp, *zp ); + bli_tdots( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpys( z,z,z,z, *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -235,8 +235,8 @@ void bli_zdotaxpyv_template_noopt // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. 
for ( i = 0; i < n_iter; ++i ) { - bli_zdots( *xp, *yp, dotxy ); - bli_zaxpys( *alpha, *xp, *zp ); + bli_tdots( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpys( z,z,z,z, *alpha, *xp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -246,8 +246,8 @@ void bli_zdotaxpyv_template_noopt // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { - bli_zdots( *xp, *yp, dotxy ); - bli_zaxpys( *alpha, *xp, *zp ); + bli_tdots( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpys( z,z,z,z, *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -257,8 +257,8 @@ void bli_zdotaxpyv_template_noopt // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { - bli_zdotjs( *xp, *yp, dotxy ); - bli_zaxpys( *alpha, *xp, *zp ); + bli_tdotjs( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpys( z,z,z,z, *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -270,8 +270,8 @@ void bli_zdotaxpyv_template_noopt // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { - bli_zdotjs( *xp, *yp, dotxy ); - bli_zaxpys( *alpha, *xp, *zp ); + bli_tdotjs( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpys( z,z,z,z, *alpha, *xp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -281,8 +281,8 @@ void bli_zdotaxpyv_template_noopt // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { - bli_zdotjs( *xp, *yp, dotxy ); - bli_zaxpys( *alpha, *xp, *zp ); + bli_tdotjs( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpys( z,z,z,z, *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -292,8 +292,8 @@ void bli_zdotaxpyv_template_noopt // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { - bli_zdots( *xp, *yp, dotxy ); - bli_zaxpyjs( *alpha, *xp, *zp ); + bli_tdots( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -305,8 +305,8 @@ void bli_zdotaxpyv_template_noopt // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. 
for ( i = 0; i < n_iter; ++i ) { - bli_zdots( *xp, *yp, dotxy ); - bli_zaxpyjs( *alpha, *xp, *zp ); + bli_tdots( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -316,8 +316,8 @@ void bli_zdotaxpyv_template_noopt // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { - bli_zdots( *xp, *yp, dotxy ); - bli_zaxpyjs( *alpha, *xp, *zp ); + bli_tdots( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -327,8 +327,8 @@ void bli_zdotaxpyv_template_noopt // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { - bli_zdotjs( *xp, *yp, dotxy ); - bli_zaxpyjs( *alpha, *xp, *zp ); + bli_tdotjs( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -340,8 +340,8 @@ void bli_zdotaxpyv_template_noopt // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { - bli_zdotjs( *xp, *yp, dotxy ); - bli_zaxpyjs( *alpha, *xp, *zp ); + bli_tdotjs( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; @@ -351,8 +351,8 @@ void bli_zdotaxpyv_template_noopt // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { - bli_zdotjs( *xp, *yp, dotxy ); - bli_zaxpyjs( *alpha, *xp, *zp ); + bli_tdotjs( z,z,z,z, *xp, *yp, dotxy ); + bli_taxpyjs( z,z,z,z, *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } @@ -361,8 +361,8 @@ void bli_zdotaxpyv_template_noopt // If conjugation on y was requested, we induce it by conjugating // the contents of rho. 
if ( bli_is_conj( conjy ) ) - bli_zconjs( dotxy ); + bli_tconjs( z, dotxy ); - bli_zcopys( dotxy, *rho ); + bli_tcopys( z,z, dotxy, *rho ); } diff --git a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c index 6754d86ce8..468647ff2c 100644 --- a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c @@ -50,7 +50,7 @@ void bli_zdotxaxpyf_template_noopt dcomplex* restrict beta, dcomplex* restrict y, inc_t incy, dcomplex* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { @@ -238,23 +238,23 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zcopys( *xp[ j ], alpha_x[ j ] ); - bli_zscals( *alpha, alpha_x[ j ] ); + bli_tcopys( z,z, *xp[ j ], alpha_x[ j ] ); + bli_tscals( z,z,z, *alpha, alpha_x[ j ] ); } } else // if ( bli_is_conj( conjx ) ) { for ( j = 0; j < b_n; ++j ) { - bli_zcopyjs( *xp[ j ], alpha_x[ j ] ); - bli_zscals( *alpha, alpha_x[ j ] ); + bli_tcopyjs( z,z, *xp[ j ], alpha_x[ j ] ); + bli_tscals( z,z,z, *alpha, alpha_x[ j ] ); } } // Initialize our accumulators to zero. 
for ( j = 0; j < b_n; ++j ) { - bli_zset0s( At_w[ j ] ); + bli_tset0s( z, At_w[ j ] ); } @@ -278,8 +278,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdots( *ap[ j ], *wp, At_w[ j ] ); - bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } @@ -295,8 +295,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdots( *ap[ j ], *wp, At_w[ j ] ); - bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += n_elem_per_iter; } @@ -308,8 +308,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdots( *ap[ j ], *wp, At_w[ j ] ); - bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } @@ -323,8 +323,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); - bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } @@ -340,8 +340,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); - bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += n_elem_per_iter; } @@ -353,8 +353,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); - bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdots( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } @@ -368,8 +368,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdots( *ap[ j ], *wp, At_w[ j ] ); - 
bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } @@ -385,8 +385,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdots( *ap[ j ], *wp, At_w[ j ] ); - bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += n_elem_per_iter; } @@ -398,8 +398,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdots( *ap[ j ], *wp, At_w[ j ] ); - bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdots( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } @@ -413,8 +413,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); - bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } @@ -430,8 +430,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); - bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += n_elem_per_iter; } @@ -443,8 +443,8 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); - bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); + bli_tdotjs( z,z,z,z, *ap[ j ], *wp, At_w[ j ] ); + bli_tdotjs( z,z,z,z, *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } @@ -459,7 +459,7 @@ void bli_zdotxaxpyf_template_noopt { for ( j = 0; j < b_n; ++j ) { - bli_zconjs( At_w[ j ] ); + bli_tconjs( z, At_w[ j ] ); } } @@ -467,8 +467,8 @@ void bli_zdotxaxpyf_template_noopt // scaling by beta. 
for ( j = 0; j < b_n; ++j ) { - bli_zscals( *beta, *yp[ j ] ); - bli_zaxpys( *alpha, At_w[ j ], *yp[ j ] ); + bli_tscals( z,z,z, *beta, *yp[ j ] ); + bli_taxpys( z,z,z,z, *alpha, At_w[ j ], *yp[ j ] ); } } diff --git a/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c index 430fb277db..ac62ff9997 100644 --- a/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c @@ -46,7 +46,7 @@ void bli_zdotxf_template_noopt dcomplex* restrict x, inc_t incx, dcomplex* restrict beta, dcomplex* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { /* @@ -227,7 +227,7 @@ void bli_zdotxf_template_noopt // Initialize our accumulators to zero. for ( i = 0; i < b_n; ++i ) { - bli_zset0s( Atx[ i ] ); + bli_tset0s( z, Atx[ i ] ); } @@ -239,7 +239,7 @@ void bli_zdotxf_template_noopt if ( bli_is_conj( conjx ) ) bli_toggle_conj( &conjat_use ); - + // Iterate over columns of A and rows of x to compute: // Atx = conjat_use( A^T ) * x; if ( bli_is_noconj( conjat_use ) ) @@ -249,7 +249,7 @@ void bli_zdotxf_template_noopt { for ( i = 0; i < b_n; ++i ) { - bli_zzzdots( *ap[ i ], *xp, Atx[ i ] ); + bli_tdots( z,z,z,z, *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += 1; } @@ -264,7 +264,7 @@ void bli_zdotxf_template_noopt { for ( i = 0; i < b_n; ++i ) { - bli_zzzdots( *ap[ i ], *xp, Atx[ i ] ); + bli_tdots( z,z,z,z, *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += n_elem_per_iter; } @@ -276,7 +276,7 @@ void bli_zdotxf_template_noopt { for ( i = 0; i < b_n; ++i ) { - bli_zzzdots( *ap[ i ], *xp, Atx[ i ] ); + bli_tdots( z,z,z,z, *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += 1; } @@ -290,7 +290,7 @@ void bli_zdotxf_template_noopt { for ( i = 0; i < b_n; ++i ) { - bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] ); + bli_tdotjs( z,z,z,z, *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += 1; } @@ -305,7 +305,7 @@ void bli_zdotxf_template_noopt { for ( i = 0; i < b_n; ++i ) { - bli_zzzdotjs( *ap[ i ], *xp, 
Atx[ i ] ); + bli_tdotjs( z,z,z,z, *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += n_elem_per_iter; } @@ -317,7 +317,7 @@ void bli_zdotxf_template_noopt { for ( i = 0; i < b_n; ++i ) { - bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] ); + bli_tdotjs( z,z,z,z, *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += 1; } @@ -332,7 +332,7 @@ void bli_zdotxf_template_noopt { for ( i = 0; i < b_n; ++i ) { - bli_zconjs( Atx[ i ] ); + bli_tconjs( z, Atx[ i ] ); } } @@ -341,8 +341,8 @@ void bli_zdotxf_template_noopt // scaling by beta. for ( i = 0; i < b_n; ++i ) { - bli_zzscals( *beta, *yp[ i ] ); - bli_zzzaxpys( *alpha, Atx[ i ], *yp[ i ] ); + bli_tscals( z,z,z, *beta, *yp[ i ] ); + bli_taxpys( z,z,z,z, *alpha, Atx[ i ], *yp[ i ] ); } } diff --git a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c index b7a13f3b69..97a924b0a5 100644 --- a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c @@ -37,14 +37,16 @@ void bli_zgemm_template_noopt ( + dim_t m, + dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a1, dcomplex* restrict b1, dcomplex* restrict beta, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* @@ -88,8 +90,7 @@ void bli_zgemm_template_noopt dim_t l, j, i; - dcomplex ab[ bli_zmr * - bli_znr ]; + dcomplex ab[ mr * nr ]; dcomplex* abij; dcomplex ai, bj; @@ -97,7 +98,7 @@ void bli_zgemm_template_noopt /* Initialize the accumulator elements in ab to zero. */ for ( i = 0; i < mr * nr; ++i ) { - bli_zset0s( *(ab + i) ); + bli_tset0s( z, *(ab + i) ); } /* Perform a series of k rank-1 updates into ab. */ @@ -115,7 +116,7 @@ void bli_zgemm_template_noopt { ai = *(a1 + i); - bli_zdots( ai, bj, *abij ); + bli_tdots( z,z,z,z, ai, bj, *abij ); abij += rs_ab; } @@ -128,25 +129,27 @@ void bli_zgemm_template_noopt /* Scale each element of ab by alpha. 
*/ for ( i = 0; i < mr * nr; ++i ) { - bli_zscals( *alpha, *(ab + i) ); + bli_tscals( z,z,z, *alpha, *(ab + i) ); } /* If beta is zero, overwrite c11 with the scaled result in ab. Otherwise, scale c11 by beta and then add the scaled result in ab. */ - if ( bli_zeq0( *beta ) ) + if ( bli_teq0s( z, *beta ) ) { /* c11 := ab */ - bli_zcopys_mxn( mr, - nr, + bli_tcopys_mxn( z,z, + m, + n, ab, rs_ab, cs_ab, c11, rs_c, cs_c ); } else { /* c11 := beta * c11 + ab */ - bli_zxpbys_mxn( mr, - nr, + bli_txpbys_mxn( z,z,z,z, + m, + n, ab, rs_ab, cs_ab, beta, c11, rs_c, cs_c ); diff --git a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c index da0cd3110f..d44fa4c1ef 100644 --- a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c @@ -44,8 +44,8 @@ void bli_zgemmtrsm_l_template_noopt dcomplex* restrict b01, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* @@ -74,6 +74,8 @@ void bli_zgemmtrsm_l_template_noopt */ const num_t dt = BLIS_DCOMPLEX; + const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); + const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); const inc_t rs_b = packnr; @@ -84,6 +86,8 @@ void bli_zgemmtrsm_l_template_noopt /* b11 = alpha * b11 - a10 * b01; */ bli_zgemm_template_noopt ( + mr, + nr, k, minus_one, a10, diff --git a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c index 09b3af9cee..0a3d596227 100644 --- a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c @@ -44,8 +44,8 @@ void bli_zgemmtrsm_u_template_noopt dcomplex* restrict b01, dcomplex* restrict 
b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* @@ -74,6 +74,8 @@ void bli_zgemmtrsm_u_template_noopt */ const num_t dt = BLIS_DCOMPLEX; + const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); + const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); const inc_t rs_b = packnr; @@ -84,10 +86,12 @@ void bli_zgemmtrsm_u_template_noopt /* b11 = alpha * b11 - a12 * b21; */ bli_zgemm_template_noopt ( + mr, + nr, k, minus_one, - a12, - b21, + a10, + b01, alpha, b11, rs_b, cs_b, data diff --git a/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c index ce15798b0e..2688a7bc58 100644 --- a/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c @@ -40,8 +40,8 @@ void bli_ztrsm_l_template_noopt dcomplex* restrict a11, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* @@ -116,25 +116,25 @@ void bli_ztrsm_l_template_noopt gamma11 = c11 + (i )*rs_c + (j )*cs_c; /* chi11 = chi11 - a10t * x01; */ - bli_zset0s( rho11 ); + bli_tset0s( z, rho11 ); for ( l = 0; l < n_behind; ++l ) { alpha10 = a10t + (l )*cs_a; chi01 = x01 + (l )*rs_b; - bli_zaxpys( *alpha10, *chi01, rho11 ); + bli_taxpys( z,z,z,z, *alpha10, *chi01, rho11 ); } - bli_zsubs( rho11, *chi11 ); + bli_tsubs( z,z,z, rho11, *chi11 ); /* chi11 = chi11 / alpha11; */ /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead of alpha11, so we can multiply rather than divide. We store the inverse of alpha11 intentionally to avoid expensive division instructions within the micro-kernel. */ - bli_zscals( *alpha11, *chi11 ); + bli_tscals( z,z,z, *alpha11, *chi11 ); /* Output final result to matrix C. 
*/ - bli_zcopys( *chi11, *gamma11 ); + bli_tcopys( z,z, *chi11, *gamma11 ); } } } diff --git a/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c b/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c index 661167c9ca..9d133b0371 100644 --- a/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c @@ -40,8 +40,8 @@ void bli_ztrsm_u_template_noopt dcomplex* restrict a11, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* @@ -116,25 +116,25 @@ void bli_ztrsm_u_template_noopt gamma11 = c11 + (i )*rs_c + (j )*cs_c; /* chi11 = chi11 - a12t * x21; */ - bli_zset0s( rho11 ); + bli_tset0s( z, rho11 ); for ( l = 0; l < n_behind; ++l ) { alpha12 = a12t + (l )*cs_a; chi21 = x21 + (l )*rs_b; - bli_zaxpys( *alpha12, *chi21, rho11 ); + bli_taxpys( z,z,z,z, *alpha12, *chi21, rho11 ); } - bli_zsubs( rho11, *chi11 ); + bli_tsubs( z,z,z, rho11, *chi11 ); /* chi11 = chi11 / alpha11; */ /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead of alpha11, so we can multiply rather than divide. We store the inverse of alpha11 intentionally to avoid expensive division instructions within the micro-kernel. */ - bli_zscals( *alpha11, *chi11 ); + bli_tscals( z,z,z, *alpha11, *chi11 ); /* Output final result to matrix C. */ - bli_zcopys( *chi11, *gamma11 ); + bli_tcopys( z,z, *chi11, *gamma11 ); } } } diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index 7b5b532a34..d4e70d574f 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := template # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/thunderx2/bli_cntx_init_thunderx2.c b/config/thunderx2/bli_cntx_init_thunderx2.c index f2b7b633d9..9d1af2c99c 100644 --- a/config/thunderx2/bli_cntx_init_thunderx2.c +++ b/config/thunderx2/bli_cntx_init_thunderx2.c @@ -43,14 +43,28 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/thunderx2/bli_kernel_defs_thunderx2.h b/config/thunderx2/bli_kernel_defs_thunderx2.h new file mode 100644 index 0000000000..60292099cc --- /dev/null +++ b/config/thunderx2/bli_kernel_defs_thunderx2.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/thunderx2/make_defs.mk b/config/thunderx2/make_defs.mk index b43fea87c5..fd7df2eee2 100644 --- a/config/thunderx2/make_defs.mk +++ b/config/thunderx2/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := thunderx2 # may specify additional flags here as needed. CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/x86_64/make_defs.mk b/config/x86_64/make_defs.mk index 6a05a1f8f9..3c912370e0 100644 --- a/config/x86_64/make_defs.mk +++ b/config/x86_64/make_defs.mk @@ -47,7 +47,7 @@ THIS_CONFIG := x86_64 # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/zen/amd_config.mk b/config/zen/amd_config.mk index def1cadbae..b76bdebada 100644 --- a/config/zen/amd_config.mk +++ b/config/zen/amd_config.mk @@ -39,7 +39,7 @@ # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := -CPICFLAGS := +CPICFLAGS := -fPIC CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index ed7287cee0..99bd794d97 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,90 +40,103 @@ void bli_cntx_init_zen( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 8, + cntx, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); -#if 1 - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, +#if 0 + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + 
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. + BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, +#endif + + // packm + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6x16, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6x8, + BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3x8, + BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3x4, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, 
- cntx - ); - - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 10, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, // axpyv -#if 0 - BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, - BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int, -#else BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, -#endif -#if 0 // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, -#endif // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, @@ -134,24 +147,76 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, // scalv -#if 0 - BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, - BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int, -#else BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, -#endif -#if 0 // setv - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, // swapv BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. 
+ BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, #endif - cntx + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -173,143 +238,93 @@ void bli_cntx_init_zen( cntx_t* cntx ) mc = 510, kc = 1024 and nc = 4080 */ +#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES + // Zen optmized level 3 cache block sizes #if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1020, 510, 510, 255 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1024, 1024, 1024, 1024 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 ); #else bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 2040, 1528 ); #endif +#else + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 ); +#endif bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // 
Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 440, 220, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, -1, -1 ); +#if 0 + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 ); +#endif + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - // ------------------------------------------------------------------------- + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. 
- // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 ); + // gemmsup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx + BLIS_VA_END ); + // ------------------------------------------------------------------------- + +#if 0 // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( - 1, + cntx, + BLIS_GEMM, bli_gemmsup_ref, //BLIS_GEMMT, bli_gemmtsup_ref, - cntx - ); - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, -#if 0 - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, -#endif - -#if 0 - // NOTE: This set of kernels is likely broken and therefore disabled. 
- BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, -#endif - cntx + BLIS_VA_END ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 ); -#if 0 - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); #endif - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. 
- bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); } - diff --git a/config/zen/bli_family_zen.h b/config/zen/bli_family_zen.h index c82392b60e..da03bd7e42 100644 --- a/config/zen/bli_family_zen.h +++ b/config/zen/bli_family_zen.h @@ -39,6 +39,7 @@ #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 +#define BLIS_ENABLE_ZEN_BLOCK_SIZES // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 @@ -52,8 +53,8 @@ #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 //This macro will enable BLIS DGEMM to choose block sizes for a single instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 diff --git a/config/zen/bli_kernel_defs_zen.h b/config/zen/bli_kernel_defs_zen.h new file mode 100644 index 0000000000..c5bc8d63f3 --- /dev/null +++ b/config/zen/bli_kernel_defs_zen.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk index 8f975d5bc5..389a313b6c 100644 --- a/config/zen/make_defs.mk +++ b/config/zen/make_defs.mk @@ -1,11 +1,10 @@ # # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # -# Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2019, Advanced Micro Devices, Inc. +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -33,9 +32,6 @@ # # -# FLAGS that are specific to the 'zen' architecture are added here. -# FLAGS that are common for all the AMD architectures are present in -# amd_config.mk. # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. @@ -46,37 +42,50 @@ THIS_CONFIG := zen # --- Determine the C compiler and related flags --- # -# Include the file containing common flags for all AMD architectures. -AMD_CONFIG_FILE := amd_config.mk -AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen --include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := -fPIC +CWARNFLAGS := -ifeq ($(CC_VENDOR),gcc) -# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the -# Bulldozer instruction sets that were omitted from Zen. -# Additionally, if gcc is 4.9 (clang 3.5?) or newer, we may want to add -# Zen-specific instructions back into the mix: -# -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt -ifeq ($(GCC_OT_6_1_0),yes) -CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp -CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 else -# If gcc is at least 6.1.0, then we can specify the microarchitecture using -# the preferred option. -CRVECFLAGS += -march=znver1 -CKVECFLAGS += -march=znver1 +COPTFLAGS := -O2 -fomit-frame-pointer endif + +# Flags specific to optimized and reference kernels. +# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. 
+CKOPTFLAGS := $(COPTFLAGS) -O3 +CROPTFLAGS := $(CKOPTFLAGS) +CKVECFLAGS := -mavx2 -mfma -mfpmath=sse +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +ifeq ($(CC_VENDOR),gcc) + ifeq ($(GCC_OT_6_1_0),yes) # gcc versions older than 6.1. + CVECFLAGS_VER := -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp + else + CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store + endif else ifeq ($(CC_VENDOR),clang) -# I couldn't find which versions of clang added support for -march=znver1, -# so we don't even bother attempting the differentiation that appears in the -# gcc branch above. -CRVECFLAGS += -march=znver1 -CKVECFLAGS += -march=znver1 + CVECFLAGS_VER := -march=znver1 +else +ifeq ($(CC_VENDOR),aocc) + CVECFLAGS_VER := -march=znver1 -mllvm -disable-licm-vrp else -$(error gcc or clang are required for this configuration.) + $(error gcc, clang, or aocc is required for this configuration.) +endif endif endif +CKVECFLAGS += $(CVECFLAGS_VER) +CRVECFLAGS += $(CVECFLAGS_VER) # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/zen/make_defs.mk.old b/config/zen/make_defs.mk.old new file mode 100644 index 0000000000..44c2ad18d6 --- /dev/null +++ b/config/zen/make_defs.mk.old @@ -0,0 +1,84 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2019, Advanced Micro Devices, Inc. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# FLAGS that are specific to the 'zen' architecture are added here. +# FLAGS that are common for all the AMD architectures are present in +# amd_config.mk. + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := zen +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# Include the file containing common flags for all AMD architectures. +AMD_CONFIG_FILE := amd_config.mk +AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen +-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) + +ifeq ($(CC_VENDOR),gcc) +# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the +# Bulldozer instruction sets that were omitted from Zen. 
+# Additionally, if gcc is 4.9 (clang 3.5?) or newer, we may want to add +# Zen-specific instructions back into the mix: +# -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt +ifeq ($(GCC_OT_6_1_0),yes) +CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp +CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp +else +# If gcc is at least 6.1.0, then we can specify the microarchitecture using +# the preferred option. +CRVECFLAGS += -march=znver1 +CKVECFLAGS += -march=znver1 +endif +else +ifeq ($(CC_VENDOR),clang) +# I couldn't find which versions of clang added support for -march=znver1, +# so we don't even bother attempting the differentiation that appears in the +# gcc branch above. +CRVECFLAGS += -march=znver1 +CKVECFLAGS += -march=znver1 +else +$(error gcc or clang are required for this configuration.) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 0964ce463e..7f507d073d 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,71 +38,90 @@ void bli_cntx_init_zen2( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen2_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. 
- bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 8, + cntx, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, + + // level-3 sup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, 
BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, +#if 0 + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, +#endif -#if 1 - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. 
+ BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, + // packm + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6x16, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6x8, + BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3x8, + BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3x4, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 16, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -125,17 +144,59 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, //copy BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, //set - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - cntx + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // level-3 sup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, 
BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -155,130 +216,73 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z +#if 1 + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 500, 249, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 500, 249, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 500, 249, -1, -1 ); +#else + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 100000, 100000, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 100000, 100000, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 100000, 100000, -1, -1 ); +#endif + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - // ------------------------------------------------------------------------- + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. - // s d c z -#if 1 - bli_blksz_init_easy( &thresh[ BLIS_MT ], 500, 249, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 500, 249, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 500, 249, -1, -1 ); -#else - bli_blksz_init_easy( &thresh[ BLIS_MT ], 100000, 100000, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 100000, 100000, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 100000, 100000, -1, -1 ); -#endif + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NC_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KC_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MC_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx + BLIS_VA_END ); + // ------------------------------------------------------------------------- + #if 0 // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( - 1, - BLIS_GEMM, bli_gemmsup_ref, - cntx - ); -#endif + cntx, - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, -#if 0 - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, -#endif - -#if 0 - // NOTE: This set of kernels is likely broken and therefore disabled. 
- BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, -#endif - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 ); + BLIS_GEMM, bli_gemmsup_ref, + //BLIS_GEMMT, bli_gemmtsup_ref, - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. 
- bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + BLIS_VA_END ); +#endif } diff --git a/config/zen2/bli_family_zen2.h b/config/zen2/bli_family_zen2.h index a0f5b574d2..d7adddf3c8 100644 --- a/config/zen2/bli_family_zen2.h +++ b/config/zen2/bli_family_zen2.h @@ -51,8 +51,8 @@ #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 diff --git a/config/zen2/bli_kernel_defs_zen2.h b/config/zen2/bli_kernel_defs_zen2.h new file mode 100644 index 0000000000..c5bc8d63f3 --- /dev/null +++ b/config/zen2/bli_kernel_defs_zen2.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/zen2/make_defs.mk b/config/zen2/make_defs.mk index 7d3ccb4bfe..1eebf7fa76 100644 --- a/config/zen2/make_defs.mk +++ b/config/zen2/make_defs.mk @@ -1,11 +1,10 @@ # # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # -# Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2019, Advanced Micro Devices, Inc. +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -33,9 +32,6 @@ # # -# FLAGS that are specific to the 'zen2' architecture are added here. -# FLAGS that are common for all the AMD architectures are present in -# config/zen/amd_config.mk. 
# Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. @@ -46,41 +42,62 @@ THIS_CONFIG := zen2 # --- Determine the C compiler and related flags --- # -# Include file containing common flags for all AMD architectures. -AMD_CONFIG_FILE := amd_config.mk -AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen --include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := -fPIC +CWARNFLAGS := -ifeq ($(CC_VENDOR),gcc) -ifeq ($(GCC_OT_9_1_0),yes) -ifeq ($(GCC_OT_6_1_0),yes) -# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the -# Bulldozer instruction sets that were omitted from Zen. -CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp -CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp -else -# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 -# as the fallback option. -CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store -CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 else -# If gcc is at least 9.1.0, then we can specify the microarchitecture using -# the preferred option. -CRVECFLAGS += -march=znver2 -CKVECFLAGS += -march=znver2 +COPTFLAGS := -O2 -fomit-frame-pointer endif + +# Flags specific to optimized and reference kernels. +# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. +CKOPTFLAGS := $(COPTFLAGS) -O3 +CROPTFLAGS := $(CKOPTFLAGS) +CKVECFLAGS := -mavx2 -mfma -mfpmath=sse +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +ifeq ($(CC_VENDOR),gcc) + ifeq ($(GCC_OT_6_1_0),yes) # gcc versions older than 6.1. 
+ CVECFLAGS_VER := -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp + else + ifeq ($(GCC_OT_9_1_0),yes) # gcc versions 6.1 or newer, but older than 9.1. + CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store + else # gcc versions 9.1 or newer. + CVECFLAGS_VER := -march=znver2 + endif + endif else ifeq ($(CC_VENDOR),clang) -# I couldn't find which versions of clang added support for -march=znver1 -# or -march=znver2, so we don't even bother attempting the differentiation -# that appears in the gcc branch above. -CRVECFLAGS += -march=znver1 -CKVECFLAGS += -march=znver1 + ifeq ($(CLANG_OT_9_0_0),yes) # clang versions older than 9.0. + CVECFLAGS_VER := -march=znver1 + else # clang versions 9.0 or newer. + CVECFLAGS_VER := -march=znver2 + endif +else +ifeq ($(CC_VENDOR),aocc) + ifeq ($(AOCC_OT_2_0_0),yes) # aocc versions older than 2.0. + CVECFLAGS_VER := -march=znver1 -mllvm -disable-licm-vrp + else # aocc versions 2.0 or newer. + CVECFLAGS_VER := -march=znver2 + endif else -$(error gcc or clang are required for this configuration.) + $(error gcc, clang, or aocc is required for this configuration.) +endif endif endif +CKVECFLAGS += $(CVECFLAGS_VER) +CRVECFLAGS += $(CVECFLAGS_VER) # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/zen2/make_defs.mk.old b/config/zen2/make_defs.mk.old new file mode 100644 index 0000000000..9f0370376c --- /dev/null +++ b/config/zen2/make_defs.mk.old @@ -0,0 +1,94 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2019, Advanced Micro Devices, Inc. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# FLAGS that are specific to the 'zen2' architecture are added here. +# FLAGS that are common for all the AMD architectures are present in +# config/zen/amd_config.mk. + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := zen2 +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# Include file containing common flags for all AMD architectures. 
+AMD_CONFIG_FILE := amd_config.mk +AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen +-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) + +ifeq ($(CC_VENDOR),gcc) + ifeq ($(GCC_OT_9_1_0),yes) + ifeq ($(GCC_OT_6_1_0),yes) + # If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the + # Bulldozer instruction sets that were omitted from Zen. + CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp + CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp + else + # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 + # as the fallback option. + CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store + CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store + endif + else + # If gcc is at least 9.1.0, then we can specify the microarchitecture using + # the preferred option. + CRVECFLAGS += -march=znver2 + CKVECFLAGS += -march=znver2 + endif + else + ifeq ($(CC_VENDOR),clang) + ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) + CKVECFLAGS += -march=znver2 + else + #if compiling with clang + VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')) + CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1)) + #clang 9.0 or later: + ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0) + CKVECFLAGS += -march=znver2 + else + CKVECFLAGS += -march=znver1 + endif # ge 9 + endif # AOCC 2 + endif # Clang +endif # gcc + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c new file mode 100644 index 0000000000..cf0516a892 --- /dev/null +++ b/config/zen3/bli_cntx_init_zen3.c @@ -0,0 +1,305 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_cntx_init_zen3( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_zen3_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs + ( + cntx, + + // gemm + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, + + // gemmsup +#if 0 + // AMD: This should be enabled in the PR which has added these kernels + // Update the context with optimized small/unpacked gemm kernels. + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + 
BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, +#else + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, +#endif + + // packm +#if 0 + // AMD: This will be enabled in other PRs. 
+ BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, +#else + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6x16, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6x8, + BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3x8, + BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3x4, +#endif + + // axpyf + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + + // dotxf + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + + // amaxv + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, + + // axpyv + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, + + // dotv + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, + + // dotxv + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + + // scalv + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, + + // swapv + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + + // copyv + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + + // setv + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, +#if 0 + // AMD: This should be enabled in the PR which has added these kernels + // Update the context with optimized small/unpacked gemm kernels. 
+ BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, +#endif + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // + // These are reference block sizes and may be overridden based on + // number of threads used at runtime. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); + + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 200, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 240, 220, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. 
+ // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + // level-1f + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + + // gemmsup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, + + BLIS_VA_END + ); + + // ------------------------------------------------------------------------- + +#if 0 + // Initialize the context with the sup handlers. + bli_cntx_set_l3_sup_handlers + ( + cntx, + + BLIS_GEMM, bli_gemmsup_ref, + //BLIS_GEMMT, bli_gemmtsup_ref, + + BLIS_VA_END + ); +#endif +} + diff --git a/config/zen3/bli_family_zen3.h b/config/zen3/bli_family_zen3.h new file mode 100644 index 0000000000..d03e2edc7a --- /dev/null +++ b/config/zen3/bli_family_zen3.h @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLI_FAMILY_ZEN3_ +#define BLI_FAMILY_ZEN3_ + +// By default, it is effective to parallelize the outer loops. +// Setting these macros to 1 prevents the JR and IR inner loops +// from being parallelized.
+// + +#define BLIS_THREAD_MAX_IR 1 +#define BLIS_THREAD_MAX_JR 1 + + +// To enable framework optimizations for zen3 platform +// All zen3 specific code should be included in this macro +#define BLIS_CONFIG_ZEN3 + +// To enable framework optimizations for zen3 platform +// All zen3 specific code should be included in this macro +#define BLIS_CONFIG_ZEN3 + +//#define BLIS_ENABLE_SMALL_MATRIX +//#define BLIS_ENABLE_SMALL_MATRIX_TRSM + + +// This will select the threshold below which small matrix code will be called. +#define BLIS_SMALL_MATRIX_THRES 700 +#define BLIS_SMALL_M_RECT_MATRIX_THRES 160 +#define BLIS_SMALL_K_RECT_MATRIX_THRES 128 + +#define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) +#define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 + +#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 + +#define BLIS_ENABLE_SMALL_MATRIX_ROME +#define BLIS_SMALL_MATRIX_THRES_ROME 400 + +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 + +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 + +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 + +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 + +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 +#define 
D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 + +#endif diff --git a/config/zen3/bli_kernel_defs_zen3.h b/config/zen3/bli_kernel_defs_zen3.h new file mode 100644 index 0000000000..c5bc8d63f3 --- /dev/null +++ b/config/zen3/bli_kernel_defs_zen3.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk new file mode 100644 index 0000000000..0bd4ed3441 --- /dev/null +++ b/config/zen3/make_defs.mk @@ -0,0 +1,127 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := zen3 +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := -fPIC +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O3 +endif + +# Flags specific to optimized and reference kernels. +# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. +CKOPTFLAGS := $(COPTFLAGS) -fomit-frame-pointer +CROPTFLAGS := $(CKOPTFLAGS) +CKVECFLAGS := -mavx2 -mfma +CRVECFLAGS := $(CKVECFLAGS) +ifeq ($(CC_VENDOR),gcc) + ifeq ($(GCC_OT_9_1_0),yes) # gcc versions older than 9.1. + CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store + else + ifeq ($(GCC_OT_10_3_0),yes) # gcc versions 9.1 or newer, but older than 10.3. + CVECFLAGS_VER := -march=znver2 + else # gcc versions 10.3 or newer.
+ CVECFLAGS_VER := -march=znver3 + endif + endif + CKVECFLAGS += -mfpmath=sse + CRVECFLAGS += -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) + ifeq ($(CLANG_OT_9_0_0),yes) # clang versions older than 9.0. + CVECFLAGS_VER := -march=znver1 + else + ifeq ($(CLANG_OT_12_0_0),yes) # clang versions 9.0 or newer, but older than 12.0. + CVECFLAGS_VER := -march=znver2 + else + ifeq ($(OS_NAME),Darwin) # clang version 12.0 on OSX lacks znver3 support + CVECFLAGS_VER := -march=znver2 + else # clang versions 12.0 or newer. + CVECFLAGS_VER := -march=znver3 + endif + endif + endif + CKVECFLAGS += -mfpmath=sse + CRVECFLAGS += -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),aocc) + ifeq ($(AOCC_OT_2_0_0),yes) # aocc versions older than 2.0. + CVECFLAGS_VER := -march=znver1 + else + ifeq ($(AOCC_OT_3_0_0),yes) # aocc versions 2.0 or newer, but older than 3.0. + CVECFLAGS_VER := -march=znver2 + else # aocc versions 3.0 or newer. + CVECFLAGS_VER := -march=znver3 + endif + endif + CKVECFLAGS += -mfpmath=sse + CRVECFLAGS += -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),nvc) + CVECFLAGS_VER := -march=znver3 + CRVECFLAGS += -fast +else + $(error gcc, clang, nvc or aocc is required for this configuration.) +endif +endif +endif +endif +CKVECFLAGS += $(CVECFLAGS_VER) +CRVECFLAGS += $(CVECFLAGS_VER) + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) diff --git a/config/zen3/make_defs.mk.old b/config/zen3/make_defs.mk.old new file mode 100644 index 0000000000..9af3a90d4f --- /dev/null +++ b/config/zen3/make_defs.mk.old @@ -0,0 +1,137 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# - Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# - Neither the name(s) of the copyright holder(s) nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+# FLAGS that are specific to the 'zen3' architecture are added here.
+# FLAGS that are common for all the AMD architectures are present in
+# config/zen/amd_config.mk.
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG := zen3
+#CONFIGS_INCL += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+CPPROCFLAGS :=
+CMISCFLAGS :=
+CPICFLAGS := -fPIC
+CWARNFLAGS :=
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS := -O0
+else
+# Frame pointers are needed for execution tracing, so keep them when ETRACE is on.
+ifeq ($(ETRACE_ENABLE),1)
+COPTFLAGS := -O3
+else
+COPTFLAGS := -O3 -fomit-frame-pointer
+endif
+endif
+
+
+#
+# --- Enable ETRACE across the library if enabled ETRACE_ENABLE=[0,1] -----------------------
+#
+
+ifeq ($(ETRACE_ENABLE),1)
+CDBGFLAGS += -pg -finstrument-functions -DAOCL_DTL_AUTO_TRACE_ENABLE
+LDFLAGS += -ldl
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS := $(COPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
+# gcc or clang version must be at least 4.0
+# gcc 9.0 or later:
+ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
+CKVECFLAGS += -march=znver2
+else
+# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
+# as the fallback option.
+CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+endif
+else
+ifeq ($(CC_VENDOR),clang)
+
+# AOCC clang has various formats for the version line, for example:
+
+# AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19)
+# AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12)
+# AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0)
+# AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)
+# AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
+
+# For our purpose we just want to know whether it is version 2.x or 3.x.
+
+# For version 3.x (first line of '$(CC) -v' mentions AOCC_3) we enable znver3.
+ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
+CKVECFLAGS += -march=znver3
+else
+# For version 2.x we enable znver2.
+ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
+CKVECFLAGS += -march=znver2
+else
+# Otherwise assume non-AOCC clang. NOTE(review): the version is read from ${CC_VENDOR}, not $(CC) -- confirm.
+VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
+CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
+# clang 9.0 or later:
+ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
+CKVECFLAGS += -march=znver2
+else
+CKVECFLAGS += -march=znver1
+endif # ge 9
+endif # aocc 2
+endif # aocc 3
+endif # clang
+endif # gcc
+
+# Flags specific to reference kernels (':=' replaces any CRVECFLAGS value appended above).
+CROPTFLAGS := $(CKOPTFLAGS)
+CRVECFLAGS := $(CKVECFLAGS)
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config_registry b/config_registry
index bdd3d22281..8154393487 100644
--- a/config_registry
+++ b/config_registry
@@ -8,11 +8,13 @@
 #
 # Processor families.
-x86_64: intel64 amd64 -intel64: skx knl haswell sandybridge penryn generic -amd64: zen2 zen excavator steamroller piledriver bulldozer generic -arm64: firestorm thunderx2 cortexa57 cortexa53 generic -arm32: cortexa15 cortexa9 generic +x86_64: intel64 amd64 amd64_legacy +intel64: skx knl haswell sandybridge penryn generic +amd64_legacy: excavator steamroller piledriver bulldozer generic +amd64: zen3 zen2 zen generic +arm64: armsve firestorm thunderx2 cortexa57 cortexa53 generic +arm32: cortexa15 cortexa9 generic +power: power10 power9 generic # Intel architectures. skx: skx/skx/haswell/zen @@ -22,6 +24,7 @@ sandybridge: sandybridge penryn: penryn # AMD architectures. +zen3: zen3/zen3/zen2/zen/haswell zen2: zen2/zen2/zen/haswell zen: zen/zen/haswell excavator: excavator/piledriver @@ -32,10 +35,18 @@ bulldozer: bulldozer # ARM architectures. armsve: armsve/armsve a64fx: a64fx/armsve + +# ARM Neon64 (4 pipes x 128b) architectures. +altramax: altramax/armv8a +altra: altra/armv8a firestorm: firestorm/armv8a + +# ARM (2 pipes x 128b) architectures. thunderx2: thunderx2/armv8a cortexa57: cortexa57/armv8a cortexa53: cortexa53/armv8a + +# ARM Vintage architectures. cortexa15: cortexa15/armv7a cortexa9: cortexa9/armv7a @@ -44,5 +55,15 @@ power10: power10 power9: power9 bgq: bgq +# RISC-V architectures. +rv32i: rv32i/rvi +rv64i: rv64i/rvi +rv32iv: rv32iv/rviv +rv64iv: rv64iv/rviv + +# SiFive architectures. +sifive_rvv: sifive_rvv +sifive_x280: sifive_x280/sifive_rvv + # Generic architectures. generic: generic diff --git a/configure b/configure index 3c865dad90..a22054e755 100755 --- a/configure +++ b/configure @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2020, Advanced Micro Devices, Inc. +# Copyright (C) 2020-2022, Advanced Micro Devices, Inc. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -32,6 +32,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # +# shellcheck disable=2001,2249,2034,2154,2181,2312,2250,2292 # # -- Helper functions ---------------------------------------------------------- @@ -42,330 +43,439 @@ print_usage() # Use the version string in the 'version' file since we don't have # the patched version string yet. if [ -z "${version}" ]; then - version=$(cat "${version_filepath}") + version=$(<"${version_filepath}") fi # Echo usage info. - echo " " - echo " ${script_name} (BLIS ${version})" - #echo " " - #echo " BLIS ${version}" - echo " " - echo " Configure BLIS's build system for compilation using a specified" - echo " configuration directory." - echo " " - echo " Usage:" - echo " " - echo " ${script_name} [options] [env. vars.] confname" - echo " " - echo " Arguments:" - echo " " - echo " confname The name of the sub-directory inside of the 'config'" - echo " directory containing the desired BLIS configuration." - echo " Note that confname MUST be specified; if it is not," - echo " configure will complain. To build a completely generic" - echo " implementation, use the 'generic' configuration" - echo " " - echo " Options:" - echo " " - echo " -p PREFIX, --prefix=PREFIX" - echo " " - echo " The common installation prefix for all files. If given," - echo " this option effectively implies:" - echo " --libdir=EXECPREFIX/lib" - echo " --includedir=PREFIX/include" - echo " --sharedir=PREFIX/share" - echo " where EXECPREFIX defaults to PREFIX. If this option is" - echo " not given, PREFIX defaults to '${prefix_def}'. If PREFIX" - echo " refers to a directory that does not exist, it will be" - echo " created." - echo " " - echo " --exec-prefix=EXECPREFIX" - echo " " - echo " The installation prefix for libraries. 
Specifically, if" - echo " given, this option effectively implies:" - echo " --libdir=EXECPREFIX/lib" - echo " If not given, EXECPREFIX defaults to PREFIX, which may be" - echo " modified by the --prefix option. If EXECPREFIX refers to" - echo " a directory that does not exist, it will be created." - echo " " - echo " --libdir=LIBDIR" - echo " " - echo " The path to which make will install libraries. If not" - echo " given, LIBDIR defaults to PREFIX/lib. If LIBDIR refers to" - echo " a directory that does not exist, it will be created." - echo " " - echo " --includedir=INCDIR" - echo " " - echo " The path to which make will install development header" - echo " files. If not given, INCDIR defaults to PREFIX/include." - echo " If INCDIR refers to a directory that does not exist, it" - echo " will be created." - echo " " - echo " --sharedir=SHAREDIR" - echo " " - echo " The path to which make will makefile fragments containing" - echo " make variables determined by configure (e.g. CC, CFLAGS," - echo " and LDFLAGS). These files allow certain BLIS makefiles," - echo " such as those in the examples or testsuite directories, to" - echo " operate on an installed copy of BLIS rather than a local" - echo " (and possibly uninstalled) copy. If not given, SHAREDIR" - echo " defaults to PREFIX/share. If SHAREDIR refers to a" - echo " directory that does not exist, it will be created." - echo " " - echo " --enable-verbose-make, --disable-verbose-make" - echo " " - echo " Enable (disabled by default) verbose compilation output" - echo " during make." - echo " " - echo " --enable-arg-max-hack --disable-arg-max-hack" - echo " " - echo " Enable (disabled by default) build system logic that" - echo " will allow archiving/linking the static/shared library" - echo " even if the command plus command line arguments exceeds" - echo " the operating system limit (ARG_MAX)." - echo " " - echo " -d DEBUG, --enable-debug[=DEBUG]" - echo " " - echo " Enable debugging symbols in the library. 
If argument" - echo " DEBUG is given as 'opt', then optimization flags are" - echo " kept in the framework, otherwise optimization is" - echo " turned off." - echo " " - echo " --disable-static, --enable-static" - echo " " - echo " Disable (enabled by default) building BLIS as a static" - echo " library. If the static library build is disabled, the" - echo " shared library build must remain enabled." - echo " " - echo " --disable-shared, --enable-shared" - echo " " - echo " Disable (enabled by default) building BLIS as a shared" - echo " library. If the shared library build is disabled, the" - echo " static library build must remain enabled." - echo " " - echo " --enable-rpath, --disable-rpath" - echo " " - echo " Enable (disabled by default) setting an install_name for" - echo " dynamic libraries on macOS which starts with @rpath rather" - echo " than the absolute install path." - echo " " - echo " -e SYMBOLS, --export-shared[=SYMBOLS]" - echo " " - echo " Specify the subset of library symbols that are exported" - echo " within a shared library. Valid values for SYMBOLS are:" - echo " 'public' (the default) and 'all'. By default, only" - echo " functions and variables that belong to public APIs are" - echo " exported in shared libraries. However, the user may" - echo " instead export all symbols in BLIS, even those that were" - echo " intended for internal use only. Note that the public APIs" - echo " encompass all functions that almost any user would ever" - echo " want to call, including the BLAS/CBLAS compatibility APIs" - echo " as well as the basic and expert interfaces to the typed" - echo " and object APIs that are unique to BLIS. Also note that" - echo " changing this option to 'all' will have no effect in some" - echo " environments, such as when compiling with clang on" - echo " Windows." 
- echo " " - echo " -t MODEL, --enable-threading[=MODEL], --disable-threading" - echo " " - echo " Enable threading in the library, using threading model" - echo " MODEL={openmp,pthreads,no}. If MODEL=no or " - echo " --disable-threading is specified, threading will be" - echo " disabled. The default is 'no'." - echo " " - echo " --enable-system, --disable-system" - echo " " - echo " Enable conventional operating system support, such as" - echo " pthreads for thread-safety. The default state is enabled." - echo " However, in rare circumstances you may wish to configure" - echo " BLIS for use with a minimal or nonexistent operating" - echo " system (e.g. hardware simulators). In these situations," - echo " --disable-system may be used to jettison all compile-time" - echo " and link-time dependencies outside of the standard C" - echo " library. When disabled, this option also forces the use" - echo " of --disable-threading." - echo " " - echo " --disable-pba-pools, --enable-pba-pools" - echo " --disable-sba-pools, --enable-sba-pools" - echo " " - echo " Disable (enabled by default) use of internal memory pools" - echo " within the packing block allocator (pba) and/or the small" - echo " block allocator (sba). The former is used to allocate" - echo " memory used to pack submatrices while the latter is used" - echo " to allocate control/thread tree nodes and thread" - echo " communicators. Both allocations take place in the context" - echo " of level-3 operations. When the pba is disabled, the" - echo " malloc()-like function specified by BLIS_MALLOC_POOL is" - echo " called on-demand whenever a packing block is needed, and" - echo " when the sba is disabled, the malloc()-like function" - echo " specified by BLIS_MALLOC_INTL is called whenever a small" - echo " block is needed, with the two allocators calling free()-" - echo " like functions BLIS_FREE_POOL and BLIS_FREE_INTL," - echo " respectively when blocks are released. 
When enabled," - echo " either or both pools are populated via the same functions" - echo " mentioned previously, and henceforth blocks are checked" - echo " out and in. The library quickly reaches a state in which" - echo " it no longer needs to call malloc() or free(), even" - echo " across many separate level-3 operation invocations." - echo " " - echo " --enable-mem-tracing, --disable-mem-tracing" - echo " " - echo " Enable (disable by default) output to stdout that traces" - echo " the allocation and freeing of memory, including the names" - echo " of the functions that triggered the allocation/freeing." - echo " Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE." - echo " Please use only for informational/debugging purposes." - echo " " - echo " -i SIZE, --int-size=SIZE" - echo " " - echo " Set the size (in bits) of internal BLIS integers and" - echo " integer types used in native BLIS interfaces. The" - echo " default inteter type size is architecture dependent." - echo " (Hint: You can always find this value printed at the" - echo " beginning of the testsuite output.)" - echo " " - echo " -b SIZE, --blas-int-size=SIZE" - echo " " - echo " Set the size (in bits) of integer types in external" - echo " BLAS and CBLAS interfaces, if enabled. The default" - echo " integer type size used in BLAS/CBLAS is 32 bits." - echo " " - echo " --disable-blas, --enable-blas" - echo " " - echo " Disable (enabled by default) building the BLAS" - echo " compatibility layer." - echo " " - echo " --enable-cblas, --disable-cblas" - echo " " - echo " Enable (disabled by default) building the CBLAS" - echo " compatibility layer. This automatically enables the" - echo " BLAS compatibility layer as well." 
- echo " " - echo " --disable-mixed-dt, --enable-mixed-dt" - echo " " - echo " Disable (enabled by default) support for mixing the" - echo " storage domain and/or storage precision of matrix" - echo " operands for the gemm operation, as well as support" - echo " for computing in a precision different from one or" - echo " both of matrices A and B." - echo " " - echo " --disable-mixed-dt-extra-mem, --enable-mixed-dt-extra-mem" - echo " " - echo " Disable (enabled by default) support for additional" - echo " mixed datatype optimizations that require temporarily" - echo " allocating extra memory--specifically, a single m x n" - echo " matrix (per application thread) whose storage datatype" - echo " is equal to the computation datatype. This option may" - echo " only be enabled when mixed domain/precision support is" - echo " enabled." - echo " " - echo " --disable-sup-handling, --enable-sup-handling" - echo " " - echo " Disable (enabled by default) handling of small/skinny" - echo " matrix problems via separate code branches. When disabled," - echo " these small/skinny level-3 operations will be performed by" - echo " the conventional implementation, which is optimized for" - echo " medium and large problems. Note that what qualifies as" - echo " \"small\" depends on thresholds that may vary by sub-" - echo " configuration." - echo " " - echo " -s NAME --enable-sandbox=NAME" - echo " " - echo " Enable a separate sandbox implementation of gemm. This" - echo " option disables BLIS's conventional gemm implementation" - echo " (which shares common infrastructure with other level-3" - echo " operations) and instead compiles and uses the code in" - echo " the NAME directory, which is expected to be a sub-" - echo " directory of 'sandbox'. By default, no sandboxes are" - echo " enabled." 
- echo " " - echo " --with-memkind, --without-memkind" - echo " " - echo " Forcibly enable or disable the use of libmemkind's" - echo " hbw_malloc() and hbw_free() as substitutes for malloc()" - echo " and free(), respectively, when allocating memory for" - echo " BLIS's memory pools, which are used to manage buffers" - echo " into which matrices are packed. The default behavior" - echo " for this option is environment-dependent; if configure" - echo " detects the presence of libmemkind, libmemkind is used" - echo " by default, and otherwise it is not used by default." - echo " " - echo " -r METHOD, --thread-part-jrir=METHOD" - echo " " - echo " Request a method of assigning micropanels to threads in" - echo " the JR and IR loops. Valid values for METHOD are 'slab'" - echo " and 'rr'. Using 'slab' assigns (as much as possible)" - echo " contiguous regions of micropanels to each thread while" - echo " using 'rr' assigns micropanels to threads in a round-" - echo " robin fashion. The chosen method also applies during" - echo " the packing of A and B. The default method is 'slab'." - echo " NOTE: Specifying this option constitutes a request," - echo " which may be ignored in select situations if the" - echo " implementation has a good reason to do so." - echo " " - echo " --disable-trsm-preinversion, --enable-trsm-preinversion" - echo " " - echo " Disable (enabled by default) pre-inversion of triangular" - echo " matrix diagonals when performing trsm. When pre-inversion" - echo " is enabled, diagonal elements are inverted outside of the" - echo " microkernel (e.g. during packing) so that the microkernel" - echo " can use multiply instructions. When disabled, division" - echo " instructions are used within the microkernel. 
Executing" - echo " these division instructions within the microkernel will" - echo " incur a performance penalty, but numerical robustness will" - echo " improve for certain cases involving denormal numbers that" - echo " would otherwise result in overflow in the pre-inverted" - echo " values." - echo " " - echo " --force-version=STRING" - echo " " - echo " Force configure to use an arbitrary version string" - echo " STRING. This option may be useful when repackaging" - echo " custom versions of BLIS by outside organizations." - echo " " - echo " -c, --show-config-lists" - echo " " - echo " Print the config and kernel lists, and kernel-to-config" - echo " map after they are read from file. This can be useful" - echo " when debugging certain configuration issues, and/or as" - echo " a sanity check to make sure these lists are constituted" - echo " as expected." - echo " " - echo " --complex-return=gnu|intel" - echo " " - echo " Specify the way in which complex numbers are returned" - echo " from Fortran functions, either \"gnu\" (return in" - echo " registers) or \"intel\" (return via hidden argument)." - echo " If not specified and the environment variable FC is set," - echo " attempt to determine the return type from the compiler." - echo " Otherwise, the default is \"gnu\"." - echo " " - echo " -q, --quiet Suppress informational output. By default, configure" - echo " is verbose. (NOTE: -q is not yet implemented)" - echo " " - echo " -h, --help Output this information and quit." - echo " " - echo " Environment Variables:" - echo " " - echo " CC Specifies the C compiler to use." - echo " CXX Specifies the C++ compiler to use (sandbox only)." - echo " FC Specifies the Fortran compiler to use (only to determine --complex-return)." - echo " RANLIB Specifies the ranlib executable to use." - echo " AR Specifies the archiver to use." - echo " CFLAGS Specifies additional compiler flags to use (prepended)." 
- echo " LDFLAGS Specifies additional linker flags to use (prepended)." - echo " LIBPTHREAD Pthreads library to use." - echo " PYTHON Specifies the python interpreter to use." - echo " " - echo " Environment variables may also be specified as command line" - echo " options, e.g.:" - echo " " - echo " ./configure [options] CC=gcc haswell" - echo " " - echo " Note that not all compilers are compatible with a given" - echo " configuration." - echo " " + cat <