From 3fb54e5eab165588efa90442b548865be0ecb1f8 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Keck <jean-baptiste.keck@grenoble-inp.fr> Date: Wed, 28 Sep 2022 17:26:55 +0200 Subject: [PATCH] Singularity support --- README.md | 61 ++++++++++++++++++++++++-- ci/README.md | 6 --- ci/singularity_images/.gitignore | 1 + ci/utils/pull_docker_image.sh | 2 +- ci/utils/pull_singularity_image.sh | 27 ++++++++++++ ci/utils/push_docker_image.sh | 2 +- ci/utils/run_singularity_image.sh | 56 ++++++++++++++++++++++++ hysop/__init__.py.in | 4 ++ hysop/core/tests/test_checkpoint.sh | 38 +++++++++-------- hysop/core/tests/test_tasks.sh | 16 ++++--- hysop/fields/tests/test_cartesian.py | 4 +- hysop/fields/tests/test_cartesian.sh | 4 +- hysop/tools/cache.py | 64 ++++++++++++++++++---------- hysop_examples/argparser.py | 10 ++++- 14 files changed, 232 insertions(+), 63 deletions(-) delete mode 100644 ci/README.md create mode 100644 ci/singularity_images/.gitignore create mode 100755 ci/utils/pull_singularity_image.sh create mode 100755 ci/utils/run_singularity_image.sh diff --git a/README.md b/README.md index eb19ce4c1..36f536edf 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ make install By default, cmake will try to find your most up to date Python3 installation. The minimum required version is Python3.8. You can force the python version by using the following trick during the cmake configuration step. 
If you want to force cmake to detect python3.9: ``` PYTHON_EXECUTABLE="$(which python3.9)" -PYTHON_INCLUDE_DIR=$(${PYTHON_EXECUTABLE} -c "import sysconfig as sc; print(sc.get_paths()['include'])") +PYTHON_INCLUDE_DIR=$(${PYTHON_EXECUTABLE} -c "import sysconfig as sc; print(sc.get_paths()['include'])") PYTHON_LIBRARY=$(${PYTHON_EXECUTABLE} -c "import sysconfig as sc, os; print(os.path.normpath(os.path.sep.join(sc.get_config_vars('LIBDIR', 'INSTSONAME'))))") cmake -DPython3_EXECUTABLE="${PYTHON_EXECUTABLE}" -DPython3_INCLUDE_DIR="${PYTHON_INCLUDE_DIR}" -DPython3_LIBRARY="${PYTHON_LIBRARY}" .. ``` @@ -63,9 +63,62 @@ sudo apt-get install -y expat unzip xz-utils automake libtool pkg-config cmake g python3 -m pip install --upgrade numpy setuptools cffi wheel pytest pybind11 cython python3 -m pip install --upgrade -r requirements.txt ``` -Additionally you may want to provide a working OpenCL platform, HPTT, llvm/llvmlite/numba, clFFT/gpyFFT, flint/arb/python-flint and tbb/mklfft. See the docker files to install those packages (`hysop/ci/docker_images/ubuntu`). Alternatively your can run hysop in an isolated environment by using docker containers (see next section). +Additionally you may want to provide a working OpenCL platform, HPTT, llvm/llvmlite/numba, clFFT/gpyFFT, flint/arb/python-flint and tbb/mklfft. See the docker files to get instructions on how to configure and install these packages (`hysop/ci/docker_images/ubuntu/jammy/Dockerfile`). Alternatively you can run hysop in an isolated environment by using either singularity or docker containers (see next sections). + +# Running in isolation with singularity +Singularity support uses the hysop docker images used for continuous integration and thus does not directly include hysop. Singularity is easier to use than docker because it bind mounts `$PWD`, `$TMP` and your `$HOME` directory by default. Getting started with singularity is the way to go if you have singularity installed on your compute cluster. 
To install singularity on your personal machine, please follow the [singularity documentation](https://docs.sylabs.io/guides/3.0/user-guide/quick_start.html). + +Steps to set up hysop in singularity: +1. Put hysop source code somewhere in your home. + ``` + export HYSOP_ROOT="${HOME}/hysop" + git clone https://gricad-gitlab.univ-grenoble-alpes.fr/particle_methods/hysop.git "${HYSOP_ROOT}" + ``` +2. Pull the singularity image; in this example we use the `jammy_cuda` image which includes GPU support. You can use `jammy` to use the CPU OpenCL backend. + ``` + singularity pull --docker-login docker://gricad-registry.univ-grenoble-alpes.fr/particle_methods/hysop:jammy_cuda + ``` + +3. Build and install hysop directly in your home from within the container. This has to be run on a node with an OpenCL library available, most likely a compute node if you are on a cluster. + ``` + singularity run --nv --bind /etc/OpenCL hysop_jammy_cuda.sif bash -c "cd ${HYSOP_ROOT} && cmake -S . -B ./build && cmake --install ./build" + ``` + +Now the container is ready and you can: +- Run the container in interactive mode: + ``` + singularity shell --nv --bind /etc/OpenCL hysop_jammy_cuda.sif + python3 ${HYSOP_ROOT}/hysop_examples/examples/taylor_green/taylor_green.py -V + ``` +- Directly run a simulation inside the container: + ``` + singularity run --nv --bind /etc/OpenCL hysop_jammy_cuda.sif python3 "${HYSOP_ROOT}/hysop_examples/examples/taylor_green/taylor_green.py" -V + ``` +- Directly run a simulation with MPI + ``` + mpirun -np 4 -- singularity run --nv --bind /etc/OpenCL hysop_jammy_cuda.sif python3 "${HYSOP_ROOT}/hysop_examples/examples/taylor_green/taylor_green.py" -V + ``` + + Please note that `--nv --bind /etc/OpenCL` is only required to enable the NVidia OpenCL backend. See [singularity documentation](https://docs.sylabs.io/guides/3.5/user-guide/gpu.html) for other OpenCL backends. Environment variables are also forwarded by default unless `--cleanenv` has been specified. 
+ +## Known problems within singularity containers +1. You may get warnings about file locking. If this is a problem, consider disabling the file locking mechanism by passing `--env HYSOP_ENABLE_FILELOCKS=0` to singularity or by passing `--disable-file-locks` directly to your hysop script. + +2. As singularity bind mounts your `$HOME` by default, your `$HOME` python environment and `.bashrc` may alter the runtime. You can disable this by passing `--no-home` but this may break python packages that cache data inside your `$HOME` by default. An alternative solution is to bind mount a custom home directory inside the image by passing `--cleanenv --home cache_dir_on_host:/home`. To get rid of this problem, you can use singularity utility scripts. + +## Singularity utility scripts +Singularity utility scripts take care of isolating your home and environment variables on host. It also disables hysop file locking by default. Host hysop directory is mounted as read/write to `/hysop` and `./ci/singularity_images/hysop_[imgname].home` is mounted to `/home`. You can use git from within the container as `${HOME}/.ssh` is also bound to the fake home. +``` +./ci/utils/pull_singularity_image.sh jammy_cuda +./ci/utils/run_singularity_image.sh jammy_cuda bash -c 'cd /hysop && cmake -S . -B ./build && cmake --install ./build' +./ci/utils/run_singularity_image.sh jammy_cuda python3 /hysop/hysop_examples/examples/taylor_green/taylor_green.py -V +``` + +You can also run an interactive shell with `./ci/utils/run_singularity_image.sh jammy_cuda`. + +You may also bind additional files and directories for I/O by using the `SINGULARITY_BIND` environment variable on host with a comma separated list of mounts with the following format `src[:dest[:opts]]`. Current working directory `$PWD` is mounted as read-write by default and this cannot be disabled. -# How to use docker images ? +# Running in isolation with docker Docker images can be pulled (resp. 
pushed) with `./ci/utils/pull_docker_image.sh [imgname]` and `./ci/utils/push_docker_image.sh [imgname]`. The docker images do not contain de hysop library and can be run with `./ci/utils/run_docker_image.sh [imgname]`. This script mounts your local hysop directory (read only) to `/hysop` inside the docker container and prompt a shell. Images have to be downloaded (pulled) prior to be run with this script. By default, `[imgname]` corresponds to the docker image used for gitlab continuous integration (currently `jammy`, which corresponds to Ubuntu 22.04 running with python3.10). Docker images without the `_cuda` postfix ship an intel OpenCL platform that is compatible with intel CPUs. Try out the `jammy_cuda` docker image to run on the GPUs (requires host system driver compatible with cuda 11.7). @@ -74,7 +127,7 @@ To quickly test and/or debug hysop inside the docker you can run `./ci/utils/run Docker images can be build locally by using the `./ci/utils/build_docker_image.sh` script. It is advised to build docker image only if the pull fails or if you want to add/update dependencies. Each build takes around one hour (36 cores @ 2GHz). By default, the build script will use all of your cores. At least 16GB of RAM is recommended. -There are equivalent `*.bat` scripts to run on Windows. Please note that the cuda enabled images currently do not work on Windows due to this [bug](https://github.com/microsoft/WSL/issues/6951). +There are equivalent `*.bat` scripts to run on Windows. Please note that the cuda enabled images currently do not work on Windows due to this [bug](https://github.com/microsoft/WSL/issues/6951). ## Known problems within docker containers 1. 
**OpenMPI complains about the program being run as root:** diff --git a/ci/README.md b/ci/README.md deleted file mode 100644 index bf1daf7d3..000000000 --- a/ci/README.md +++ /dev/null @@ -1,6 +0,0 @@ -Continuous integration howto -============================ - -See scripts in utils to build, run, push and pull the docker image for continuous integration tests. - -OpenCL is not working on Windows yet due to issue https://github.com/microsoft/WSL/issues/6951. diff --git a/ci/singularity_images/.gitignore b/ci/singularity_images/.gitignore new file mode 100644 index 000000000..72e8ffc0d --- /dev/null +++ b/ci/singularity_images/.gitignore @@ -0,0 +1 @@ +* diff --git a/ci/utils/pull_docker_image.sh b/ci/utils/pull_docker_image.sh index 46d12f5a6..2429e9fe1 100755 --- a/ci/utils/pull_docker_image.sh +++ b/ci/utils/pull_docker_image.sh @@ -14,7 +14,7 @@ ## See the License for the specific language governing permissions and ## limitations under the License. ## -set -euf -o pipefail +set -ef -o pipefail DOCKER_IMAGE_TAG=${1:-jammy} HYSOP_REGISTRY_URL='gricad-registry.univ-grenoble-alpes.fr' docker logout diff --git a/ci/utils/pull_singularity_image.sh b/ci/utils/pull_singularity_image.sh new file mode 100755 index 000000000..b774dca67 --- /dev/null +++ b/ci/utils/pull_singularity_image.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +## +## Copyright (C) HySoP 2011-2022 +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +set -feu -o pipefail +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +HYSOP_SINGULARITY_IMG_DIR=$(readlink -f "${SCRIPT_DIR}/../singularity_images") +DOCKER_IMAGE_TAG=${1:-jammy} +HYSOP_REGISTRY_URL='gricad-registry.univ-grenoble-alpes.fr' + +cd "${HYSOP_SINGULARITY_IMG_DIR}" +singularity pull --docker-login "docker://${HYSOP_REGISTRY_URL}/particle_methods/hysop:${DOCKER_IMAGE_TAG}" +cd - + +echo "Singularity image successfully pulled to ${HYSOP_SINGULARITY_IMG_DIR}/hysop_${DOCKER_IMAGE_TAG}.sif" diff --git a/ci/utils/push_docker_image.sh b/ci/utils/push_docker_image.sh index c4e9a855e..5d1a46a9e 100755 --- a/ci/utils/push_docker_image.sh +++ b/ci/utils/push_docker_image.sh @@ -14,7 +14,7 @@ ## See the License for the specific language governing permissions and ## limitations under the License. ## -set -euf -o pipefail +set -ef -o pipefail DOCKER_IMAGE_TAG=${1:-jammy} HYSOP_REGISTRY_URL='gricad-registry.univ-grenoble-alpes.fr' docker logout diff --git a/ci/utils/run_singularity_image.sh b/ci/utils/run_singularity_image.sh new file mode 100755 index 000000000..63f626ada --- /dev/null +++ b/ci/utils/run_singularity_image.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +## +## Copyright (C) HySoP 2011-2022 +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +set -feu -o pipefail +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +HYSOP_ROOT=$(readlink -f "${SCRIPT_DIR}/../..") +DOCKER_IMAGE_TAG=${1:-jammy} + +SINGULARITY_IMAGE_DIR="${HYSOP_ROOT}/ci/singularity_images" +SINGULARITY_IMAGE="${SINGULARITY_IMAGE_DIR}/hysop_${DOCKER_IMAGE_TAG}.sif" +SINGULARITY_HOME="${SINGULARITY_IMAGE_DIR}/hysop_${DOCKER_IMAGE_TAG}.home" + +if [[ ! -f "${SINGULARITY_IMAGE}" ]]; then + echo "Singularity image '${SINGULARITY_IMAGE}' is not present on your system, you can get it by executing '${SCRIPT_DIR}/pull_singularity_image.sh ${DOCKER_IMAGE_TAG}'." + exit 1 +fi + +SINGULARITY_ARGS="--cleanenv --env HYSOP_ROOT=/hysop --env HYSOP_ENABLE_FILELOCKS=0" +SINGULARITY_ARGS="${SINGULARITY_ARGS} --home ${SINGULARITY_HOME}:${HOME}" +SINGULARITY_ARGS="${SINGULARITY_ARGS} --bind ${HYSOP_ROOT}:/hysop" +if [[ -f "${HOME}/.gitconfig" ]]; then + SINGULARITY_ARGS="${SINGULARITY_ARGS} --bind ${HOME}/.gitconfig" +fi +if [[ -d "${HOME}/.ssh" ]]; then + SINGULARITY_ARGS="${SINGULARITY_ARGS} --bind ${HOME}/.ssh" +fi +if [[ ${DOCKER_IMAGE_TAG} == *_cuda ]]; then + SINGULARITY_ARGS="${SINGULARITY_ARGS} --nv" + if [[ -d '/etc/OpenCL' ]]; then + SINGULARITY_ARGS="${SINGULARITY_ARGS} --bind /etc/OpenCL" + fi +fi + +mkdir -p "${SINGULARITY_HOME}" + +echo "Container home directory set to '${SINGULARITY_HOME}'." 
+echo "Running singularity with the following args:$(echo "${SINGULARITY_ARGS}" | sed 's/--/\n --/g')" + +if [[ "${#}" -ge 2 ]]; then + singularity run ${SINGULARITY_ARGS} "${SINGULARITY_IMAGE}" "${@:2}" +else + singularity shell ${SINGULARITY_ARGS} "${SINGULARITY_IMAGE}" +fi diff --git a/hysop/__init__.py.in b/hysop/__init__.py.in index 36fa93a9f..b749d5f25 100644 --- a/hysop/__init__.py.in +++ b/hysop/__init__.py.in @@ -101,6 +101,9 @@ __FFTW_PLANNER_TIMELIMIT__ = int(get_env('FFTW_PLANNER_TIMELIMIT', -1)) __DEFAULT_PLATFORM_ID__ = int(get_env('DEFAULT_PLATFORM_ID', @OPENCL_DEFAULT_OPENCL_PLATFORM_ID@)) __DEFAULT_DEVICE_ID__ = int(get_env('DEFAULT_DEVICE_ID', @OPENCL_DEFAULT_OPENCL_DEVICE_ID@)) +# File locks +__HYSOP_ENABLE_FILELOCKS__ = bool(get_env('ENABLE_FILELOCKS', True)) + if __TRACE_WARNINGS__: warnings.simplefilter('always') @@ -209,6 +212,7 @@ default_path = IO.default_path() cache_path = IO.default_cache_path() msg_io = '\n*Default path for all i/o is \'{}\'.'.format(default_path) msg_io += '\n*Default path for caching is \'{}\'.'.format(cache_path) +msg_io += f'\n*Cache file locking mechanism is {"enabled" if __HYSOP_ENABLE_FILELOCKS__ else "disabled"}.' 
vprint(msg_io) msg_threads = \ diff --git a/hysop/core/tests/test_checkpoint.sh b/hysop/core/tests/test_checkpoint.sh index f11560968..d32a15334 100755 --- a/hysop/core/tests/test_checkpoint.sh +++ b/hysop/core/tests/test_checkpoint.sh @@ -17,8 +17,10 @@ set -feu -o pipefail PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE:-python3} MPIRUN_EXECUTABLE=${MPIRUN_EXECUTABLE:-mpirun} -MPIRUN_TASKS_OPTION='-np' -if [ "${MPIRUN_EXECUTABLE}" = "srun" ]; then MPIRUN_TASKS_OPTION='-n'; fi +MPIRUN_ARGS='-np' +MPIRUN_FAIL_EARLY="-mca orte_abort_on_non_zero_status 1" +if [ "${MPIRUN_EXECUTABLE}" = "srun" ]; then MPIRUN_ARGS='-n'; fi +MPIRUN_ARGS="${MPIRUN_FAIL_EARLY} ${MPIRUN_ARGS}" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" EXAMPLE_DIR="$(realpath ${SCRIPT_DIR}/../../../hysop_examples/examples)" @@ -59,8 +61,8 @@ if [[ ! -f "${EXAMPLE_FILE}" ]]; then fi echo ' Running simulations...' -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -S "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run0" --checkpoint-dump-time 0.05 --checkpoint-dump-freq 0 -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -S "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run1" --checkpoint-dump-time 0.05 --checkpoint-dump-freq 0 +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -S "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run0" --checkpoint-dump-time 0.05 --checkpoint-dump-freq 0 +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -S "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run1" --checkpoint-dump-time 0.05 --checkpoint-dump-freq 0 echo ' Comparing solutions...' echo " >debug dumps match" @@ -73,8 +75,8 @@ done echo echo ' Running simulations from checkpoints...' 
-${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -L "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run2" -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -L "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run3" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -L "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run2" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -L "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run3" echo ' Comparing solutions...' compare_files "${TEST_DIR}/run2/dump/run.txt" "${TEST_DIR}/run3/dump/run.txt" @@ -105,8 +107,8 @@ if [[ ! -f "${EXAMPLE_FILE}" ]]; then fi echo ' Running simulations...' -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -S "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run0" --checkpoint-dump-time 0.05 --checkpoint-dump-freq 0 -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -S "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run1" --checkpoint-dump-time 0.05 --checkpoint-dump-freq 0 +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -S "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run0" --checkpoint-dump-time 0.05 --checkpoint-dump-freq 0 +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -S "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run1" --checkpoint-dump-time 0.05 --checkpoint-dump-freq 0 echo ' Comparing solutions...' echo " >debug dumps match" @@ -119,8 +121,8 @@ done echo echo ' Running simulations from checkpoints...' 
-${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -L "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run2" -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -L "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run3" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -L "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run2" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -L "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run3" echo ' Comparing solutions...' compare_files "${TEST_DIR}/run2/dump/run.txt" "${TEST_DIR}/run3/dump/run.txt" @@ -155,9 +157,9 @@ fi # Fortran FFTW does not yield exactly the same results in parallel so we use h5diff with an absolute tolerance of 10^-12 echo ' Running simulations...' -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -S "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run0" --checkpoint-dump-time 0.15 --checkpoint-dump-freq 0 -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 2 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -S "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run1" --checkpoint-dump-time 0.15 --checkpoint-dump-freq 0 -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 3 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -S "${TEST_DIR}/checkpoint2.tar" --dump-dir "${TEST_DIR}/run2" --checkpoint-dump-time 0.15 --checkpoint-dump-freq 0 +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -S "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run0" --checkpoint-dump-time 0.15 --checkpoint-dump-freq 0 +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 2 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} 
-impl fortran -cp fp64 -S "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run1" --checkpoint-dump-time 0.15 --checkpoint-dump-freq 0 +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 3 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -S "${TEST_DIR}/checkpoint2.tar" --dump-dir "${TEST_DIR}/run2" --checkpoint-dump-time 0.15 --checkpoint-dump-freq 0 echo ' Comparing solutions...' for f0 in $(find "${TEST_DIR}/run0" -name '*.h5' | sort -n); do f1=$(echo "${f0}" | sed 's/run0/run1/') @@ -169,9 +171,9 @@ done echo ' Running simulations from checkpoints using different MPI topologies...' COMMON_OPTIONS="-NC -d24 --tend 0.3 --dump-tstart 0.15 --dump-freq 1 --hdf5-disable-slicing --hdf5-disable-compression --checkpoint-relax-constraints" -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 3 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -L "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run3" -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 2 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -L "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run4" -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -L "${TEST_DIR}/checkpoint2.tar" --dump-dir "${TEST_DIR}/run5" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 3 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -L "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run3" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 2 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -L "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run4" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -impl fortran -cp fp64 -L "${TEST_DIR}/checkpoint2.tar" --dump-dir "${TEST_DIR}/run5" echo ' Comparing solutions...' 
for f3 in $(find "${TEST_DIR}/run3" -name '*.h5' | sort -n); do f0=$(echo "${f3}" | sed 's/run3/run0/') @@ -184,8 +186,8 @@ for f3 in $(find "${TEST_DIR}/run3" -name '*.h5' | sort -n); do done echo ' Running simulations from checkpoints using OpenCL and different datatypes...' -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -cp fp64 -impl opencl -L "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run6" -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -cp fp32 -impl opencl -L "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run7" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -cp fp64 -impl opencl -L "${TEST_DIR}/checkpoint0.tar" --dump-dir "${TEST_DIR}/run6" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} 1 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} -cp fp32 -impl opencl -L "${TEST_DIR}/checkpoint1.tar" --dump-dir "${TEST_DIR}/run7" echo ' Comparing solutions...' for f6 in $(find "${TEST_DIR}/run6" -name '*.h5' | sort -n); do f7=$(echo "${f6}" | sed 's/run0/run7/') diff --git a/hysop/core/tests/test_tasks.sh b/hysop/core/tests/test_tasks.sh index fea6da686..8fa61a94a 100755 --- a/hysop/core/tests/test_tasks.sh +++ b/hysop/core/tests/test_tasks.sh @@ -19,6 +19,8 @@ PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE:-python3} MPIRUN_EXECUTABLE=${MPIRUN_EXECUTABLE:-mpirun} MPIRUN_TASKS_OPTION='-np' if [ "${MPIRUN_EXECUTABLE}" = "srun" ]; then MPIRUN_TASKS_OPTION='-n'; fi +MPIRUN_FAIL_EARLY="-mca orte_abort_on_non_zero_status 1" +MPIRUN_ARGS="${MPIRUN_FAIL_EARLY} ${MPIRUN_TASKS_OPTION} 4" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" EXAMPLE_DIR="$(realpath ${SCRIPT_DIR}/../../../hysop_examples/examples)" @@ -39,23 +41,23 @@ ${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FI if [[ $? 
-ne 0 ]] ; then exit 1; fi -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "(111,111,222,222)" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "(111,111,222,222)" if [[ $? -ne 0 ]] ; then exit 1; fi -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "(111,222,222,222)" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "(111,222,222,222)" if [[ $? -ne 0 ]] ; then exit 1; fi -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "((111,),(222,),(222,),(222,))" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "((111,),(222,),(222,),(222,))" if [[ $? -ne 0 ]] ; then exit 1; fi -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "((111,222),(222,),(222,),(222,))" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "((111,222),(222,),(222,),(222,))" if [[ $? -ne 0 ]] ; then exit 1; fi -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "((111,222),(222,),(222,111),(222,))" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "((111,222),(222,),(222,111),(222,))" if [[ $? 
-ne 0 ]] ; then exit 1; fi # Followings should fail (not handled yet) echo ' Running simulations that MUST fail' -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "((111,),(222,),(222,111),(222,))" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "((111,),(222,),(222,111),(222,))" if [[ $? -eq 0 ]] ; then exit 1; fi -${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} 4 ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "((222,),(222,),(222,111),(222,))" +${MPIRUN_EXECUTABLE} ${MPIRUN_ARGS} ${PYTHON_EXECUTABLE} "${EXAMPLE_FILE}" ${COMMON_OPTIONS} --proc-tasks "((222,),(222,),(222,111),(222,))" if [[ $? -eq 0 ]] ; then exit 1; fi echo ' Done' diff --git a/hysop/fields/tests/test_cartesian.py b/hysop/fields/tests/test_cartesian.py index 3ce1f1478..66711d83e 100644 --- a/hysop/fields/tests/test_cartesian.py +++ b/hysop/fields/tests/test_cartesian.py @@ -37,9 +37,9 @@ def __random_init(data, coords, component): shape = data.shape dtype = data.dtype if is_integer(dtype): - data[...] = npw.random.random_integers(low=0, high=255, size=shape) + data[...] = npw.random.random_integers(low=0, high=255, size=shape).astype(dtype) elif is_fp(dtype): - data[...] = npw.random.random(size=shape) + data[...] = npw.random.random(size=shape).astype(dtype) else: msg = f'Unknown dtype {dtype}.' 
raise NotImplementedError(msg) diff --git a/hysop/fields/tests/test_cartesian.sh b/hysop/fields/tests/test_cartesian.sh index ce5cf87c3..0e84cf24e 100755 --- a/hysop/fields/tests/test_cartesian.sh +++ b/hysop/fields/tests/test_cartesian.sh @@ -20,6 +20,8 @@ MPIRUN_EXECUTABLE=${MPIRUN_EXECUTABLE:-mpirun} MPIRUN_TASKS_OPTION='-np' if [ "${MPIRUN_EXECUTABLE}" = "srun" ]; then MPIRUN_TASKS_OPTION='-n'; fi +MPIRUN_FAIL_EARLY="-mca orte_abort_on_non_zero_status 1" + SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" TEST_FILE=${SCRIPT_DIR}/test_cartesian.py @@ -28,5 +30,5 @@ export HYSOP_DEBUG=0 export KERNEL_DEBUG=0 for i in 2; do - ${MPIRUN_EXECUTABLE} ${MPIRUN_TASKS_OPTION} $i ${PYTHON_EXECUTABLE} ${TEST_FILE} + ${MPIRUN_EXECUTABLE} ${MPIRUN_FAIL_EARLY} ${MPIRUN_TASKS_OPTION} $i ${PYTHON_EXECUTABLE} ${TEST_FILE} done diff --git a/hysop/tools/cache.py b/hysop/tools/cache.py index acbb7e44d..5ff3aa6fe 100644 --- a/hysop/tools/cache.py +++ b/hysop/tools/cache.py @@ -19,7 +19,11 @@ try: except: import pickle -import gzip, portalocker, contextlib, os, errno, uuid +import gzip, portalocker, contextlib, os, errno, uuid, warnings + +from hysop import __HYSOP_ENABLE_FILELOCKS__ +from hysop.tools.decorators import static_vars +from hysop.tools.warning import HysopCacheWarning machine_id = None for path in ('/var/lib/dbus/machine-id', '/var/lib/yum/uuid'): @@ -34,22 +38,28 @@ if (machine_id in (None,'')): machine_id = uuid.getnode() @contextlib.contextmanager +@static_vars(ignored_locks=set()) def lock_file(filepath, mode, compressed=True, - timeout=3600, check_interval=1): + timeout=10, check_interval=0.1, + ignore_lock_after_timeout=True): """ Opens a locked file with specified mode, possibly compressed. 
""" _dir = os.path.dirname(filepath) + + if not os.path.isdir(_dir): + try: + os.makedirs(_dir) + except OSError as e: + if (e.errno != errno.EEXIST): + raise + + if not os.path.exists(filepath): + open(filepath, 'a').close() try: - if not os.path.isdir(_dir): - try: - os.makedirs(_dir) - except OSError as e: - if (e.errno != errno.EEXIST): - raise - if not os.path.exists(filepath): - open(filepath, 'a').close() + if not __HYSOP_ENABLE_FILELOCKS__ or (ignore_lock_after_timeout and filepath in lock_file.ignored_locks): + raise portalocker.exceptions.LockException with portalocker.Lock(filename=filepath, timeout=timeout, mode=mode, check_interval=check_interval) as fl: if compressed: @@ -58,25 +68,35 @@ def lock_file(filepath, mode, compressed=True, else: yield fl except portalocker.exceptions.LockException as e: - msg='\nFATAL ERROR: Could not obtain lock for file \'{}\' after waiting for {}s.\n' - msg=msg.format(filepath, timeout) - print(msg) - raise e + # Could not obtain the lock in time, so do it the dirty way. + if ignore_lock_after_timeout: + if __HYSOP_ENABLE_FILELOCKS__: + msg=(f'Could not obtain lock for file \'{filepath}\' after waiting for {timeout}s, ignoring file lock.' 
+ '\nIf this causes a performance issue, consider disabling file locking mechanism by setting ' + 'environment variable HYSOP_ENABLE_FILELOCKS=0 or by passing --disable-file-locks to your script.\n') + warnings.warn(msg, HysopCacheWarning) + lock_file.ignored_locks.add(filepath) + with open(filepath, mode=mode) as fl: + if compressed: + with gzip.GzipFile(fileobj=fl, mode=mode) as f: + yield f + else: + yield fl + else: + msg=f'Could not obtain lock for file \'{filepath}\' after waiting for {timeout}s, ignoring file lock.\n' + print(f'\nFATAL ERROR: {msg}') + raise e @contextlib.contextmanager -def read_only_lock(filepath, compressed=True, - timeout=3600, check_interval=1): +def read_only_lock(filepath, compressed=True, **kwds): """Opens a locked read only file, possibly compressed.""" - with lock_file(filepath=filepath, mode='rb', compressed=compressed, - timeout=timeout, check_interval=check_interval) as f: + with lock_file(filepath=filepath, mode='rb', compressed=compressed, **kwds) as f: yield f @contextlib.contextmanager -def write_only_lock(filepath, compressed=True, - timeout=3600, check_interval=1): +def write_only_lock(filepath, compressed=True, **kwds): """Opens a locked write only file, possibly compressed.""" - with lock_file(filepath=filepath, mode='wb', compressed=compressed, - timeout=timeout, check_interval=check_interval) as f: + with lock_file(filepath=filepath, mode='wb', compressed=compressed, **kwds) as f: yield f diff --git a/hysop_examples/argparser.py b/hysop_examples/argparser.py index 05e320258..d54239885 100644 --- a/hysop_examples/argparser.py +++ b/hysop_examples/argparser.py @@ -1075,6 +1075,11 @@ class HysopArgParser(argparse.ArgumentParser): help=('Disable HDF5 slicing that is obtained with XDMF JOIN. ' 'May reduce performances when HDF5 slicing applies (<= 16 processes slab topologies).' 
'Enabling this option guarantees a single HDF5 file for all processes per dump.')) + + file_io.add_argument('--disable-file-locks', default=False, action='store_true', + dest='disable_file_locks', + help=('Disable file locking mechanism which restricts access to hysop cache files among multiple processes. ' + 'This may be necessary on some platforms or when running in containers like singularity.')) # list of additional named io_params to be generated assert (generate_io_params is not None), generate_io_params @@ -1196,7 +1201,9 @@ class HysopArgParser(argparse.ArgumentParser): self._check_default(args, ('cache_dir', 'postprocess_dump'), str, allow_none=True) self._check_dir(args, 'cache_dir', allow_shared=False, allow_none=True) self._check_default(args, ('no_interactive', 'dump_is_temporary', - 'enable_ram_fs', 'force_ram_fs', 'hdf5_disable_compression', 'hdf5_disable_slicing'), + 'enable_ram_fs', 'force_ram_fs', + 'hdf5_disable_compression', 'hdf5_disable_slicing', + 'disable_file_locks'), bool, allow_none=False) self._check_default(args, 'dump_dir', str, allow_none=False) @@ -1628,6 +1635,7 @@ class HysopArgParser(argparse.ArgumentParser): self.set_env('FFTW_NUM_THREADS', args.fftw_threads, True) self.set_env('FFTW_PLANNER_EFFORT', args.fftw_planner_effort, True) self.set_env('FFTW_PLANNER_TIMELIMIT', args.fftw_planner_timelimit, True) + self.set_env('ENABLE_FILELOCKS', not args.disable_file_locks, True) # those environment variables are not part of HySoP self.set_env('OMP_NUM_THREADS', args.openmp_threads, False) -- GitLab