Run Singularity images

The script below shows how to run Singularity planner images using Downward Lab.
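
The script reads the image and benchmark locations from the
``SINGULARITY_IMAGES`` and ``DOWNWARD_BENCHMARKS`` environment variables and,
like any Lab experiment, is driven from the command line via ``exp.run_steps()``.
A typical invocation might look like this (the paths are placeholders)::

    export DOWNWARD_BENCHMARKS=/path/to/downward-benchmarks
    export SINGULARITY_IMAGES=/path/to/images  # must contain fd1906-lama-first.img
    ./singularity-exp.py --all                 # or select individual steps by name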

../examples/singularity/singularity-exp.py
#! /usr/bin/env python

"""
Example experiment for running Singularity/Apptainer planner images.

The time and memory limits set with Lab can be circumvented by solvers that fork
child processes, because the resource usage of those children is not checked. If
you're running solvers that don't check their own resource usage the way Fast
Downward does, we recommend using cgroups or the ``runsolver`` tool to enforce
resource limits. Since setting time limits for solvers with cgroups is
difficult, the experiment below uses ``runsolver``, which has been used in
multiple SAT competitions to enforce resource limits. For the experiment to run,
the ``runsolver`` binary needs to be on the PATH. You can obtain a copy from
https://github.com/jendrikseipp/runsolver.

Since Singularity (and Apptainer) reserve 1-2 GiB of *virtual* memory when
starting the container, we recommend either enforcing a higher virtual memory
limit or limiting RSS memory with ``runsolver`` (as done below). For limiting
RSS memory, you can also use `runlim
<https://github.com/arminbiere/runlim>`_, which is more actively maintained
than runsolver.

A note on running Singularity on clusters: reading large image files over the
network filesystem is slow, so we recommend copying the images to a local
filesystem (e.g., /tmp/) before running experiments.
"""

import os
from pathlib import Path
import platform
import sys

from singularity_parser import get_parser

from downward import suites
from downward.reports.absolute import AbsoluteReport
from lab.environments import BaselSlurmEnvironment, LocalEnvironment
from lab.experiment import Experiment


# Create custom report class with suitable info and error attributes.
class BaseReport(AbsoluteReport):
    INFO_ATTRIBUTES = []
    ERROR_ATTRIBUTES = [
        "domain",
        "problem",
        "algorithm",
        "unexplained_errors",
        "error",
        "node",
    ]


NODE = platform.node()
RUNNING_ON_CLUSTER = NODE.endswith((".scicore.unibas.ch", ".cluster.bc2.ch"))
DIR = Path(__file__).resolve().parent
REPO = DIR.parent
IMAGES_DIR = Path(os.environ["SINGULARITY_IMAGES"])
assert IMAGES_DIR.is_dir(), IMAGES_DIR
BENCHMARKS_DIR = os.environ["DOWNWARD_BENCHMARKS"]
MEMORY_LIMIT = 3584  # MiB
if RUNNING_ON_CLUSTER:
    SUITE = ["depot", "freecell", "gripper", "zenotravel"]
    ENVIRONMENT = BaselSlurmEnvironment(
        partition="infai_1",
        email="my.name@unibas.ch",
        memory_per_cpu="3872M",
        export=["PATH"],
        setup=BaselSlurmEnvironment.DEFAULT_SETUP,
        # Until recently, we had to load the Singularity module here
        # by adding "module load Singularity/2.6.1 2> /dev/null".
    )
    TIME_LIMIT = 1800
else:
    SUITE = ["depot:p01.pddl", "gripper:prob01.pddl", "mystery:prob07.pddl"]
    ENVIRONMENT = LocalEnvironment(processes=2)
    TIME_LIMIT = 5

ATTRIBUTES = [
    "cost",
    "coverage",
    "error",
    "g_values_over_time",
    "run_dir",
    "raw_memory",
    "runtime",
    "virtual_memory",
]

exp = Experiment(environment=ENVIRONMENT)
exp.add_step("build", exp.build)
exp.add_step("start", exp.start_runs)
exp.add_step("parse", exp.parse)
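# The fetch step collects the parsed results into the evaluation directory.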
exp.add_fetcher(name="fetch")
exp.add_parser(get_parser())


def get_image(name):
    planner = name.replace("-", "_")
    image = IMAGES_DIR / (name + ".img")
    assert image.is_file(), image
    return planner, image


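# (planner, image) pairs. Add more images here to compare several planners.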
IMAGES = [get_image("fd1906-lama-first")]

for planner, image in IMAGES:
    exp.add_resource(planner, image, symlink=True)

exp.add_resource("run_singularity", DIR / "run-singularity.sh")
exp.add_resource("filter_stderr", DIR / "filter-stderr.py")

for planner, _ in IMAGES:
    for task in suites.build_suite(BENCHMARKS_DIR, SUITE):
        run = exp.add_run()
        run.add_resource("domain", task.domain_file, "domain.pddl")
        run.add_resource("problem", task.problem_file, "problem.pddl")
        # Use runsolver to limit time and memory. It must be on the system
        # PATH. Important: we cannot use time_limit and memory_limit of
        # Lab's add_command() because setting the same memory limit with
        # runsolver again using setrlimit fails.
        run.add_command(
            "run-planner",
            [
                "runsolver",
                "--cpu-limit",
                TIME_LIMIT,
                "--rss-swap-limit",
                MEMORY_LIMIT,
                "--watcher-data",
                "watch.log",
                "--var",
                "values.log",
                "{run_singularity}",
                f"{{{planner}}}",
                "{domain}",
                "{problem}",
                "sas_plan",
            ],
        )
        # Remove temporary files from old Fast Downward versions.
        run.add_command("rm-tmp-files", ["rm", "-f", "output.sas", "output"])
        run.add_command("filter-stderr", [sys.executable, "{filter_stderr}"])

        run.set_property("domain", task.domain)
        run.set_property("problem", task.problem)
        run.set_property("algorithm", planner)
        run.set_property("id", [planner, task.domain, task.problem])

report = Path(exp.eval_dir) / f"{exp.name}.html"
exp.add_report(BaseReport(attributes=ATTRIBUTES), outfile=report)

exp.run_steps()
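
As the script's docstring notes, large images should not be read over the
network on clusters. A minimal sketch of the recommended preparation, assuming
``/tmp`` is node-local storage (the target path is just an example)::

    # Copy the images to a local filesystem and point SINGULARITY_IMAGES there.
    mkdir -p "/tmp/$USER/images"
    cp "$SINGULARITY_IMAGES"/*.img "/tmp/$USER/images/"
    export SINGULARITY_IMAGES="/tmp/$USER/images"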

The experiment script needs a parser and a helper script:

../examples/singularity/singularity_parser.py
import re
import sys

from lab.parser import Parser


def coverage(content, props):
    props["coverage"] = int("cost" in props)


def unsolvable(content, props):
    # Note that this naive test may easily generate false positives.
    props["unsolvable"] = int(
        not props["coverage"]
        and "Completely explored state space -- no solution!" in content
    )


def parse_g_value_over_time(content, props):
    """Example line: "[g=6, 16 evaluated, 15 expanded, t=0.00328561s, 22300 KB]" """
    matches = re.findall(
        r"\[g=(\d+), \d+ evaluated, \d+ expanded, t=(.+)s, \d+ KB\]\n", content
    )
    props["g_values_over_time"] = [(float(t), int(g)) for g, t in matches]


def set_outcome(content, props):
    lines = content.splitlines()
    solved = props["coverage"]
    unsolvable = props["unsolvable"]
    out_of_time = int("TIMEOUT=true" in lines)
    out_of_memory = int("MEMOUT=true" in lines)
    # runsolver decides "out of time" based on CPU rather than (cumulated)
    # WCTIME.
    if (
        not solved
        and not unsolvable
        and not out_of_time
        and not out_of_memory
        and props["runtime"] > props["time_limit"]
    ):
        out_of_time = 1
    # In cases where CPU time is very slightly above the threshold so that
    # runsolver didn't kill the planner yet and the planner solved a task
    # just within the limit, runsolver will still record an "out of time".
    # We remove this record. This case also applies to iterative planners.
    # If such planners solve the task, we don't treat them as running out
    # of time.
    if (solved or unsolvable) and (out_of_time or out_of_memory):
        print("Task solved, but runsolver recorded an out_of_* outcome.")
        print(props)
        out_of_time = 0
        out_of_memory = 0

    if not solved and not unsolvable:
        props["runtime"] = None

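    # Exactly one outcome flag should be set; anything else is unexpected.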
    if solved ^ unsolvable ^ out_of_time ^ out_of_memory:
        if solved:
            props["error"] = "solved"
        elif unsolvable:
            props["error"] = "unsolvable"
        elif out_of_time:
            props["error"] = "out_of_time"
        elif out_of_memory:
            props["error"] = "out_of_memory"
    else:
        print(f"unexpected error: {props}", file=sys.stderr)
        props["error"] = "unexpected-error"


def get_parser():
    parser = Parser()
    parser.add_pattern(
        "planner_exit_code",
        r"run-planner exit code: (.+)\n",
        type=int,
        file="driver.log",
        required=True,
    )
    parser.add_pattern(
        "node", r"node: (.+)\n", type=str, file="driver.log", required=True
    )
    parser.add_pattern(
        "planner_wall_clock_time",
        r"run-planner wall-clock time: (.+)s",
        type=float,
        file="driver.log",
        required=True,
    )
    parser.add_pattern("runtime", r"Singularity runtime: (.+?)s", type=float)
    parser.add_pattern(
        "time_limit",
        r"Enforcing CPUTime limit \(soft limit, will send "
        r"SIGTERM then SIGKILL\): (\d+) seconds",
        type=int,
        file="watch.log",
        required=True,
    )
    # Cumulative runtime and virtual memory of the solver and all child processes.
    parser.add_pattern(
        "runtime", r"WCTIME=(.+)", type=float, file="values.log", required=True
    )
    parser.add_pattern(
        "virtual_memory", r"MAXVM=(\d+)", type=int, file="values.log", required=True
    )
    parser.add_pattern("raw_memory", r"Peak memory: (\d+) KB", type=int)
    parser.add_pattern("cost", r"\nFinal value: (.+)\n", type=int)
    parser.add_function(coverage)
    parser.add_function(unsolvable)
    parser.add_function(parse_g_value_over_time)
    parser.add_function(set_outcome, file="values.log")
    return parser
../examples/singularity/run-singularity.sh
#!/bin/bash

set -euo pipefail

if [[ $# != 4 ]]; then
    echo "usage: $(basename "$0") image domain_file problem_file plan_file" 1>&2
    exit 2
fi

if [ -f "$PWD/$4" ]; then
    echo "Error: remove $PWD/$4" 1>&2
    exit 2
fi

# Ensure that strings like "CPU time limit exceeded" and "Killed" are in English.
export LANG=C

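# Don't abort on a non-zero planner exit code: we still want to run VAL below.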
set +e
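# -C (--containall) isolates the container; -H uses $PWD as its home directory.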
singularity run -C -H "$PWD" "$1" "$PWD/$2" "$PWD/$3" "$4"
set -e

printf "\nRun VAL\n\n"

if [ -f "$PWD/$4" ]; then
    echo "Found plan file."
    validate -v "$PWD/$2" "$PWD/$3" "$PWD/$4"
    exit 0
else
    echo "No plan file."
    validate -v "$PWD/$2" "$PWD/$3"
    exit 99
fi
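
Besides ``runsolver``, the helper script above assumes that ``singularity`` and
VAL's ``validate`` binary are on the PATH. A quick sanity check before starting
an experiment could look like this::

    for tool in singularity runsolver validate; do
        command -v "$tool" >/dev/null || echo "missing on PATH: $tool"
    done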