Fix cloud multi-nodes
* Copy the SSH key to allow connections from the master to the workers
* Use the manager's local IP so that workers can find it and connect to it (see the sketch below)
* Fix an incompatibility between pandas and numpy 2.0.0
* Fix the diffusion benchmarks' file permissions
satyaog committed Aug 22, 2024
1 parent 0f34dd2 commit 9e394be
Showing 14 changed files with 186 additions and 55 deletions.
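A minimal sketch of the local-IP selection heuristic this commit adds in milabench/commands/__init__.py, so workers reach the manager on a routable address rather than the 1.1.1.1 placeholder or the loopback. The helper name and the sample addresses are illustrative, not part of the commit:

def pick_reachable_ip(node):
    # Prefer the first non-loopback IPv4 address among the node's known addresses;
    # fall back to the configured ip if none qualifies.
    for addr in node.get("ipaddrlist", []):
        if ":" in addr or addr == "127.0.0.1":
            continue  # skip IPv6 and loopback
        if all(part.isnumeric() for part in addr.split(".")):
            return addr  # first plain dotted-quad wins
    return node["ip"]

print(pick_reachable_ip({"ip": "1.1.1.1", "ipaddrlist": ["::1", "127.0.0.1", "10.0.0.4"]}))  # -> 10.0.0.4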
58 changes: 47 additions & 11 deletions .github/workflows/cloud-ci.yml
@@ -14,11 +14,12 @@ jobs:
cloud-tests:
strategy:
fail-fast: true
max-parallel: 1
matrix:
system: [1x_gpu, 2x_gpu, 2x_node]
include:
- arch: cuda
exclude: "no-cuda"
run_on: azure__a100
# - arch: rocm
# exclude : "no-rocm"

@@ -27,7 +28,7 @@ jobs:

# Cancel previous jobs if a new version was pushed
concurrency:
group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.run_on }}"
group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}"
cancel-in-progress: true

defaults:
@@ -36,13 +37,15 @@

env:
MILABENCH_CONFIG: "config/standard.yaml"
MILABENCH_SYSTEM: "config/cloud-system.yaml"
MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml"
MILABENCH_BASE: "output"
MILABENCH_ARGS: ""
MILABENCH_DASH: "no"
ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
AZURE_CORE_OUTPUT: none
_MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus,llm-full-mp-gpus"
_MULTI_NODES: "diffusion-nodes,dinov2-giant-nodes,llm-lora-ddp-nodes,llm-full-mp-nodes"

steps:
- uses: actions/checkout@v3
@@ -90,23 +93,51 @@ jobs:
- name: setup cloud
run: |
case "${{ matrix.system }}" in
"1x_gpu")
export MILABENCH_SYSTEM="config/cloud-system.yaml"
export RUN_ON="azure__a100"
export SELECT=
export EXCLUDES="--exclude $_MULTI_GPUS,$_MULTI_NODES,llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
;;
"2x_gpu")
export MILABENCH_SYSTEM="config/cloud-system.yaml"
export RUN_ON="azure__a100_x2"
export SELECT="--select $_MULTI_GPUS"
export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
;;
"2x_node")
export MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml"
export RUN_ON="azure__a100"
export SELECT="--select $_MULTI_NODES"
export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
;;
*)
exit 1
;;
esac
poetry run milabench cloud \
--setup \
--run-on ${{ matrix.run_on }} \
--system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.${{ matrix.run_on }}
echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.${{ matrix.run_on }}" >>$GITHUB_ENV
--run-on $RUN_ON \
--system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.$RUN_ON
echo "RUN_ON=$RUN_ON" >>$GITHUB_ENV
echo "SELECT=$SELECT" >>$GITHUB_ENV
echo "EXCLUDES=$EXCLUDES" >>$GITHUB_ENV
echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON" >>$GITHUB_ENV
- name: install benchmarks
run: |
poetry run milabench install --variant ${{ matrix.arch }}
poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDES
- name: prepare benchmarks
run: |
poetry run milabench prepare
poetry run milabench prepare $SELECT $EXCLUDES
- name: run benchmarks
run: |
poetry run milabench run
poetry run milabench run $SELECT $EXCLUDES
- name: Summary
run: |
@@ -118,6 +149,11 @@ jobs:
env:
GITHUB_TOKEN: ${{ github.token }}

- name: DEBUG state file
if: always()
run: |
cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate
- name: teardown cloud
if: always()
run: |
@@ -127,10 +163,10 @@ jobs:
fi
poetry run milabench cloud \
--teardown \
--run-on ${{ matrix.run_on }} \
--run-on $RUN_ON \
--all
- name: debug logs
- name: DEBUG logs
if: always()
run: |
cat ~/.cache/covalent/covalent_ui.log
2 changes: 2 additions & 0 deletions benchmarks/diffusion/main.py
File mode changed: 100644 → 100755
@@ -1,3 +1,5 @@
#!/usr/bin/env python

from dataclasses import dataclass

from accelerate import Accelerator
4 changes: 4 additions & 0 deletions config/cloud-multinodes-system.yaml
@@ -5,6 +5,7 @@ system:
- name: manager
# Use 1.1.1.1 as an ip placeholder
ip: 1.1.1.1
port: 5000
# Use this node as the master node or not
main: true
# User to use in remote milabench operations
@@ -21,11 +22,14 @@ system:
username: ubuntu
size: Standard_NC24ads_A100_v4
location: eastus2
disk_size: 512
azure__a100_x2:
username: ubuntu
size: Standard_NC48ads_A100_v4
location: eastus2
disk_size: 512
azure__a10_x2:
username: ubuntu
size: Standard_NV72ads_A10_v5
location: eastus2
disk_size: 512
9 changes: 9 additions & 0 deletions config/cloud-system.yaml
@@ -5,6 +5,7 @@ system:
- name: manager
# Use 1.1.1.1 as an ip placeholder
ip: 1.1.1.1
port: 5000
# Use this node as the master node or not
main: true
# User to use in remote milabench operations
@@ -16,11 +17,19 @@ system:
username: ubuntu
size: Standard_NC24ads_A100_v4
location: eastus2
disk_size: 512
azure__a100_x2:
username: ubuntu
size: Standard_NC48ads_A100_v4
location: eastus2
disk_size: 512
azure__a10:
username: ubuntu
size: Standard_NV36ads_A10_v5
location: eastus2
disk_size: 512
azure__a10_x2:
username: ubuntu
size: Standard_NV72ads_A10_v5
location: eastus2
disk_size: 512
12 changes: 6 additions & 6 deletions config/examples/test.yaml
@@ -7,18 +7,18 @@ _defaults:

test:
inherits: _defaults
group: test_remote
install_group: test_remote
definition: ../../benchmarks/_template
group: simple
install_group: test
definition: ../../benchmarks/_templates/simple
plan:
method: njobs
n: 1

testing:
inherits: _defaults
definition: ../../benchmarks/_template
group: test_remote_2
install_group: test_remote_2
definition: ../../benchmarks/_templates/stdout
group: stdout
install_group: test
plan:
method: njobs
n: 1
20 changes: 16 additions & 4 deletions milabench/cli/run.py
@@ -3,6 +3,7 @@

from coleo import Option, tooled

from milabench.remote import is_remote
from milabench.utils import validation_layers

from ..common import (
@@ -63,32 +64,43 @@ def arguments():
return Arguments(run_name, repeat, fulltrace, report, dash, noterm, validations)



def _fetch_arch(mp):
try:
arch = next(iter(mp.packs.values())).config["system"]["arch"]
except StopIteration:
print("no selected bench")
return None



def _fetch_first_pack(mp):
try:
return next(iter(mp.packs.values()))
except StopIteration:
print("no selected bench")
return None


@tooled
def cli_run(args=None):
"""Run the benchmarks."""
if args is None:
args = arguments()

layers = validation_names(args.validations)

dash_class = {
"short": ShortDashFormatter,
"long": LongDashFormatter,
"no": None,
}.get(args.dash, None)

mp = get_multipack(run_name=args.run_name)
first_pack = _fetch_first_pack(mp)
arch = _fetch_arch(mp)

layers = validation_names(args.validations)
if is_remote(first_pack):
# Remote execution will never send back rates
layers.remove("ensure_rate")

# Initialize the backend here so we can retrieve GPU stats
init_arch(arch)

25 changes: 22 additions & 3 deletions milabench/commands/__init__.py
@@ -674,7 +674,16 @@ def __init__(self, executor: Command, **kwargs) -> None:
main = self.nodes[0]

# node[port] is for SSH
main_host = main["ip"]
# Find local ip such that workers can connect to the port
for main_host in main["ipaddrlist"]:
if ":" in main_host or main_host == "127.0.0.1":
continue
if all(str.isnumeric(n) for n in main_host.split(".")):
break
else:
main_host = main["ip"]
if len(self.nodes) == 1:
main_host = "localhost"
# add them as option so we could tweak them if necessary
main_port = option("torchrun.port", int, default=29400)
backend = option("torchrun.backend", str, default="c10d")
@@ -886,9 +895,10 @@ def make_new_node_executor(self, rank, node, base):
config = base.pack.config

pack = self.make_new_node_pack(rank, node, base)
executor = self.executor.copy(pack)

return DockerRunCommand(
AccelerateLaunchCommand(pack, rank=rank),
AccelerateLaunchCommand(executor, rank=rank),
config["system"].get("docker_image"),
)

@@ -939,6 +949,15 @@ def _get_main_and_workers(self):
def _argv(self, **_) -> List:
manager, nodes = self._get_main_and_workers()

# Find local ip such that workers can connect to the port
for manager_ip in manager["ipaddrlist"]:
if ":" in manager_ip or manager_ip == "127.0.0.1":
continue
if all(str.isnumeric(n) for n in manager_ip.split(".")):
break
else:
manager_ip = manager['ip']

num_machines = max(1, len(nodes) + 1)

# Cant do that maybe this run is constrained
@@ -978,7 +997,7 @@ def _argv(self, **_) -> List:
*deepspeed_argv,
f"--gradient_accumulation_steps={self.pack.config.get('gradient_accumulation_steps', 1)}",
f"--num_cpu_threads_per_process={cpu_per_process}",
f"--main_process_ip={manager['ip']}",
f"--main_process_ip={manager_ip}",
f"--main_process_port={manager['port']}",
f"--num_processes={nproc}",
*self.accelerate_argv,
16 changes: 8 additions & 8 deletions milabench/common.py
@@ -314,18 +314,18 @@ def _read_reports(*runs):
return all_data


def _filter_reports(*reports):
all_reports = []
def _filter_reports(**reports):
_reports = {}

for report in reports:
for k, report in reports.items():
config = next(iter(e for e in report if e["event"] == "config"), None)
if config is None:
continue

if config["data"]["name"] != "remote":
all_reports.append(report)
_reports[k] = report

return all_reports
return _reports


def _push_reports(reports_repo, runs):
@@ -356,8 +356,8 @@ def _push_reports(reports_repo, runs):

device_reports = {}
for run in runs:
reports = list(_read_reports(run).values())
reports = _filter_reports(*reports)
reports = _read_reports(run)
reports = list(_filter_reports(**reports).values())

if not reports:
continue
@@ -392,7 +392,7 @@ def _push_reports(reports_repo, runs):
for (device, build), reports in device_reports.items():
reports_dir = XPath(reports_repo.working_tree_dir) / build
reports = _read_reports(*reports)
reports = _filter_reports(*reports.values())
reports = _filter_reports(**reports)
summary = make_summary(reports)

successes = [s["successes"] for s in summary.values()]
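For context, a self-contained sketch of what the reworked _filter_reports keeps now that reports are passed as a keyword mapping so their keys survive filtering. The standalone function name and the simplified event shapes are assumptions for illustration only:

def filter_reports(**reports):
    # Keep only entries that have a "config" event and are not the synthetic "remote" bench,
    # preserving the keys so callers can still look entries up afterwards.
    kept = {}
    for key, report in reports.items():
        config = next((e for e in report if e["event"] == "config"), None)
        if config is None:
            continue
        if config["data"]["name"] != "remote":
            kept[key] = report
    return kept

example = {
    "bench_a": [{"event": "config", "data": {"name": "bench_a"}}],
    "remote": [{"event": "config", "data": {"name": "remote"}}],
}
print(list(filter_reports(**example)))  # -> ['bench_a']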
1 change: 1 addition & 0 deletions milabench/config.py
@@ -150,6 +150,7 @@ def build_config(*config_files):
for layer in _config_layers(config_files):
all_configs = merge(all_configs, layer)

all_configs.setdefault("*", {})
all_configs["*"]["hash"] = compute_config_hash(all_configs)

all_configs = build_matrix_bench(all_configs)
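A small sketch of why the added setdefault matters, with illustrative values: if no merged layer defines a "*" wildcard section, writing the hash would raise a KeyError.

all_configs = {"some-bench": {"definition": "..."}}   # merged layers, no "*" section
all_configs.setdefault("*", {})                       # guard added here
all_configs["*"]["hash"] = "abc123"                   # would raise KeyError without the guard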
7 changes: 1 addition & 6 deletions milabench/remote.py
@@ -2,16 +2,11 @@
import os
import sys

import yaml

from milabench.fs import XPath

from . import ROOT_FOLDER
from .commands import (
CmdCommand,
Command,
ListCommand,
SCPCommand,
SequenceCommand,
SSHCommand,
VoidCommand,
@@ -291,6 +286,6 @@ def milabench_remote_run(pack) -> Command:

argv = sys.argv[2:]
return SequenceCommand(
milabench_remote_command(pack, "run", *argv, run_for="main"),
milabench_remote_command(pack, "run", *argv, "--run-name", pack.config["run_name"], run_for="main"),
milabench_remote_fetch_reports_plan(pack, run_for="main"),
)