Skip to content

Distributed Models

Introduction

Developing distributed models using BMF is straightforward given knowledge of how to use the Horovod python library. All other infrastructure (e.g., MPI, data loading) is handled by the BOSS platform.

PyTorch

The current version of BMF's Horovod model training has only been tested with PyTorch models. Additionally, the BMF advanced model approach must be used for Horovod modeling. Other frameworks such as TensorFlow might work, but they are not officially supported at this time. Examples of how to use Horovod in PyTorch can be found at https://github.com/horovod/horovod/tree/master/examples/pytorch. A specific example of a model used for testing with BMF can be found in the BOSS Model Shop.

For convenience, an example is included below.

import horovod.torch as hvd
import torch
import torch.nn as nn
import torch.nn.functional as F
from boss.core.internal import train
from boss.core.lib import constants
from boss.core.lib.training_resources import get_features_and_labels
from torch.utils.data import DataLoader

# Fix the RNG seed so weight initialization is reproducible (and identical
# across Horovod workers before the rank-0 broadcast).
torch.manual_seed(1234)


class Net(nn.Module):
    """Small 3-class classifier over 4 input features.

    Architecture: Linear(4->10) -> ReLU -> Linear(10->3) -> Softmax.
    Outputs per-row probability distributions over the 3 classes.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Layer sizes: 4 input features -> 10 hidden units -> 3 classes.
        self.fc1 = nn.Linear(4, 10)
        self.fc2 = nn.Linear(10, 3)
        self.out = nn.Softmax(dim=1)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return self.out(logits)


def label_mapping():
    """Return the int-label -> species-name mapping for the Iris classifier."""
    indices = (2, 1, 0)
    species = ('I. setosa', 'I. virginica', 'I. versicolor')
    return dict(zip(indices, species))


def main(args):
    """Entry point for distributed (Horovod) training on the BOSS platform.

    Args:
        args: dict with at least 'train_id' and 'parameters'
            ({'lr': learning rate, 'steps': number of epochs}); also consumed
            by get_features_and_labels(args) to produce the data splits.

    Side effects:
        Trains a Net on the training split, prints a per-epoch average loss,
        and reports completion status to BOSS via train.status().
    """
    hvd.init()

    tid = args['train_id']

    learning_rate = args['parameters']['lr']
    training_steps = args['parameters']['steps']

    # Only the training split is used below; the eval/test splits are
    # unpacked to match the helper's return signature.
    train_df, train_lb, train_ids, eval_df, eval_lb, eval_ids, test_df, test_lb, test_ids = get_features_and_labels(args)
    # NOTE(review): every Horovod worker iterates the full dataset here — no
    # DistributedSampler partitions the data; confirm the BOSS platform
    # intends full-replica iteration per worker.
    trainloader = DataLoader(train_df, batch_size=10)

    net = Net()
    # Net ends in Softmax, so torch.log() is applied below to produce the
    # log-probabilities NLLLoss expects.
    criterion = nn.NLLLoss()

    optimizer = hvd.DistributedOptimizer(torch.optim.SGD(net.parameters(), lr=learning_rate),
                                         named_parameters=net.named_parameters())
    # Synchronize initial model and optimizer state from rank 0 to all workers.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # PyTorch training loop
    for epoch in range(training_steps):
        epoch_loss_sum = 0.0
        num_batches = 0
        for data in trainloader:
            net.train()
            optimizer.zero_grad()
            # + 1e-20 guards log(0) when the Softmax output saturates to 0.
            out = torch.log(net(data[constants.PYTORCH_FEATURES]) + 1e-20)
            loss = criterion(out, data[constants.PYTORCH_LABEL])
            loss.backward()
            optimizer.step()
            epoch_loss_sum += loss.item()
            num_batches += 1

        # Bug fix: loss.item() is already the per-batch *mean* loss, so the
        # epoch average is the sum divided by the batch count — the previous
        # code divided by the total item count, mis-scaling the metric.
        epoch_avg_loss = epoch_loss_sum / num_batches if num_batches else float('nan')
        print(f"epoch, avg loss: ({epoch}, {epoch_avg_loss})")

    hvd.join()
    # presumably status code 3 means "training complete" — confirm against
    # the BOSS train.status() API.
    train.status(tid, 3)