    1. In [1]:
    1. import d2lzh as d2l
    2. import mxnet as mx
    3. from mxnet import autograd, gluon, init, nd
    4. from mxnet.gluon import loss as gloss, nn, utils as gutils
    5. import time

    8.5.1. 多GPU上初始化模型参数


    1. In [2]:
    def resnet18(num_classes):  # This function is saved in the d2lzh package for later use
        """Assemble the ResNet-18 style network used in this section.

        The network is a stem (3x3 convolution, batch norm, ReLU), four
        stages of two residual units each with 64, 128, 256 and 512
        channels, global average pooling, and a dense output layer with
        ``num_classes`` units.
        """

        def resnet_block(num_channels, num_residuals, first_block=False):
            """Return a Sequential stage of ``num_residuals`` residual units.

            Unless ``first_block`` is true, the first unit of the stage uses
            a 1x1 convolution with stride 2; every other unit is a plain
            ``d2l.Residual(num_channels)``.
            """
            stage = nn.Sequential()
            for idx in range(num_residuals):
                downsample = idx == 0 and not first_block
                if downsample:
                    unit = d2l.Residual(
                        num_channels, use_1x1conv=True, strides=2)
                else:
                    unit = d2l.Residual(num_channels)
                stage.add(unit)
            return stage

        model = nn.Sequential()
        # A smaller convolution kernel, stride and padding are used here, and
        # the max-pooling layer is removed.
        model.add(nn.Conv2D(64, kernel_size=3, strides=1, padding=1),
                  nn.BatchNorm(), nn.Activation('relu'))
        # The first stage keeps the stem's 64 channels; the later stages are
        # created in order 128, 256, 512, exactly as before.
        model.add(resnet_block(64, 2, first_block=True),
                  *(resnet_block(width, 2) for width in (128, 256, 512)))
        model.add(nn.GlobalAvgPool2D(), nn.Dense(num_classes))
        return model
    # Build the network with 10 output classes; later cells use this module-level `net`.
    23. net = resnet18(10)


    1. In [3]:
    # Devices to use: GPU 0 and GPU 1.
    1. ctx = [mx.gpu(0), mx.gpu(1)]
    # Passing the device list as ctx requests the parameters on every listed device,
    # drawn from Normal(sigma=0.01).
    2. net.initialize(init=init.Normal(sigma=0.01), ctx=ctx)


    1. In [4]:
    # A random mini-batch of shape (4, 1, 28, 28).
    1. x = nd.random.uniform(shape=(4, 1, 28, 28))
    # Split the batch across the devices in ctx; the recorded output shows 2 samples on each GPU.
    2. gpu_x = gutils.split_and_load(x, ctx)
    # Run the same network on each device's share of the batch.
    3. net(gpu_x[0]), net(gpu_x[1])
    1. Out[4]:
    1. (
    2. [[ 5.48149410e-06 -8.33710715e-07 -1.63167692e-06 -6.36740651e-07
    3. -3.82161625e-06 -2.35140487e-06 -2.54695942e-06 -9.47847525e-08
    4. -6.90336265e-07 2.57562351e-06]
    5. [ 5.47108630e-06 -9.42464624e-07 -1.04940636e-06 9.80811592e-08
    6. -3.32518175e-06 -2.48629181e-06 -3.36428002e-06 1.04558694e-07
    7. -6.10013558e-07 2.03278455e-06]]
    8. <NDArray 2x10 @gpu(0)>,
    9. [[ 5.61763409e-06 -1.28375871e-06 -1.46055413e-06 1.83029556e-07
    10. -3.55116504e-06 -2.43710201e-06 -3.57318004e-06 -3.09748373e-07
    11. -1.10165661e-06 1.89098932e-06]
    12. [ 5.14186922e-06 -1.37299264e-06 -1.15200896e-06 1.15074045e-07
    13. -3.73728130e-06 -2.82897167e-06 -3.64771950e-06 1.57815748e-07
    14. -6.07329866e-07 1.97120107e-06]]
    15. <NDArray 2x10 @gpu(1)>)


    1. In [5]:
    # The parameter named 'weight' of the network's first layer.
    1. weight = net[0].params.get('weight')
    # data() is called without a context here. The recorded output shows the except
    # branch runs, i.e. RuntimeError is raised -- presumably because the parameter
    # was only initialized on the GPUs in ctx.
    3. try:
    4. weight.data()
    5. except RuntimeError:
    6. print('not initialized on', mx.cpu())
    # First slice of the weight on each GPU; the recorded output shows identical values.
    7. weight.data(ctx[0])[0], weight.data(ctx[1])[0]
    1. not initialized on cpu(0)
    1. Out[5]:
    1. (
    2. [[[-0.01473444 -0.01073093 -0.01042483]
    3. [-0.01327885 -0.01474966 -0.00524142]
    4. [ 0.01266256 0.00895064 -0.00601594]]]
    5. <NDArray 1x3x3 @gpu(0)>,
    6. [[[-0.01473444 -0.01073093 -0.01042483]
    7. [-0.01327885 -0.01474966 -0.00524142]
    8. [ 0.01266256 0.00895064 -0.00601594]]]
    9. <NDArray 1x3x3 @gpu(1)>)

    8.5.2. 多GPU训练模型


    1. In [6]:
    def train(num_gpus: int, batch_size: int, lr: float, num_epochs: int = 4) -> None:
        """Train the module-level ``net`` on Fashion-MNIST with ``num_gpus`` GPUs.

        The parameters of ``net`` are re-initialized on every call
        (``force_reinit=True``), so successive calls do not build on each
        other. After each epoch the wall-clock training time and the test
        accuracy, evaluated on the first device, are printed.

        Args:
            num_gpus: Number of GPUs to use; devices ``gpu(0)`` to
                ``gpu(num_gpus - 1)`` are selected.
            batch_size: Size of each mini-batch before it is split across
                the devices.
            lr: Learning rate for SGD.
            num_epochs: Number of passes over the training data. Defaults to
                4, the value that used to be hard-coded.

        Raises:
            ValueError: If ``num_gpus`` is smaller than 1. Previously this
                failed later, after the dataset had been loaded, because the
                device list was empty.
        """
        if num_gpus < 1:
            raise ValueError('num_gpus must be at least 1, got %r' % (num_gpus,))
        train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
        ctx = [mx.gpu(i) for i in range(num_gpus)]
        print('running on:', ctx)
        net.initialize(init=init.Normal(sigma=0.01), ctx=ctx, force_reinit=True)
        trainer = gluon.Trainer(
            net.collect_params(), 'sgd', {'learning_rate': lr})
        loss = gloss.SoftmaxCrossEntropyLoss()
        for epoch in range(num_epochs):
            start = time.time()
            for X, y in train_iter:
                # Give every device its own slice of the features and labels.
                gpu_Xs = gutils.split_and_load(X, ctx)
                gpu_ys = gutils.split_and_load(y, ctx)
                with autograd.record():
                    device_losses = [loss(net(gpu_X), gpu_y)
                                     for gpu_X, gpu_y in zip(gpu_Xs, gpu_ys)]
                for device_loss in device_losses:
                    device_loss.backward()
                # Trainer.step normalizes the gradient by 1/batch_size, so the
                # full mini-batch size is passed, not the per-device size.
                trainer.step(batch_size)
            # Wait for all pending operations so the measured time covers the
            # whole epoch's computation.
            nd.waitall()
            train_time = time.time() - start
            test_acc = d2l.evaluate_accuracy(test_iter, net, ctx[0])
            print('epoch %d, time %.1f sec, test acc %.2f' % (
                epoch + 1, train_time, test_acc))


    1. In [7]:
    # Single-GPU run: batch size 256, learning rate 0.1.
    1. train(num_gpus=1, batch_size=256, lr=0.1)
    1. running on: [gpu(0)]
    2. epoch 1, time 14.6 sec, test acc 0.87
    3. epoch 2, time 13.3 sec, test acc 0.90
    4. epoch 3, time 13.3 sec, test acc 0.92
    5. epoch 4, time 13.3 sec, test acc 0.93


    1. In [8]:
    # Two-GPU run: batch size (512) and learning rate (0.2) are both double the
    # values used in the single-GPU call (256 and 0.1).
    1. train(num_gpus=2, batch_size=512, lr=0.2)
    1. running on: [gpu(0), gpu(1)]
    2. epoch 1, time 7.6 sec, test acc 0.75
    3. epoch 2, time 6.9 sec, test acc 0.86
    4. epoch 3, time 6.8 sec, test acc 0.85
    5. epoch 4, time 6.8 sec, test acc 0.76

    8.5.3. 小结

    • 在Gluon中，可以很方便地进行多GPU计算，例如，在多GPU及相应的显存上初始化模型参数和训练模型。

    8.5.4. 练习

    • 本节使用了ResNet-18模型。试试不同的迭代周期、批量大小和学习率。如果条件允许，使用更多GPU来计算。
    • 有时候，不同设备的计算能力不一样，例如，同时使用CPU和GPU，或者不同GPU之间型号不一样。这时候，应该如何将小批量划分到内存或不同显卡的显存？