1. Training on Multiple GPUs
In [1]:
%matplotlib inline
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
In [2]:
W = torch.randn(size=(3, 4))  # random values drawn from the standard normal distribution (mean 0, variance 1)
scale = 0.01
print(W * scale)
tensor([[-0.0241, -0.0077,  0.0035,  0.0003],
        [-0.0109,  0.0084,  0.0114,  0.0123],
        [ 0.0036, -0.0087,  0.0080,  0.0018]])
In [3]:
# A simple network (a hand-written LeNet variant)
scale = 0.01
W1 = torch.randn(size=(20, 1, 3, 3)) * scale
b1 = torch.zeros(20)
W2 = torch.randn(size=(50, 20, 5, 5)) * scale
b2 = torch.zeros(50)
W3 = torch.randn(size=(800, 128)) * scale
b3 = torch.zeros(128)
W4 = torch.randn(size=(128, 10)) * scale
b4 = torch.zeros(10)
params = [W1, b1, W2, b2, W3, b3, W4, b4]

def lenet(X, params):
    h1_conv = F.conv2d(input=X, weight=params[0], bias=params[1])
    h1_activation = F.relu(h1_conv)
    h1 = F.avg_pool2d(input=h1_activation, kernel_size=(2, 2), stride=(2, 2))
    h2_conv = F.conv2d(input=h1, weight=params[2], bias=params[3])
    h2_activation = F.relu(h2_conv)
    h2 = F.avg_pool2d(input=h2_activation, kernel_size=(2, 2), stride=(2, 2))
    h2 = h2.reshape(h2.shape[0], -1)
    h3_linear = torch.mm(h2, params[4]) + params[5]
    h3 = F.relu(h3_linear)
    y_hat = torch.mm(h3, params[6]) + params[7]
    return y_hat

loss = nn.CrossEntropyLoss(reduction='none')
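As a quick sanity check (my addition, not part of the original notebook), a dummy Fashion-MNIST-sized batch can be pushed through the hand-written LeNet to confirm that the parameter shapes line up:

# 28x28 input -> 3x3 conv -> 26x26 -> 2x2 pool -> 13x13 -> 5x5 conv -> 9x9 -> 2x2 pool -> 4x4,
# so the flattened feature size is 50 * 4 * 4 = 800, matching W3's first dimension.
X = torch.randn(2, 1, 28, 28)
print(lenet(X, params).shape)  # expected: torch.Size([2, 10])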
In [4]:
# Distribute the parameters to multiple devices
def get_params(params, device):
    new_params = [p.clone().to(device) for p in params]  # copy every parameter in params to the given GPU
    for p in new_params:
        p.requires_grad_()  # every copied parameter needs its own gradient
    return new_params

new_params = get_params(params, d2l.try_gpu(0))
print('b1 weight:', new_params[1])
print('b1 grad:', new_params[1].grad)
b1 weight: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
device='cuda:0', requires_grad=True)
b1 grad: None
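A small follow-up check (my own sketch, assuming at least one CUDA device is available): every copy should now sit on cuda:0 and be tracked by autograd.

print({p.device for p in new_params})            # expected: {device(type='cuda', index=0)}
print(all(p.requires_grad for p in new_params))  # expected: True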
In [5]:
# The allreduce function sums all vectors and broadcasts the result back to every GPU
def allreduce(data):  # with four GPUs, the list `data` would hold four elements
    for i in range(1, len(data)):
        data[0][:] += data[i].to(data[0].device)  # copy the tensors from the other GPUs onto GPU 0 and accumulate
    for i in range(1, len(data)):
        data[i] = data[0].to(data[i].device)  # copy the summed result back to every GPU

data = [torch.ones((1, 2), device=d2l.try_gpu(i)) * (i + 1) for i in range(2)]
print('before allreduce:\n', data[0], '\n', data[1])
allreduce(data)  # allreduce sums the gradients across GPUs so that every GPU ends up with the combined gradient
print('after allreduce:\n', data[0], '\n', data[1])
before allreduce:
 tensor([[1., 1.]], device='cuda:0')
 tensor([[2., 2.]])
after allreduce:
 tensor([[3., 3.]], device='cuda:0')
 tensor([[3., 3.]])
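For real multi-process jobs, the same operation is provided by torch.distributed.all_reduce. The sketch below only illustrates that API under the usual one-process-per-GPU setup launched with torchrun; it is not meant to run inside this single-process notebook.

import torch
import torch.distributed as dist

def allreduce_with_torch_distributed():
    dist.init_process_group(backend='nccl')           # torchrun supplies RANK / WORLD_SIZE
    rank = dist.get_rank()
    device = torch.device(f'cuda:{rank}')
    t = torch.ones(1, 2, device=device) * (rank + 1)  # rank 0 holds [[1., 1.]], rank 1 holds [[2., 2.]]
    dist.all_reduce(t, op=dist.ReduceOp.SUM)          # every rank now holds the sum [[3., 3.]]
    print(rank, t)
    dist.destroy_process_group()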
In [6]:
# Distribute a minibatch of data evenly across multiple GPUs
data = torch.arange(20).reshape(4, 5)
devices = [torch.device('cuda:0'), torch.device('cuda:1')]
print('input:', data)
print('load into', devices)
split = nn.parallel.scatter(data, devices)  # split data evenly and place the pieces on different GPUs
print('output:', split)
input: tensor([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14],
[15, 16, 17, 18, 19]])
load into [device(type='cuda', index=0), device(type='cuda', index=1)]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-6-67c72ea6039f> in <module>
----> 6 split = nn.parallel.scatter(data, devices)
...
IndexError: list index out of range

(nn.parallel.scatter fails here because this machine has only one GPU, so cuda:1 does not exist.)
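As a fallback (my own sketch, not from the original notebook), the batch can be split by hand with torch.chunk, which works with however many devices actually exist:

# Split along the batch dimension and move each chunk to its device.
def scatter_manually(data, devices):
    chunks = torch.chunk(data, len(devices), dim=0)
    return [chunk.to(device) for chunk, device in zip(chunks, devices)]

devices = [d2l.try_gpu(i) for i in range(max(torch.cuda.device_count(), 1))]
print(scatter_manually(torch.arange(20).reshape(4, 5), devices))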
In [7]:
def split_batch(X, y, devices):
    """Split X and y across multiple devices"""
    assert X.shape[0] == y.shape[0]  # the number of examples must match the number of labels
    return (nn.parallel.scatter(X, devices),
            nn.parallel.scatter(y, devices))  # both the examples and the labels are spread evenly over the GPUs
In [8]:
# Multi-GPU training on a single minibatch
def train_batch(X, y, device_params, devices, lr):
    X_shards, y_shards = split_batch(X, y, devices)  # split X and y across the GPUs
    # For each GPU, take its X_shard, y_shard and the corresponding parameter copy device_W;
    # sum() accumulates the loss over all examples on that GPU,
    # so ls holds one scalar loss per GPU
    ls = [loss(lenet(X_shard, device_W), y_shard).sum()
          for X_shard, y_shard, device_W in zip(X_shards, y_shards, device_params)]
    for l in ls:
        l.backward()  # backpropagate the loss on each GPU to compute its gradients
    with torch.no_grad():
        for i in range(len(device_params[0])):
            # i indexes the layer parameter, c indexes the GPU
            allreduce([device_params[c][i].grad for c in range(len(devices))])
    for param in device_params:  # once every GPU holds the summed gradients, run SGD separately on each GPU
        d2l.sgd(param, lr, X.shape[0])
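For reference, d2l.sgd used on the last line is plain minibatch SGD with in-place updates; the sketch below paraphrases the book's implementation, so treat it as an approximation rather than the exact library code.

def sgd(params, lr, batch_size):
    """Minibatch stochastic gradient descent (sketch of d2l.sgd)."""
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()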
In [9]:
# Training
def train(num_gpus, batch_size, lr):
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    devices = [d2l.try_gpu(i) for i in range(num_gpus)]  # the GPUs to use
    device_params = [get_params(params, d) for d in devices]  # copy the initial params to every GPU
    num_epochs = 10
    animator = d2l.Animator('epoch', 'test acc', xlim=[1, num_epochs])
    timer = d2l.Timer()
    for epoch in range(num_epochs):
        timer.start()
        for X, y in train_iter:
            train_batch(X, y, device_params, devices, lr)
            torch.cuda.synchronize()  # wait until all GPUs have finished
        timer.stop()
        animator.add(epoch + 1, (d2l.evaluate_accuracy_gpu(
            lambda x: lenet(x, device_params[0]), test_iter, devices[0]),))
    print(f'test acc: {animator.Y[0][-1]:.2f}, {timer.avg():.1f} sec/epoch '
          f'on {str(devices)}')
In [10]:
help(torch.cuda.synchronize)
Help on function synchronize in module torch.cuda:
synchronize(device:Union[torch.device, str, int, NoneType]=None) -> None
Waits for all kernels in all streams on a CUDA device to complete.
Args:
device (torch.device or int, optional): device for which to synchronize.
It uses the current device, given by :func:`~torch.cuda.current_device`,
if :attr:`device` is ``None`` (default).
In [11]:
# Run on a single GPU
train(num_gpus=1, batch_size=256, lr=0.2)
test acc: 0.84, 3.7 sec/epoch on [device(type='cuda', index=0)]
In [12]:
# Scale up to 2 GPUs
train(num_gpus=2, batch_size=256, lr=0.2)  # this machine only has one GPU, alas
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-ef7689592cb4> in <module>
----> 2 train(num_gpus=2, batch_size=256, lr=0.2)
...
ValueError: Expected a non cpu device, but got: cpu

(With only one GPU present, d2l.try_gpu(1) falls back to the CPU, and nn.parallel.scatter refuses CPU devices.)
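One way to fail earlier with a clearer message (my own sketch, not part of the original code) is to compare the requested GPU count against what torch.cuda reports before building the device list:

# Fail fast instead of letting d2l.try_gpu silently fall back to the CPU.
def check_num_gpus(num_gpus):
    available = torch.cuda.device_count()
    assert num_gpus <= available, f'requested {num_gpus} GPUs, but only {available} available'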
2. Concise Implementation for Multiple GPUs
In [13]:
import torch
from torch import nn
from d2l import torch as d2l
In [14]:
def resnet18(num_classes, in_channels=1):
    """A slightly modified ResNet-18 model"""
    def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
        blk = []
        for i in range(num_residuals):
            if i == 0 and not first_block:
                blk.append(d2l.Residual(in_channels, out_channels, use_1x1conv=True, strides=2))
            else:
                blk.append(d2l.Residual(out_channels, out_channels))
        return nn.Sequential(*blk)
    net = nn.Sequential(
        nn.Conv2d(in_channels, 64, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(64),
        nn.ReLU())
    net.add_module("resnet_block1", resnet_block(64, 64, 2, first_block=True))
    net.add_module("resnet_block2", resnet_block(64, 128, 2))
    net.add_module("resnet_block3", resnet_block(128, 256, 2))
    net.add_module("resnet_block4", resnet_block(256, 512, 2))
    net.add_module("resnet_avg_pool", nn.AdaptiveAvgPool2d((1, 1)))
    net.add_module("fc", nn.Sequential(nn.Flatten(), nn.Linear(512, num_classes)))
    return net
net = resnet18(10)
devices = d2l.try_all_gpus()
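A quick shape check (my own addition) confirms that this modified network accepts 28x28 single-channel Fashion-MNIST images:

X = torch.randn(2, 1, 28, 28)
print(net(X).shape)  # expected: torch.Size([2, 10])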
In [15]:
# Training
def train(net, num_gpus, batch_size, lr):
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    devices = [d2l.try_gpu(i) for i in range(num_gpus)]
    def init_weights(m):
        if type(m) in [nn.Linear, nn.Conv2d]:
            nn.init.normal_(m.weight, std=0.01)
    net.apply(init_weights)
    # nn.DataParallel splits X along the batch dimension across the GPUs, runs the forward and
    # backward passes in parallel, and sums the gradients back into the original module;
    # it does this by redefining net's forward function
    net = nn.DataParallel(net, device_ids=devices)  # the network is replicated onto every GPU
    trainer = torch.optim.SGD(net.parameters(), lr)
    loss = nn.CrossEntropyLoss()
    timer, num_epochs = d2l.Timer(), 10
    animator = d2l.Animator('epoch', 'test acc', xlim=[1, num_epochs])
    for epoch in range(num_epochs):
        net.train()
        timer.start()
        for X, y in train_iter:
            trainer.zero_grad()
            X, y = X.to(devices[0]), y.to(devices[0])
            l = loss(net(X), y)
            l.backward()
            trainer.step()
        timer.stop()
        animator.add(epoch + 1, (d2l.evaluate_accuracy_gpu(net, test_iter),))
    print(f'test acc: {animator.Y[0][-1]:.2f}, {timer.avg():.1f} sec/epoch '
          f'on {str(devices)}')
In [16]:
help(nn.DataParallel)
Help on class DataParallel in module torch.nn.parallel.data_parallel:
class DataParallel(torch.nn.modules.module.Module)
| Implements data parallelism at the module level.
|
| This container parallelizes the application of the given :attr:`module` by
| splitting the input across the specified devices by chunking in the batch
| dimension (other objects will be copied once per device). In the forward
| pass, the module is replicated on each device, and each replica handles a
| portion of the input. During the backwards pass, gradients from each replica
| are summed into the original module.
|
| The batch size should be larger than the number of GPUs used.
|
| .. warning::
| It is recommended to use :class:`~torch.nn.parallel.DistributedDataParallel`,
| instead of this class, to do multi-GPU training, even if there is only a single
| node. See: :ref:`cuda-nn-ddp-instead` and :ref:`ddp`.
|
| Arbitrary positional and keyword inputs are allowed to be passed into
| DataParallel but some types are specially handled. tensors will be
| **scattered** on dim specified (default 0). tuple, list and dict types will
| be shallow copied. The other types will be shared among different threads
| and can be corrupted if written to in the model's forward pass.
|
| The parallelized :attr:`module` must have its parameters and buffers on
| ``device_ids[0]`` before running this :class:`~torch.nn.DataParallel`
| module.
|
| .. warning::
| In each forward, :attr:`module` is **replicated** on each device, so any
| updates to the running module in ``forward`` will be lost. For example,
| if :attr:`module` has a counter attribute that is incremented in each
| ``forward``, it will always stay at the initial value because the update
| is done on the replicas which are destroyed after ``forward``. However,
| :class:`~torch.nn.DataParallel` guarantees that the replica on
| ``device[0]`` will have its parameters and buffers sharing storage with
| the base parallelized :attr:`module`. So **in-place** updates to the
| parameters or buffers on ``device[0]`` will be recorded. E.g.,
| :class:`~torch.nn.BatchNorm2d` and :func:`~torch.nn.utils.spectral_norm`
| rely on this behavior to update the buffers.
|
| .. warning::
| Forward and backward hooks defined on :attr:`module` and its submodules
| will be invoked ``len(device_ids)`` times, each with inputs located on
| a particular device. Particularly, the hooks are only guaranteed to be
| executed in correct order with respect to operations on corresponding
| devices. For example, it is not guaranteed that hooks set via
| :meth:`~torch.nn.Module.register_forward_pre_hook` be executed before
| `all` ``len(device_ids)`` :meth:`~torch.nn.Module.forward` calls, but
| that each such hook be executed before the corresponding
| :meth:`~torch.nn.Module.forward` call of that device.
|
| .. warning::
| When :attr:`module` returns a scalar (i.e., 0-dimensional tensor) in
| :func:`forward`, this wrapper will return a vector of length equal to
| number of devices used in data parallelism, containing the result from
| each device.
|
| .. note::
| There is a subtlety in using the
| ``pack sequence -> recurrent network -> unpack sequence`` pattern in a
| :class:`~torch.nn.Module` wrapped in :class:`~torch.nn.DataParallel`.
| See :ref:`pack-rnn-unpack-with-data-parallelism` section in FAQ for
| details.
|
|
| Args:
| module (Module): module to be parallelized
| device_ids (list of int or torch.device): CUDA devices (default: all devices)
| output_device (int or torch.device): device location of output (default: device_ids[0])
|
| Attributes:
| module (Module): the module to be parallelized
|
| Example::
|
| >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2])
| >>> output = net(input_var) # input_var can be on any device, including CPU
|
| Method resolution order:
| DataParallel
| torch.nn.modules.module.Module
| builtins.object
|
| Methods defined here:
|
| __init__(self, module, device_ids=None, output_device=None, dim=0)
| Initializes internal Module state, shared by both nn.Module and ScriptModule.
|
| forward(self, *inputs, **kwargs)
| Defines the computation performed at every call.
|
| Should be overridden by all subclasses.
|
| .. note::
| Although the recipe for forward pass needs to be defined within
| this function, one should call the :class:`Module` instance afterwards
| instead of this since the former takes care of running the
| registered hooks while the latter silently ignores them.
|
| gather(self, outputs, output_device)
|
| parallel_apply(self, replicas, inputs, kwargs)
|
| replicate(self, module, device_ids)
|
| scatter(self, inputs, kwargs, device_ids)
|
| ----------------------------------------------------------------------
| Methods inherited from torch.nn.modules.module.Module:
|
| __call__ = _call_impl(self, *input, **kwargs)
|
| __delattr__(self, name)
| Implement delattr(self, name).
|
| __dir__(self)
| __dir__() -> list
| default dir() implementation
|
| __getattr__(self, name:str) -> Union[torch.Tensor, _ForwardRef('Module')]
|
| __repr__(self)
| Return repr(self).
|
| __setattr__(self, name:str, value:Union[torch.Tensor, _ForwardRef('Module')]) -> None
| Implement setattr(self, name, value).
|
| __setstate__(self, state)
|
| add_module(self, name:str, module:Union[_ForwardRef('Module'), NoneType]) -> None
| Adds a child module to the current module.
|
| The module can be accessed as an attribute using the given name.
|
| Args:
| name (string): name of the child module. The child module can be
| accessed from this module using the given name
| module (Module): child module to be added to the module.
|
| apply(self:~T, fn:Callable[[_ForwardRef('Module')], NoneType]) -> ~T
| Applies ``fn`` recursively to every submodule (as returned by ``.children()``)
| as well as self. Typical use includes initializing the parameters of a model
| (see also :ref:`nn-init-doc`).
|
| Args:
| fn (:class:`Module` -> None): function to be applied to each submodule
|
| Returns:
| Module: self
|
| Example::
|
| >>> @torch.no_grad()
| >>> def init_weights(m):
| >>> print(m)
| >>> if type(m) == nn.Linear:
| >>> m.weight.fill_(1.0)
| >>> print(m.weight)
| >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
| >>> net.apply(init_weights)
| Linear(in_features=2, out_features=2, bias=True)
| Parameter containing:
| tensor([[ 1., 1.],
| [ 1., 1.]])
| Linear(in_features=2, out_features=2, bias=True)
| Parameter containing:
| tensor([[ 1., 1.],
| [ 1., 1.]])
| Sequential(
| (0): Linear(in_features=2, out_features=2, bias=True)
| (1): Linear(in_features=2, out_features=2, bias=True)
| )
| Sequential(
| (0): Linear(in_features=2, out_features=2, bias=True)
| (1): Linear(in_features=2, out_features=2, bias=True)
| )
|
| bfloat16(self:~T) -> ~T
| Casts all floating point parameters and buffers to ``bfloat16`` datatype.
|
| .. note::
| This method modifies the module in-place.
|
| Returns:
| Module: self
|
| buffers(self, recurse:bool=True) -> Iterator[torch.Tensor]
| Returns an iterator over module buffers.
|
| Args:
| recurse (bool): if True, then yields buffers of this module
| and all submodules. Otherwise, yields only buffers that
| are direct members of this module.
|
| Yields:
| torch.Tensor: module buffer
|
| Example::
|
| >>> for buf in model.buffers():
| >>> print(type(buf), buf.size())
| <class 'torch.Tensor'> (20L,)
| <class 'torch.Tensor'> (20L, 1L, 5L, 5L)
|
| children(self) -> Iterator[_ForwardRef('Module')]
| Returns an iterator over immediate children modules.
|
| Yields:
| Module: a child module
|
| cpu(self:~T) -> ~T
| Moves all model parameters and buffers to the CPU.
|
| .. note::
| This method modifies the module in-place.
|
| Returns:
| Module: self
|
| cuda(self:~T, device:Union[int, torch.device, NoneType]=None) -> ~T
| Moves all model parameters and buffers to the GPU.
|
| This also makes associated parameters and buffers different objects. So
| it should be called before constructing optimizer if the module will
| live on GPU while being optimized.
|
| .. note::
| This method modifies the module in-place.
|
| Args:
| device (int, optional): if specified, all parameters will be
| copied to that device
|
| Returns:
| Module: self
|
| double(self:~T) -> ~T
| Casts all floating point parameters and buffers to ``double`` datatype.
|
| .. note::
| This method modifies the module in-place.
|
| Returns:
| Module: self
|
| eval(self:~T) -> ~T
| Sets the module in evaluation mode.
|
| This has any effect only on certain modules. See documentations of
| particular modules for details of their behaviors in training/evaluation
| mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,
| etc.
|
| This is equivalent with :meth:`self.train(False) <torch.nn.Module.train>`.
|
| See :ref:`locally-disable-grad-doc` for a comparison between
| `.eval()` and several similar mechanisms that may be confused with it.
|
| Returns:
| Module: self
|
| extra_repr(self) -> str
| Set the extra representation of the module
|
| To print customized extra information, you should re-implement
| this method in your own modules. Both single-line and multi-line
| strings are acceptable.
|
| float(self:~T) -> ~T
| Casts all floating point parameters and buffers to ``float`` datatype.
|
| .. note::
| This method modifies the module in-place.
|
| Returns:
| Module: self
|
| get_buffer(self, target:str) -> 'Tensor'
| Returns the buffer given by ``target`` if it exists,
| otherwise throws an error.
|
| See the docstring for ``get_submodule`` for a more detailed
| explanation of this method's functionality as well as how to
| correctly specify ``target``.
|
| Args:
| target: The fully-qualified string name of the buffer
| to look for. (See ``get_submodule`` for how to specify a
| fully-qualified string.)
|
| Returns:
| torch.Tensor: The buffer referenced by ``target``
|
| Raises:
| AttributeError: If the target string references an invalid
| path or resolves to something that is not a
| buffer
|
| get_extra_state(self) -> Any
| Returns any extra state to include in the module's state_dict.
| Implement this and a corresponding :func:`set_extra_state` for your module
| if you need to store extra state. This function is called when building the
| module's `state_dict()`.
|
| Note that extra state should be pickleable to ensure working serialization
| of the state_dict. We only provide provide backwards compatibility guarantees
| for serializing Tensors; other objects may break backwards compatibility if
| their serialized pickled form changes.
|
| Returns:
| object: Any extra state to store in the module's state_dict
|
| get_parameter(self, target:str) -> 'Parameter'
| Returns the parameter given by ``target`` if it exists,
| otherwise throws an error.
|
| See the docstring for ``get_submodule`` for a more detailed
| explanation of this method's functionality as well as how to
| correctly specify ``target``.
|
| Args:
| target: The fully-qualified string name of the Parameter
| to look for. (See ``get_submodule`` for how to specify a
| fully-qualified string.)
|
| Returns:
| torch.nn.Parameter: The Parameter referenced by ``target``
|
| Raises:
| AttributeError: If the target string references an invalid
| path or resolves to something that is not an
| ``nn.Parameter``
|
| get_submodule(self, target:str) -> 'Module'
| Returns the submodule given by ``target`` if it exists,
| otherwise throws an error.
|
| For example, let's say you have an ``nn.Module`` ``A`` that
| looks like this:
|
| .. code-block::text
|
| A(
| (net_b): Module(
| (net_c): Module(
| (conv): Conv2d(16, 33, kernel_size=(3, 3), stride=(2, 2))
| )
| (linear): Linear(in_features=100, out_features=200, bias=True)
| )
| )
|
| (The diagram shows an ``nn.Module`` ``A``. ``A`` has a nested
| submodule ``net_b``, which itself has two submodules ``net_c``
| and ``linear``. ``net_c`` then has a submodule ``conv``.)
|
| To check whether or not we have the ``linear`` submodule, we
| would call ``get_submodule("net_b.linear")``. To check whether
| we have the ``conv`` submodule, we would call
| ``get_submodule("net_b.net_c.conv")``.
|
| The runtime of ``get_submodule`` is bounded by the degree
| of module nesting in ``target``. A query against
| ``named_modules`` achieves the same result, but it is O(N) in
| the number of transitive modules. So, for a simple check to see
| if some submodule exists, ``get_submodule`` should always be
| used.
|
| Args:
| target: The fully-qualified string name of the submodule
| to look for. (See above example for how to specify a
| fully-qualified string.)
|
| Returns:
| torch.nn.Module: The submodule referenced by ``target``
|
| Raises:
| AttributeError: If the target string references an invalid
| path or resolves to something that is not an
| ``nn.Module``
|
| half(self:~T) -> ~T
| Casts all floating point parameters and buffers to ``half`` datatype.
|
| .. note::
| This method modifies the module in-place.
|
| Returns:
| Module: self
|
| load_state_dict(self, state_dict:'OrderedDict[str, Tensor]', strict:bool=True)
| Copies parameters and buffers from :attr:`state_dict` into
| this module and its descendants. If :attr:`strict` is ``True``, then
| the keys of :attr:`state_dict` must exactly match the keys returned
| by this module's :meth:`~torch.nn.Module.state_dict` function.
|
| Args:
| state_dict (dict): a dict containing parameters and
| persistent buffers.
| strict (bool, optional): whether to strictly enforce that the keys
| in :attr:`state_dict` match the keys returned by this module's
| :meth:`~torch.nn.Module.state_dict` function. Default: ``True``
|
| Returns:
| ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
| * **missing_keys** is a list of str containing the missing keys
| * **unexpected_keys** is a list of str containing the unexpected keys
|
| Note:
| If a parameter or buffer is registered as ``None`` and its corresponding key
| exists in :attr:`state_dict`, :meth:`load_state_dict` will raise a
| ``RuntimeError``.
|
| modules(self) -> Iterator[_ForwardRef('Module')]
| Returns an iterator over all modules in the network.
|
| Yields:
| Module: a module in the network
|
| Note:
| Duplicate modules are returned only once. In the following
| example, ``l`` will be returned only once.
|
| Example::
|
| >>> l = nn.Linear(2, 2)
| >>> net = nn.Sequential(l, l)
| >>> for idx, m in enumerate(net.modules()):
| print(idx, '->', m)
|
| 0 -> Sequential(
| (0): Linear(in_features=2, out_features=2, bias=True)
| (1): Linear(in_features=2, out_features=2, bias=True)
| )
| 1 -> Linear(in_features=2, out_features=2, bias=True)
|
| named_buffers(self, prefix:str='', recurse:bool=True) -> Iterator[Tuple[str, torch.Tensor]]
| Returns an iterator over module buffers, yielding both the
| name of the buffer as well as the buffer itself.
|
| Args:
| prefix (str): prefix to prepend to all buffer names.
| recurse (bool): if True, then yields buffers of this module
| and all submodules. Otherwise, yields only buffers that
| are direct members of this module.
|
| Yields:
| (string, torch.Tensor): Tuple containing the name and buffer
|
| Example::
|
| >>> for name, buf in self.named_buffers():
| >>> if name in ['running_var']:
| >>> print(buf.size())
|
| named_children(self) -> Iterator[Tuple[str, _ForwardRef('Module')]]
| Returns an iterator over immediate children modules, yielding both
| the name of the module as well as the module itself.
|
| Yields:
| (string, Module): Tuple containing a name and child module
|
| Example::
|
| >>> for name, module in model.named_children():
| >>> if name in ['conv4', 'conv5']:
| >>> print(module)
|
| named_modules(self, memo:Union[Set[_ForwardRef('Module')], NoneType]=None, prefix:str='', remove_duplicate:bool=True)
| Returns an iterator over all modules in the network, yielding
| both the name of the module as well as the module itself.
|
| Args:
| memo: a memo to store the set of modules already added to the result
| prefix: a prefix that will be added to the name of the module
| remove_duplicate: whether to remove the duplicated module instances in the result
| or not
|
| Yields:
| (string, Module): Tuple of name and module
|
| Note:
| Duplicate modules are returned only once. In the following
| example, ``l`` will be returned only once.
|
| Example::
|
| >>> l = nn.Linear(2, 2)
| >>> net = nn.Sequential(l, l)
| >>> for idx, m in enumerate(net.named_modules()):
| print(idx, '->', m)
|
| 0 -> ('', Sequential(
| (0): Linear(in_features=2, out_features=2, bias=True)
| (1): Linear(in_features=2, out_features=2, bias=True)
| ))
| 1 -> ('0', Linear(in_features=2, out_features=2, bias=True))
|
| named_parameters(self, prefix:str='', recurse:bool=True) -> Iterator[Tuple[str, torch.nn.parameter.Parameter]]
| Returns an iterator over module parameters, yielding both the
| name of the parameter as well as the parameter itself.
|
| Args:
| prefix (str): prefix to prepend to all parameter names.
| recurse (bool): if True, then yields parameters of this module
| and all submodules. Otherwise, yields only parameters that
| are direct members of this module.
|
| Yields:
| (string, Parameter): Tuple containing the name and parameter
|
| Example::
|
| >>> for name, param in self.named_parameters():
| >>> if name in ['bias']:
| >>> print(param.size())
|
| parameters(self, recurse:bool=True) -> Iterator[torch.nn.parameter.Parameter]
| Returns an iterator over module parameters.
|
| This is typically passed to an optimizer.
|
| Args:
| recurse (bool): if True, then yields parameters of this module
| and all submodules. Otherwise, yields only parameters that
| are direct members of this module.
|
| Yields:
| Parameter: module parameter
|
| Example::
|
| >>> for param in model.parameters():
| >>> print(type(param), param.size())
| <class 'torch.Tensor'> (20L,)
| <class 'torch.Tensor'> (20L, 1L, 5L, 5L)
|
| register_backward_hook(self, hook:Callable[[_ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, torch.Tensor]]) -> torch.utils.hooks.RemovableHandle
| Registers a backward hook on the module.
|
| This function is deprecated in favor of :meth:`~torch.nn.Module.register_full_backward_hook` and
| the behavior of this function will change in future versions.
|
| Returns:
| :class:`torch.utils.hooks.RemovableHandle`:
| a handle that can be used to remove the added hook by calling
| ``handle.remove()``
|
| register_buffer(self, name:str, tensor:Union[torch.Tensor, NoneType], persistent:bool=True) -> None
| Adds a buffer to the module.
|
| This is typically used to register a buffer that should not to be
| considered a model parameter. For example, BatchNorm's ``running_mean``
| is not a parameter, but is part of the module's state. Buffers, by
| default, are persistent and will be saved alongside parameters. This
| behavior can be changed by setting :attr:`persistent` to ``False``. The
| only difference between a persistent buffer and a non-persistent buffer
| is that the latter will not be a part of this module's
| :attr:`state_dict`.
|
| Buffers can be accessed as attributes using given names.
|
| Args:
| name (string): name of the buffer. The buffer can be accessed
| from this module using the given name
| tensor (Tensor or None): buffer to be registered. If ``None``, then operations
| that run on buffers, such as :attr:`cuda`, are ignored. If ``None``,
| the buffer is **not** included in the module's :attr:`state_dict`.
| persistent (bool): whether the buffer is part of this module's
| :attr:`state_dict`.
|
| Example::
|
| >>> self.register_buffer('running_mean', torch.zeros(num_features))
|
| register_forward_hook(self, hook:Callable[..., NoneType]) -> torch.utils.hooks.RemovableHandle
| Registers a forward hook on the module.
|
| The hook will be called every time after :func:`forward` has computed an output.
| It should have the following signature::
|
| hook(module, input, output) -> None or modified output
|
| The input contains only the positional arguments given to the module.
| Keyword arguments won't be passed to the hooks and only to the ``forward``.
| The hook can modify the output. It can modify the input inplace but
| it will not have effect on forward since this is called after
| :func:`forward` is called.
|
| Returns:
| :class:`torch.utils.hooks.RemovableHandle`:
| a handle that can be used to remove the added hook by calling
| ``handle.remove()``
|
| register_forward_pre_hook(self, hook:Callable[..., NoneType]) -> torch.utils.hooks.RemovableHandle
| Registers a forward pre-hook on the module.
|
| The hook will be called every time before :func:`forward` is invoked.
| It should have the following signature::
|
| hook(module, input) -> None or modified input
|
| The input contains only the positional arguments given to the module.
| Keyword arguments won't be passed to the hooks and only to the ``forward``.
| The hook can modify the input. User can either return a tuple or a
| single modified value in the hook. We will wrap the value into a tuple
| if a single value is returned(unless that value is already a tuple).
|
| Returns:
| :class:`torch.utils.hooks.RemovableHandle`:
| a handle that can be used to remove the added hook by calling
| ``handle.remove()``
|
| register_full_backward_hook(self, hook:Callable[[_ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, torch.Tensor]]) -> torch.utils.hooks.RemovableHandle
| Registers a backward hook on the module.
|
| The hook will be called every time the gradients with respect to module
| inputs are computed. The hook should have the following signature::
|
| hook(module, grad_input, grad_output) -> tuple(Tensor) or None
|
| The :attr:`grad_input` and :attr:`grad_output` are tuples that contain the gradients
| with respect to the inputs and outputs respectively. The hook should
| not modify its arguments, but it can optionally return a new gradient with
| respect to the input that will be used in place of :attr:`grad_input` in
| subsequent computations. :attr:`grad_input` will only correspond to the inputs given
| as positional arguments and all kwarg arguments are ignored. Entries
| in :attr:`grad_input` and :attr:`grad_output` will be ``None`` for all non-Tensor
| arguments.
|
| For technical reasons, when this hook is applied to a Module, its forward function will
| receive a view of each Tensor passed to the Module. Similarly the caller will receive a view
| of each Tensor returned by the Module's forward function.
|
| .. warning ::
| Modifying inputs or outputs inplace is not allowed when using backward hooks and
| will raise an error.
|
| Returns:
| :class:`torch.utils.hooks.RemovableHandle`:
| a handle that can be used to remove the added hook by calling
| ``handle.remove()``
|
| register_parameter(self, name:str, param:Union[torch.nn.parameter.Parameter, NoneType]) -> None
| Adds a parameter to the module.
|
| The parameter can be accessed as an attribute using given name.
|
| Args:
| name (string): name of the parameter. The parameter can be accessed
| from this module using the given name
| param (Parameter or None): parameter to be added to the module. If
| ``None``, then operations that run on parameters, such as :attr:`cuda`,
| are ignored. If ``None``, the parameter is **not** included in the
| module's :attr:`state_dict`.
|
| requires_grad_(self:~T, requires_grad:bool=True) -> ~T
| Change if autograd should record operations on parameters in this
| module.
|
| This method sets the parameters' :attr:`requires_grad` attributes
| in-place.
|
| This method is helpful for freezing part of the module for finetuning
| or training parts of a model individually (e.g., GAN training).
|
| See :ref:`locally-disable-grad-doc` for a comparison between
| `.requires_grad_()` and several similar mechanisms that may be confused with it.
|
| Args:
| requires_grad (bool): whether autograd should record operations on
| parameters in this module. Default: ``True``.
|
| Returns:
| Module: self
|
| set_extra_state(self, state:Any)
| This function is called from :func:`load_state_dict` to handle any extra state
| found within the `state_dict`. Implement this function and a corresponding
| :func:`get_extra_state` for your module if you need to store extra state within its
| `state_dict`.
|
| Args:
| state (dict): Extra state from the `state_dict`
|
| share_memory(self:~T) -> ~T
| See :meth:`torch.Tensor.share_memory_`
|
| state_dict(self, destination=None, prefix='', keep_vars=False)
| Returns a dictionary containing a whole state of the module.
|
| Both parameters and persistent buffers (e.g. running averages) are
| included. Keys are corresponding parameter and buffer names.
| Parameters and buffers set to ``None`` are not included.
|
| Returns:
| dict:
| a dictionary containing a whole state of the module
|
| Example::
|
| >>> module.state_dict().keys()
| ['bias', 'weight']
|
| to(self, *args, **kwargs)
| Moves and/or casts the parameters and buffers.
|
| This can be called as
|
| .. function:: to(device=None, dtype=None, non_blocking=False)
| :noindex:
|
| .. function:: to(dtype, non_blocking=False)
| :noindex:
|
| .. function:: to(tensor, non_blocking=False)
| :noindex:
|
| .. function:: to(memory_format=torch.channels_last)
| :noindex:
|
| Its signature is similar to :meth:`torch.Tensor.to`, but only accepts
| floating point or complex :attr:`dtype`\ s. In addition, this method will
| only cast the floating point or complex parameters and buffers to :attr:`dtype`
| (if given). The integral parameters and buffers will be moved
| :attr:`device`, if that is given, but with dtypes unchanged. When
| :attr:`non_blocking` is set, it tries to convert/move asynchronously
| with respect to the host if possible, e.g., moving CPU Tensors with
| pinned memory to CUDA devices.
|
| See below for examples.
|
| .. note::
| This method modifies the module in-place.
|
| Args:
| device (:class:`torch.device`): the desired device of the parameters
| and buffers in this module
| dtype (:class:`torch.dtype`): the desired floating point or complex dtype of
| the parameters and buffers in this module
| tensor (torch.Tensor): Tensor whose dtype and device are the desired
| dtype and device for all parameters and buffers in this module
| memory_format (:class:`torch.memory_format`): the desired memory
| format for 4D parameters and buffers in this module (keyword
| only argument)
|
| Returns:
| Module: self
|
| Examples::
|
| >>> linear = nn.Linear(2, 2)
| >>> linear.weight
| Parameter containing:
| tensor([[ 0.1913, -0.3420],
| [-0.5113, -0.2325]])
| >>> linear.to(torch.double)
| Linear(in_features=2, out_features=2, bias=True)
| >>> linear.weight
| Parameter containing:
| tensor([[ 0.1913, -0.3420],
| [-0.5113, -0.2325]], dtype=torch.float64)
| >>> gpu1 = torch.device("cuda:1")
| >>> linear.to(gpu1, dtype=torch.half, non_blocking=True)
| Linear(in_features=2, out_features=2, bias=True)
| >>> linear.weight
| Parameter containing:
| tensor([[ 0.1914, -0.3420],
| [-0.5112, -0.2324]], dtype=torch.float16, device='cuda:1')
| >>> cpu = torch.device("cpu")
| >>> linear.to(cpu)
| Linear(in_features=2, out_features=2, bias=True)
| >>> linear.weight
| Parameter containing:
| tensor([[ 0.1914, -0.3420],
| [-0.5112, -0.2324]], dtype=torch.float16)
|
| >>> linear = nn.Linear(2, 2, bias=None).to(torch.cdouble)
| >>> linear.weight
| Parameter containing:
| tensor([[ 0.3741+0.j, 0.2382+0.j],
| [ 0.5593+0.j, -0.4443+0.j]], dtype=torch.complex128)
| >>> linear(torch.ones(3, 2, dtype=torch.cdouble))
| tensor([[0.6122+0.j, 0.1150+0.j],
| [0.6122+0.j, 0.1150+0.j],
| [0.6122+0.j, 0.1150+0.j]], dtype=torch.complex128)
|
| to_empty(self:~T, *, device:Union[str, torch.device]) -> ~T
| Moves the parameters and buffers to the specified device without copying storage.
|
| Args:
| device (:class:`torch.device`): The desired device of the parameters
| and buffers in this module.
|
| Returns:
| Module: self
|
| train(self:~T, mode:bool=True) -> ~T
| Sets the module in training mode.
|
| This has any effect only on certain modules. See documentations of
| particular modules for details of their behaviors in training/evaluation
| mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,
| etc.
|
| Args:
| mode (bool): whether to set training mode (``True``) or evaluation
| mode (``False``). Default: ``True``.
|
| Returns:
| Module: self
|
| type(self:~T, dst_type:Union[torch.dtype, str]) -> ~T
| Casts all parameters and buffers to :attr:`dst_type`.
|
| .. note::
| This method modifies the module in-place.
|
| Args:
| dst_type (type or string): the desired type
|
| Returns:
| Module: self
|
| xpu(self:~T, device:Union[int, torch.device, NoneType]=None) -> ~T
| Moves all model parameters and buffers to the XPU.
|
| This also makes associated parameters and buffers different objects. So
| it should be called before constructing optimizer if the module will
| live on XPU while being optimized.
|
| .. note::
| This method modifies the module in-place.
|
| Arguments:
| device (int, optional): if specified, all parameters will be
| copied to that device
|
| Returns:
| Module: self
|
| zero_grad(self, set_to_none:bool=False) -> None
| Sets gradients of all model parameters to zero. See similar function
| under :class:`torch.optim.Optimizer` for more context.
|
| Args:
| set_to_none (bool): instead of setting to zero, set the grads to None.
| See :meth:`torch.optim.Optimizer.zero_grad` for details.
|
| ----------------------------------------------------------------------
| Data descriptors inherited from torch.nn.modules.module.Module:
|
| __dict__
| dictionary for instance variables (if defined)
|
| __weakref__
| list of weak references to the object (if defined)
|
| ----------------------------------------------------------------------
| Data and other attributes inherited from torch.nn.modules.module.Module:
|
| T_destination = ~T_destination
| Type variable.
|
| Usage::
|
| T = TypeVar('T') # Can be anything
| A = TypeVar('A', str, bytes) # Must be str or bytes
|
| Type variables exist primarily for the benefit of static type
| checkers. They serve as the parameters for generic types as well
| as for generic function definitions. See class Generic for more
| information on generic types. Generic functions work as follows:
|
| def repeat(x: T, n: int) -> List[T]:
| '''Return a list containing n references to x.'''
| return [x]*n
|
| def longest(x: A, y: A) -> A:
| '''Return the longest of two strings.'''
| return x if len(x) >= len(y) else y
|
| The latter example's signature is essentially the overloading
| of (str, str) -> str and (bytes, bytes) -> bytes. Also note
| that if the arguments are instances of some subclass of str,
| the return type is still plain str.
|
| At runtime, isinstance(x, T) and issubclass(C, T) will raise TypeError.
|
| Type variables defined with covariant=True or contravariant=True
| can be used do declare covariant or contravariant generic types.
| See PEP 484 for more details. By default generic types are invariant
| in all type variables.
|
| Type variables can be introspected. e.g.:
|
| T.__name__ == 'T'
| T.__constraints__ == ()
| T.__covariant__ == False
| T.__contravariant__ = False
| A.__constraints__ == (str, bytes)
|
| __annotations__ = {'__call__': typing.Callable[..., typing.Any], '_is_...
|
| dump_patches = False
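The warning near the top of this help text recommends DistributedDataParallel over DataParallel. For completeness, a minimal sketch of the DDP pattern (assumes one process per GPU launched with torchrun; it is not meant to run inside this single-process notebook):

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def wrap_with_ddp(net):
    dist.init_process_group(backend='nccl')  # torchrun supplies RANK / WORLD_SIZE
    rank = dist.get_rank()
    torch.cuda.set_device(rank)
    net = net.to(rank)
    return DDP(net, device_ids=[rank])       # gradients are all-reduced across processes during backward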
In [17]:
# Train the network on a single GPU
train(net, num_gpus=1, batch_size=256, lr=0.1)
test acc: 0.92, 27.5 sec/epoch on [device(type='cuda', index=0)]
In [18]:
# Train with 2 GPUs
train(net, num_gpus=2, batch_size=512, lr=0.2)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-18-767446712b65> in <module>
----> 2 train(net, num_gpus=2, batch_size=512, lr=0.2)
...
ValueError: Expected a non cpu device, but got: cpu

(As before, d2l.try_gpu(1) falls back to the CPU on this single-GPU machine, and nn.DataParallel rejects CPU entries in device_ids.)
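A possible workaround (my own suggestion, assuming only cuda:0 exists on this machine) is to cap the requested GPU count at what the machine actually reports:

train(net, num_gpus=min(2, torch.cuda.device_count()), batch_size=512, lr=0.2)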