Shortcuts

Source code for mmpretrain.models.utils.data_preprocessor

# Copyright (c) OpenMMLab. All rights reserved.
import math
from numbers import Number
from typing import List, Optional, Sequence, Tuple, Union

import torch
import torch.nn.functional as F
from mmengine.model import (BaseDataPreprocessor, ImgDataPreprocessor,
                            stack_batch)

from mmpretrain.registry import MODELS
from mmpretrain.structures import (DataSample, MultiTaskDataSample,
                                   batch_label_to_onehot, cat_batch_labels,
                                   tensor_split)
from .batch_augments import RandomBatchAugment


[docs]@MODELS.register_module() class ClsDataPreprocessor(BaseDataPreprocessor): """Image pre-processor for classification tasks. Comparing with the :class:`mmengine.model.ImgDataPreprocessor`, 1. It won't do normalization if ``mean`` is not specified. 2. It does normalization and color space conversion after stacking batch. 3. It supports batch augmentations like mixup and cutmix. It provides the data pre-processing as follows - Collate and move data to the target device. - Pad inputs to the maximum size of current batch with defined ``pad_value``. The padding size can be divisible by a defined ``pad_size_divisor`` - Stack inputs to batch_inputs. - Convert inputs from bgr to rgb if the shape of input is (3, H, W). - Normalize image with defined std and mean. - Do batch augmentations like Mixup and Cutmix during training. Args: mean (Sequence[Number], optional): The pixel mean of R, G, B channels. Defaults to None. std (Sequence[Number], optional): The pixel standard deviation of R, G, B channels. Defaults to None. pad_size_divisor (int): The size of padded image should be divisible by ``pad_size_divisor``. Defaults to 1. pad_value (Number): The padded pixel value. Defaults to 0. to_rgb (bool): whether to convert image from BGR to RGB. Defaults to False. to_onehot (bool): Whether to generate one-hot format gt-labels and set to data samples. Defaults to False. num_classes (int, optional): The number of classes. Defaults to None. batch_augments (dict, optional): The batch augmentations settings, including "augments" and "probs". For more details, see :class:`mmpretrain.models.RandomBatchAugment`. """ def __init__(self, mean: Sequence[Number] = None, std: Sequence[Number] = None, pad_size_divisor: int = 1, pad_value: Number = 0, to_rgb: bool = False, to_onehot: bool = False, num_classes: Optional[int] = None, batch_augments: Optional[dict] = None): super().__init__() self.pad_size_divisor = pad_size_divisor self.pad_value = pad_value self.to_rgb = to_rgb self.to_onehot = to_onehot self.num_classes = num_classes if mean is not None: assert std is not None, 'To enable the normalization in ' \ 'preprocessing, please specify both `mean` and `std`.' # Enable the normalization in preprocessing. self._enable_normalize = True self.register_buffer('mean', torch.tensor(mean).view(-1, 1, 1), False) self.register_buffer('std', torch.tensor(std).view(-1, 1, 1), False) else: self._enable_normalize = False if batch_augments: self.batch_augments = RandomBatchAugment(**batch_augments) if not self.to_onehot: from mmengine.logging import MMLogger MMLogger.get_current_instance().info( 'Because batch augmentations are enabled, the data ' 'preprocessor automatically enables the `to_onehot` ' 'option to generate one-hot format labels.') self.to_onehot = True else: self.batch_augments = None
[docs] def forward(self, data: dict, training: bool = False) -> dict: """Perform normalization, padding, bgr2rgb conversion and batch augmentation based on ``BaseDataPreprocessor``. Args: data (dict): data sampled from dataloader. training (bool): Whether to enable training time augmentation. Returns: dict: Data in the same format as the model input. """ inputs = self.cast_data(data['inputs']) if isinstance(inputs, torch.Tensor): # The branch if use `default_collate` as the collate_fn in the # dataloader. # ------ To RGB ------ if self.to_rgb and inputs.size(1) == 3: inputs = inputs.flip(1) # -- Normalization --- inputs = inputs.float() if self._enable_normalize: inputs = (inputs - self.mean) / self.std # ------ Padding ----- if self.pad_size_divisor > 1: h, w = inputs.shape[-2:] target_h = math.ceil( h / self.pad_size_divisor) * self.pad_size_divisor target_w = math.ceil( w / self.pad_size_divisor) * self.pad_size_divisor pad_h = target_h - h pad_w = target_w - w inputs = F.pad(inputs, (0, pad_w, 0, pad_h), 'constant', self.pad_value) else: # The branch if use `pseudo_collate` as the collate_fn in the # dataloader. processed_inputs = [] for input_ in inputs: # ------ To RGB ------ if self.to_rgb and input_.size(0) == 3: input_ = input_.flip(0) # -- Normalization --- input_ = input_.float() if self._enable_normalize: input_ = (input_ - self.mean) / self.std processed_inputs.append(input_) # Combine padding and stack inputs = stack_batch(processed_inputs, self.pad_size_divisor, self.pad_value) data_samples = data.get('data_samples', None) sample_item = data_samples[0] if data_samples is not None else None if isinstance(sample_item, DataSample): batch_label = None batch_score = None if 'gt_label' in sample_item: gt_labels = [sample.gt_label for sample in data_samples] batch_label, label_indices = cat_batch_labels(gt_labels) batch_label = batch_label.to(self.device) if 'gt_score' in sample_item: gt_scores = [sample.gt_score for sample in data_samples] batch_score = torch.stack(gt_scores).to(self.device) elif self.to_onehot and 'gt_label' in sample_item: assert batch_label is not None, \ 'Cannot generate onehot format labels because no labels.' num_classes = self.num_classes or sample_item.get( 'num_classes') assert num_classes is not None, \ 'Cannot generate one-hot format labels because not set ' \ '`num_classes` in `data_preprocessor`.' batch_score = batch_label_to_onehot( batch_label, label_indices, num_classes).to(self.device) # ----- Batch Augmentations ---- if (training and self.batch_augments is not None and batch_score is not None): inputs, batch_score = self.batch_augments(inputs, batch_score) # ----- scatter labels and scores to data samples --- if batch_label is not None: for sample, label in zip( data_samples, tensor_split(batch_label, label_indices)): sample.set_gt_label(label) if batch_score is not None: for sample, score in zip(data_samples, batch_score): sample.set_gt_score(score) elif isinstance(sample_item, MultiTaskDataSample): data_samples = self.cast_data(data_samples) return {'inputs': inputs, 'data_samples': data_samples}
[docs]@MODELS.register_module() class SelfSupDataPreprocessor(ImgDataPreprocessor): """Image pre-processor for operations, like normalization and bgr to rgb. Compared with the :class:`mmengine.ImgDataPreprocessor`, this module supports ``inputs`` as torch.Tensor or a list of torch.Tensor. """ def __init__(self, mean: Optional[Sequence[Union[float, int]]] = None, std: Optional[Sequence[Union[float, int]]] = None, pad_size_divisor: int = 1, pad_value: Union[float, int] = 0, to_rgb: bool = False, bgr_to_rgb: bool = False, rgb_to_bgr: bool = False, non_blocking: Optional[bool] = False): super().__init__( mean=mean, std=std, pad_size_divisor=pad_size_divisor, pad_value=pad_value, bgr_to_rgb=bgr_to_rgb, rgb_to_bgr=rgb_to_bgr, non_blocking=non_blocking) self._channel_conversion = to_rgb or bgr_to_rgb or rgb_to_bgr
[docs] def forward( self, data: dict, training: bool = False ) -> Tuple[List[torch.Tensor], Optional[list]]: """Performs normalization and bgr2rgb conversion based on ``BaseDataPreprocessor``. Args: data (dict): data sampled from dataloader. training (bool): Whether to enable training time augmentation. If subclasses override this method, they can perform different preprocessing strategies for training and testing based on the value of ``training``. Returns: Tuple[torch.Tensor, Optional[list]]: Data in the same format as the model input. """ assert isinstance(data, dict), 'Please use default_collate in dataloader, \ instead of pseudo_collate.' data = [val for _, val in data.items()] batch_inputs, batch_data_samples = self.cast_data(data) # Here is what is different from :class:`mmengine.ImgDataPreprocessor` # Since there are multiple views for an image for some algorithms, # e.g. SimCLR, each item in inputs is a list, containing multi-views # for an image. if isinstance(batch_inputs, list): # channel transform if self._channel_conversion: batch_inputs = [ _input[:, [2, 1, 0], ...] for _input in batch_inputs ] # convert to float after channel conversion to ensure efficiency batch_inputs = [_input.float() for _input in batch_inputs] # normalization. if self._enable_normalize: batch_inputs = [(_input - self.mean) / self.std for _input in batch_inputs] else: # channel transform if self._channel_conversion: batch_inputs = batch_inputs[:, [2, 1, 0], ...] # convert to float after channel conversion to ensure efficiency batch_inputs = batch_inputs.float() # normalization. if self._enable_normalize: batch_inputs = (batch_inputs - self.mean) / self.std return {'inputs': batch_inputs, 'data_samples': batch_data_samples}
[docs]@MODELS.register_module() class TwoNormDataPreprocessor(SelfSupDataPreprocessor): """Image pre-processor for CAE, BEiT v1/v2, etc. Compared with the :class:`mmselfsup.SelfSupDataPreprocessor`, this module will normalize the prediction image and target image with different normalization parameters. Args: mean (Sequence[float or int], optional): The pixel mean of image channels. If ``to_rgb=True`` it means the mean value of R, G, B channels. If the length of `mean` is 1, it means all channels have the same mean value, or the input is a gray image. If it is not specified, images will not be normalized. Defaults to None. std (Sequence[float or int], optional): The pixel standard deviation of image channels. If ``to_rgb=True`` it means the standard deviation of R, G, B channels. If the length of `std` is 1, it means all channels have the same standard deviation, or the input is a gray image. If it is not specified, images will not be normalized. Defaults to None. second_mean (Sequence[float or int], optional): The description is like ``mean``, it can be customized for targe image. Defaults to None. second_std (Sequence[float or int], optional): The description is like ``std``, it can be customized for targe image. Defaults to None. pad_size_divisor (int): The size of padded image should be divisible by ``pad_size_divisor``. Defaults to 1. pad_value (float or int): The padded pixel value. Defaults to 0. to_rgb (bool): whether to convert image from BGR to RGB. Defaults to False. non_blocking (bool): Whether block current process when transferring data to device. Defaults to False. """ def __init__(self, mean: Optional[Sequence[Union[float, int]]] = None, std: Optional[Sequence[Union[float, int]]] = None, second_mean: Sequence[Union[float, int]] = None, second_std: Sequence[Union[float, int]] = None, pad_size_divisor: int = 1, pad_value: Union[float, int] = 0, to_rgb: bool = False, non_blocking: Optional[bool] = False): super().__init__( mean=mean, std=std, pad_size_divisor=pad_size_divisor, pad_value=pad_value, to_rgb=to_rgb, non_blocking=non_blocking) assert (second_mean is not None) and (second_std is not None), ( 'mean and std should not be None while using ' '`TwoNormDataPreprocessor`') assert len(second_mean) == 3 or len(second_mean) == 1, ( '`mean` should have 1 or 3 values, to be compatible with ' f'RGB or gray image, but got {len(second_mean)} values') assert len(second_std) == 3 or len(second_std) == 1, ( '`std` should have 1 or 3 values, to be compatible with RGB ' f'or gray image, but got {len(std)} values') self.register_buffer('second_mean', torch.tensor(second_mean).view(-1, 1, 1), False) self.register_buffer('second_std', torch.tensor(second_std).view(-1, 1, 1), False)
[docs] def forward( self, data: dict, training: bool = False ) -> Tuple[List[torch.Tensor], Optional[list]]: """Performs normalization and bgr2rgb conversion based on ``BaseDataPreprocessor``. The ``batch_inputs`` in forward function is a list. Args: data (dict): data sampled from dataloader. training (bool): Whether to enable training time augmentation. If subclasses override this method, they can perform different preprocessing strategies for training and testing based on the value of ``training``. Returns: Tuple[torch.Tensor, Optional[list]]: Data in the same format as the model input. """ data = [val for _, val in data.items()] batch_inputs, batch_data_samples = self.cast_data(data) # channel transform if self._channel_conversion: batch_inputs = [ _input[:, [2, 1, 0], ...] for _input in batch_inputs ] # convert to float after channel conversion to ensure efficiency batch_inputs = [_input.float() for _input in batch_inputs] # Normalization. Here is what is different from # :class:`mmselfsup.SelfSupDataPreprocessor`. Normalize the target # image and prediction image with different normalization params if self._enable_normalize: batch_inputs = [ (batch_inputs[0] - self.mean) / self.std, (batch_inputs[1] - self.second_mean) / self.second_std ] return {'inputs': batch_inputs, 'data_samples': batch_data_samples}
[docs]@MODELS.register_module() class VideoDataPreprocessor(BaseDataPreprocessor): """Video pre-processor for operations, like normalization and bgr to rgb conversion . Compared with the :class:`mmaction.ActionDataPreprocessor`, this module supports ``inputs`` as torch.Tensor or a list of torch.Tensor. Args: mean (Sequence[float or int, optional): The pixel mean of channels of images or stacked optical flow. Defaults to None. std (Sequence[float or int], optional): The pixel standard deviation of channels of images or stacked optical flow. Defaults to None. pad_size_divisor (int): The size of padded image should be divisible by ``pad_size_divisor``. Defaults to 1. pad_value (float or int): The padded pixel value. Defaults to 0. to_rgb (bool): Whether to convert image from BGR to RGB. Defaults to False. format_shape (str): Format shape of input data. Defaults to ``'NCHW'``. """ def __init__(self, mean: Optional[Sequence[Union[float, int]]] = None, std: Optional[Sequence[Union[float, int]]] = None, pad_size_divisor: int = 1, pad_value: Union[float, int] = 0, to_rgb: bool = False, format_shape: str = 'NCHW') -> None: super().__init__() self.pad_size_divisor = pad_size_divisor self.pad_value = pad_value self.to_rgb = to_rgb self.format_shape = format_shape if mean is not None: assert std is not None, 'To enable the normalization in ' \ 'preprocessing, please specify both ' \ '`mean` and `std`.' # Enable the normalization in preprocessing. self._enable_normalize = True if self.format_shape == 'NCHW': normalizer_shape = (-1, 1, 1) elif self.format_shape == 'NCTHW': normalizer_shape = (-1, 1, 1, 1) else: raise ValueError(f'Invalid format shape: {format_shape}') self.register_buffer( 'mean', torch.tensor(mean, dtype=torch.float32).view(normalizer_shape), False) self.register_buffer( 'std', torch.tensor(std, dtype=torch.float32).view(normalizer_shape), False) else: self._enable_normalize = False
[docs] def forward( self, data: dict, training: bool = False ) -> Tuple[List[torch.Tensor], Optional[list]]: """Performs normalization、padding and bgr2rgb conversion based on ``BaseDataPreprocessor``. Args: data (dict): data sampled from dataloader. training (bool): Whether to enable training time augmentation. If subclasses override this method, they can perform different preprocessing strategies for training and testing based on the value of ``training``. Returns: Tuple[List[torch.Tensor], Optional[list]]: Data in the same format as the model input. """ data = [val for _, val in data.items()] batch_inputs, batch_data_samples = self.cast_data(data) if isinstance(batch_inputs, list): # channel transform if self.to_rgb: if self.format_shape == 'NCHW': batch_inputs = [ _input[..., [2, 1, 0], :, :] for _input in batch_inputs ] elif self.format_shape == 'NCTHW': batch_inputs = [ _input[..., [2, 1, 0], :, :, :] for _input in batch_inputs ] else: raise ValueError( f'Invalid format shape: {self.format_shape}') # convert to float after channel conversion to ensure efficiency batch_inputs = [_input.float() for _input in batch_inputs] # normalization if self._enable_normalize: batch_inputs = [(_input - self.mean) / self.std for _input in batch_inputs] else: # channel transform if self.to_rgb: if self.format_shape == 'NCHW': batch_inputs = batch_inputs[..., [2, 1, 0], :, :] elif self.format_shape == 'NCTHW': batch_inputs = batch_inputs[..., [2, 1, 0], :, :, :] else: raise ValueError( f'Invalid format shape: {self.format_shape}') # convert to float after channel conversion to ensure efficiency batch_inputs = batch_inputs.float() # normalization if self._enable_normalize: batch_inputs = (batch_inputs - self.mean) / self.std return {'inputs': batch_inputs, 'data_samples': batch_data_samples}
@MODELS.register_module() class MultiModalDataPreprocessor(BaseDataPreprocessor): """Data pre-processor for image-text multimodality tasks. It provides the data pre-processing as follows - Collate and move data to the target device. - Pad inputs to the maximum size of current batch with defined ``pad_value``. The padding size can be divisible by a defined ``pad_size_divisor`` - Stack inputs to batch_inputs. - Convert inputs from bgr to rgb if the shape of input is (3, H, W). - Normalize image with defined std and mean. Args: mean (Sequence[Number], optional): The pixel mean of R, G, B channels. Defaults to None. std (Sequence[Number], optional): The pixel standard deviation of R, G, B channels. Defaults to None. pad_size_divisor (int): The size of padded image should be divisible by ``pad_size_divisor``. Defaults to 1. pad_value (Number): The padded pixel value. Defaults to 0. to_rgb (bool): whether to convert image from BGR to RGB. Defaults to False. """ def __init__( self, mean: Sequence[Number] = None, std: Sequence[Number] = None, pad_size_divisor: int = 1, pad_value: Number = 0, to_rgb: bool = False, ): super().__init__() self.pad_size_divisor = pad_size_divisor self.pad_value = pad_value self.to_rgb = to_rgb if mean is not None: assert std is not None, 'To enable the normalization in ' \ 'preprocessing, please specify both `mean` and `std`.' # Enable the normalization in preprocessing. self._enable_normalize = True self.register_buffer('mean', torch.tensor(mean).view(-1, 1, 1), False) self.register_buffer('std', torch.tensor(std).view(-1, 1, 1), False) else: self._enable_normalize = False def forward(self, data: dict, training: bool = False) -> dict: """Perform normalization, padding, bgr2rgb conversion and batch augmentation based on ``BaseDataPreprocessor``. Args: data (dict): data sampled from dataloader. training (bool): Whether to enable training time augmentation. Returns: dict: Data in the same format as the model input. """ data = self.cast_data(data) imgs = data.get('inputs', None) def _process_img(img): # ------ To RGB ------ if self.to_rgb and img.size(1) == 3: img = img.flip(1) # -- Normalization --- img = img.float() if self._enable_normalize: img = (img - self.mean) / self.std # ------ Padding ----- if self.pad_size_divisor > 1: h, w = img.shape[-2:] target_h = math.ceil( h / self.pad_size_divisor) * self.pad_size_divisor target_w = math.ceil( w / self.pad_size_divisor) * self.pad_size_divisor pad_h = target_h - h pad_w = target_w - w img = F.pad(img, (0, pad_w, 0, pad_h), 'constant', self.pad_value) return img if isinstance(imgs, torch.Tensor): imgs = _process_img(imgs) elif isinstance(imgs, Sequence): # B, T, C, H, W imgs = torch.stack([_process_img(img) for img in imgs], dim=1) elif imgs is not None: raise ValueError(f'{type(imgs)} is not supported for imgs inputs.') data_samples = data.get('data_samples', None) return {'images': imgs, 'data_samples': data_samples}
Read the Docs v: latest
Versions
latest
stable
mmcls-1.x
mmcls-0.x
dev
Downloads
epub
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.