Note
You are reading the documentation for MMSelfSup 0.x, which will be deprecated by the end of 2022. We recommend upgrading to the MMSelfSup 1.0.0rc versions to benefit from the new features and better performance brought by OpenMMLab 2.0. Check out the changelog, code and documentation of MMSelfSup 1.0.0rc for more details.
Source code for mmselfsup.datasets.data_sources.imagenet_21k
# Copyright (c) OpenMMLab. All rights reserved.
import os
import warnings
import numpy as np
from mmcv.utils import scandir
from ..builder import DATASOURCES
from .base import BaseDataSource
from .imagenet import find_folders
@DATASOURCES.register_module()
class ImageNet21k(BaseDataSource):
    """ImageNet21k Dataset.

    Since the dataset ImageNet21k is extremely big, contains 21k+ classes and
    1.4B files, this class has improved the following points on the basis of
    the class ``ImageNet``, in order to save memory usage and time required:

        - Delete the samples attribute
        - using 'slots' create a Data_item to replace dict
        - Modify setting ``info`` dict from function ``load_annotations`` to
          function ``prepare_data``
        - using int instead of np.array(..., np.int64)

    Note:
        In this implementation the annotations are kept as a NumPy array of
        fixed-width byte strings holding the image paths only. Labels found
        in an annotation file are parsed but not stored.

    Args:
        data_prefix (str): the prefix of data path
        classes: forwarded unchanged to ``BaseDataSource.__init__``
            (NOTE(review): exact accepted types are defined by the base
            class - confirm there).
        ann_file (str | None): the annotation file. When ann_file is str,
            the subclass is expected to read from the ann_file. When ann_file
            is None, the subclass is expected to read according to data_prefix
        multi_label (bool): use multi label or not. Only ``False`` is
            supported; a truthy value raises ``NotImplementedError``.
        recursion_subdir (bool): whether to also collect pictures from the
            sub-directories under each category directory.
        test_mode (bool): in train mode or test mode
    """

    IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
                      '.JPEG', '.JPG')
    CLASSES = None

    # Minimum byte width of the stored paths. 36 is the width this class has
    # always used, so inputs that fit keep producing an ``S36`` array.
    _MIN_PATH_WIDTH = 36

    def __init__(self,
                 data_prefix,
                 classes=None,
                 ann_file=None,
                 multi_label=False,
                 recursion_subdir=False,
                 test_mode=False):
        # Must be set before the base initializer runs, because the directory
        # scan in ``_find_allowed_files`` reads it.
        self.recursion_subdir = recursion_subdir
        if multi_label:
            raise NotImplementedError('Multi_label have not be implemented.')
        # NOTE: the attribute name is misspelled ("lable"); it is kept as-is
        # so that any external code reading it keeps working.
        self.multi_lable = multi_label
        super(ImageNet21k, self).__init__(data_prefix, classes, ann_file,
                                          test_mode)

    @staticmethod
    def _to_path_array(paths):
        """Pack a list of path strings into a fixed-width byte-string array.

        The item width is the length of the longest path, but never less than
        ``_MIN_PATH_WIDTH``, so long paths are no longer silently truncated.

        Args:
            paths (list[str]): image paths to store.

        Returns:
            np.ndarray: 1-D array with dtype ``S<width>``.
        """
        longest = max(map(len, paths), default=0)
        width = max(longest, ImageNet21k._MIN_PATH_WIDTH)
        return np.array(paths, dtype=f'S{width}')

    def load_annotations(self):
        """Load dataset annotations.

        Reads from ``self.ann_file`` when it is a string, otherwise scans
        ``self.data_prefix`` when ``self.ann_file`` is None.

        Returns:
            np.ndarray: array of byte-string image paths.

        Raises:
            TypeError: if ``self.ann_file`` is neither a str nor None.
            RuntimeError: if no image path was found.
        """
        if self.ann_file is None:
            data_infos = self._load_annotations_from_dir()
        elif isinstance(self.ann_file, str):
            data_infos = self._load_annotations_from_file()
        else:
            raise TypeError('ann_file must be a str or None')

        # ``data_infos`` is a NumPy array, so test its length explicitly
        # rather than relying on truthiness.
        if len(data_infos) == 0:
            source = self.ann_file if self.ann_file else self.data_prefix
            msg = f'Found no valid file in {source}. '
            msg += 'Supported extensions are: ' + \
                ', '.join(self.IMG_EXTENSIONS)
            raise RuntimeError(msg)

        return data_infos

    def _find_allowed_files(self, root, folder_name):
        """Find all the allowed files in a folder, including sub folders if
        ``recursion_subdir`` is true.

        Args:
            root (str): directory that contains ``folder_name``.
            folder_name (str): name of the category folder to scan.

        Returns:
            list[str]: paths of the matching files, each prefixed with
            ``folder_name``.
        """
        _dir = os.path.join(root, folder_name)
        return [
            os.path.join(folder_name, path)
            for path in scandir(_dir, self.IMG_EXTENSIONS,
                                self.recursion_subdir)
        ]

    def _load_annotations_from_dir(self):
        """Load annotations by scanning the ``self.data_prefix`` directory.

        A warning listing every folder that holds no supported image is
        emitted once the scan is finished.

        Returns:
            np.ndarray: array of byte-string image paths.
        """
        data_infos, empty_classes = [], []
        folder_to_idx = find_folders(self.data_prefix)
        root = os.path.expanduser(self.data_prefix)
        for folder_name in folder_to_idx:
            infos_per_class = self._find_allowed_files(root, folder_name)
            if not infos_per_class:
                empty_classes.append(folder_name)
            data_infos.extend(infos_per_class)

        if empty_classes:
            msg = 'Found no valid file for the classes ' + \
                f"{', '.join(sorted(empty_classes))} "
            msg += 'Supported extensions are: ' + \
                f"{', '.join(self.IMG_EXTENSIONS)}."
            warnings.warn(msg)

        return self._to_path_array(data_infos)

    def _load_annotations_from_file(self):
        """Load annotations from ``self.ann_file``.

        Every non-blank line must have the form ``<filepath> <label>``, split
        on the last space. Only the file path is kept.

        Returns:
            np.ndarray: array of byte-string image paths.

        Raises:
            ValueError: if a non-blank line has no space separator.
        """
        data_infos = []
        with open(self.ann_file) as f:
            for line_no, raw_line in enumerate(f, start=1):
                # Strip before testing for emptiness: raw lines still carry
                # their trailing newline, so they never compare equal to ''.
                line = raw_line.strip()
                if not line:
                    continue
                parts = line.rsplit(' ', 1)
                if len(parts) != 2:
                    raise ValueError(
                        f'Invalid annotation at {self.ann_file}:{line_no}, '
                        f"expected '<filepath> <label>' but got {line!r}")
                # The label is parsed off but intentionally not stored.
                filepath, _gt_label = parts
                data_infos.append(filepath)

        return self._to_path_array(data_infos)