From 1f3aad86c1151ee24f3a7d4f9833bbb0dc04b2f8 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Sat, 12 Nov 2022 19:32:41 +0530 Subject: [PATCH] new check_dataset functions (#43) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/ci.yaml | 2 +- .../yolo/data/datasets/coco128-seg.yaml | 101 ++++++++++++++++++ ultralytics/yolo/data/datasets/coco128.yaml | 101 ++++++++++++++++++ ultralytics/yolo/data/utils.py | 97 +++++++++++++++++ ultralytics/yolo/engine/trainer.py | 16 ++- ultralytics/yolo/utils/__init__.py | 2 +- ultralytics/yolo/utils/checks.py | 7 +- ultralytics/yolo/utils/files.py | 17 +++ ultralytics/yolo/utils/modeling/__init__.py | 6 -- .../yolo/utils/modeling/autobackend.py | 5 +- ultralytics/yolo/v8/classify/train.py | 20 ---- ultralytics/yolo/v8/segment/train.py | 22 +--- ultralytics/yolo/v8/segment/val.py | 2 +- 13 files changed, 336 insertions(+), 62 deletions(-) create mode 100644 ultralytics/yolo/data/datasets/coco128-seg.yaml create mode 100644 ultralytics/yolo/data/datasets/coco128.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c9d041f..581a7d0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -94,7 +94,7 @@ jobs: - name: Test segmentation shell: bash # for Windows compatibility run: | - python ultralytics/yolo/v8/segment/train.py cfg=yolov5n-seg.yaml data=coco128-segments epochs=1 img_size=64 + python ultralytics/yolo/v8/segment/train.py cfg=yolov5n-seg.yaml data=coco128-seg.yaml epochs=1 img_size=64 - name: Test classification shell: bash # for Windows compatibility run: | diff --git a/ultralytics/yolo/data/datasets/coco128-seg.yaml b/ultralytics/yolo/data/datasets/coco128-seg.yaml new file mode 100644 index 0000000..acde4c1 --- /dev/null +++ b/ultralytics/yolo/data/datasets/coco128-seg.yaml @@ -0,0 +1,101 @@ +# YOLOv5 🚀 by Ultralytics, GPL-3.0 license +# COCO128-seg dataset https://www.kaggle.com/ultralytics/coco128 (first 128 
images from COCO train2017) by Ultralytics +# Example usage: python train.py --data coco128-seg.yaml +# parent +# ├── yolov5 +# └── datasets +# └── coco128-seg ← downloads here (7 MB) + + +# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] +path: ../datasets/coco128-seg # dataset root dir +train: images/train2017 # train images (relative to 'path') 128 images +val: images/train2017 # val images (relative to 'path') 128 images +test: # test images (optional) + +# Classes +names: + 0: person + 1: bicycle + 2: car + 3: motorcycle + 4: airplane + 5: bus + 6: train + 7: truck + 8: boat + 9: traffic light + 10: fire hydrant + 11: stop sign + 12: parking meter + 13: bench + 14: bird + 15: cat + 16: dog + 17: horse + 18: sheep + 19: cow + 20: elephant + 21: bear + 22: zebra + 23: giraffe + 24: backpack + 25: umbrella + 26: handbag + 27: tie + 28: suitcase + 29: frisbee + 30: skis + 31: snowboard + 32: sports ball + 33: kite + 34: baseball bat + 35: baseball glove + 36: skateboard + 37: surfboard + 38: tennis racket + 39: bottle + 40: wine glass + 41: cup + 42: fork + 43: knife + 44: spoon + 45: bowl + 46: banana + 47: apple + 48: sandwich + 49: orange + 50: broccoli + 51: carrot + 52: hot dog + 53: pizza + 54: donut + 55: cake + 56: chair + 57: couch + 58: potted plant + 59: bed + 60: dining table + 61: toilet + 62: tv + 63: laptop + 64: mouse + 65: remote + 66: keyboard + 67: cell phone + 68: microwave + 69: oven + 70: toaster + 71: sink + 72: refrigerator + 73: book + 74: clock + 75: vase + 76: scissors + 77: teddy bear + 78: hair drier + 79: toothbrush + + +# Download script/URL (optional) +download: https://ultralytics.com/assets/coco128-seg.zip \ No newline at end of file diff --git a/ultralytics/yolo/data/datasets/coco128.yaml b/ultralytics/yolo/data/datasets/coco128.yaml new file mode 100644 index 0000000..6ab5425 --- /dev/null +++ b/ultralytics/yolo/data/datasets/coco128.yaml @@ -0,0 +1,101 @@ +#
YOLOv5 🚀 by Ultralytics, GPL-3.0 license +# COCO128 dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017) by Ultralytics +# Example usage: python train.py --data coco128.yaml +# parent +# ├── yolov5 +# └── datasets +# └── coco128 ← downloads here (7 MB) + + +# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] +path: ../datasets/coco128 # dataset root dir +train: images/train2017 # train images (relative to 'path') 128 images +val: images/train2017 # val images (relative to 'path') 128 images +test: # test images (optional) + +# Classes +names: + 0: person + 1: bicycle + 2: car + 3: motorcycle + 4: airplane + 5: bus + 6: train + 7: truck + 8: boat + 9: traffic light + 10: fire hydrant + 11: stop sign + 12: parking meter + 13: bench + 14: bird + 15: cat + 16: dog + 17: horse + 18: sheep + 19: cow + 20: elephant + 21: bear + 22: zebra + 23: giraffe + 24: backpack + 25: umbrella + 26: handbag + 27: tie + 28: suitcase + 29: frisbee + 30: skis + 31: snowboard + 32: sports ball + 33: kite + 34: baseball bat + 35: baseball glove + 36: skateboard + 37: surfboard + 38: tennis racket + 39: bottle + 40: wine glass + 41: cup + 42: fork + 43: knife + 44: spoon + 45: bowl + 46: banana + 47: apple + 48: sandwich + 49: orange + 50: broccoli + 51: carrot + 52: hot dog + 53: pizza + 54: donut + 55: cake + 56: chair + 57: couch + 58: potted plant + 59: bed + 60: dining table + 61: toilet + 62: tv + 63: laptop + 64: mouse + 65: remote + 66: keyboard + 67: cell phone + 68: microwave + 69: oven + 70: toaster + 71: sink + 72: refrigerator + 73: book + 74: clock + 75: vase + 76: scissors + 77: teddy bear + 78: hair drier + 79: toothbrush + + +# Download script/URL (optional) +download: https://ultralytics.com/assets/coco128.zip \ No newline at end of file diff --git a/ultralytics/yolo/data/utils.py b/ultralytics/yolo/data/utils.py index 63d5962..7c259cd 100644 --- 
a/ultralytics/yolo/data/utils.py +++ b/ultralytics/yolo/data/utils.py @@ -1,11 +1,22 @@ import contextlib import hashlib import os +import subprocess +import time +from pathlib import Path +from tarfile import is_tarfile +from zipfile import is_zipfile import cv2 import numpy as np +import torch from PIL import ExifTags, Image, ImageOps +from ultralytics.yolo.utils import LOGGER, ROOT, colorstr +from ultralytics.yolo.utils.checks import check_file, check_font, is_ascii +from ultralytics.yolo.utils.downloads import download +from ultralytics.yolo.utils.files import unzip_file, yaml_load + from ..utils.ops import segments2boxes HELP_URL = "See https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data" @@ -176,3 +187,89 @@ def polygons2masks_overlap(img_size, segments, downsample_ratio=1): masks = masks + mask masks = np.clip(masks, a_min=0, a_max=i + 1) return masks, index + + +def check_dataset_yaml(data, autodownload=True): + # Download, check and/or unzip dataset if not found locally + data = check_file(data) + DATASETS_DIR = Path.cwd() / "../datasets" + # Download (optional) + extract_dir = '' + if isinstance(data, (str, Path)) and (is_zipfile(data) or is_tarfile(data)): + download(data, dir=f'{DATASETS_DIR}/{Path(data).stem}', unzip=True, delete=False, curl=False, threads=1) + data = next((DATASETS_DIR / Path(data).stem).rglob('*.yaml')) + extract_dir, autodownload = data.parent, False + # Read yaml (optional) + if isinstance(data, (str, Path)): + data = yaml_load(data) # dictionary + + # Checks + for k in 'train', 'val', 'names': + assert k in data, f"data.yaml '{k}:' field missing ❌" + if isinstance(data['names'], (list, tuple)): # old array format + data['names'] = dict(enumerate(data['names'])) # convert to dict + data['nc'] = len(data['names']) + + # Resolve paths + path = Path(extract_dir or data.get('path') or '') # optional 'path' default to '.' 
+ if not path.is_absolute(): + path = (Path.cwd() / path).resolve() + data['path'] = path # download scripts + for k in 'train', 'val', 'test': + if data.get(k): # prepend path + if isinstance(data[k], str): + x = (path / data[k]).resolve() + if not x.exists() and data[k].startswith('../'): + x = (path / data[k][3:]).resolve() + data[k] = str(x) + else: + data[k] = [str((path / x).resolve()) for x in data[k]] + + # Parse yaml + train, val, test, s = (data.get(x) for x in ('train', 'val', 'test', 'download')) + if val: + val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])] # val path + if not all(x.exists() for x in val): + LOGGER.info('\nDataset not found ⚠️, missing paths %s' % [str(x) for x in val if not x.exists()]) + if not s or not autodownload: + raise Exception('Dataset not found ❌') + t = time.time() + if s.startswith('http') and s.endswith('.zip'): # URL + f = Path(s).name # filename + LOGGER.info(f'Downloading {s} to {f}...') + torch.hub.download_url_to_file(s, f) + Path(DATASETS_DIR).mkdir(parents=True, exist_ok=True) # create root + unzip_file(f, path=DATASETS_DIR) # unzip + Path(f).unlink() # remove zip + r = None # success + elif s.startswith('bash '): # bash script + LOGGER.info(f'Running {s} ...') + r = os.system(s) + else: # python script + r = exec(s, {'yaml': data}) # return None + dt = f'({round(time.time() - t, 1)}s)' + s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in (0, None) else f"failure {dt} ❌" + LOGGER.info(f"Dataset download {s}") + check_font('Arial.ttf' if is_ascii(data['names']) else 'Arial.Unicode.ttf', progress=True) # download fonts + return data # dictionary + + +def check_dataset(dataset: str): + data = Path.cwd() / "datasets" / dataset + data_dir = data if data.is_dir() else (Path.cwd() / data) + if not data_dir.is_dir(): + LOGGER.info(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...') + t = time.time() + if str(data) == 'imagenet': + subprocess.run(f"bash 
{ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True) + else: + url = f'https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip' + download(url, dir=data_dir.parent) + s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {colorstr('bold', data_dir)}\n" + LOGGER.info(s) + train_set = data_dir / "train" + test_set = data_dir / 'test' if (data_dir / 'test').exists() else data_dir / 'val' # data/test or data/val + nc = len([x for x in (data_dir / 'train').glob('*') if x.is_dir()]) # number of classes + names = [name for name in os.listdir(data_dir / 'train') if os.path.isdir(data_dir / 'train' / name)] + data = {"train": train_set, "val": test_set, "nc": nc, "names": names} + return data diff --git a/ultralytics/yolo/engine/trainer.py b/ultralytics/yolo/engine/trainer.py index 1191e75..758f227 100644 --- a/ultralytics/yolo/engine/trainer.py +++ b/ultralytics/yolo/engine/trainer.py @@ -24,7 +24,9 @@ from tqdm import tqdm import ultralytics.yolo.utils as utils import ultralytics.yolo.utils.loggers as loggers +from ultralytics.yolo.data.utils import check_dataset, check_dataset_yaml from ultralytics.yolo.utils import LOGGER, ROOT +from ultralytics.yolo.utils.checks import check_file, check_yaml from ultralytics.yolo.utils.files import increment_path, save_yaml from ultralytics.yolo.utils.modeling import get_model @@ -55,9 +57,14 @@ class BaseTrainer: self.scaler = amp.GradScaler(enabled=self.device.type != 'cpu') # Model and Dataloaders. 
- self.trainset, self.testset = self.get_dataset(self.args.data) + self.data = self.args.data + if self.data.endswith(".yaml"): + self.data = check_dataset_yaml(self.data) + else: + self.data = check_dataset(self.data) + self.trainset, self.testset = self.get_dataset(self.data) if self.args.cfg is not None: - self.model = self.load_cfg(self.args.cfg) + self.model = self.load_cfg(check_file(self.args.cfg)) if self.args.model is not None: self.model = self.get_model(self.args.model, self.args.pretrained).to(self.device) @@ -250,10 +257,9 @@ class BaseTrainer: def get_dataset(self, data): """ - Download the dataset if needed and verify it. - Returns train and val split datasets + Return the train and val paths from the data dict. Raises KeyError if either key is missing """ - pass + return data["train"], data["val"] def get_model(self, model, pretrained): """ diff --git a/ultralytics/yolo/utils/__init__.py b/ultralytics/yolo/utils/__init__.py index cbe112f..fc10ad9 100644 --- a/ultralytics/yolo/utils/__init__.py +++ b/ultralytics/yolo/utils/__init__.py @@ -7,7 +7,7 @@ from pathlib import Path # Constants FILE = Path(__file__).resolve() -ROOT = FILE.parents[2] # YOLOv5 root directory +ROOT = FILE.parents[2] # YOLO RANK = int(os.getenv('RANK', -1)) DATASETS_DIR = ROOT.parent / 'datasets' # YOLOv5 datasets directory NUM_THREADS = min(8, max(1, os.cpu_count() - 1)) # number of YOLOv5 multiprocessing threads diff --git a/ultralytics/yolo/utils/checks.py b/ultralytics/yolo/utils/checks.py index b4a9eb1..6eab17b 100644 --- a/ultralytics/yolo/utils/checks.py +++ b/ultralytics/yolo/utils/checks.py @@ -116,13 +116,10 @@ def check_file(file, suffix=''): torch.hub.download_url_to_file(url, file) assert Path(file).exists() and Path(file).stat().st_size > 0, f'File download failed: {url}' # check return file - elif file.startswith('clearml://'): # ClearML Dataset ID - assert 'clearml' in sys.modules, "Can not use ClearML dataset.
Run 'pip install clearml' to install" - return file else: # search files = [] - for d in 'data', 'models', 'utils': # search directories - files.extend(glob.glob(str(ROOT / d / '**' / file), recursive=True)) # find file + for d in 'data', 'v8', 'utils': # search directories + files.extend(glob.glob(str(ROOT / "yolo" / d / '**' / file), recursive=True)) # find file assert len(files), f'File not found: {file}' # assert file was found assert len(files) == 1, f"Multiple files match '{file}', specify exact path: {files}" # assert unique return files[0] # return file diff --git a/ultralytics/yolo/utils/files.py b/ultralytics/yolo/utils/files.py index b84773d..c8f5385 100644 --- a/ultralytics/yolo/utils/files.py +++ b/ultralytics/yolo/utils/files.py @@ -1,6 +1,7 @@ import contextlib import os from pathlib import Path +from zipfile import ZipFile import yaml @@ -44,3 +45,19 @@ def save_yaml(file='data.yaml', data=None): # Single-line safe yaml saving with open(file, 'w') as f: yaml.safe_dump({k: str(v) if isinstance(v, Path) else v for k, v in data.items()}, f, sort_keys=False) + + +def yaml_load(file='data.yaml'): + # Single-line safe yaml loading + with open(file, errors='ignore') as f: + return yaml.safe_load(f) + + +def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX')): + # Unzip a *.zip file to path/, excluding files containing strings in exclude list + if path is None: + path = Path(file).parent # default path + with ZipFile(file) as zipObj: + for f in zipObj.namelist(): # list all archived filenames in the zip + if all(x not in f for x in exclude): + zipObj.extract(f, path=path) diff --git a/ultralytics/yolo/utils/modeling/__init__.py b/ultralytics/yolo/utils/modeling/__init__.py index 35030bb..cdb5ad8 100644 --- a/ultralytics/yolo/utils/modeling/__init__.py +++ b/ultralytics/yolo/utils/modeling/__init__.py @@ -118,9 +118,3 @@ def get_model(model='s.pt', pretrained=True): return torchvision.models.__dict__[model](weights='IMAGENET1K_V1' if pretrained 
else None) else: # Ultralytics assets return torch.load(attempt_download(f"{model}.pt"), map_location='cpu') - - -def yaml_load(file='data.yaml'): - # Single-line safe yaml loading - with open(file, errors='ignore') as f: - return yaml.safe_load(f) diff --git a/ultralytics/yolo/utils/modeling/autobackend.py b/ultralytics/yolo/utils/modeling/autobackend.py index 8d13da1..4a11a90 100644 --- a/ultralytics/yolo/utils/modeling/autobackend.py +++ b/ultralytics/yolo/utils/modeling/autobackend.py @@ -32,7 +32,8 @@ class AutoBackend(nn.Module): # TensorFlow Lite: *.tflite # TensorFlow Edge TPU: *_edgetpu.tflite # PaddlePaddle: *_paddle_model - from ultralytics.yolo.utils.modeling import attempt_load_weights, yaml_load + from ultralytics.yolo.utils.files import yaml_load + from ultralytics.yolo.utils.modeling import attempt_load_weights super().__init__() w = str(weights[0] if isinstance(weights, list) else weights) @@ -315,7 +316,7 @@ class AutoBackend(nn.Module): @staticmethod def _load_metadata(f=Path('path/to/meta.yaml')): - from ultralytics.yolo.utils.modeling import yaml_load + from ultralytics.yolo.utils.files import yaml_load # Load metadata from meta.yaml if it exists if f.exists(): diff --git a/ultralytics/yolo/v8/classify/train.py b/ultralytics/yolo/v8/classify/train.py index 348d01f..4027d87 100644 --- a/ultralytics/yolo/v8/classify/train.py +++ b/ultralytics/yolo/v8/classify/train.py @@ -17,26 +17,6 @@ from ultralytics.yolo.utils.torch_utils import LOCAL_RANK, torch_distributed_zer # BaseTrainer python usage class ClassificationTrainer(BaseTrainer): - def get_dataset(self, dataset): - # temporary solution. 
Replace with new ultralytics.yolo.ClassificationDataset module - data = Path("datasets") / dataset - with torch_distributed_zero_first(LOCAL_RANK), WorkingDirectory(Path.cwd()): - data_dir = data if data.is_dir() else (Path.cwd() / data) - if not data_dir.is_dir(): - self.console.info(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...') - t = time.time() - if str(data) == 'imagenet': - subprocess.run(f"bash {v8.ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True) - else: - url = f'https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip' - download(url, dir=data_dir.parent) - s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {colorstr('bold', data_dir)}\n" - self.console.info(s) - train_set = data_dir / "train" - test_set = data_dir / 'test' if (data_dir / 'test').exists() else data_dir / 'val' # data/test or data/val - - return train_set, test_set - def get_dataloader(self, dataset_path, batch_size=None, rank=0): return build_classification_dataloader(path=dataset_path, imgsz=self.args.img_size, diff --git a/ultralytics/yolo/v8/segment/train.py b/ultralytics/yolo/v8/segment/train.py index 5bc0137..548bae6 100644 --- a/ultralytics/yolo/v8/segment/train.py +++ b/ultralytics/yolo/v8/segment/train.py @@ -21,26 +21,6 @@ from ultralytics.yolo.utils.torch_utils import LOCAL_RANK, de_parallel, torch_di # BaseTrainer python usage class SegmentationTrainer(BaseTrainer): - def get_dataset(self, dataset): - # temporary solution. 
Replace with new ultralytics.yolo.ClassificationDataset module - data = Path("datasets") / dataset - with torch_distributed_zero_first(LOCAL_RANK), WorkingDirectory(Path.cwd()): - data_dir = data if data.is_dir() else (Path.cwd() / data) - if not data_dir.is_dir(): - self.console.info(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...') - t = time.time() - if str(data) == 'imagenet': - subprocess.run(f"bash {v8.ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True) - else: - url = f'https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip' - download(url, dir=data_dir.parent) - # TODO: add colorstr - s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {'bold', data_dir}\n" - self.console.info(s) - train_set = data_dir.parent / "coco128-seg" - test_set = train_set - return train_set, test_set - def get_dataloader(self, dataset_path, batch_size, rank=0): # TODO: manage splits differently # calculate stride - check if model is initialized @@ -253,7 +233,7 @@ class SegmentationTrainer(BaseTrainer): @hydra.main(version_base=None, config_path=DEFAULT_CONFIG.parent, config_name=DEFAULT_CONFIG.name) def train(cfg): cfg.cfg = v8.ROOT / "models/yolov5n-seg.yaml" - cfg.data = cfg.data or "coco128-segments" # or yolo.ClassificationDataset("mnist") + cfg.data = cfg.data or "coco128-seg.yaml" # or yolo.ClassificationDataset("mnist") trainer = SegmentationTrainer(cfg) trainer.train() diff --git a/ultralytics/yolo/v8/segment/val.py b/ultralytics/yolo/v8/segment/val.py index 0078e76..c09d730 100644 --- a/ultralytics/yolo/v8/segment/val.py +++ b/ultralytics/yolo/v8/segment/val.py @@ -8,9 +8,9 @@ import torch.nn.functional as F from ultralytics.yolo.engine.validator import BaseValidator from ultralytics.yolo.utils import ops from ultralytics.yolo.utils.checks import check_requirements +from ultralytics.yolo.utils.files import yaml_load from ultralytics.yolo.utils.metrics import (ConfusionMatrix, Metrics, 
ap_per_class_box_and_mask, box_iou, fitness_segmentation, mask_iou) -from ultralytics.yolo.utils.modeling import yaml_load from ultralytics.yolo.utils.torch_utils import de_parallel