ultralytics 8.0.44 export and task fixes (#1088)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Mehran Ghandehari <mehran.maps@gmail.com>
Co-authored-by: Laughing <61612323+Laughing-q@users.noreply.github.com>
This commit is contained in:
Glenn Jocher
2023-02-24 03:11:25 +01:00
committed by GitHub
parent fe61018975
commit 3ea659411b
32 changed files with 439 additions and 480 deletions

View File

@ -6,7 +6,6 @@ Dataloaders and dataset utils
import contextlib
import glob
import hashlib
import json
import math
import os
import random
@ -27,11 +26,9 @@ from PIL import ExifTags, Image, ImageOps
from torch.utils.data import DataLoader, Dataset, dataloader, distributed
from tqdm import tqdm
from ultralytics.yolo.data.utils import check_det_dataset
from ultralytics.yolo.utils import (DATASETS_DIR, LOGGER, NUM_THREADS, TQDM_BAR_FORMAT, is_colab, is_dir_writeable,
is_kaggle, yaml_load)
from ultralytics.yolo.utils.checks import check_requirements, check_yaml
from ultralytics.yolo.utils.downloads import unzip_file
is_kaggle)
from ultralytics.yolo.utils.checks import check_requirements
from ultralytics.yolo.utils.ops import clean_str, segments2boxes, xyn2xy, xywh2xyxy, xywhn2xyxy, xyxy2xywhn
from ultralytics.yolo.utils.torch_utils import torch_distributed_zero_first
@ -1037,127 +1034,6 @@ def verify_image_label(args):
return [None, None, None, None, nm, nf, ne, nc, msg]
class HUBDatasetStats():
""" Class for generating HUB dataset JSON and `-hub` dataset directory
Arguments
path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
autodownload: Attempt to download dataset if not found locally
Usage
from ultralytics.yolo.data.dataloaders.v5loader import HUBDatasetStats
stats = HUBDatasetStats('coco128.yaml', autodownload=True) # usage 1
stats = HUBDatasetStats('path/to/coco128.zip') # usage 2
stats.get_json(save=False)
stats.process_images()
"""
def __init__(self, path='coco128.yaml', autodownload=False):
# Initialize class
zipped, data_dir, yaml_path = self._unzip(Path(path))
# try:
# data = yaml_load(check_yaml(yaml_path)) # data dict
# if zipped:
# data['path'] = data_dir
# except Exception as e:
# raise Exception('error/HUB/dataset_stats/yaml_load') from e
data = check_det_dataset(yaml_path, autodownload) # download dataset if missing
self.hub_dir = Path(str(data['path']) + '-hub')
self.im_dir = self.hub_dir / 'images'
self.im_dir.mkdir(parents=True, exist_ok=True) # makes /images
self.stats = {'nc': data['nc'], 'names': list(data['names'].values())} # statistics dictionary
self.data = data
@staticmethod
def _find_yaml(dir):
# Return data.yaml file
files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml')) # try root level first and then recursive
assert files, f'No *.yaml file found in {dir}'
if len(files) > 1:
files = [f for f in files if f.stem == dir.stem] # prefer *.yaml files that match dir name
assert files, f'Multiple *.yaml files found in {dir}, only 1 *.yaml file allowed'
assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
return files[0]
def _unzip(self, path):
# Unzip data.zip
if not str(path).endswith('.zip'): # path is data.yaml
return False, None, path
assert Path(path).is_file(), f'Error unzipping {path}, file not found'
unzip_file(path, path=path.parent)
dir = path.with_suffix('') # dataset directory == zip name
assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
return True, str(dir), self._find_yaml(dir) # zipped, data_dir, yaml_path
def _hub_ops(self, f, max_dim=1920):
# HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
f_new = self.im_dir / Path(f).name # dataset-hub image filename
try: # use PIL
im = Image.open(f)
r = max_dim / max(im.height, im.width) # ratio
if r < 1.0: # image too large
im = im.resize((int(im.width * r), int(im.height * r)))
im.save(f_new, 'JPEG', quality=50, optimize=True) # save
except Exception as e: # use OpenCV
LOGGER.info(f'WARNING ⚠️ HUB ops PIL failure {f}: {e}')
im = cv2.imread(f)
im_height, im_width = im.shape[:2]
r = max_dim / max(im_height, im_width) # ratio
if r < 1.0: # image too large
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
cv2.imwrite(str(f_new), im)
def get_json(self, save=False, verbose=False):
# Return dataset JSON for Ultralytics HUB
def _round(labels):
# Update labels to integer class and 6 decimal place floats
return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
for split in 'train', 'val', 'test':
if self.data.get(split) is None:
self.stats[split] = None # i.e. no test set
continue
dataset = LoadImagesAndLabels(self.data[split]) # load dataset
x = np.array([
np.bincount(label[:, 0].astype(int), minlength=self.data['nc'])
for label in tqdm(dataset.labels, total=dataset.n, desc='Statistics')]) # shape(128x80)
self.stats[split] = {
'instance_stats': {
'total': int(x.sum()),
'per_class': x.sum(0).tolist()},
'image_stats': {
'total': dataset.n,
'unlabelled': int(np.all(x == 0, 1).sum()),
'per_class': (x > 0).sum(0).tolist()},
'labels': [{
str(Path(k).name): _round(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
# Save, print and return
if save:
stats_path = self.hub_dir / 'stats.json'
LOGGER.info(f'Saving {stats_path.resolve()}...')
with open(stats_path, 'w') as f:
json.dump(self.stats, f) # save stats.json
if verbose:
LOGGER.info(json.dumps(self.stats, indent=2, sort_keys=False))
return self.stats
def process_images(self):
# Compress images for Ultralytics HUB
for split in 'train', 'val', 'test':
if self.data.get(split) is None:
continue
dataset = LoadImagesAndLabels(self.data[split]) # load dataset
desc = f'{split} images'
total = dataset.n
with ThreadPool(NUM_THREADS) as pool:
for _ in tqdm(pool.imap(self._hub_ops, dataset.im_files), total=total, desc=desc):
pass
LOGGER.info(f'Done. All images saved to {self.im_dir}')
return self.im_dir
# Classification dataloaders -------------------------------------------------------------------------------------------
class ClassificationDataset(torchvision.datasets.ImageFolder):
"""

View File

@ -2,9 +2,11 @@
import contextlib
import hashlib
import json
import os
import subprocess
import time
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tarfile import is_tarfile
from zipfile import is_zipfile
@ -12,10 +14,11 @@ from zipfile import is_zipfile
import cv2
import numpy as np
from PIL import ExifTags, Image, ImageOps
from tqdm import tqdm
from ultralytics.yolo.utils import DATASETS_DIR, LOGGER, ROOT, colorstr, emojis, yaml_load
from ultralytics.yolo.utils import DATASETS_DIR, LOGGER, NUM_THREADS, ROOT, colorstr, emojis, yaml_load
from ultralytics.yolo.utils.checks import check_file, check_font, is_ascii
from ultralytics.yolo.utils.downloads import download, safe_download
from ultralytics.yolo.utils.downloads import download, safe_download, unzip_file
from ultralytics.yolo.utils.ops import segments2boxes
HELP_URL = 'See https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data'
@ -290,3 +293,128 @@ def check_cls_dataset(dataset: str):
names = [x.name for x in (data_dir / 'train').iterdir() if x.is_dir()] # class names list
names = dict(enumerate(sorted(names)))
return {'train': train_set, 'val': test_set, 'nc': nc, 'names': names}
class HUBDatasetStats():
""" Class for generating HUB dataset JSON and `-hub` dataset directory
Arguments
path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
autodownload: Attempt to download dataset if not found locally
Usage
from ultralytics.yolo.data.utils import HUBDatasetStats
stats = HUBDatasetStats('coco128.yaml', autodownload=True) # usage 1
stats = HUBDatasetStats('/Users/glennjocher/Downloads/coco6.zip') # usage 2
stats.get_json(save=False)
stats.process_images()
"""
def __init__(self, path='coco128.yaml', autodownload=False):
# Initialize class
zipped, data_dir, yaml_path = self._unzip(Path(path))
try:
# data = yaml_load(check_yaml(yaml_path)) # data dict
data = check_det_dataset(yaml_path, autodownload) # data dict
if zipped:
data['path'] = data_dir
except Exception as e:
raise Exception('error/HUB/dataset_stats/yaml_load') from e
self.hub_dir = Path(str(data['path']) + '-hub')
self.im_dir = self.hub_dir / 'images'
self.im_dir.mkdir(parents=True, exist_ok=True) # makes /images
self.stats = {'nc': len(data['names']), 'names': list(data['names'].values())} # statistics dictionary
self.data = data
@staticmethod
def _find_yaml(dir):
# Return data.yaml file
files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml')) # try root level first and then recursive
assert files, f'No *.yaml file found in {dir}'
if len(files) > 1:
files = [f for f in files if f.stem == dir.stem] # prefer *.yaml files that match dir name
assert files, f'Multiple *.yaml files found in {dir}, only 1 *.yaml file allowed'
assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
return files[0]
def _unzip(self, path):
# Unzip data.zip
if not str(path).endswith('.zip'): # path is data.yaml
return False, None, path
assert Path(path).is_file(), f'Error unzipping {path}, file not found'
unzip_file(path, path=path.parent)
dir = path.with_suffix('') # dataset directory == zip name
assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
return True, str(dir), self._find_yaml(dir) # zipped, data_dir, yaml_path
def _hub_ops(self, f, max_dim=1920):
# HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
f_new = self.im_dir / Path(f).name # dataset-hub image filename
try: # use PIL
im = Image.open(f)
r = max_dim / max(im.height, im.width) # ratio
if r < 1.0: # image too large
im = im.resize((int(im.width * r), int(im.height * r)))
im.save(f_new, 'JPEG', quality=50, optimize=True) # save
except Exception as e: # use OpenCV
LOGGER.info(f'WARNING ⚠️ HUB ops PIL failure {f}: {e}')
im = cv2.imread(f)
im_height, im_width = im.shape[:2]
r = max_dim / max(im_height, im_width) # ratio
if r < 1.0: # image too large
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
cv2.imwrite(str(f_new), im)
def get_json(self, save=False, verbose=False):
# Return dataset JSON for Ultralytics HUB
# from ultralytics.yolo.data import YOLODataset
from ultralytics.yolo.data.dataloaders.v5loader import LoadImagesAndLabels
def _round(labels):
# Update labels to integer class and 6 decimal place floats
return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
for split in 'train', 'val', 'test':
if self.data.get(split) is None:
self.stats[split] = None # i.e. no test set
continue
dataset = LoadImagesAndLabels(self.data[split]) # load dataset
x = np.array([
np.bincount(label[:, 0].astype(int), minlength=self.data['nc'])
for label in tqdm(dataset.labels, total=len(dataset), desc='Statistics')]) # shape(128x80)
self.stats[split] = {
'instance_stats': {
'total': int(x.sum()),
'per_class': x.sum(0).tolist()},
'image_stats': {
'total': len(dataset),
'unlabelled': int(np.all(x == 0, 1).sum()),
'per_class': (x > 0).sum(0).tolist()},
'labels': [{
str(Path(k).name): _round(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
# Save, print and return
if save:
stats_path = self.hub_dir / 'stats.json'
LOGGER.info(f'Saving {stats_path.resolve()}...')
with open(stats_path, 'w') as f:
json.dump(self.stats, f) # save stats.json
if verbose:
LOGGER.info(json.dumps(self.stats, indent=2, sort_keys=False))
return self.stats
def process_images(self):
# Compress images for Ultralytics HUB
# from ultralytics.yolo.data import YOLODataset
from ultralytics.yolo.data.dataloaders.v5loader import LoadImagesAndLabels
for split in 'train', 'val', 'test':
if self.data.get(split) is None:
continue
dataset = LoadImagesAndLabels(self.data[split]) # load dataset
with ThreadPool(NUM_THREADS) as pool:
for _ in tqdm(pool.imap(self._hub_ops, dataset.im_files), total=len(dataset), desc=f'{split} images'):
pass
LOGGER.info(f'Done. All images saved to {self.im_dir}')
return self.im_dir