From c751c7f88a477062864b69c49e6baa967b9973a6 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Sun, 6 Aug 2023 20:11:14 +0200
Subject: [PATCH] Enable Google Drive file downloads (#4196)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 ultralytics/cfg/datasets/Argoverse.yaml |   8 +-
 ultralytics/utils/downloads.py          | 123 +++++++++++++++++++++---
 2 files changed, 113 insertions(+), 18 deletions(-)

diff --git a/ultralytics/cfg/datasets/Argoverse.yaml b/ultralytics/cfg/datasets/Argoverse.yaml
index ddfdbb6..76255e4 100644
--- a/ultralytics/cfg/datasets/Argoverse.yaml
+++ b/ultralytics/cfg/datasets/Argoverse.yaml
@@ -4,7 +4,7 @@
 # parent
 # ├── ultralytics
 # └── datasets
-#     └── Argoverse  ← downloads here (31.3 GB)
+#     └── Argoverse  ← downloads here (31.5 GB)
 
 # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
@@ -61,13 +61,13 @@ download: |
               f.writelines(labels[k])
 
-  # Download
+  # Download 'https://argoverse-hd.s3.us-east-2.amazonaws.com/Argoverse-HD-Full.zip' (deprecated S3 link)
   dir = Path(yaml['path'])  # dataset root dir
-  urls = ['https://argoverse-hd.s3.us-east-2.amazonaws.com/Argoverse-HD-Full.zip']
+  urls = ['https://drive.google.com/file/d/1st9qW3BeIwQsnR0t8mRpvbsSWIo16ACi/view?usp=drive_link']
   download(urls, dir=dir)
 
   # Convert
   annotations_dir = 'Argoverse-HD/annotations/'
   (dir / 'Argoverse-1.1' / 'tracking').rename(dir / 'Argoverse-1.1' / 'images')  # rename 'tracking' to 'images'
   for d in "train.json", "val.json":
-      argoverse2yolo(dir / annotations_dir / d)  # convert VisDrone annotations to YOLO labels
+      argoverse2yolo(dir / annotations_dir / d)  # convert Argoverse annotations to YOLO labels
diff --git a/ultralytics/utils/downloads.py b/ultralytics/utils/downloads.py
index a1e0f22..e6d7489 100644
--- a/ultralytics/utils/downloads.py
+++ b/ultralytics/utils/downloads.py
@@ -1,13 +1,13 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
 import contextlib
+import re
 import shutil
 import subprocess
 from itertools import repeat
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 from urllib import parse, request
-from zipfile import BadZipFile, ZipFile, is_zipfile
 
 import requests
 import torch
@@ -39,7 +39,45 @@ def is_url(url, check=True):
     return False
 
 
-def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=False):
+def zip_directory(directory, compress=True, exclude=('.DS_Store', '__MACOSX'), progress=True):
+    """
+    Zips the contents of a directory, excluding files containing strings in the exclude list.
+    The resulting zip file is named after the directory and placed alongside it.
+
+    Args:
+        directory (str | Path): The path to the directory to be zipped.
+        compress (bool): Whether to compress the files while zipping. Default is True.
+        exclude (tuple, optional): A tuple of filename strings to be excluded. Defaults to ('.DS_Store', '__MACOSX').
+        progress (bool, optional): Whether to display a progress bar. Defaults to True.
+
+    Returns:
+        (Path): The path to the resulting zip file.
+
+    Example:
+        ```python
+        from ultralytics.utils.downloads import zip_directory
+
+        file = zip_directory('path/to/dir')
+        ```
+    """
+    from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile
+
+    directory = Path(directory)
+    if not directory.is_dir():
+        raise FileNotFoundError(f"Directory '{directory}' does not exist.")
+
+    # Zip with progress bar
+    files_to_zip = [f for f in directory.rglob('*') if f.is_file() and not any(x in f.name for x in exclude)]
+    zip_file = directory.with_suffix('.zip')
+    compression = ZIP_DEFLATED if compress else ZIP_STORED
+    with ZipFile(zip_file, 'w', compression) as f:
+        for file in tqdm(files_to_zip, desc=f'Zipping {directory} to {zip_file}...', unit='file', disable=not progress):
+            f.write(file, file.relative_to(directory))
+
+    return zip_file  # return path to zip file
+
+
+def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=False, progress=True):
     """
     Unzips a *.zip file to the specified path, excluding files containing strings in the exclude list.
 
@@ -52,13 +90,23 @@ def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=Fals
         path (str, optional): The path to extract the zipfile to. Defaults to None.
         exclude (tuple, optional): A tuple of filename strings to be excluded. Defaults to ('.DS_Store', '__MACOSX').
         exist_ok (bool, optional): Whether to overwrite existing contents if they exist. Defaults to False.
+        progress (bool, optional): Whether to display a progress bar. Defaults to True.
 
     Raises:
         BadZipFile: If the provided file does not exist or is not a valid zipfile.
 
     Returns:
         (Path): The path to the directory where the zipfile was extracted.
+
+    Example:
+        ```python
+        from ultralytics.utils.downloads import unzip_file
+
+        dir = unzip_file('path/to/file.zip')
+        ```
     """
+    from zipfile import BadZipFile, ZipFile, is_zipfile
+
     if not (Path(file).exists() and is_zipfile(file)):
         raise BadZipFile(f"File '{file}' does not exist or is a bad zip file.")
     if path is None:
@@ -66,10 +114,10 @@ def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=Fals
 
     # Unzip the file contents
     with ZipFile(file) as zipObj:
-        file_list = [f for f in zipObj.namelist() if all(x not in f for x in exclude)]
-        top_level_dirs = {Path(f).parts[0] for f in file_list}
+        files = [f for f in zipObj.namelist() if all(x not in f for x in exclude)]
+        top_level_dirs = {Path(f).parts[0] for f in files}
 
-        if len(top_level_dirs) > 1 or not file_list[0].endswith('/'):
+        if len(top_level_dirs) > 1 or not files[0].endswith('/'):
             path = Path(path) / Path(file).stem  # define new unzip directory
 
         # Check if destination directory already exists and contains files
@@ -79,7 +127,7 @@ def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=Fals
             LOGGER.info(f'Skipping {file} unzip (already unzipped)')
             return path
 
-        for f in file_list:
+        for f in tqdm(files, desc=f'Unzipping {file} to {Path(path).resolve()}...', unit='file', disable=not progress):
             zipObj.extract(f, path=path)
 
     return path  # return unzip dir
@@ -117,6 +165,48 @@ def check_disk_space(url='https://ultralytics.com/assets/coco128.zip', sf=1.5, h
     return True
 
 
+def get_google_drive_file_info(link):
+    """
+    Retrieves the direct download link and filename for a shareable Google Drive file link.
+
+    Args:
+        link (str): The shareable link of the Google Drive file.
+
+    Returns:
+        (str): Direct download URL for the Google Drive file.
+        (str): Original filename of the Google Drive file. If filename extraction fails, returns None.
+ + Example: + ```python + from ultralytics.utils.downloads import get_google_drive_file_info + + link = "https://drive.google.com/file/d/1cqT-cJgANNrhIHCrEufUYhQ4RqiWG_lJ/view?usp=drive_link" + url, filename = get_google_drive_file_info(link) + ``` + """ + file_id = link.split('/d/')[1].split('/view')[0] + drive_url = f'https://drive.google.com/uc?export=download&id={file_id}' + + # Start session + filename = None + with requests.Session() as session: + response = session.get(drive_url, stream=True) + if 'quota exceeded' in str(response.content.lower()): + raise ConnectionError( + emojis(f'❌ Google Drive file download quota exceeded. ' + f'Please try again later or download this file manually at {link}.')) + token = None + for key, value in response.cookies.items(): + if key.startswith('download_warning'): + token = value + if token: + drive_url = f'https://drive.google.com/uc?export=download&confirm={token}&id={file_id}' + cd = response.headers.get('content-disposition') + if cd: + filename = re.findall('filename="(.+)"', cd)[0] + return drive_url, filename + + def safe_download(url, file=None, dir=None, @@ -143,13 +233,18 @@ def safe_download(url, a successful download. Default: 1E0. progress (bool, optional): Whether to display a progress bar during the download. Default: True. """ - f = dir / url2file(url) if dir else Path(file) # URL converted to filename + + # Check if the URL is a Google Drive link + gdrive = 'drive.google.com' in url + if gdrive: + url, file = get_google_drive_file_info(url) + + f = dir / (file if gdrive else url2file(url)) if dir else Path(file) # URL converted to filename if '://' not in str(url) and Path(url).is_file(): # URL exists ('://' check required in Windows Python<3.10) f = Path(url) # filename elif not f.is_file(): # URL and file do not exist assert dir or file, 'dir or file required for download' - f = dir / url2file(url) if dir else Path(file) - desc = f"Downloading {clean_url(url)} to '{f}'" + desc = f"Downloading {url if gdrive else clean_url(url)} to '{f}'" LOGGER.info(f'{desc}...') f.parent.mkdir(parents=True, exist_ok=True) # make directory if missing check_disk_space(url) @@ -189,14 +284,14 @@ def safe_download(url, LOGGER.warning(f'⚠️ Download failure, retrying {i + 1}/{retry} {url}...') if unzip and f.exists() and f.suffix in ('', '.zip', '.tar', '.gz'): + from zipfile import is_zipfile + unzip_dir = dir or f.parent # unzip to dir if provided else unzip in place - LOGGER.info(f'Unzipping {f} to {unzip_dir.absolute()}...') if is_zipfile(f): unzip_dir = unzip_file(file=f, path=unzip_dir) # unzip - elif f.suffix == '.tar': - subprocess.run(['tar', 'xf', f, '--directory', unzip_dir], check=True) # unzip - elif f.suffix == '.gz': - subprocess.run(['tar', 'xfz', f, '--directory', unzip_dir], check=True) # unzip + elif f.suffix in ('.tar', '.gz'): + LOGGER.info(f'Unzipping {f} to {unzip_dir.resolve()}...') + subprocess.run(['tar', 'xf' if f.suffix == '.tar' else 'xfz', f, '--directory', unzip_dir], check=True) if delete: f.unlink() # remove zip return unzip_dir
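
Reviewer note: a minimal usage sketch of the Google Drive path this patch enables, not part of the diff itself. It assumes the patch is applied and the machine can reach Google Drive; the share link is the placeholder from the get_google_drive_file_info docstring, not a real file, and download() is the existing helper already used by the Argoverse.yaml script above.

```python
from ultralytics.utils.downloads import download, get_google_drive_file_info

# Placeholder share link copied from the docstring example; substitute a real file ID.
link = 'https://drive.google.com/file/d/1cqT-cJgANNrhIHCrEufUYhQ4RqiWG_lJ/view?usp=drive_link'

# Resolve the share link into a direct-download URL plus, when Google returns a
# content-disposition header, the original filename (None otherwise).
url, filename = get_google_drive_file_info(link)
print(url, filename)

# Or pass the share link straight to the download helpers, as Argoverse.yaml now does:
# safe_download() detects 'drive.google.com' in the URL, resolves it the same way, and
# unzips the result after download if it is a .zip/.tar/.gz archive.
download([link], dir='path/to/dir')
```

The quota-exceeded ConnectionError raised inside get_google_drive_file_info surfaces through both paths, so callers downloading large shared files (e.g. the new Argoverse link) should expect it and retry later or fetch the file manually.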