Enable Google Drive file downloads (#4196)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2023-08-06 20:11:14 +02:00
parent eb80ba9ce0
commit c751c7f88a
2 changed files with 113 additions and 18 deletions
--- a/ultralytics/cfg/datasets/Argoverse.yaml
+++ b/ultralytics/cfg/datasets/Argoverse.yaml
@ -4,7 +4,7 @@
 # parent
 # ├── ultralytics
 # └── datasets
-#     └── Argoverse  ← downloads here (31.3 GB)
+#     └── Argoverse  ← downloads here (31.5 GB)
 # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
@ -61,13 +61,13 @@ download: |
              f.writelines(labels[k])
-  # Download
+  # Download 'https://argoverse-hd.s3.us-east-2.amazonaws.com/Argoverse-HD-Full.zip' (deprecated S3 link)
  dir = Path(yaml['path'])  # dataset root dir
-  urls = ['https://argoverse-hd.s3.us-east-2.amazonaws.com/Argoverse-HD-Full.zip']
+  urls = ['https://drive.google.com/file/d/1st9qW3BeIwQsnR0t8mRpvbsSWIo16ACi/view?usp=drive_link']
  download(urls, dir=dir)
  # Convert
  annotations_dir = 'Argoverse-HD/annotations/'
  (dir / 'Argoverse-1.1' / 'tracking').rename(dir / 'Argoverse-1.1' / 'images')  # rename 'tracking' to 'images'
  for d in "train.json", "val.json":
-      argoverse2yolo(dir / annotations_dir / d)  # convert VisDrone annotations to YOLO labels
+      argoverse2yolo(dir / annotations_dir / d)  # convert Argoverse annotations to YOLO labels
--- a/ultralytics/utils/downloads.py
+++ b/ultralytics/utils/downloads.py
@ -1,13 +1,13 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 import contextlib
 import re
 import shutil
 import subprocess
 from itertools import repeat
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 from urllib import parse, request
 from zipfile import BadZipFile, ZipFile, is_zipfile
 import requests
 import torch
@ -39,7 +39,45 @@ def is_url(url, check=True):
    return False
-def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=False):
def zip_directory(directory, compress=True, exclude=('.DS_Store', '__MACOSX'), progress=True):
    """
    Zips the contents of a directory, excluding files whose path (relative to the directory) contains any of the
    strings in the exclude list. The resulting zip file is placed alongside the directory.
    Args:
        directory (str | Path): The path to the directory to be zipped.
        compress (bool): Whether to compress the files while zipping. Default is True.
        exclude (tuple, optional): A tuple of path substrings to be excluded. Defaults to ('.DS_Store', '__MACOSX').
        progress (bool, optional): Whether to display a progress bar. Defaults to True.
    Returns:
        (Path): The path to the resulting zip file.
    Raises:
        FileNotFoundError: If `directory` is not an existing directory.
    Example:
        ```python
        from ultralytics.utils.downloads import zip_directory
        file = zip_directory('path/to/dir')
        ```
    """
    from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile
    directory = Path(directory)
    if not directory.is_dir():
        raise FileNotFoundError(f"Directory '{directory}' does not exist.")
    # Collect regular files only. The exclude strings are matched against the path relative to `directory`
    # (not just the file name) so that everything under an excluded folder such as '__MACOSX/' is skipped,
    # mirroring how unzip_file() filters archive member names.
    files_to_zip = [
        f for f in directory.rglob('*')
        if f.is_file() and not any(x in f.relative_to(directory).as_posix() for x in exclude)]
    # NOTE: with_suffix() replaces the last dotted suffix, so 'data.v1' becomes 'data.zip'
    zip_file = directory.with_suffix('.zip')
    compression = ZIP_DEFLATED if compress else ZIP_STORED
    # Zip with progress bar, storing each file under its path relative to the zipped directory
    with ZipFile(zip_file, 'w', compression) as zf:
        for file in tqdm(files_to_zip, desc=f'Zipping {directory} to {zip_file}...', unit='file', disable=not progress):
            zf.write(file, file.relative_to(directory))
    return zip_file  # return path to zip file
 def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=False, progress=True):
    """
    Unzips a *.zip file to the specified path, excluding files containing strings in the exclude list.
@ -52,13 +90,23 @@ def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=Fals
        path (str, optional): The path to extract the zipfile to. Defaults to None.
        exclude (tuple, optional): A tuple of filename strings to be excluded. Defaults to ('.DS_Store', '__MACOSX').
        exist_ok (bool, optional): Whether to overwrite existing contents if they exist. Defaults to False.
        progress (bool, optional): Whether to display a progress bar. Defaults to True.
    Raises:
        BadZipFile: If the provided file does not exist or is not a valid zipfile.
    Returns:
        (Path): The path to the directory where the zipfile was extracted.
    Example:
        ```python
        from ultralytics.utils.downloads import unzip_file
        dir = unzip_file('path/to/file.zip')
        ```
    """
    from zipfile import BadZipFile, ZipFile, is_zipfile
    if not (Path(file).exists() and is_zipfile(file)):
        raise BadZipFile(f"File '{file}' does not exist or is a bad zip file.")
    if path is None:
@ -66,10 +114,10 @@ def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=Fals
    # Unzip the file contents
    with ZipFile(file) as zipObj:
-        file_list = [f for f in zipObj.namelist() if all(x not in f for x in exclude)]
+        files = [f for f in zipObj.namelist() if all(x not in f for x in exclude)]
-        top_level_dirs = {Path(f).parts[0] for f in file_list}
+        top_level_dirs = {Path(f).parts[0] for f in files}
-        if len(top_level_dirs) > 1 or not file_list[0].endswith('/'):
+        if len(top_level_dirs) > 1 or not files[0].endswith('/'):
            path = Path(path) / Path(file).stem  # define new unzip directory
        # Check if destination directory already exists and contains files
@ -79,7 +127,7 @@ def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=Fals
            LOGGER.info(f'Skipping {file} unzip (already unzipped)')
            return path
-        for f in file_list:
+        for f in tqdm(files, desc=f'Unzipping {file} to {Path(path).resolve()}...', unit='file', disable=not progress):
            zipObj.extract(f, path=path)
    return path  # return unzip dir
@ -117,6 +165,48 @@ def check_disk_space(url='https://ultralytics.com/assets/coco128.zip', sf=1.5, h
    return True
 def get_google_drive_file_info(link):
    """
    Retrieves the direct download link and filename for a shareable Google Drive file link.
    Args:
        link (str): The shareable link of the Google Drive file, containing '/d/<file_id>'.
    Returns:
        (str): Direct download URL for the Google Drive file.
        (str | None): Original filename of the Google Drive file. If filename extraction fails, returns None.
    Raises:
        ConnectionError: If the response body reports that the download quota was exceeded.
        IndexError: If `link` does not contain a '/d/' segment to read the file id from.
    Example:
        ```python
        from ultralytics.utils.downloads import get_google_drive_file_info
        link = "https://drive.google.com/file/d/1cqT-cJgANNrhIHCrEufUYhQ4RqiWG_lJ/view?usp=drive_link"
        url, filename = get_google_drive_file_info(link)
        ```
    """
    # The file id is the text between '/d/' and '/view' in the shareable link
    file_id = link.split('/d/')[1].split('/view')[0]
    drive_url = f'https://drive.google.com/uc?export=download&id={file_id}'
    # Start session
    filename = None
    with requests.Session() as session:
        response = session.get(drive_url, stream=True)
        # Compare bytes directly instead of searching the repr() of the bytes object
        if b'quota exceeded' in response.content.lower():
            raise ConnectionError(
                emojis(f'❌  Google Drive file download quota exceeded. '
                       f'Please try again later or download this file manually at {link}.'))
        # A 'download_warning*' cookie value is passed back as the confirm token; the last matching cookie wins
        token = None
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                token = value
        if token:
            drive_url = f'https://drive.google.com/uc?export=download&confirm={token}&id={file_id}'
        cd = response.headers.get('content-disposition')
        if cd:
            # Keep filename=None when the header has no quoted filename rather than raising IndexError
            match = re.search('filename="(.+)"', cd)
            if match:
                filename = match.group(1)
    return drive_url, filename
 def safe_download(url,
                  file=None,
                  dir=None,
@ -143,13 +233,18 @@ def safe_download(url,
            a successful download. Default: 1E0.
        progress (bool, optional): Whether to display a progress bar during the download. Default: True.
    """
-    f = dir / url2file(url) if dir else Path(file)  # URL converted to filename
+
    # Check if the URL is a Google Drive link
    gdrive = 'drive.google.com' in url
    if gdrive:
        url, file = get_google_drive_file_info(url)
    f = dir / (file if gdrive else url2file(url)) if dir else Path(file)  # URL converted to filename
    if '://' not in str(url) and Path(url).is_file():  # URL exists ('://' check required in Windows Python<3.10)
        f = Path(url)  # filename
    elif not f.is_file():  # URL and file do not exist
        assert dir or file, 'dir or file required for download'
-        f = dir / url2file(url) if dir else Path(file)
+        desc = f"Downloading {url if gdrive else clean_url(url)} to '{f}'"
        desc = f"Downloading {clean_url(url)} to '{f}'"
        LOGGER.info(f'{desc}...')
        f.parent.mkdir(parents=True, exist_ok=True)  # make directory if missing
        check_disk_space(url)
@ -189,14 +284,14 @@ def safe_download(url,
                LOGGER.warning(f'⚠️ Download failure, retrying {i + 1}/{retry} {url}...')
    if unzip and f.exists() and f.suffix in ('', '.zip', '.tar', '.gz'):
        from zipfile import is_zipfile
        unzip_dir = dir or f.parent  # unzip to dir if provided else unzip in place
        LOGGER.info(f'Unzipping {f} to {unzip_dir.absolute()}...')
        if is_zipfile(f):
            unzip_dir = unzip_file(file=f, path=unzip_dir)  # unzip
-        elif f.suffix == '.tar':
+        elif f.suffix in ('.tar', '.gz'):
-            subprocess.run(['tar', 'xf', f, '--directory', unzip_dir], check=True)  # unzip
+            LOGGER.info(f'Unzipping {f} to {unzip_dir.resolve()}...')
-        elif f.suffix == '.gz':
+            subprocess.run(['tar', 'xf' if f.suffix == '.tar' else 'xfz', f, '--directory', unzip_dir], check=True)
            subprocess.run(['tar', 'xfz', f, '--directory', unzip_dir], check=True)  # unzip
        if delete:
            f.unlink()  # remove zip
        return unzip_dir