#!/usr/bin/env python3
import json
import os
import shutil
import sys
import tempfile
from pathlib import Path

from huggingface_hub import HfApi
| print("Starting image download from Hugging Face dataset") | |
| # Get environment variables | |
| HF_USERNAME = os.environ.get("HF_USERNAME", "") | |
| HF_TOKEN = os.environ.get("HF_TOKEN", "") | |
| DATASET_REPO = os.environ.get("HF_DATASET_REPO", "image-uploader-data") | |
| # Validate required environment variables | |
| if not HF_USERNAME: | |
| print("ERROR: HF_USERNAME environment variable is not set") | |
| exit(1) | |
| if not HF_TOKEN: | |
| print("ERROR: HF_TOKEN environment variable is not set") | |
| exit(1) | |
| print(f"Using Hugging Face credentials for user: {HF_USERNAME}") | |
| print(f"Dataset repository: {DATASET_REPO}") | |
# Set the HF cache directory to a writable location (the default cache under
# the home directory is not always writable inside a Space container)
os.environ["HF_HOME"] = os.path.join(tempfile.gettempdir(), "huggingface")
os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(
    tempfile.gettempdir(), "huggingface", "hub"
)
os.makedirs(os.environ["HF_HOME"], exist_ok=True)
os.makedirs(os.environ["HUGGINGFACE_HUB_CACHE"], exist_ok=True)
# Constants
IMAGES_PATH = "images"
METADATA_PATH = "metadata"
UPLOAD_DIR = Path("static/uploads")
METADATA_DIR = Path("static/metadata")
METADATA_FILE = METADATA_DIR / "image_metadata.json"

# Alternative metadata location with guaranteed write permissions
HOME_DIR = Path(os.environ.get("HOME", "/tmp"))
ALT_METADATA_DIR = HOME_DIR / ".image_uploader"
ALT_METADATA_DIR.mkdir(parents=True, exist_ok=True)
ALT_METADATA_FILE = ALT_METADATA_DIR / "image_metadata.json"

# Create directories if they don't exist
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
METADATA_DIR.mkdir(parents=True, exist_ok=True)
# Function to get the appropriate metadata file
def get_metadata_file():
    # Try to write to the primary location
    try:
        if not METADATA_FILE.exists():
            with open(METADATA_FILE, "w") as f:
                json.dump({}, f)
        # Test write permission
        if os.access(METADATA_FILE, os.W_OK):
            return METADATA_FILE
        raise PermissionError(f"No write permission for {METADATA_FILE}")
    except (PermissionError, OSError) as e:
        print(f"Warning: Cannot use {METADATA_FILE}: {e}")
        print(f"Using alternative location: {ALT_METADATA_FILE}")
        return ALT_METADATA_FILE
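
# A minimal sketch of how a consumer might read the metadata this script
# prepares. load_metadata is a hypothetical helper, not used elsewhere in
# this script; it simply reuses the same fallback logic as get_metadata_file().
def load_metadata():
    # Fall back to an empty dict if the file is missing or corrupt
    path = get_metadata_file()
    try:
        with open(path, "r") as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        return {}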
# Initialize HfApi
hf_api = HfApi(token=HF_TOKEN)

try:
    # Check if the dataset repository exists
    print(f"Checking if repository {HF_USERNAME}/{DATASET_REPO} exists")
    hf_api.repo_info(repo_id=f"{HF_USERNAME}/{DATASET_REPO}", repo_type="dataset")
    print(f"Repository {HF_USERNAME}/{DATASET_REPO} exists")
    # Download metadata first
    print(f"Downloading metadata from {HF_USERNAME}/{DATASET_REPO}")
    try:
        metadata_file_path = hf_api.hf_hub_download(
            repo_id=f"{HF_USERNAME}/{DATASET_REPO}",
            filename=f"{METADATA_PATH}/image_metadata.json",
            repo_type="dataset",
            local_dir=os.path.join(tempfile.gettempdir(), "hf_downloads"),
        )
        print(f"Metadata downloaded to {metadata_file_path}")
        with open(metadata_file_path, "r") as f:
            metadata = json.load(f)

        # Save metadata locally to the appropriate file
        save_path = get_metadata_file()
        with open(save_path, "w") as f:
            json.dump(metadata, f)
        print(f"Metadata saved to {save_path}")
    except Exception as e:
        print(f"Error downloading metadata: {e}")
        print("Creating empty metadata file")
        metadata = {}

        # Initialize an empty metadata file instead
        save_path = get_metadata_file()
        with open(save_path, "w") as f:
            json.dump({}, f)
        print(f"Created empty metadata file at {save_path}")
    # List all files in the dataset
    print("Listing files in the dataset")
    files = hf_api.list_repo_files(
        repo_id=f"{HF_USERNAME}/{DATASET_REPO}", repo_type="dataset"
    )

    # Keep only files under the images/ prefix
    image_files = [f for f in files if f.startswith(f"{IMAGES_PATH}/")]
    print(f"Found {len(image_files)} images")
    # Download each image
    success_count = 0
    for i, image_file in enumerate(image_files):
        try:
            filename = os.path.basename(image_file)
            print(f"[{i + 1}/{len(image_files)}] Downloading {filename}")

            # Download file
            download_path = hf_api.hf_hub_download(
                repo_id=f"{HF_USERNAME}/{DATASET_REPO}",
                filename=image_file,
                repo_type="dataset",
                local_dir=os.path.join(tempfile.gettempdir(), "hf_downloads"),
            )

            # Copy to the uploads directory
            destination = UPLOAD_DIR / filename
            shutil.copyfile(download_path, destination)
            print(f"Saved {filename} to {destination}")
            success_count += 1
        except Exception as e:
            print(f"Error downloading {image_file}: {e}")

    print(
        f"Image download completed. Successfully downloaded "
        f"{success_count}/{len(image_files)} images."
    )
except Exception as e:
    print(f"Error: {e}")
    print("Creating empty metadata file")
    save_path = get_metadata_file()
    with open(save_path, "w") as f:
        json.dump({}, f)
    print(f"Created empty metadata file at {save_path}")