|
#!/bin/bash

# Script to generate GCS signed URLs for image files in JSONL format
# Usage: ./listgcs.sh <gcs-path> [output-file] [expiration-seconds] [parallel-jobs]

set -e

# Check if GCS path is provided. Usage errors go to stderr so they never end
# up mixed into captured stdout.
if [[ -z "${1:-}" ]]; then
  {
    echo "Error: GCS path is required"
    echo "Usage: $0 <gcs-path> [output-file] [expiration-seconds] [parallel-jobs]"
    echo "Example: $0 gs://my-bucket/images/ output.jsonl 21600 8"
  } >&2
  exit 1
fi

GCS_PATH="$1"
OUTPUT_FILE="${2:-signed_urls.jsonl}"
EXPIRATION_SECONDS="${3:-21600}"  # Default: 6 hours
PARALLEL_JOBS="${4:-20}"          # Default: 20 parallel jobs

# Validate the arguments up front: a malformed value would otherwise only
# surface later as an empty result, because signing errors are not shown.
if [[ "$GCS_PATH" != gs://?* ]]; then
  echo "Error: GCS path must start with gs:// (got: $GCS_PATH)" >&2
  exit 1
fi
if [[ ! "$EXPIRATION_SECONDS" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: expiration-seconds must be a positive integer (got: $EXPIRATION_SECONDS)" >&2
  exit 1
fi
if [[ ! "$PARALLEL_JOBS" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: parallel-jobs must be a positive integer (got: $PARALLEL_JOBS)" >&2
  exit 1
fi

# Remove trailing slash from GCS path if present
GCS_PATH="${GCS_PATH%/}"

# Convert seconds to duration format for gcloud (e.g., 21600s)
EXPIRATION="${EXPIRATION_SECONDS}s"

# Image file extensions to include (extended regex, applied case-insensitively
# by the listing pipeline)
readonly IMAGE_PATTERN='\.(jpg|jpeg|png|gif|bmp|webp|tiff|tif|svg)$'
| 30 | +# Function to find an appropriate service account |
| 31 | +find_service_account() { |
| 32 | + # First, try to get the default compute service account for the current project |
| 33 | + local project_id=$(gcloud config get-value project 2>/dev/null) |
| 34 | + if [ -n "$project_id" ]; then |
| 35 | + local compute_sa="${project_id}-compute@developer.gserviceaccount.com" |
| 36 | + if gcloud iam service-accounts describe "$compute_sa" >/dev/null 2>&1; then |
| 37 | + echo "$compute_sa" |
| 38 | + return 0 |
| 39 | + fi |
| 40 | + fi |
| 41 | + |
| 42 | + # If that doesn't work, try to find any service account in the project |
| 43 | + local sa_list=$(gcloud iam service-accounts list --format="value(email)" --limit=1 2>/dev/null) |
| 44 | + if [ -n "$sa_list" ]; then |
| 45 | + echo "$sa_list" | head -n 1 |
| 46 | + return 0 |
| 47 | + fi |
| 48 | + |
| 49 | + return 1 |
| 50 | +} |
| 51 | + |
# Try to find a service account to use.
# The `|| SERVICE_ACCOUNT=""` guard matters: the script runs under `set -e`,
# and a bare assignment from a failing command substitution would abort the
# script here, before the fallback (signing without impersonation) is tried.
SERVICE_ACCOUNT=$(find_service_account) || SERVICE_ACCOUNT=""
if [[ -z "$SERVICE_ACCOUNT" ]]; then
  # Diagnostics go to stderr; stdout later carries the JSONL records.
  {
    echo "Warning: No service account found. Attempting to sign URLs without impersonation."
    echo "If this fails, you may need to:"
    echo "1. Authenticate with a service account: gcloud auth activate-service-account --key-file=key.json"
    echo "2. Or ensure you have appropriate service accounts in your project"
    echo ""
  } >&2
fi
| 61 | + |
#######################################
# Sign a single GCS object and print one JSONL record for it.
#
# Arguments:
#   $1 - object URL (gs://bucket/path/to/file)
#   $2 - service account to impersonate; an empty string signs with the
#        currently active credentials
#   $3 - validity duration in gcloud format (e.g. 21600s)
# Outputs:
#   stdout: {"name": "<object path with / replaced by __>", "url": "<signed url>"}
#   stderr: a warning when the object could not be signed
# Returns:
#   0 on success, 1 when signing failed or no URL was found in the output
#######################################
process_file() {
  local object="$1"
  local service_account="$2"
  local expiration="$3"

  # Build the argument list once; impersonation is optional.
  local -a sign_args=(--http-verb=GET --duration="$expiration")
  if [[ -n "$service_account" ]]; then
    sign_args+=(--impersonate-service-account="$service_account")
  fi

  # gcloud's own stderr stays suppressed as before, but a failure is now
  # reported instead of the object silently disappearing from the output.
  local signed_url_output
  if ! signed_url_output=$(gcloud storage sign-url "${sign_args[@]}" "$object" 2>/dev/null); then
    printf 'Warning: failed to sign %s\n' "$object" >&2
    return 1
  fi

  # Extract just the signed_url field from the YAML output
  local signed_url
  signed_url=$(sed -n 's/^signed_url: //p' <<<"$signed_url_output" | head -n 1)
  if [[ -z "$signed_url" ]]; then
    printf 'Warning: no signed_url in gcloud output for %s\n' "$object" >&2
    return 1
  fi

  # Drop the gs://<bucket>/ prefix and convert slashes to double underscores
  local name_with_path=${object#gs://}
  name_with_path=${name_with_path#*/}
  name_with_path=${name_with_path//\//__}

  # Escape characters that would otherwise produce invalid JSON. Backslash
  # must be handled first so the escapes added afterwards are not doubled.
  local bs='\' dq='"' tab=$'\t'
  name_with_path=${name_with_path//"$bs"/"$bs$bs"}
  name_with_path=${name_with_path//"$dq"/"$bs$dq"}
  name_with_path=${name_with_path//"$tab"/"${bs}t"}

  # Output JSONL
  printf '{"name": "%s", "url": "%s"}\n' "$name_with_path" "$signed_url"
}
| 90 | + |
# Export function and variables so the bash processes started by xargs can
# see them
export -f process_file
export SERVICE_ACCOUNT
export EXPIRATION

echo "Listing files from $GCS_PATH..."

# Get list of all files, filter for images, and process in parallel.
#  - gsutil's stderr is no longer discarded, so a bad path or an
#    authentication problem is visible instead of yielding an empty result.
#  - Directory entries (ending in "/") and section headers (ending in ":")
#    are dropped before the case-insensitive image filter.
#  - Lines are converted to NUL-terminated items because xargs otherwise
#    treats quotes and backslashes in object names as syntax.
gsutil ls -r "$GCS_PATH" \
  | grep -Ev '[/:]$' \
  | grep -iE "$IMAGE_PATTERN" \
  | tr '\n' '\0' \
  | xargs -0 -I {} -P "$PARALLEL_JOBS" \
      bash -c 'process_file "$@"' _ {} "$SERVICE_ACCOUNT" "$EXPIRATION" \
  | tee "$OUTPUT_FILE"

# One JSONL record per line, so the line count is the number of signed images.
# Whitespace is stripped because some wc implementations pad the number.
image_count=$(wc -l < "$OUTPUT_FILE")
image_count=${image_count//[[:space:]]/}

echo ""
echo "Done! Signed URLs written to $OUTPUT_FILE"
echo "Total images processed: $image_count"
0 commit comments