
Commit 4310e56

Merge pull request #419 from roboflow/tony/signurls-script

adds curlable script to sign urls to images in s3 buckets

2 parents: 7034b60 + 753db10

File tree

2 files changed: +177 −0 lines changed
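
Both scripts emit the same two-key JSONL schema, one {"name": ..., "url": ...} object per line, so downstream tooling can consume either output file identically. A minimal sketch of pulling out just the URLs, assuming jq is installed and the default output filename:

# Print only the signed URL from each JSONL line
jq -r '.url' signed_urls.jsonl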

scripts/generateGCSSignedUrls.sh

Lines changed: 108 additions & 0 deletions
#!/bin/bash

# Script to generate GCS signed URLs for image files in JSONL format
# Usage: ./generateGCSSignedUrls.sh <gcs-path> [output-file] [expiration-seconds] [parallel-jobs]

set -e

# Check if GCS path is provided
if [ -z "$1" ]; then
    echo "Error: GCS path is required"
    echo "Usage: $0 <gcs-path> [output-file] [expiration-seconds] [parallel-jobs]"
    echo "Example: $0 gs://my-bucket/images/ output.jsonl 21600 8"
    exit 1
fi

GCS_PATH="$1"
OUTPUT_FILE="${2:-signed_urls.jsonl}"
EXPIRATION_SECONDS="${3:-21600}"  # Default: 6 hours
PARALLEL_JOBS="${4:-20}"          # Default: 20 parallel jobs

# Remove trailing slash from GCS path if present
GCS_PATH="${GCS_PATH%/}"

# Convert seconds to the duration format gcloud expects (e.g., 21600s)
EXPIRATION="${EXPIRATION_SECONDS}s"

# Image file extensions to include (regex pattern for grep)
IMAGE_PATTERN='\.(jpg|jpeg|png|gif|bmp|webp|tiff|tif|svg)$'

# Function to find an appropriate service account
find_service_account() {
    # First, try the default compute service account for the current project.
    # That account is keyed by project *number*, not project ID.
    local project_id project_number
    project_id=$(gcloud config get-value project 2>/dev/null)
    if [ -n "$project_id" ]; then
        project_number=$(gcloud projects describe "$project_id" --format="value(projectNumber)" 2>/dev/null)
        local compute_sa="${project_number}-compute@developer.gserviceaccount.com"
        if gcloud iam service-accounts describe "$compute_sa" >/dev/null 2>&1; then
            echo "$compute_sa"
            return 0
        fi
    fi

    # If that doesn't work, fall back to any service account in the project
    local sa_list
    sa_list=$(gcloud iam service-accounts list --format="value(email)" --limit=1 2>/dev/null)
    if [ -n "$sa_list" ]; then
        echo "$sa_list" | head -n 1
        return 0
    fi

    return 1
}

# Try to find a service account to use. The `|| true` keeps a failed lookup
# from aborting the script under `set -e`, so the warning below can print.
SERVICE_ACCOUNT=$(find_service_account || true)
if [ -z "$SERVICE_ACCOUNT" ]; then
    echo "Warning: No service account found. Attempting to sign URLs without impersonation."
    echo "If this fails, you may need to:"
    echo "1. Authenticate with a service account: gcloud auth activate-service-account --key-file=key.json"
    echo "2. Or ensure you have appropriate service accounts in your project"
    echo ""
fi

# Function to process a single file
process_file() {
    local object="$1"
    local service_account="$2"
    local expiration="$3"

    # Create a signed URL using gcloud storage sign-url
    local signed_url_output
    if [ -n "$service_account" ]; then
        signed_url_output=$(gcloud storage sign-url --http-verb=GET --duration="$expiration" --impersonate-service-account="$service_account" "$object" 2>/dev/null)
    else
        signed_url_output=$(gcloud storage sign-url --http-verb=GET --duration="$expiration" "$object" 2>/dev/null)
    fi

    if [ $? -eq 0 ] && [ -n "$signed_url_output" ]; then
        # Extract just the signed_url field from the YAML output
        local signed_url
        signed_url=$(echo "$signed_url_output" | grep "signed_url:" | sed 's/signed_url: //')

        if [ -n "$signed_url" ]; then
            # Take the path after the bucket name and convert slashes to double underscores
            local path_part name_with_path
            path_part=$(echo "$object" | sed 's|gs://[^/]*/||')
            name_with_path=$(echo "$path_part" | sed 's|/|__|g')

            # Output JSONL
            echo "{\"name\": \"$name_with_path\", \"url\": \"$signed_url\"}"
        fi
    fi
}

# Export the function and variables for the xargs subshells
export -f process_file
export SERVICE_ACCOUNT
export EXPIRATION

echo "Listing files from $GCS_PATH..."

# List all objects (skipping directory markers), filter for images, and sign in parallel
gsutil ls -r "$GCS_PATH" 2>/dev/null | \
    grep -v '/$' | \
    grep -v ':$' | \
    grep -iE "$IMAGE_PATTERN" | \
    xargs -I {} -P "$PARALLEL_JOBS" bash -c 'process_file "$@"' _ {} "$SERVICE_ACCOUNT" "$EXPIRATION" | \
    tee "$OUTPUT_FILE"

echo ""
echo "Done! Signed URLs written to $OUTPUT_FILE"
echo "Total images processed: $(wc -l < "$OUTPUT_FILE")"

scripts/generateS3SignedUrls.sh

Lines changed: 69 additions & 0 deletions
#!/bin/bash

# Script to generate S3 signed URLs for image files in JSONL format
# Usage: ./generateS3SignedUrls.sh <s3-path> [output-file] [expiration-seconds] [parallel-jobs]
# Or with curl:
#   curl -fsSL https://raw.githubusercontent.com/roboflow/roboflow-python/main/scripts/generateS3SignedUrls.sh | bash -s -- s3://bucket/path output.jsonl

set -e

# Check if S3 path is provided
if [ -z "$1" ]; then
    echo "Error: S3 path is required"
    echo "Usage: $0 <s3-path> [output-file] [expiration-seconds] [parallel-jobs]"
    echo "Example: $0 s3://my-bucket/images/ output.jsonl 3600 8"
    exit 1
fi

S3_PATH="$1"
OUTPUT_FILE="${2:-signed_urls.jsonl}"
EXPIRATION="${3:-21600}"  # Default: 6 hours
PARALLEL_JOBS="${4:-20}"  # Default: 20 parallel jobs

# Remove trailing slash from S3 path if present
S3_PATH="${S3_PATH%/}"

# Extract bucket name from S3_PATH
BUCKET=$(echo "$S3_PATH" | sed 's|s3://||' | cut -d'/' -f1)

# Image file extensions to include (regex pattern for grep)
IMAGE_PATTERN='\.(jpg|jpeg|png|gif|bmp|webp|tiff|tif|svg)$'

# Function to process a single file
process_file() {
    local file_path="$1"
    local bucket="$2"
    local expiration="$3"

    # Construct full S3 URI
    local s3_uri="s3://${bucket}/${file_path}"

    # Generate signed URL. Declare and assign separately: `local x=$(cmd)`
    # would make $? the status of `local` (always 0) rather than of `cmd`.
    local signed_url
    signed_url=$(aws s3 presign "$s3_uri" --expires-in "$expiration" 2>/dev/null)

    if [ $? -eq 0 ] && [ -n "$signed_url" ]; then
        # Create name with full path using double underscores instead of slashes
        local name_with_path
        name_with_path=$(echo "$file_path" | sed 's|/|__|g')

        # Output JSONL
        echo "{\"name\": \"$name_with_path\", \"url\": \"$signed_url\"}"
    fi
}

# Export the function and variables for the xargs subshells
export -f process_file
export BUCKET
export EXPIRATION

echo "Listing files from $S3_PATH..."

# List all objects, filter for images, and sign in parallel
# (awk prints only the 4th column, so keys containing spaces are truncated)
aws s3 ls "$S3_PATH/" --recursive | \
    awk '{print $4}' | \
    grep -iE "$IMAGE_PATTERN" | \
    xargs -I {} -P "$PARALLEL_JOBS" bash -c 'process_file "$@"' _ {} "$BUCKET" "$EXPIRATION" | \
    tee "$OUTPUT_FILE"

echo ""
echo "Done! Signed URLs written to $OUTPUT_FILE"
echo "Total images processed: $(wc -l < "$OUTPUT_FILE")"
