Skip to content

Commit 131555b

Browse files
committed
Add script to generate S3 signed URLs for image files in JSONL format
1 parent 7034b60 commit 131555b

File tree

1 file changed

+69
-0
lines changed

1 file changed

+69
-0
lines changed

scripts/generateS3SignedUrls.sh

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/bash

# Script to generate S3 signed URLs for image files in JSONL format
#
# Usage: ./generateS3SignedUrls.sh <s3-path> [output-file] [expiration-seconds] [parallel-jobs]
# Or with curl:
# curl -fsSL https://gist.githubusercontent.com/tonylampada/20b7bc984a455f53e2d07f88b33bf43c/raw/generateS3SignedUrls.sh | bash -s -- s3://bucket/path output.jsonl
#
# Arguments:
#   s3-path             S3 location to list recursively (required)
#   output-file         JSONL file to write (default: signed_urls.jsonl)
#   expiration-seconds  lifetime of each signed URL (default: 21600)
#   parallel-jobs       number of concurrent signing processes (default: 20)

set -e

# Check if S3 path is provided. Diagnostics go to stderr so they never get
# mixed into output that a caller may be capturing from stdout.
if [[ -z "${1:-}" ]]; then
  {
    echo "Error: S3 path is required"
    echo "Usage: $0 <s3-path> [output-file] [expiration-seconds] [parallel-jobs]"
    echo "Example: $0 s3://my-bucket/images/ output.jsonl 3600 8"
  } >&2
  exit 1
fi

S3_PATH="$1"
OUTPUT_FILE="${2:-signed_urls.jsonl}"
EXPIRATION="${3:-21600}"  # Default: 6 hours
PARALLEL_JOBS="${4:-20}"  # Default: 20 parallel jobs

# Reject non-numeric values up front: a bad expiration would otherwise make
# every presign call fail, and those per-file errors are not shown.
if ! [[ "$EXPIRATION" =~ ^[0-9]+$ ]]; then
  echo "Error: expiration-seconds must be a whole number, got '$EXPIRATION'" >&2
  exit 1
fi
if ! [[ "$PARALLEL_JOBS" =~ ^[0-9]+$ ]]; then
  echo "Error: parallel-jobs must be a whole number, got '$PARALLEL_JOBS'" >&2
  exit 1
fi

# Remove trailing slash from S3 path if present
S3_PATH="${S3_PATH%/}"

# Extract bucket name from S3_PATH: drop the "s3://" scheme, then keep
# everything before the first "/".
BUCKET="${S3_PATH#s3://}"
BUCKET="${BUCKET%%/*}"

# Image file extensions to include (regex pattern for grep)
IMAGE_PATTERN='\.(jpg|jpeg|png|gif|bmp|webp|tiff|tif|svg)$'
31+
32+
# Generate a presigned URL for one S3 object and print it as a JSONL record.
#
# Arguments:
#   $1 - object key relative to the bucket (e.g. "images/cat.jpg")
#   $2 - bucket name
#   $3 - URL lifetime in seconds, passed to `aws s3 presign --expires-in`
# Outputs:
#   On success, one line on stdout:
#     {"name": "<key with every / replaced by __>", "url": "<signed url>"}
#   On failure, a warning on stderr and nothing on stdout.
# Returns:
#   0 in both cases, so a single object that cannot be signed is skipped
#   rather than changing the caller's exit status.
#
# NOTE: keep this function self-contained. It is exported with `export -f`
# and invoked from child bash processes, so any helper it called would have
# to be exported as well.
process_file() {
  local file_path="$1"
  local bucket="$2"
  local expiration="$3"

  # Construct full S3 URI
  local s3_uri="s3://${bucket}/${file_path}"

  # Declare separately from the assignment: `local var=$(cmd)` reports the
  # status of `local` itself, which would hide a failed presign.
  local signed_url
  if ! signed_url=$(aws s3 presign "$s3_uri" --expires-in "$expiration" 2>/dev/null) \
    || [[ -z "$signed_url" ]]; then
    printf 'Warning: could not generate a signed URL for %s\n' "$s3_uri" >&2
    return 0
  fi

  # Create name with full path using double underscores instead of slashes
  local name_with_path="${file_path//\//__}"

  # Escape backslashes first, then double quotes, so both values stay valid
  # inside JSON string literals. Other characters are passed through as-is.
  local backslash='\' dquote='"'
  name_with_path=${name_with_path//"$backslash"/"$backslash$backslash"}
  name_with_path=${name_with_path//"$dquote"/"$backslash$dquote"}
  signed_url=${signed_url//"$backslash"/"$backslash$backslash"}
  signed_url=${signed_url//"$dquote"/"$backslash$dquote"}

  # Output JSONL
  printf '{"name": "%s", "url": "%s"}\n' "$name_with_path" "$signed_url"
}
52+
53+
# Export function and variables for xargs: each key is handled by a fresh
# `bash -c`, which can only see process_file if it has been exported.
export -f process_file
export BUCKET
export EXPIRATION

echo "Listing files from $S3_PATH..."

# Get list of all files, filter for images, and process in parallel.
#
# - awk strips the leading columns instead of printing field 4, so keys that
#   contain spaces are kept whole. This assumes each listing line has the form
#   "<date> <time> <size> <key>"; lines that do not match are dropped.
# - Keys are handed to xargs NUL-delimited (-0) so quotes and backslashes in a
#   key are passed through literally rather than interpreted by xargs.
aws s3 ls "$S3_PATH/" --recursive \
  | awk 'sub(/^[^ ]+ +[^ ]+ +[0-9]+ /, "")' \
  | grep -iE "$IMAGE_PATTERN" \
  | tr '\n' '\0' \
  | xargs -0 -I {} -P "$PARALLEL_JOBS" bash -c 'process_file "$@"' _ {} "$BUCKET" "$EXPIRATION" \
  | tee "$OUTPUT_FILE"
# Must be captured immediately: the next command overwrites PIPESTATUS.
pipeline_status=("${PIPESTATUS[@]}")

# The pipeline's own exit status is tee's, so a failed listing would
# otherwise go unnoticed. Report it, but still print the summary below.
if ((pipeline_status[0] != 0)); then
  echo "Warning: 'aws s3 ls' exited with status ${pipeline_status[0]}; the listing may be empty or incomplete" >&2
fi

echo ""
echo "Done! Signed URLs written to $OUTPUT_FILE"
echo "Total images processed: $(wc -l < "$OUTPUT_FILE")"

0 commit comments

Comments
 (0)