Skip to content

Commit 4cda662

Browse files
committed
Add automated Hugging Face collection manager
Implements a nightly job to automatically discover and add openenv-tagged Docker Spaces to the Environment Hub collection.

Changes:
- Add scripts/manage_hf_collection.py: Python script to discover Docker Spaces with 'openenv' tag and add them to the collection
- Add .github/workflows/manage-hf-collection.yml: GitHub Actions workflow that runs the script nightly at midnight UTC and on script changes
- Add comprehensive test suite in tests/scripts/ with 20+ unit tests using mocked API calls
- Reorganize tests to mirror source structure (tests/scripts/ for scripts/)
- Fix collection slug to use full identifier with hash

Features:
- Automatically discovers Docker Spaces tagged with 'openenv'
- Prevents duplicates by checking existing collection items
- Supports --dry-run and --verbose flags for local testing
- Idempotent (safe to run multiple times)
- Comprehensive error handling and logging

Architecture:
- Uses GitHub Actions scheduled workflow (simpler than HF Jobs)
- Runs on free GitHub Actions infrastructure
- Easy to test locally and view logs in GitHub Actions UI
- Single source of truth (script in repo, no duplication)

Testing:
- Full test coverage with mocked HuggingFace API calls
- Tests for success, failure, duplicate handling, and error scenarios
- Can be tested locally with: python scripts/manage_hf_collection.py --dry-run
1 parent 1a3902e commit 4cda662

File tree

4 files changed

+784
-0
lines changed

4 files changed

+784
-0
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
name: Update HF Collection

on:
  # Run nightly at midnight UTC
  schedule:
    - cron: '0 0 * * *'

  # Trigger when the management script is updated
  push:
    branches:
      - main
    paths:
      - 'scripts/manage_hf_collection.py'
      - '.github/workflows/manage-hf-collection.yml'

  # Allow manual triggering
  workflow_dispatch:

jobs:
  update-collection:
    runs-on: ubuntu-latest
    permissions:
      contents: read

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.12'

      - name: Install dependencies
        # The requirement must be quoted: an unquoted ">=" is parsed by the
        # shell as an output redirection, which would install an unconstrained
        # huggingface-hub and write pip's output to a file named "=0.20.0".
        run: |
          pip install "huggingface-hub>=0.20.0"

      - name: Run collection manager
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/manage_hf_collection.py

      - name: Summary
        if: success()
        run: |
          echo "## Collection Updated ✅" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "Successfully ran the collection manager." >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "**Details:**" >> "$GITHUB_STEP_SUMMARY"
          echo "- Scheduled to run daily at midnight UTC" >> "$GITHUB_STEP_SUMMARY"
          echo "- Task: Discover and add openenv-tagged Docker Spaces" >> "$GITHUB_STEP_SUMMARY"
          echo "- Collection: [openenv/environment-hub](https://huggingface.co/collections/openenv/environment-hub-68f16377abea1ea114fa0743)" >> "$GITHUB_STEP_SUMMARY"

scripts/manage_hf_collection.py

Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Hugging Face Collection Manager for OpenEnv
4+
5+
This script automatically discovers Docker Spaces tagged with 'openenv' on Hugging Face
6+
and adds them to the Environment Hub collection if they're not already present.
7+
8+
Usage:
9+
python scripts/manage_hf_collection.py [--dry-run] [--verbose]
10+
11+
Environment Variables:
12+
HF_TOKEN: Required. Your Hugging Face API token with write access to collections.
13+
"""
14+
15+
import argparse
16+
import logging
17+
import os
18+
import sys
19+
from typing import Set, List
20+
from huggingface_hub import HfApi, list_spaces
21+
from huggingface_hub.utils import HfHubHTTPError
22+
23+
24+
# Constants
# Full collection identifier, including the trailing hash suffix.
COLLECTION_SLUG = "openenv/environment-hub-68f16377abea1ea114fa0743"
# Tag a Space must carry to be considered for the collection.
TAG_FILTER = "openenv"
# Space SDK of interest.
SPACE_TYPE = "docker"

# Configure logging: timestamped INFO-level records on the root logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
36+
37+
38+
def setup_api() -> HfApi:
    """
    Build a Hugging Face API client from the HF_TOKEN environment variable.

    The token is checked with a ``whoami`` call before the client is handed
    back, and the reported account name is logged.

    Returns:
        HfApi: API client constructed with the token

    Raises:
        SystemExit: With status 1 if HF_TOKEN is unset or empty, or if the
            ``whoami`` check fails for any reason
    """
    token = os.environ.get('HF_TOKEN')

    if not token:
        for message in (
            "HF_TOKEN environment variable is not set!",
            "Please set it with: export HF_TOKEN=your_token_here",
        ):
            logger.error(message)
        sys.exit(1)

    logger.info("Authenticating with Hugging Face...")
    client = HfApi(token=token)

    try:
        identity = client.whoami()
        logger.info(f"✓ Authenticated as: {identity['name']}")
    except Exception as exc:
        logger.error(f"Failed to authenticate with Hugging Face: {exc}")
        sys.exit(1)

    return client
66+
67+
68+
def get_collection_spaces(api: HfApi) -> Set[str]:
    """
    Fetch the Environment Hub collection and return the Spaces it contains.

    Only collection items whose type is "space" are returned; other item
    types are ignored.

    Args:
        api: Authenticated HfApi client

    Returns:
        Set of space IDs (in format "owner/space-name")

    Raises:
        SystemExit: With status 1 if the collection cannot be fetched
    """
    logger.info(f"Fetching current collection: {COLLECTION_SLUG}")

    try:
        collection = api.get_collection(COLLECTION_SLUG)

        # Keep only the Space entries of the collection
        members = {
            entry.item_id
            for entry in collection.items
            if entry.item_type == "space"
        }

        logger.info(f"✓ Found {len(members)} spaces in collection")
        return members

    except HfHubHTTPError as exc:
        if exc.response.status_code != 404:
            logger.error(f"Error fetching collection: {exc}")
        else:
            logger.error(f"Collection not found: {COLLECTION_SLUG}")
            logger.error("Please ensure the collection exists and you have access to it")
        sys.exit(1)
    except Exception as exc:
        logger.error(f"Unexpected error fetching collection: {exc}")
        sys.exit(1)
102+
103+
104+
def discover_openenv_spaces(api: HfApi) -> List[str]:
    """
    Find all Docker Spaces tagged with 'openenv'.

    Candidates are listed by tag, then each one is re-checked through
    ``api.space_info`` so that only Spaces whose SDK equals SPACE_TYPE and
    whose tag list contains TAG_FILTER are returned. A Space whose details
    cannot be fetched is logged as a warning and skipped.

    Args:
        api: Authenticated HfApi client

    Returns:
        List of space IDs (in format "owner/space-name")

    Raises:
        SystemExit: With status 1 if listing the Spaces fails
    """
    logger.info(f"Searching for Docker Spaces with tag '{TAG_FILTER}'...")

    try:
        docker_spaces_with_tag = []

        # `filter` selects Spaces by tag. `search` is a free-text match and
        # would miss tagged Spaces whose name does not contain the string.
        # The listing is consumed lazily; there is no need to build a list.
        for space in list_spaces(filter=TAG_FILTER, full=False):
            # Get full space info to verify the SDK and the tags
            try:
                space_info = api.space_info(space.id)
            except Exception as e:
                logger.warning(f"Could not fetch info for space {space.id}: {e}")
                continue

            # `tags` may be missing or None; treat both as "no tags"
            tags = getattr(space_info, 'tags', None) or []
            sdk = getattr(space_info, 'sdk', None)
            if sdk == SPACE_TYPE and TAG_FILTER in tags:
                docker_spaces_with_tag.append(space.id)

        logger.info(f"✓ Discovered {len(docker_spaces_with_tag)} Docker spaces with tag '{TAG_FILTER}'")

        return docker_spaces_with_tag

    except Exception as e:
        logger.error(f"Error discovering spaces: {e}")
        sys.exit(1)
146+
147+
148+
def add_spaces_to_collection(
    api: HfApi,
    space_ids: List[str],
    dry_run: bool = False
) -> int:
    """
    Add each given Space to the Environment Hub collection.

    A 409 response is treated as "already in collection" and only logged as
    a warning; it counts neither as added nor as failed. Any other error is
    logged and counted as a failure, and processing continues with the next
    Space.

    Args:
        api: Authenticated HfApi client
        space_ids: List of space IDs to add
        dry_run: If True, only log what would be added without making changes

    Returns:
        Number of spaces added (or would be added in dry-run mode)
    """
    if not space_ids:
        logger.info("No new spaces to add")
        return 0

    successes = 0
    failures = 0

    for candidate in space_ids:
        if dry_run:
            logger.info(f"[DRY RUN] Would add space: {candidate}")
            successes += 1
            continue

        try:
            logger.info(f"Adding space to collection: {candidate}")
            api.add_collection_item(
                collection_slug=COLLECTION_SLUG,
                item_id=candidate,
                item_type="space"
            )
            logger.info(f"✓ Successfully added: {candidate}")
            successes += 1
        except HfHubHTTPError as exc:
            if exc.response.status_code != 409:
                logger.error(f"Failed to add {candidate}: {exc}")
                failures += 1
            else:
                # Space already in collection (race condition)
                logger.warning(f"Space already in collection: {candidate}")
        except Exception as exc:
            logger.error(f"Unexpected error adding {candidate}: {exc}")
            failures += 1

    if failures > 0:
        logger.warning(f"Failed to add {failures} spaces")

    return successes
200+
201+
202+
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser exposing --dry-run and --verbose."""
    cli = argparse.ArgumentParser(
        description="Manage Hugging Face Environment Hub collection for OpenEnv spaces",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run in dry-run mode to preview changes
  python scripts/manage_hf_collection.py --dry-run --verbose

  # Run for real to add spaces to collection
  python scripts/manage_hf_collection.py

  # View verbose output
  python scripts/manage_hf_collection.py --verbose

Environment Variables:
  HF_TOKEN: Required. Your Hugging Face API token.
"""
    )
    cli.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview changes without modifying the collection'
    )
    cli.add_argument(
        '--verbose',
        action='store_true',
        help='Enable verbose logging output'
    )
    return cli


def main():
    """
    Run the collection update from the command line.

    Parses the CLI flags, authenticates, reads the current collection,
    discovers tagged Spaces, adds the ones not yet in the collection (or only
    reports them in dry-run mode) and logs a summary.
    """
    args = _build_arg_parser().parse_args()
    banner = "=" * 60

    # Set logging level
    if args.verbose:
        logger.setLevel(logging.DEBUG)
        logger.debug("Verbose logging enabled")

    if args.dry_run:
        logger.info(banner)
        logger.info("DRY RUN MODE - No changes will be made")
        logger.info(banner)

    # Step 1: Setup API
    api = setup_api()

    # Step 2: Get current collection spaces
    in_collection = get_collection_spaces(api)
    if args.verbose:
        logger.debug(f"Current spaces in collection: {sorted(in_collection)}")

    # Step 3: Discover all openenv spaces
    discovered = discover_openenv_spaces(api)
    if args.verbose:
        logger.debug(f"Discovered spaces: {sorted(discovered)}")

    # Step 4: Find new spaces not yet in collection (discovery order is kept)
    pending = [space_id for space_id in discovered if space_id not in in_collection]

    logger.info(banner)
    logger.info("Summary:")
    logger.info(f"  Total spaces in collection: {len(in_collection)}")
    logger.info(f"  Total spaces discovered: {len(discovered)}")
    logger.info(f"  New spaces to add: {len(pending)}")
    logger.info(banner)

    if pending:
        logger.info("New spaces found:")
        for space_id in pending:
            logger.info(f"  - {space_id}")

    # Step 5: Add new spaces to collection
    added_total = add_spaces_to_collection(api, pending, dry_run=args.dry_run)

    # Final summary
    logger.info(banner)
    if not args.dry_run:
        logger.info(f"✓ Successfully added {added_total} new spaces to collection")
    else:
        logger.info(f"[DRY RUN] Would add {added_total} new spaces to collection")
    logger.info(banner)

    logger.info(f"Collection URL: https://huggingface.co/collections/{COLLECTION_SLUG}")
289+
290+
291+
if __name__ == "__main__":
292+
main()
293+

tests/scripts/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""
2+
Tests for scripts in the scripts/ directory.
3+
"""
4+

0 commit comments

Comments
 (0)