Commits (changes shown from 6 of 21 commits)
ff01ea0  feat: Add metadata models package with dynamic schema download (devin-ai-integration[bot], Oct 21, 2025)
63930c6  refactor: Move metadata models to airbyte_cdk.test.models.connector_m… (devin-ai-integration[bot], Oct 21, 2025)
62902f6  refactor: Move models to generated subdirectory and add convenience i… (devin-ai-integration[bot], Oct 21, 2025)
3de3af0  refactor: Generate metadata models as single file with JSON schema ou… (devin-ai-integration[bot], Oct 21, 2025)
933d478  style: Apply ruff formatting to build script (devin-ai-integration[bot], Oct 21, 2025)
c89faab  docs: Move metadata models documentation to CONTRIBUTING.md (devin-ai-integration[bot], Oct 27, 2025)
6980060  Merge branch 'main' into devin/1760999875-add-metadata-models (aaronsteers, Oct 27, 2025)
a56208b  chore: revert unrelated format changes on other generated file (aaronsteers, Oct 27, 2025)
0f48425  Delete airbyte_cdk/test/models/connector_metadata/README.md (aaronsteers, Oct 27, 2025)
07d7014  docs: clean up docstring (merged content from `README.md`) (aaronsteers, Oct 27, 2025)
c63223a  feat: Replace HTTP downloads with sparse git clone for metadata schemas (devin-ai-integration[bot], Oct 27, 2025; see the sparse-clone sketch after this list)
da4371f  Revert accidental formatting of generated file declarative_component_… (devin-ai-integration[bot], Oct 27, 2025)
5373480  Add exclusions for auto-generated files in ruff and pre-commit configs (devin-ai-integration[bot], Oct 27, 2025)
7e4e3f4  Fix JSON schema consolidation to properly resolve references (devin-ai-integration[bot], Oct 27, 2025)
015a60e  Remove $schema and $id from definitions to fix IDE validation (devin-ai-integration[bot], Oct 27, 2025)
fe4b9cc  Refactor: Extract metadata generation into separate script (devin-ai-integration[bot], Oct 27, 2025)
66d4eeb  Move metadata generation to poe tasks instead of shell script (devin-ai-integration[bot], Oct 27, 2025)
23837eb  Replace Dagger with uvx in metadata generation script (devin-ai-integration[bot], Oct 27, 2025)
c686574  Simplify metadata generation: generate Python from JSON schema (devin-ai-integration[bot], Oct 27, 2025)
ba912fe  Fix schema consolidation per CodeRabbit feedback (devin-ai-integration[bot], Oct 27, 2025)
3c2a4f8  Add type annotations and fix formatting (devin-ai-integration[bot], Oct 27, 2025)
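
Commit c63223a replaces the httpx-based download (still visible in the 6-commit diff below) with a sparse git clone. That commit's code is not part of this diff view, so the following is only a rough sketch of the technique; the function name and destination handling are assumptions, not the PR's actual implementation:

```python
import subprocess
from pathlib import Path

AIRBYTE_REPO_URL = "https://github.com/airbytehq/airbyte.git"
SCHEMAS_SUBDIR = "airbyte-ci/connectors/metadata_service/lib/metadata_service/models/src"


def sparse_clone_metadata_schemas(dest: Path) -> Path:
    """Clone only the metadata schema directory instead of the whole monorepo."""
    subprocess.run(
        [
            "git", "clone",
            "--depth", "1",  # no history
            "--filter=blob:none",  # defer blob downloads until checkout
            "--sparse",  # start with a minimal working tree
            AIRBYTE_REPO_URL,
            str(dest),
        ],
        check=True,
    )
    # Materialize only the schema directory in the working tree.
    subprocess.run(
        ["git", "-C", str(dest), "sparse-checkout", "set", SCHEMAS_SUBDIR],
        check=True,
    )
    return dest / SCHEMAS_SUBDIR
```

Unlike the GitHub contents API used by `download_metadata_schemas` in the diff below, this approach needs no `GITHUB_TOKEN` and is not subject to API rate limits.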
144 changes: 86 additions & 58 deletions airbyte_cdk/sources/declarative/models/declarative_component_schema.py

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions airbyte_cdk/test/models/__init__.py
@@ -1,10 +1,16 @@
 # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
 """Models used for standard tests."""
 
+from airbyte_cdk.test.models.connector_metadata import (
+    ConnectorMetadataDefinitionV0,
+    ConnectorTestSuiteOptions,
+)
 from airbyte_cdk.test.models.outcome import ExpectedOutcome
 from airbyte_cdk.test.models.scenario import ConnectorTestScenario
 
 __all__ = [
+    "ConnectorMetadataDefinitionV0",
     "ConnectorTestScenario",
+    "ConnectorTestSuiteOptions",
     "ExpectedOutcome",
 ]
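
With these re-exports, the package root and the submodule expose the same objects; a quick, illustrative sanity check:

```python
from airbyte_cdk.test.models import ConnectorMetadataDefinitionV0
from airbyte_cdk.test.models.connector_metadata import (
    ConnectorMetadataDefinitionV0 as DirectImport,
)

# The convenience re-export is the same class object, not a copy.
assert ConnectorMetadataDefinitionV0 is DirectImport
```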
16 changes: 16 additions & 0 deletions airbyte_cdk/test/models/connector_metadata/README.md
@@ -0,0 +1,16 @@
# Airbyte Connector Metadata Models

This package contains Pydantic models for validating Airbyte connector `metadata.yaml` files.

## Usage

```python
from airbyte_cdk.test.models import ConnectorMetadataDefinitionV0
import yaml

metadata = ConnectorMetadataDefinitionV0(**yaml.safe_load(metadata_yaml))
```

## Regenerating Models

See the [Contributing Guide](../../../docs/CONTRIBUTING.md#regenerating-connector-metadata-models) for information on regenerating these models.
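
A slightly fuller variant of the README's usage example, with file loading and error handling; the connector path and the printed field are illustrative, and `ValidationError` comes from `pydantic.v1` because the generated models are post-processed to use the v1 compatibility layer:

```python
from pathlib import Path

import yaml
from pydantic.v1 import ValidationError  # generated models use the pydantic v1 compat layer

from airbyte_cdk.test.models import ConnectorMetadataDefinitionV0

# Hypothetical path to a connector's metadata.yaml
metadata_path = Path("airbyte-integrations/connectors/source-faker/metadata.yaml")

try:
    metadata = ConnectorMetadataDefinitionV0(**yaml.safe_load(metadata_path.read_text()))
except ValidationError as exc:
    print(f"metadata.yaml failed validation:\n{exc}")
else:
    print(f"Validated metadata for {metadata.data.name}")
```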
13 changes: 13 additions & 0 deletions airbyte_cdk/test/models/connector_metadata/__init__.py
@@ -0,0 +1,13 @@
"""Connector metadata models for validation and testing.

These models are auto-generated from JSON schemas in the airbytehq/airbyte repository.
For information on regenerating these models, see the Contributing Guide:
https://github.com/airbytehq/airbyte-python-cdk/blob/main/docs/CONTRIBUTING.md#regenerating-connector-metadata-models
"""

from .generated.models import ConnectorMetadataDefinitionV0, ConnectorTestSuiteOptions

__all__ = [
    "ConnectorMetadataDefinitionV0",
    "ConnectorTestSuiteOptions",
]
Empty file.
279 changes: 239 additions & 40 deletions bin/generate_component_manifest_files.py
@@ -1,17 +1,25 @@
 # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
 
+import json
+import os
 import re
 import sys
+import tempfile
 from glob import glob
 from pathlib import Path
 
 import anyio
 import dagger
+import httpx
+import yaml
 
 PYTHON_IMAGE = "python:3.10"
 LOCAL_YAML_DIR_PATH = "airbyte_cdk/sources/declarative"
 LOCAL_OUTPUT_DIR_PATH = "airbyte_cdk/sources/declarative/models"
 
+METADATA_SCHEMAS_GITHUB_URL = "https://api.github.com/repos/airbytehq/airbyte/contents/airbyte-ci/connectors/metadata_service/lib/metadata_service/models/src"
+METADATA_SCHEMAS_RAW_URL_BASE = "https://raw.githubusercontent.com/airbytehq/airbyte/master/airbyte-ci/connectors/metadata_service/lib/metadata_service/models/src"
+LOCAL_METADATA_OUTPUT_DIR_PATH = "airbyte_cdk/test/models/connector_metadata/generated"
+
 PIP_DEPENDENCIES = [
     "datamodel_code_generator==0.26.3",
@@ -22,13 +30,58 @@ def get_all_yaml_files_without_ext() -> list[str]:
     return [Path(f).stem for f in glob(f"{LOCAL_YAML_DIR_PATH}/*.yaml")]
 
 
-def generate_init_module_content() -> str:
+def get_all_yaml_files_from_dir(directory: str) -> list[str]:
+    return [Path(f).stem for f in glob(f"{directory}/*.yaml")]
+
+
+def generate_init_module_content(yaml_files: list[str]) -> str:
     header = "# generated by bin/generate_component_manifest_files.py\n"
-    for module_name in get_all_yaml_files_without_ext():
+    for module_name in yaml_files:
         header += f"from .{module_name} import *\n"
     return header
 
 
+async def download_metadata_schemas(temp_dir: Path) -> list[str]:
+    """Download metadata schema YAML files from GitHub to a temporary directory."""
+    token = os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN")
+    headers = {
+        "User-Agent": "airbyte-python-cdk-build",
+        "Accept": "application/vnd.github.v3+json",
+    }
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+
+    async with httpx.AsyncClient(headers=headers, timeout=30.0) as client:
+        try:
+            response = await client.get(METADATA_SCHEMAS_GITHUB_URL)
+            response.raise_for_status()
+            files_info = response.json()
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 403:
+                print(
+                    "Warning: GitHub API rate limit exceeded. Provide GITHUB_TOKEN to authenticate.",
+                    file=sys.stderr,
+                )
+                raise
+            raise
+
+        yaml_files = []
+        for file_info in files_info:
+            if file_info["name"].endswith(".yaml"):
+                file_name = file_info["name"]
+                file_url = f"{METADATA_SCHEMAS_RAW_URL_BASE}/{file_name}"
+
+                print(f"Downloading {file_name}...", file=sys.stderr)
+                file_response = await client.get(file_url)
+                file_response.raise_for_status()
+
+                file_path = temp_dir / file_name
+                file_path.write_text(file_response.text)
+                yaml_files.append(Path(file_name).stem)
+
+        return yaml_files
+
+
 def replace_base_model_for_classes_with_deprecated_fields(post_processed_content: str) -> str:
     """
     Replace the base model for classes with deprecated fields.
@@ -110,49 +163,195 @@ async def post_process_codegen(codegen_container: dagger.Container):
     return codegen_container
 
 
-async def main():
-    init_module_content = generate_init_module_content()
+async def post_process_metadata_models(codegen_container: dagger.Container):
+    """Post-process metadata models to use pydantic.v1 compatibility layer."""
+    codegen_container = codegen_container.with_exec(
+        ["mkdir", "/generated_post_processed"], use_entrypoint=True
+    )
+    for generated_file in await codegen_container.directory("/generated").entries():
+        if generated_file.endswith(".py"):
+            original_content = await codegen_container.file(
+                f"/generated/{generated_file}"
+            ).contents()
 
-    async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as dagger_client:
-        codegen_container = (
-            dagger_client.container()
-            .from_(PYTHON_IMAGE)
-            .with_exec(["mkdir", "/generated"], use_entrypoint=True)
-            .with_exec(["pip", "install", " ".join(PIP_DEPENDENCIES)], use_entrypoint=True)
-            .with_mounted_directory(
-                "/yaml", dagger_client.host().directory(LOCAL_YAML_DIR_PATH, include=["*.yaml"])
-            )
-            .with_new_file("/generated/__init__.py", contents=init_module_content)
-        )
+            post_processed_content = original_content.replace("from pydantic", "from pydantic.v1")
+
+            codegen_container = codegen_container.with_new_file(
+                f"/generated_post_processed/{generated_file}", contents=post_processed_content
+            )
+    return codegen_container
+
+
+async def generate_models_from_schemas(
+    dagger_client: dagger.Client,
+    yaml_dir_path: str,
+    output_dir_path: str,
+    yaml_files: list[str],
+    post_process: bool = False,
+    metadata_models: bool = False,
+) -> None:
+    """Generate Pydantic models from YAML schemas using datamodel-codegen."""
+    init_module_content = generate_init_module_content(yaml_files)
+
+    codegen_container = (
+        dagger_client.container()
+        .from_(PYTHON_IMAGE)
+        .with_exec(["mkdir", "/generated"], use_entrypoint=True)
+        .with_exec(["pip", "install", " ".join(PIP_DEPENDENCIES)], use_entrypoint=True)
+        .with_mounted_directory(
+            "/yaml", dagger_client.host().directory(yaml_dir_path, include=["*.yaml"])
+        )
+        .with_new_file("/generated/__init__.py", contents=init_module_content)
+    )
 
-        for yaml_file in get_all_yaml_files_without_ext():
-            codegen_container = codegen_container.with_exec(
-                [
-                    "datamodel-codegen",
-                    "--input",
-                    f"/yaml/{yaml_file}.yaml",
-                    "--output",
-                    f"/generated/{yaml_file}.py",
-                    "--disable-timestamp",
-                    "--enum-field-as-literal",
-                    "one",
-                    "--set-default-enum-member",
-                    "--use-double-quotes",
-                    "--remove-special-field-name-prefix",
-                    # allow usage of the extra key such as `deprecated`, etc.
-                    "--field-extra-keys",
-                    # account the `deprecated` flag provided for the field.
-                    "deprecated",
-                    # account the `deprecation_message` provided for the field.
-                    "deprecation_message",
-                ],
-                use_entrypoint=True,
-            )
+    for yaml_file in yaml_files:
+        codegen_container = codegen_container.with_exec(
+            [
+                "datamodel-codegen",
+                "--input",
+                f"/yaml/{yaml_file}.yaml",
+                "--output",
+                f"/generated/{yaml_file}.py",
+                "--disable-timestamp",
+                "--enum-field-as-literal",
+                "one",
+                "--set-default-enum-member",
+                "--use-double-quotes",
+                "--remove-special-field-name-prefix",
+                "--field-extra-keys",
+                "deprecated",
+                "deprecation_message",
+            ],
+            use_entrypoint=True,
+        )
 
-        await (
-            (await post_process_codegen(codegen_container))
-            .directory("/generated_post_processed")
-            .export(LOCAL_OUTPUT_DIR_PATH)
-        )
+    if post_process:
+        codegen_container = await post_process_codegen(codegen_container)
+        await codegen_container.directory("/generated_post_processed").export(output_dir_path)
+    elif metadata_models:
+        codegen_container = await post_process_metadata_models(codegen_container)
+        await codegen_container.directory("/generated_post_processed").export(output_dir_path)
+    else:
+        await codegen_container.directory("/generated").export(output_dir_path)
+
+
+def consolidate_yaml_schemas_to_json(yaml_dir_path: Path, output_json_path: str) -> None:
+    """Consolidate all YAML schemas into a single JSON schema file."""
+    schemas = {}
+
+    for yaml_file in yaml_dir_path.glob("*.yaml"):
+        schema_name = yaml_file.stem
+        with yaml_file.open("r") as f:
+            schema_content = yaml.safe_load(f)
+        schemas[schema_name] = schema_content
+
+    # Find the main schema (ConnectorMetadataDefinitionV0)
+    main_schema = schemas.get("ConnectorMetadataDefinitionV0")
+
+    if main_schema:
+        # Create a consolidated schema with definitions
+        consolidated = {
+            "$schema": main_schema.get("$schema", "http://json-schema.org/draft-07/schema#"),
+            "title": "Connector Metadata Schema",
+            "description": "Consolidated JSON schema for Airbyte connector metadata validation",
+            **main_schema,
+            "definitions": {},
+        }
+
+        # Add all other schemas as definitions
+        for schema_name, schema_content in schemas.items():
+            if schema_name != "ConnectorMetadataDefinitionV0":
+                consolidated["definitions"][schema_name] = schema_content
+
+        Path(output_json_path).write_text(json.dumps(consolidated, indent=2))
+        print(f"Generated consolidated JSON schema: {output_json_path}", file=sys.stderr)
+    else:
+        print(
+            "Warning: ConnectorMetadataDefinitionV0 not found, generating simple consolidation",
+            file=sys.stderr,
+        )
+        Path(output_json_path).write_text(json.dumps(schemas, indent=2))
+
+
+async def generate_metadata_models_single_file(
+    dagger_client: dagger.Client,
+    yaml_dir_path: str,
+    output_file_path: str,
+) -> None:
+    """Generate all metadata models into a single Python file."""
+    codegen_container = (
+        dagger_client.container()
+        .from_(PYTHON_IMAGE)
+        .with_exec(["mkdir", "-p", "/generated"], use_entrypoint=True)
+        .with_exec(["pip", "install", " ".join(PIP_DEPENDENCIES)], use_entrypoint=True)
+        .with_mounted_directory(
+            "/yaml", dagger_client.host().directory(yaml_dir_path, include=["*.yaml"])
+        )
+    )
+
+    codegen_container = codegen_container.with_exec(
+        [
+            "datamodel-codegen",
+            "--input",
+            "/yaml",
+            "--output",
+            "/generated/models.py",
+            "--disable-timestamp",
+            "--enum-field-as-literal",
+            "one",
+            "--set-default-enum-member",
+            "--use-double-quotes",
+            "--remove-special-field-name-prefix",
+            "--field-extra-keys",
+            "deprecated",
+            "deprecation_message",
+        ],
+        use_entrypoint=True,
+    )
+
+    original_content = await codegen_container.file("/generated/models.py").contents()
+    post_processed_content = original_content.replace("from pydantic", "from pydantic.v1")
+
+    codegen_container = codegen_container.with_new_file(
+        "/generated/models_processed.py", contents=post_processed_content
+    )
+
+    await codegen_container.file("/generated/models_processed.py").export(output_file_path)
+
+
+async def main():
+    async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as dagger_client:
+        print("Generating declarative component models...", file=sys.stderr)
+        declarative_yaml_files = get_all_yaml_files_without_ext()
+        await generate_models_from_schemas(
+            dagger_client=dagger_client,
+            yaml_dir_path=LOCAL_YAML_DIR_PATH,
+            output_dir_path=LOCAL_OUTPUT_DIR_PATH,
+            yaml_files=declarative_yaml_files,
+            post_process=True,
+        )
+
+        print("\nGenerating metadata models...", file=sys.stderr)
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_path = Path(temp_dir)
+            await download_metadata_schemas(temp_path)
+
+            output_dir = Path(LOCAL_METADATA_OUTPUT_DIR_PATH)
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+            print("Generating single Python file with all models...", file=sys.stderr)
+            output_file = str(output_dir / "models.py")
+            await generate_metadata_models_single_file(
+                dagger_client=dagger_client,
                yaml_dir_path=str(temp_path),
+                output_file_path=output_file,
+            )
+
+            print("Generating consolidated JSON schema...", file=sys.stderr)
+            json_schema_file = str(output_dir / "metadata_schema.json")
+            consolidate_yaml_schemas_to_json(temp_path, json_schema_file)
+
+    print("\nModel generation complete!", file=sys.stderr)
 
 
 anyio.run(main)
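
The consolidated `metadata_schema.json` targets IDE validation, but it can also back programmatic checks. A minimal sketch, assuming the `jsonschema` package (not a dependency this PR adds) and a hypothetical `metadata.yaml` path:

```python
import json
from pathlib import Path

import jsonschema  # assumption: installed separately; not added by this PR
import yaml

schema = json.loads(
    Path("airbyte_cdk/test/models/connector_metadata/generated/metadata_schema.json").read_text()
)
metadata = yaml.safe_load(Path("metadata.yaml").read_text())  # hypothetical input file

# Raises jsonschema.exceptions.ValidationError on the first violation (draft-07 semantics).
jsonschema.validate(instance=metadata, schema=schema)
```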