From bffcf86e099c7d6b597ef4045623d452583aecb4 Mon Sep 17 00:00:00 2001 From: Flook Peter Date: Thu, 9 Oct 2025 08:59:13 +1100 Subject: [PATCH 01/11] Add in codeowners file to get TSC approval for PRs --- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..0832a08 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @bitol-io/tsc From ef3a7b969d92f5d7ec2504a67f3cd4b03244ecc4 Mon Sep 17 00:00:00 2001 From: johnroch <78982434+johnhroch@users.noreply.github.com> Date: Thu, 18 Sep 2025 22:46:55 +0200 Subject: [PATCH 02/11] doc: added tags into fundamentals def --- docs/README.md | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/README.md b/docs/README.md index 13f4e5b..a925f6d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -65,25 +65,25 @@ tags: ['finance'] ### Definitions -| Key | UX label | Required | Description | -|--------------------------------------|---------------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| apiVersion | Standard version | Yes | Version of the standard used to build data contract. Default value is `v3.0.2`. | -| kind | Kind | Yes | The kind of file this is. Valid value is `DataContract`. | -| id | ID | Yes | A unique identifier used to reduce the risk of dataset name collisions, such as a UUID. | -| name | Name | No | Name of the data contract. | -| version | Version | Yes | Current version of the data contract. | -| status | Status | Yes | Current status of the data contract. Examples are "proposed", "draft", "active", "deprecated", "retired". | -| tenant | Tenant | No | Indicates the property the data is primarily associated with. Value is case insensitive. | -| domain | Domain | No | Name of the logical data domain. | -| dataProduct | Data Product | No | Name of the data product. | -| authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the data contract. | -| description | Description | No | Object containing the descriptions. | -| description.purpose | Purpose | No | Intended purpose for the provided data. | -| description.limitations | Limitations | No | Technical, compliance, and legal limitations for data use. | -| description.usage | Usage | No | Recommended usage of the data. | -| description.authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the dataset; examples would be a link to privacy statement, terms and conditions, license agreements, data catalog, or another tool. | -| description.customProperties | Custom Properties | No | Custom properties that are not part of the standard. | - +| Key | UX label | Required | Description | +|--------------------------------------|---------------------------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| apiVersion | Standard version | Yes | Version of the standard used to build data contract. Default value is `v3.0.2`. | +| kind | Kind | Yes | The kind of file this is. Valid value is `DataContract`. 
| +| id | ID | Yes | A unique identifier used to reduce the risk of dataset name collisions, such as a UUID. | +| name | Name | No | Name of the data contract. | +| version | Version | Yes | Current version of the data contract. | +| status | Status | Yes | Current status of the data contract. Examples are "proposed", "draft", "active", "deprecated", "retired". | +| tenant | Tenant | No | Indicates the property the data is primarily associated with. Value is case insensitive. | +| tags | Tags | No | A list of tags that may be assigned to the elements (object or property); the tags keyword may appear at any level. Tags may be used to better categorize an element. For example, `finance`, `sensitive`, `employee_record`. | +| domain | Domain | No | Name of the logical data domain. | +| dataProduct | Data Product | No | Name of the data product. | +| authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the data contract. | +| description | Description | No | Object containing the descriptions. | +| description.purpose | Purpose | No | Intended purpose for the provided data. | +| description.limitations | Limitations | No | Technical, compliance, and legal limitations for data use. | +| description.usage | Usage | No | Recommended usage of the data. | +| description.authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the dataset; examples would be a link to privacy statement, terms and conditions, license agreements, data catalog, or another tool. | +| description.customProperties | Custom Properties | No | Custom properties that are not part of the standard. | ## Schema From b859e7610571a9495298743d120d249f7bca9de2 Mon Sep 17 00:00:00 2001 From: pkoper Date: Mon, 13 Oct 2025 14:37:15 +0100 Subject: [PATCH 03/11] chore(DEV SETUP): added dev_setup bash file which automates setting up dev environment --- .gitignore | 8 +- src/script/dev_setup.sh | 251 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 src/script/dev_setup.sh diff --git a/.gitignore b/.gitignore index a0889dc..ba2a045 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,10 @@ docs/changelog.md docs/contributing.md docs/home.md docs/vendors.md -docs/examples/**/*.md \ No newline at end of file +docs/examples/**/*.md + +# dev setup exclusion to keep the repository as simple as possible +# these would normally be not included as part of `.gitignore` +.venv/ +.pre-commit-config.yaml +.markdownlint.json diff --git a/src/script/dev_setup.sh b/src/script/dev_setup.sh new file mode 100644 index 0000000..45cde48 --- /dev/null +++ b/src/script/dev_setup.sh @@ -0,0 +1,251 @@ +#!/usr/bin/env bash +set -o pipefail + +# 🎨 Colors +NC='\033[0m' # No Color +CYAN='\033[0;36m' +YELLOW='\033[0;33m' +GREEN='\033[0;32m' +RED='\033[0;31m' +MAGENTA='\033[0;35m' + +# 🌱 Default virtual environment directory +VENV_DIR=".venv" + +# ----------------------------- +# 🎯 Logging Functions +# ----------------------------- +print_info() { + echo -e "πŸ’‘ [${CYAN}INFO${NC}] ${CYAN}$1${NC}"; + } +print_task() { + echo -e "⚑ [${YELLOW}TASK${NC}] ${YELLOW}$1${NC}"; + } +print_pass() { + echo -e "βœ… [${GREEN}PASS${NC}] ${GREEN}$1${NC}"; + } +print_warning() { + echo -e "⚠️ [${MAGENTA}WARN${NC}] ${MAGENTA}$1${NC}"; + } +print_error() { + echo -e "❌ [${RED}FAIL${NC}] ${RED}$1${NC}"; + } + +# ----------------------------- +# 🐍 Virtual Environment Check +# ----------------------------- 
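+# Note: activating from within a script only affects the script's own shell;
+# the pip installs below still land in the venv, but the caller's shell is
+# left unchanged unless this script is sourced rather than executed.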
+virtual_environment_check() { + print_info "Checking virtual environment..." + + if [[ -d "$VENV_DIR" && -f "$VENV_DIR/bin/activate" ]]; then + if [[ -n "${VIRTUAL_ENV:-}" ]]; then + print_pass "Virtual environment found and active." + else + print_info "Virtual environment found but not active." + print_task "Activating..." + source "$VENV_DIR/bin/activate" + fi + else + print_warning "No virtual environment found." + print_task "Creating and activating..." + python3 -m venv "$VENV_DIR" + source "$VENV_DIR/bin/activate" + fi +} + +# ----------------------------- +# πŸ”§ Pip Version Check +# ----------------------------- +pip_current_version_check() { + CURRENT_VERSION=$(pip --version | grep -oP '(?<=pip )\d+(\.\d+)+') + [[ -z "$CURRENT_VERSION" ]] && print_error "Could not extract pip version." +} + +pip_latest_version_check() { + DRY_RUN_OUTPUT=$(python3 -m pip install --upgrade pip --dry-run 2>/dev/null) + LATEST_VERSION=$(echo "$DRY_RUN_OUTPUT" | grep -oP 'pip-[0-9]+\.[0-9]+(\.[0-9]+)?' | head -n1 | tr -d 'pip-') + [[ -z "$LATEST_VERSION" ]] && LATEST_VERSION=$(echo "$DRY_RUN_OUTPUT" | grep -oP '\([0-9]+\.[0-9]+(\.[0-9]+)?\)' | head -n1 | tr -d '()') + [[ -z "$LATEST_VERSION" ]] && print_error "Could not determine the latest pip version." +} + +pip_status_check() { + print_info "Checking pip version..." + print_info "Current: ${CURRENT_VERSION} | Latest: ${LATEST_VERSION}" + + if [[ "$CURRENT_VERSION" == "$LATEST_VERSION" ]]; then + print_pass "pip is up to date." + else + print_warning "pip is outdated." + print_task "Updating pip..." + python3 -m pip install --upgrade pip + fi +} + +# ----------------------------- +# πŸ”„ Pre-commit Check +# ----------------------------- +pre_commit_status_check() { + print_info "Checking pre-commit installation..." + if command -v pre-commit >/dev/null 2>&1; then + print_pass "pre-commit is installed." + else + print_warning "pre-commit is missing." + print_task "Installing pre-commit..." + pip install pre-commit + fi +} + +pre_commit_current_version_check() { + CURRENT_VERSION=$(pre-commit --version | grep -oP '(?<=pre-commit )\d+(\.\d+)+') +} + +pre_commit_latest_version_check() { + DRY_RUN_OUTPUT=$(pip install pre-commit --upgrade --dry-run 2>/dev/null) + LATEST_VERSION=$(echo "$DRY_RUN_OUTPUT" | grep -oP 'commit-[0-9]+\.[0-9]+(\.[0-9]+)?' | head -n1 | tr -d 'commit-') + [[ -z "$LATEST_VERSION" ]] && LATEST_VERSION=$(echo "$DRY_RUN_OUTPUT" | grep -oP '\([0-9]+\.[0-9]+(\.[0-9]+)?\)' | head -n1 | tr -d '()') + [[ -z "$LATEST_VERSION" ]] && print_error "Could not determine the latest pre-commit version." +} + +pre_commit_version_check() { + print_info "Checking pre-commit version..." + print_info "Current: ${CURRENT_VERSION} | Latest: ${LATEST_VERSION}" + + if [[ "$CURRENT_VERSION" == "$LATEST_VERSION" ]]; then + print_pass "pre-commit is up to date." + else + print_warning "pre-commit is outdated." + print_task "Updating..." 
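+        # `pre-commit autoupdate` only refreshes hook revisions, so the tool
+        # itself (distributed via PyPI) is upgraded through pip instead.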
+ pip install --upgrade pre-commit + fi +} + +# ----------------------------- +# πŸ“„ Pre Commit Config File Creation +# ----------------------------- +pre_commit_config_create() { + cat < .pre-commit-config.yaml +default_stages: [pre-commit, manual] + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + args: ["--unsafe"] + - id: check-added-large-files + + - repo: https://github.com/tcort/markdown-link-check + rev: v3.13.7 + hooks: + - id: markdown-link-check + args: [-q] + + - repo: https://github.com/igorshubovych/markdownlint-cli + rev: v0.45.0 + hooks: + - id: markdownlint + args: ["--ignore", "CHANGELOG.md", "--fix"] +EOF +} + +# ----------------------------- +# πŸ“„ Markdown Lint Config File Creation +# ----------------------------- +markdownlint_create() { + cat < .markdownlint.json +{ + "comment": "Markdown Lint Rules", + "default": true, + "MD007": {"indent": 4}, + "MD013": false, + "MD024": false, + "MD025": {"front_matter_title": ""}, + "MD029": {"style": "one_or_ordered"}, + "MD033": false +} +EOF +} + +# ----------------------------- +# πŸ“„ Commit Lint Config File Creation +# ----------------------------- +commitlintrc_create() { + cat < .commitlintrc.json +{ + "rules": { + "body-leading-blank": [1, "always"], + "footer-leading-blank": [1, "always"], + "header-max-length": [2, "always", 72], + "scope-case": [2, "always", "upper-case"], + "scope-empty": [2, "never"], + "subject-case": [2, "never", ["start-case", "pascal-case", "upper-case"]], + "subject-empty": [2, "never"], + "subject-full-stop": [2, "never", "."], + "type-case": [2, "always", "lower-case"], + "type-empty": [2, "never"], + "type-enum": [2, "always", ["build","chore","ci","docs","feat","fix","perf","refactor","revert","style","test"]] + } +} +EOF +} + +# ----------------------------- +# πŸ”§ Config Files Checks +# ----------------------------- +commitlintrc_file_check() { + print_info "Checking .commitlintrc.json..." + [[ -f ".commitlintrc.json" ]] && print_pass "Already exists." || { print_task "Creating..."; commitlintrc_create; } +} + +markdownlint_file_check() { + print_info "Checking .markdownlint.json..." + [[ -f ".markdownlint.json" ]] && print_pass "Already exists." || { print_task "Creating..."; markdownlint_create; } +} + +# ----------------------------- +# πŸ”§ Pre Commit Hooks Overall Check +# ----------------------------- +pre_commit_hooks_check() { + print_info "Checking pre-commit hooks..." + if [[ -f ".pre-commit-config.yaml" ]]; then + print_pass "pre-commit config exists." + print_task "Updating and installing hooks..." + pre-commit autoupdate + pre-commit install + [[ $(grep -v '^\s*#' .pre-commit-config.yaml | grep -cE "commit-msg|commitlint") -gt 0 ]] && { + print_task "Installing commit-msg hook..." + pre-commit install --hook-type commit-msg + commitlintrc_file_check + } + else + print_warning "Missing .pre-commit-config.yaml." + print_task "Creating..." + pre_commit_config_create + pre-commit autoupdate + pre-commit install + [[ $(grep -v '^\s*#' .pre-commit-config.yaml | grep -cE "commit-msg|commitlint") -gt 0 ]] && { + print_task "Installing commit-msg hook..." 
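+            # hooks declared for the commit-msg stage (e.g. commitlint) are
+            # only installed by an explicit `--hook-type commit-msg` pass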
+ pre-commit install --hook-type commit-msg + commitlintrc_file_check + } + fi +} + +# ----------------------------- +# πŸš€ Execute Steps +# ----------------------------- +virtual_environment_check +pip_current_version_check +pip_latest_version_check +pip_status_check + +pre_commit_status_check +pre_commit_current_version_check +pre_commit_latest_version_check +pre_commit_version_check +pre_commit_hooks_check +markdownlint_file_check + +print_pass "πŸŽ‰ Setup Completed Successfully!" From f6567a3b6dc58d6f92494271a70d4a2c2266e6a6 Mon Sep 17 00:00:00 2001 From: pkoper Date: Tue, 14 Oct 2025 09:25:24 +0100 Subject: [PATCH 04/11] chore(DEV SETUP): added dev_setup powershell script plus few updates --- src/script/dev_setup.ps1 | 267 +++++++++++++++++++++++++++++++++++++++ src/script/dev_setup.sh | 52 +++++--- 2 files changed, 299 insertions(+), 20 deletions(-) create mode 100644 src/script/dev_setup.ps1 diff --git a/src/script/dev_setup.ps1 b/src/script/dev_setup.ps1 new file mode 100644 index 0000000..73857c4 --- /dev/null +++ b/src/script/dev_setup.ps1 @@ -0,0 +1,267 @@ +#!/usr/bin/env pwsh +<# +.SYNOPSIS + Developer environment setup (PowerShell 7+) +.DESCRIPTION + Creates/activates venv (where possible), checks/updates pip and pre-commit, + and Asserts config files (.pre-commit-config.yaml, .markdownlint.json, .commitlintrc.json). +#> + +# ----------------------------- +# 🎨 Color map & helpers +# ----------------------------- +$Colors = @{ + 'Info' = 'Cyan' + 'Task' = 'Yellow' + 'Pass' = 'Green' + 'Warn' = 'Magenta' + 'Fail' = 'Red' +} + +function Write-Info([string]$m){Write-Host "πŸ’‘ [INFO] $m" -ForegroundColor $Colors.Info } +function Write-Task([string]$m){ Write-Host "⚑ [TASK] $m" -ForegroundColor $Colors.Task } +function Write-Pass([string]$m){ Write-Host "βœ… [PASS] $m" -ForegroundColor $Colors.Pass } +function Write-Warn([string]$m){ Write-Host "⚠️ [WARN] $m" -ForegroundColor $Colors.Warn } +function Write-Fail([string]$m){ Write-Host "❌ [FAIL] $m" -ForegroundColor $Colors.Fail } + +# ----------------------------- +# 🌱 Virtual environment +# ----------------------------- +$VenvDir = ".venv" + +function Assert-VirtualEnv { + Write-Info "Checking virtual environment status ..." + + # PowerShell activation path (Windows & pwsh) + $ActivatePs1 = Join-Path $VenvDir "Scripts/Activate.ps1" + + if (Test-Path $ActivatePs1) { + if ($env:VIRTUAL_ENV) { + Write-Pass "Virtual environment found and active." + } else { + Write-Info "Virtual environment found but not active." + Write-Task "activating ..." + try { + & $ActivatePs1 + } catch { + Write-Warn "Activation script ran but returned an error: $($_.Exception.Message)" + } + } + } else { + Write-Warn "No virtual environment found." + Write-Task "creating and activating ..." 
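+        # NOTE: $ActivatePs1 points at Scripts/Activate.ps1, which only exists
+        # on Windows; venvs created by pwsh on Linux/macOS place it under bin/.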
+ try { + & python -m venv $VenvDir 2>&1 | Out-Null + & $ActivatePs1 + } catch { + Write-Fail "Failed to create virtual environment: $($_.Exception.Message)" + } + } +} + +# ----------------------------- +# πŸ”§ Pip version helpers (robust regex) +# ----------------------------- +function Get-PipVersion { + try { + $out = & pip --version 2>&1 + } catch { + return $null + } + $match = [regex]::Match($out, 'pip\s+(\d+(?:\.\d+)+)') + if ($match.Success) { return $match.Groups[1].Value } + return $null +} + +function Get-LatestPipVersion { + try { + $dry = & python -m pip install --upgrade pip --dry-run 2>&1 + } catch { + $dry = $null + } + if ($null -ne $dry) { + # look for "pip-1.2.3" or parenthesized versions like "(1.2.3)" + $m = [regex]::Match($dry, 'pip-?(\d+(?:\.\d+)*)') + if ($m.Success) { return $m.Groups[1].Value } + $m2 = [regex]::Match($dry, '\((\d+(?:\.\d+)*)\)') + if ($m2.Success) { return $m2.Groups[1].Value } + } + return $null +} + +function Assert-PipUpToDate { + Write-Info "Checking pip version ..." + $current = Get-PipVersion + $latest = Get-LatestPipVersion + + if (-not $current -or -not $latest) { + Write-Warn "Could not determine pip versions (current: $current, latest: $latest). Skipping upgrade check." + return + } + Write-Info "Current: $current | Latest: $latest" + if ($current -eq $latest) { + Write-Pass "pip is up to date." + } else { + Write-Warn "pip is outdated." + Write-Task "upgrading ..." + try { & python -m pip install --upgrade pip } catch { Write-Fail "pip upgrade failed: $($_.Exception.Message)" } + } +} + +# ----------------------------- +# πŸ”„ pre-commit helpers (robust regex) +# ----------------------------- +function Assert-PreCommitInstalled { + Write-Info "Checking pre-commit installation ..." + if (Get-Command pre-commit -ErrorAction SilentlyContinue) { + Write-Pass "pre-commit is installed." + } else { + Write-Warn "pre-commit is missing." + Write-Task "installing ..." + try { & pip install pre-commit } catch { Write-Fail "Failed to install pre-commit: $($_.Exception.Message)"; return } + } +} + +function Get-PreCommitVersion { + try { $out = & pre-commit --version 2>&1 } catch { return $null } + $match = [regex]::Match($out, 'pre-commit\s+(\d+(?:\.\d+)*)') + if ($match.Success) { return $match.Groups[1].Value } + return $null +} + +function Get-LatestPreCommitVersion { + try { $dry = & pip install pre-commit --upgrade --dry-run 2>&1 } catch { $dry = $null } + if ($null -ne $dry) { + $m = [regex]::Match($dry, 'commit-?(\d+(?:\.\d+)*)') + if ($m.Success) { return $m.Groups[1].Value } + $m2 = [regex]::Match($dry, '\((\d+(?:\.\d+)*)\)') + if ($m2.Success) { return $m2.Groups[1].Value } + } + return $null +} + +function Assert-PreCommitUpToDate { + Write-Info "Checking pre-commit version ..." + $current = Get-PreCommitVersion + $latest = Get-LatestPreCommitVersion + if (-not $current -or -not $latest) { + Write-Warn "Could not determine pre-commit versions (current: $current, latest: $latest). Skipping upgrade check." + return + } + Write-Info "Current: $current | Latest: $latest" + if ($current -eq $latest) { + Write-Pass "pre-commit is up to date." + } else { + Write-Warn "pre-commit is outdated." + Write-Task "upgrading ..." 
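+        # Same approach as the bash script: pre-commit is a PyPI package, so
+        # the upgrade goes through pip rather than through pre-commit itself.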
+ try { & pip install --upgrade pre-commit } catch { Write-Fail "Failed to upgrade pre-commit: $($_.Exception.Message)" } + } +} + +# ----------------------------- +# πŸ“„ Config file creators (PowerShell-native) +# ----------------------------- +function New-PreCommitConfig { @" +default_stages: [pre-commit, manual] + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + args: ["--unsafe"] + - id: check-added-large-files + + - repo: https://github.com/tcort/markdown-link-check + rev: v3.13.7 + hooks: + - id: markdown-link-check + args: [-q] + + - repo: https://github.com/igorshubovych/markdownlint-cli + rev: v0.45.0 + hooks: + - id: markdownlint + args: ["--ignore", "CHANGELOG.md", "--fix"] +"@ | Out-File -FilePath ".pre-commit-config.yaml" -Encoding utf8 } + +function New-MarkdownLintConfig { @" +{ + "comment": "Markdown Lint Rules", + "default": true, + "MD007": {"indent": 4}, + "MD013": false, + "MD024": false, + "MD025": {"front_matter_title": ""}, + "MD029": {"style": "one_or_ordered"}, + "MD033": false +} +"@ | Out-File -FilePath ".markdownlint.json" -Encoding utf8 } + +function New-CommitLintConfig { @" +{ + "rules": { + "body-leading-blank": [1, "always"], + "footer-leading-blank": [1, "always"], + "header-max-length": [2, "always", 72], + "scope-case": [2, "always", "upper-case"], + "scope-empty": [2, "never"], + "subject-case": [2, "never", ["start-case", "pascal-case", "upper-case"]], + "subject-empty": [2, "never"], + "subject-full-stop": [2, "never", "."], + "type-case": [2, "always", "lower-case"], + "type-empty": [2, "never"], + "type-enum": [2, "always", ["build","chore","ci","docs","feat","fix","perf","refactor","revert","style","test"]] + } +} +"@ | Out-File -FilePath ".commitlintrc.json" -Encoding utf8 } + +# ----------------------------- +# πŸ”§ Assert files and hooks +# ----------------------------- +function Assert-File([string]$path, [scriptblock]$createBlock) { + if (Test-Path $path) { + Write-Pass "$path already exists, please ensure it has the correct format." + } else { + Write-Warn "$path file is missing." + Write-Task "creating ..." + & $createBlock + } +} + +function Assert-PreCommitHooks { + Write-Info "Checking pre-commit config and hooks ..." + Assert-File ".pre-commit-config.yaml" { New-PreCommitConfig } + try { + & pre-commit autoupdate + & pre-commit install + } catch { + Write-Warn "pre-commit command failed to run: $($_.Exception.Message)" + } + + # If commit-msg/commitlint present, install commit-msg hook and Assert commitlintrc + $hasCommitMsg = Select-String -Path ".pre-commit-config.yaml" -Pattern "commit-msg|commitlint" -Quiet + if ($hasCommitMsg) { + Write-Task "Installing commit-msg hook ..." + try { + & pre-commit install --hook-type commit-msg + } catch { + Write-Warn "Could not install commit-msg hook: $($_.Exception.Message)" + } + Assert-File ".commitlintrc.json" { New-CommitLintConfig } + } + Assert-File ".markdownlint.json" { New-MarkdownLintConfig } +} + +# ----------------------------- +# πŸš€ Run tasks +# ----------------------------- +Assert-VirtualEnv +Assert-PipUpToDate +Assert-PreCommitInstalled +Assert-PreCommitUpToDate +Assert-PreCommitHooks + +Write-Pass "πŸŽ‰ Setup Completed Successfully!" 
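+
+# Usage sketch (assumption: PowerShell 7+, run from the repository root):
+#   pwsh ./src/script/dev_setup.ps1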
diff --git a/src/script/dev_setup.sh b/src/script/dev_setup.sh
index 45cde48..289aae0 100644
--- a/src/script/dev_setup.sh
+++ b/src/script/dev_setup.sh
@@ -35,19 +35,19 @@ print_error() {
 # 🐍 Virtual Environment Check
 # -----------------------------
 virtual_environment_check() {
-    print_info "Checking virtual environment..."
+    print_info "Checking virtual environment ..."

     if [[ -d "$VENV_DIR" && -f "$VENV_DIR/bin/activate" ]]; then
         if [[ -n "${VIRTUAL_ENV:-}" ]]; then
             print_pass "Virtual environment found and active."
         else
             print_info "Virtual environment found but not active."
-            print_task "Activating..."
+            print_task "activating ..."
             source "$VENV_DIR/bin/activate"
         fi
     else
         print_warning "No virtual environment found."
-        print_task "Creating and activating..."
+        print_task "creating and activating ..."
         python3 -m venv "$VENV_DIR"
         source "$VENV_DIR/bin/activate"
     fi
@@ -69,14 +69,14 @@ pip_latest_version_check() {
 }

 pip_status_check() {
-    print_info "Checking pip version..."
+    print_info "Checking pip version ..."
     print_info "Current: ${CURRENT_VERSION} | Latest: ${LATEST_VERSION}"

     if [[ "$CURRENT_VERSION" == "$LATEST_VERSION" ]]; then
         print_pass "pip is up to date."
     else
         print_warning "pip is outdated."
-        print_task "Updating pip..."
+        print_task "updating ..."
         python3 -m pip install --upgrade pip
     fi
 }
@@ -85,12 +85,12 @@ pip_status_check() {
 # πŸ”„ Pre-commit Check
 # -----------------------------
 pre_commit_status_check() {
-    print_info "Checking pre-commit installation..."
+    print_info "Checking pre-commit installation ..."
     if command -v pre-commit >/dev/null 2>&1; then
         print_pass "pre-commit is installed."
     else
         print_warning "pre-commit is missing."
-        print_task "Installing pre-commit..."
+        print_task "Installing pre-commit ..."
         pip install pre-commit
     fi
 }
@@ -107,14 +107,14 @@ pre_commit_latest_version_check() {
 }

 pre_commit_version_check() {
-    print_info "Checking pre-commit version..."
+    print_info "Checking pre-commit version ..."
     print_info "Current: ${CURRENT_VERSION} | Latest: ${LATEST_VERSION}"

     if [[ "$CURRENT_VERSION" == "$LATEST_VERSION" ]]; then
         print_pass "pre-commit is up to date."
     else
         print_warning "pre-commit is outdated."
-        print_task "Updating..."
+        print_task "updating ..."
         pip install --upgrade pre-commit
     fi
 }
@@ -195,38 +195,50 @@ EOF
 # πŸ”§ Config Files Checks
 # -----------------------------
 commitlintrc_file_check() {
-    print_info "Checking .commitlintrc.json..."
-    [[ -f ".commitlintrc.json" ]] && print_pass "Already exists." || { print_task "Creating..."; commitlintrc_create; }
+    print_info "Checking .commitlintrc.json ..."
+    if [[ -f ".commitlintrc.json" ]]; then
+        print_pass "Already exists, please ensure it has the correct format."
+    else
+        print_warning ".commitlintrc.json file is missing."
+        print_task "creating ..."
+        commitlintrc_create
+    fi
 }

 markdownlint_file_check() {
-    print_info "Checking .markdownlint.json..."
-    [[ -f ".markdownlint.json" ]] && print_pass "Already exists." || { print_task "Creating..."; markdownlint_create; }
+    print_info "Checking .markdownlint.json ..."
+    if [[ -f ".markdownlint.json" ]]; then
+        print_pass "Already exists, please ensure it has the correct format."
+    else
+        print_warning ".markdownlint.json file is missing."
+        print_task "creating ..."
+        markdownlint_create
+    fi
 }

 # -----------------------------
 # πŸ”§ Pre Commit Hooks Overall Check
 # -----------------------------
 pre_commit_hooks_check() {
-    print_info "Checking pre-commit hooks..."
+    print_info "Checking pre-commit hooks ..."
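+    # `pre-commit autoupdate` bumps the pinned `rev:` entries in the config
+    # file, while `pre-commit install` writes the hook scripts to .git/hooks.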
if [[ -f ".pre-commit-config.yaml" ]]; then - print_pass "pre-commit config exists." - print_task "Updating and installing hooks..." + print_pass ".pre-commit-config.yaml already exists, please ensure it has the correct format." + print_task "Updating and installing hooks ..." pre-commit autoupdate pre-commit install [[ $(grep -v '^\s*#' .pre-commit-config.yaml | grep -cE "commit-msg|commitlint") -gt 0 ]] && { - print_task "Installing commit-msg hook..." + print_task "Installing commit-msg hook ..." pre-commit install --hook-type commit-msg commitlintrc_file_check } else - print_warning "Missing .pre-commit-config.yaml." - print_task "Creating..." + print_warning ".pre-commit-config.yaml is missing." + print_task "creating ..." pre_commit_config_create pre-commit autoupdate pre-commit install [[ $(grep -v '^\s*#' .pre-commit-config.yaml | grep -cE "commit-msg|commitlint") -gt 0 ]] && { - print_task "Installing commit-msg hook..." + print_task "Installing commit-msg hook ..." pre-commit install --hook-type commit-msg commitlintrc_file_check } From 1905538632bf2f1c7deee18ce3b040c2233c99b0 Mon Sep 17 00:00:00 2001 From: pkoper Date: Tue, 14 Oct 2025 09:48:06 +0100 Subject: [PATCH 05/11] chore(DEV SETUP): updated .gitignore --- .gitignore | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index ba2a045..7084b3f 100644 --- a/.gitignore +++ b/.gitignore @@ -11,8 +11,9 @@ docs/home.md docs/vendors.md docs/examples/**/*.md -# dev setup exclusion to keep the repository as simple as possible -# these would normally be not included as part of `.gitignore` +# Development setup exclusions β€” keeping the repository clean and minimal. +# Except for `.venv/`, these files are typically not included in +# `.gitignore`, but are listed here to maintain simplicity and clarity. .venv/ .pre-commit-config.yaml .markdownlint.json From 3b9ba7fc0c301263d7968be4e9d3cac4fc906316 Mon Sep 17 00:00:00 2001 From: pkoper Date: Tue, 14 Oct 2025 10:12:56 +0100 Subject: [PATCH 06/11] chore(DEV SETUP): files corrections --- .github/CODEOWNERS | 1 - docs/README.md | 308 ++++++++++++++++++++++++--------------------- 2 files changed, 164 insertions(+), 145 deletions(-) delete mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS deleted file mode 100644 index 0832a08..0000000 --- a/.github/CODEOWNERS +++ /dev/null @@ -1 +0,0 @@ -* @bitol-io/tsc diff --git a/docs/README.md b/docs/README.md index a925f6d..f94e46c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -7,13 +7,13 @@ image: "https://raw.githubusercontent.com/bitol-io/artwork/main/horizontal/color # Open Data Contract Standard ## Executive Summary -This document describes the keys and values expected in a YAML data contract, per the **Open Data Contract Standard**. -It is divided in multiple sections: [fundamentals (fka demographics)](#fundamentals), [schema](#schema), -[data quality](#data-quality), [Support & communication channels](#support-and-communication-channels), [pricing](#pricing), [team](#team), -[roles](#roles), [service-level agreement](#service-level-agreement-sla), [Infrastructures & servers](#infrastructure-and-servers) and -[other/custom properties](#custom-properties). Each section starts with at least an example followed by definition of -each field/key. +This document describes the keys and values expected in a YAML data contract, per the **Open Data Contract Standard**. 
+It is divided in multiple sections: [fundamentals (fka demographics)](#fundamentals), [schema](#schema), +[data quality](#data-quality), [Support & communication channels](#support-and-communication-channels), [pricing](#pricing), [team](#team), +[roles](#roles), [service-level agreement](#service-level-agreement-sla), [Infrastructures & servers](#infrastructure-and-servers) and +[other/custom properties](#custom-properties). Each section starts with at least an example followed by definition of +each field/key. ## Table of content @@ -30,15 +30,14 @@ each field/key. 1. [Custom & other properties](#custom-properties) 1. [Examples](#full-example-1) - ## Notes * This contract is containing example values, we reviewed very carefully the consistency of those values, but we cannot guarantee that there are no errors. If you spot one, please raise an [issue](https://github.com/AIDAUserGroup/open-data-contract-standard/issues). * Some fields have `null` value: even if it is equivalent to not having the field in the contract, we wanted to have the field for illustration purpose. * This contract should be **platform agnostic**. If you think it is not the case, please raise an [issue](https://github.com/AIDAUserGroup/open-data-contract-standard/issues). - ## Fundamentals + This section contains general information about the contract. ### Example @@ -49,7 +48,7 @@ kind: DataContract id: 53581432-6c55-4ba2-a65f-72344a91553a name: seller_payments_v1 -version: 1.1.0 # Data Contract Version +version: 1.1.0 # Data Contract Version status: active domain: seller dataProduct: payments @@ -65,28 +64,27 @@ tags: ['finance'] ### Definitions -| Key | UX label | Required | Description | -|--------------------------------------|---------------------------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| apiVersion | Standard version | Yes | Version of the standard used to build data contract. Default value is `v3.0.2`. | -| kind | Kind | Yes | The kind of file this is. Valid value is `DataContract`. | -| id | ID | Yes | A unique identifier used to reduce the risk of dataset name collisions, such as a UUID. | -| name | Name | No | Name of the data contract. | -| version | Version | Yes | Current version of the data contract. | -| status | Status | Yes | Current status of the data contract. Examples are "proposed", "draft", "active", "deprecated", "retired". | -| tenant | Tenant | No | Indicates the property the data is primarily associated with. Value is case insensitive. | -| tags | Tags | No | A list of tags that may be assigned to the elements (object or property); the tags keyword may appear at any level. Tags may be used to better categorize an element. For example, `finance`, `sensitive`, `employee_record`. | -| domain | Domain | No | Name of the logical data domain. | -| dataProduct | Data Product | No | Name of the data product. | -| authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the data contract. | -| description | Description | No | Object containing the descriptions. | -| description.purpose | Purpose | No | Intended purpose for the provided data. | -| description.limitations | Limitations | No | Technical, compliance, and legal limitations for data use. | -| description.usage | Usage | No | Recommended usage of the data. 
| -| description.authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the dataset; examples would be a link to privacy statement, terms and conditions, license agreements, data catalog, or another tool. | -| description.customProperties | Custom Properties | No | Custom properties that are not part of the standard. | - +| Key | UX label | Required | Description | +|--------------------------------------|---------------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| apiVersion | Standard version | Yes | Version of the standard used to build data contract. Default value is `v3.0.2`. | +| kind | Kind | Yes | The kind of file this is. Valid value is `DataContract`. | +| id | ID | Yes | A unique identifier used to reduce the risk of dataset name collisions, such as a UUID. | +| name | Name | No | Name of the data contract. | +| version | Version | Yes | Current version of the data contract. | +| status | Status | Yes | Current status of the data contract. Examples are "proposed", "draft", "active", "deprecated", "retired". | +| tenant | Tenant | No | Indicates the property the data is primarily associated with. Value is case insensitive. | +| domain | Domain | No | Name of the logical data domain. | +| dataProduct | Data Product | No | Name of the data product. | +| authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the data contract. | +| description | Description | No | Object containing the descriptions. | +| description.purpose | Purpose | No | Intended purpose for the provided data. | +| description.limitations | Limitations | No | Technical, compliance, and legal limitations for data use. | +| description.usage | Usage | No | Recommended usage of the data. | +| description.authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the dataset; examples would be a link to privacy statement, terms and conditions, license agreements, data catalog, or another tool. | +| description.customProperties | Custom Properties | No | Custom properties that are not part of the standard. | ## Schema + This section describes the schema of the data contract. It is the support for data quality, which is detailed in the next section. Schema supports both a business representation of your data and a physical implementation. It allows to tie them together. In ODCS v3, the schema has evolved from the table and column representation, therefore the schema introduces a new terminology: @@ -110,10 +108,10 @@ schema: - name: tbl logicalType: object physicalType: table - physicalName: tbl_1 - description: Provides core payment metrics - authoritativeDefinitions: - - url: https://catalog.data.gov/dataset/air-quality + physicalName: tbl_1 + description: Provides core payment metrics + authoritativeDefinitions: + - url: https://catalog.data.gov/dataset/air-quality type: businessDefinition - url: https://youtu.be/jbY1BKFj9ec type: videoTutorial @@ -135,12 +133,12 @@ schema: - table_name_2 - table_name_3 transformLogic: sel t1.txn_dt as txn_ref_dt from table_name_1 as t1, table_name_2 as t2, table_name_3 as t3 where t1.txn_dt=date-3 - transformDescription: Defines the logic in business terms. + transformDescription: Defines the logic in business terms. 
examples: - 2022-10-03 - 2020-01-28 - name: rcvr_id - primaryKey: true + primaryKey: true primaryKeyPosition: 1 businessName: receiver id logicalType: string @@ -154,7 +152,7 @@ schema: classification: restricted encryptedName: enc_rcvr_id - name: rcvr_cntry_code - primaryKey: false + primaryKey: false primaryKeyPosition: -1 businessName: receiver country code logicalType: string @@ -184,7 +182,7 @@ schema: logicalType: object properties: - name: street_lines - logicalType: array + logicalType: array items: logicalType: string ``` @@ -202,7 +200,7 @@ schema: logicalType: object properties: - name: id - logicalType: string + logicalType: string physicalType: VARCHAR(40) - name: zip logicalType: string @@ -238,7 +236,7 @@ schema: #### Applicable to Properties -Some keys are more applicable when the described property is a column. +Some keys are more applicable when the described property is a column. | Key | UX label | Required | Description | |--------------------------|------------------------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| @@ -290,11 +288,11 @@ Additional metadata options to more accurately define the data type. | string | format | Format | No | Provides extra context about what format the string follows. For example, password, byte, binary, email, uuid, uri, hostname, ipv4, ipv6. | | string | maxLength | Maximum Length | No | Maximum length of the string. | | string | minLength | Minimum Length | No | Minimum length of the string. | -| string | pattern | Pattern | No | Regular expression pattern to define valid value. Follows regular expression syntax from ECMA-262 (https://262.ecma-international.org/5.1/#sec-15.10.1). | +| string | pattern | Pattern | No | Regular expression pattern to define valid value. Follows regular expression syntax from ECMA-262 (). | #### Expressing Date / Datetime / Timezone information -Given the complexity of handling various date and time formats (e.g., date, datetime, time, timestamp, timestamp with and without timezone), the existing `logicalType` options currently support `date`, `timestamp`, and `time`. To specify additional temporal details, `logicalType` should be used in conjunction with `logicalTypeOptions.format` or `physicalType` to define the desired format. Using `physicalType` allows for definition of your data-source specific data type. +Given the complexity of handling various date and time formats (e.g., date, datetime, time, timestamp, timestamp with and without timezone), the existing `logicalType` options currently support `date`, `timestamp`, and `time`. To specify additional temporal details, `logicalType` should be used in conjunction with `logicalTypeOptions.format` or `physicalType` to define the desired format. Using `physicalType` allows for definition of your data-source specific data type. 
``` yaml version: 1.0.0 @@ -304,7 +302,7 @@ status: active name: date_example apiVersion: v3.0.2 schema: - # Date Only + # Date Only - name: event_date logicalType: date logicalTypeOptions: @@ -312,7 +310,7 @@ schema: examples: - "2024-07-10" - # Date & Time (UTC) + # Date & Time (UTC) - name: created_at logicalType: timestamp logicalTypeOptions: @@ -320,7 +318,7 @@ schema: examples: - "2024-03-10T14:22:35Z" - # Date & Time (Australia/Sydney) + # Date & Time (Australia/Sydney) - name: created_at_sydney logicalType: timestamp logicalTypeOptions: @@ -330,7 +328,7 @@ schema: examples: - "2024-03-10T14:22:35+10:00" - # Time Only + # Time Only - name: event_start_time logicalType: time logicalTypeOptions: @@ -350,6 +348,7 @@ schema: ``` ### Authoritative definitions + Reference to an external definition on element logic or values. | Key | UX label | Required | Description | @@ -360,24 +359,24 @@ Reference to an external definition on element logic or values. | authoritativeDefinitions.description | Description | No | Description for humans | ## References -This section describes how to reference elements within a data contract schema. References enable you to create relationships between different parts of your data contract. +This section describes how to reference elements within a data contract schema. References enable you to create relationships between different parts of your data contract. > [!IMPORTANT] > References are currently only supported within schema properties for foreign key relationships. - ### Reference Structure A fully formatted reference follows this structure: + ```yaml ``` Where: -- **``**: Path to the contract file (optional for same-contract references) -- **``**: '#' symbol to mark entry into a contract (optional for same-contract) -- **``**: The defined path within the contract +* **``**: Path to the contract file (optional for same-contract references) +* **``**: '#' symbol to mark entry into a contract (optional for same-contract) +* **``**: The defined path within the contract ### External Contract References @@ -396,9 +395,11 @@ https://example.com/data-contract-v1.yaml # Relative path ../../path/to/data-contract-v1.yaml ``` + ### Reference Examples #### External Contract References + ```yaml # Reference to an element in an external contract 'external-contract.yaml#schema.my-table' @@ -408,7 +409,8 @@ https://example.com/data-contract-v1.yaml ``` #### Same Contract References -When referencing elements within the same contract, the file component can be omitted. + +When referencing elements within the same contract, the file component can be omitted. ```yaml # Full reference within same contract @@ -432,15 +434,16 @@ Properties can define relationships to other properties, enabling you to specify #### Quick Overview Relationships can be defined in two ways: + 1. **At the property level** - Define relationships directly on a property (the `from` field is implicit and must NOT be specified) 2. **At the schema level** - Define relationships between any properties (both `from` and `to` are required) #### Important Rules -- **Property-level relationships**: The `from` field is implicit (derived from the property context) and must NOT be specified -- **Schema-level relationships**: Both `from` and `to` fields are required -- **Type consistency**: Both `from` and `to` must be the same type - either both strings (single column) or both arrays (composite keys). 
Mixing types is not allowed -- **Array length validation**: When using arrays for composite keys, both arrays must have the same number of elements. This is validated at runtime by implementations +* **Property-level relationships**: The `from` field is implicit (derived from the property context) and must NOT be specified +* **Schema-level relationships**: Both `from` and `to` fields are required +* **Type consistency**: Both `from` and `to` must be the same type - either both strings (single column) or both arrays (composite keys). Mixing types is not allowed +* **Array length validation**: When using arrays for composite keys, both arrays must have the same number of elements. This is validated at runtime by implementations #### Field Definitions @@ -454,9 +457,9 @@ Relationships can be defined in two ways: #### Reference Notation -- **Simple reference**: `users.id` - References the `id` property in the `users` schema -- **Nested reference**: `accounts.address.street` - References nested properties -- **Composite keys**: Use arrays to define composite keys (arrays must have matching lengths) +* **Simple reference**: `users.id` - References the `id` property in the `users` schema +* **Nested reference**: `accounts.address.street` - References nested properties +* **Composite keys**: Use arrays to define composite keys (arrays must have matching lengths) ### Examples @@ -529,7 +532,7 @@ schema: to: # Array (must match 'from' length) - product_inventory.order_id - product_inventory.product_id - + ``` #### Example 6: Invalid Configurations @@ -551,7 +554,7 @@ schema: - name: orders relationships: - from: orders.id # ERROR: 'from' is string but 'to' is array - to: + to: - items.order_id - items.line_num @@ -586,11 +589,11 @@ schema: relationships: # Simple foreign key (from is implicit) - to: accounts.user_id - + # With explicit from field - from: users.id to: profiles.user_id - + # With custom properties - to: departments.manager_id customProperties: @@ -604,19 +607,19 @@ schema: customProperties: - property: description value: "Externally referenced contract" - + - name: account_number - + # Schema-level composite key relationship relationships: - type: foreignKey - from: + from: - users.id - users.account_number to: - accounts.user_id - accounts.account_number - + - name: accounts properties: - name: user_id @@ -628,16 +631,18 @@ schema: ``` ## Data quality + This section describes data quality rules & parameters. They are tightly linked to the schema described in the previous section. Data quality rules support different levels/stages of data quality attributes: - - __Text__: A human-readable text that describes the quality of the data. - - __Library__ : A maintained library of commonly used quality metrics such as `rowCount`, `nullValues`, `invalidValues`, and more. - - __SQL__: An individual SQL query that returns a value that can be compared. - - __Custom__: Quality attributes that are vendor-specific, such as Soda, Great Expectations, dbt tests, dbx, or Montecarlo monitors. +* **Text**: A human-readable text that describes the quality of the data. +* **Library** : A maintained library of commonly used quality metrics such as `rowCount`, `nullValues`, `invalidValues`, and more. +* **SQL**: An individual SQL query that returns a value that can be compared. +* **Custom**: Quality attributes that are vendor-specific, such as Soda, Great Expectations, dbt tests, dbx, or Montecarlo monitors. ### Text + A human-readable text that describes the quality of the data. 
Later in the development process, these might be translated into an executable check (such as `sql`), a library metric, or checked through an AI engine. ```yaml @@ -647,6 +652,7 @@ quality: ``` ### Library + ODCS provides a set of predefined metrics commonly used in data quality checks, designed to be compatible with all major data quality engines. This simplifies the work for data engineers by eliminating the need to manually write SQL queries. The type for library metrics is `library`, which can be omitted, if a `metric` property is defined. @@ -662,7 +668,7 @@ properties: - name: order_id quality: - type: library - metric: nullValues + metric: nullValues mustBe: 0 unit: rows description: "There must be no null values in the column." @@ -674,12 +680,11 @@ is equalized to: properties: - name: order_id quality: - - metric: nullValues + - metric: nullValues mustBe: 0 description: "There must be no null values in the column." ``` - #### Metrics | Metric | Level | Description | Arguments | Arguments Example | @@ -716,7 +721,6 @@ properties: description: "There must be less than 1% null values in the column." ``` - ##### Missing Values Check that the missing values are within range. @@ -734,7 +738,6 @@ properties: unit: rows # rows (default) or percent ``` - ##### Invalid Values Check that the value is within a defined set or matching a pattern. @@ -763,7 +766,6 @@ properties: pattern: '^[A-Z]{2}[0-9]{2}[A-Z0-9]{4}[0-9]{7}([A-Z0-9]?){0,16}$' ``` - ##### Duplicate Values No more than 10 duplicate names. @@ -790,6 +792,7 @@ properties: ``` ##### Row count (Schema-Level) + Calculates the number of rows (usually in a table) and compares it to an absolute operator. ```yaml @@ -819,17 +822,19 @@ schema: ``` ### SQL + A single SQL query that returns either a numeric or boolean value for comparison. The query must be written in the SQL dialect specific to the provided server. `${object}` and `${property}` are automatically replaced by the current object (in the case of SQL on a relational database, the table or view name) and the current property name (in the case of SQL on a relational database, the column). ```yaml quality: - - type: sql + - type: sql query: | SELECT COUNT(*) FROM ${object} WHERE ${property} IS NOT NULL - mustBeLessThan: 3600 + mustBeLessThan: 3600 ``` ### Custom + Custom rules allow for vendor-specific checks, including tools like Soda, Great Expectations, dbt-tests, Montecarlo, and others. Any format for properties is acceptable, whether it's written in YAML, JSON, XML, or even uuencoded binary. They are an intermediate step before the vendor accepts ODCS natively. #### Soda Example @@ -860,22 +865,23 @@ quality: ``` ### Scheduling -The data contract can contain scheduling information for executing the rules. You can use `schedule` and `scheduler` for those operation. In previous versions of ODCS, the only allowed scheduler was cron and its syntax was `scheduleCronExpression`. + +The data contract can contain scheduling information for executing the rules. You can use `schedule` and `scheduler` for those operation. In previous versions of ODCS, the only allowed scheduler was cron and its syntax was `scheduleCronExpression`. ```yaml quality: - - type: sql + - type: sql query: | SELECT COUNT(*) FROM ${object} WHERE ${property} IS NOT NULL - mustBeLessThan: 3600 + mustBeLessThan: 3600 scheduler: cron schedule: 0 20 * * * ``` - ### Definitions Acronyms: + * DQ: data quality. 
| Key | UX label | Required | Description | @@ -903,22 +909,24 @@ Acronyms: | quality.schedule | Scheduler Configuration | No | Configuration information for the scheduling tool, for `cron` a possible value is `0 20 * * *`. | #### Valid Values for Dimension + Those data quality dimensions are used for classification and reporting in data quality. Valid values are: - * `accuracy` (synonym `ac`), - * `completeness` (synonym `cp`), - * `conformity` (synonym `cf`), - * `consistency` (synonym `cs`), - * `coverage` (synonym `cv`), - * `timeliness` (synonym `tm`), - * `uniqueness` (synonym `uq`). +* `accuracy` (synonym `ac`), +* `completeness` (synonym `cp`), +* `conformity` (synonym `cf`), +* `consistency` (synonym `cs`), +* `coverage` (synonym `cv`), +* `timeliness` (synonym `tm`), +* `uniqueness` (synonym `uq`). #### Valid Properties for Operator + The operator specifies the condition to validate a metric or result of a SQL query. | Operator | Expected Value | Math Symbol | Example | |--------------------------|---------------------|-------------|------------------------------| -| `mustBe` | number | `=` | `mustBe: 5` | +| `mustBe` | number | `=` | `mustBe: 5` | | `mustNotBe` | number | `<>`, `β‰ ` | `mustNotBe: 3.14` | | `mustBeGreaterThan` | number | `>` | `mustBeGreaterThan: 59` | | `mustBeGreaterOrEqualTo` | number | `>=`, `β‰₯` | `mustBeGreaterOrEqualTo: 60` | @@ -931,26 +939,26 @@ The operator specifies the condition to validate a metric or result of a SQL que ```yaml quality: - - type: sql + - type: sql query: | SELECT COUNT(*) FROM ${table} WHERE ${column} IS NOT NULL - mustBeBetween: [0, 100] + mustBeBetween: [0, 100] ``` is equivalent to: ```yaml quality: - - type: sql + - type: sql query: | SELECT COUNT(*) FROM ${table} WHERE ${column} IS NOT NULL mustBeGreaterThan: 0 - mustBeLessThan: 100 + mustBeLessThan: 100 ``` - ## Support and Communication Channels -Support and communication channels help consumers find help regarding their use of the data contract. + +Support and communication channels help consumers find help regarding their use of the data contract. ### Examples @@ -1003,9 +1011,9 @@ support: | support.invitationUrl | Invitation URL | No | Some tools uses invitation URL for requesting or subscribing. Follows the [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax). | | support.customProperties | Custom Properties | No | Any custom properties. | - ## Pricing -This section covers pricing when you bill your customer for using this data product. + +This section covers pricing when you bill your customer for using this data product. ### Example @@ -1025,11 +1033,12 @@ price: | price.priceCurrency | Price Currency | No | Currency of the subscription price in `price.priceAmount`. | | price.priceUnit | Price Unit | No | The unit of measure for calculating cost. Examples megabyte, gigabyte. | - ## Team + This section lists team members and the history of their relation with this data contract. In v2.x, this section was called stakeholders. ### Example + ```YAML team: - username: ceastwood @@ -1048,7 +1057,8 @@ team: ``` ### Definitions -The UX label is the label used in the UI and other user experiences. + +The UX label is the label used in the UI and other user experiences. | Key | UX label | Required | Description | |-------------------------|----------------------|----------|--------------------------------------------------------------------------------------------| @@ -1062,6 +1072,7 @@ The UX label is the label used in the UI and other user experiences. 
| team.replacedByUsername | Replaced By Username | No | The username of the user who replaced the previous user. | ## Roles + This section lists the roles that a consumer may need to access the dataset depending on the type of access they require. ### Example @@ -1098,9 +1109,9 @@ roles: | roles.secondLevelApprovers | 2nd Level Approvers | No | The name(s) of the second-level approver(s) of the role. | | roles.customProperties | Custom Properties | No | Any custom properties. | - ## Service-Level Agreement (SLA) -This section describes the service-level agreements (SLA). + +This section describes the service-level agreements (SLA). * Use the `Object.Element` to indicate the number to do the checks on, as in `SELECT txn_ref_dt FROM tab1`. * Separate multiple object.element by a comma, as in `table1.col1`, `table2.col1`, `table1.col2`. @@ -1161,10 +1172,10 @@ The `servers` element describes where the data protected by this data contract i An entry in `servers` describes a single dataset on a specific environment and a specific technology. The `servers` element can contain multiple servers, each with its own configuration. The typical ways of using the top level `servers` element are as follows: -- **Single Server:** The data contract protects a specific dataset at a specific location. *Example:* a CSV file on an SFTP server. -- **Multiple Environments:** The data contract makes sure that the data is protected in all environments. *Example:* a data product with data in a dev(elopment), UAT, and prod(uction) environment on Databricks. -- **Different Technologies:** The data contract makes sure that regardless of the offered technology, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content. -- **Different Technologies and Multiple Environments:** The data contract makes sure that regardless of the offered technology and environment, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content in dev(elopment), UAT, and prod(uction). +* **Single Server:** The data contract protects a specific dataset at a specific location. *Example:* a CSV file on an SFTP server. +* **Multiple Environments:** The data contract makes sure that the data is protected in all environments. *Example:* a data product with data in a dev(elopment), UAT, and prod(uction) environment on Databricks. +* **Different Technologies:** The data contract makes sure that regardless of the offered technology, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content. +* **Different Technologies and Multiple Environments:** The data contract makes sure that regardless of the offered technology and environment, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content in dev(elopment), UAT, and prod(uction). ### General Server Structure @@ -1202,36 +1213,36 @@ Each server type can be customized with different properties such as `host`, `po If your server is not in the list, please use [custom](#custom-server) and suggest it as an improvement. 
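+
+For illustration, a `postgresql` entry might combine the common keys above
+with its type-specific ones as follows; the host and names are hypothetical:
+
+```yaml
+servers:
+  - server: my-postgres-dev
+    type: postgresql
+    environment: dev
+    host: db.dev.example.com
+    port: 5432
+    database: orders
+    schema: public
+```
+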
Possible values for `type` are: -- [api](#api-server) -- [athena](#amazon-athena-server) -- [azure](#azure-server) -- [bigquery](#google-bigquery) -- [clickhouse](#clickhouse-server) -- [cloudsql](#google-cloud-sql) -- [databricks](#databricks-server) -- [db2](#ibm-db2-server) -- [denodo](#denodo-server) -- [dremio](#dremio-server) -- [duckdb](#duckdb-server) -- [glue](#amazon-glue) -- [hive](#hive) -- [informix](#ibm-informix-and-hcl-informix) -- [kafka](#kafka-server) -- [kinesis](#amazon-kinesis) -- [local](#local-files) -- [mysql](#mysql-server) -- [oracle](#oracle) -- [postgresql](#postgresql) -- [presto](#presto-server) -- [pubsub](#google-pubsub) -- [redshift](#amazon-redshift-server) -- [s3](#amazon-s3-server-and-compatible-servers) -- [sftp](#sftp-server) -- [snowflake](#snowflake) -- [sqlserver](#microsoft-sql-server) -- [synapse](#synapse-server) -- [trino](#trino-server) -- [vertica](#vertica-server) +* [api](#api-server) +* [athena](#amazon-athena-server) +* [azure](#azure-server) +* [bigquery](#google-bigquery) +* [clickhouse](#clickhouse-server) +* [cloudsql](#google-cloud-sql) +* [databricks](#databricks-server) +* [db2](#ibm-db2-server) +* [denodo](#denodo-server) +* [dremio](#dremio-server) +* [duckdb](#duckdb-server) +* [glue](#amazon-glue) +* [hive](#hive) +* [informix](#ibm-informix-and-hcl-informix) +* [kafka](#kafka-server) +* [kinesis](#amazon-kinesis) +* [local](#local-files) +* [mysql](#mysql-server) +* [oracle](#oracle) +* [postgresql](#postgresql) +* [presto](#presto-server) +* [pubsub](#google-pubsub) +* [redshift](#amazon-redshift-server) +* [s3](#amazon-s3-server-and-compatible-servers) +* [sftp](#sftp-server) +* [snowflake](#snowflake) +* [sqlserver](#microsoft-sql-server) +* [synapse](#synapse-server) +* [trino](#trino-server) +* [vertica](#vertica-server) #### API Server @@ -1239,8 +1250,8 @@ If your server is not in the list, please use [custom](#custom-server) and sugge |----------------|------------|------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **location** | Location | Yes | URL to the API | - #### Amazon Athena Server + [Amazon Athena](https://docs.aws.amazon.com/athena/latest/ug/what-is.html) is an interactive query service that makes it easy to analyze data directly in Amazon Simple Storage Service (Amazon S3) using standard SQL. With a few actions in the AWS Management Console, you can point Athena at your data stored in Amazon S3 and begin using standard SQL to run ad-hoc queries and get results in seconds. | Key | UX Label | Required | Description | @@ -1259,6 +1270,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | delimiter | Delimiter | No | Only for format = json. How multiple json documents are delimited within one file | #### Google BigQuery + [BigQuery](https://cloud.google.com/bigquery) is a fully managed, AI-ready data analytics platform that helps you maximize value from your data and is designed to be multi-engine, multi-format, and multi-cloud. | Key | UX Label | Required | Description | @@ -1267,6 +1279,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | dataset | Dataset | Yes | The GCP dataset name. | #### ClickHouse Server + [ClickHouse](https://clickhouse.com/) is an open-source column-oriented database management system that allows generating analytical data reports in real-time. 
| Key | UX Label | Required | Description |
@@ -1276,6 +1289,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| database | Database | Yes | The name of the database. |

#### Google Cloud SQL
+
[Google Cloud SQL](https://cloud.google.com/sql) is a fully managed, cost-effective relational database service for PostgreSQL, MySQL, and SQL Server.

| Key | UX Label | Required | Description |
@@ -1319,6 +1333,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| schema | Schema | No | The name of the schema. |

#### DuckDB Server
+
[DuckDB](https://duckdb.org/) supports a feature-rich SQL dialect complemented with deep integrations into client APIs.

| Key | UX Label | Required | Description |
@@ -1336,6 +1351,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| format | Format | No | The format of the files |

#### Hive
+
[Apache Hive](https://hive.apache.org/) is a distributed, fault-tolerant data warehouse system that enables analytics at massive scale. Built on top of Apache Hadoop, Hive allows users to read, write, and manage petabytes of data using SQL-like queries through HiveQL, with native support for cloud storage systems and enterprise-grade security features.

| Key | UX Label | Required | Description |
@@ -1344,8 +1360,8 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| port | Port | No | The port to the Hive server. Defaults to 10000. |
| database | Database | Yes | The name of the Hive database. |

-
#### IBM Informix and HCL Informix
+
[IBM Informix](https://www.ibm.com/products/informix) is a high performance, always-on, highly scalable and easily embeddable enterprise-class database optimized for the most demanding transactional and analytics workloads. As an object-relational engine, IBM Informix seamlessly integrates the best of relational and object-oriented capabilities enabling the flexible modeling of complex data structures and relationships.

| Key | UX Label | Required | Description |
@@ -1393,6 +1409,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| serviceName | Service Name | Yes | The name of the service. |

#### PostgreSQL
+
[PostgreSQL](https://www.postgresql.org/) is a powerful, open source object-relational database system with over 35 years of active development that has earned it a strong reputation for reliability, feature robustness, and performance.

| Key | UX Label | Required | Description |
@@ -1411,6 +1428,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| schema | Schema | No | The name of the schema. |

#### Google Pub/Sub
+
[Google Pub/Sub](https://cloud.google.com/pubsub) is a Google Cloud service to ingest events for streaming into BigQuery, data lakes, or operational databases.

| Key | UX Label | Required | Description |
@@ -1418,6 +1436,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| project | Project | Yes | The GCP project name. |

#### Amazon Redshift Server
+
[Amazon Redshift](https://aws.amazon.com/redshift/) is a cloud data warehouse that powers data-driven decisions with the best price-performance.

| Key | UX Label | Required | Description |
@@ -1429,6 +1448,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| account | Account | No | The account used by the server. |

#### Amazon S3 Server and Compatible Servers
+
[Amazon Simple Storage Service (Amazon S3)](https://aws.amazon.com/s3/) is an object storage service offering industry-leading scalability, data availability, security, and performance. Millions of customers of all sizes and industries store, manage, analyze, and protect any amount of data for virtually any use case, such as data lakes, cloud-native applications, and mobile apps. Other vendors provide compatible implementations of S3.

| Key | UX Label | Required | Description |
@@ -1439,6 +1459,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| delimiter | Delimiter | No | Only for format = json. How multiple json documents are delimited within one file |

#### SFTP Server
+
Secure File Transfer Protocol (SFTP) is a network protocol that enables secure and encrypted file transfers between a client and a server.

| Key | UX Label | Required | Description |
@@ -1459,7 +1480,8 @@ Secure File Transfer Protocol (SFTP) is a network protocol that enables secure a
| schema | Schema | Yes | The name of the schema. |

#### Microsoft SQL Server
-[Microsoft SQL Server](https://www.microsoft.com/en-us/sql-server/sql-server-downloads) is a proprietary relational database management system developed by Microsoft.
+
+[Microsoft SQL Server](https://www.microsoft.com/en-us/sql-server/sql-server-downloads) is a proprietary relational database management system developed by Microsoft.

| Key | UX Label | Required | Description |
|----------|----------|----------|----------------------------------------------------|
@@ -1520,8 +1542,8 @@ Secure File Transfer Protocol (SFTP) is a network protocol that enables secure a
| schema | Schema | Yes | The name of the schema. |

If you need another property, use [custom properties](#custom-properties).

-
## Custom Properties
+
This section covers custom properties you may find in a data contract.

### Example

@@ -1546,8 +1568,8 @@ customProperties:
| customProperties.value | Value | No | The value of the key. |
| customProperties.description | Description | No | Description for humans. |

-
## Other Properties
+
This section covers other properties you may find in a data contract.

### Example

@@ -1556,7 +1578,6 @@ This section covers other properties you may find in a data contract.
contractCreatedTs: 2024-09-17T11:58:08Z
```

-
### Other properties definition

| Key | UX label | Required | Description |
@@ -1567,5 +1588,4 @@ contractCreatedTs: 2024-09-17T11:58:08Z

[Check full example here.](examples/all/full-example.odcs.yaml)

-
-All trademarks are the property of their respective owners.
+All trademarks are the property of their respective owners.

From c215c3db1fb9bf1f5742d10048b9959ab3ba801d Mon Sep 17 00:00:00 2001
From: pkoper
Date: Tue, 14 Oct 2025 10:14:36 +0100
Subject: [PATCH 07/11] chore(DEV SETUP): README dev corrections

---
 docs/README.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index f94e46c..c12808d 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -243,7 +243,7 @@ Some keys are more applicable when the described property is a column.
| primaryKey | Primary Key | No | Boolean value specifying whether the field is primary or not. Default is false. |
| primaryKeyPosition | Primary Key Position | No | If field is a primary key, the position of the primary key element. Starts from 1. Example of `account_id, name` being primary key columns, `account_id` has primaryKeyPosition 1 and `name` primaryKeyPosition 2. Default to -1. 
| | logicalType | Logical Type | No | The logical field datatype. One of `string`, `date`, `timestamp`, `time`, `number`, `integer`, `object`, `array` or `boolean`. | -| logicalTypeOptions | Logical Type Options | No | Additional optional metadata to describe the logical type. See [here](#logical-type-options) for more details about supported options for each `logicalType`. | +| logicalTypeOptions | Logical Type Options | No | Additional optional metadata to describe the logical type. See [Logical Type Options](#logical-type-options) for more details about supported options for each `logicalType`. | | physicalType | Physical Type | No | The physical element data type in the data source. For example, VARCHAR(2), DOUBLE, INT. | | description | Description | No | Description of the element. | | required | Required | No | Indicates if the element may contain Null values; possible values are true and false. Default is false. | @@ -374,6 +374,7 @@ A fully formatted reference follows this structure: ``` Where: + * **``**: Path to the contract file (optional for same-contract references) * **``**: '#' symbol to mark entry into a contract (optional for same-contract) * **``**: The defined path within the contract @@ -823,13 +824,13 @@ schema: ### SQL -A single SQL query that returns either a numeric or boolean value for comparison. The query must be written in the SQL dialect specific to the provided server. `${object}` and `${property}` are automatically replaced by the current object (in the case of SQL on a relational database, the table or view name) and the current property name (in the case of SQL on a relational database, the column). +A single SQL query that returns either a numeric or boolean value for comparison. The query must be written in the SQL dialect specific to the provided server. `{object}` and `{property}` are automatically replaced by the current object (in the case of SQL on a relational database, the table or view name) and the current property name (in the case of SQL on a relational database, the column). ```yaml quality: - type: sql query: | - SELECT COUNT(*) FROM ${object} WHERE ${property} IS NOT NULL + SELECT COUNT(*) FROM {object} WHERE {property} IS NOT NULL mustBeLessThan: 3600 ``` @@ -872,7 +873,7 @@ The data contract can contain scheduling information for executing the rules. Yo quality: - type: sql query: | - SELECT COUNT(*) FROM ${object} WHERE ${property} IS NOT NULL + SELECT COUNT(*) FROM {object} WHERE {property} IS NOT NULL mustBeLessThan: 3600 scheduler: cron schedule: 0 20 * * * @@ -941,7 +942,7 @@ The operator specifies the condition to validate a metric or result of a SQL que quality: - type: sql query: | - SELECT COUNT(*) FROM ${table} WHERE ${column} IS NOT NULL + SELECT COUNT(*) FROM {table} WHERE {column} IS NOT NULL mustBeBetween: [0, 100] ``` @@ -951,7 +952,7 @@ is equivalent to: quality: - type: sql query: | - SELECT COUNT(*) FROM ${table} WHERE ${column} IS NOT NULL + SELECT COUNT(*) FROM {table} WHERE {column} IS NOT NULL mustBeGreaterThan: 0 mustBeLessThan: 100 ``` @@ -1172,6 +1173,7 @@ The `servers` element describes where the data protected by this data contract i An entry in `servers` describes a single dataset on a specific environment and a specific technology. The `servers` element can contain multiple servers, each with its own configuration. The typical ways of using the top level `servers` element are as follows: + * **Single Server:** The data contract protects a specific dataset at a specific location. 
*Example:* a CSV file on an SFTP server. * **Multiple Environments:** The data contract makes sure that the data is protected in all environments. *Example:* a data product with data in a dev(elopment), UAT, and prod(uction) environment on Databricks. * **Different Technologies:** The data contract makes sure that regardless of the offered technology, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content. From 2ecf858cc9471d5c5c91100822efaff035c689e8 Mon Sep 17 00:00:00 2001 From: pkoper Date: Tue, 14 Oct 2025 10:17:03 +0100 Subject: [PATCH 08/11] chore(DEV SETUP): README dev corrections --- docs/README.md | 871 ++++++++++++------------------------------------- 1 file changed, 217 insertions(+), 654 deletions(-) diff --git a/docs/README.md b/docs/README.md index c12808d..b93f4e4 100644 --- a/docs/README.md +++ b/docs/README.md @@ -7,19 +7,18 @@ image: "https://raw.githubusercontent.com/bitol-io/artwork/main/horizontal/color # Open Data Contract Standard ## Executive Summary - -This document describes the keys and values expected in a YAML data contract, per the **Open Data Contract Standard**. -It is divided in multiple sections: [fundamentals (fka demographics)](#fundamentals), [schema](#schema), -[data quality](#data-quality), [Support & communication channels](#support-and-communication-channels), [pricing](#pricing), [team](#team), -[roles](#roles), [service-level agreement](#service-level-agreement-sla), [Infrastructures & servers](#infrastructure-and-servers) and -[other/custom properties](#custom-properties). Each section starts with at least an example followed by definition of +This document describes the keys and values expected in a YAML data contract, per the **Open Data Contract Standard**. +It is divided in multiple sections: [fundamentals (fka demographics)](#fundamentals), [schema](#schema), +[data quality](#data-quality), [Support & communication channels](#support-and-communication-channels), [pricing](#pricing), [team](#team), +[roles](#roles), [service-level agreement](#service-level-agreement-sla), [Infrastructures & servers](#infrastructure-and-servers) and +[other/custom properties](#custom-properties). Each section starts with at least an example followed by definition of each field/key. + ## Table of content 1. [Fundamentals (fka demographics)](#fundamentals) 1. [Schema](#schema) -1. [References](#references) 1. [Data quality](#data-quality) 1. [Support & communication channels](#support-and-communication-channels) 1. [Pricing](#pricing) @@ -30,14 +29,15 @@ each field/key. 1. [Custom & other properties](#custom-properties) 1. [Examples](#full-example-1) + ## Notes * This contract is containing example values, we reviewed very carefully the consistency of those values, but we cannot guarantee that there are no errors. If you spot one, please raise an [issue](https://github.com/AIDAUserGroup/open-data-contract-standard/issues). * Some fields have `null` value: even if it is equivalent to not having the field in the contract, we wanted to have the field for illustration purpose. * This contract should be **platform agnostic**. If you think it is not the case, please raise an [issue](https://github.com/AIDAUserGroup/open-data-contract-standard/issues). -## Fundamentals +## Fundamentals This section contains general information about the contract. 
### Example @@ -48,7 +48,7 @@ kind: DataContract id: 53581432-6c55-4ba2-a65f-72344a91553a name: seller_payments_v1 -version: 1.1.0 # Data Contract Version +version: 1.1.0 # Data Contract Version status: active domain: seller dataProduct: payments @@ -64,27 +64,28 @@ tags: ['finance'] ### Definitions -| Key | UX label | Required | Description | -|--------------------------------------|---------------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| apiVersion | Standard version | Yes | Version of the standard used to build data contract. Default value is `v3.0.2`. | -| kind | Kind | Yes | The kind of file this is. Valid value is `DataContract`. | -| id | ID | Yes | A unique identifier used to reduce the risk of dataset name collisions, such as a UUID. | -| name | Name | No | Name of the data contract. | -| version | Version | Yes | Current version of the data contract. | -| status | Status | Yes | Current status of the data contract. Examples are "proposed", "draft", "active", "deprecated", "retired". | -| tenant | Tenant | No | Indicates the property the data is primarily associated with. Value is case insensitive. | -| domain | Domain | No | Name of the logical data domain. | -| dataProduct | Data Product | No | Name of the data product. | -| authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the data contract. | -| description | Description | No | Object containing the descriptions. | -| description.purpose | Purpose | No | Intended purpose for the provided data. | -| description.limitations | Limitations | No | Technical, compliance, and legal limitations for data use. | -| description.usage | Usage | No | Recommended usage of the data. | -| description.authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the dataset; examples would be a link to privacy statement, terms and conditions, license agreements, data catalog, or another tool. | -| description.customProperties | Custom Properties | No | Custom properties that are not part of the standard. | +| Key | UX label | Required | Description | +|--------------------------------------|---------------------------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| apiVersion | Standard version | Yes | Version of the standard used to build data contract. Default value is `v3.0.2`. | +| kind | Kind | Yes | The kind of file this is. Valid value is `DataContract`. | +| id | ID | Yes | A unique identifier used to reduce the risk of dataset name collisions, such as a UUID. | +| name | Name | No | Name of the data contract. | +| version | Version | Yes | Current version of the data contract. | +| status | Status | Yes | Current status of the data contract. Examples are "proposed", "draft", "active", "deprecated", "retired". | +| tenant | Tenant | No | Indicates the property the data is primarily associated with. Value is case insensitive. | +| tags | Tags | No | A list of tags that may be assigned to the elements (object or property); the tags keyword may appear at any level. Tags may be used to better categorize an element. 
For example, `finance`, `sensitive`, `employee_record`. | +| domain | Domain | No | Name of the logical data domain. | +| dataProduct | Data Product | No | Name of the data product. | +| authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the data contract. | +| description | Description | No | Object containing the descriptions. | +| description.purpose | Purpose | No | Intended purpose for the provided data. | +| description.limitations | Limitations | No | Technical, compliance, and legal limitations for data use. | +| description.usage | Usage | No | Recommended usage of the data. | +| description.authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the dataset; examples would be a link to privacy statement, terms and conditions, license agreements, data catalog, or another tool. | +| description.customProperties | Custom Properties | No | Custom properties that are not part of the standard. | -## Schema +## Schema This section describes the schema of the data contract. It is the support for data quality, which is detailed in the next section. Schema supports both a business representation of your data and a physical implementation. It allows to tie them together. In ODCS v3, the schema has evolved from the table and column representation, therefore the schema introduces a new terminology: @@ -108,10 +109,10 @@ schema: - name: tbl logicalType: object physicalType: table - physicalName: tbl_1 - description: Provides core payment metrics - authoritativeDefinitions: - - url: https://catalog.data.gov/dataset/air-quality + physicalName: tbl_1 + description: Provides core payment metrics + authoritativeDefinitions: + - url: https://catalog.data.gov/dataset/air-quality type: businessDefinition - url: https://youtu.be/jbY1BKFj9ec type: videoTutorial @@ -133,12 +134,12 @@ schema: - table_name_2 - table_name_3 transformLogic: sel t1.txn_dt as txn_ref_dt from table_name_1 as t1, table_name_2 as t2, table_name_3 as t3 where t1.txn_dt=date-3 - transformDescription: Defines the logic in business terms. + transformDescription: Defines the logic in business terms. examples: - 2022-10-03 - 2020-01-28 - name: rcvr_id - primaryKey: true + primaryKey: true primaryKeyPosition: 1 businessName: receiver id logicalType: string @@ -152,7 +153,7 @@ schema: classification: restricted encryptedName: enc_rcvr_id - name: rcvr_cntry_code - primaryKey: false + primaryKey: false primaryKeyPosition: -1 businessName: receiver country code logicalType: string @@ -182,7 +183,7 @@ schema: logicalType: object properties: - name: street_lines - logicalType: array + logicalType: array items: logicalType: string ``` @@ -200,7 +201,7 @@ schema: logicalType: object properties: - name: id - logicalType: string + logicalType: string physicalType: VARCHAR(40) - name: zip logicalType: string @@ -221,6 +222,7 @@ schema: |--------------------------|------------------------------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | name | Name | Yes | Name of the element. | | physicalName | Physical Name | No | Physical name. | +| physicalType | Physical Type | No | The physical element data type in the data source. For objects: `table`, `view`, `topic`, `file`. For properties: `VARCHAR(2)`, `DOUBLE`, `INT`, etc. 
| | description | Description | No | Description of the element. | | businessName | Business Name | No | The business name of the element. | | authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the element; examples would be a link to privacy statement, terms and conditions, license agreements, data catalog, or another tool. | @@ -236,15 +238,14 @@ schema: #### Applicable to Properties -Some keys are more applicable when the described property is a column. +Some keys are more applicable when the described property is a column. | Key | UX label | Required | Description | |--------------------------|------------------------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | primaryKey | Primary Key | No | Boolean value specifying whether the field is primary or not. Default is false. | | primaryKeyPosition | Primary Key Position | No | If field is a primary key, the position of the primary key element. Starts from 1. Example of `account_id, name` being primary key columns, `account_id` has primaryKeyPosition 1 and `name` primaryKeyPosition 2. Default to -1. | -| logicalType | Logical Type | No | The logical field datatype. One of `string`, `date`, `timestamp`, `time`, `number`, `integer`, `object`, `array` or `boolean`. | -| logicalTypeOptions | Logical Type Options | No | Additional optional metadata to describe the logical type. See [Logical Type Options](#logical-type-options) for more details about supported options for each `logicalType`. | -| physicalType | Physical Type | No | The physical element data type in the data source. For example, VARCHAR(2), DOUBLE, INT. | +| logicalType | Logical Type | No | The logical field datatype. One of `string`, `date`, `number`, `integer`, `object`, `array` or `boolean`. | +| logicalTypeOptions | Logical Type Options | No | Additional optional metadata to describe the logical type. See [here](#logical-type-options) for more details about supported options for each `logicalType`. | | description | Description | No | Description of the element. | | required | Required | No | Indicates if the element may contain Null values; possible values are true and false. Default is false. | | unique | Unique | No | Indicates if the element contains unique values; possible values are true and false. Default is false. | @@ -269,15 +270,13 @@ Additional metadata options to more accurately define the data type. | array | maxItems | Maximum Items | No | Maximum number of items. | | array | minItems | Minimum Items | No | Minimum number of items. | | array | uniqueItems | Unique Items | No | If set to true, all items in the array are unique. | -| date/timestamp/time | format | Format | No | Format of the date. Follows the format as prescribed by [JDK DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Default value is using ISO 8601: 'YYYY-MM-DDTHH:mm:ss.SSSZ'. For example, format 'yyyy-MM-dd'. | -| date/timestamp/time | exclusiveMaximum | Exclusive Maximum | No | All values must be strictly less than this value (values < exclusiveMaximum). | -| date/timestamp/time | exclusiveMinimum | Exclusive Minimum | No | All values must be strictly greater than this value (values > exclusiveMinimum). 
| -| date/timestamp/time | maximum | Maximum | No | All date values are less than or equal to this value (values <= maximum). | -| date/timestamp/time | minimum | Minimum | No | All date values are greater than or equal to this value (values >= minimum). | -| timestamp/time | timezone | Timezone | No | Whether the timestamp defines the timezone or not. If true, timezone information is included in the timestamp. | -| timestamp/time | defaultTimezone | Default Timezone | No | The default timezone of the timestamp. If timezone is not defined, the default timezone UTC is used. | -| integer/number | exclusiveMaximum | Exclusive Maximum | No | All values must be strictly less than this value (values < exclusiveMaximum). | -| integer/number | exclusiveMinimum | Exclusive Minimum | No | All values must be strictly greater than this value (values > exclusiveMinimum). | +| date | format | Format | No | Format of the date. Follows the format as prescribed by [JDK DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Default value is using ISO 8601: 'YYYY-MM-DDTHH:mm:ss.SSSZ'. For example, format 'yyyy-MM-dd'. | +| date | exclusiveMaximum | Exclusive Maximum | No | If set to true, all values are strictly less than the maximum value (values < maximum). Otherwise, less than or equal to the maximum value (values <= maximum). | +| date | exclusiveMinimum | Exclusive Minimum | No | If set to true, all values are strictly greater than the minimum value (values > minimum). Otherwise, greater than or equal to the minimum value (values >= minimum). | +| date | maximum | Maximum | No | All date values are less than or equal to this value (values <= maximum). | +| date | minimum | Minimum | No | All date values are greater than or equal to this value (values >= minimum). | +| integer/number | exclusiveMaximum | Exclusive Maximum | No | If set to true, all values are strictly less than the maximum value (values < maximum). Otherwise, less than or equal to the maximum value (values <= maximum). | +| integer/number | exclusiveMinimum | Exclusive Minimum | No | If set to true, all values are strictly greater than the minimum value (values > minimum). Otherwise, greater than or equal to the minimum value (values >= minimum). | | integer/number | format | Format | No | Format of the value in terms of how many bits of space it can use and whether it is signed or unsigned (follows the Rust integer types). | | integer/number | maximum | Maximum | No | All values are less than or equal to this value (values <= maximum). | | integer/number | minimum | Minimum | No | All values are greater than or equal to this value (values >= minimum). | @@ -288,11 +287,11 @@ Additional metadata options to more accurately define the data type. | string | format | Format | No | Provides extra context about what format the string follows. For example, password, byte, binary, email, uuid, uri, hostname, ipv4, ipv6. | | string | maxLength | Maximum Length | No | Maximum length of the string. | | string | minLength | Minimum Length | No | Minimum length of the string. | -| string | pattern | Pattern | No | Regular expression pattern to define valid value. Follows regular expression syntax from ECMA-262 (). | +| string | pattern | Pattern | No | Regular expression pattern to define valid value. Follows regular expression syntax from ECMA-262 (https://262.ecma-international.org/5.1/#sec-15.10.1). 
|

#### Expressing Date / Datetime / Timezone information

-Given the complexity of handling various date and time formats (e.g., date, datetime, time, timestamp, timestamp with and without timezone), the existing `logicalType` options currently support `date`, `timestamp`, and `time`. To specify additional temporal details, `logicalType` should be used in conjunction with `logicalTypeOptions.format` or `physicalType` to define the desired format. Using `physicalType` allows for definition of your data-source specific data type.
+Given the complexity of handling various date and time formats (e.g., date, datetime, time, timestamp, timestamp with and without timezone), the existing `logicalType` options currently support only `date`. To specify additional temporal details, `logicalType` should be used in conjunction with `logicalTypeOptions.format` or `physicalType` to define the desired format. Using `physicalType` allows for definition of your data-source specific data type.

``` yaml
version: 1.0.0
kind: DataContract
id: 53581432-6c55-4ba2-a65f-72344a91553a
status: active
name: date_example
apiVersion: v3.0.2
schema:
-  # Date Only
+  # Date Only
  - name: event_date
    logicalType: date
    logicalTypeOptions:
-      format: "yyyy-MM-dd"
+      - format: "yyyy-MM-dd"
    examples:
      - "2024-07-10"

-  # Date & Time (UTC)
+  # Date & Time (UTC)
  - name: created_at
-    logicalType: timestamp
+    logicalType: date
    logicalTypeOptions:
-      format: "yyyy-MM-ddTHH:mm:ssZ"
+      - format: "yyyy-MM-ddTHH:mm:ssZ"
    examples:
      - "2024-03-10T14:22:35Z"

-  # Date & Time (Australia/Sydney)
-  - name: created_at_sydney
-    logicalType: timestamp
-    logicalTypeOptions:
-      format: "yyyy-MM-ddTHH:mm:ssZ"
-      timezone: true
-      defaultTimezone: "Australia/Sydney"
-    examples:
-      - "2024-03-10T14:22:35+10:00"
-
-  # Time Only
+  # Time Only
  - name: event_start_time
-    logicalType: time
+    logicalType: date
    logicalTypeOptions:
-      format: "HH:mm:ss"
+      - format: "HH:mm:ss"
    examples:
      - "08:30:00"

  # Physical Type with Date & Time (UTC)
  - name: event_date
-    logicalType: timestamp
+    logicalType: date
    physicalType: DATETIME
    logicalTypeOptions:
-      format: "yyyy-MM-ddTHH:mm:ssZ"
+      - format: "yyyy-MM-ddTHH:mm:ssZ"
    examples:
      - "2024-03-10T14:22:35Z"
```

### Authoritative definitions
-
Reference to an external definition on element logic or values.

-| Key | UX label | Required | Description |
-|--------------------------------------|-------------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| authoritativeDefinitions | Link | No | A list of type/link pairs for authoritative definitions. |
-| authoritativeDefinitions.type | Definition type | Yes | Type of definition for authority. Valid values are: `businessDefinition`, `transformationImplementation`, `videoTutorial`, `tutorial`, and `implementation`. |
-| authoritativeDefinitions.url | URL to definition | Yes | URL to the authority. |
-| authoritativeDefinitions.description | Description | No | Description for humans |
-
-## References
-
-This section describes how to reference elements within a data contract schema. References enable you to create relationships between different parts of your data contract.
-
-> [!IMPORTANT]
-> References are currently only supported within schema properties for foreign key relationships. 
- -### Reference Structure - -A fully formatted reference follows this structure: - -```yaml - -``` - -Where: - -* **``**: Path to the contract file (optional for same-contract references) -* **``**: '#' symbol to mark entry into a contract (optional for same-contract) -* **``**: The defined path within the contract - -### External Contract References - -To identify a contract, use one of these formats: - -```yaml -# Same folder as current contract -data-contract-v1.yaml - -# Full path -file:///path/to/data-contract-v1.yaml - -# URL -https://example.com/data-contract-v1.yaml - -# Relative path -../../path/to/data-contract-v1.yaml -``` - -### Reference Examples - -#### External Contract References - -```yaml -# Reference to an element in an external contract -'external-contract.yaml#schema.my-table' - -# Reference to a specific column in an external contract -'external-contract.yaml#schema.my-table.my-column' -``` - -#### Same Contract References - -When referencing elements within the same contract, the file component can be omitted. - -```yaml -# Full reference within same contract -'#schema.my-table.my-column' - -# File and anchor can be omitted for same contract -'schema.my-table.my-column' -``` - -### Shorthand Notation - -For improved readability, ODCS supports the following shorthand notation when referencing properties within the same schema. The shorthand notation allows for a more concise way to define relationships. It can be used in the `to` and `from` fields of a relationship. -The shorthand notation is `.`. - -These shorthand options are only available for properties within the same data contract. - -### Relationships between properties (Foreign Keys) - -Properties can define relationships to other properties, enabling you to specify foreign key constraints and other data relationships. Relationships use the reference mechanism from RFC 9. - -#### Quick Overview - -Relationships can be defined in two ways: - -1. **At the property level** - Define relationships directly on a property (the `from` field is implicit and must NOT be specified) -2. **At the schema level** - Define relationships between any properties (both `from` and `to` are required) - -#### Important Rules - -* **Property-level relationships**: The `from` field is implicit (derived from the property context) and must NOT be specified -* **Schema-level relationships**: Both `from` and `to` fields are required -* **Type consistency**: Both `from` and `to` must be the same type - either both strings (single column) or both arrays (composite keys). Mixing types is not allowed -* **Array length validation**: When using arrays for composite keys, both arrays must have the same number of elements. 
This is validated at runtime by implementations - -#### Field Definitions - -| Key | UX Label | Required | Description | -|-----|----------|----------|-------------| -| relationships | Relationships | No | Array of relationship definitions | -| relationships.type | Type | No | Type of relationship (defaults to `foreignKey`) | -| relationships.to | To | Yes | Target property reference using `schema.property` notation | -| relationships.from | From | Context-dependent | Source property reference - Required at schema level, forbidden at property level | -| relationships.customProperties | Custom Properties | No | Additional metadata about the relationship | - -#### Reference Notation - -* **Simple reference**: `users.id` - References the `id` property in the `users` schema -* **Nested reference**: `accounts.address.street` - References nested properties -* **Composite keys**: Use arrays to define composite keys (arrays must have matching lengths) - -### Examples - -#### Example 1: Simple Foreign Key (Property Level) - -When defining a relationship at the property level, the `from` field is implicit and must NOT be specified: - -```yaml -schema: - - name: users - properties: - - name: user_id - relationships: - - to: accounts.owner_id # 'from' is implicit (users.user_id) - # Note: DO NOT include 'from' field at property level -``` - -#### Example 2: Multiple Relationships - -A property can have multiple relationships: - -```yaml -schema: - - name: orders - properties: - - name: customer_id - relationships: - - to: customers.id - - to: loyalty_members.customer_id -``` - -#### Example 3: Schema-Level Relationships - -Define relationships at the schema level when you need explicit `from` and `to`. Both fields are REQUIRED at this level: - -```yaml -schema: - - name: users - relationships: - - from: users.account_id # Required at schema level - to: accounts.id # Required at schema level - type: foreignKey -``` - -#### Example 4: Nested Properties - -Reference nested properties using dot notation: - -```yaml -schema: - - name: users - properties: - - name: id - relationships: - - to: accounts.address.postal_code -``` - -#### Example 5: Composite Keys - -For composite foreign keys, use arrays. 
**Important**: Both `from` and `to` must be arrays with the same number of elements: - -```yaml -schema: - - name: order_items - relationships: - - type: foreignKey - from: # Array (must match 'to' length) - - order_items.order_id - - order_items.product_id - to: # Array (must match 'from' length) - - product_inventory.order_id - - product_inventory.product_id - -``` - -#### Example 6: Invalid Configurations - -Here are examples of invalid configurations that will be rejected: - -```yaml -# INVALID: 'from' specified at property level -schema: - - name: users - properties: - - name: user_id - relationships: - - from: users.user_id # ERROR: 'from' not allowed at property level - to: accounts.id - -# INVALID: Mismatched array types -schema: - - name: orders - relationships: - - from: orders.id # ERROR: 'from' is string but 'to' is array - to: - - items.order_id - - items.line_num - -# INVALID: Different array lengths (caught at runtime) -schema: - - name: orders - relationships: - - from: # 'from' has 2 elements - - orders.id - - orders.customer_id - to: # 'to' has 3 elements (runtime validation will fail) - - items.order_id - - items.customer_id - - items.line_num - -# INVALID: Missing 'from' at schema level -schema: - - name: orders - relationships: - - to: customers.id # ERROR: 'from' is required at schema level -``` - -#### Complete Example - -Here's a comprehensive example showing various relationship patterns: - -```yaml -schema: - - name: users - properties: - - name: id - relationships: - # Simple foreign key (from is implicit) - - to: accounts.user_id - - # With explicit from field - - from: users.id - to: profiles.user_id - - # With custom properties - - to: departments.manager_id - customProperties: - - property: cardinality - value: "one-to-many" - - property: label - value: "manages" - - # To external contract (from is implicit) - - to: https://example.com/data-contract-v1.yaml#profiles.user_id - customProperties: - - property: description - value: "Externally referenced contract" - - - name: account_number - - # Schema-level composite key relationship - relationships: - - type: foreignKey - from: - - users.id - - users.account_number - to: - - accounts.user_id - - accounts.account_number - - - name: accounts - properties: - - name: user_id - - name: account_number - - name: address - properties: - - name: street - - name: postal_code -``` +| Key | UX label | Required | Description | +|-------------------------------|-------------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| +| authoritativeDefinitions | Link | No | A list of type/link pairs for authoritative definitions. | +| authoritativeDefinitions.type | Definition type | Yes | Type of definition for authority. Valid values are: `businessDefinition`, `transformationImplementation`, `videoTutorial`, `tutorial`, and `implementation`. | +| authoritativeDefinitions.url | URL to definition | Yes | URL to the authority. | ## Data quality - This section describes data quality rules & parameters. They are tightly linked to the schema described in the previous section. Data quality rules support different levels/stages of data quality attributes: -* **Text**: A human-readable text that describes the quality of the data. -* **Library** : A maintained library of commonly used quality metrics such as `rowCount`, `nullValues`, `invalidValues`, and more. 
-* **SQL**: An individual SQL query that returns a value that can be compared. -* **Custom**: Quality attributes that are vendor-specific, such as Soda, Great Expectations, dbt tests, dbx, or Montecarlo monitors. + - __Text__: A human-readable text that describes the quality of the data. + - __Library__ rules: A maintained library of commonly-used predefined quality attributes such as `rowCount`, `unique`, `freshness`, and more. + - __SQL__: An individual SQL query that returns a value that can be compared. Can be extended to `Python` or other. + - __Custom__: Quality attributes that are vendor-specific, such as Soda, Great Expectations, dbt tests, or Montecarlo monitors. ### Text - -A human-readable text that describes the quality of the data. Later in the development process, these might be translated into an executable check (such as `sql`), a library metric, or checked through an AI engine. +A human-readable text that describes the quality of the data. Later in the development process, these might be translated into an executable check (such as `sql`), a library rule, or checked through an AI engine. ```yaml quality: @@ -653,189 +365,67 @@ quality: ``` ### Library +ODCS will provide a set of predefined rules commonly used in data quality checks, designed to be compatible with all major data quality engines. This simplifies the work for data engineers by eliminating the need to manually write SQL queries. -ODCS provides a set of predefined metrics commonly used in data quality checks, designed to be compatible with all major data quality engines. This simplifies the work for data engineers by eliminating the need to manually write SQL queries. - -The type for library metrics is `library`, which can be omitted, if a `metric` property is defined. - -These metrics return a numeric value come with an operator to compare if the metric is valid and in the expected range. - -Some metrics require additional parameters, which can be defined in the `arguments` property. - -Example: - -```yaml -properties: - - name: order_id - quality: - - type: library - metric: nullValues - mustBe: 0 - unit: rows - description: "There must be no null values in the column." -``` - -is equalized to: - -```yaml -properties: - - name: order_id - quality: - - metric: nullValues - mustBe: 0 - description: "There must be no null values in the column." -``` - -#### Metrics - -| Metric | Level | Description | Arguments | Arguments Example | -|--------|--------|----------------------------------------------------------------|------------------------------------------------------------------|----------------------------------------------------------------------| -| `nullValues` | Property | Counts null values in a column/field | None | | -| `missingValues` | Property | Counts values considered as missing (empty strings, N/A, etc.) | `missingValues`: Array of values considered missing | `missingValues: [null, '', 'N/A']` | -| `invalidValues` | Property | Counts values that don't match valid criteria | `validValues`: Array of valid values
`pattern`: Regex pattern | `validValues: ['pounds', 'kg']`
`pattern: '^[A-Z]{2}[0-9]{2}...'` | -| `duplicateValues` | Property | Counts duplicate values in a column | None | | -| `duplicateValues` | Schema | Counts duplicate values across multiple columns | `properties`: Array of property names | `properties: ['tenant_id', 'order_id']` | -| `rowCount` | Schema | Counts total number of rows in a table/object store | None | | - -##### Null Values - -Check that the count of null values is within range. - -```yaml -properties: - - name: customer_id - quality: - - metric: nullValues - mustBe: 0 - description: "There must be no null values in the column." -``` - -Example with percent: - -```yaml -properties: - - name: order_status - quality: - - metric: nullValues - mustBeLessThan: 1 - unit: percent - description: "There must be less than 1% null values in the column." -``` - -##### Missing Values - -Check that the missing values are within range. - -In the argument `missingValues`, a list of values that are considered to be missing. - -```yaml -properties: - - name: email_address - quality: - - metric: missingValues - arguments: - missingValues: [null, '', 'N/A', 'n/a'] - mustBeLessThan: 100 - unit: rows # rows (default) or percent -``` - -##### Invalid Values - -Check that the value is within a defined set or matching a pattern. - -```yaml -properties: - - name: line_item_unit - quality: - - metric: invalidValues - arguments: - validValues: ['pounds', 'kg'] - mustBeLessThan: 5 - unit: rows -``` - -Using a pattern: - -```yaml -properties: - - name: iban - quality: - - metric: invalidValues - mustBe: 0 - description: "The value must be an IBAN." - arguments: - pattern: '^[A-Z]{2}[0-9]{2}[A-Z0-9]{4}[0-9]{7}([A-Z0-9]?){0,16}$' -``` - -##### Duplicate Values +#### Property-level +Those examples apply at the property level, such as column, field, etc. +##### Duplicate count on rows No more than 10 duplicate names. ```yaml -properties: - - name: email_address - quality: - - metric: duplicateValues - mustBeLessThan: 10 - unit: rows - description: "There must be less than 10 duplicate values in the column." +quality: +- type: library # optional and default value for data quality rules + rule: duplicateCount + mustBeLessThan: 10 + name: Fewer than 10 duplicate names + unit: rows ``` +##### Duplicate count on % Duplicates should be less than 1%. ```yaml -properties: - - name: phone_number - quality: - - metric: duplicateValues - mustBeLessThan: 1 - unit: percent +quality: +- rule: duplicateCount + mustBeLessThan: 1 + unit: percent ``` -##### Row count (Schema-Level) - -Calculates the number of rows (usually in a table) and compares it to an absolute operator. +##### Valid values +Valid values from a static list. ```yaml -schema: - - name: orders - quality: - - metric: rowCount - mustBeBetween: [100, 120] +quality: +- rule: validValues + validValues: ['pounds'] ``` -##### Duplicates (Schema-Level) +#### Object-level +This example applies at the object level (like a table or a view). -Checks for duplicate rows based on a combination of properties. -This is useful for validating compound keys where uniqueness is defined not by a single column but by multiple columns together. +##### Row count +The number of rows must be between 100 and 120. ```yaml -schema: - - name: orders - quality: - - description: The combination of tenant_id and order_id must be unique - metric: duplicateValues - mustBe: 0 - arguments: - properties: # Properties refer to the property in the schema. 
- - tenant_id - - order_id +quality: + - rule: rowCount + mustBeBetween: [100, 120] + name: Verify row count range ``` ### SQL - -A single SQL query that returns either a numeric or boolean value for comparison. The query must be written in the SQL dialect specific to the provided server. `{object}` and `{property}` are automatically replaced by the current object (in the case of SQL on a relational database, the table or view name) and the current property name (in the case of SQL on a relational database, the column). +A single SQL query that returns either a numeric or boolean value for comparison. The query must be written in the SQL dialect specific to the provided server. `${object}` and `${property}` are automatically replaced by the current object (in the case of SQL on a relational database, the table or view name) and the current property name (in the case of SQL on a relational database, the column). ```yaml quality: - - type: sql + - type: sql query: | - SELECT COUNT(*) FROM {object} WHERE {property} IS NOT NULL - mustBeLessThan: 3600 + SELECT COUNT(*) FROM ${object} WHERE ${property} IS NOT NULL + mustBeLessThan: 3600 ``` ### Custom - Custom rules allow for vendor-specific checks, including tools like Soda, Great Expectations, dbt-tests, Montecarlo, and others. Any format for properties is acceptable, whether it's written in YAML, JSON, XML, or even uuencoded binary. They are an intermediate step before the vendor accepts ODCS natively. #### Soda Example @@ -866,23 +456,22 @@ quality: ``` ### Scheduling - -The data contract can contain scheduling information for executing the rules. You can use `schedule` and `scheduler` for those operation. In previous versions of ODCS, the only allowed scheduler was cron and its syntax was `scheduleCronExpression`. +The data contract can contain scheduling information for executing the rules. You can use `schedule` and `scheduler` for those operation. In previous versions of ODCS, the only allowed scheduler was cron and its syntax was `scheduleCronExpression`. ```yaml quality: - - type: sql + - type: sql query: | - SELECT COUNT(*) FROM {object} WHERE {property} IS NOT NULL - mustBeLessThan: 3600 + SELECT COUNT(*) FROM ${object} WHERE ${property} IS NOT NULL + mustBeLessThan: 3600 scheduler: cron schedule: 0 20 * * * ``` + ### Definitions Acronyms: - * DQ: data quality. | Key | UX label | Required | Description | @@ -891,11 +480,10 @@ Acronyms: | quality.name | Name | No | A short name for the rule. | | quality.description | Description | No | Describe the quality check to be completed. | | quality.type | Type | No | Type of DQ rule. Valid values are `library` (default), `text`, `sql`, and `custom`. | -| quality.metric | Metric name | No | Required for `library`: the name of the metric to be calculated and compared. | -| quality.rule | Rule name | No | Deprecated, use `metric` instead. | -| quality.arguments | Arguments | No | Additional arguments for the metric, if needed. | +| quality.rule | Rule name | No | Required for `library` DQ rules: the name of the rule to be executed. | | quality.\ | See below | No | Multiple values are allowed for the **property**, the value is the one to compare to. | -| quality.unit | Unit | No | Unit the rule is using, popular values are `rows` or `percent`. | +| quality.unit | Unit | No | Unit the rule is using, popular values are `rows` or `percent`, but any value is allowed. | +| quality.validValues | Valid values | No | Static list of valid values. 
|
| quality.query | SQL Query | No | Required for `sql` DQ rules: the SQL query to be executed. Note that it should match the target SQL engine/database, no translation service is provided here. |
| quality.engine | Third-party DQ Engine | No | Required for `custom` DQ rule: name of the third-party engine being used. Any value is authorized here but common values are `soda`, `greatExpectations`, `montecarlo`, etc. |
| quality.implementation | Third-party Implementation | No | A text (non-parsed) block of code required for the third-party DQ engine to run. |
@@ -910,56 +498,58 @@ Acronyms:
| quality.schedule | Scheduler Configuration | No | Configuration information for the scheduling tool, for `cron` a possible value is `0 20 * * *`. |

#### Valid Values for Dimension
-
Those data quality dimensions are used for classification and reporting in data quality. Valid values are:
-* `accuracy` (synonym `ac`),
-* `completeness` (synonym `cp`),
-* `conformity` (synonym `cf`),
-* `consistency` (synonym `cs`),
-* `coverage` (synonym `cv`),
-* `timeliness` (synonym `tm`),
-* `uniqueness` (synonym `uq`).
+ * `accuracy` (synonym `ac`),
+ * `completeness` (synonym `cp`),
+ * `conformity` (synonym `cf`),
+ * `consistency` (synonym `cs`),
+ * `coverage` (synonym `cv`),
+ * `timeliness` (synonym `tm`),
+ * `uniqueness` (synonym `uq`).

#### Valid Properties for Operator
-
-The operator specifies the condition to validate a metric or result of a SQL query.
+The operator specifies the condition to validate the rule.

| Operator | Expected Value | Math Symbol | Example |
|--------------------------|---------------------|-------------|------------------------------|
-| `mustBe` | number | `=` | `mustBe: 5` |
+| `mustBe` | number | `=` | `mustBe: 5` |
| `mustNotBe` | number | `<>`, `β‰ ` | `mustNotBe: 3.14` |
| `mustBeGreaterThan` | number | `>` | `mustBeGreaterThan: 59` |
| `mustBeGreaterOrEqualTo` | number | `>=`, `β‰₯` | `mustBeGreaterOrEqualTo: 60` |
| `mustBeLessThan` | number | `<` | `mustBeLessThan: 1000` |
| `mustBeLessOrEqualTo` | number | `<=`, `≀` | `mustBeLessOrEqualTo: 999` |
-| `mustBeBetween` | list of two numbers | `∈` | `mustBeBetween: [0, 100]` |
-| `mustNotBeBetween` | list of two numbers | `βˆ‰` | `mustNotBeBetween: [0, 100]` |
+| `mustBeBetween` | list of two numbers | `βŠ‚` | `mustBeBetween: [0, 100]` |
+| `mustNotBeBetween` | list of two numbers | `βŠ„` | `mustNotBeBetween: [0, 100]` |

`mustBeBetween` is equivalent to `mustBeGreaterThan` and `mustBeLessThan`.

```yaml
quality:
-  - type: sql
+  - type: sql
    query: |
-      SELECT COUNT(*) FROM {table} WHERE {column} IS NOT NULL
-    mustBeBetween: [0, 100]
+      SELECT COUNT(*) FROM ${table} WHERE ${column} IS NOT NULL
+    mustBeBetween: [0, 100]
```

is equivalent to:

```yaml
quality:
-  - type: sql
+  - type: sql
    query: |
-      SELECT COUNT(*) FROM {table} WHERE {column} IS NOT NULL
+      SELECT COUNT(*) FROM ${table} WHERE ${column} IS NOT NULL
    mustBeGreaterThan: 0
-    mustBeLessThan: 100
+    mustBeLessThan: 100
```

-## Support and Communication Channels
-Support and communication channels help consumers find help regarding their use of the data contract.
+#### Library Rules
+Bitol has the ambition of creating a library of common data quality rules. Join the working group around [RFC #0012](https://github.com/bitol-io/tsc/blob/main/rfcs/0012-implicit-dq-rules.md).
+
+
+## Support and Communication Channels
+Support and communication channels help consumers find help regarding their use of the data contract. 
### Examples @@ -967,7 +557,8 @@ Support and communication channels help consumers find help regarding their use ```yaml support: - - channel: "#my-channel" # Simple Slack communication channel + - channel: channel-name-or-identifier # Simple Slack communication channel + url: https://aidaug.slack.com/archives/C05UZRSBKLY - channel: channel-name-or-identifier # Simple distribution list url: mailto:datacontract-ann@bitol.io ``` @@ -1001,20 +592,19 @@ support: ### Definitions -| Key | UX label | Required | Description | -|-----------------------|----------------|----------|-----------------------------------------------------------------------------------------------------------------------------------| -| support | Support | No | Top level for support channels. | -| support.channel | Channel | Yes | Channel name or identifier. | -| support.url | Channel URL | No | Access URL using normal [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax) (https, mailto, etc.). | -| support.description | Description | No | Description of the channel, free text. | -| support.tool | Tool | No | Name of the tool, value can be `email`, `slack`, `teams`, `discord`, `ticket`, `googlechat`, or `other`. | -| support.scope | Scope | No | Scope can be: `interactive`, `announcements`, `issues`, `notifications`. | -| support.invitationUrl | Invitation URL | No | Some tools uses invitation URL for requesting or subscribing. Follows the [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax). | -| support.customProperties | Custom Properties | No | Any custom properties. | +| Key | UX label | Required | Description | +|-----------------------|----------------|----------|-------------------------------------------------------------------------------------------------------------------------------------| +| support | Support | No | Top level for support channels. | +| support.channel | Channel | Yes | Channel name or identifier. | +| support.url | Channel URL | Yes | Access URL using normal [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax) (https, mailto, etc.). | +| support.description | Description | No | Description of the channel, free text. | +| support.tool | Tool | No | Name of the tool, value can be `email`, `slack`, `teams`, `discord`, `ticket`, or `other`. | +| support.scope | Scope | No | Scope can be: `interactive`, `announcements`, `issues`. | +| support.invitationUrl | Invitation URL | No | Some tools uses invitation URL for requesting or subscribing. Follows the [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax). | -## Pricing -This section covers pricing when you bill your customer for using this data product. +## Pricing +This section covers pricing when you bill your customer for using this data product. ### Example @@ -1034,12 +624,11 @@ price: | price.priceCurrency | Price Currency | No | Currency of the subscription price in `price.priceAmount`. | | price.priceUnit | Price Unit | No | The unit of measure for calculating cost. Examples megabyte, gigabyte. | -## Team +## Team This section lists team members and the history of their relation with this data contract. In v2.x, this section was called stakeholders. ### Example - ```YAML team: - username: ceastwood @@ -1058,8 +647,7 @@ team: ``` ### Definitions - -The UX label is the label used in the UI and other user experiences. +The UX label is the label used in the UI and other user experiences. 
| Key | UX label | Required | Description | |-------------------------|----------------------|----------|--------------------------------------------------------------------------------------------| @@ -1073,7 +661,6 @@ The UX label is the label used in the UI and other user experiences. | team.replacedByUsername | Replaced By Username | No | The username of the user who replaced the previous user. | ## Roles - This section lists the roles that a consumer may need to access the dataset depending on the type of access they require. ### Example @@ -1110,9 +697,9 @@ roles: | roles.secondLevelApprovers | 2nd Level Approvers | No | The name(s) of the second-level approver(s) of the role. | | roles.customProperties | Custom Properties | No | Any custom properties. | -## Service-Level Agreement (SLA) -This section describes the service-level agreements (SLA). +## Service-Level Agreement (SLA) +This section describes the service-level agreements (SLA). * Use the `Object.Element` to indicate the number to do the checks on, as in `SELECT txn_ref_dt FROM tab1`. * Separate multiple object.element by a comma, as in `table1.col1`, `table2.col1`, `table1.col2`. @@ -1121,14 +708,14 @@ This section describes the service-level agreements (SLA). ### Example ```YAML +slaDefaultElement: tab1.txn_ref_dt # Optional, default value is partitionColumn. slaProperties: - property: latency # Property, see list of values in DP QoS value: 4 unit: d # d, day, days for days; y, yr, years for years - element: tab1.txn_ref_dt + element: tab1.txn_ref_dt # This would not be needed as it is the same table.column as the default one - property: generalAvailability value: 2022-05-12T09:30:10-08:00 - description: GA at 12.5.22 - property: endOfSupport value: 2032-05-12T09:30:10-08:00 - property: endOfLife @@ -1154,17 +741,16 @@ slaProperties: ### Definitions -| Key | UX label | Required | Description | -|------------------------------------|------------------------|--------------------------------|-------------------------------------------------------------------------------------------------------------------| -| ~~slaDefaultElement~~ (Deprecated) | Default SLA element(s) | No | DEPRECATED SINCE 3.1. WILL BE REMOVED IN ODCS 4.0. Element (using the element path notation) to do the checks on. | -| slaProperties | SLA | No | A list of key/value pairs for SLA specific properties. There is no limit on the type of properties. | -| slaProperties.property | Property | Yes | Specific property in SLA, check the Data QoS periodic table. May requires units. | -| slaProperties.value | Value | Yes | Agreement value. The label will change based on the property itself. | -| slaProperties.valueExt | Extended value | No - unless needed by property | Extended agreement value. The label will change based on the property itself. | -| slaProperties.unit | Unit | No - unless needed by property | **d**, day, days for days; **y**, yr, years for years, etc. Units use the ISO standard. | -| slaProperties.element | Element(s) | No | Element(s) to check on. Multiple elements should be extremely rare and, if so, separated by commas. | -| slaProperties.driver | Driver | No | Describes the importance of the SLA from the list of: `regulatory`, `analytics`, or `operational`. | -| slaProperties.description | Description | No | Description of the SLA for humans. 
| +| Key | UX label | Required | Description | +|------------------------|------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------| +| slaDefaultElement | Default SLA element(s) | No | Element (using the element path notation) to do the checks on. | +| slaProperties | SLA | No | A list of key/value pairs for SLA specific properties. There is no limit on the type of properties. | +| slaProperties.property | Property | Yes | Specific property in SLA, check the Data QoS periodic table. May requires units. | +| slaProperties.value | Value | Yes | Agreement value. The label will change based on the property itself. | +| slaProperties.valueExt | Extended value | No - unless needed by property | Extended agreement value. The label will change based on the property itself. | +| slaProperties.unit | Unit | No - unless needed by property | **d**, day, days for days; **y**, yr, years for years, etc. Units use the ISO standard. | +| slaProperties.element | Element(s) | No | Element(s) to check on. Multiple elements should be extremely rare and, if so, separated by commas. | +| slaProperties.driver | Driver | No | Describes the importance of the SLA from the list of: `regulatory`, `analytics`, or `operational`. | ## Infrastructure and Servers @@ -1173,11 +759,10 @@ The `servers` element describes where the data protected by this data contract i An entry in `servers` describes a single dataset on a specific environment and a specific technology. The `servers` element can contain multiple servers, each with its own configuration. The typical ways of using the top level `servers` element are as follows: - -* **Single Server:** The data contract protects a specific dataset at a specific location. *Example:* a CSV file on an SFTP server. -* **Multiple Environments:** The data contract makes sure that the data is protected in all environments. *Example:* a data product with data in a dev(elopment), UAT, and prod(uction) environment on Databricks. -* **Different Technologies:** The data contract makes sure that regardless of the offered technology, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content. -* **Different Technologies and Multiple Environments:** The data contract makes sure that regardless of the offered technology and environment, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content in dev(elopment), UAT, and prod(uction). +- **Single Server:** The data contract protects a specific dataset at a specific location. *Example:* a CSV file on an SFTP server. +- **Multiple Environments:** The data contract makes sure that the data is protected in all environments. *Example:* a data product with data in a dev(elopment), UAT, and prod(uction) environment on Databricks. +- **Different Technologies:** The data contract makes sure that regardless of the offered technology, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content. +- **Different Technologies and Multiple Environments:** The data contract makes sure that regardless of the offered technology and environment, it still holds. 
*Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content in dev(elopment), UAT, and prod(uction). ### General Server Structure @@ -1198,14 +783,14 @@ servers: #### Common Server Properties -| Key | UX label | Required | Description | -|------------------|-------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| server | Server | Yes | Identifier of the server. | -| type | Type | Yes | Type of the server. Can be one of: api, athena, azure, bigquery, clickhouse, cloudsql, custom, databricks, db2, denodo, dremio, duckdb, glue, hive, informix, kafka, kinesis, local, mysql, oracle, postgres, postgresql, presto, pubsub, redshift, s3, sftp, snowflake, sqlserver, synapse, trino, vertica. | -| description | Description | No | Description of the server. | -| environment | Environment | No | Environment of the server. Examples includes: prod, preprod, dev, uat. | -| roles | Roles | No | List of roles that have access to the server. Check [roles](#roles) section for more details. | -| customProperties | Custom Properties | No | Custom properties that are not part of the standard. | +| Key | UX label | Required | Description | +|------------------|-------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| server | Server | Yes | Identifier of the server. | +| type | Type | Yes | Type of the server. Can be one of: api, athena, azure, bigquery, clickhouse, databricks, denodo, dremio, duckdb, glue, cloudsql, db2, informix, kafka, kinesis, local, mysql, oracle, postgresql, postgres, presto, pubsub, redshift, s3, sftp, snowflake, sqlserver, synapse, trino, vertica, custom. | +| description | Description | No | Description of the server. | +| environment | Environment | No | Environment of the server. Examples includes: prod, preprod, dev, uat. | +| roles | Roles | No | List of roles that have access to the server. Check [roles](#roles) section for more details. | +| customProperties | Custom Properties | No | Custom properties that are not part of the standard. | ### Specific Server Properties @@ -1215,36 +800,35 @@ Each server type can be customized with different properties such as `host`, `po If your server is not in the list, please use [custom](#custom-server) and suggest it as an improvement. 
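For illustration, here is a minimal sketch of such a fallback entry. The server identifier and the `endpoint` custom property are hypothetical; only `server`, `type`, `description`, `environment`, and `customProperties` come from the common server properties above:

```yaml
servers:
  - server: my-event-store        # hypothetical identifier
    type: custom                  # fallback for technologies not yet in the list
    description: In-house event store not yet covered by the standard
    environment: prod
    customProperties:
      - property: endpoint        # illustrative custom property, not part of the standard
        value: https://events.internal.example.com
```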
Possible values for `type` are: -* [api](#api-server) -* [athena](#amazon-athena-server) -* [azure](#azure-server) -* [bigquery](#google-bigquery) -* [clickhouse](#clickhouse-server) -* [cloudsql](#google-cloud-sql) -* [databricks](#databricks-server) -* [db2](#ibm-db2-server) -* [denodo](#denodo-server) -* [dremio](#dremio-server) -* [duckdb](#duckdb-server) -* [glue](#amazon-glue) -* [hive](#hive) -* [informix](#ibm-informix-and-hcl-informix) -* [kafka](#kafka-server) -* [kinesis](#amazon-kinesis) -* [local](#local-files) -* [mysql](#mysql-server) -* [oracle](#oracle) -* [postgresql](#postgresql) -* [presto](#presto-server) -* [pubsub](#google-pubsub) -* [redshift](#amazon-redshift-server) -* [s3](#amazon-s3-server-and-compatible-servers) -* [sftp](#sftp-server) -* [snowflake](#snowflake) -* [sqlserver](#microsoft-sql-server) -* [synapse](#synapse-server) -* [trino](#trino-server) -* [vertica](#vertica-server) +- [api](#api-server) +- [athena](#amazon-athena-server) +- [azure](#azure-server) +- [bigquery](#google-bigquery) +- [clickhouse](#clickhouse-server) +- [databricks](#databricks-server) +- [db2](#ibm-db2-server) +- [denodo](#denodo-server) +- [dremio](#dremio-server) +- [duckdb](#duckdb-server) +- [glue](#amazon-glue) +- [cloudsql](#google-cloud-sql) +- [informix](#ibm-informix-and-hcl-informix) +- [kafka](#kafka-server) +- [kinesis](#amazon-kinesis) +- [local](#local-files) +- [mysql](#mysql-server) +- [oracle](#oracle) +- [postgresql](#postgresql) +- [presto](#presto-server) +- [pubsub](#google-pubsub) +- [redshift](#amazon-redshift-server) +- [s3](#amazon-s3-server-and-compatible-servers) +- [sftp](#sftp-server) +- [snowflake](#snowflake) +- [sqlserver](#microsoft-sql-server) +- [synapse](#synapse-server) +- [trino](#trino-server) +- [vertica](#vertica-server) #### API Server @@ -1252,8 +836,8 @@ If your server is not in the list, please use [custom](#custom-server) and sugge |----------------|------------|------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **location** | Location | Yes | URL to the API | -#### Amazon Athena Server +#### Amazon Athena Server [Amazon Athena](https://docs.aws.amazon.com/athena/latest/ug/what-is.html) is an interactive query service that makes it easy to analyze data directly in Amazon Simple Storage Service (Amazon S3) using standard SQL. With a few actions in the AWS Management Console, you can point Athena at your data stored in Amazon S3 and begin using standard SQL to run ad-hoc queries and get results in seconds. | Key | UX Label | Required | Description | @@ -1272,7 +856,6 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | delimiter | Delimiter | No | Only for format = json. How multiple json documents are delimited within one file | #### Google BigQuery - [BigQuery](https://cloud.google.com/bigquery) is a fully managed, AI-ready data analytics platform that helps you maximize value from your data and is designed to be multi-engine, multi-format, and multi-cloud. | Key | UX Label | Required | Description | @@ -1281,7 +864,6 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | dataset | Dataset | Yes | The GCP dataset name. | #### ClickHouse Server - [ClickHouse](https://clickhouse.com/) is an open-source column-oriented database management system that allows generating analytical data reports in real-time. 
| Key | UX Label | Required | Description | @@ -1291,7 +873,6 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | database | Database | Yes | The name of the database. | #### Google Cloud SQL - [Google Cloud SQL](https://cloud.google.com/sql) is a fully managed, cost-effective relational database service for PostgreSQL, MySQL, and SQL Server. | Key | UX Label | Required | Description | @@ -1335,7 +916,6 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | schema | Schema | No | The name of the schema. | #### DuckDB Server - [DuckDB](https://duckdb.org/) supports a feature-rich SQL dialect complemented with deep integrations into client APIs. | Key | UX Label | Required | Description | @@ -1352,18 +932,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | location | Location | No | The AWS S3 path. Must be in the form of a URL. | | format | Format | No | The format of the files | -#### Hive - -[Apache Hive](https://hive.apache.org/) is a distributed, fault-tolerant data warehouse system that enables analytics at massive scale. Built on top of Apache Hadoop, Hive allows users to read, write, and manage petabytes of data using SQL-like queries through HiveQL, with native support for cloud storage systems and enterprise-grade security features. - -| Key | UX Label | Required | Description | -|--------------|-----------------|------------|-------------------------------------------------| -| host | Host | Yes | The host to the Hive server. | -| port | Port | No | The port to the Hive server. Defaults to 10000. | -| database | Database | Yes | The name of the Hive database. | - #### IBM Informix and HCL Informix - [IBM Informix](https://www.ibm.com/products/informix) is a high performance, always-on, highly scalable and easily embeddable enterprise-class database optimized for the most demanding transactional and analytics workloads. As an object-relational engine, IBM Informix seamlessly integrates the best of relational and object-oriented capabilities enabling the flexible modeling of complex data structures and relationships. | Key | UX Label | Required | Description | @@ -1411,7 +980,6 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | serviceName | Service Name | Yes | The name of the service. | #### PostgreSQL - [PostgreSQL](https://www.postgresql.org/) is a powerful, open source object-relational database system with over 35 years of active development that has earned it a strong reputation for reliability, feature robustness, and performance. | Key | UX Label | Required | Description | @@ -1430,7 +998,6 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | schema | Schema | No | The name of the schema. | #### Google Pub/Sub - [Google Cloud](https://cloud.google.com/pubsub) service to Ingest events for streaming into BigQuery, data lakes or operational databases. | Key | UX Label | Required | Description | @@ -1438,7 +1005,6 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | project | Project | Yes | The GCP project name. | #### Amazon Redshift Server - [Amazon Redshift](https://aws.amazon.com/redshift/) is a power data driven decisions with the best price-performance cloud data warehouse. 
| Key | UX Label | Required | Description | @@ -1450,7 +1016,6 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | account | Account | No | The account used by the server. | #### Amazon S3 Server and Compatible Servers - [Amazon Simple Storage Service (Amazon S3)](https://aws.amazon.com/s3/) is an object storage service offering industry-leading scalability, data availability, security, and performance. Millions of customers of all sizes and industries store, manage, analyze, and protect any amount of data for virtually any use case, such as data lakes, cloud-native applications, and mobile apps. Other vendors have implemented a compatible implementation of S3. | Key | UX Label | Required | Description | @@ -1461,7 +1026,6 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | delimiter | Delimiter | No | Only for format = json. How multiple json documents are delimited within one file | #### SFTP Server - Secure File Transfer Protocol (SFTP) is a network protocol that enables secure and encrypted file transfers between a client and a server. | Key | UX Label | Required | Description | @@ -1482,8 +1046,7 @@ Secure File Transfer Protocol (SFTP) is a network protocol that enables secure a | schema | Schema | Yes | The name of the schema. | #### Microsoft SQL Server - -[Microsoft SQL Server](https://www.microsoft.com/en-us/sql-server/sql-server-downloads) is a proprietary relational database management system developed by Microsoft. +[Microsoft SQL Server](https://www.microsoft.com/en-us/sql-server/sql-server-downloads) is a proprietary relational database management system developed by Microsoft. | Key | UX Label | Required | Description | |----------|----------|----------|----------------------------------------------------| @@ -1544,8 +1107,8 @@ Secure File Transfer Protocol (SFTP) is a network protocol that enables secure a If you need another property, use [custom properties](#custom-properties). -## Custom Properties +## Custom Properties This section covers custom properties you may find in a data contract. ### Example @@ -1556,22 +1119,20 @@ customProperties: value: gcsc.ruleset.name - property: somePropertyName value: property.value - - property: dataprocClusterName # Used for specific applications + - property: dataprocClusterName # Used for specific applications like Elevate value: [cluster name] - description: Cluster name for specific applications ``` ### Definitions -| Key | UX label | Required | Description | -|------------------------------|-------------------|----------|-------------------------------------------------------------------------------------------------------------------| -| customProperties | Custom Properties | No | A list of key/value pairs for custom properties. Initially created to support the REF ruleset property. | -| customProperties.property | Property | No | The name of the key. Names should be in camel case–the same as if they were permanent properties in the contract. | -| customProperties.value | Value | No | The value of the key. | -| customProperties.description | Description | No | Description for humans. | +| Key | UX label | Required | Description | +|---------------------------|----------------------|----------|-------------------------------------------------------------------------------------------------------------------| +| customProperties | Custom Properties | No | A list of key/value pairs for custom properties. Initially created to support the REF ruleset property. 
| +| customProperties.property | Property | No | The name of the key. Names should be in camel case–the same as if they were permanent properties in the contract. | +| customProperties.value | Value | No | The value of the key. | -## Other Properties +## Other Properties This section covers other properties you may find in a data contract. ### Example @@ -1580,6 +1141,7 @@ This section covers other properties you may find in a data contract. contractCreatedTs: 2024-09-17T11:58:08Z ``` + ### Other properties definition | Key | UX label | Required | Description | @@ -1590,4 +1152,5 @@ contractCreatedTs: 2024-09-17T11:58:08Z [Check full example here.](examples/all/full-example.odcs.yaml) -All trademarks are the property of their respective owners. + +All trademarks are the property of their respective owners. \ No newline at end of file From db7411bf36d147f1e4326a2dcdb0b1c20533f7ef Mon Sep 17 00:00:00 2001 From: pkoper Date: Tue, 14 Oct 2025 10:18:29 +0100 Subject: [PATCH 09/11] chore(DEV SETUP): README dev corrections --- docs/README.md | 871 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 654 insertions(+), 217 deletions(-) diff --git a/docs/README.md b/docs/README.md index b93f4e4..b597a78 100644 --- a/docs/README.md +++ b/docs/README.md @@ -7,18 +7,19 @@ image: "https://raw.githubusercontent.com/bitol-io/artwork/main/horizontal/color # Open Data Contract Standard ## Executive Summary -This document describes the keys and values expected in a YAML data contract, per the **Open Data Contract Standard**. -It is divided in multiple sections: [fundamentals (fka demographics)](#fundamentals), [schema](#schema), -[data quality](#data-quality), [Support & communication channels](#support-and-communication-channels), [pricing](#pricing), [team](#team), -[roles](#roles), [service-level agreement](#service-level-agreement-sla), [Infrastructures & servers](#infrastructure-and-servers) and -[other/custom properties](#custom-properties). Each section starts with at least an example followed by definition of -each field/key. +This document describes the keys and values expected in a YAML data contract, per the **Open Data Contract Standard**. +It is divided in multiple sections: [fundamentals (fka demographics)](#fundamentals), [schema](#schema), +[data quality](#data-quality), [Support & communication channels](#support-and-communication-channels), [pricing](#pricing), [team](#team), +[roles](#roles), [service-level agreement](#service-level-agreement-sla), [Infrastructures & servers](#infrastructure-and-servers) and +[other/custom properties](#custom-properties). Each section starts with at least an example followed by definition of +each field/key. ## Table of content 1. [Fundamentals (fka demographics)](#fundamentals) 1. [Schema](#schema) +1. [References](#references) 1. [Data quality](#data-quality) 1. [Support & communication channels](#support-and-communication-channels) 1. [Pricing](#pricing) @@ -29,15 +30,14 @@ each field/key. 1. [Custom & other properties](#custom-properties) 1. [Examples](#full-example-1) - ## Notes * This contract is containing example values, we reviewed very carefully the consistency of those values, but we cannot guarantee that there are no errors. If you spot one, please raise an [issue](https://github.com/AIDAUserGroup/open-data-contract-standard/issues). * Some fields have `null` value: even if it is equivalent to not having the field in the contract, we wanted to have the field for illustration purpose. 
* This contract should be **platform agnostic**. If you think it is not the case, please raise an [issue](https://github.com/AIDAUserGroup/open-data-contract-standard/issues). - ## Fundamentals + This section contains general information about the contract. ### Example @@ -48,7 +48,7 @@ kind: DataContract id: 53581432-6c55-4ba2-a65f-72344a91553a name: seller_payments_v1 -version: 1.1.0 # Data Contract Version +version: 1.1.0 # Data Contract Version status: active domain: seller dataProduct: payments @@ -64,28 +64,27 @@ tags: ['finance'] ### Definitions -| Key | UX label | Required | Description | -|--------------------------------------|---------------------------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| apiVersion | Standard version | Yes | Version of the standard used to build data contract. Default value is `v3.0.2`. | -| kind | Kind | Yes | The kind of file this is. Valid value is `DataContract`. | -| id | ID | Yes | A unique identifier used to reduce the risk of dataset name collisions, such as a UUID. | -| name | Name | No | Name of the data contract. | -| version | Version | Yes | Current version of the data contract. | -| status | Status | Yes | Current status of the data contract. Examples are "proposed", "draft", "active", "deprecated", "retired". | -| tenant | Tenant | No | Indicates the property the data is primarily associated with. Value is case insensitive. | -| tags | Tags | No | A list of tags that may be assigned to the elements (object or property); the tags keyword may appear at any level. Tags may be used to better categorize an element. For example, `finance`, `sensitive`, `employee_record`. | -| domain | Domain | No | Name of the logical data domain. | -| dataProduct | Data Product | No | Name of the data product. | -| authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the data contract. | -| description | Description | No | Object containing the descriptions. | -| description.purpose | Purpose | No | Intended purpose for the provided data. | -| description.limitations | Limitations | No | Technical, compliance, and legal limitations for data use. | -| description.usage | Usage | No | Recommended usage of the data. | -| description.authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the dataset; examples would be a link to privacy statement, terms and conditions, license agreements, data catalog, or another tool. | -| description.customProperties | Custom Properties | No | Custom properties that are not part of the standard. | - +| Key | UX label | Required | Description | +|--------------------------------------|---------------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| apiVersion | Standard version | Yes | Version of the standard used to build data contract. Default value is `v3.0.2`. | +| kind | Kind | Yes | The kind of file this is. Valid value is `DataContract`. | +| id | ID | Yes | A unique identifier used to reduce the risk of dataset name collisions, such as a UUID. | +| name | Name | No | Name of the data contract. 
| +| version | Version | Yes | Current version of the data contract. | +| status | Status | Yes | Current status of the data contract. Examples are "proposed", "draft", "active", "deprecated", "retired". | +| tenant | Tenant | No | Indicates the property the data is primarily associated with. Value is case insensitive. | +| domain | Domain | No | Name of the logical data domain. | +| dataProduct | Data Product | No | Name of the data product. | +| authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the data contract. | +| description | Description | No | Object containing the descriptions. | +| description.purpose | Purpose | No | Intended purpose for the provided data. | +| description.limitations | Limitations | No | Technical, compliance, and legal limitations for data use. | +| description.usage | Usage | No | Recommended usage of the data. | +| description.authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the dataset; examples would be a link to privacy statement, terms and conditions, license agreements, data catalog, or another tool. | +| description.customProperties | Custom Properties | No | Custom properties that are not part of the standard. | ## Schema + This section describes the schema of the data contract. It is the support for data quality, which is detailed in the next section. Schema supports both a business representation of your data and a physical implementation. It allows to tie them together. In ODCS v3, the schema has evolved from the table and column representation, therefore the schema introduces a new terminology: @@ -109,10 +108,10 @@ schema: - name: tbl logicalType: object physicalType: table - physicalName: tbl_1 - description: Provides core payment metrics - authoritativeDefinitions: - - url: https://catalog.data.gov/dataset/air-quality + physicalName: tbl_1 + description: Provides core payment metrics + authoritativeDefinitions: + - url: https://catalog.data.gov/dataset/air-quality type: businessDefinition - url: https://youtu.be/jbY1BKFj9ec type: videoTutorial @@ -134,12 +133,12 @@ schema: - table_name_2 - table_name_3 transformLogic: sel t1.txn_dt as txn_ref_dt from table_name_1 as t1, table_name_2 as t2, table_name_3 as t3 where t1.txn_dt=date-3 - transformDescription: Defines the logic in business terms. + transformDescription: Defines the logic in business terms. examples: - 2022-10-03 - 2020-01-28 - name: rcvr_id - primaryKey: true + primaryKey: true primaryKeyPosition: 1 businessName: receiver id logicalType: string @@ -153,7 +152,7 @@ schema: classification: restricted encryptedName: enc_rcvr_id - name: rcvr_cntry_code - primaryKey: false + primaryKey: false primaryKeyPosition: -1 businessName: receiver country code logicalType: string @@ -183,7 +182,7 @@ schema: logicalType: object properties: - name: street_lines - logicalType: array + logicalType: array items: logicalType: string ``` @@ -201,7 +200,7 @@ schema: logicalType: object properties: - name: id - logicalType: string + logicalType: string physicalType: VARCHAR(40) - name: zip logicalType: string @@ -222,7 +221,6 @@ schema: |--------------------------|------------------------------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | name | Name | Yes | Name of the element. 
| | physicalName | Physical Name | No | Physical name. | -| physicalType | Physical Type | No | The physical element data type in the data source. For objects: `table`, `view`, `topic`, `file`. For properties: `VARCHAR(2)`, `DOUBLE`, `INT`, etc. | | description | Description | No | Description of the element. | | businessName | Business Name | No | The business name of the element. | | authoritativeDefinitions | Authoritative Definitions | No | List of links to sources that provide more details on the element; examples would be a link to privacy statement, terms and conditions, license agreements, data catalog, or another tool. | @@ -238,14 +236,15 @@ schema: #### Applicable to Properties -Some keys are more applicable when the described property is a column. +Some keys are more applicable when the described property is a column. | Key | UX label | Required | Description | |--------------------------|------------------------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | primaryKey | Primary Key | No | Boolean value specifying whether the field is primary or not. Default is false. | | primaryKeyPosition | Primary Key Position | No | If field is a primary key, the position of the primary key element. Starts from 1. Example of `account_id, name` being primary key columns, `account_id` has primaryKeyPosition 1 and `name` primaryKeyPosition 2. Default to -1. | -| logicalType | Logical Type | No | The logical field datatype. One of `string`, `date`, `number`, `integer`, `object`, `array` or `boolean`. | -| logicalTypeOptions | Logical Type Options | No | Additional optional metadata to describe the logical type. See [here](#logical-type-options) for more details about supported options for each `logicalType`. | +| logicalType | Logical Type | No | The logical field datatype. One of `string`, `date`, `timestamp`, `time`, `number`, `integer`, `object`, `array` or `boolean`. | +| logicalTypeOptions | Logical Type Options | No | Additional optional metadata to describe the logical type. See [Logical Type Options](#logical-type-options) for more details about supported options for each `logicalType`. | +| physicalType | Physical Type | No | The physical element data type in the data source. For example, VARCHAR(2), DOUBLE, INT. | | description | Description | No | Description of the element. | | required | Required | No | Indicates if the element may contain Null values; possible values are true and false. Default is false. | | unique | Unique | No | Indicates if the element contains unique values; possible values are true and false. Default is false. | @@ -270,13 +269,15 @@ Additional metadata options to more accurately define the data type. | array | maxItems | Maximum Items | No | Maximum number of items. | | array | minItems | Minimum Items | No | Minimum number of items. | | array | uniqueItems | Unique Items | No | If set to true, all items in the array are unique. | -| date | format | Format | No | Format of the date. Follows the format as prescribed by [JDK DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Default value is using ISO 8601: 'YYYY-MM-DDTHH:mm:ss.SSSZ'. For example, format 'yyyy-MM-dd'. 
| -| date | exclusiveMaximum | Exclusive Maximum | No | If set to true, all values are strictly less than the maximum value (values < maximum). Otherwise, less than or equal to the maximum value (values <= maximum). | -| date | exclusiveMinimum | Exclusive Minimum | No | If set to true, all values are strictly greater than the minimum value (values > minimum). Otherwise, greater than or equal to the minimum value (values >= minimum). | -| date | maximum | Maximum | No | All date values are less than or equal to this value (values <= maximum). | -| date | minimum | Minimum | No | All date values are greater than or equal to this value (values >= minimum). | -| integer/number | exclusiveMaximum | Exclusive Maximum | No | If set to true, all values are strictly less than the maximum value (values < maximum). Otherwise, less than or equal to the maximum value (values <= maximum). | -| integer/number | exclusiveMinimum | Exclusive Minimum | No | If set to true, all values are strictly greater than the minimum value (values > minimum). Otherwise, greater than or equal to the minimum value (values >= minimum). | +| date/timestamp/time | format | Format | No | Format of the date. Follows the format as prescribed by [JDK DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Default value is using ISO 8601: 'YYYY-MM-DDTHH:mm:ss.SSSZ'. For example, format 'yyyy-MM-dd'. | +| date/timestamp/time | exclusiveMaximum | Exclusive Maximum | No | All values must be strictly less than this value (values < exclusiveMaximum). | +| date/timestamp/time | exclusiveMinimum | Exclusive Minimum | No | All values must be strictly greater than this value (values > exclusiveMinimum). | +| date/timestamp/time | maximum | Maximum | No | All date values are less than or equal to this value (values <= maximum). | +| date/timestamp/time | minimum | Minimum | No | All date values are greater than or equal to this value (values >= minimum). | +| timestamp/time | timezone | Timezone | No | Whether the timestamp defines the timezone or not. If true, timezone information is included in the timestamp. | +| timestamp/time | defaultTimezone | Default Timezone | No | The default timezone of the timestamp. If timezone is not defined, the default timezone UTC is used. | +| integer/number | exclusiveMaximum | Exclusive Maximum | No | All values must be strictly less than this value (values < exclusiveMaximum). | +| integer/number | exclusiveMinimum | Exclusive Minimum | No | All values must be strictly greater than this value (values > exclusiveMinimum). | | integer/number | format | Format | No | Format of the value in terms of how many bits of space it can use and whether it is signed or unsigned (follows the Rust integer types). | | integer/number | maximum | Maximum | No | All values are less than or equal to this value (values <= maximum). | | integer/number | minimum | Minimum | No | All values are greater than or equal to this value (values >= minimum). | @@ -287,11 +288,11 @@ Additional metadata options to more accurately define the data type. | string | format | Format | No | Provides extra context about what format the string follows. For example, password, byte, binary, email, uuid, uri, hostname, ipv4, ipv6. | | string | maxLength | Maximum Length | No | Maximum length of the string. | | string | minLength | Minimum Length | No | Minimum length of the string. | -| string | pattern | Pattern | No | Regular expression pattern to define valid value. 
Follows regular expression syntax from ECMA-262 (https://262.ecma-international.org/5.1/#sec-15.10.1). |

#### Expressing Date / Datetime / Timezone information

Given the complexity of handling various date and time formats (e.g., date, datetime, time, timestamp, timestamp with and without timezone), the existing `logicalType` options currently support `date`, `timestamp`, and `time`. To specify additional temporal details, `logicalType` should be used in conjunction with `logicalTypeOptions.format` or `physicalType` to define the desired format. Using `physicalType` allows for the definition of your data-source-specific data type.

``` yaml
version: 1.0.0
kind: DataContract
id: 53581432-6c55-4ba2-a65f-72344a91553a
status: active
name: date_example
apiVersion: v3.0.2
schema:
  # Date Only
  - name: event_date
    logicalType: date
    logicalTypeOptions:
      format: "yyyy-MM-dd"
    examples:
      - "2024-07-10"

  # Date & Time (UTC)
  - name: created_at
    logicalType: timestamp
    logicalTypeOptions:
      format: "yyyy-MM-ddTHH:mm:ssZ"
    examples:
      - "2024-03-10T14:22:35Z"

  # Date & Time (Australia/Sydney)
  - name: created_at_sydney
    logicalType: timestamp
    logicalTypeOptions:
      format: "yyyy-MM-ddTHH:mm:ssZ"
      timezone: true
      defaultTimezone: "Australia/Sydney"
    examples:
      - "2024-03-10T14:22:35+10:00"

  # Time Only
  - name: event_start_time
    logicalType: time
    logicalTypeOptions:
      format: "HH:mm:ss"
    examples:
      - "08:30:00"

  # Physical Type with Date & Time (UTC)
  - name: event_date
    logicalType: timestamp
    physicalType: DATETIME
    logicalTypeOptions:
      format: "yyyy-MM-ddTHH:mm:ssZ"
    examples:
      - "2024-03-10T14:22:35Z"
```

### Authoritative definitions

Reference to an external definition on element logic or values.

| Key                                  | UX label          | Required | Description                                                                                                                                                   |
|--------------------------------------|-------------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|
| authoritativeDefinitions             | Link              | No       | A list of type/link pairs for authoritative definitions.                                                                                                        |
| authoritativeDefinitions.type        | Definition type   | Yes      | Type of definition for authority. Valid values are: `businessDefinition`, `transformationImplementation`, `videoTutorial`, `tutorial`, and `implementation`.   |
| authoritativeDefinitions.url         | URL to definition | Yes      | URL to the authority.                                                                                                                                           |
| authoritativeDefinitions.description | Description       | No       | Description for humans.                                                                                                                                         |

## References

This section describes how to reference elements within a data contract schema. References enable you to create relationships between different parts of your data contract.

> [!IMPORTANT]
> References are currently only supported within schema properties for foreign key relationships.

### Reference Structure

A fully formatted reference follows this structure:

```yaml
<contract file><anchor><defined path>
```

Where:

* **`<contract file>`**: Path to the contract file (optional for same-contract references)
* **`<anchor>`**: '#' symbol to mark entry into a contract (optional for same-contract)
* **`<defined path>`**: The defined path within the contract

### External Contract References

To identify a contract, use one of these formats:

```yaml
# Same folder as current contract
data-contract-v1.yaml

# Full path
file:///path/to/data-contract-v1.yaml

# URL
https://example.com/data-contract-v1.yaml

# Relative path
../../path/to/data-contract-v1.yaml
```

### Reference Examples

#### External Contract References

```yaml
# Reference to an element in an external contract
'external-contract.yaml#schema.my-table'

# Reference to a specific column in an external contract
'external-contract.yaml#schema.my-table.my-column'
```

#### Same Contract References

When referencing elements within the same contract, the file component can be omitted.

```yaml
# Full reference within same contract
'#schema.my-table.my-column'

# File and anchor can be omitted for same contract
'schema.my-table.my-column'
```

### Shorthand Notation

For improved readability, ODCS supports the following shorthand notation when referencing properties within the same schema. The shorthand notation allows for a more concise way to define relationships. It can be used in the `to` and `from` fields of a relationship.
The shorthand notation is `<schema>.<property>`.

These shorthand options are only available for properties within the same data contract.

### Relationships between properties (Foreign Keys)

Properties can define relationships to other properties, enabling you to specify foreign key constraints and other data relationships. Relationships use the reference mechanism from RFC 9.

#### Quick Overview

Relationships can be defined in two ways:

1. **At the property level** - Define relationships directly on a property (the `from` field is implicit and must NOT be specified)
2. **At the schema level** - Define relationships between any properties (both `from` and `to` are required)

#### Important Rules

* **Property-level relationships**: The `from` field is implicit (derived from the property context) and must NOT be specified
* **Schema-level relationships**: Both `from` and `to` fields are required
* **Type consistency**: Both `from` and `to` must be the same type - either both strings (single column) or both arrays (composite keys). Mixing types is not allowed
* **Array length validation**: When using arrays for composite keys, both arrays must have the same number of elements.
This is validated at runtime by implementations + +#### Field Definitions + +| Key | UX Label | Required | Description | +|-----|----------|----------|-------------| +| relationships | Relationships | No | Array of relationship definitions | +| relationships.type | Type | No | Type of relationship (defaults to `foreignKey`) | +| relationships.to | To | Yes | Target property reference using `schema.property` notation | +| relationships.from | From | Context-dependent | Source property reference - Required at schema level, forbidden at property level | +| relationships.customProperties | Custom Properties | No | Additional metadata about the relationship | + +#### Reference Notation + +* **Simple reference**: `users.id` - References the `id` property in the `users` schema +* **Nested reference**: `accounts.address.street` - References nested properties +* **Composite keys**: Use arrays to define composite keys (arrays must have matching lengths) + +### Examples + +#### Example 1: Simple Foreign Key (Property Level) + +When defining a relationship at the property level, the `from` field is implicit and must NOT be specified: + +```yaml +schema: + - name: users + properties: + - name: user_id + relationships: + - to: accounts.owner_id # 'from' is implicit (users.user_id) + # Note: DO NOT include 'from' field at property level +``` + +#### Example 2: Multiple Relationships + +A property can have multiple relationships: + +```yaml +schema: + - name: orders + properties: + - name: customer_id + relationships: + - to: customers.id + - to: loyalty_members.customer_id +``` + +#### Example 3: Schema-Level Relationships + +Define relationships at the schema level when you need explicit `from` and `to`. Both fields are REQUIRED at this level: + +```yaml +schema: + - name: users + relationships: + - from: users.account_id # Required at schema level + to: accounts.id # Required at schema level + type: foreignKey +``` + +#### Example 4: Nested Properties + +Reference nested properties using dot notation: + +```yaml +schema: + - name: users + properties: + - name: id + relationships: + - to: accounts.address.postal_code +``` + +#### Example 5: Composite Keys + +For composite foreign keys, use arrays. 
**Important**: Both `from` and `to` must be arrays with the same number of elements: + +```yaml +schema: + - name: order_items + relationships: + - type: foreignKey + from: # Array (must match 'to' length) + - order_items.order_id + - order_items.product_id + to: # Array (must match 'from' length) + - product_inventory.order_id + - product_inventory.product_id + +``` + +#### Example 6: Invalid Configurations + +Here are examples of invalid configurations that will be rejected: + +```yaml +# INVALID: 'from' specified at property level +schema: + - name: users + properties: + - name: user_id + relationships: + - from: users.user_id # ERROR: 'from' not allowed at property level + to: accounts.id + +# INVALID: Mismatched array types +schema: + - name: orders + relationships: + - from: orders.id # ERROR: 'from' is string but 'to' is array + to: + - items.order_id + - items.line_num + +# INVALID: Different array lengths (caught at runtime) +schema: + - name: orders + relationships: + - from: # 'from' has 2 elements + - orders.id + - orders.customer_id + to: # 'to' has 3 elements (runtime validation will fail) + - items.order_id + - items.customer_id + - items.line_num + +# INVALID: Missing 'from' at schema level +schema: + - name: orders + relationships: + - to: customers.id # ERROR: 'from' is required at schema level +``` + +#### Complete Example + +Here's a comprehensive example showing various relationship patterns: + +```yaml +schema: + - name: users + properties: + - name: id + relationships: + # Simple foreign key (from is implicit) + - to: accounts.user_id + + # With explicit from field + - from: users.id + to: profiles.user_id + + # With custom properties + - to: departments.manager_id + customProperties: + - property: cardinality + value: "one-to-many" + - property: label + value: "manages" + + # To external contract (from is implicit) + - to: https://example.com/data-contract-v1.yaml#profiles.user_id + customProperties: + - property: description + value: "Externally referenced contract" + + - name: account_number + + # Schema-level composite key relationship + relationships: + - type: foreignKey + from: + - users.id + - users.account_number + to: + - accounts.user_id + - accounts.account_number + + - name: accounts + properties: + - name: user_id + - name: account_number + - name: address + properties: + - name: street + - name: postal_code +``` ## Data quality + This section describes data quality rules & parameters. They are tightly linked to the schema described in the previous section. Data quality rules support different levels/stages of data quality attributes: - - __Text__: A human-readable text that describes the quality of the data. - - __Library__ rules: A maintained library of commonly-used predefined quality attributes such as `rowCount`, `unique`, `freshness`, and more. - - __SQL__: An individual SQL query that returns a value that can be compared. Can be extended to `Python` or other. - - __Custom__: Quality attributes that are vendor-specific, such as Soda, Great Expectations, dbt tests, or Montecarlo monitors. +* **Text**: A human-readable text that describes the quality of the data. +* **Library** : A maintained library of commonly used quality metrics such as `rowCount`, `nullValues`, `invalidValues`, and more. +* **SQL**: An individual SQL query that returns a value that can be compared. +* **Custom**: Quality attributes that are vendor-specific, such as Soda, Great Expectations, dbt tests, dbx, or Montecarlo monitors. 
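Before the detailed subsections below, here is a sketch of how these levels can be combined on a single element; the thresholds and description texts are illustrative:

```yaml
quality:
  - type: text
    description: Email addresses should be verified before they are loaded.
  - metric: nullValues            # library metric; type defaults to library
    mustBe: 0
  - type: sql
    query: |
      SELECT COUNT(*) FROM {object} WHERE {property} IS NOT NULL
    mustBeLessThan: 3600
```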
### Text

A human-readable text that describes the quality of the data. Later in the development process, these might be translated into an executable check (such as `sql`), a library metric, or checked through an AI engine.

```yaml
quality:
  - type: text
    description: The email address was verified by the system.
```

### Library

ODCS provides a set of predefined metrics commonly used in data quality checks, designed to be compatible with all major data quality engines. This simplifies the work for data engineers by eliminating the need to manually write SQL queries.

The type for library metrics is `library`; it can be omitted if a `metric` property is defined.

These metrics return a numeric value and are paired with an operator that checks whether the value is in the expected range.

Some metrics require additional parameters, which can be defined in the `arguments` property.

Example:

```yaml
properties:
  - name: order_id
    quality:
      - type: library
        metric: nullValues
        mustBe: 0
        unit: rows
        description: "There must be no null values in the column."
```

is equivalent to:

```yaml
properties:
  - name: order_id
    quality:
      - metric: nullValues
        mustBe: 0
        description: "There must be no null values in the column."
```

#### Metrics

| Metric | Level | Description | Arguments | Arguments Example |
|--------|--------|----------------------------------------------------------------|------------------------------------------------------------------|----------------------------------------------------------------------|
| `nullValues` | Property | Counts null values in a column/field | None | |
| `missingValues` | Property | Counts values considered as missing (empty strings, N/A, etc.) | `missingValues`: Array of values considered missing | `missingValues: [null, '', 'N/A']` |
| `invalidValues` | Property | Counts values that don't match valid criteria | `validValues`: Array of valid values<br>`pattern`: Regex pattern | `validValues: ['pounds', 'kg']`<br>`pattern: '^[A-Z]{2}[0-9]{2}...'` |
| `duplicateValues` | Property | Counts duplicate values in a column | None | |
| `duplicateValues` | Schema | Counts duplicate values across multiple columns | `properties`: Array of property names | `properties: ['tenant_id', 'order_id']` |
| `rowCount` | Schema | Counts total number of rows in a table/object store | None | |

##### Null Values

Check that the count of null values is within range.

```yaml
properties:
  - name: customer_id
    quality:
      - metric: nullValues
        mustBe: 0
        description: "There must be no null values in the column."
```

Example with percent:

```yaml
properties:
  - name: order_status
    quality:
      - metric: nullValues
        mustBeLessThan: 1
        unit: percent
        description: "There must be less than 1% null values in the column."
```

##### Missing Values

Check that the count of missing values is within range.

The argument `missingValues` takes a list of values that are considered to be missing.

```yaml
properties:
  - name: email_address
    quality:
      - metric: missingValues
        arguments:
          missingValues: [null, '', 'N/A', 'n/a']
        mustBeLessThan: 100
        unit: rows # rows (default) or percent
```

##### Invalid Values

Check that the value is within a defined set or matches a pattern.

```yaml
properties:
  - name: line_item_unit
    quality:
      - metric: invalidValues
        arguments:
          validValues: ['pounds', 'kg']
        mustBeLessThan: 5
        unit: rows
```

Using a pattern:

```yaml
properties:
  - name: iban
    quality:
      - metric: invalidValues
        mustBe: 0
        description: "The value must be an IBAN."
        arguments:
          pattern: '^[A-Z]{2}[0-9]{2}[A-Z0-9]{4}[0-9]{7}([A-Z0-9]?){0,16}$'
```

##### Duplicate Values

No more than 10 duplicate names.

```yaml
properties:
  - name: email_address
    quality:
      - metric: duplicateValues
        mustBeLessThan: 10
        unit: rows
        description: "There must be less than 10 duplicate values in the column."
```

Duplicates should be less than 1%.

```yaml
properties:
  - name: phone_number
    quality:
      - metric: duplicateValues
        mustBeLessThan: 1
        unit: percent
```

##### Row Count (Schema-Level)

Calculates the number of rows (usually in a table) and compares the result using the given operator.

```yaml
schema:
  - name: orders
    quality:
      - metric: rowCount
        mustBeBetween: [100, 120]
```

##### Duplicates (Schema-Level)

Checks for duplicate rows based on a combination of properties.
This is useful for validating compound keys where uniqueness is defined not by a single column but by multiple columns together.

```yaml
schema:
  - name: orders
    quality:
      - description: The combination of tenant_id and order_id must be unique
        metric: duplicateValues
        mustBe: 0
        arguments:
          properties: # Properties refer to the property in the schema.
+ - tenant_id + - order_id ``` ### SQL -A single SQL query that returns either a numeric or boolean value for comparison. The query must be written in the SQL dialect specific to the provided server. `${object}` and `${property}` are automatically replaced by the current object (in the case of SQL on a relational database, the table or view name) and the current property name (in the case of SQL on a relational database, the column). + +A single SQL query that returns either a numeric or boolean value for comparison. The query must be written in the SQL dialect specific to the provided server. `{object}` and `{property}` are automatically replaced by the current object (in the case of SQL on a relational database, the table or view name) and the current property name (in the case of SQL on a relational database, the column). ```yaml quality: - - type: sql + - type: sql query: | - SELECT COUNT(*) FROM ${object} WHERE ${property} IS NOT NULL - mustBeLessThan: 3600 + SELECT COUNT(*) FROM {object} WHERE {property} IS NOT NULL + mustBeLessThan: 3600 ``` ### Custom + Custom rules allow for vendor-specific checks, including tools like Soda, Great Expectations, dbt-tests, Montecarlo, and others. Any format for properties is acceptable, whether it's written in YAML, JSON, XML, or even uuencoded binary. They are an intermediate step before the vendor accepts ODCS natively. #### Soda Example @@ -456,22 +866,23 @@ quality: ``` ### Scheduling -The data contract can contain scheduling information for executing the rules. You can use `schedule` and `scheduler` for those operation. In previous versions of ODCS, the only allowed scheduler was cron and its syntax was `scheduleCronExpression`. + +The data contract can contain scheduling information for executing the rules. You can use `schedule` and `scheduler` for those operation. In previous versions of ODCS, the only allowed scheduler was cron and its syntax was `scheduleCronExpression`. ```yaml quality: - - type: sql + - type: sql query: | - SELECT COUNT(*) FROM ${object} WHERE ${property} IS NOT NULL - mustBeLessThan: 3600 + SELECT COUNT(*) FROM {object} WHERE {property} IS NOT NULL + mustBeLessThan: 3600 scheduler: cron schedule: 0 20 * * * ``` - ### Definitions Acronyms: + * DQ: data quality. | Key | UX label | Required | Description | @@ -480,10 +891,11 @@ Acronyms: | quality.name | Name | No | A short name for the rule. | | quality.description | Description | No | Describe the quality check to be completed. | | quality.type | Type | No | Type of DQ rule. Valid values are `library` (default), `text`, `sql`, and `custom`. | -| quality.rule | Rule name | No | Required for `library` DQ rules: the name of the rule to be executed. | +| quality.metric | Metric name | No | Required for `library`: the name of the metric to be calculated and compared. | +| quality.rule | Rule name | No | Deprecated, use `metric` instead. | +| quality.arguments | Arguments | No | Additional arguments for the metric, if needed. | | quality.\ | See below | No | Multiple values are allowed for the **property**, the value is the one to compare to. | -| quality.unit | Unit | No | Unit the rule is using, popular values are `rows` or `percent`, but any value is allowed. | -| quality.validValues | Valid values | No | Static list of valid values. | +| quality.unit | Unit | No | Unit the rule is using, popular values are `rows` or `percent`. | | quality.query | SQL Query | No | Required for `sql` DQ rules: the SQL query to be executed. 
Note that it should match the target SQL engine/database, no transalation service are provided here. | | quality.engine | Third-party DQ Engine | No | Required for `custom` DQ rule: name of the third-party engine being used. Any value is authorized here but common values are `soda`, `greatExpectations`, `montecarlo`, etc. | | quality.implementation | Third-party Implementation | No | A text (non-parsed) block of code required for the third-party DQ engine to run. | @@ -498,58 +910,56 @@ Acronyms: | quality.schedule | Scheduler Configuration | No | Configuration information for the scheduling tool, for `cron` a possible value is `0 20 * * *`. | #### Valid Values for Dimension + Those data quality dimensions are used for classification and reporting in data quality. Valid values are: - * `accuracy` (synonym `ac`), - * `completeness` (synonym `cp`), - * `conformity` (synonym `cf`), - * `consistency` (synonym `cs`), - * `coverage` (synonym `cv`), - * `timeliness` (synonym `tm`), - * `uniqueness` (synonym `uq`). +* `accuracy` (synonym `ac`), +* `completeness` (synonym `cp`), +* `conformity` (synonym `cf`), +* `consistency` (synonym `cs`), +* `coverage` (synonym `cv`), +* `timeliness` (synonym `tm`), +* `uniqueness` (synonym `uq`). #### Valid Properties for Operator -The operator specifies the condition to validate the rule. + +The operator specifies the condition to validate a metric or result of a SQL query. | Operator | Expected Value | Math Symbol | Example | |--------------------------|---------------------|-------------|------------------------------| -| `mustBe` | number | `=` | `mustBe: 5` | +| `mustBe` | number | `=` | `mustBe: 5` | | `mustNotBe` | number | `<>`, `β‰ ` | `mustNotBe: 3.14` | | `mustBeGreaterThan` | number | `>` | `mustBeGreaterThan: 59` | | `mustBeGreaterOrEqualTo` | number | `>=`, `β‰₯` | `mustBeGreaterOrEqualTo: 60` | | `mustBeLessThan` | number | `<` | `mustBeLessThan: 1000` | | `mustBeLessOrEqualTo` | number | `<=`, `≀` | `mustBeLessOrEqualTo: 999` | -| `mustBeBetween` | list of two numbers | `βŠ‚` | `mustBeBetween: [0, 100]` | -| `mustNotBeBetween` | list of two numbers | `βŠ„` | `mustNotBeBetween: [0, 100]` | +| `mustBeBetween` | list of two numbers | `∈` | `mustBeBetween: [0, 100]` | +| `mustNotBeBetween` | list of two numbers | `βˆ‰` | `mustNotBeBetween: [0, 100]` | `mustBeBetween` is the equivalent to `mustBeGreaterThan` and `mustBeLessThan`. ```yaml quality: - - type: sql + - type: sql query: | - SELECT COUNT(*) FROM ${table} WHERE ${column} IS NOT NULL - mustBeBetween: [0, 100] + SELECT COUNT(*) FROM {table} WHERE {column} IS NOT NULL + mustBeBetween: [0, 100] ``` is equivalent to: ```yaml quality: - - type: sql + - type: sql query: | - SELECT COUNT(*) FROM ${table} WHERE ${column} IS NOT NULL + SELECT COUNT(*) FROM {table} WHERE {column} IS NOT NULL mustBeGreaterThan: 0 - mustBeLessThan: 100 + mustBeLessThan: 100 ``` - -#### Library Rules -Bitol has the ambition of creating a library of common data quality rules. Join the working group around [RFC #0012](https://github.com/bitol-io/tsc/blob/main/rfcs/0012-implicit-dq-rules.md). - - ## Support and Communication Channels -Support and communication channels help consumers find help regarding their use of the data contract. + +Support and communication channels help consumers find help regarding their use of the data contract. 
### Examples

@@ -557,8 +967,7 @@ Support and communication channels help consumers find help regarding their use

```yaml
support:
-  - channel: channel-name-or-identifier # Simple Slack communication channel
-    url: https://aidaug.slack.com/archives/C05UZRSBKLY
+  - channel: "#my-channel" # Simple Slack communication channel
  - channel: channel-name-or-identifier # Simple distribution list
    url: mailto:datacontract-ann@bitol.io
```

@@ -592,19 +1001,20 @@ support:

### Definitions

-| Key | UX label | Required | Description |
-|-----------------------|----------------|----------|-------------------------------------------------------------------------------------------------------------------------------------|
-| support | Support | No | Top level for support channels. |
-| support.channel | Channel | Yes | Channel name or identifier. |
-| support.url | Channel URL | Yes | Access URL using normal [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax) (https, mailto, etc.). |
-| support.description | Description | No | Description of the channel, free text. |
-| support.tool | Tool | No | Name of the tool, value can be `email`, `slack`, `teams`, `discord`, `ticket`, or `other`. |
-| support.scope | Scope | No | Scope can be: `interactive`, `announcements`, `issues`. |
-| support.invitationUrl | Invitation URL | No | Some tools uses invitation URL for requesting or subscribing. Follows the [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax). |
-
+| Key | UX label | Required | Description |
+|--------------------------|-------------------|----------|-----------------------------------------------------------------------------------------------------------------------------------|
+| support | Support | No | Top level for support channels. |
+| support.channel | Channel | Yes | Channel name or identifier. |
+| support.url | Channel URL | No | Access URL using normal [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax) (https, mailto, etc.). |
+| support.description | Description | No | Description of the channel, free text. |
+| support.tool | Tool | No | Name of the tool, value can be `email`, `slack`, `teams`, `discord`, `ticket`, `googlechat`, or `other`. |
+| support.scope | Scope | No | Scope can be: `interactive`, `announcements`, `issues`, `notifications`. |
+| support.invitationUrl | Invitation URL | No | Some tools use an invitation URL for requesting or subscribing. Follows the [URL scheme](https://en.wikipedia.org/wiki/URL#Syntax). |
+| support.customProperties | Custom Properties | No | Any custom properties. |

## Pricing

-This section covers pricing when you bill your customer for using this data product.
+
+This section covers pricing when you bill your customer for using this data product.

### Example

@@ -624,11 +1034,12 @@ price:

| price.priceCurrency | Price Currency | No | Currency of the subscription price in `price.priceAmount`. |
| price.priceUnit | Price Unit | No | The unit of measure for calculating cost. Examples: megabyte, gigabyte. |

-
## Team
+
This section lists team members and the history of their relation with this data contract. In v2.x, this section was called stakeholders.

### Example
+
```YAML
team:
  - username: ceastwood

@@ -647,7 +1058,8 @@ team:
```

### Definitions

-The UX label is the label used in the UI and other user experiences.
+
+The UX label is the label used in the UI and other user experiences.
| Key | UX label | Required | Description | |-------------------------|----------------------|----------|--------------------------------------------------------------------------------------------| @@ -661,6 +1073,7 @@ The UX label is the label used in the UI and other user experiences. | team.replacedByUsername | Replaced By Username | No | The username of the user who replaced the previous user. | ## Roles + This section lists the roles that a consumer may need to access the dataset depending on the type of access they require. ### Example @@ -697,9 +1110,9 @@ roles: | roles.secondLevelApprovers | 2nd Level Approvers | No | The name(s) of the second-level approver(s) of the role. | | roles.customProperties | Custom Properties | No | Any custom properties. | - ## Service-Level Agreement (SLA) -This section describes the service-level agreements (SLA). + +This section describes the service-level agreements (SLA). * Use the `Object.Element` to indicate the number to do the checks on, as in `SELECT txn_ref_dt FROM tab1`. * Separate multiple object.element by a comma, as in `table1.col1`, `table2.col1`, `table1.col2`. @@ -708,14 +1121,14 @@ This section describes the service-level agreements (SLA). ### Example ```YAML -slaDefaultElement: tab1.txn_ref_dt # Optional, default value is partitionColumn. slaProperties: - property: latency # Property, see list of values in DP QoS value: 4 unit: d # d, day, days for days; y, yr, years for years - element: tab1.txn_ref_dt # This would not be needed as it is the same table.column as the default one + element: tab1.txn_ref_dt - property: generalAvailability value: 2022-05-12T09:30:10-08:00 + description: GA at 12.5.22 - property: endOfSupport value: 2032-05-12T09:30:10-08:00 - property: endOfLife @@ -741,16 +1154,17 @@ slaProperties: ### Definitions -| Key | UX label | Required | Description | -|------------------------|------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------| -| slaDefaultElement | Default SLA element(s) | No | Element (using the element path notation) to do the checks on. | -| slaProperties | SLA | No | A list of key/value pairs for SLA specific properties. There is no limit on the type of properties. | -| slaProperties.property | Property | Yes | Specific property in SLA, check the Data QoS periodic table. May requires units. | -| slaProperties.value | Value | Yes | Agreement value. The label will change based on the property itself. | -| slaProperties.valueExt | Extended value | No - unless needed by property | Extended agreement value. The label will change based on the property itself. | -| slaProperties.unit | Unit | No - unless needed by property | **d**, day, days for days; **y**, yr, years for years, etc. Units use the ISO standard. | -| slaProperties.element | Element(s) | No | Element(s) to check on. Multiple elements should be extremely rare and, if so, separated by commas. | -| slaProperties.driver | Driver | No | Describes the importance of the SLA from the list of: `regulatory`, `analytics`, or `operational`. | +| Key | UX label | Required | Description | +|------------------------------------|------------------------|--------------------------------|-------------------------------------------------------------------------------------------------------------------| +| ~~slaDefaultElement~~ (Deprecated) | Default SLA element(s) | No | DEPRECATED SINCE 3.1. WILL BE REMOVED IN ODCS 4.0. 
Element (using the element path notation) to do the checks on. |
+| slaProperties | SLA | No | A list of key/value pairs for SLA specific properties. There is no limit on the type of properties. |
+| slaProperties.property | Property | Yes | Specific property in SLA, check the Data QoS periodic table. May require units. |
+| slaProperties.value | Value | Yes | Agreement value. The label will change based on the property itself. |
+| slaProperties.valueExt | Extended value | No - unless needed by property | Extended agreement value. The label will change based on the property itself. |
+| slaProperties.unit | Unit | No - unless needed by property | **d**, day, days for days; **y**, yr, years for years, etc. Units use the ISO standard. |
+| slaProperties.element | Element(s) | No | Element(s) to check on. Multiple elements should be extremely rare and, if so, separated by commas. |
+| slaProperties.driver | Driver | No | Describes the importance of the SLA from the list of: `regulatory`, `analytics`, or `operational`. |
+| slaProperties.description | Description | No | Description of the SLA for humans. |

## Infrastructure and Servers

The `servers` element describes where the data protected by this data contract is physically located.

An entry in `servers` describes a single dataset on a specific environment and a specific technology. The `servers` element can contain multiple servers, each with its own configuration.

The typical ways of using the top level `servers` element are as follows:

-- **Single Server:** The data contract protects a specific dataset at a specific location. *Example:* a CSV file on an SFTP server.
-- **Multiple Environments:** The data contract makes sure that the data is protected in all environments. *Example:* a data product with data in a dev(elopment), UAT, and prod(uction) environment on Databricks.
-- **Different Technologies:** The data contract makes sure that regardless of the offered technology, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content.
-- **Different Technologies and Multiple Environments:** The data contract makes sure that regardless of the offered technology and environment, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content in dev(elopment), UAT, and prod(uction).
+
+* **Single Server:** The data contract protects a specific dataset at a specific location. *Example:* a CSV file on an SFTP server.
+* **Multiple Environments:** The data contract makes sure that the data is protected in all environments (see the sketch after this list). *Example:* a data product with data in a dev(elopment), UAT, and prod(uction) environment on Databricks.
+* **Different Technologies:** The data contract makes sure that regardless of the offered technology, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content.
+* **Different Technologies and Multiple Environments:** The data contract makes sure that regardless of the offered technology and environment, it still holds. *Example:* a data product offers its data in a Kafka topic and in a BigQuery table that should have the same structure and content in dev(elopment), UAT, and prod(uction).
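+
+As a minimal sketch of the "Multiple Environments" case above (the server identifiers are illustrative, and type-specific connection properties are omitted for brevity), the same dataset can be declared once per environment:
+
+```yaml
+servers:
+  - server: my-data-product-dev    # illustrative identifier
+    type: databricks
+    environment: dev
+  - server: my-data-product-uat
+    type: databricks
+    environment: uat
+  - server: my-data-product-prod
+    type: databricks
+    environment: prod
+```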
### General Server Structure

@@ -783,14 +1198,14 @@ servers:

#### Common Server Properties

-| Key | UX label | Required | Description |
-|------------------|-------------------|----------|--------------------------------------------------------------------------------------------------------------|
-| server | Server | Yes | Identifier of the server. |
-| type | Type | Yes | Type of the server. Can be one of: api, athena, azure, bigquery, clickhouse, databricks, denodo, dremio, duckdb, glue, cloudsql, db2, informix, kafka, kinesis, local, mysql, oracle, postgresql, postgres, presto, pubsub, redshift, s3, sftp, snowflake, sqlserver, synapse, trino, vertica, custom. |
-| description | Description | No | Description of the server. |
-| environment | Environment | No | Environment of the server. Examples includes: prod, preprod, dev, uat. |
-| roles | Roles | No | List of roles that have access to the server. Check [roles](#roles) section for more details. |
-| customProperties | Custom Properties | No | Custom properties that are not part of the standard. |
+| Key | UX label | Required | Description |
+|------------------|-------------------|----------|--------------------------------------------------------------------------------------------------------------|
+| server | Server | Yes | Identifier of the server. |
+| type | Type | Yes | Type of the server. Can be one of: api, athena, azure, bigquery, clickhouse, cloudsql, custom, databricks, db2, denodo, dremio, duckdb, glue, hive, informix, kafka, kinesis, local, mysql, oracle, postgres, postgresql, presto, pubsub, redshift, s3, sftp, snowflake, sqlserver, synapse, trino, vertica. |
+| description | Description | No | Description of the server. |
+| environment | Environment | No | Environment of the server. Examples include: prod, preprod, dev, uat. |
+| roles | Roles | No | List of roles that have access to the server. Check [roles](#roles) section for more details. |
+| customProperties | Custom Properties | No | Custom properties that are not part of the standard. |

### Specific Server Properties

@@ -800,35 +1215,36 @@ Each server type can be customized with different properties such as `host`, `po

If your server is not in the list, please use [custom](#custom-server) and suggest it as an improvement.
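+
+As a hedged sketch of how the common properties combine with type-specific ones (the host, database, and schema values here are invented for illustration), a single PostgreSQL entry might look like:
+
+```yaml
+servers:
+  - server: my-postgres           # common: identifier of the server
+    type: postgres                # common: one of the types listed below
+    description: Analytics replica
+    environment: prod
+    host: db.example.com          # type-specific (illustrative value)
+    port: 5432                    # type-specific
+    database: analytics           # type-specific
+    schema: public                # type-specific
+```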
Possible values for `type` are: -- [api](#api-server) -- [athena](#amazon-athena-server) -- [azure](#azure-server) -- [bigquery](#google-bigquery) -- [clickhouse](#clickhouse-server) -- [databricks](#databricks-server) -- [db2](#ibm-db2-server) -- [denodo](#denodo-server) -- [dremio](#dremio-server) -- [duckdb](#duckdb-server) -- [glue](#amazon-glue) -- [cloudsql](#google-cloud-sql) -- [informix](#ibm-informix-and-hcl-informix) -- [kafka](#kafka-server) -- [kinesis](#amazon-kinesis) -- [local](#local-files) -- [mysql](#mysql-server) -- [oracle](#oracle) -- [postgresql](#postgresql) -- [presto](#presto-server) -- [pubsub](#google-pubsub) -- [redshift](#amazon-redshift-server) -- [s3](#amazon-s3-server-and-compatible-servers) -- [sftp](#sftp-server) -- [snowflake](#snowflake) -- [sqlserver](#microsoft-sql-server) -- [synapse](#synapse-server) -- [trino](#trino-server) -- [vertica](#vertica-server) +* [api](#api-server) +* [athena](#amazon-athena-server) +* [azure](#azure-server) +* [bigquery](#google-bigquery) +* [clickhouse](#clickhouse-server) +* [cloudsql](#google-cloud-sql) +* [databricks](#databricks-server) +* [db2](#ibm-db2-server) +* [denodo](#denodo-server) +* [dremio](#dremio-server) +* [duckdb](#duckdb-server) +* [glue](#amazon-glue) +* [hive](#hive) +* [informix](#ibm-informix-and-hcl-informix) +* [kafka](#kafka-server) +* [kinesis](#amazon-kinesis) +* [local](#local-files) +* [mysql](#mysql-server) +* [oracle](#oracle) +* [postgresql](#postgresql) +* [presto](#presto-server) +* [pubsub](#google-pubsub) +* [redshift](#amazon-redshift-server) +* [s3](#amazon-s3-server-and-compatible-servers) +* [sftp](#sftp-server) +* [snowflake](#snowflake) +* [sqlserver](#microsoft-sql-server) +* [synapse](#synapse-server) +* [trino](#trino-server) +* [vertica](#vertica-server) #### API Server @@ -836,8 +1252,8 @@ If your server is not in the list, please use [custom](#custom-server) and sugge |----------------|------------|------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **location** | Location | Yes | URL to the API | - #### Amazon Athena Server + [Amazon Athena](https://docs.aws.amazon.com/athena/latest/ug/what-is.html) is an interactive query service that makes it easy to analyze data directly in Amazon Simple Storage Service (Amazon S3) using standard SQL. With a few actions in the AWS Management Console, you can point Athena at your data stored in Amazon S3 and begin using standard SQL to run ad-hoc queries and get results in seconds. | Key | UX Label | Required | Description | @@ -856,6 +1272,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | delimiter | Delimiter | No | Only for format = json. How multiple json documents are delimited within one file | #### Google BigQuery + [BigQuery](https://cloud.google.com/bigquery) is a fully managed, AI-ready data analytics platform that helps you maximize value from your data and is designed to be multi-engine, multi-format, and multi-cloud. | Key | UX Label | Required | Description | @@ -864,6 +1281,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | dataset | Dataset | Yes | The GCP dataset name. | #### ClickHouse Server + [ClickHouse](https://clickhouse.com/) is an open-source column-oriented database management system that allows generating analytical data reports in real-time. 
| Key | UX Label | Required | Description |

@@ -873,6 +1291,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| database | Database | Yes | The name of the database. |

#### Google Cloud SQL
+
[Google Cloud SQL](https://cloud.google.com/sql) is a fully managed, cost-effective relational database service for PostgreSQL, MySQL, and SQL Server.

| Key | UX Label | Required | Description |

@@ -916,6 +1335,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| schema | Schema | No | The name of the schema. |

#### DuckDB Server
+
[DuckDB](https://duckdb.org/) supports a feature-rich SQL dialect complemented with deep integrations into client APIs.

| Key | UX Label | Required | Description |

@@ -932,7 +1352,18 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| location | Location | No | The AWS S3 path. Must be in the form of a URL. |
| format | Format | No | The format of the files |

+#### Hive
+
+[Apache Hive](https://hive.apache.org/) is a distributed, fault-tolerant data warehouse system that enables analytics at massive scale. Built on top of Apache Hadoop, Hive allows users to read, write, and manage petabytes of data using SQL-like queries through HiveQL, with native support for cloud storage systems and enterprise-grade security features.
+
+| Key | UX Label | Required | Description |
+|--------------|-----------------|------------|-------------------------------------------------|
+| host | Host | Yes | The host to the Hive server. |
+| port | Port | No | The port to the Hive server. Defaults to 10000. |
+| database | Database | Yes | The name of the Hive database. |
+
#### IBM Informix and HCL Informix
+
[IBM Informix](https://www.ibm.com/products/informix) is a high performance, always-on, highly scalable and easily embeddable enterprise-class database optimized for the most demanding transactional and analytics workloads. As an object-relational engine, IBM Informix seamlessly integrates the best of relational and object-oriented capabilities enabling the flexible modeling of complex data structures and relationships.

| Key | UX Label | Required | Description |

@@ -980,6 +1411,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| serviceName | Service Name | Yes | The name of the service. |

#### PostgreSQL
+
[PostgreSQL](https://www.postgresql.org/) is a powerful, open source object-relational database system with over 35 years of active development that has earned it a strong reputation for reliability, feature robustness, and performance.

| Key | UX Label | Required | Description |

@@ -998,6 +1430,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| schema | Schema | No | The name of the schema. |

#### Google Pub/Sub
+
[Google Cloud](https://cloud.google.com/pubsub) service to ingest events for streaming into BigQuery, data lakes, or operational databases.

| Key | UX Label | Required | Description |

@@ -1005,6 +1438,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge
| project | Project | Yes | The GCP project name. |

#### Amazon Redshift Server
+
[Amazon Redshift](https://aws.amazon.com/redshift/) is a cloud data warehouse that powers data-driven decisions with the best price-performance.
| Key | UX Label | Required | Description | @@ -1016,6 +1450,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | account | Account | No | The account used by the server. | #### Amazon S3 Server and Compatible Servers + [Amazon Simple Storage Service (Amazon S3)](https://aws.amazon.com/s3/) is an object storage service offering industry-leading scalability, data availability, security, and performance. Millions of customers of all sizes and industries store, manage, analyze, and protect any amount of data for virtually any use case, such as data lakes, cloud-native applications, and mobile apps. Other vendors have implemented a compatible implementation of S3. | Key | UX Label | Required | Description | @@ -1026,6 +1461,7 @@ If your server is not in the list, please use [custom](#custom-server) and sugge | delimiter | Delimiter | No | Only for format = json. How multiple json documents are delimited within one file | #### SFTP Server + Secure File Transfer Protocol (SFTP) is a network protocol that enables secure and encrypted file transfers between a client and a server. | Key | UX Label | Required | Description | @@ -1046,7 +1482,8 @@ Secure File Transfer Protocol (SFTP) is a network protocol that enables secure a | schema | Schema | Yes | The name of the schema. | #### Microsoft SQL Server -[Microsoft SQL Server](https://www.microsoft.com/en-us/sql-server/sql-server-downloads) is a proprietary relational database management system developed by Microsoft. + +[Microsoft SQL Server](https://www.microsoft.com/en-us/sql-server/sql-server-downloads) is a proprietary relational database management system developed by Microsoft. | Key | UX Label | Required | Description | |----------|----------|----------|----------------------------------------------------| @@ -1107,8 +1544,8 @@ Secure File Transfer Protocol (SFTP) is a network protocol that enables secure a If you need another property, use [custom properties](#custom-properties). - ## Custom Properties + This section covers custom properties you may find in a data contract. ### Example @@ -1119,20 +1556,22 @@ customProperties: value: gcsc.ruleset.name - property: somePropertyName value: property.value - - property: dataprocClusterName # Used for specific applications like Elevate + - property: dataprocClusterName # Used for specific applications value: [cluster name] + description: Cluster name for specific applications ``` ### Definitions -| Key | UX label | Required | Description | -|---------------------------|----------------------|----------|-------------------------------------------------------------------------------------------------------------------| -| customProperties | Custom Properties | No | A list of key/value pairs for custom properties. Initially created to support the REF ruleset property. | -| customProperties.property | Property | No | The name of the key. Names should be in camel case–the same as if they were permanent properties in the contract. | -| customProperties.value | Value | No | The value of the key. | - +| Key | UX label | Required | Description | +|------------------------------|-------------------|----------|-------------------------------------------------------------------------------------------------------------------| +| customProperties | Custom Properties | No | A list of key/value pairs for custom properties. Initially created to support the REF ruleset property. | +| customProperties.property | Property | No | The name of the key. 
Names should be in camel case, the same as if they were permanent properties in the contract. |
+| customProperties.value | Value | No | The value of the key. |
+| customProperties.description | Description | No | Description for humans. |

## Other Properties
+
This section covers other properties you may find in a data contract.

### Example

@@ -1141,7 +1580,6 @@ This section covers other properties you may find in a data contract.

```YAML
contractCreatedTs: 2024-09-17T11:58:08Z
```

-
### Other properties definition

| Key | UX label | Required | Description |

@@ -1152,5 +1590,4 @@ contractCreatedTs: 2024-09-17T11:58:08Z

[Check full example here.](examples/all/full-example.odcs.yaml)

-
-All trademarks are the property of their respective owners.
\ No newline at end of file
+All trademarks are the property of their respective owners.
\ No newline at end of file

From 1704cb2cdc3a97861eb7a111c5f9e798ba9c15d1 Mon Sep 17 00:00:00 2001
From: pkoper
Date: Mon, 20 Oct 2025 13:38:51 +0100
Subject: [PATCH 10/11] chore(DEV SETUP): updated CONTRIBUTING.md with how to
 create a local dev with setup scripts

---
 CONTRIBUTING.md | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8545d98..cb712a2 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -7,3 +7,44 @@ image: "https://raw.githubusercontent.com/bitol-io/artwork/main/horizontal/color
 # Contributing to Open Data Contract Standard

 Thank you for your interest in contributing to Open Data Contract Standard (ODCS). Please refer to the [TSC contributing guidelines](https://github.com/bitol-io/tsc/blob/main/CONTRIBUTING.md).
+
+## Create a Local Development Environment
+
+To set up a local development environment, use the predefined setup scripts:
+
+
For a Windows development environment, run:
+
+```shell
+. .\src\script\dev_setup.ps1
+```
+
+ +
For a Unix development environment, run:
+
+```bash
+source src/script/dev_setup.sh
+```
+
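+
+As a quick sanity check after the script finishes (assuming it completed without errors), you can confirm the tooling is available inside the activated environment:
+
+```bash
+python3 -m pip --version   # should report the pip inside .venv
+pre-commit --version       # should report the installed pre-commit version
+```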
+
+
+Each of these scripts will:
+
+* Check the virtual environment:
+  * Create and activate it if missing
+  * Activate it if not already active
+* Check `pip` status:
+  * Verify the current version
+  * Compare against the latest version
+  * Upgrade if necessary
+* Check `pre-commit` status:
+  * Install if missing
+* Check `pre-commit` version:
+  * Verify the current version
+  * Compare against the latest version
+  * Upgrade if necessary
+* Check `pre-commit` hooks:
+  * Create `.pre-commit-config.yaml` if missing
+  * Update and install all hooks
+* Check `.markdownlint.json`:
+  * Create the file if it doesn’t exist

From 787c2d315469bf0817a610f2de9845400699c049 Mon Sep 17 00:00:00 2001
From: pkoper
Date: Fri, 7 Nov 2025 10:41:16 +0000
Subject: [PATCH 11/11] chore(DEV SETUP): corrected setup scripts indentation

---
 src/script/dev_setup.sh | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/script/dev_setup.sh b/src/script/dev_setup.sh
index 289aae0..0581747 100644
--- a/src/script/dev_setup.sh
+++ b/src/script/dev_setup.sh
@@ -2,7 +2,7 @@
 set -o pipefail

 # 🎨 Colors
-NC='\033[0m' # No Color
+NC='\033[0m' # No Color
 CYAN='\033[0;36m'
 YELLOW='\033[0;33m'
 GREEN='\033[0;32m'
 RED='\033[0;31m'
 MAGENTA='\033[0;35m'

 # 🌱 Default virtual environment directory
 VENV_DIR=".venv"

 # -----------------------------
 # 🎯 Logging Functions
 # -----------------------------
 print_info() {
-  echo -e "💡 [${CYAN}INFO${NC}] ${CYAN}$1${NC}";
-  }
+  echo -e "💡 [${CYAN}INFO${NC}] ${CYAN}$1${NC}"
+}
 print_task() {
-  echo -e "⚡ [${YELLOW}TASK${NC}] ${YELLOW}$1${NC}";
-  }
+  echo -e "⚡ [${YELLOW}TASK${NC}] ${YELLOW}$1${NC}"
+}
 print_pass() {
-  echo -e "✅ [${GREEN}PASS${NC}] ${GREEN}$1${NC}";
-  }
+  echo -e "✅ [${GREEN}PASS${NC}] ${GREEN}$1${NC}"
+}
 print_warning() {
-  echo -e "⚠️ [${MAGENTA}WARN${NC}] ${MAGENTA}$1${NC}";
-  }
+  echo -e "⚠️ [${MAGENTA}WARN${NC}] ${MAGENTA}$1${NC}"
+}
 print_error() {
-  echo -e "❌ [${RED}FAIL${NC}] ${RED}$1${NC}";
-  }
+  echo -e "❌ [${RED}FAIL${NC}] ${RED}$1${NC}"
+}

 # -----------------------------
 # 🐍 Virtual Environment Check
 # -----------------------------
@@ -62,7 +62,7 @@ pip_current_version_check() {
 }

 pip_latest_version_check() {
-  DRY_RUN_OUTPUT=$(python3 -m pip install --upgrade pip --dry-run 2>/dev/null)
+  DRY_RUN_OUTPUT=$(python3 -m pip install --upgrade pip --dry-run 2> /dev/null)
   LATEST_VERSION=$(echo "$DRY_RUN_OUTPUT" | grep -oP 'pip-[0-9]+\.[0-9]+(\.[0-9]+)?' | head -n1 | tr -d 'pip-')
   [[ -z "$LATEST_VERSION" ]] && LATEST_VERSION=$(echo "$DRY_RUN_OUTPUT" | grep -oP '\([0-9]+\.[0-9]+(\.[0-9]+)?\)' | head -n1 | tr -d '()')
   [[ -z "$LATEST_VERSION" ]] && print_error "Could not determine the latest pip version."
@@ -86,7 +86,7 @@ pip_status_check() {
 # -----------------------------
 pre_commit_status_check() {
   print_info "Checking pre-commit installation ..."
-  if command -v pre-commit >/dev/null 2>&1; then
+  if command -v pre-commit > /dev/null 2>&1; then
     print_pass "pre-commit is installed."
   else
     print_warning "pre-commit is missing."
@@ -100,7 +100,7 @@ pre_commit_current_version_check() {
 }

 pre_commit_latest_version_check() {
-  DRY_RUN_OUTPUT=$(pip install pre-commit --upgrade --dry-run 2>/dev/null)
+  DRY_RUN_OUTPUT=$(pip install pre-commit --upgrade --dry-run 2> /dev/null)
   LATEST_VERSION=$(echo "$DRY_RUN_OUTPUT" | grep -oP 'commit-[0-9]+\.[0-9]+(\.[0-9]+)?' | head -n1 | tr -d 'commit-')
   [[ -z "$LATEST_VERSION" ]] && LATEST_VERSION=$(echo "$DRY_RUN_OUTPUT" | grep -oP '\([0-9]+\.[0-9]+(\.[0-9]+)?\)' | head -n1 | tr -d '()')
   [[ -z "$LATEST_VERSION" ]] && print_error "Could not determine the latest pre-commit version."
@@ -123,7 +123,7 @@ pre_commit_version_check() {
 # -----------------------------
 # 📄 Pre Commit Config File Creation
 # -----------------------------
 pre_commit_config_create() {
-  cat <<EOF > .pre-commit-config.yaml
+  cat << EOF > .pre-commit-config.yaml
 default_stages: [pre-commit, manual]

 repos:
@@ -154,7 +154,7 @@ EOF
 # 📄 Markdown Lint Config File Creation
 # -----------------------------
 markdownlint_create() {
-  cat <<EOF > .markdownlint.json
+  cat << EOF > .markdownlint.json
 {
   "comment": "Markdown Lint Rules",
   "default": true,
@@ -172,7 +172,7 @@ EOF
 # 📄 Commit Lint Config File Creation
 # -----------------------------
 commitlintrc_create() {
-  cat <<EOF > .commitlintrc.json
+  cat << EOF > .commitlintrc.json
 {
   "rules": {
     "body-leading-blank": [1, "always"],