diff --git a/datahub-web-react/src/app/ingest/source/builder/RecipeForm/constants.ts b/datahub-web-react/src/app/ingest/source/builder/RecipeForm/constants.ts index aea5a7eb9b5cf..443f87c93efb6 100644 --- a/datahub-web-react/src/app/ingest/source/builder/RecipeForm/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/RecipeForm/constants.ts @@ -73,6 +73,13 @@ import { TARGET_PLATFORM, TARGET_PLATFORM_INSTANCE, } from '@app/ingest/source/builder/RecipeForm/dbt_cloud'; +import { + DORIS, + DORIS_DATABASE, + DORIS_HOST_PORT, + DORIS_PASSWORD, + DORIS_USERNAME, +} from '@app/ingest/source/builder/RecipeForm/doris'; import { HIVE_DATABASE, HIVE_HOST_PORT, @@ -499,6 +506,18 @@ export const RECIPE_FIELDS: RecipeFields = { ], filterSectionTooltip: 'Include or exclude specific Schemas, Tables and Views from ingestion.', }, + [DORIS]: { + fields: [DORIS_HOST_PORT, DORIS_USERNAME, DORIS_PASSWORD, DORIS_DATABASE], + filterFields: [SCHEMA_ALLOW, SCHEMA_DENY, TABLE_ALLOW, TABLE_DENY, VIEW_ALLOW, VIEW_DENY], + advancedFields: [ + INCLUDE_TABLES, + INCLUDE_VIEWS, + TABLE_PROFILING_ENABLED, + COLUMN_PROFILING_ENABLED, + STATEFUL_INGESTION_ENABLED, + ], + filterSectionTooltip: 'Include or exclude specific Schemas, Tables and Views from ingestion.', + }, [DATABRICKS]: { fields: [WORKSPACE_URL, TOKEN], filterFields: [ diff --git a/datahub-web-react/src/app/ingest/source/builder/RecipeForm/doris.ts b/datahub-web-react/src/app/ingest/source/builder/RecipeForm/doris.ts new file mode 100644 index 0000000000000..4a59370f3ef2f --- /dev/null +++ b/datahub-web-react/src/app/ingest/source/builder/RecipeForm/doris.ts @@ -0,0 +1,48 @@ +import { FieldType, RecipeField } from '@app/ingest/source/builder/RecipeForm/common'; + +export const DORIS = 'doris'; + +export const DORIS_HOST_PORT: RecipeField = { + name: 'host_port', + label: 'Host and Port', + tooltip: + "The host and port where Apache Doris is running. For example, 'doris-server:9030'. 
Note: this host must be accessible on the network where DataHub is running (or allowed via an IP Allow List, AWS PrivateLink, etc).", + type: FieldType.TEXT, + fieldPath: 'source.config.host_port', + placeholder: 'doris-server:9030', + required: true, + rules: null, +}; + +export const DORIS_DATABASE: RecipeField = { + name: 'database', + label: 'Database', + tooltip: 'Ingest metadata for a specific Database.', + type: FieldType.TEXT, + fieldPath: 'source.config.database', + placeholder: 'my_db', + required: true, + rules: null, +}; + +export const DORIS_USERNAME: RecipeField = { + name: 'username', + label: 'Username', + tooltip: 'The Apache Doris username used to extract metadata.', + type: FieldType.TEXT, + fieldPath: 'source.config.username', + placeholder: 'root', + required: true, + rules: null, +}; + +export const DORIS_PASSWORD: RecipeField = { + name: 'password', + label: 'Password', + tooltip: 'The Apache Doris password for the user.', + type: FieldType.SECRET, + fieldPath: 'source.config.password', + placeholder: 'password', + required: true, + rules: null, +}; diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index afc25db10b28d..0b68b8f57e7d5 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -151,6 +151,14 @@ "docsUrl": "https://docs.datahub.com/docs/generated/ingestion/sources/mariadb/", "recipe": "source:\n type: mariadb\n config:\n # Coordinates\n host_port: null\n # The name\n database: null\n # Credentials\n username: null\n include_views: true\n include_tables: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, + { + "urn": "urn:li:dataPlatform:doris", + "name": "doris", + "displayName": "Apache Doris", + "description": "Import Tables, Views, Databases, Schemas, and statistics from Apache Doris.", + "docsUrl": 
"https://docs.datahub.com/docs/generated/ingestion/sources/doris/", + "recipe": "source:\n type: doris\n config:\n # Coordinates\n host_port: null\n # The name\n database: null\n # Credentials\n username: null\n include_views: true\n include_tables: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" + }, { "urn": "urn:li:dataPlatform:mongodb", "name": "mongodb", diff --git a/datahub-web-react/src/app/ingest/source/conf/doris/doris.ts b/datahub-web-react/src/app/ingest/source/conf/doris/doris.ts new file mode 100644 index 0000000000000..973a85bc2ee20 --- /dev/null +++ b/datahub-web-react/src/app/ingest/source/conf/doris/doris.ts @@ -0,0 +1,36 @@ +import { SourceConfig } from '@app/ingest/source/conf/types'; + +import dorisLogo from '@images/dorislogo.png'; + +const placeholderRecipe = `\ +source: + type: doris + config: + # Coordinates + host_port: # Your Apache Doris host and port, e.g. doris:9030 + database: # Your Apache Doris database name, e.g. datahub + + # Credentials + # Add secret in Secrets Tab with relevant names for each variable + username: "\${DORIS_USERNAME}" # Your Apache Doris username, e.g. root + password: "\${DORIS_PASSWORD}" # Your Apache Doris password, e.g. 
password_01 + + # Options + include_tables: True + include_views: True + + # Profiling + profiling: + enabled: false +`; + +/** Apache Doris ingestion source config: placeholder recipe, display name, docs link and logo. */ +const dorisConfig: SourceConfig = { + type: 'doris', + placeholderRecipe, + displayName: 'Apache Doris', + docsUrl: 'https://docs.datahub.com/docs/generated/ingestion/sources/doris/', + logoUrl: dorisLogo, +}; + +export default dorisConfig; diff --git a/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/constants.ts b/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/constants.ts index b00e2908e2295..196c3d3fbd22e 100644 --- a/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/constants.ts +++ b/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/constants.ts @@ -73,6 +73,13 @@ import { TARGET_PLATFORM, TARGET_PLATFORM_INSTANCE, } from '@app/ingestV2/source/builder/RecipeForm/dbt_cloud'; +import { + DORIS, + DORIS_DATABASE, + DORIS_HOST_PORT, + DORIS_PASSWORD, + DORIS_USERNAME, +} from '@app/ingestV2/source/builder/RecipeForm/doris'; import { HIVE_DATABASE, HIVE_HOST_PORT, @@ -473,6 +480,18 @@ export const RECIPE_FIELDS: RecipeFields = { ], filterSectionTooltip: 'Include or exclude specific Schemas, Tables and Views from ingestion.', }, + [DORIS]: { + fields: [DORIS_HOST_PORT, DORIS_USERNAME, DORIS_PASSWORD, DORIS_DATABASE], + filterFields: [SCHEMA_ALLOW, SCHEMA_DENY, TABLE_ALLOW, TABLE_DENY, VIEW_ALLOW, VIEW_DENY], + advancedFields: [ + INCLUDE_TABLES, + INCLUDE_VIEWS, + TABLE_PROFILING_ENABLED, + COLUMN_PROFILING_ENABLED, + STATEFUL_INGESTION_ENABLED, + ], + filterSectionTooltip: 'Include or exclude specific Schemas, Tables and Views from ingestion.', + }, [DATABRICKS]: { fields: [WORKSPACE_URL, TOKEN], filterFields: [ diff --git a/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/doris.ts b/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/doris.ts new file mode 100644 index 
0000000000000..c382e3c0f0022 --- /dev/null +++ 
b/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/doris.ts @@ -0,0 +1,48 @@ +import { FieldType, RecipeField } from '@app/ingestV2/source/builder/RecipeForm/common'; + +export const DORIS = 'doris'; + +export const DORIS_HOST_PORT: RecipeField = { + name: 'host_port', + label: 'Host and Port', + tooltip: + "The host and port where Apache Doris is running. For example, 'doris-server:9030'. Note: this host must be accessible on the network where DataHub is running (or allowed via an IP Allow List, AWS PrivateLink, etc).", + type: FieldType.TEXT, + fieldPath: 'source.config.host_port', + placeholder: 'doris-server:9030', + required: true, + rules: null, +}; + +export const DORIS_DATABASE: RecipeField = { + name: 'database', + label: 'Database', + tooltip: 'Ingest metadata for a specific Database.', + type: FieldType.TEXT, + fieldPath: 'source.config.database', + placeholder: 'my_db', + required: true, + rules: null, +}; + +export const DORIS_USERNAME: RecipeField = { + name: 'username', + label: 'Username', + tooltip: 'The Apache Doris username used to extract metadata.', + type: FieldType.TEXT, + fieldPath: 'source.config.username', + placeholder: 'root', + required: true, + rules: null, +}; + +export const DORIS_PASSWORD: RecipeField = { + name: 'password', + label: 'Password', + tooltip: 'The Apache Doris password for the user.', + type: FieldType.SECRET, + fieldPath: 'source.config.password', + placeholder: 'password', + required: true, + rules: null, +}; diff --git a/datahub-web-react/src/app/ingestV2/source/builder/sources.json b/datahub-web-react/src/app/ingestV2/source/builder/sources.json index 17da5f894f491..9b46a49a91e56 100644 --- a/datahub-web-react/src/app/ingestV2/source/builder/sources.json +++ b/datahub-web-react/src/app/ingestV2/source/builder/sources.json @@ -151,6 +151,14 @@ "docsUrl": "https://docs.datahub.com/docs/generated/ingestion/sources/mariadb/", "recipe": "source:\n type: mariadb\n config:\n # Coordinates\n host_port: null\n # 
The name\n database: null\n # Credentials\n username: null\n include_views: true\n include_tables: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" }, + { + "urn": "urn:li:dataPlatform:doris", + "name": "doris", + "displayName": "Apache Doris", + "description": "Import Tables, Views, Databases, Schemas, and statistics from Apache Doris.", + "docsUrl": "https://docs.datahub.com/docs/generated/ingestion/sources/doris/", + "recipe": "source:\n type: doris\n config:\n # Coordinates\n host_port: null\n # The name\n database: null\n # Credentials\n username: null\n include_views: true\n include_tables: true\n profiling:\n enabled: true\n profile_table_level_only: true\n stateful_ingestion:\n enabled: true" + }, { "urn": "urn:li:dataPlatform:mongodb", "name": "mongodb", diff --git a/datahub-web-react/src/app/ingestV2/source/conf/doris/doris.ts b/datahub-web-react/src/app/ingestV2/source/conf/doris/doris.ts new file mode 100644 index 0000000000000..616af87490f3e --- /dev/null +++ b/datahub-web-react/src/app/ingestV2/source/conf/doris/doris.ts @@ -0,0 +1,36 @@ +import { SourceConfig } from '@app/ingestV2/source/conf/types'; + +import dorisLogo from '@images/dorislogo.png'; + +const placeholderRecipe = `\ +source: + type: doris + config: + # Coordinates + host_port: # Your Apache Doris host and port, e.g. doris:9030 + database: # Your Apache Doris database name, e.g. datahub + + # Credentials + # Add secret in Secrets Tab with relevant names for each variable + username: "\${DORIS_USERNAME}" # Your Apache Doris username, e.g. root + password: "\${DORIS_PASSWORD}" # Your Apache Doris password, e.g. 
password_01 + + # Options + include_tables: True + include_views: True + + # Profiling + profiling: + enabled: false +`; + +/** Apache Doris ingestion source config: placeholder recipe, display name, docs link and logo. */ +const dorisConfig: SourceConfig = { + type: 'doris', + placeholderRecipe, + displayName: 'Apache Doris', + docsUrl: 'https://docs.datahub.com/docs/generated/ingestion/sources/doris/', + logoUrl: dorisLogo, +}; + +export default dorisConfig; diff --git a/datahub-web-react/src/images/dorislogo.png b/datahub-web-react/src/images/dorislogo.png new file mode 100644 index 0000000000000..32c2595d33cf9 Binary files /dev/null and b/datahub-web-react/src/images/dorislogo.png differ diff --git a/metadata-ingestion/docs/sources/doris/doris_pre.md b/metadata-ingestion/docs/sources/doris/doris_pre.md new file mode 100644 index 0000000000000..72a3d4afc1850 --- /dev/null +++ b/metadata-ingestion/docs/sources/doris/doris_pre.md @@ -0,0 +1,133 @@ +### Prerequisites + +In order to execute this source, the user credentials need the following privileges: + +```sql +-- Grant necessary privileges to the DataHub user +GRANT SELECT_PRIV ON your_database.* TO 'datahub_user'@'%'; +GRANT SHOW_VIEW_PRIV ON your_database.* TO 'datahub_user'@'%'; + +-- For profiling (optional, if profiling is enabled) +GRANT SELECT_PRIV ON your_database.* TO 'datahub_user'@'%'; +``` + +**Note:** `SELECT_PRIV` is required to read table structures and perform profiling operations. `SHOW_VIEW_PRIV` is required to ingest views. 
+ + #### Apache Doris Compatibility Notes + +Apache Doris uses the MySQL protocol for client connections, but with some key differences: + +**Port Configuration:** + +- Default Doris query port: **9030** (FE MySQL protocol port) +- **Not** MySQL's default 3306 +- Ensure you use `host_port: doris-server:9030` in your configuration + +**Architecture:** + +- Doris uses a Frontend (FE) and Backend (BE) architecture +- DataHub connects to the FE node on port 9030 +- Ensure the FE node is accessible and healthy + +**Data Types:** + +- Doris includes additional data types: `HLL`, `BITMAP`, `ARRAY`, `JSONB`, `QUANTILE_STATE` +- These types are automatically mapped to appropriate DataHub types +- No additional configuration needed + +**Stored Procedures:** + +- Apache Doris does not support stored procedures +- The `information_schema.ROUTINES` table is a MySQL compatibility stub (always empty) +- The connector automatically handles this limitation + +### Troubleshooting + +#### Connection Issues + +**Problem:** `Can't connect to MySQL server` or connection timeouts + +**Solutions:** + +- Verify you're using port **9030** (query port), not 8030 (FE HTTP port), 9050 (BE heartbeat port), or 3306 (MySQL default) +- Check that the Doris FE (Frontend) node is running: `curl http://fe-host:8030/api/bootstrap` +- Ensure network connectivity and firewall rules allow connections to port 9030 +- Verify the FE node has registered BE nodes: `SHOW BACKENDS;` + +**Problem:** `Access denied for user` + +**Solutions:** + +- Verify the user has been granted `SELECT_PRIV` and `SHOW_VIEW_PRIV` +- Check grants with: `SHOW GRANTS FOR 'datahub_user'@'%';` +- Ensure the user is allowed to connect from your host: use `'%'` for any host or specify the IP + +#### Missing Metadata + +**Problem:** Tables or views are not being ingested + +**Solutions:** + +- Verify the user has `SELECT_PRIV` on the target databases/tables +- Check that tables exist and are visible: `SHOW TABLES IN your_database;` +- Review `schema_pattern` and 
`table_pattern` in your recipe configuration +- Ensure the database is not filtered out by your configuration + +**Problem:** Column types showing as UNKNOWN + +**Solutions:** + +- This typically happens with Doris-specific types in older DataHub versions +- Ensure you're using the latest DataHub version which includes Doris type mappings +- Check Doris FE logs for any metadata query errors + +#### Performance Issues + +**Problem:** Ingestion is slow or timing out + +**Solutions:** + +- Use `schema_pattern` and `table_pattern` to limit scope: `schema_pattern: {"allow": ["important_db"]}` +- Enable table-level-only profiling: `profiling.profile_table_level_only: true` +- Disable profiling if not needed: `profiling.enabled: false` +- Increase query timeouts if you have very large tables: `options.connect_timeout: 300` + +**Problem:** Doris FE or BE is overloaded during ingestion + +**Solutions:** + +- Limit the number of columns profiled per table: `profiling.max_number_of_fields_to_profile: 10` +- Schedule ingestion during off-peak hours +- Set `profiling.query_combiner_enabled: false` to avoid complex queries + +#### Profiling Issues + +**Problem:** Profiling fails or returns no statistics + +**Solutions:** + +- Verify the user has `SELECT_PRIV` on target tables +- Check that tables contain data (empty tables have no statistics) +- Ensure Doris statistics are up to date: `ANALYZE TABLE your_table;` +- Review Doris FE logs for query errors during profiling + +#### Doris-Specific Issues + +**Problem:** Warnings about `DUPLICATE KEY` or `DISTRIBUTED BY HASH` + +**Solutions:** + +- These are informational warnings from SQLAlchemy parsing Doris-specific table properties +- They do not affect ingestion and can be safely ignored +- The connector handles these properties correctly + +**Problem:** View lineage not being captured + +**Solutions:** + +- Ensure `include_view_lineage: true` (enabled by default) +- Verify views are created with proper table references +- Check that referenced 
tables are accessible to the DataHub user +- Review `include_view_column_lineage` configuration + +For additional support, consult the [Apache Doris documentation](https://doris.apache.org/docs) or reach out to the DataHub community. diff --git a/metadata-ingestion/docs/sources/doris/doris_recipe.yml b/metadata-ingestion/docs/sources/doris/doris_recipe.yml new file mode 100644 index 0000000000000..0c06a31cce12a --- /dev/null +++ b/metadata-ingestion/docs/sources/doris/doris_recipe.yml @@ -0,0 +1,19 @@ +source: + type: doris + config: + # Coordinates + # Note: Doris uses port 9030, not MySQL's 3306 + host_port: localhost:9030 + database: dbname + + # Credentials + username: root + password: example + + # If you need to use SSL with Apache Doris: + # options: + # connect_args: + # ssl_ca: "path_to/server-ca.pem" + # ssl_cert: "path_to/client-cert.pem" + # ssl_key: "path_to/client-key.pem" +# sink configs diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 55255f7e09938..8f121589a48d6 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -854,7 +854,8 @@ "mongodb = datahub.ingestion.source.mongodb:MongoDBSource", "mssql = datahub.ingestion.source.sql.mssql:SQLServerSource", "mysql = datahub.ingestion.source.sql.mysql:MySQLSource", - "mariadb = datahub.ingestion.source.sql.mariadb.MariaDBSource", + "mariadb = datahub.ingestion.source.sql.mariadb:MariaDBSource", + "doris = datahub.ingestion.source.sql.doris:DorisSource", "okta = datahub.ingestion.source.identity.okta:OktaSource", "oracle = datahub.ingestion.source.sql.oracle:OracleSource", "postgres = datahub.ingestion.source.sql.postgres:PostgresSource", diff --git a/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json b/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json index a827a2c717a1c..8ba0f971730c5 100644 --- a/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json +++ 
b/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json @@ -641,6 +641,87 @@ "platform_name": "Demo Data", "support_status": null }, + "doris": { + "capabilities": [ + { + "capability": "CONTAINERS", + "description": "Enabled by default", + "subtype_modifier": [ + "Database", + "Schema" + ], + "supported": true + }, + { + "capability": "CLASSIFICATION", + "description": "Optionally enabled via `classification.enabled`", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "LINEAGE_FINE", + "description": "Enabled by default to get lineage for views via `include_view_column_lineage`", + "subtype_modifier": [ + "View" + ], + "supported": true + }, + { + "capability": "DATA_PROFILING", + "description": "Optionally enabled via configuration", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "DESCRIPTIONS", + "description": "Enabled by default", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "DELETION_DETECTION", + "description": "Enabled by default via stateful ingestion", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "DOMAINS", + "description": "Supported via the `domain` config field", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "PLATFORM_INSTANCE", + "description": "Enabled by default", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "SCHEMA_METADATA", + "description": "Enabled by default", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "LINEAGE_COARSE", + "description": "Enabled by default to get lineage for views via `include_view_lineage`", + "subtype_modifier": [ + "View" + ], + "supported": true + }, + { + "capability": "TEST_CONNECTION", + "description": "Enabled by default", + "subtype_modifier": null, + "supported": true + } + ], + "classname": "datahub.ingestion.source.sql.doris.DorisSource", + "platform_id": "doris", + "platform_name": "Apache Doris", + 
"support_status": "INCUBATING" + }, "dremio": { "capabilities": [ { diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/doris.py b/metadata-ingestion/src/datahub/ingestion/source/sql/doris.py new file mode 100644 index 0000000000000..0665f0bc89c87 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/doris.py @@ -0,0 +1,141 @@ +from typing import Any, List + +from pydantic.fields import Field +from sqlalchemy.dialects.mysql import base +from sqlalchemy.engine.reflection import Inspector + +from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.source.sql.mysql import MySQLConfig, MySQLSource +from datahub.ingestion.source.sql.sql_common import ( + make_sqlalchemy_type, + register_custom_type, +) +from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure +from datahub.metadata.schema_classes import ( + ArrayTypeClass, + BytesTypeClass, + RecordTypeClass, +) + +# Register Doris-specific data types +# These types are unique to Apache Doris and not present in standard MySQL + +# HyperLogLog - Used for approximate distinct count aggregations +HLL = make_sqlalchemy_type("HLL") +register_custom_type(HLL, BytesTypeClass) # Treat as binary data in DataHub + +# Bitmap - Used for bitmap indexing and set operations +BITMAP = make_sqlalchemy_type("BITMAP") +register_custom_type(BITMAP, BytesTypeClass) # Treat as binary data in DataHub + +# Array - Native array type support +DORIS_ARRAY = make_sqlalchemy_type("ARRAY") +register_custom_type(DORIS_ARRAY, ArrayTypeClass) # Proper array type in DataHub + +# JSONB - Binary JSON format (more efficient than MySQL's JSON) +JSONB = make_sqlalchemy_type("JSONB") +register_custom_type(JSONB, RecordTypeClass) # Treat as record/struct in DataHub + +# QUANTILE_STATE - For approximate percentile 
calculations +QUANTILE_STATE = make_sqlalchemy_type("QUANTILE_STATE") +register_custom_type(QUANTILE_STATE, BytesTypeClass) + +# Register these types with the MySQL dialect so SQLAlchemy recognizes them +base.ischema_names["hll"] = HLL +base.ischema_names["bitmap"] = BITMAP +base.ischema_names["array"] = DORIS_ARRAY +base.ischema_names["jsonb"] = JSONB +base.ischema_names["quantile_state"] = QUANTILE_STATE + +# Handle case variations +base.ischema_names["HLL"] = HLL +base.ischema_names["BITMAP"] = BITMAP +base.ischema_names["ARRAY"] = DORIS_ARRAY +base.ischema_names["JSONB"] = JSONB +base.ischema_names["QUANTILE_STATE"] = QUANTILE_STATE + + +class DorisConfig(MySQLConfig): + # Override host_port to document Doris's default port + host_port: str = Field( + default="localhost:9030", + description="Doris FE (Frontend) host and port in the format host:port. Default port is 9030 (MySQL protocol), not 3306.", + ) + + # Override to hide stored procedure-related fields from docs since they don't work in Doris + # information_schema.ROUTINES is always empty per Doris documentation + # https://doris.apache.org/docs/3.x/admin-manual/system-tables/information_schema/routines + include_stored_procedures: HiddenFromDocs[bool] = Field( + default=False, + description="Stored procedures are not supported in Apache Doris. 
The information_schema.ROUTINES table is always empty.", + ) + + procedure_pattern: HiddenFromDocs[AllowDenyPattern] = Field( + default=AllowDenyPattern.allow_all(), + description="Not applicable for Apache Doris as stored procedures are not supported.", + ) + + +@platform_name("Apache Doris", id="doris") +@config_class(DorisConfig) +@support_status(SupportStatus.INCUBATING) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +class DorisSource(MySQLSource): + """ + This plugin extracts metadata from Apache Doris, which is largely MySQL-compatible. + + Apache Doris is a modern MPP analytical database that uses the MySQL protocol + for client connections. While Doris aims for MySQL compatibility, there are some + differences to be aware of: + + - Data Types: Doris has unique types (HLL, BITMAP, ARRAY, JSONB, QUANTILE_STATE) not in MySQL + - Stored Procedures: Not supported; information_schema.ROUTINES is always empty, so none are ingested + - System Tables: Uses virtual system tables that are read-only + - Default Port: 9030 (query port) instead of MySQL's 3306 + + This connector extends the MySQL connector and inherits most of its functionality, + including table/view metadata extraction and profiling capabilities. + """ + + config: DorisConfig + + def __init__(self, config: DorisConfig, ctx: Any): + super().__init__(config, ctx) + + def get_platform(self): + return "doris" + + def get_procedures_for_schema( + self, inspector: Inspector, schema: str, db_name: str + ) -> List[BaseProcedure]: + """ + Override to handle Doris's empty information_schema.ROUTINES table. + + According to Apache Doris documentation: + https://doris.apache.org/docs/3.x/admin-manual/system-tables/information_schema/routines + "This table is solely for the purpose of maintaining compatibility with MySQL behavior. + It is always empty." 
+ + Therefore, stored procedures are always disabled for Doris. + """ + if not self.config.include_stored_procedures: + return [] + + # Even if user explicitly enables stored procedures, return empty list + # because information_schema.ROUTINES is documented as always empty in Doris + self.report.report_warning( + f"{db_name}.{schema}", + "Stored procedures are not supported in Apache Doris. " + "The information_schema.ROUTINES table is always empty per Doris documentation.", + ) + return [] diff --git a/metadata-ingestion/tests/integration/doris/docker-compose.yml b/metadata-ingestion/tests/integration/doris/docker-compose.yml new file mode 100644 index 0000000000000..848e8a40efc90 --- /dev/null +++ b/metadata-ingestion/tests/integration/doris/docker-compose.yml @@ -0,0 +1,39 @@ +networks: + doris-net: + +services: + doris-fe: + image: apache/doris:fe-3.0.8 + container_name: "testdoris-fe" + hostname: fe + environment: + # Workaround for Java 17 cgroup v2 incompatibility in GitHub Actions + # These flags disable container metrics detection which causes NullPointerException + - JAVA_OPTS=-XX:-UseContainerSupport -XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap + ports: + - 58030:8030 # FE web UI + - 59030:9030 # MySQL protocol port + - 59010:9010 # FE edit log port + volumes: + - ./setup:/setup + networks: + - doris-net + entrypoint: ["/setup/fe-entrypoint.sh"] + + doris-be: + image: apache/doris:be-3.0.8 + container_name: "testdoris-be" + hostname: be + environment: + # Workaround for Java 17 cgroup v2 incompatibility in GitHub Actions + - JAVA_OPTS=-XX:-UseContainerSupport -XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap + ports: + - 58040:8040 # BE web UI + - 59050:9050 # BE heartbeat port + volumes: + - ./setup:/setup + depends_on: + - doris-fe + networks: + - doris-net + entrypoint: ["/setup/be-entrypoint.sh"] diff --git a/metadata-ingestion/tests/integration/doris/doris_mces_golden.json 
b/metadata-ingestion/tests/integration/doris/doris_mces_golden.json new file mode 100644 index 0000000000000..f568b58034d36 --- /dev/null +++ b/metadata-ingestion/tests/integration/doris/doris_mces_golden.json @@ -0,0 +1,999 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "doris", + "env": "PROD", + "database": "dorisdb" + }, + "name": "dorisdb", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:doris" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + 
"runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.analytics_data,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.analytics_data,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "analytics_data", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "dorisdb.analytics_data", + "platform": "urn:li:dataPlatform:doris", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(100)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "user_ids_hll", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "user_bitmap", + 
"nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "tags", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "metadata", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "created_at", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "DATETIME", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.analytics_data,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.analytics_data,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41", + "urn": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customers,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { 
+ "json": { + "container": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customers,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "customers", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "dorisdb.customers", + "platform": "urn:li:dataPlatform:doris", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(100)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(100)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "country", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(50)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "created_at", + "nullable": true, + "type": { + 
"type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "DATETIME", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customers,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customers,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41", + "urn": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.orders,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.orders,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "orders", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": 
"dorisdb.orders", + "platform": "urn:li:dataPlatform:doris", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "order_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "order_date", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "total_amount", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "DECIMAL(10, 2)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "status", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(50)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "created_at", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "DATETIME", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.orders,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + 
"json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.orders,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41", + "urn": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "is_view": "True", + "view_definition": "CREATE VIEW `customer_orders` AS SELECT \n `internal`.`dorisdb`.`c`.`customer_id`,\n `internal`.`dorisdb`.`c`.`customer_name`,\n COUNT(`internal`.`dorisdb`.`o`.`order_id`) as `order_count`,\n SUM(`internal`.`dorisdb`.`o`.`total_amount`) as `total_spent`\nFROM `internal`.`dorisdb`.`customers` `c`\nLEFT JOIN `internal`.`dorisdb`.`orders` `o` ON `internal`.`dorisdb`.`c`.`customer_id` = `internal`.`dorisdb`.`o`.`customer_id`\nGROUP BY `internal`.`dorisdb`.`c`.`customer_id`, `internal`.`dorisdb`.`c`.`customer_name`;" + }, + "name": 
"customer_orders", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "dorisdb.customer_orders", + "platform": "urn:li:dataPlatform:doris", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "customer_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(100)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "order_count", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "BIGINT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "total_spent", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "DECIMAL(38, 2)", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD)", 
+ "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW `customer_orders` AS SELECT \n `internal`.`dorisdb`.`c`.`customer_id`,\n `internal`.`dorisdb`.`c`.`customer_name`,\n COUNT(`internal`.`dorisdb`.`o`.`order_id`) as `order_count`,\n SUM(`internal`.`dorisdb`.`o`.`total_amount`) as `total_spent`\nFROM `internal`.`dorisdb`.`customers` `c`\nLEFT JOIN `internal`.`dorisdb`.`orders` `o` ON `internal`.`dorisdb`.`c`.`customer_id` = `internal`.`dorisdb`.`o`.`customer_id`\nGROUP BY `internal`.`dorisdb`.`c`.`customer_id`, `internal`.`dorisdb`.`c`.`customer_name`;", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.customers,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adoris%2Cdorisdb.customer_orders%2CPROD%29" + }, + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.orders,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adoris%2Cdorisdb.customer_orders%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.customers,PROD),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD),customer_id)" + ], + "transformOperation": "COPY: `internal`.`dorisdb`.`c`.`customer_id` AS `customer_id`", + "confidenceScore": 0.3, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adoris%2Cdorisdb.customer_orders%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.customers,PROD),customer_name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD),customer_name)" + ], + "transformOperation": "COPY: `internal`.`dorisdb`.`c`.`customer_name` AS `customer_name`", + "confidenceScore": 0.3, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adoris%2Cdorisdb.customer_orders%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.orders,PROD),order_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD),order_count)" + ], + "transformOperation": "SQL: COUNT(`internal`.`dorisdb`.`o`.`order_id`) AS `order_count`", + "confidenceScore": 0.3, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adoris%2Cdorisdb.customer_orders%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.orders,PROD),total_amount)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD),total_spent)" + 
], + "transformOperation": "SQL: SUM(`internal`.`dorisdb`.`o`.`total_amount`) AS `total_spent`", + "confidenceScore": 0.3, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adoris%2Cdorisdb.customer_orders%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41", + "urn": "urn:li:container:b6f0701f59028362849a2a8ebbb1de41" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adoris%2Cdorisdb.customer_orders%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "customProperties": {}, + "statement": { + "value": "CREATE VIEW `customer_orders` AS\nSELECT\n `internal`.`dorisdb`.`c`.`customer_id`,\n `internal`.`dorisdb`.`c`.`customer_name`,\n COUNT(`internal`.`dorisdb`.`o`.`order_id`) AS `order_count`,\n SUM(`internal`.`dorisdb`.`o`.`total_amount`) AS `total_spent`\nFROM `internal`.`dorisdb`.`customers` AS `c`\nLEFT JOIN `internal`.`dorisdb`.`orders` AS `o`\n ON `internal`.`dorisdb`.`c`.`customer_id` = `internal`.`dorisdb`.`o`.`customer_id`\nGROUP BY\n `internal`.`dorisdb`.`c`.`customer_id`,\n `internal`.`dorisdb`.`c`.`customer_name`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + 
"lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adoris%2Cdorisdb.customer_orders%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.customers,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.orders,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.customers,PROD),customer_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.customers,PROD),customer_name)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.orders,PROD),order_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,internal.dorisdb.orders,PROD),total_amount)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD),customer_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD),customer_name)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD),order_count)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:doris,dorisdb.customer_orders,PROD),total_spent)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adoris%2Cdorisdb.customer_orders%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + 
"json": { + "platform": "urn:li:dataPlatform:doris" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adoris%2Cdorisdb.customer_orders%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "doris-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/doris/doris_to_file.yml b/metadata-ingestion/tests/integration/doris/doris_to_file.yml new file mode 100644 index 0000000000000..af7e7b2903984 --- /dev/null +++ b/metadata-ingestion/tests/integration/doris/doris_to_file.yml @@ -0,0 +1,15 @@ +run_id: doris-test + +source: + type: doris + config: + username: root + password: "" + host_port: localhost:59030 + database: dorisdb + +sink: + type: file + config: + filename: "./doris_mces.json" + diff --git a/metadata-ingestion/tests/integration/doris/setup/be-entrypoint.sh b/metadata-ingestion/tests/integration/doris/setup/be-entrypoint.sh new file mode 100755 index 0000000000000..72d4145cdbe74 --- /dev/null +++ b/metadata-ingestion/tests/integration/doris/setup/be-entrypoint.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -e + +echo "Doris BE entrypoint starting..." 
+ +# Wait for FE to be somewhat ready +sleep 5 + +# Try to resolve hostnames to IPs, fallback if needed +FE_IP=$(getent hosts fe 2>/dev/null | awk '{ print $1 }' || echo "") +BE_IP=$(getent hosts be 2>/dev/null | awk '{ print $1 }' || hostname -i || echo "") + +if [ -z "$FE_IP" ]; then + echo "WARNING: Could not resolve FE IP via DNS, trying ping" + FE_IP=$(ping -c 1 fe 2>/dev/null | grep -oP '\(\K[0-9.]+' || echo "fe") +fi + +if [ -z "$BE_IP" ]; then + echo "WARNING: Could not resolve BE IP, using Docker internal IP" + BE_IP=$(hostname -i) +fi + +echo "Using FE IP: $FE_IP" +echo "Using BE IP: $BE_IP" + +# Set environment variables with resolved IPs +export FE_SERVERS="fe1:${FE_IP}:9010" +export BE_ADDR="${BE_IP}:9050" + +echo "Starting Doris BE with FE_SERVERS=${FE_SERVERS} BE_ADDR=${BE_ADDR}" + +# Ensure JAVA_OPTS is exported for the Doris startup script +# This workaround fixes Java 17 cgroup v2 incompatibility +if [ -n "$JAVA_OPTS" ]; then + echo "Applying JAVA_OPTS: $JAVA_OPTS" + export JAVA_OPTS +fi + +# Call the original Doris entrypoint +exec bash entry_point.sh + diff --git a/metadata-ingestion/tests/integration/doris/setup/fe-entrypoint.sh b/metadata-ingestion/tests/integration/doris/setup/fe-entrypoint.sh new file mode 100755 index 0000000000000..d895b44879131 --- /dev/null +++ b/metadata-ingestion/tests/integration/doris/setup/fe-entrypoint.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +echo "Doris FE entrypoint starting..." 
+ +# Wait briefly for network to be ready +sleep 2 + +# Try to resolve hostname to IP, fallback if needed +FE_IP=$(getent hosts fe 2>/dev/null | awk '{ print $1 }' || hostname -i || echo "") + +if [ -z "$FE_IP" ]; then + echo "WARNING: Could not resolve FE IP, using Docker internal IP" + FE_IP=$(hostname -i) +fi + +echo "Using FE IP: $FE_IP" + +# Set FE_SERVERS with resolved IP +export FE_SERVERS="fe1:${FE_IP}:9010" +export FE_ID=1 + +echo "Starting Doris FE with FE_SERVERS=${FE_SERVERS} FE_ID=${FE_ID}" + +# Ensure JAVA_OPTS is exported for the Doris startup script +# This workaround fixes Java 17 cgroup v2 incompatibility +if [ -n "$JAVA_OPTS" ]; then + echo "Applying JAVA_OPTS: $JAVA_OPTS" + export JAVA_OPTS +fi + +# Call the original Doris entrypoint +exec bash init_fe.sh + diff --git a/metadata-ingestion/tests/integration/doris/setup/setup.sql b/metadata-ingestion/tests/integration/doris/setup/setup.sql new file mode 100644 index 0000000000000..c78a732cda4a0 --- /dev/null +++ b/metadata-ingestion/tests/integration/doris/setup/setup.sql @@ -0,0 +1,84 @@ +-- Setup script for Apache Doris integration tests +-- Note: Doris syntax differs from MySQL in some ways + +DROP DATABASE IF EXISTS dorisdb; +CREATE DATABASE IF NOT EXISTS dorisdb; +USE dorisdb; + +-- Table: customers +-- Doris doesn't support INT(11) syntax, just use INT +CREATE TABLE IF NOT EXISTS customers ( + customer_id INT NOT NULL, + customer_name VARCHAR(100) NOT NULL, + email VARCHAR(100), + country VARCHAR(50), + created_at DATETIME DEFAULT CURRENT_TIMESTAMP +) +DUPLICATE KEY(customer_id) +DISTRIBUTED BY HASH(customer_id) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); + +-- Table: orders +CREATE TABLE IF NOT EXISTS orders ( + order_id INT NOT NULL, + customer_id INT NOT NULL, + order_date DATE, + total_amount DECIMAL(10,2), + status VARCHAR(50), + created_at DATETIME DEFAULT CURRENT_TIMESTAMP +) +DUPLICATE KEY(order_id) +DISTRIBUTED BY HASH(order_id) BUCKETS 1 +PROPERTIES ( + "replication_num" 
= "1" +); + +-- Table: analytics_data +-- In real Doris, this would use HLL, BITMAP, ARRAY, JSONB types +-- For now, using compatible types with comments +CREATE TABLE IF NOT EXISTS analytics_data ( + id INT NOT NULL, + name VARCHAR(100), + -- user_ids_hll would be HLL type in production + user_ids_hll STRING COMMENT 'HyperLogLog type in production Doris', + -- user_bitmap would be BITMAP type in production + user_bitmap STRING COMMENT 'Bitmap type in production Doris', + -- tags would be ARRAY in production + tags STRING COMMENT 'Array type in production Doris', + -- metadata would be JSONB in production + metadata STRING COMMENT 'JSONB type in production Doris', + created_at DATETIME DEFAULT CURRENT_TIMESTAMP +) +DUPLICATE KEY(id) +DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); + +-- Create a view +CREATE VIEW IF NOT EXISTS customer_orders AS +SELECT + c.customer_id, + c.customer_name, + COUNT(o.order_id) as order_count, + SUM(o.total_amount) as total_spent +FROM customers c +LEFT JOIN orders o ON c.customer_id = o.customer_id +GROUP BY c.customer_id, c.customer_name; + +-- Insert sample data +INSERT INTO customers (customer_id, customer_name, email, country) VALUES +(1, 'Alice Smith', 'alice@example.com', 'USA'), +(2, 'Bob Johnson', 'bob@example.com', 'UK'), +(3, 'Charlie Brown', 'charlie@example.com', 'Canada'); + +INSERT INTO orders (order_id, customer_id, order_date, total_amount, status) VALUES +(1001, 1, '2024-01-15', 150.00, 'completed'), +(1002, 1, '2024-02-20', 200.00, 'completed'), +(1003, 2, '2024-01-25', 75.50, 'completed'), +(1004, 3, '2024-03-01', 300.00, 'pending'); + +INSERT INTO analytics_data (id, name, tags, metadata) VALUES +(1, 'Test Record', 'tag1,tag2', '{"key":"value"}'); diff --git a/metadata-ingestion/tests/integration/doris/test_doris.py b/metadata-ingestion/tests/integration/doris/test_doris.py new file mode 100644 index 0000000000000..3a2a275000804 --- /dev/null +++ 
b/metadata-ingestion/tests/integration/doris/test_doris.py @@ -0,0 +1,149 @@ +import os +import subprocess +import time + +import pytest +from freezegun import freeze_time + +from datahub.ingestion.source.sql.doris import DorisSource +from datahub.testing import mce_helpers +from tests.test_helpers import test_connection_helpers +from tests.test_helpers.click_helpers import run_datahub_cmd +from tests.test_helpers.docker_helpers import wait_for_port + +FROZEN_TIME = "2020-04-14 07:00:00" +DORIS_PORT = 9030 # Doris MySQL protocol port + +# Note: Doris FE 3.0.8 uses Java 17 which has cgroup v2 incompatibility issues in CI +# Workaround: JAVA_OPTS=-XX:-UseContainerSupport is explicitly exported in entrypoint scripts +pytestmark = pytest.mark.integration_batch_4 + + +@pytest.fixture(scope="module") +def test_resources_dir(pytestconfig): + return pytestconfig.rootpath / "tests/integration/doris" + + +def is_doris_up(container_name: str) -> bool: + """Check if Doris FE is responsive via MySQL protocol connection""" + # The most reliable way to check if Doris is ready is to try connecting via MySQL protocol + # If we can execute a query, FE is fully operational + mysql_cmd = f"docker exec {container_name}-fe mysql -h 127.0.0.1 -P 9030 -u root -e 'SELECT 1' 2>/dev/null" + result = subprocess.run(mysql_cmd, shell=True) + return result.returncode == 0 + + +@pytest.fixture(scope="module") +def doris_runner(docker_compose_runner, pytestconfig, test_resources_dir): + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "doris" + ) as docker_services: + # Wait for Doris FE to be ready + print("Waiting for Doris FE to start...") + try: + wait_for_port( + docker_services, + "testdoris-fe", + DORIS_PORT, + timeout=400, # Longer timeout for CI (includes image pull) + checker=lambda: is_doris_up("testdoris"), + ) + print("Doris FE is ready!") + except Exception: + # Print logs for debugging + print("ERROR: Doris FE failed to start. 
Container logs:") + subprocess.run("docker logs testdoris-fe 2>&1 | tail -50", shell=True) + raise + + # Give BE extra time to register with FE and be fully ready + be_wait = 120 if os.getenv("CI") == "true" else 60 + print( + f"Waiting {be_wait}s for BE to register with FE and cluster to stabilize..." + ) + time.sleep(be_wait) + + # Run setup script to create database and tables + setup_sql = test_resources_dir / "setup" / "setup.sql" + setup_cmd = f"docker exec -i testdoris-fe mysql -h 127.0.0.1 -P {DORIS_PORT} -u root < {setup_sql}" + + # Retry setup a few times as BE might still be registering + for attempt in range(5): + result = subprocess.run( + setup_cmd, shell=True, capture_output=True, text=True + ) + if result.returncode == 0: + print("Setup script executed successfully") + break + print(f"Setup attempt {attempt + 1}/5 failed: {result.stderr}") + if attempt < 4: + time.sleep(15) + else: + print("WARNING: Setup script failed after 5 attempts") + # Print BE logs for debugging + subprocess.run("docker logs testdoris-be 2>&1 | tail -30", shell=True) + + yield docker_services + + +@pytest.mark.parametrize( + "config_file,golden_file", + [ + ("doris_to_file.yml", "doris_mces_golden.json"), + ], +) +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_doris_ingest( + doris_runner, + pytestconfig, + test_resources_dir, + tmp_path, + mock_time, + config_file, + golden_file, +): + # Run the metadata ingestion pipeline. + config_file = (test_resources_dir / config_file).resolve() + run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path) + + # Verify the output. 
+ mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "doris_mces.json", + golden_path=test_resources_dir / golden_file, + ) + + +@pytest.mark.parametrize( + "config_dict, is_success", + [ + ( + { + "host_port": "localhost:59030", + "database": "dorisdb", + "username": "root", + "password": "", + }, + True, + ), + ( + { + "host_port": "localhost:59999", + "database": "wrong_db", + "username": "wrong_user", + "password": "wrong_pass", + }, + False, + ), + ], +) +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_doris_test_connection(doris_runner, config_dict, is_success): + report = test_connection_helpers.run_test_connection(DorisSource, config_dict) + if is_success: + test_connection_helpers.assert_basic_connectivity_success(report) + else: + test_connection_helpers.assert_basic_connectivity_failure( + report, "Connection refused" + ) diff --git a/metadata-ingestion/tests/unit/test_doris_source.py b/metadata-ingestion/tests/unit/test_doris_source.py new file mode 100644 index 0000000000000..e9bd0f0ea4cab --- /dev/null +++ b/metadata-ingestion/tests/unit/test_doris_source.py @@ -0,0 +1,85 @@ +from sqlalchemy.dialects.mysql import base + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.sql.doris import ( + BITMAP, + DORIS_ARRAY, + HLL, + JSONB, + QUANTILE_STATE, + DorisConfig, + DorisSource, +) +from datahub.ingestion.source.sql.mysql import MySQLConfig, MySQLSource +from datahub.ingestion.source.sql.sql_common import _field_type_mapping +from datahub.metadata.schema_classes import ( + ArrayTypeClass, + BytesTypeClass, + RecordTypeClass, +) + + +def test_platform_correctly_set_doris(): + source = DorisSource( + ctx=PipelineContext(run_id="doris-source-test"), + config=DorisConfig(), + ) + assert source.platform == "doris" + + +def test_doris_stored_procedures_disabled_by_default(): + """Test that stored procedures are disabled by default for Doris""" + config = DorisConfig() + assert 
config.include_stored_procedures is False + + +def test_mysql_stored_procedures_enabled_by_default(): + """Test that stored procedures are enabled by default for MySQL""" + config = MySQLConfig() + assert config.include_stored_procedures is True + + +def test_platform_correctly_set_mysql(): + source = MySQLSource( + ctx=PipelineContext(run_id="mysql-source-test"), + config=MySQLConfig(), + ) + assert source.platform == "mysql" + + +def test_doris_custom_types_registered(): + """Test that Doris-specific types are properly registered with SQLAlchemy""" + # Test that custom types are registered in MySQL dialect + assert "hll" in base.ischema_names + assert "bitmap" in base.ischema_names + assert "array" in base.ischema_names + assert "jsonb" in base.ischema_names + assert "quantile_state" in base.ischema_names + + # Test case insensitive versions + assert "HLL" in base.ischema_names + assert "BITMAP" in base.ischema_names + assert "ARRAY" in base.ischema_names + assert "JSONB" in base.ischema_names + assert "QUANTILE_STATE" in base.ischema_names + + # Verify they map to the correct SQLAlchemy types + assert base.ischema_names["hll"] == HLL + assert base.ischema_names["bitmap"] == BITMAP + assert base.ischema_names["array"] == DORIS_ARRAY + assert base.ischema_names["jsonb"] == JSONB + assert base.ischema_names["quantile_state"] == QUANTILE_STATE + + +def test_doris_custom_types_mapped_to_datahub_types(): + """Test that Doris custom types map to appropriate DataHub types""" + # HLL and BITMAP should map to BytesTypeClass + assert _field_type_mapping[HLL] == BytesTypeClass + assert _field_type_mapping[BITMAP] == BytesTypeClass + assert _field_type_mapping[QUANTILE_STATE] == BytesTypeClass + + # ARRAY should map to ArrayTypeClass + assert _field_type_mapping[DORIS_ARRAY] == ArrayTypeClass + + # JSONB should map to RecordTypeClass (like JSON) + assert _field_type_mapping[JSONB] == RecordTypeClass diff --git 
a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml index 9882c6af4537c..035df02160eef 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml @@ -60,6 +60,16 @@ displayName: CockroachDb type: RELATIONAL_DB logoUrl: "assets/platforms/cockroachdblogo.png" +- entityUrn: urn:li:dataPlatform:doris + entityType: dataPlatform + aspectName: dataPlatformInfo + changeType: UPSERT + aspect: + datasetNameDelimiter: "." + name: doris + displayName: Apache Doris + type: OLAP_DATASTORE + logoUrl: "assets/platforms/dorislogo.png" - entityUrn: urn:li:dataPlatform:couchbase entityType: dataPlatform aspectName: dataPlatformInfo