22import re
33import time
44import webbrowser
5- from typing import List , Optional , Dict
5+ from typing import List , Optional , Dict , Tuple , Union
66import keyring
7-
87import pydantic
98import rich
109from rich .prompt import Confirm , Prompt
1110
11+ from data_diff .errors import DataDiffCustomSchemaNoConfigError , DataDiffDbtProjectVarsNotFoundError
12+
1213from . import connect_to_table , diff_tables , Algorithm
1314from .cloud import DatafoldAPI , TCloudApiDataDiff , TCloudApiOrgMeta , get_or_create_data_source
14- from .dbt_parser import DbtParser , PROJECT_FILE
15+ from .dbt_parser import DbtParser , PROJECT_FILE , TDatadiffConfig
1516from .tracking import (
1617 bool_ask_for_email ,
1718 create_email_signup_event_json ,
@@ -55,22 +56,21 @@ def dbt_diff(
5556 project_dir_override : Optional [str ] = None ,
5657 is_cloud : bool = False ,
5758 dbt_selection : Optional [str ] = None ,
59+ state : Optional [str ] = None ,
5860) -> None :
5961 print_version_info ()
6062 diff_threads = []
6163 set_entrypoint_name ("CLI-dbt" )
62- dbt_parser = DbtParser (profiles_dir_override , project_dir_override )
64+ dbt_parser = DbtParser (profiles_dir_override , project_dir_override , state )
6365 models = dbt_parser .get_models (dbt_selection )
64- datadiff_variables = dbt_parser .get_datadiff_variables ()
65- config_prod_database = datadiff_variables .get ("prod_database" )
66- config_prod_schema = datadiff_variables .get ("prod_schema" )
67- config_prod_custom_schema = datadiff_variables .get ("prod_custom_schema" )
68- datasource_id = datadiff_variables .get ("datasource_id" )
66+ config = dbt_parser .get_datadiff_config ()
6967 _initialize_events (dbt_parser .dbt_user_id , dbt_parser .dbt_version , dbt_parser .dbt_project_id )
7068
71- if datadiff_variables .get ("custom_schemas" ) is not None :
72- logger .warning (
73- "vars: data_diff: custom_schemas: is no longer used and can be removed.\n To utilize custom schemas, see the documentation here: https://docs.datafold.com/development_testing/open_source"
69+
70+ if not state and not (config .prod_database or config .prod_schema ):
71+ doc_url = "https://docs.datafold.com/development_testing/open_source#configure-your-dbt-project"
72+ raise DataDiffDbtProjectVarsNotFoundError (
73+ f"""vars: data_diff: section not found in dbt_project.yml.\n \n To solve this, please configure your dbt project: \n { doc_url } \n \n Or specify a production manifest using the `--state` flag."""
7474 )
7575
7676 if is_cloud :
@@ -80,13 +80,13 @@ def dbt_diff(
8080 return
8181 org_meta = api .get_org_meta ()
8282
83- if datasource_id is None :
83+ if config . datasource_id is None :
8484 rich .print ("[red]Data source ID not found in dbt_project.yml" )
8585 is_create_data_source = Confirm .ask ("Would you like to create a new data source?" )
8686 if is_create_data_source :
87- datasource_id = get_or_create_data_source (api = api , dbt_parser = dbt_parser )
87+ config . datasource_id = get_or_create_data_source (api = api , dbt_parser = dbt_parser )
8888 rich .print (f'To use the data source in next runs, please, update your "{ PROJECT_FILE } " with a block:' )
89- rich .print (f"[green]vars:\n data_diff:\n datasource_id: { datasource_id } \n " )
89+ rich .print (f"[green]vars:\n data_diff:\n datasource_id: { config . datasource_id } \n " )
9090 rich .print (
9191 "Read more about Datafold vars in docs: "
9292 "https://docs.datafold.com/os_diff/dbt_integration/#configure-a-data-source\n "
@@ -97,21 +97,29 @@ def dbt_diff(
9797 "\n vars:\n data_diff:\n datasource_id: 1234"
9898 )
9999
100- data_source = api .get_data_source (datasource_id )
100+ data_source = api .get_data_source (config . datasource_id )
101101 dbt_parser .set_casing_policy_for (connection_type = data_source .type )
102102 rich .print ("[green][bold]\n Diffs in progress...[/][/]\n " )
103103
104104 else :
105105 dbt_parser .set_connection ()
106106
107107 for model in models :
108- diff_vars = _get_diff_vars (
109- dbt_parser , config_prod_database , config_prod_schema , config_prod_custom_schema , model
110- )
108+ diff_vars = _get_diff_vars (dbt_parser , config , model )
109+
110+ # we won't always have a prod path when using state
111+ # when the model DNE in prod manifest, skip the model diff
112+ if (
113+ state and len (diff_vars .prod_path ) < 2
114+ ): # < 2 because some providers like databricks can legitimately have *only* 2
115+ diff_output_str = _diff_output_base ("." .join (diff_vars .dev_path ), "." .join (diff_vars .prod_path ))
116+ diff_output_str += "[green]New model: nothing to diff![/] \n "
117+ rich .print (diff_output_str )
118+ continue
111119
112120 if diff_vars .primary_keys :
113121 if is_cloud :
114- diff_thread = run_as_daemon (_cloud_diff , diff_vars , datasource_id , api , org_meta )
122+ diff_thread = run_as_daemon (_cloud_diff , diff_vars , config . datasource_id , api , org_meta )
115123 diff_threads .append (diff_thread )
116124 else :
117125 _local_diff (diff_vars )
@@ -129,41 +137,19 @@ def dbt_diff(
129137
130138def _get_diff_vars (
131139 dbt_parser : "DbtParser" ,
132- config_prod_database : Optional [str ],
133- config_prod_schema : Optional [str ],
134- config_prod_custom_schema : Optional [str ],
140+ config : TDatadiffConfig ,
135141 model ,
136142) -> TDiffVars :
137143 dev_database = model .database
138144 dev_schema = model .schema_
139145
140146 primary_keys = dbt_parser .get_pk_from_model (model , dbt_parser .unique_columns , "primary-key" )
141147
142- # "custom" dbt config database
143- if model .config .database :
144- prod_database = model .config .database
145- elif config_prod_database :
146- prod_database = config_prod_database
148+ # prod path is constructed via configuration or the prod manifest via --state
149+ if dbt_parser .prod_manifest_obj :
150+ prod_database , prod_schema = _get_prod_path_from_manifest (model , dbt_parser .prod_manifest_obj )
147151 else :
148- prod_database = dev_database
149-
150- # prod schema name differs from dev schema name
151- if config_prod_schema :
152- custom_schema = model .config .schema_
153-
154- # the model has a custom schema config(schema='some_schema')
155- if custom_schema :
156- if not config_prod_custom_schema :
157- raise ValueError (
158- f"Found a custom schema on model { model .name } , but no value for\n vars:\n data_diff:\n prod_custom_schema:\n Please set a value!\n "
159- + "For more details see: https://docs.datafold.com/development_testing/open_source"
160- )
161- prod_schema = config_prod_custom_schema .replace ("<custom_schema>" , custom_schema )
162- # no custom schema, use the default
163- else :
164- prod_schema = config_prod_schema
165- else :
166- prod_schema = dev_schema
152+ prod_database , prod_schema = _get_prod_path_from_config (config , model , dev_database , dev_schema )
167153
168154 if dbt_parser .requires_upper :
169155 dev_qualified_list = [x .upper () for x in [dev_database , dev_schema , model .alias ] if x ]
@@ -187,6 +173,45 @@ def _get_diff_vars(
187173 )
188174
189175
176+ def _get_prod_path_from_config (config , model , dev_database , dev_schema ) -> Tuple [str , str ]:
177+ # "custom" dbt config database
178+ if model .config .database :
179+ prod_database = model .config .database
180+ elif config .prod_database :
181+ prod_database = config .prod_database
182+ else :
183+ prod_database = dev_database
184+
185+ # prod schema name differs from dev schema name
186+ if config .prod_schema :
187+ custom_schema = model .config .schema_
188+
189+ # the model has a custom schema config(schema='some_schema')
190+ if custom_schema :
191+ if not config .prod_custom_schema :
192+ raise DataDiffCustomSchemaNoConfigError (
193+ f"Found a custom schema on model { model .name } , but no value for\n vars:\n data_diff:\n prod_custom_schema:\n Please set a value or utilize the `--state` flag!\n \n "
194+ + "For more details see: https://docs.datafold.com/development_testing/open_source"
195+ )
196+ prod_schema = config .prod_custom_schema .replace ("<custom_schema>" , custom_schema )
197+ # no custom schema, use the default
198+ else :
199+ prod_schema = config .prod_schema
200+ else :
201+ prod_schema = dev_schema
202+ return prod_database , prod_schema
203+
204+
205+ def _get_prod_path_from_manifest (model , prod_manifest ) -> Union [Tuple [str , str ], Tuple [None , None ]]:
206+ prod_database = None
207+ prod_schema = None
208+ prod_model = prod_manifest .nodes .get (model .unique_id , None )
209+ if prod_model :
210+ prod_database = prod_model .database
211+ prod_schema = prod_model .schema_
212+ return prod_database , prod_schema
213+
214+
190215def _local_diff (diff_vars : TDiffVars ) -> None :
191216 dev_qualified_str = "." .join (diff_vars .dev_path )
192217 prod_qualified_str = "." .join (diff_vars .prod_path )
0 commit comments