Skip to content

Commit a529505

Browse files
chalmerlowe, google-labs-jules[bot], gcf-owl-bot[bot]
authored
feat: Allow loading a DataFrame even if the provided BigQuery schema includes columns not in the DataFrame (#953)
* feat: Allow loading a DataFrame with a subset of BigQuery columns

  This change modifies the behavior when a DataFrame is loaded to BigQuery with a schema that contains fields not present in the DataFrame. Instead of raising a `ValueError`, a `UserWarning` is now issued, and the extra fields are appended to the schema. This allows for more flexible data loading scenarios.

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 48a91df commit a529505

File tree

2 files changed

+33
-13
lines changed

2 files changed

+33
-13
lines changed

pandas_gbq/schema/pandas_to_bigquery.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -139,14 +139,16 @@ def dataframe_to_bigquery_fields(
139139
bq_schema_out.append(bq_field)
140140
unknown_type_fields.append(bq_field)
141141

142-
# Catch any schema mismatch. The developer explicitly asked to serialize a
143-
# column, but it was not found.
142+
# Append any fields from the BigQuery schema that are not in the
143+
# DataFrame.
144144
if override_fields_unused:
145-
raise ValueError(
146-
"Provided BigQuery fields contain field(s) not present in DataFrame: {}".format(
147-
override_fields_unused
148-
)
145+
warnings.warn(
146+
"Provided BigQuery fields contain field(s) not present in "
147+
"DataFrame: {}".format(sorted(override_fields_unused)),
148+
UserWarning,
149149
)
150+
for field_name in sorted(override_fields_unused):
151+
bq_schema_out.append(override_fields_by_name[field_name])
150152

151153
# If schema detection was not successful for all columns, also try with
152154
# pyarrow, if available.

tests/unit/schema/test_pandas_to_bigquery.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -179,16 +179,34 @@ def test_dataframe_to_bigquery_fields_fallback_needed_w_pyarrow(module_under_tes
179179

180180

181181
def test_dataframe_to_bigquery_fields_w_extra_fields(module_under_test):
182-
with pytest.raises(ValueError) as exc_context:
183-
module_under_test.dataframe_to_bigquery_fields(
184-
pandas.DataFrame(),
185-
override_bigquery_fields=(schema.SchemaField("not_in_df", "STRING"),),
182+
dataframe = pandas.DataFrame({"in_df": [1, 2, 3]})
183+
bq_schema = (
184+
schema.SchemaField("in_df", "INTEGER"),
185+
schema.SchemaField("not_in_df", "STRING"),
186+
schema.SchemaField("also_not_in_df", "INTEGER"),
187+
)
188+
189+
with pytest.warns(UserWarning) as record:
190+
returned_schema = module_under_test.dataframe_to_bigquery_fields(
191+
dataframe, override_bigquery_fields=bq_schema
186192
)
187-
message = str(exc_context.value)
193+
194+
assert len(record) == 1
195+
message = str(record[0].message)
188196
assert (
189-
"Provided BigQuery fields contain field(s) not present in DataFrame:" in message
197+
"Provided BigQuery fields contain field(s) not present in DataFrame" in message
190198
)
191-
assert "not_in_df" in message
199+
# Note: The field names are sorted in the warning message.
200+
assert "['also_not_in_df', 'not_in_df']" in message
201+
202+
expected_schema = (
203+
schema.SchemaField("in_df", "INTEGER"),
204+
# Note: The fields are sorted by name as they are added from the set of
205+
# unused fields.
206+
schema.SchemaField("also_not_in_df", "INTEGER"),
207+
schema.SchemaField("not_in_df", "STRING"),
208+
)
209+
assert returned_schema == expected_schema
192210

193211

194212
def test_dataframe_to_bigquery_fields_geography(module_under_test):

0 commit comments

Comments
 (0)