
Commit 9460ad6

bsolomon1124 authored and max-sixty committed
BUG: Ensure table_schema arg not modified inplace (#278)
Fixes #277. If any dictionary entry in the `table_schema` arg did not contain a "mode" key, a mode="NULLABLE" key-value pair was added to it; because `schema.update_schema()` returns an object holding references to its mutable input, the caller's argument ended up being modified by the function rather than by the caller. This pattern appeared in both gbq.py and load.py, so it was refactored into a helper function in schema.py, which now returns a modified *copy*. A deep copy is used because the input is a list of dictionaries, so a shallow copy would be insufficient.
1 parent 59228d9 commit 9460ad6
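
The failure mode is easy to see in isolation. Below is a minimal sketch (illustrative only, not the library's code) of why a shallow copy of a list of dictionaries is not enough to protect the caller's argument:

import copy

table_schema = [{"name": "field2", "type": "INTEGER"}]

# Shallow copy: the outer list is new, but the inner dicts are shared,
# so setting a default "mode" leaks back into the caller's argument.
shallow = list(table_schema)
shallow[0].setdefault("mode", "NULLABLE")
assert "mode" in table_schema[0]  # caller's schema was mutated

# Deep copy: the nested dicts are duplicated too, so the caller's
# argument stays untouched.
table_schema = [{"name": "field2", "type": "INTEGER"}]
deep = copy.deepcopy(table_schema)
deep[0].setdefault("mode", "NULLABLE")
assert "mode" not in table_schema[0]  # caller's schema is intact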

File tree

4 files changed: 62 additions, 12 deletions

    pandas_gbq/gbq.py
    pandas_gbq/load.py
    pandas_gbq/schema.py
    tests/unit/test_gbq.py


pandas_gbq/gbq.py

Lines changed: 2 additions & 6 deletions

@@ -13,6 +13,7 @@
     bigquery_storage_v1beta1 = None

 from pandas_gbq.exceptions import AccessDenied
+import pandas_gbq.schema

 logger = logging.getLogger(__name__)

@@ -1269,12 +1270,7 @@ def create(self, table_id, schema):
         table_ref = self.client.dataset(self.dataset_id).table(table_id)
         table = Table(table_ref)

-        # Manually create the schema objects, adding NULLABLE mode
-        # as a workaround for
-        # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456
-        for field in schema["fields"]:
-            if "mode" not in field:
-                field["mode"] = "NULLABLE"
+        schema = pandas_gbq.schema.add_default_nullable_mode(schema)

         table.schema = [
             SchemaField.from_api_repr(field) for field in schema["fields"]
pandas_gbq/load.py

Lines changed: 1 addition & 6 deletions

@@ -66,12 +66,7 @@ def load_chunks(
     if schema is None:
         schema = pandas_gbq.schema.generate_bq_schema(dataframe)

-    # Manually create the schema objects, adding NULLABLE mode
-    # as a workaround for
-    # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456
-    for field in schema["fields"]:
-        if "mode" not in field:
-            field["mode"] = "NULLABLE"
+    schema = pandas_gbq.schema.add_default_nullable_mode(schema)

     job_config.schema = [
         bigquery.SchemaField.from_api_repr(field) for field in schema["fields"]

pandas_gbq/schema.py

Lines changed: 15 additions & 0 deletions

@@ -1,5 +1,7 @@
 """Helper methods for BigQuery schemas"""

+import copy
+

 def generate_bq_schema(dataframe, default_type="STRING"):
     """Given a passed dataframe, generate the associated Google BigQuery schema.
@@ -62,3 +64,16 @@ def update_schema(schema_old, schema_new):
             output_fields.append(field)

     return {"fields": output_fields}
+
+
+def add_default_nullable_mode(schema):
+    """Manually create the schema objects, adding NULLABLE mode."""
+    # Workaround for:
+    # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456
+    #
+    # Returns a copy rather than modifying the mutable arg,
+    # per Issue #277
+    result = copy.deepcopy(schema)
+    for field in result["fields"]:
+        field.setdefault("mode", "NULLABLE")
+    return result
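
For reference, the new helper can be exercised on its own. A quick sketch of the intended behavior, assuming `pandas_gbq.schema` is importable from this branch:

import pandas_gbq.schema

schema = {"fields": [{"name": "field1", "type": "STRING"}]}
filled = pandas_gbq.schema.add_default_nullable_mode(schema)

# The returned copy has the default mode filled in...
assert filled["fields"][0]["mode"] == "NULLABLE"
# ...while the caller's original schema dict is left untouched.
assert "mode" not in schema["fields"][0]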

tests/unit/test_gbq.py

Lines changed: 44 additions & 0 deletions

@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-

+import copy
+import datetime
 from unittest import mock

 import numpy
@@ -477,3 +479,45 @@ def test_generate_bq_schema_deprecated():
     with pytest.warns(FutureWarning):
         df = DataFrame([[1, "two"], [3, "four"]])
         gbq.generate_bq_schema(df)
+
+
+def test_load_does_not_modify_schema_arg():
+    # Test of Issue # 277
+    df = DataFrame(
+        {
+            "field1": ["a", "b"],
+            "field2": [1, 2],
+            "field3": [datetime.date(2019, 1, 1), datetime.date(2019, 5, 1)],
+        }
+    )
+    original_schema = [
+        {"name": "field1", "type": "STRING", "mode": "REQUIRED"},
+        {"name": "field2", "type": "INTEGER"},
+        {"name": "field3", "type": "DATE"},
+    ]
+    original_schema_cp = copy.deepcopy(original_schema)
+    gbq.to_gbq(
+        df,
+        "dataset.schematest",
+        project_id="my-project",
+        table_schema=original_schema,
+        if_exists="fail",
+    )
+    assert original_schema == original_schema_cp
+
+    # Test again now that table exists - behavior will differ internally
+    # branch at if table.exists(table_id)
+    original_schema = [
+        {"name": "field1", "type": "STRING", "mode": "REQUIRED"},
+        {"name": "field2", "type": "INTEGER"},
+        {"name": "field3", "type": "DATE"},
+    ]
+    original_schema_cp = copy.deepcopy(original_schema)
+    gbq.to_gbq(
+        df,
+        "dataset.schematest",
+        project_id="my-project",
+        table_schema=original_schema,
+        if_exists="append",
+    )
+    assert original_schema == original_schema_cp
