diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index d40217c4d..76c4f1fbd 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -21,8 +21,9 @@ import math import re import os +import textwrap import warnings -from typing import Optional, Union, Any, Tuple, Type +from typing import Any, Optional, Tuple, Type, Union from dateutil import relativedelta from google.cloud._helpers import UTC # type: ignore @@ -133,243 +134,320 @@ def _not_null(value, field): return value is not None or (field is not None and field.mode != "NULLABLE") -def _int_from_json(value, field): - """Coerce 'value' to an int, if set or not nullable.""" - if _not_null(value, field): - return int(value) - - -def _interval_from_json( - value: Optional[str], field -) -> Optional[relativedelta.relativedelta]: - """Coerce 'value' to an interval, if set or not nullable.""" - if not _not_null(value, field): - return None - if value is None: - raise TypeError(f"got {value} for REQUIRED field: {repr(field)}") - - parsed = _INTERVAL_PATTERN.match(value) - if parsed is None: - raise ValueError(f"got interval: '{value}' with unexpected format") - - calendar_sign = -1 if parsed.group("calendar_sign") == "-" else 1 - years = calendar_sign * int(parsed.group("years")) - months = calendar_sign * int(parsed.group("months")) - days = int(parsed.group("days")) - time_sign = -1 if parsed.group("time_sign") == "-" else 1 - hours = time_sign * int(parsed.group("hours")) - minutes = time_sign * int(parsed.group("minutes")) - seconds = time_sign * int(parsed.group("seconds")) - fraction = parsed.group("fraction") - microseconds = time_sign * int(fraction.ljust(6, "0")[:6]) if fraction else 0 - - return relativedelta.relativedelta( - years=years, - months=months, - days=days, - hours=hours, - minutes=minutes, - seconds=seconds, - microseconds=microseconds, - ) - - -def _float_from_json(value, field): - """Coerce 'value' to a float, if set or not nullable.""" - if _not_null(value, field): - return float(value) +class CellDataParser: + """Converter from BigQuery REST resource to Python value for RowIterator and similar classes. + See: "rows" field of + https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/list and + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/getQueryResults. + """ -def _decimal_from_json(value, field): - """Coerce 'value' to a Decimal, if set or not nullable.""" - if _not_null(value, field): - return decimal.Decimal(value) - + def to_py(self, resource, field): + def default_converter(value, field): + _warn_unknown_field_type(field) + return value -def _bool_from_json(value, field): - """Coerce 'value' to a bool, if set or not nullable.""" - if _not_null(value, field): - return value.lower() in ["t", "true", "1"] + converter = getattr( + self, f"{field.field_type.lower()}_to_py", default_converter + ) + if field.mode == "REPEATED": + return [converter(item["v"], field) for item in resource] + else: + return converter(resource, field) + + def bool_to_py(self, value, field): + """Coerce 'value' to a bool, if set or not nullable.""" + if _not_null(value, field): + # TODO(tswast): Why does _not_null care if the field is NULLABLE or + # REQUIRED? Do we actually need such client-side validation? 
+ if value is None: + raise TypeError(f"got None for required boolean field {field}") + return value.lower() in ("t", "true", "1") + + def boolean_to_py(self, value, field): + """Coerce 'value' to a bool, if set or not nullable.""" + return self.bool_to_py(value, field) + + def integer_to_py(self, value, field): + """Coerce 'value' to an int, if set or not nullable.""" + if _not_null(value, field): + return int(value) + + def int64_to_py(self, value, field): + """Coerce 'value' to an int, if set or not nullable.""" + return self.integer_to_py(value, field) + + def interval_to_py( + self, value: Optional[str], field + ) -> Optional[relativedelta.relativedelta]: + """Coerce 'value' to an interval, if set or not nullable.""" + if not _not_null(value, field): + return None + if value is None: + raise TypeError(f"got {value} for REQUIRED field: {repr(field)}") + + parsed = _INTERVAL_PATTERN.match(value) + if parsed is None: + raise ValueError( + textwrap.dedent( + f""" + Got interval: '{value}' with unexpected format. + Expected interval in canonical format of "[sign]Y-M [sign]D [sign]H:M:S[.F]". + See: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#interval_type + for more information. + """ + ), + ) + calendar_sign = -1 if parsed.group("calendar_sign") == "-" else 1 + years = calendar_sign * int(parsed.group("years")) + months = calendar_sign * int(parsed.group("months")) + days = int(parsed.group("days")) + time_sign = -1 if parsed.group("time_sign") == "-" else 1 + hours = time_sign * int(parsed.group("hours")) + minutes = time_sign * int(parsed.group("minutes")) + seconds = time_sign * int(parsed.group("seconds")) + fraction = parsed.group("fraction") + microseconds = time_sign * int(fraction.ljust(6, "0")[:6]) if fraction else 0 + + return relativedelta.relativedelta( + years=years, + months=months, + days=days, + hours=hours, + minutes=minutes, + seconds=seconds, + microseconds=microseconds, + ) -def _string_from_json(value, _): - """NOOP string -> string coercion""" - return value + def float_to_py(self, value, field): + """Coerce 'value' to a float, if set or not nullable.""" + if _not_null(value, field): + return float(value) + def float64_to_py(self, value, field): + """Coerce 'value' to a float, if set or not nullable.""" + return self.float_to_py(value, field) -def _bytes_from_json(value, field): - """Base64-decode value""" - if _not_null(value, field): - return base64.standard_b64decode(_to_bytes(value)) + def numeric_to_py(self, value, field): + """Coerce 'value' to a Decimal, if set or not nullable.""" + if _not_null(value, field): + return decimal.Decimal(value) + def bignumeric_to_py(self, value, field): + """Coerce 'value' to a Decimal, if set or not nullable.""" + return self.numeric_to_py(value, field) -def _timestamp_from_json(value, field): - """Coerce 'value' to a datetime, if set or not nullable.""" - if _not_null(value, field): - # value will be a integer in seconds, to microsecond precision, in UTC. - return _datetime_from_microseconds(int(value)) + def string_to_py(self, value, _): + """NOOP string -> string coercion""" + return value + def geography_to_py(self, value, _): + """NOOP string -> string coercion""" + return value -def _timestamp_query_param_from_json(value, field): - """Coerce 'value' to a datetime, if set or not nullable. 
+    def bytes_to_py(self, value, field):
+        """Base64-decode value"""
+        if _not_null(value, field):
+            return base64.standard_b64decode(_to_bytes(value))
+
+    def timestamp_to_py(self, value, field):
+        """Coerce 'value' to a datetime, if set or not nullable."""
+        if _not_null(value, field):
+            # value will be an integer in seconds, to microsecond precision, in UTC.
+            return _datetime_from_microseconds(int(value))
+
+    def datetime_to_py(self, value, field):
+        """Coerce 'value' to a datetime, if set or not nullable.
+
+        Args:
+            value (str): The timestamp.
+            field (google.cloud.bigquery.schema.SchemaField):
+                The field corresponding to the value.
+
+        Returns:
+            Optional[datetime.datetime]:
+                The parsed datetime object from
+                ``value`` if the ``field`` is not null (otherwise it is
+                :data:`None`).
+        """
+        if _not_null(value, field):
+            if "." in value:
+                # YYYY-MM-DDTHH:MM:SS.ffffff
+                return datetime.datetime.strptime(value, _RFC3339_MICROS_NO_ZULU)
+            else:
+                # YYYY-MM-DDTHH:MM:SS
+                return datetime.datetime.strptime(value, _RFC3339_NO_FRACTION)
+        else:
+            return None
 
-    Args:
-        value (str): The timestamp.
+    def date_to_py(self, value, field):
+        """Coerce 'value' to a datetime date, if set or not nullable"""
+        if _not_null(value, field):
+            # value will be a string, in YYYY-MM-DD form.
+            return _date_from_iso8601_date(value)
+
+    def time_to_py(self, value, field):
+        """Coerce 'value' to a datetime time, if set or not nullable"""
+        if _not_null(value, field):
+            if len(value) == 8:  # HH:MM:SS
+                fmt = _TIMEONLY_WO_MICROS
+            elif len(value) == 15:  # HH:MM:SS.micros
+                fmt = _TIMEONLY_W_MICROS
+            else:
+                raise ValueError(
+                    textwrap.dedent(
+                        f"""
+                        Got {repr(value)} with unknown time format.
+                        Expected HH:MM:SS or HH:MM:SS.micros. See
+                        https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type
+                        for more information.
+                        """
+                    ),
+                )
+            return datetime.datetime.strptime(value, fmt).time()
+
+    def record_to_py(self, value, field):
+        """Coerce 'value' to a mapping, if set or not nullable."""
+        if _not_null(value, field):
+            record = {}
+            record_iter = zip(field.fields, value["f"])
+            for subfield, cell in record_iter:
+                record[subfield.name] = self.to_py(cell["v"], subfield)
+            return record
+
+    def struct_to_py(self, value, field):
+        """Coerce 'value' to a mapping, if set or not nullable."""
+        return self.record_to_py(value, field)
+
+    def json_to_py(self, value, field):
+        """Coerce 'value' to a Pythonic JSON representation."""
+        if _not_null(value, field):
+            return json.loads(value)
+        else:
+            return None
 
-    field (google.cloud.bigquery.schema.SchemaField):
-        The field corresponding to the value.
+    def _range_element_to_py(self, value, field_element_type):
+        """Coerce 'value' to a range element value."""
+        # Avoid circular imports by importing here.
+        from google.cloud.bigquery import schema
 
-    Returns:
-        Optional[datetime.datetime]:
-            The parsed datetime object from
-            ``value`` if the ``field`` is not null (otherwise it is
-            :data:`None`).
-    """
-    if _not_null(value, field):
-        # Canonical formats for timestamps in BigQuery are flexible. See:
-        # g.co/cloud/bigquery/docs/reference/standard-sql/data-types#timestamp-type
-        # The separator between the date and time can be 'T' or ' '.
-        value = value.replace(" ", "T", 1)
-        # The UTC timezone may be formatted as Z or +00:00.
-        value = value.replace("Z", "")
-        value = value.replace("+00:00", "")
-
-        if "." in value:
-            # YYYY-MM-DDTHH:MM:SS.ffffff
-            return datetime.datetime.strptime(value, _RFC3339_MICROS_NO_ZULU).replace(
-                tzinfo=UTC
+        if value == "UNBOUNDED":
+            return None
+        if field_element_type.element_type in _SUPPORTED_RANGE_ELEMENTS:
+            return self.to_py(
+                value,
+                schema.SchemaField("placeholder", field_element_type.element_type),
             )
         else:
-            # YYYY-MM-DDTHH:MM:SS
-            return datetime.datetime.strptime(value, _RFC3339_NO_FRACTION).replace(
-                tzinfo=UTC
+            raise ValueError(
+                textwrap.dedent(
+                    f"""
+                    Got unsupported range element type: {field_element_type.element_type}.
+                    Expected one of {repr(_SUPPORTED_RANGE_ELEMENTS)}. See:
+                    https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declare_a_range_type
+                    for more information.
+                    """
+                ),
             )
-    else:
-        return None
-
-
-def _datetime_from_json(value, field):
-    """Coerce 'value' to a datetime, if set or not nullable.
-
-    Args:
-        value (str): The timestamp.
-        field (google.cloud.bigquery.schema.SchemaField):
-            The field corresponding to the value.
-
-    Returns:
-        Optional[datetime.datetime]:
-            The parsed datetime object from
-            ``value`` if the ``field`` is not null (otherwise it is
-            :data:`None`).
-    """
-    if _not_null(value, field):
-        if "." in value:
-            # YYYY-MM-DDTHH:MM:SS.ffffff
-            return datetime.datetime.strptime(value, _RFC3339_MICROS_NO_ZULU)
-        else:
-            # YYYY-MM-DDTHH:MM:SS
-            return datetime.datetime.strptime(value, _RFC3339_NO_FRACTION)
-    else:
-        return None
+    def range_to_py(self, value, field):
+        """Coerce 'value' to a range, if set or not nullable.
+
+        Args:
+            value (str): The literal representation of the range.
+            field (google.cloud.bigquery.schema.SchemaField):
+                The field corresponding to the value.
+
+        Returns:
+            Optional[dict]:
+                The parsed range object from ``value`` if the ``field`` is not
+                null (otherwise it is :data:`None`).
+        """
+        if _not_null(value, field):
+            if _RANGE_PATTERN.match(value):
+                start, end = value[1:-1].split(", ")
+                start = self._range_element_to_py(start, field.range_element_type)
+                end = self._range_element_to_py(end, field.range_element_type)
+                return {"start": start, "end": end}
+            else:
+                raise ValueError(
+                    textwrap.dedent(
+                        f"""
+                        Got unknown format for range value: {value}.
+                        Expected format '[lower_bound, upper_bound)'. See:
+                        https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_with_literal
+                        for more information.
+                        """
+                    ),
+                )
 
-def _date_from_json(value, field):
-    """Coerce 'value' to a datetime date, if set or not nullable"""
-    if _not_null(value, field):
-        # value will be a string, in YYYY-MM-DD form.
-        return _date_from_iso8601_date(value)
+CELL_DATA_PARSER = CellDataParser()
 
-def _time_from_json(value, field):
-    """Coerce 'value' to a datetime date, if set or not nullable"""
-    if _not_null(value, field):
-        if len(value) == 8:  # HH:MM:SS
-            fmt = _TIMEONLY_WO_MICROS
-        elif len(value) == 15:  # HH:MM:SS.micros
-            fmt = _TIMEONLY_W_MICROS
-        else:
-            raise ValueError("Unknown time format: {}".format(value))
-        return datetime.datetime.strptime(value, fmt).time()
+class DataFrameCellDataParser(CellDataParser):
+    """Override of CellDataParser to handle differences in the expected values in DataFrame-like outputs.
-def _record_from_json(value, field):
-    """Coerce 'value' to a mapping, if set or not nullable."""
-    if _not_null(value, field):
-        record = {}
-        record_iter = zip(field.fields, value["f"])
-        for subfield, cell in record_iter:
-            record[subfield.name] = _field_from_json(cell["v"], subfield)
-        return record
+    This is used to turn the output of the REST API into a pyarrow Table,
+    emulating the serialized Arrow data from the BigQuery Storage Read API.
+    """
 
+    def json_to_py(self, value, _):
+        """No-op because DataFrame output expects the JSON value as a string."""
+        return value
 
-def _json_from_json(value, field):
-    """Coerce 'value' to a Pythonic JSON representation."""
-    if _not_null(value, field):
-        return json.loads(value)
-    else:
-        return None
-
-def _range_element_from_json(value, field):
-    """Coerce 'value' to a range element value."""
-    if value == "UNBOUNDED":
-        return None
-    if field.element_type in _SUPPORTED_RANGE_ELEMENTS:
-        return _CELLDATA_FROM_JSON[field.element_type](value, field.element_type)
-    else:
-        raise ValueError(f"Unsupported range element type: {field.element_type}")
 
+DATA_FRAME_CELL_DATA_PARSER = DataFrameCellDataParser()
 
-def _range_from_json(value, field):
-    """Coerce 'value' to a range, if set or not nullable.
+class ScalarQueryParamParser(CellDataParser):
+    """Override of CellDataParser to handle differences in the response format for scalar query parameters.
 
-    Args:
-        value (str): The literal representation of the range.
-        field (google.cloud.bigquery.schema.SchemaField):
-            The field corresponding to the value.
-
-    Returns:
-        Optional[dict]:
-            The parsed range object from ``value`` if the ``field`` is not
-            null (otherwise it is :data:`None`).
+    See: "value" field of
+    https://cloud.google.com/bigquery/docs/reference/rest/v2/QueryParameter#QueryParameterValue
     """
-    if _not_null(value, field):
-        if _RANGE_PATTERN.match(value):
-            start, end = value[1:-1].split(", ")
-            start = _range_element_from_json(start, field.range_element_type)
-            end = _range_element_from_json(end, field.range_element_type)
-            return {"start": start, "end": end}
-        else:
-            raise ValueError(f"Unknown format for range value: {value}")
-    else:
-        return None
 
+    def timestamp_to_py(self, value, field):
+        """Coerce 'value' to a datetime, if set or not nullable.
+
+        Args:
+            value (str): The timestamp.
+
+            field (google.cloud.bigquery.schema.SchemaField):
+                The field corresponding to the value.
+
+        Returns:
+            Optional[datetime.datetime]:
+                The parsed datetime object from
+                ``value`` if the ``field`` is not null (otherwise it is
+                :data:`None`).
+        """
+        if _not_null(value, field):
+            # Canonical formats for timestamps in BigQuery are flexible. See:
+            # g.co/cloud/bigquery/docs/reference/standard-sql/data-types#timestamp-type
+            # The separator between the date and time can be 'T' or ' '.
+            value = value.replace(" ", "T", 1)
+            # The UTC timezone may be formatted as Z or +00:00.
+            value = value.replace("Z", "")
+            value = value.replace("+00:00", "")
+
+            if "." in value:
+                # YYYY-MM-DDTHH:MM:SS.ffffff
+                return datetime.datetime.strptime(
+                    value, _RFC3339_MICROS_NO_ZULU
+                ).replace(tzinfo=UTC)
+            else:
+                # YYYY-MM-DDTHH:MM:SS
+                return datetime.datetime.strptime(value, _RFC3339_NO_FRACTION).replace(
+                    tzinfo=UTC
+                )
+        else:
+            return None
 
-# Parse BigQuery API response JSON into a Python representation.
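The `_CELLDATA_FROM_JSON` mapping removed below is what the parser classes above replace. A minimal sketch of how the two row-oriented singletons differ on a JSON cell, using only names introduced in this patch:

```python
from google.cloud.bigquery import _helpers, schema

json_field = schema.SchemaField("payload", "JSON")

# RowIterator-style output parses JSON cells into Python objects...
assert _helpers.CELL_DATA_PARSER.to_py('{"a": 1}', json_field) == {"a": 1}

# ...while DataFrame/Arrow output keeps the raw JSON string, matching the
# serialized form produced by the BigQuery Storage Read API.
assert _helpers.DATA_FRAME_CELL_DATA_PARSER.to_py('{"a": 1}', json_field) == '{"a": 1}'
```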
-_CELLDATA_FROM_JSON = {
-    "INTEGER": _int_from_json,
-    "INT64": _int_from_json,
-    "INTERVAL": _interval_from_json,
-    "FLOAT": _float_from_json,
-    "FLOAT64": _float_from_json,
-    "NUMERIC": _decimal_from_json,
-    "BIGNUMERIC": _decimal_from_json,
-    "BOOLEAN": _bool_from_json,
-    "BOOL": _bool_from_json,
-    "STRING": _string_from_json,
-    "GEOGRAPHY": _string_from_json,
-    "BYTES": _bytes_from_json,
-    "TIMESTAMP": _timestamp_from_json,
-    "DATETIME": _datetime_from_json,
-    "DATE": _date_from_json,
-    "TIME": _time_from_json,
-    "RECORD": _record_from_json,
-    "JSON": _json_from_json,
-    "RANGE": _range_from_json,
-}
-_QUERY_PARAMS_FROM_JSON = dict(_CELLDATA_FROM_JSON)
-_QUERY_PARAMS_FROM_JSON["TIMESTAMP"] = _timestamp_query_param_from_json
+SCALAR_QUERY_PARAM_PARSER = ScalarQueryParamParser()
 
 
 def _field_to_index_mapping(schema):
@@ -377,18 +455,6 @@ def _field_to_index_mapping(schema):
     return {f.name: i for i, f in enumerate(schema)}
 
 
-def _field_from_json(resource, field):
-    def default_converter(value, field):
-        _warn_unknown_field_type(field)
-        return value
-
-    converter = _CELLDATA_FROM_JSON.get(field.field_type, default_converter)
-    if field.mode == "REPEATED":
-        return [converter(item["v"], field) for item in resource]
-    else:
-        return converter(resource, field)
-
-
 def _row_tuple_from_json(row, schema):
     """Convert JSON row data to row with appropriate types.
 
@@ -410,7 +476,7 @@ def _row_tuple_from_json(row, schema):
 
     row_data = []
     for field, cell in zip(schema, row["f"]):
-        row_data.append(_field_from_json(cell["v"], field))
+        row_data.append(CELL_DATA_PARSER.to_py(cell["v"], field))
 
     return tuple(row_data)
 
diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py
index 0017d92ce..c921b4e04 100644
--- a/google/cloud/bigquery/_pandas_helpers.py
+++ b/google/cloud/bigquery/_pandas_helpers.py
@@ -158,6 +158,7 @@ def finish(self):
             b"ARROW:extension:metadata": b'{"encoding": "WKT"}',
         },
         "DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"},
+        "JSON": {b"ARROW:extension:name": b"google:sqlType:json"},
     }
 
 
diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py
index 1b42cd5c7..6e007350c 100644
--- a/google/cloud/bigquery/_pyarrow_helpers.py
+++ b/google/cloud/bigquery/_pyarrow_helpers.py
@@ -15,7 +15,9 @@
 """Shared helper functions for connecting BigQuery and pyarrow.
 
 NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
-instead. See: go/pandas-gbq-and-bigframes-redundancy and
+instead. See: go/pandas-gbq-and-bigframes-redundancy,
+https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/bigquery_to_pyarrow.py
+and
 https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pyarrow_to_bigquery.py
 """
 
@@ -28,6 +30,14 @@
 except ImportError:
     pyarrow = None
 
+try:
+    import db_dtypes  # type: ignore
+
+    db_dtypes_import_exception = None
+except ImportError as exc:
+    db_dtypes = None
+    db_dtypes_import_exception = exc
+
 
 def pyarrow_datetime():
     return pyarrow.timestamp("us", tz=None)
@@ -69,12 +79,18 @@ def pyarrow_timestamp():
         "GEOGRAPHY": pyarrow.string,
         "INT64": pyarrow.int64,
         "INTEGER": pyarrow.int64,
+        # Normally, we'd prefer the JSON type built in to pyarrow (added in 19.0.0),
+        # but we'd like this to map as closely to the BQ Storage API as
+        # possible, which uses the string() dtype, as JSON support in BigQuery
+        # predates JSON support in Arrow by several years.
+ "JSON": pyarrow.string, "NUMERIC": pyarrow_numeric, "STRING": pyarrow.string, "TIME": pyarrow_time, "TIMESTAMP": pyarrow_timestamp, } + # DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead. _ARROW_SCALAR_IDS_TO_BQ = { # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes pyarrow.bool_().id: "BOOL", @@ -99,6 +115,9 @@ def pyarrow_timestamp(): pyarrow.large_string().id: "STRING", # The exact scale and precision don't matter, see below. pyarrow.decimal128(38, scale=9).id: "NUMERIC", + # NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType) + # have the same id (31 as of version 19.0.1), so these should not be + # matched by id. } # Adds bignumeric support only if pyarrow version >= 3.0.0 @@ -113,6 +132,9 @@ def pyarrow_timestamp(): def bq_to_arrow_scalars(bq_scalar: str): """ + DEPRECATED: update pandas_gbq.schema.bigquery_to_pyarrow, instead, which is + to be added in https://github.com/googleapis/python-bigquery-pandas/pull/893. + Returns: The Arrow scalar type that the input BigQuery scalar type maps to. If it cannot find the BigQuery scalar, return None. @@ -122,6 +144,8 @@ def bq_to_arrow_scalars(bq_scalar: str): def arrow_scalar_ids_to_bq(arrow_scalar: Any): """ + DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead. + Returns: The BigQuery scalar type that the input arrow scalar type maps to. If it cannot find the arrow scalar, return None. diff --git a/google/cloud/bigquery/query.py b/google/cloud/bigquery/query.py index f1090a7dc..8745c09f5 100644 --- a/google/cloud/bigquery/query.py +++ b/google/cloud/bigquery/query.py @@ -18,11 +18,11 @@ import copy import datetime import decimal -from typing import Any, Optional, Dict, Union +from typing import Any, cast, Optional, Dict, Union from google.cloud.bigquery.table import _parse_schema_resource +from google.cloud.bigquery import _helpers from google.cloud.bigquery._helpers import _rows_from_json -from google.cloud.bigquery._helpers import _QUERY_PARAMS_FROM_JSON from google.cloud.bigquery._helpers import _SCALAR_VALUE_TO_JSON_PARAM from google.cloud.bigquery._helpers import _SUPPORTED_RANGE_ELEMENTS @@ -571,6 +571,9 @@ def from_api_repr(cls, resource: dict) -> "ScalarQueryParameter": Returns: google.cloud.bigquery.query.ScalarQueryParameter: Instance """ + # Import here to avoid circular imports. + from google.cloud.bigquery import schema + name = resource.get("name") type_ = resource["parameterType"]["type"] @@ -578,7 +581,9 @@ def from_api_repr(cls, resource: dict) -> "ScalarQueryParameter": # from the back-end - the latter omits it for None values. value = resource.get("parameterValue", {}).get("value") if value is not None: - converted = _QUERY_PARAMS_FROM_JSON[type_](value, None) + converted = _helpers.SCALAR_QUERY_PARAM_PARSER.to_py( + value, schema.SchemaField(cast(str, name), type_) + ) else: converted = None @@ -693,13 +698,20 @@ def _from_api_repr_struct(cls, resource): @classmethod def _from_api_repr_scalar(cls, resource): + """Converts REST resource into a list of scalar values.""" + # Import here to avoid circular imports. 
+ from google.cloud.bigquery import schema + name = resource.get("name") array_type = resource["parameterType"]["arrayType"]["type"] parameter_value = resource.get("parameterValue", {}) array_values = parameter_value.get("arrayValues", ()) values = [value["value"] for value in array_values] converted = [ - _QUERY_PARAMS_FROM_JSON[array_type](value, None) for value in values + _helpers.SCALAR_QUERY_PARAM_PARSER.to_py( + value, schema.SchemaField(name, array_type) + ) + for value in values ] return cls(name, array_type, converted) @@ -850,6 +862,9 @@ def from_api_repr(cls, resource: dict) -> "StructQueryParameter": Returns: google.cloud.bigquery.query.StructQueryParameter: Instance """ + # Import here to avoid circular imports. + from google.cloud.bigquery import schema + name = resource.get("name") instance = cls(name) type_resources = {} @@ -877,7 +892,9 @@ def from_api_repr(cls, resource: dict) -> "StructQueryParameter": converted = ArrayQueryParameter.from_api_repr(struct_resource) else: value = value["value"] - converted = _QUERY_PARAMS_FROM_JSON[type_](value, None) + converted = _helpers.SCALAR_QUERY_PARAM_PARSER.to_py( + value, schema.SchemaField(cast(str, name), type_) + ) instance.struct_values[key] = converted return instance diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index c70a0ebea..c30824cd9 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -3468,7 +3468,9 @@ def _row_iterator_page_columns(schema, response): def get_column_data(field_index, field): for row in rows: - yield _helpers._field_from_json(row["f"][field_index]["v"], field) + yield _helpers.DATA_FRAME_CELL_DATA_PARSER.to_py( + row["f"][field_index]["v"], field + ) for field_index, field in enumerate(schema): columns.append(get_column_data(field_index, field)) diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py index 01c4c51ca..b367f479e 100644 --- a/google/cloud/bigquery/version.py +++ b/google/cloud/bigquery/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "3.30.0" +__version__ = "3.31.0rc1" diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py index 82cf11f85..f2aed656c 100644 --- a/tests/system/test_arrow.py +++ b/tests/system/test_arrow.py @@ -194,3 +194,32 @@ def test_list_rows_range_csv( range_type = schema.field("range_date").type assert range_type == expected_type + + +def test_to_arrow_query_with_empty_results(bigquery_client): + """ + JSON regression test for https://github.com/googleapis/python-bigquery/issues/1580. 
+ """ + job = bigquery_client.query( + """ + select + 123 as int_col, + '' as string_col, + to_json('{}') as json_col, + struct(to_json('[]') as json_field, -1 as int_field) as struct_col, + [to_json('null')] as json_array_col, + from unnest([]) + """ + ) + table = job.to_arrow() + assert list(table.column_names) == [ + "int_col", + "string_col", + "json_col", + "struct_col", + "json_array_col", + ] + assert table.shape == (0, 5) + struct_type = table.field("struct_col").type + assert struct_type.get_field_index("json_field") == 0 + assert struct_type.get_field_index("int_field") == 1 diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index a9e76d416..1fe7ff2cd 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -1299,6 +1299,32 @@ def test_upload_time_and_datetime_56(bigquery_client, dataset_id): ] +def test_to_dataframe_query_with_empty_results(bigquery_client): + """ + JSON regression test for https://github.com/googleapis/python-bigquery/issues/1580. + """ + job = bigquery_client.query( + """ + select + 123 as int_col, + '' as string_col, + to_json('{}') as json_col, + struct(to_json('[]') as json_field, -1 as int_field) as struct_col, + [to_json('null')] as json_array_col, + from unnest([]) + """ + ) + df = job.to_dataframe() + assert list(df.columns) == [ + "int_col", + "string_col", + "json_col", + "struct_col", + "json_array_col", + ] + assert len(df.index) == 0 + + def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): wkt = pytest.importorskip("shapely.wkt") bigquery_client.query( diff --git a/tests/unit/_helpers/test_cell_data_parser.py b/tests/unit/_helpers/test_cell_data_parser.py new file mode 100644 index 000000000..14721a26c --- /dev/null +++ b/tests/unit/_helpers/test_cell_data_parser.py @@ -0,0 +1,467 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import base64 +import datetime +import decimal +import json + +from dateutil.relativedelta import relativedelta +import pytest + +import google.cloud.bigquery.schema + + +def create_field(mode="NULLABLE", type_="IGNORED", name="test_field", **kwargs): + return google.cloud.bigquery.schema.SchemaField(name, type_, mode=mode, **kwargs) + + +@pytest.fixture +def mut(): + from google.cloud.bigquery import _helpers + + return _helpers + + +@pytest.fixture +def object_under_test(mut): + return mut.CELL_DATA_PARSER + + +ALL_TYPES = { + "BOOL", + "BOOLEAN", + "BYTES", + "INTEGER", + "INT64", + "INTERVAL", + "FLOAT", + "FLOAT64", + "NUMERIC", + "BIGNUMERIC", + "STRING", + "GEOGRAPHY", + "TIMESTAMP", + "DATETIME", + "DATE", + "TIME", + "RECORD", + "STRUCT", + "JSON", + "RANGE", +} + +TYPES_WITH_CLIENT_SIDE_NULL_VALIDATION = ALL_TYPES - { + "STRING", + "GEOGRAPHY", +} + + +@pytest.mark.parametrize( + "type_", + list(sorted(ALL_TYPES)), +) +def test_to_py_w_none_nullable(object_under_test, type_): + assert object_under_test.to_py(None, create_field("NULLABLE", type_)) is None + + +@pytest.mark.parametrize("type_", list(sorted(TYPES_WITH_CLIENT_SIDE_NULL_VALIDATION))) +def test_to_py_w_none_required(object_under_test, type_): + with pytest.raises(TypeError): + object_under_test.to_py(None, create_field("REQUIRED", type_)) + + +def test_interval_to_py_w_invalid_format(object_under_test): + with pytest.raises(ValueError, match="NOT_AN_INTERVAL"): + object_under_test.interval_to_py("NOT_AN_INTERVAL", create_field()) + + +@pytest.mark.parametrize( + ("value", "expected"), + ( + ("0-0 0 0:0:0", relativedelta()), + # SELECT INTERVAL X YEAR + ("-10000-0 0 0:0:0", relativedelta(years=-10000)), + ("-1-0 0 0:0:0", relativedelta(years=-1)), + ("1-0 0 0:0:0", relativedelta(years=1)), + ("10000-0 0 0:0:0", relativedelta(years=10000)), + # SELECT INTERVAL X MONTH + ("-0-11 0 0:0:0", relativedelta(months=-11)), + ("-0-1 0 0:0:0", relativedelta(months=-1)), + ("0-1 0 0:0:0", relativedelta(months=1)), + ("0-11 0 0:0:0", relativedelta(months=11)), + # SELECT INTERVAL X DAY + ("0-0 -3660000 0:0:0", relativedelta(days=-3660000)), + ("0-0 -1 0:0:0", relativedelta(days=-1)), + ("0-0 1 0:0:0", relativedelta(days=1)), + ("0-0 3660000 0:0:0", relativedelta(days=3660000)), + # SELECT INTERVAL X HOUR + ("0-0 0 -87840000:0:0", relativedelta(hours=-87840000)), + ("0-0 0 -1:0:0", relativedelta(hours=-1)), + ("0-0 0 1:0:0", relativedelta(hours=1)), + ("0-0 0 87840000:0:0", relativedelta(hours=87840000)), + # SELECT INTERVAL X MINUTE + ("0-0 0 -0:59:0", relativedelta(minutes=-59)), + ("0-0 0 -0:1:0", relativedelta(minutes=-1)), + ("0-0 0 0:1:0", relativedelta(minutes=1)), + ("0-0 0 0:59:0", relativedelta(minutes=59)), + # SELECT INTERVAL X SECOND + ("0-0 0 -0:0:59", relativedelta(seconds=-59)), + ("0-0 0 -0:0:1", relativedelta(seconds=-1)), + ("0-0 0 0:0:1", relativedelta(seconds=1)), + ("0-0 0 0:0:59", relativedelta(seconds=59)), + # SELECT (INTERVAL -1 SECOND) / 1000000 + ("0-0 0 -0:0:0.000001", relativedelta(microseconds=-1)), + ("0-0 0 -0:0:59.999999", relativedelta(seconds=-59, microseconds=-999999)), + ("0-0 0 -0:0:59.999", relativedelta(seconds=-59, microseconds=-999000)), + ("0-0 0 0:0:59.999", relativedelta(seconds=59, microseconds=999000)), + ("0-0 0 0:0:59.999999", relativedelta(seconds=59, microseconds=999999)), + # Test with multiple digits in each section. 
+        (
+            "32-11 45 67:16:23.987654",
+            relativedelta(
+                years=32,
+                months=11,
+                days=45,
+                hours=67,
+                minutes=16,
+                seconds=23,
+                microseconds=987654,
+            ),
+        ),
+        (
+            "-32-11 -45 -67:16:23.987654",
+            relativedelta(
+                years=-32,
+                months=-11,
+                days=-45,
+                hours=-67,
+                minutes=-16,
+                seconds=-23,
+                microseconds=-987654,
+            ),
+        ),
+        # Test with mixed +/- sections.
+        (
+            "9999-9 -999999 9999999:59:59.999999",
+            relativedelta(
+                years=9999,
+                months=9,
+                days=-999999,
+                hours=9999999,
+                minutes=59,
+                seconds=59,
+                microseconds=999999,
+            ),
+        ),
+        # Test with fraction that is not microseconds.
+        ("0-0 0 0:0:42.", relativedelta(seconds=42)),
+        ("0-0 0 0:0:59.1", relativedelta(seconds=59, microseconds=100000)),
+        ("0-0 0 0:0:0.12", relativedelta(microseconds=120000)),
+        ("0-0 0 0:0:0.123", relativedelta(microseconds=123000)),
+        ("0-0 0 0:0:0.1234", relativedelta(microseconds=123400)),
+        # Fractional seconds can cause rounding problems if cast to float. See:
+        # https://github.com/googleapis/python-db-dtypes-pandas/issues/18
+        ("0-0 0 0:0:59.876543", relativedelta(seconds=59, microseconds=876543)),
+        (
+            "0-0 0 01:01:01.010101",
+            relativedelta(hours=1, minutes=1, seconds=1, microseconds=10101),
+        ),
+        (
+            "0-0 0 09:09:09.090909",
+            relativedelta(hours=9, minutes=9, seconds=9, microseconds=90909),
+        ),
+        (
+            "0-0 0 11:11:11.111111",
+            relativedelta(hours=11, minutes=11, seconds=11, microseconds=111111),
+        ),
+        (
+            "0-0 0 19:16:23.987654",
+            relativedelta(hours=19, minutes=16, seconds=23, microseconds=987654),
+        ),
+        # Nanoseconds are not expected, but should not cause error.
+        ("0-0 0 0:0:00.123456789", relativedelta(microseconds=123456)),
+        ("0-0 0 0:0:59.87654321", relativedelta(seconds=59, microseconds=876543)),
+    ),
+)
+def test_interval_to_py_w_string_values(object_under_test, value, expected):
+    got = object_under_test.interval_to_py(value, create_field())
+    assert got == expected
+
+
+def test_integer_to_py_w_string_value(object_under_test):
+    coerced = object_under_test.integer_to_py("42", object())
+    assert coerced == 42
+
+
+def test_integer_to_py_w_float_value(object_under_test):
+    coerced = object_under_test.integer_to_py(42.0, object())
+    assert coerced == 42
+
+
+def test_json_to_py_w_json_field(object_under_test):
+    data_field = create_field("REQUIRED", "JSON", name="data")
+
+    value = json.dumps(
+        {"v": {"key": "value"}},
+    )
+
+    expected_output = {"v": {"key": "value"}}
+    coerced_output = object_under_test.json_to_py(value, data_field)
+    assert coerced_output == expected_output
+
+
+def test_json_to_py_w_string_value(object_under_test):
+    coerced = object_under_test.json_to_py('"foo"', create_field())
+    assert coerced == "foo"
+
+
+def test_float_to_py_w_string_value(object_under_test):
+    coerced = object_under_test.float_to_py("3.1415", object())
+    assert coerced == 3.1415
+
+
+def test_float_to_py_w_float_value(object_under_test):
+    coerced = object_under_test.float_to_py(3.1415, object())
+    assert coerced == 3.1415
+
+
+def test_numeric_to_py_w_string_value(object_under_test):
+    coerced = object_under_test.numeric_to_py("3.1415", object())
+    assert coerced == decimal.Decimal("3.1415")
+
+
+def test_numeric_to_py_w_float_value(object_under_test):
+    coerced = object_under_test.numeric_to_py(3.1415, object())
+    # There is no exact float representation of 3.1415.
+ assert coerced == decimal.Decimal(3.1415) + + +def test_bool_to_py_w_value_t(object_under_test): + coerced = object_under_test.bool_to_py("T", object()) + assert coerced is True + + +def test_bool_to_py_w_value_true(object_under_test): + coerced = object_under_test.bool_to_py("True", object()) + assert coerced is True + + +def test_bool_to_py_w_value_1(object_under_test): + coerced = object_under_test.bool_to_py("1", object()) + assert coerced is True + + +def test_bool_to_py_w_value_other(object_under_test): + coerced = object_under_test.bool_to_py("f", object()) + assert coerced is False + + +def test_string_to_py_w_string_value(object_under_test): + coerced = object_under_test.string_to_py("Wonderful!", object()) + assert coerced == "Wonderful!" + + +def test_bytes_to_py_w_base64_encoded_bytes(object_under_test): + expected = b"Wonderful!" + encoded = base64.standard_b64encode(expected) + coerced = object_under_test.bytes_to_py(encoded, object()) + assert coerced == expected + + +def test_bytes_to_py_w_base64_encoded_text(object_under_test): + expected = b"Wonderful!" + encoded = base64.standard_b64encode(expected).decode("ascii") + coerced = object_under_test.bytes_to_py(encoded, object()) + assert coerced == expected + + +def test_timestamp_to_py_w_string_int_value(object_under_test): + from google.cloud._helpers import _EPOCH + + coerced = object_under_test.timestamp_to_py("1234567", object()) + assert coerced == _EPOCH + datetime.timedelta(seconds=1, microseconds=234567) + + +def test_timestamp_to_py_w_int_value(object_under_test): + from google.cloud._helpers import _EPOCH + + coerced = object_under_test.timestamp_to_py(1234567, object()) + assert coerced == _EPOCH + datetime.timedelta(seconds=1, microseconds=234567) + + +def test_datetime_to_py_w_string_value(object_under_test): + coerced = object_under_test.datetime_to_py("2016-12-02T18:51:33", object()) + assert coerced == datetime.datetime(2016, 12, 2, 18, 51, 33) + + +def test_datetime_to_py_w_microseconds(object_under_test): + coerced = object_under_test.datetime_to_py("2015-05-22T10:11:12.987654", object()) + assert coerced == datetime.datetime(2015, 5, 22, 10, 11, 12, 987654) + + +def test_date_to_py_w_string_value(object_under_test): + coerced = object_under_test.date_to_py("1987-09-22", object()) + assert coerced == datetime.date(1987, 9, 22) + + +def test_time_to_py_w_string_value(object_under_test): + coerced = object_under_test.time_to_py("12:12:27", object()) + assert coerced == datetime.time(12, 12, 27) + + +def test_time_to_py_w_subsecond_string_value(object_under_test): + coerced = object_under_test.time_to_py("12:12:27.123456", object()) + assert coerced == datetime.time(12, 12, 27, 123456) + + +def test_time_to_py_w_bogus_string_value(object_under_test): + with pytest.raises(ValueError): + object_under_test.time_to_py("12:12:27.123", object()) + + +def test_range_to_py_w_wrong_format(object_under_test): + range_field = create_field( + "NULLABLE", + "RANGE", + range_element_type="DATE", + ) + with pytest.raises(ValueError): + object_under_test.range_to_py("[2009-06-172019-06-17)", range_field) + + +def test_range_to_py_w_wrong_element_type(object_under_test): + range_field = create_field( + "NULLABLE", + "RANGE", + range_element_type=google.cloud.bigquery.schema.FieldElementType( + element_type="TIME" + ), + ) + with pytest.raises(ValueError): + object_under_test.range_to_py("[15:31:38, 15:50:38)", range_field) + + +def test_range_to_py_w_unbounded_value(object_under_test): + range_field = create_field( + 
"NULLABLE", + "RANGE", + range_element_type="DATE", + ) + coerced = object_under_test.range_to_py("[UNBOUNDED, 2019-06-17)", range_field) + assert coerced == {"start": None, "end": datetime.date(2019, 6, 17)} + + +def test_range_to_py_w_date_value(object_under_test): + range_field = create_field( + "NULLABLE", + "RANGE", + range_element_type="DATE", + ) + coerced = object_under_test.range_to_py("[2009-06-17, 2019-06-17)", range_field) + assert coerced == { + "start": datetime.date(2009, 6, 17), + "end": datetime.date(2019, 6, 17), + } + + +def test_range_to_py_w_datetime_value(object_under_test): + range_field = create_field( + "NULLABLE", + "RANGE", + range_element_type=google.cloud.bigquery.schema.FieldElementType( + element_type="DATETIME" + ), + ) + coerced = object_under_test.range_to_py( + "[2009-06-17T13:45:30, 2019-06-17T13:45:30)", range_field + ) + assert coerced == { + "start": datetime.datetime(2009, 6, 17, 13, 45, 30), + "end": datetime.datetime(2019, 6, 17, 13, 45, 30), + } + + +def test_range_to_py_w_timestamp_value(object_under_test): + from google.cloud._helpers import _EPOCH + + range_field = create_field( + "NULLABLE", + "RANGE", + range_element_type=google.cloud.bigquery.schema.FieldElementType( + element_type="TIMESTAMP" + ), + ) + coerced = object_under_test.range_to_py("[1234567, 1234789)", range_field) + assert coerced == { + "start": _EPOCH + datetime.timedelta(seconds=1, microseconds=234567), + "end": _EPOCH + datetime.timedelta(seconds=1, microseconds=234789), + } + + +def test_record_to_py_w_nullable_subfield_none(object_under_test): + subfield = create_field("NULLABLE", "INTEGER", name="age") + field = create_field("REQUIRED", fields=[subfield]) + value = {"f": [{"v": None}]} + coerced = object_under_test.record_to_py(value, field) + assert coerced == {"age": None} + + +def test_record_to_py_w_scalar_subfield(object_under_test): + subfield = create_field("REQUIRED", "INTEGER", name="age") + field = create_field("REQUIRED", fields=[subfield]) + value = {"f": [{"v": 42}]} + coerced = object_under_test.record_to_py(value, field) + assert coerced == {"age": 42} + + +def test_record_to_py_w_scalar_subfield_geography(object_under_test): + subfield = create_field("REQUIRED", "GEOGRAPHY", name="geo") + field = create_field("REQUIRED", fields=[subfield]) + value = {"f": [{"v": "POINT(1, 2)"}]} + coerced = object_under_test.record_to_py(value, field) + assert coerced == {"geo": "POINT(1, 2)"} + + +def test_record_to_py_w_repeated_subfield(object_under_test): + subfield = create_field("REPEATED", "STRING", name="color") + field = create_field("REQUIRED", fields=[subfield]) + value = {"f": [{"v": [{"v": "red"}, {"v": "yellow"}, {"v": "blue"}]}]} + coerced = object_under_test.record_to_py(value, field) + assert coerced == {"color": ["red", "yellow", "blue"]} + + +def test_record_to_py_w_record_subfield(object_under_test): + full_name = create_field("REQUIRED", "STRING", name="full_name") + area_code = create_field("REQUIRED", "STRING", name="area_code") + local_number = create_field("REQUIRED", "STRING", name="local_number") + rank = create_field("REQUIRED", "INTEGER", name="rank") + phone = create_field( + "NULLABLE", "RECORD", name="phone", fields=[area_code, local_number, rank] + ) + person = create_field( + "REQUIRED", "RECORD", name="person", fields=[full_name, phone] + ) + value = { + "f": [ + {"v": "Phred Phlyntstone"}, + {"v": {"f": [{"v": "800"}, {"v": "555-1212"}, {"v": 1}]}}, + ] + } + expected = { + "full_name": "Phred Phlyntstone", + "phone": {"area_code": 
"800", "local_number": "555-1212", "rank": 1}, + } + coerced = object_under_test.record_to_py(value, person) + assert coerced == expected diff --git a/tests/unit/_helpers/test_data_frame_cell_data_parser.py b/tests/unit/_helpers/test_data_frame_cell_data_parser.py new file mode 100644 index 000000000..c3332dc89 --- /dev/null +++ b/tests/unit/_helpers/test_data_frame_cell_data_parser.py @@ -0,0 +1,71 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import google.cloud.bigquery.schema + + +def create_field(mode="NULLABLE", type_="IGNORED", name="test_field", **kwargs): + return google.cloud.bigquery.schema.SchemaField(name, type_, mode=mode, **kwargs) + + +@pytest.fixture +def mut(): + from google.cloud.bigquery import _helpers + + return _helpers + + +@pytest.fixture +def object_under_test(mut): + return mut.DATA_FRAME_CELL_DATA_PARSER + + +def test_json_to_py_doesnt_parse_json(object_under_test): + coerced = object_under_test.json_to_py('{"key":"value"}', create_field()) + assert coerced == '{"key":"value"}' + + +def test_json_to_py_repeated_doesnt_parse_json(object_under_test): + coerced = object_under_test.json_to_py('{"key":"value"}', create_field("REPEATED")) + assert coerced == '{"key":"value"}' + + +def test_record_to_py_doesnt_parse_json(object_under_test): + subfield = create_field(type_="JSON", name="json") + field = create_field(fields=[subfield]) + value = {"f": [{"v": '{"key":"value"}'}]} + coerced = object_under_test.record_to_py(value, field) + assert coerced == {"json": '{"key":"value"}'} + + +def test_record_to_py_doesnt_parse_repeated_json(object_under_test): + subfield = create_field("REPEATED", "JSON", name="json") + field = create_field("REQUIRED", fields=[subfield]) + value = { + "f": [ + { + "v": [ + {"v": '{"key":"value0"}'}, + {"v": '{"key":"value1"}'}, + {"v": '{"key":"value2"}'}, + ] + } + ] + } + coerced = object_under_test.record_to_py(value, field) + assert coerced == { + "json": ['{"key":"value0"}', '{"key":"value1"}', '{"key":"value2"}'] + } diff --git a/tests/unit/_helpers/test_from_json.py b/tests/unit/_helpers/test_from_json.py deleted file mode 100644 index 65b054f44..000000000 --- a/tests/unit/_helpers/test_from_json.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dateutil.relativedelta import relativedelta -import pytest - -from google.cloud.bigquery.schema import SchemaField - - -def create_field(mode="NULLABLE", type_="IGNORED"): - return SchemaField("test_field", type_, mode=mode) - - -@pytest.fixture -def mut(): - from google.cloud.bigquery import _helpers - - return _helpers - - -def test_interval_from_json_w_none_nullable(mut): - got = mut._interval_from_json(None, create_field()) - assert got is None - - -def test_interval_from_json_w_none_required(mut): - with pytest.raises(TypeError): - mut._interval_from_json(None, create_field(mode="REQUIRED")) - - -def test_interval_from_json_w_invalid_format(mut): - with pytest.raises(ValueError, match="NOT_AN_INTERVAL"): - mut._interval_from_json("NOT_AN_INTERVAL", create_field()) - - -@pytest.mark.parametrize( - ("value", "expected"), - ( - ("0-0 0 0:0:0", relativedelta()), - # SELECT INTERVAL X YEAR - ("-10000-0 0 0:0:0", relativedelta(years=-10000)), - ("-1-0 0 0:0:0", relativedelta(years=-1)), - ("1-0 0 0:0:0", relativedelta(years=1)), - ("10000-0 0 0:0:0", relativedelta(years=10000)), - # SELECT INTERVAL X MONTH - ("-0-11 0 0:0:0", relativedelta(months=-11)), - ("-0-1 0 0:0:0", relativedelta(months=-1)), - ("0-1 0 0:0:0", relativedelta(months=1)), - ("0-11 0 0:0:0", relativedelta(months=11)), - # SELECT INTERVAL X DAY - ("0-0 -3660000 0:0:0", relativedelta(days=-3660000)), - ("0-0 -1 0:0:0", relativedelta(days=-1)), - ("0-0 1 0:0:0", relativedelta(days=1)), - ("0-0 3660000 0:0:0", relativedelta(days=3660000)), - # SELECT INTERVAL X HOUR - ("0-0 0 -87840000:0:0", relativedelta(hours=-87840000)), - ("0-0 0 -1:0:0", relativedelta(hours=-1)), - ("0-0 0 1:0:0", relativedelta(hours=1)), - ("0-0 0 87840000:0:0", relativedelta(hours=87840000)), - # SELECT INTERVAL X MINUTE - ("0-0 0 -0:59:0", relativedelta(minutes=-59)), - ("0-0 0 -0:1:0", relativedelta(minutes=-1)), - ("0-0 0 0:1:0", relativedelta(minutes=1)), - ("0-0 0 0:59:0", relativedelta(minutes=59)), - # SELECT INTERVAL X SECOND - ("0-0 0 -0:0:59", relativedelta(seconds=-59)), - ("0-0 0 -0:0:1", relativedelta(seconds=-1)), - ("0-0 0 0:0:1", relativedelta(seconds=1)), - ("0-0 0 0:0:59", relativedelta(seconds=59)), - # SELECT (INTERVAL -1 SECOND) / 1000000 - ("0-0 0 -0:0:0.000001", relativedelta(microseconds=-1)), - ("0-0 0 -0:0:59.999999", relativedelta(seconds=-59, microseconds=-999999)), - ("0-0 0 -0:0:59.999", relativedelta(seconds=-59, microseconds=-999000)), - ("0-0 0 0:0:59.999", relativedelta(seconds=59, microseconds=999000)), - ("0-0 0 0:0:59.999999", relativedelta(seconds=59, microseconds=999999)), - # Test with multiple digits in each section. - ( - "32-11 45 67:16:23.987654", - relativedelta( - years=32, - months=11, - days=45, - hours=67, - minutes=16, - seconds=23, - microseconds=987654, - ), - ), - ( - "-32-11 -45 -67:16:23.987654", - relativedelta( - years=-32, - months=-11, - days=-45, - hours=-67, - minutes=-16, - seconds=-23, - microseconds=-987654, - ), - ), - # Test with mixed +/- sections. - ( - "9999-9 -999999 9999999:59:59.999999", - relativedelta( - years=9999, - months=9, - days=-999999, - hours=9999999, - minutes=59, - seconds=59, - microseconds=999999, - ), - ), - # Test with fraction that is not microseconds. 
- ("0-0 0 0:0:42.", relativedelta(seconds=42)), - ("0-0 0 0:0:59.1", relativedelta(seconds=59, microseconds=100000)), - ("0-0 0 0:0:0.12", relativedelta(microseconds=120000)), - ("0-0 0 0:0:0.123", relativedelta(microseconds=123000)), - ("0-0 0 0:0:0.1234", relativedelta(microseconds=123400)), - # Fractional seconds can cause rounding problems if cast to float. See: - # https://github.com/googleapis/python-db-dtypes-pandas/issues/18 - ("0-0 0 0:0:59.876543", relativedelta(seconds=59, microseconds=876543)), - ( - "0-0 0 01:01:01.010101", - relativedelta(hours=1, minutes=1, seconds=1, microseconds=10101), - ), - ( - "0-0 0 09:09:09.090909", - relativedelta(hours=9, minutes=9, seconds=9, microseconds=90909), - ), - ( - "0-0 0 11:11:11.111111", - relativedelta(hours=11, minutes=11, seconds=11, microseconds=111111), - ), - ( - "0-0 0 19:16:23.987654", - relativedelta(hours=19, minutes=16, seconds=23, microseconds=987654), - ), - # Nanoseconds are not expected, but should not cause error. - ("0-0 0 0:0:00.123456789", relativedelta(microseconds=123456)), - ("0-0 0 0:0:59.87654321", relativedelta(seconds=59, microseconds=876543)), - ), -) -def test_w_string_values(mut, value, expected): - got = mut._interval_from_json(value, create_field()) - assert got == expected diff --git a/tests/unit/_helpers/test_scalar_query_param_parser.py b/tests/unit/_helpers/test_scalar_query_param_parser.py new file mode 100644 index 000000000..8e0d2a34e --- /dev/null +++ b/tests/unit/_helpers/test_scalar_query_param_parser.py @@ -0,0 +1,93 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime + +import pytest + +import google.cloud.bigquery.schema + + +def create_field(mode="NULLABLE", type_="IGNORED"): + return google.cloud.bigquery.schema.SchemaField("test_field", type_, mode=mode) + + +@pytest.fixture +def mut(): + from google.cloud.bigquery import _helpers + + return _helpers + + +@pytest.fixture +def object_under_test(mut): + return mut.SCALAR_QUERY_PARAM_PARSER + + +def test_timestamp_to_py_w_none_nullable(object_under_test): + assert object_under_test.timestamp_to_py(None, create_field()) is None + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + ( + "2016-12-20 15:58:27.339328+00:00", + datetime.datetime( + 2016, 12, 20, 15, 58, 27, 339328, tzinfo=datetime.timezone.utc + ), + ), + ( + "2016-12-20 15:58:27+00:00", + datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=datetime.timezone.utc), + ), + ( + "2016-12-20T15:58:27.339328+00:00", + datetime.datetime( + 2016, 12, 20, 15, 58, 27, 339328, tzinfo=datetime.timezone.utc + ), + ), + ( + "2016-12-20T15:58:27+00:00", + datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=datetime.timezone.utc), + ), + ( + "2016-12-20 15:58:27.339328Z", + datetime.datetime( + 2016, 12, 20, 15, 58, 27, 339328, tzinfo=datetime.timezone.utc + ), + ), + ( + "2016-12-20 15:58:27Z", + datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=datetime.timezone.utc), + ), + ( + "2016-12-20T15:58:27.339328Z", + datetime.datetime( + 2016, 12, 20, 15, 58, 27, 339328, tzinfo=datetime.timezone.utc + ), + ), + ( + "2016-12-20T15:58:27Z", + datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=datetime.timezone.utc), + ), + ], +) +def test_timestamp_to_py_w_timestamp_valid(object_under_test, value, expected): + assert object_under_test.timestamp_to_py(value, create_field()) == expected + + +def test_timestamp_to_py_w_timestamp_invalid(object_under_test): + with pytest.raises(ValueError): + object_under_test.timestamp_to_py("definitely-not-a-timestamp", create_field()) diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index adba6327c..4e53236e3 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import base64 import datetime import decimal import json @@ -133,484 +132,6 @@ def test_w_value(self): self.assertTrue(self._call_fut(object(), object())) -class Test_int_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _int_from_json - - return _int_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_string_value(self): - coerced = self._call_fut("42", object()) - self.assertEqual(coerced, 42) - - def test_w_float_value(self): - coerced = self._call_fut(42, object()) - self.assertEqual(coerced, 42) - - -class Test_json_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _json_from_json - - return _json_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_json_field(self): - data_field = _Field("REQUIRED", "data", "JSON") - - value = json.dumps( - {"v": {"key": "value"}}, - ) - - expected_output = {"v": {"key": "value"}} - coerced_output = self._call_fut(value, data_field) - self.assertEqual(coerced_output, expected_output) - - def test_w_string_value(self): - coerced = self._call_fut('"foo"', object()) - self.assertEqual(coerced, "foo") - - -class Test_float_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _float_from_json - - return _float_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_string_value(self): - coerced = self._call_fut("3.1415", object()) - self.assertEqual(coerced, 3.1415) - - def test_w_float_value(self): - coerced = self._call_fut(3.1415, object()) - self.assertEqual(coerced, 3.1415) - - -class Test_decimal_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _decimal_from_json - - return _decimal_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_string_value(self): - coerced = self._call_fut("3.1415", object()) - self.assertEqual(coerced, decimal.Decimal("3.1415")) - - def test_w_float_value(self): - coerced = self._call_fut(3.1415, object()) - # There is no exact float representation of 3.1415. 
- self.assertEqual(coerced, decimal.Decimal(3.1415)) - - -class Test_bool_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _bool_from_json - - return _bool_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(AttributeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_value_t(self): - coerced = self._call_fut("T", object()) - self.assertTrue(coerced) - - def test_w_value_true(self): - coerced = self._call_fut("True", object()) - self.assertTrue(coerced) - - def test_w_value_1(self): - coerced = self._call_fut("1", object()) - self.assertTrue(coerced) - - def test_w_value_other(self): - coerced = self._call_fut("f", object()) - self.assertFalse(coerced) - - -class Test_string_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _string_from_json - - return _string_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - self.assertIsNone(self._call_fut(None, _Field("REQUIRED"))) - - def test_w_string_value(self): - coerced = self._call_fut("Wonderful!", object()) - self.assertEqual(coerced, "Wonderful!") - - -class Test_bytes_from_json(unittest.TestCase): - def _call_fut(self, value, field): - from google.cloud.bigquery._helpers import _bytes_from_json - - return _bytes_from_json(value, field) - - def test_w_none_nullable(self): - self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) - - def test_w_none_required(self): - with self.assertRaises(TypeError): - self._call_fut(None, _Field("REQUIRED")) - - def test_w_base64_encoded_bytes(self): - expected = b"Wonderful!" - encoded = base64.standard_b64encode(expected) - coerced = self._call_fut(encoded, object()) - self.assertEqual(coerced, expected) - - def test_w_base64_encoded_text(self): - expected = b"Wonderful!" 
-
-
-class Test_timestamp_from_json(unittest.TestCase):
-    def _call_fut(self, value, field):
-        from google.cloud.bigquery._helpers import _timestamp_from_json
-
-        return _timestamp_from_json(value, field)
-
-    def test_w_none_nullable(self):
-        self.assertIsNone(self._call_fut(None, _Field("NULLABLE")))
-
-    def test_w_none_required(self):
-        with self.assertRaises(TypeError):
-            self._call_fut(None, _Field("REQUIRED"))
-
-    def test_w_string_int_value(self):
-        from google.cloud._helpers import _EPOCH
-
-        coerced = self._call_fut("1234567", object())
-        self.assertEqual(
-            coerced, _EPOCH + datetime.timedelta(seconds=1, microseconds=234567)
-        )
-
-    def test_w_int_value(self):
-        from google.cloud._helpers import _EPOCH
-
-        coerced = self._call_fut(1234567, object())
-        self.assertEqual(
-            coerced, _EPOCH + datetime.timedelta(seconds=1, microseconds=234567)
-        )
-
-
-class Test_timestamp_query_param_from_json(unittest.TestCase):
-    def _call_fut(self, value, field):
-        from google.cloud.bigquery import _helpers
-
-        return _helpers._timestamp_query_param_from_json(value, field)
-
-    def test_w_none_nullable(self):
-        self.assertIsNone(self._call_fut(None, _Field("NULLABLE")))
-
-    def test_w_timestamp_valid(self):
-        from google.cloud._helpers import UTC
-
-        samples = [
-            (
-                "2016-12-20 15:58:27.339328+00:00",
-                datetime.datetime(2016, 12, 20, 15, 58, 27, 339328, tzinfo=UTC),
-            ),
-            (
-                "2016-12-20 15:58:27+00:00",
-                datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=UTC),
-            ),
-            (
-                "2016-12-20T15:58:27.339328+00:00",
-                datetime.datetime(2016, 12, 20, 15, 58, 27, 339328, tzinfo=UTC),
-            ),
-            (
-                "2016-12-20T15:58:27+00:00",
-                datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=UTC),
-            ),
-            (
-                "2016-12-20 15:58:27.339328Z",
-                datetime.datetime(2016, 12, 20, 15, 58, 27, 339328, tzinfo=UTC),
-            ),
-            (
-                "2016-12-20 15:58:27Z",
-                datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=UTC),
-            ),
-            (
-                "2016-12-20T15:58:27.339328Z",
-                datetime.datetime(2016, 12, 20, 15, 58, 27, 339328, tzinfo=UTC),
-            ),
-            (
-                "2016-12-20T15:58:27Z",
-                datetime.datetime(2016, 12, 20, 15, 58, 27, tzinfo=UTC),
-            ),
-        ]
-        for timestamp_str, expected_result in samples:
-            self.assertEqual(
-                self._call_fut(timestamp_str, _Field("NULLABLE")), expected_result
-            )
-
-    def test_w_timestamp_invalid(self):
-        with self.assertRaises(ValueError):
-            self._call_fut("definitely-not-a-timestamp", _Field("NULLABLE"))
-
-
-class Test_datetime_from_json(unittest.TestCase):
-    def _call_fut(self, value, field):
-        from google.cloud.bigquery._helpers import _datetime_from_json
-
-        return _datetime_from_json(value, field)
-
-    def test_w_none_nullable(self):
-        self.assertIsNone(self._call_fut(None, _Field("NULLABLE")))
-
-    def test_w_none_required(self):
-        with self.assertRaises(TypeError):
-            self._call_fut(None, _Field("REQUIRED"))
-
-    def test_w_string_value(self):
-        coerced = self._call_fut("2016-12-02T18:51:33", object())
-        self.assertEqual(coerced, datetime.datetime(2016, 12, 2, 18, 51, 33))
-
-    def test_w_microseconds(self):
-        coerced = self._call_fut("2015-05-22T10:11:12.987654", object())
-        self.assertEqual(coerced, datetime.datetime(2015, 5, 22, 10, 11, 12, 987654))
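The eight `samples` in the removed timestamp-parameter test all reduce to the same shape: a space or `T` separator, optional microseconds, and `Z` or `+00:00` for UTC. A rough sketch of accepting those variants with the standard library (a hypothetical helper, not the code this PR touches):

import datetime

def parse_timestamp_param(value: str) -> datetime.datetime:
    # Normalize the separator and UTC suffix seen in the removed samples;
    # fromisoformat() then handles optional microseconds and the +00:00 offset.
    canonical = value.replace(" ", "T").replace("Z", "+00:00")
    return datetime.datetime.fromisoformat(canonical)

assert parse_timestamp_param("2016-12-20 15:58:27.339328Z") == datetime.datetime(
    2016, 12, 20, 15, 58, 27, 339328, tzinfo=datetime.timezone.utc
)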
-
-
-class Test_date_from_json(unittest.TestCase):
-    def _call_fut(self, value, field):
-        from google.cloud.bigquery._helpers import _date_from_json
-
-        return _date_from_json(value, field)
-
-    def test_w_none_nullable(self):
-        self.assertIsNone(self._call_fut(None, _Field("NULLABLE")))
-
-    def test_w_none_required(self):
-        with self.assertRaises(TypeError):
-            self._call_fut(None, _Field("REQUIRED"))
-
-    def test_w_string_value(self):
-        coerced = self._call_fut("1987-09-22", object())
-        self.assertEqual(coerced, datetime.date(1987, 9, 22))
-
-
-class Test_time_from_json(unittest.TestCase):
-    def _call_fut(self, value, field):
-        from google.cloud.bigquery._helpers import _time_from_json
-
-        return _time_from_json(value, field)
-
-    def test_w_none_nullable(self):
-        self.assertIsNone(self._call_fut(None, _Field("NULLABLE")))
-
-    def test_w_none_required(self):
-        with self.assertRaises(TypeError):
-            self._call_fut(None, _Field("REQUIRED"))
-
-    def test_w_string_value(self):
-        coerced = self._call_fut("12:12:27", object())
-        self.assertEqual(coerced, datetime.time(12, 12, 27))
-
-    def test_w_subsecond_string_value(self):
-        coerced = self._call_fut("12:12:27.123456", object())
-        self.assertEqual(coerced, datetime.time(12, 12, 27, 123456))
-
-    def test_w_bogus_string_value(self):
-        with self.assertRaises(ValueError):
-            self._call_fut("12:12:27.123", object())
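Note the contract pinned down by `test_w_bogus_string_value`: fractional seconds are either absent or exactly six digits, so "12:12:27.123" is rejected. A hypothetical parser with the same behavior (illustration only, not the library's implementation):

import datetime

def parse_time(value: str) -> datetime.time:
    # Enforce zero or exactly six fractional digits before delegating to
    # strptime, whose %f directive is more permissive (1-6 digits).
    if "." in value and len(value.split(".", 1)[1]) != 6:
        raise ValueError(f"unsupported subsecond precision in {value!r}")
    fmt = "%H:%M:%S.%f" if "." in value else "%H:%M:%S"
    return datetime.datetime.strptime(value, fmt).time()

assert parse_time("12:12:27.123456") == datetime.time(12, 12, 27, 123456)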
-
-
-class Test_range_from_json(unittest.TestCase):
-    def _call_fut(self, value, field):
-        from google.cloud.bigquery._helpers import _range_from_json
-
-        return _range_from_json(value, field)
-
-    def test_w_none_nullable(self):
-        self.assertIsNone(self._call_fut(None, _Field("NULLABLE")))
-
-    def test_w_none_required(self):
-        with self.assertRaises(TypeError):
-            self._call_fut(None, _Field("REQUIRED"))
-
-    def test_w_wrong_format(self):
-        range_field = _Field(
-            "NULLABLE",
-            field_type="RANGE",
-            range_element_type=_Field("NULLABLE", element_type="DATE"),
-        )
-        with self.assertRaises(ValueError):
-            self._call_fut("[2009-06-172019-06-17)", range_field)
-
-    def test_w_wrong_element_type(self):
-        range_field = _Field(
-            "NULLABLE",
-            field_type="RANGE",
-            range_element_type=_Field("NULLABLE", element_type="TIME"),
-        )
-        with self.assertRaises(ValueError):
-            self._call_fut("[15:31:38, 15:50:38)", range_field)
-
-    def test_w_unbounded_value(self):
-        range_field = _Field(
-            "NULLABLE",
-            field_type="RANGE",
-            range_element_type=_Field("NULLABLE", element_type="DATE"),
-        )
-        coerced = self._call_fut("[UNBOUNDED, 2019-06-17)", range_field)
-        self.assertEqual(
-            coerced,
-            {"start": None, "end": datetime.date(2019, 6, 17)},
-        )
-
-    def test_w_date_value(self):
-        range_field = _Field(
-            "NULLABLE",
-            field_type="RANGE",
-            range_element_type=_Field("NULLABLE", element_type="DATE"),
-        )
-        coerced = self._call_fut("[2009-06-17, 2019-06-17)", range_field)
-        self.assertEqual(
-            coerced,
-            {
-                "start": datetime.date(2009, 6, 17),
-                "end": datetime.date(2019, 6, 17),
-            },
-        )
-
-    def test_w_datetime_value(self):
-        range_field = _Field(
-            "NULLABLE",
-            field_type="RANGE",
-            range_element_type=_Field("NULLABLE", element_type="DATETIME"),
-        )
-        coerced = self._call_fut(
-            "[2009-06-17T13:45:30, 2019-06-17T13:45:30)", range_field
-        )
-        self.assertEqual(
-            coerced,
-            {
-                "start": datetime.datetime(2009, 6, 17, 13, 45, 30),
-                "end": datetime.datetime(2019, 6, 17, 13, 45, 30),
-            },
-        )
-
-    def test_w_timestamp_value(self):
-        from google.cloud._helpers import _EPOCH
-
-        range_field = _Field(
-            "NULLABLE",
-            field_type="RANGE",
-            range_element_type=_Field("NULLABLE", element_type="TIMESTAMP"),
-        )
-        coerced = self._call_fut("[1234567, 1234789)", range_field)
-        self.assertEqual(
-            coerced,
-            {
-                "start": _EPOCH + datetime.timedelta(seconds=1, microseconds=234567),
-                "end": _EPOCH + datetime.timedelta(seconds=1, microseconds=234789),
-            },
-        )
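The removed cases document the RANGE wire format: a half-open literal "[start, end)" whose endpoints are element-type literals or UNBOUNDED. A minimal sketch for the DATE case under those assumptions (hypothetical helper; the real parser also dispatches on the range element type, and the format check is what rejects the malformed "[2009-06-172019-06-17)" input above):

import datetime
from typing import Dict, Optional

def parse_date_range(value: str) -> Dict[str, Optional[datetime.date]]:
    # "[2009-06-17, 2019-06-17)" -> {"start": ..., "end": ...};
    # "UNBOUNDED" on either side becomes None.
    if not (value.startswith("[") and value.endswith(")") and ", " in value):
        raise ValueError(f"got range literal with unexpected format: {value!r}")
    start_text, end_text = value[1:-1].split(", ", 1)

    def endpoint(text: str) -> Optional[datetime.date]:
        if text.upper() == "UNBOUNDED":
            return None
        return datetime.date.fromisoformat(text)

    return {"start": endpoint(start_text), "end": endpoint(end_text)}

assert parse_date_range("[UNBOUNDED, 2019-06-17)") == {
    "start": None,
    "end": datetime.date(2019, 6, 17),
}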
-
-
-class Test_record_from_json(unittest.TestCase):
-    def _call_fut(self, value, field):
-        from google.cloud.bigquery._helpers import _record_from_json
-
-        return _record_from_json(value, field)
-
-    def test_w_none_nullable(self):
-        self.assertIsNone(self._call_fut(None, _Field("NULLABLE")))
-
-    def test_w_none_required(self):
-        with self.assertRaises(TypeError):
-            self._call_fut(None, _Field("REQUIRED"))
-
-    def test_w_nullable_subfield_none(self):
-        subfield = _Field("NULLABLE", "age", "INTEGER")
-        field = _Field("REQUIRED", fields=[subfield])
-        value = {"f": [{"v": None}]}
-        coerced = self._call_fut(value, field)
-        self.assertEqual(coerced, {"age": None})
-
-    def test_w_scalar_subfield(self):
-        subfield = _Field("REQUIRED", "age", "INTEGER")
-        field = _Field("REQUIRED", fields=[subfield])
-        value = {"f": [{"v": 42}]}
-        coerced = self._call_fut(value, field)
-        self.assertEqual(coerced, {"age": 42})
-
-    def test_w_scalar_subfield_geography(self):
-        subfield = _Field("REQUIRED", "geo", "GEOGRAPHY")
-        field = _Field("REQUIRED", fields=[subfield])
-        value = {"f": [{"v": "POINT(1, 2)"}]}
-        coerced = self._call_fut(value, field)
-        self.assertEqual(coerced, {"geo": "POINT(1, 2)"})
-
-    def test_w_repeated_subfield(self):
-        subfield = _Field("REPEATED", "color", "STRING")
-        field = _Field("REQUIRED", fields=[subfield])
-        value = {"f": [{"v": [{"v": "red"}, {"v": "yellow"}, {"v": "blue"}]}]}
-        coerced = self._call_fut(value, field)
-        self.assertEqual(coerced, {"color": ["red", "yellow", "blue"]})
-
-    def test_w_record_subfield(self):
-        full_name = _Field("REQUIRED", "full_name", "STRING")
-        area_code = _Field("REQUIRED", "area_code", "STRING")
-        local_number = _Field("REQUIRED", "local_number", "STRING")
-        rank = _Field("REQUIRED", "rank", "INTEGER")
-        phone = _Field(
-            "NULLABLE", "phone", "RECORD", fields=[area_code, local_number, rank]
-        )
-        person = _Field("REQUIRED", "person", "RECORD", fields=[full_name, phone])
-        value = {
-            "f": [
-                {"v": "Phred Phlyntstone"},
-                {"v": {"f": [{"v": "800"}, {"v": "555-1212"}, {"v": 1}]}},
-            ]
-        }
-        expected = {
-            "full_name": "Phred Phlyntstone",
-            "phone": {"area_code": "800", "local_number": "555-1212", "rank": 1},
-        }
-        coerced = self._call_fut(value, person)
-        self.assertEqual(coerced, expected)
-
-
 class Test_field_to_index_mapping(unittest.TestCase):
     def _call_fut(self, schema):
         from google.cloud.bigquery._helpers import _field_to_index_mapping
diff --git a/tests/unit/test__pyarrow_helpers.py b/tests/unit/test__pyarrow_helpers.py
index f0a872c88..06fc2eb85 100644
--- a/tests/unit/test__pyarrow_helpers.py
+++ b/tests/unit/test__pyarrow_helpers.py
@@ -27,8 +27,16 @@ def module_under_test():

 def test_bq_to_arrow_scalars(module_under_test):
     assert (
-        module_under_test.bq_to_arrow_scalars("BIGNUMERIC")
-        == module_under_test.pyarrow_bignumeric
+        module_under_test.bq_to_arrow_scalars("BIGNUMERIC")()
+        == module_under_test.pyarrow_bignumeric()
+    )
+    assert (
+        # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
+        # but we'd like this to map as closely to the BQ Storage API as
+        # possible, which uses the string() dtype, as JSON support in Arrow
+        # predates JSON support in BigQuery by several years.
+        module_under_test.bq_to_arrow_scalars("JSON")()
+        == pyarrow.string()
     )

     assert module_under_test.bq_to_arrow_scalars("UNKNOWN_TYPE") is None
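The inline comment above captures the design decision: JSON columns map to `pyarrow.string()` so that results look the same whether they arrive via the REST API or the BQ Storage API, even though newer pyarrow releases ship a built-in JSON type. The practical consequence for consumers, shown with plain pyarrow (the variable names below are illustrative, not from the library):

import json
import pyarrow

# A JSON column surfaces as an Arrow string array of serialized values.
json_col = pyarrow.array(['{"key":"value"}', "[1,2,3]", None], type=pyarrow.string())

# Decoding is left to the caller; NULL cells stay None.
decoded = [json.loads(v) if v is not None else None for v in json_col.to_pylist()]
assert decoded == [{"key": "value"}, [1, 2, 3], None]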
[{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]}, + {"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]}, + {"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]}, ], "totalBytesProcessed": "154775150", "jobComplete": True, @@ -125,10 +150,11 @@ def test_to_arrow_with_jobs_query_response_and_max_results(): ) records = rows.to_arrow() - assert records.column_names == ["name", "number"] + assert records.column_names == ["name", "number", "json"] assert records["name"].to_pylist() == [ "Tiarra", "Timothy", "Tina", ] assert records["number"].to_pylist() == [6, 325, 26] + assert records["json"].to_pylist() == ["123", '{"key":"value"}', "[1,2,3]"] diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index 02a7a6a79..5269d0b3b 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -55,6 +55,7 @@ def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): pyarrow.field( "timestamp_col", pyarrow.timestamp("us", tz=datetime.timezone.utc) ), + pyarrow.field("json_col", pyarrow.string()), ] ) arrow_table = pyarrow.Table.from_pydict( @@ -74,6 +75,7 @@ def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): 2021, 8, 9, 13, 30, 44, 123456, tzinfo=datetime.timezone.utc ) ], + "json_col": ["{}"], }, schema=arrow_schema, ) @@ -90,6 +92,7 @@ def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): bigquery.SchemaField("string_col", "STRING"), bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("timestamp_col", "TIMESTAMP"), + bigquery.SchemaField("json_col", "JSON"), ] mock_client = mock.create_autospec(bigquery.Client) mock_client.project = "test-proj" @@ -113,6 +116,7 @@ def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): assert df.dtypes["string_col"].name == "object" assert df.dtypes["time_col"].name == "dbtime" assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" + assert df.dtypes["json_col"].name == "object" # Check for expected values. assert df["bignumeric_col"][0] == decimal.Decimal("123.456789101112131415") pFad - Phonifier reborn
