diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea2792a221..a7d3d2301e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,7 +56,6 @@ repos: # Hooks from all other repos # NOTE : keep these in hook-name (aka 'id') order - - repo: https://github.com/adamchainz/blacken-docs # This template does not keep up-to-date with versions, visit the repo to see the most recent release. rev: 1.20.0 diff --git a/lib/iris/cube.py b/lib/iris/cube.py index 44be3a63d7..0bed1c2d10 100644 --- a/lib/iris/cube.py +++ b/lib/iris/cube.py @@ -1280,10 +1280,6 @@ def __init__( ... (longitude, 1)]) """ - # Temporary error while we transition the API. - if isinstance(data, str): - raise TypeError("Invalid data type: {!r}.".format(data)) - # Configure the metadata manager. self._metadata_manager = metadata_manager_factory(CubeMetadata) @@ -4468,15 +4464,20 @@ def __eq__(self, other): # Having checked everything else, check approximate data equality. if result and not dataless_equality: - # TODO: why do we use allclose() here, but strict equality in - # _DimensionalMetadata (via util.array_equal())? - result = bool( - np.allclose( - self.core_data(), - other.core_data(), - equal_nan=True, + if self.dtype.kind in "if": + # numbers + # TODO: why do we use allclose() here, but strict equality in + # _DimensionalMetadata (via util.array_equal())? 
+ result = bool( + np.allclose( + self.core_data(), + other.core_data(), + equal_nan=True, + ) ) - ) + else: + # non-numeric: use exact equality + result = bool(np.all(self.core_data() == other.core_data())) return result # Must supply __ne__, Python does not defer to __eq__ for negative equality diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index 35c2e96924..7c4810ffe7 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -708,13 +708,13 @@ def build_and_add_global_attributes(engine: Engine): ), ) if problem is not None: - stack_notes = problem.stack_trace.__notes__ + stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined] if stack_notes is None: stack_notes = [] stack_notes.append( f"Skipping disallowed global attribute '{attr_name}' (see above error)" ) - problem.stack_trace.__notes__ = stack_notes + problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined] ################################################################################ @@ -1209,9 +1209,6 @@ def get_attr_units(cf_var, attributes, capture_invalid=False): attributes["invalid_units"] = attr_units attr_units = UNKNOWN_UNIT_STRING - if np.issubdtype(cf_var.dtype, np.str_): - attr_units = NO_UNIT_STRING - if any( hasattr(cf_var.cf_data, name) for name in ("flag_values", "flag_masks", "flag_meanings") @@ -1536,14 +1533,14 @@ def build_and_add_dimension_coordinate( ) if problem is not None: coord_var_name = str(cf_coord_var.cf_name) - stack_notes = problem.stack_trace.__notes__ + stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined] if stack_notes is None: stack_notes = [] stack_notes.append( f"Failed to create {coord_var_name} dimension coordinate:\n" f"Gracefully creating {coord_var_name!r} auxiliary coordinate instead." 
) - problem.stack_trace.__notes__ = stack_notes + problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined] problem.handled = True _ = _add_or_capture( @@ -1574,11 +1571,7 @@ def _build_auxiliary_coordinate( # Get units attr_units = get_attr_units(cf_coord_var, attributes) - # Get any coordinate point data. - if isinstance(cf_coord_var, cf.CFLabelVariable): - points_data = cf_coord_var.cf_label_data(engine.cf_var) - else: - points_data = _get_cf_var_data(cf_coord_var) + points_data = _get_cf_var_data(cf_coord_var) # Get any coordinate bounds. cf_bounds_var, climatological = get_cf_bounds_var(cf_coord_var) @@ -1643,9 +1636,9 @@ def _add_auxiliary_coordinate( # Determine the name of the dimension/s shared between the CF-netCDF data variable # and the coordinate being built. - common_dims = [ - dim for dim in cf_coord_var.dimensions if dim in engine.cf_var.dimensions - ] + coord_dims = cf_coord_var.dimensions + datavar_dims = engine.cf_var.dimensions + common_dims = [dim for dim in coord_dims if dim in datavar_dims] data_dims = None if common_dims: # Calculate the offset of each common dimension. diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index e87423a0ae..a72167b7d4 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -32,7 +32,7 @@ import iris.exceptions import iris.fileformats._nc_load_rules.helpers as hh -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets, _thread_safe_nc from iris.mesh.components import Connectivity import iris.util import iris.warnings @@ -73,7 +73,9 @@ # NetCDF returns a different type for strings depending on Python version. def _is_str_dtype(var): - return np.issubdtype(var.dtype, np.bytes_) + # N.B. accept both the 'S' (bytes) and 'U' (unicode) kinds of 'dtype', since variable + # wrappers may represent 'S1' type data as 'U'. 
+ return np.dtype(var.dtype).kind in "SU" ################################################################################ @@ -774,73 +776,6 @@ def identify(cls, variables, ignore=None, target=None, warn=True): return result - def cf_label_data(self, cf_data_var): - """Return the associated CF-netCDF label variable strings. - - Parameters - ---------- - cf_data_var : :class:`iris.fileformats.cf.CFDataVariable` - The CF-netCDF data variable which the CF-netCDF label variable - describes. - - Returns - ------- - str labels - - """ - if not isinstance(cf_data_var, CFDataVariable): - raise TypeError( - "cf_data_var argument should be of type CFDataVariable. Got %r." - % type(cf_data_var) - ) - - # Determine the name of the label string (or length) dimension by - # finding the dimension name that doesn't exist within the data dimensions. - str_dim_name = list(set(self.dimensions) - set(cf_data_var.dimensions)) - - if len(str_dim_name) != 1: - raise ValueError( - "Invalid string dimensions for CF-netCDF label variable %r" - % self.cf_name - ) - - str_dim_name = str_dim_name[0] - label_data = self[:] - - if ma.isMaskedArray(label_data): - label_data = label_data.filled() - - # Determine whether we have a string-valued scalar label - # i.e. a character variable that only has one dimension (the length of the string). - if self.ndim == 1: - label_string = b"".join(label_data).strip() - label_string = label_string.decode("utf8") - data = np.array([label_string]) - else: - # Determine the index of the string dimension. - str_dim = self.dimensions.index(str_dim_name) - - # Calculate new label data shape (without string dimension) and create payload array. - new_shape = tuple( - dim_len for i, dim_len in enumerate(self.shape) if i != str_dim - ) - string_basetype = "|U%d" - string_dtype = string_basetype % self.shape[str_dim] - data = np.empty(new_shape, dtype=string_dtype) - - for index in np.ndindex(new_shape): - # Create the slice for the label data. 
- if str_dim == 0: - label_index = (slice(None, None),) + index - else: - label_index = index + (slice(None, None),) - - label_string = b"".join(label_data[label_index]).strip() - label_string = label_string.decode("utf8") - data[index] = label_string - - return data - def cf_label_dimensions(self, cf_data_var): """Return the name of the associated CF-netCDF label variable data dimensions. @@ -1371,7 +1306,12 @@ def __init__(self, file_source, warn=False, monotonic=False): else: self._filename = file_source - self._dataset = _thread_safe_nc.DatasetWrapper(self._filename, mode="r") + if _bytecoding_datasets.DECODE_TO_STRINGS_ON_READ: + ds_type = _bytecoding_datasets.EncodedDataset + else: + ds_type = _thread_safe_nc.DatasetWrapper + + self._dataset = ds_type(self._filename, mode="r") self._own_file = True else: # We have been passed an open dataset. @@ -1404,6 +1344,9 @@ def __init__(self, file_source, warn=False, monotonic=False): self._with_ugrid = False # Read the variables in the dataset only once to reduce runtime. + ds = self._dataset + # Turn off *any* automatic decoding in the underlying netCDF4 dataset. + ds.set_auto_chartostring(False) variables = self._dataset.variables self._translate(variables) self._build_cf_groups(variables) diff --git a/lib/iris/fileformats/netcdf/__init__.py b/lib/iris/fileformats/netcdf/__init__.py index f1e37f2545..60b4b5895a 100644 --- a/lib/iris/fileformats/netcdf/__init__.py +++ b/lib/iris/fileformats/netcdf/__init__.py @@ -25,6 +25,12 @@ # Note: these probably shouldn't be public, but for now they are. 
from .._nc_load_rules.helpers import UnknownCellMethodWarning, parse_cell_methods +from ._bytecoding_datasets import ( + DECODE_TO_STRINGS_ON_READ, + DEFAULT_READ_ENCODING, + DEFAULT_WRITE_ENCODING, + SUPPORTED_ENCODINGS, +) from .loader import DEBUG, NetCDFDataProxy, load_cubes from .saver import ( CF_CONVENTIONS_VERSION, @@ -42,9 +48,13 @@ "CFNameCoordMap", "CF_CONVENTIONS_VERSION", "DEBUG", + "DECODE_TO_STRINGS_ON_READ", + "DEFAULT_READ_ENCODING", + "DEFAULT_WRITE_ENCODING", "MESH_ELEMENTS", "NetCDFDataProxy", "SPATIO_TEMPORAL_AXES", + "SUPPORTED_ENCODINGS", "Saver", "UnknownCellMethodWarning", "load_cubes", diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py new file mode 100644 index 0000000000..00c63ffe0a --- /dev/null +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -0,0 +1,445 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Module providing access to netcdf datasets with automatic character encoding. + +The requirement is to convert numpy fixed-width unicode arrays on writing to a variable +which is declared as a byte (character) array with a fixed-length string dimension. + +Numpy unicode string arrays are ones with dtypes of the form "U". +Numpy character variables have the dtype "S1", and map to a fixed-length "string +dimension". + +In principle, netCDF4 already performs these translations, but in practice current +releases are not functional for anything other than "ascii" encoding -- including UTF-8, +which is the most obvious and desirable "general" solution. + +There is also the question of whether we should like to implement UTF-8 as our default. 
+Current discussions on this are inconclusive and neither CF conventions nor the NetCDF +User Guide are definite on what possible values of "_Encoding" are, or what the effective +default is, even though they do both mention the "_Encoding" attribute as a potential +way to handle the issue. + +Because of this, we interpret as follows: + * when reading bytes : in the absence of an "_Encoding" attribute, we will attempt to + decode bytes as UTF-8 + * when writing strings : in the absence of an "_Encoding" attribute (on the Iris + cube or coord object), we will attempt to encode data with "ascii" : If this fails, + it will raise an error prompting the user to supply an "_Encoding" attribute. + +Where an "_Encoding" attribute is provided to Iris, we will honour it where possible, +identifying with "codecs.lookup" : This means we recognise the names and aliases known +to the Python Standard Library, but only those in SUPPORTED_ENCODINGS are supported. + +See: + +* known problems https://github.com/Unidata/netcdf4-python/issues/1440 +* suggestions for how this "ought" to work, discussed in the netcdf-c library + * https://github.com/Unidata/netcdf-c/issues/402 + +""" + +import codecs +import contextlib +import dataclasses +import threading +from typing import Any, Callable +import warnings + +import numpy as np + +from iris.fileformats.netcdf._thread_safe_nc import ( + DatasetWrapper, + GroupWrapper, + NetCDFDataProxy, + NetCDFWriteProxy, + VariableWrapper, +) +import iris.warnings +from iris.warnings import IrisCfLoadWarning, IrisCfSaveWarning + + +def decode_bytesarray_to_stringarray( + byte_array: np.ndarray, encoding: str, string_width: int, var_name: str +) -> np.ndarray: + """Convert an array of bytes to an array of strings, with one less dimension. + + N.B. for now at least, we assume the string dim is **always the last one**. + 'string_width' is required : it sets the width of the result dtype, i.e. "U<string_width>". 
+ """ + if np.ma.isMaskedArray(byte_array): + # netCDF4-python sees zeros as "missing" -- we don't need or want that + byte_array = byte_array.data + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + string_dtype = f"U{string_width}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b or b"\0" for b in element_bytes]) + try: + string = bytes.decode(encoding) + except UnicodeDecodeError as err: + msg = ( + f"Character data in variable {var_name!r} could not be decoded " + f"with the {encoding!r} encoding. This can be fixed by setting the " + "variable '_Encoding' attribute to suit the content." + ) + raise ValueError(msg) from err + result[ndindex] = string + return result + + +def encode_stringarray_as_bytearray( + data: np.typing.ArrayLike, + encoding: str, + string_dimension_length: int, + var_name: str, +) -> np.ndarray: + """Encode strings as a bytes array.""" + data = np.asanyarray(data) + element_shape = data.shape + result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") + right_pad = b"\0" * string_dimension_length + for index in np.ndindex(element_shape): + string = data[index] + try: + bytes = string.encode(encoding=encoding) + except UnicodeEncodeError as err: + msg = ( + f"String data written to netcdf character variable {var_name!r} " + f"could not be represented in encoding {encoding!r}. " + "This can be fixed by setting a suitable variable '_Encoding' " + 'attribute, e.g. variable._Encoding="UTF-8".' + ) + raise ValueError(msg) from err + + n_bytes = len(bytes) + if n_bytes > string_dimension_length: + from iris.exceptions import TranslationError + + msg = ( + f"String '{string}' written into netcdf variable {var_name!r} with " + f"encoding {encoding!r} is {n_bytes} bytes long, which exceeds the " + f"string dimension length, {string_dimension_length}. 
" + 'This can be fixed by converting the data to a "wider" string dtype, ' + f'e.g. cube.data = cube.data.astype("U{n_bytes}").' + ) + raise TranslationError(msg) + + # It's all a bit nasty ... + bytes = (bytes + right_pad)[:string_dimension_length] + result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] + + return result + + +@dataclasses.dataclass +class VariableEncoder: + """A record of encoding details which can apply them to variable data.""" + + varname: str # just for the error messages + dtype: np.dtype + is_chardata: bool # just a shortcut for the dtype test + read_encoding: str # IF 'is_chardata': one of the supported encodings + write_encoding: str # IF 'is_chardata': one of the supported encodings + n_chars_dim: int # IF 'is_chardata': length of associated character dimension + string_width: int # IF 'is_chardata': width when viewed as strings (i.e. "Uxx") + + def __init__(self, cf_var): + """Capture the encoding info for a netCDF4 variable. + + Can be either an actual netCDF4.Variable, or a _thread_safe_nc.VariableWrapper. + + Can *not* be a _bytecoding_datasets.EncodedVariable, since we need to see the + true, underlying .dtype. + + Most importantly, we do *not* store 'cf_var' : instead we extract the + necessary information and store it in this object. + So, this object has static state + is serialisable. 
+ """ + self.varname = cf_var.name + self.dtype = cf_var.dtype + self.is_chardata = np.issubdtype(self.dtype, np.bytes_) + if self.is_chardata: + encoding_attr = getattr(cf_var, "_Encoding", None) + self.read_encoding = _identify_encoding( + encoding_attr, var_name=cf_var.name, writing=False + ) + self.write_encoding = _identify_encoding( + encoding_attr, var_name=cf_var.name, writing=True + ) + n_chars_dim = 1 # default to 1 for a scalar var + if len(cf_var.dimensions) >= 1: + dim_name = cf_var.dimensions[-1] + if dim_name in cf_var.group().dimensions: + n_chars_dim = cf_var.group().dimensions[dim_name].size + self.n_chars_dim = n_chars_dim + self.string_width = self._get_string_width() + + def _get_string_width(self) -> int: + """Return the string-length defined for this variable.""" + # Work out the actual byte width from the parent dataset dimensions. + n_bytes = self.n_chars_dim + # Convert the string dimension length (i.e. bytes) to a sufficiently-long + # string width, depending on the (read) encoding used. + encoding = self.read_encoding + n_chars = _ENCODING_WIDTH_TRANSLATIONS[encoding].nbytes_2_nchars(n_bytes) + return n_chars + + def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray: + if self.is_chardata: + # N.B. read encoding default is UTF-8 --> a "usually safe" choice + encoding = self.read_encoding + strlen = self.string_width + data = decode_bytesarray_to_stringarray( + data, encoding, strlen, self.varname + ) + + return data + + def encode_strings_as_bytearray(self, data: np.ndarray) -> np.ndarray: + if self.is_chardata and data.dtype.kind == "U": + # N.B. it is also possible to pass a byte array (dtype "S1"), + # to be written directly, without processing. + # N.B. 
write encoding *default* is "ascii" --> fails bad content + encoding = self.write_encoding + strlen = self.n_chars_dim + data = encode_stringarray_as_bytearray(data, encoding, strlen, self.varname) + + return data + + +class NetcdfStringDecodeSetting(threading.local): + def __init__(self, perform_decoding: bool = True): + self.set(perform_decoding) + + def set(self, perform_decoding: bool): + self.perform_decoding = perform_decoding + + def __bool__(self): + return self.perform_decoding + + @contextlib.contextmanager + def context(self, perform_decoding: bool): + old_setting = self.perform_decoding + self.perform_decoding = perform_decoding + try: + yield + finally: + self.perform_decoding = old_setting + + +DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting() +DEFAULT_READ_ENCODING = "utf-8" +DEFAULT_WRITE_ENCODING = "ascii" + + +@dataclasses.dataclass +class EncodingWidthRelations: + """Encode the default string-width <-> byte-dimension relations. + + These translations are just a "best guess"... + + When translating bytes (dtype S1) to strings (dtype Uxx), the chosen (default) + string width may be longer than is needed for the actual content. But it is at + least "safe". + + When translating strings to bytes, we *can* get more bytes than the default + byte dimension length, and the code will then raise an error + ( a TranslationError : see 'encode_stringarray_as_bytearray' ). + This can be avoided if necessary, in specific cases, by recasting the data to a + dtype with greater width (Uxx). 
+ """ + + nchars_2_nbytes: Callable[[int], int] + nbytes_2_nchars: Callable[[int], int] + + +_ENCODING_WIDTH_TRANSLATIONS = { + "ascii": EncodingWidthRelations(lambda x: x, lambda x: x), + "utf-8": EncodingWidthRelations(lambda x: x, lambda x: x), + "utf-16": EncodingWidthRelations( + nchars_2_nbytes=lambda x: (x + 1) * 2, + nbytes_2_nchars=lambda x: x // 2 - 1, + ), + "utf-32": EncodingWidthRelations( + nchars_2_nbytes=lambda x: (x + 1) * 4, + nbytes_2_nchars=lambda x: x // 4 - 1, + ), +} +SUPPORTED_ENCODINGS = list(_ENCODING_WIDTH_TRANSLATIONS.keys()) + + +def _identify_encoding(encoding, var_name: str, writing: bool = False) -> str: + """Normalise an encoding name + check it is supported. + + Parameters + ---------- + encoding : Any + Select an encoding : None, or a string, or anything printable (via str()). + var_name : str + Name of the relevant dataset variable (i.e. 'var_name') : + used only to produce warning messages. + writing : bool + Specify whether reading or writing, which affects any *default* return value, + i.e. select between DEFAULT_READ_ENCODING / DEFAULT_WRITE_ENCODING. + + If given, and supported, return a normalised encoding name, + -- i.e. always one of SUPPORTED_ENCODINGS. + If not given, or not supported, return the default encoding name. + + If given **but not recognised/supported**, also emit a warning (and return default). + """ + if encoding is not None: + encoding = str(encoding) + + result: str | None = None # not yet 'found' : we will never *return* this + + if encoding is not None: + # Normalise the name : NB must recognised by Python "codecs". + try: + result = codecs.lookup(encoding).name + except LookupError: + pass + + if result and result not in SUPPORTED_ENCODINGS: + # Python "codecs" recognised it, but we don't support it. 
+ result = None + + if result is None: + # Unrecognised encoding name : handle this as just a warning + msg = ( + f"Ignoring unsupported encoding for netCDF variable {var_name!r}: " + f"_Encoding = {encoding!r}, is not recognised as one of the supported " + f"encodings, {SUPPORTED_ENCODINGS}." + ) + warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning + warnings.warn(msg, category=warntype) + + if result is None: + if writing: + result = DEFAULT_WRITE_ENCODING + else: + result = DEFAULT_READ_ENCODING + + return result + + +class Mixin_Block_AutoChartostring: + # Adjusted support for "set_auto_chartostring", for all of variable/group/dataset. + def set_auto_chartostring(self, onoff: bool): + # Though the concept doesn't really apply, support the method for simplicity's + # sake, but forbid turning it *on*. + if onoff: + msg = '"auto_chartostring" is not supported by Iris EncodedDataset\'s.' + raise TypeError(msg) + + +class EncodedVariable(Mixin_Block_AutoChartostring, VariableWrapper): + """A variable wrapper that translates variable data according to byte encodings.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Override specific properties of the contained instance, making changes in the case + # that the variable contains char data, which is presented instead as strings + # with one less dimension. 
+ + @property + def shape(self): + shape = self._contained_instance.shape + is_chardata = np.issubdtype(self._contained_instance.dtype, np.bytes_) + if is_chardata: + # Translated char data appears without the final dimension + shape = shape[:-1] # remove final dimension + return shape + + @property + def dimensions(self): + dimensions = self._contained_instance.dimensions + is_chardata = np.issubdtype(self._contained_instance.dtype, np.bytes_) + if is_chardata: + # Translated char data appears without the final dimension + dimensions = dimensions[:-1] # remove final dimension + return dimensions + + @property + def dtype(self): + dtype = self._contained_instance.dtype + is_chardata = np.issubdtype(self._contained_instance.dtype, np.bytes_) + if is_chardata: + # Create a coding spec : redo every time in case "_Encoding" has changed + encoding_spec = VariableEncoder(self._contained_instance) + dtype = np.dtype(f"U{encoding_spec.string_width}") + return dtype + + def __getitem__(self, keys): + self._contained_instance.set_auto_chartostring(False) + data = super().__getitem__(keys) + # Create a coding spec : redo every time in case "_Encoding" has changed + encoding_spec = VariableEncoder(self._contained_instance) + data = encoding_spec.decode_bytes_to_stringarray(data) + return data + + def __setitem__(self, keys, data): + data = np.asanyarray(data) + # Create a coding spec : redo every time in case "_Encoding" has changed + encoding_spec = VariableEncoder(self._contained_instance) + data = encoding_spec.encode_strings_as_bytearray(data) + super().__setitem__(keys, data) + + +class EncodedGroup(Mixin_Block_AutoChartostring, GroupWrapper): + """A specialised GroupWrapper whose variables are EncodedVariables.""" + + VAR_WRAPPER_CLS = EncodedVariable + GRP_WRAPPER_CLS: Any | None = None + + +EncodedGroup.GRP_WRAPPER_CLS = EncodedGroup + + +class EncodedDataset(Mixin_Block_AutoChartostring, DatasetWrapper): + """A specialised DatasetWrapper. 
+ + Its groups are EncodedGroups and variables are EncodedVariables. + """ + + VAR_WRAPPER_CLS = EncodedVariable + GRP_WRAPPER_CLS = EncodedGroup + + +class EncodedNetCDFDataProxy(NetCDFDataProxy): + __slots__ = NetCDFDataProxy.__slots__ + ("encoding_details",) + + def __init__(self, cf_var, *args, **kwargs): + # When creating, also capture + record the encoding to be performed. + kwargs["use_byte_data"] = True + super().__init__(cf_var, *args, **kwargs) + if not isinstance(cf_var, EncodedVariable): + msg = ( + f"Unexpected variable type : {type(cf_var)} of variable '{cf_var.name}'" + ": expected EncodedVariable." + ) + raise TypeError(msg) + self.encoding_details = VariableEncoder(cf_var._contained_instance) + + def __getitem__(self, keys): + data = super().__getitem__(keys) + # Apply the optional bytes-to-strings conversion + data = self.encoding_details.decode_bytes_to_stringarray(data) + return data + + +class EncodedNetCDFWriteProxy(NetCDFWriteProxy): + def __init__(self, filepath, cf_var, file_write_lock): + super().__init__(filepath, cf_var, file_write_lock) + self.encoding_details = VariableEncoder(cf_var._contained_instance) + + def __setitem__(self, key, data): + data = np.asanyarray(data) + # Apply the optional strings-to-bytes conversion + data = self.encoding_details.encode_strings_as_bytearray(data) + super().__setitem__(key, data) diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 33183ef0fa..e5d6fdd0f1 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -106,7 +106,14 @@ class DimensionWrapper(_ThreadSafeWrapper): _DUCKTYPE_CHECK_PROPERTIES = ["isunlimited"] -class VariableWrapper(_ThreadSafeWrapper): +class ThreadSafeWrapper_With_AutoChartostring(_ThreadSafeWrapper): + # A method supported by all of variables/groups/datasets. 
+ def set_auto_chartostring(self, onoff: bool): + with _GLOBAL_NETCDF4_LOCK: + self._contained_instance.set_auto_chartostring(onoff) + + +class VariableWrapper(ThreadSafeWrapper_With_AutoChartostring): """Accessor for a netCDF4.Variable, always acquiring _GLOBAL_NETCDF4_LOCK. All API calls should be identical to those for netCDF4.Variable. @@ -150,7 +157,7 @@ def get_dims(self, *args, **kwargs) -> typing.Tuple[DimensionWrapper]: return tuple([DimensionWrapper.from_existing(d) for d in dimensions_]) -class GroupWrapper(_ThreadSafeWrapper): +class GroupWrapper(ThreadSafeWrapper_With_AutoChartostring): """Accessor for a netCDF4.Group, always acquiring _GLOBAL_NETCDF4_LOCK. All API calls should be identical to those for netCDF4.Group. @@ -159,6 +166,10 @@ class GroupWrapper(_ThreadSafeWrapper): CONTAINED_CLASS = netCDF4.Group # Note: will also accept a whole Dataset object, but that is OK. _DUCKTYPE_CHECK_PROPERTIES = ["createVariable"] + # Class to use when creating variable wrappers (default=VariableWrapper). + # - needed to support _bytecoding_datasets.EncodedDataset. + VAR_WRAPPER_CLS = VariableWrapper + GRP_WRAPPER_CLS: typing.Any | None = None # self-reference : fill in later # All Group API that returns Dimension(s) is wrapped to instead return # DimensionWrapper(s). @@ -203,7 +214,7 @@ def variables(self) -> typing.Dict[str, VariableWrapper]: """ with _GLOBAL_NETCDF4_LOCK: variables_ = self._contained_instance.variables - return {k: VariableWrapper.from_existing(v) for k, v in variables_.items()} + return {k: self.VAR_WRAPPER_CLS.from_existing(v) for k, v in variables_.items()} def createVariable(self, *args, **kwargs) -> VariableWrapper: """Call createVariable() from netCDF4.Group/Dataset within _GLOBAL_NETCDF4_LOCK. 
@@ -216,7 +227,7 @@ def createVariable(self, *args, **kwargs) -> VariableWrapper: """ with _GLOBAL_NETCDF4_LOCK: new_variable = self._contained_instance.createVariable(*args, **kwargs) - return VariableWrapper.from_existing(new_variable) + return self.VAR_WRAPPER_CLS.from_existing(new_variable) def get_variables_by_attributes( self, *args, **kwargs @@ -234,7 +245,7 @@ def get_variables_by_attributes( variables_ = list( self._contained_instance.get_variables_by_attributes(*args, **kwargs) ) - return [VariableWrapper.from_existing(v) for v in variables_] + return [self.VAR_WRAPPER_CLS.from_existing(v) for v in variables_] # All Group API that returns Group(s) is wrapped to instead return # GroupWrapper(s). @@ -252,7 +263,7 @@ def groups(self): """ with _GLOBAL_NETCDF4_LOCK: groups_ = self._contained_instance.groups - return {k: GroupWrapper.from_existing(v) for k, v in groups_.items()} + return {k: self.GRP_WRAPPER_CLS.from_existing(v) for k, v in groups_.items()} @property def parent(self): @@ -268,7 +279,7 @@ def parent(self): """ with _GLOBAL_NETCDF4_LOCK: parent_ = self._contained_instance.parent - return GroupWrapper.from_existing(parent_) + return self.GRP_WRAPPER_CLS.from_existing(parent_) def createGroup(self, *args, **kwargs): """Call createGroup() from netCDF4.Group/Dataset. 
@@ -281,7 +292,10 @@ def createGroup(self, *args, **kwargs): """ with _GLOBAL_NETCDF4_LOCK: new_group = self._contained_instance.createGroup(*args, **kwargs) - return GroupWrapper.from_existing(new_group) + return self.GRP_WRAPPER_CLS.from_existing(new_group) + + +GroupWrapper.GRP_WRAPPER_CLS = GroupWrapper class DatasetWrapper(GroupWrapper): @@ -311,14 +325,22 @@ def fromcdl(cls, *args, **kwargs): class NetCDFDataProxy: """A reference to the data payload of a single NetCDF file variable.""" - __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value") - - def __init__(self, shape, dtype, path, variable_name, fill_value): - self.shape = shape + __slots__ = ( + "shape", + "dtype", + "path", + "variable_name", + "fill_value", + "use_byte_data", + ) + + def __init__(self, cf_var, dtype, path, fill_value, *, use_byte_data=False): + self.shape = cf_var.shape + self.variable_name = cf_var.name self.dtype = dtype self.path = path - self.variable_name = variable_name self.fill_value = fill_value + self.use_byte_data = use_byte_data @property def ndim(self): @@ -337,6 +359,8 @@ def __getitem__(self, keys): dataset = netCDF4.Dataset(self.path) try: variable = dataset.variables[self.variable_name] + if self.use_byte_data: + variable.set_auto_chartostring(False) # Get the NetCDF variable data and slice. 
var = variable[keys] finally: diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index 85cb147796..044b026da5 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -41,7 +41,7 @@ import iris.coord_systems import iris.coords import iris.fileformats.cf -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets, _thread_safe_nc from iris.fileformats.netcdf.saver import _CF_ATTRS import iris.io import iris.util @@ -55,7 +55,11 @@ # An expected part of the public loader API, but includes thread safety # concerns so is housed in _thread_safe_nc. -NetCDFDataProxy = _thread_safe_nc.NetCDFDataProxy +# NOTE: this is the *default*, as required for public legacy api +# - in practice, when creating our proxies we dynamically choose between this and +# :class:`_thread_safe_nc.DatasetWrapper`, depending on +# :data:`_bytecoding_datasets.DECODE_TO_STRINGS_ON_READ` +NetCDFDataProxy = _bytecoding_datasets.EncodedNetCDFDataProxy class _WarnComboIgnoringBoundsLoad( @@ -284,7 +288,7 @@ def _get_cf_var_data(cf_var): # correct dtype. Note: this is not an issue for masked arrays, # only masked scalar values. if result is np.ma.masked: - result = np.ma.masked_all(1, dtype=cf_var.datatype) + result = np.ma.masked_all(1, dtype=cf_var.dtype) else: # Get lazy chunked data out of a cf variable. # Creates Dask wrappers around data arrays for any cube components which @@ -294,15 +298,27 @@ def _get_cf_var_data(cf_var): # Make a data-proxy that mimics array access and can fetch from the file. # Note: Special handling needed for "variable length string" types which # return a dtype of `str`, rather than a numpy type; use `S1` in this case. 
- fill_dtype = "S1" if cf_var.dtype is str else cf_var.dtype.str[1:] - fill_value = getattr( - cf_var.cf_data, - "_FillValue", - _thread_safe_nc.default_fillvals[fill_dtype], - ) - proxy = NetCDFDataProxy( - cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value - ) + if getattr(cf_var.dtype, "kind", None) == "U": + # Special handling for "string variables". + fill_value = "" + else: + fill_dtype = "S1" if cf_var.dtype is str else cf_var.dtype.str[1:] + fill_value = getattr( + cf_var.cf_data, + "_FillValue", + _thread_safe_nc.default_fillvals[fill_dtype], + ) + + # Switch type of proxy, based on type of variable. + # It is done this way, instead of using an instance variable, because the + # limited nature of the wrappers makes a stateful choice awkward, + # e.g. especially, "variable.group()" is *not* the parent DatasetWrapper. + if isinstance(cf_var.cf_data, _bytecoding_datasets.EncodedVariable): + proxy_class = _bytecoding_datasets.EncodedNetCDFDataProxy + else: + proxy_class = _thread_safe_nc.NetCDFDataProxy + + proxy = proxy_class(cf_var.cf_data, dtype, cf_var.filename, fill_value) # Get the chunking specified for the variable : this is either a shape, or # maybe the string "contiguous". 
if CHUNK_CONTROL.mode is ChunkControl.Modes.AS_DASK: diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 98c565e990..1972dae567 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -19,6 +19,7 @@ """ +import codecs import collections from itertools import repeat, zip_longest import os @@ -53,7 +54,9 @@ from iris.coords import AncillaryVariable, AuxCoord, CellMeasure, DimCoord import iris.exceptions import iris.fileformats.cf -from iris.fileformats.netcdf import _dask_locks, _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets as bytecoding_datasets +from iris.fileformats.netcdf import _dask_locks +from iris.fileformats.netcdf import _thread_safe_nc as threadsafe_nc from iris.fileformats.netcdf._attribute_handlers import ATTRIBUTE_HANDLERS import iris.io import iris.util @@ -305,7 +308,7 @@ class VariableEmulator(typing.Protocol): shape: tuple[int, ...] -CFVariable = typing.Union[_thread_safe_nc.VariableWrapper, VariableEmulator] +CFVariable = typing.Union[bytecoding_datasets.VariableWrapper, VariableEmulator] class Saver: @@ -408,7 +411,7 @@ def __init__(self, filename, netcdf_format, compute=True): # Put it inside a _thread_safe_nc wrapper to ensure thread-safety. # Except if it already is one, since they forbid "re-wrapping". 
if not hasattr(self._dataset, "THREAD_SAFE_FLAG"): - self._dataset = _thread_safe_nc.DatasetWrapper.from_existing( + self._dataset = bytecoding_datasets.EncodedDataset.from_existing( self._dataset ) @@ -419,7 +422,7 @@ def __init__(self, filename, netcdf_format, compute=True): # Given a filepath string/path : create a dataset from that try: self.filepath = Path(filename).absolute() - self._dataset = _thread_safe_nc.DatasetWrapper( + self._dataset = bytecoding_datasets.EncodedDataset( self.filepath, mode="w", format=netcdf_format ) except RuntimeError: @@ -764,7 +767,7 @@ def _create_cf_dimensions(self, cube, dimension_names, unlimited_dimensions=None # used for a different one pass else: - dim_name = self._get_coord_variable_name(cube, coord) + dim_name = self._get_element_variable_name(cube, coord) unlimited_dim_names.append(dim_name) for dim_name in dimension_names: @@ -995,12 +998,12 @@ def _add_aux_coords( ] # Include any relevant mesh location coordinates. - mesh: MeshXY | None = getattr(cube, "mesh") - mesh_location: str | None = getattr(cube, "location") + mesh: MeshXY | None = getattr(cube, "mesh") # type: ignore[annotation-unchecked] + mesh_location: str | None = getattr(cube, "location") # type: ignore[annotation-unchecked] if mesh and mesh_location: location_coords: MeshNodeCoords | MeshEdgeCoords | MeshFaceCoords = getattr( mesh, f"{mesh_location}_coords" - ) + ) # type: ignore[annotation-unchecked] coords_to_add.extend(list(location_coords)) return self._add_inner_related_vars( @@ -1370,7 +1373,7 @@ def record_dimension(names_list, dim_name, length, matching_coords=None): if dim_name is None: # Not already present : create a unique dimension name # from the coord. - dim_name = self._get_coord_variable_name(cube, coord) + dim_name = self._get_element_variable_name(cube, coord) # Disambiguate if it has the same name as an # existing dimension. # OR if it matches an existing file variable name. 
@@ -1546,38 +1549,14 @@ def _create_cf_bounds(self, coord, cf_var, cf_name, /, *, compression_kwargs=Non ) self._lazy_stream_data(data=bounds, cf_var=cf_var_bounds) - def _get_cube_variable_name(self, cube): - """Return a CF-netCDF variable name for the given cube. - - Parameters - ---------- - cube : :class:`iris.cube.Cube` - An instance of a cube for which a CF-netCDF variable - name is required. - - Returns - ------- - str - A CF-netCDF variable name as a string. - - """ - if cube.var_name is not None: - cf_name = cube.var_name - else: - # Convert to lower case and replace whitespace by underscores. - cf_name = "_".join(cube.name().lower().split()) - - cf_name = self.cf_valid_var_name(cf_name) - return cf_name - - def _get_coord_variable_name(self, cube_or_mesh, coord): - """Return a CF-netCDF variable name for a given coordinate-like element. + def _get_element_variable_name(self, cube_or_mesh, element): + """Return a CF-netCDF variable name for a given coordinate-like element, or cube. Parameters ---------- cube_or_mesh : :class:`iris.cube.Cube` or :class:`iris.mesh.MeshXY` The Cube or Mesh being saved to the netCDF file. - coord : :class:`iris.coords._DimensionalMetadata` + element : :class:`iris.coords._DimensionalMetadata` | :class:``iris.cube.Cube`` An instance of a coordinate (or similar), for which a CF-netCDF variable name is required. @@ -1597,17 +1576,21 @@ def _get_coord_variable_name(self, cube_or_mesh, coord): cube = None mesh = cube_or_mesh - if coord.var_name is not None: - cf_name = coord.var_name + if element.var_name is not None: + cf_name = element.var_name + elif isinstance(element, Cube): + # Make name for a Cube without a var_name. 
+ cf_name = "_".join(element.name().lower().split()) else: - name = coord.standard_name or coord.long_name + # Make name for a Coord-like element without a var_name + name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.coords(coord): + if cube is not None and cube.coords(element): # It is a regular cube coordinate. # Auto-generate a name based on the dims. name = "" - for dim in cube.coord_dims(coord): + for dim in cube.coord_dims(element): name += f"dim{dim}" # Handle scalar coordinate (dims == ()). if not name: @@ -1621,8 +1604,8 @@ def _get_coord_variable_name(self, cube_or_mesh, coord): # At present, a location-coord cannot be nameless, as the # MeshXY code relies on guess_coord_axis. - assert isinstance(coord, Connectivity) - location = coord.cf_role.split("_")[0] + assert isinstance(element, Connectivity) + location = element.cf_role.split("_")[0] location_dim_attr = f"{location}_dimension" name = getattr(mesh, location_dim_attr) @@ -1698,6 +1681,8 @@ def _create_mesh(self, mesh): return cf_mesh_name def _set_cf_var_attributes(self, cf_var, element): + from iris.cube import Cube + # Deal with CF-netCDF units, and add the name+units properties. if isinstance(element, iris.coords.Coord): # Fix "degree" units if needed. 
@@ -1705,34 +1690,62 @@ def _set_cf_var_attributes(self, cf_var, element): else: units_str = str(element.units) - if cf_units.as_unit(units_str).is_udunits(): - _setncattr(cf_var, "units", units_str) - - standard_name = element.standard_name - if standard_name is not None: - _setncattr(cf_var, "standard_name", standard_name) - - long_name = element.long_name - if long_name is not None: - _setncattr(cf_var, "long_name", long_name) + # NB this bit is a nasty hack to preserve existing behaviour through a refactor: + # The attributes for Coords are created in the order units, standard_name, + # whereas for data-variables (aka Cubes) it is the other way around. + # Needed now that this routine is also called from _create_cf_data_variable. + # TODO: when we can break things, rationalise these to be the same. + def add_units_attr(): + if cf_units.as_unit(units_str).is_udunits(): + _setncattr(cf_var, "units", units_str) + + def add_names_attrs(): + standard_name = element.standard_name + if standard_name is not None: + _setncattr(cf_var, "standard_name", standard_name) + + long_name = element.long_name + if long_name is not None: + _setncattr(cf_var, "long_name", long_name) + + if isinstance(element, Cube): + add_names_attrs() + add_units_attr() + else: + add_units_attr() + add_names_attrs() # Add the CF-netCDF calendar attribute. if element.units.calendar: _setncattr(cf_var, "calendar", str(element.units.calendar)) - # Add any other custom coordinate attributes. - for name in sorted(element.attributes): - value = element.attributes[name] - - if name == "STASH": - # Adopting provisional Metadata Conventions for representing MO - # Scientific Data encoded in NetCDF Format. - name = "um_stash_source" - value = str(value) - - # Don't clobber existing attributes. 
- if not hasattr(cf_var, name): - _setncattr(cf_var, name, value) + # Take a copy so we can remove things + element_attrs = element.attributes.copy() + + # Note: when writing UGRID, "element" can be a Mesh which has no "dtype", + # and for dataless cubes it will have a 'None' dtype. + if getattr(element, "dtype", None) is not None: + # Most attributes are dealt with later. But _Encoding needs to be defined + # *before* we can write to a character variable. + if element.dtype.kind in "SU" and "_Encoding" in element_attrs: + encoding = element_attrs.pop("_Encoding") + _setncattr(cf_var, "_Encoding", encoding) + + if not isinstance(element, Cube): + # Add any other custom coordinate attributes. + # N.B. not Cube, which has specific handling in _create_cf_data_variable + for name in sorted(element_attrs): + value = element_attrs[name] + + if name == "STASH": + # Adopting provisional Metadata Conventions for representing MO + # Scientific Data encoded in NetCDF Format. + name = "um_stash_source" + value = str(value) + + # Don't clobber existing attributes. + if not hasattr(cf_var, name): + _setncattr(cf_var, name, value) def _create_generic_cf_array_var( self, @@ -1744,6 +1757,8 @@ def _create_generic_cf_array_var( element_dims=None, fill_value=None, compression_kwargs=None, + packing_controls: dict | None = None, + is_dataless=False, ): """Create theCF-netCDF variable given dimensional_metadata. @@ -1796,7 +1811,7 @@ def _create_generic_cf_array_var( # Work out the var-name to use. # N.B. the only part of this routine that may use a mesh _or_ a cube. - cf_name = self._get_coord_variable_name(cube_or_mesh, element) + cf_name = self._get_element_variable_name(cube_or_mesh, element) while cf_name in self._dataset.variables: cf_name = self._increment_name(cf_name) @@ -1809,18 +1824,46 @@ def _create_generic_cf_array_var( # Get the data values, in a way which works for any element type, as # all are subclasses of _DimensionalMetadata. # (e.g. 
=points if a coord, =data if an ancillary, etc) - data = element._core_values() + if isinstance(element, Cube): + data = element.core_data() + else: + data = element._core_values() # This compression contract is *not* applicable to a mesh. - if cube and cube.shape != data.shape: + if cube is not None and data is not None and cube.shape != data.shape: compression_kwargs = {} - if np.issubdtype(data.dtype, np.str_): - # Deal with string-type variables. + if not is_dataless and data.dtype.kind == "U": + # Deal with unicode-string-type variables. # Typically CF label variables, but also possibly ancil-vars ? + + # NOTE: all we are doing here is to calculate the byte dimension length, + # based on the dtype and any encoding attribute. + # The actual char --> byte data *translation* is done by the variable, + # being a _bytecoding_datasets.EncodedVariable. string_dimension_depth = data.dtype.itemsize - if data.dtype.kind == "U": - string_dimension_depth //= 4 + + # String content (U) instead of bytes (S). + # For numpy strings, itemsize is **always** a multiple of 4 + if string_dimension_depth % 4 != 0: + msg = ( + "Unexpected numpy string 'dtype.itemsize' for element " + f"{cube_or_mesh.name()!r}: " + f"'dtype.itemsize = {string_dimension_depth}, expected " + "a multiple of four (always)." + ) + raise ValueError(msg) + nchars = string_dimension_depth // 4 + + encoding_attr = element.attributes.get("_Encoding", "ascii") + # Look this up + return a supported encoding name + # NB implements defaults and raises a warning if given not recognised. + encoding = bytecoding_datasets._identify_encoding( + encoding=encoding_attr, var_name=cf_name, writing=True + ) + width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding] + string_dimension_depth = width_fns.nchars_2_nbytes(nchars) + string_dimension_name = "string%d" % string_dimension_depth # Determine whether to create the string length dimension. 
@@ -1838,29 +1881,39 @@ def _create_generic_cf_array_var( # Create the label coordinate variable. cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims) - - # Convert data from an array of strings into a character array - # with an extra string-length dimension. - if len(element_dims) == 1: - data_first = data[0] - if is_lazy_data(data_first): - data_first = dask.compute(data_first) - data = list("%- *s" % (string_dimension_depth, data_first)) - else: - orig_shape = data.shape - new_shape = orig_shape + (string_dimension_depth,) - new_data = np.zeros(new_shape, cf_var.dtype) - for index in np.ndindex(orig_shape): - index_slice = tuple(list(index) + [slice(None, None)]) - new_data[index_slice] = list( - "%- *s" % (string_dimension_depth, data[index]) - ) - data = new_data else: - # A normal (numeric) variable. + # A non-string variable. # ensure a valid datatype for the file format. - element_type = type(element).__name__ - data = self._ensure_valid_dtype(data, element_type, element) + if is_dataless: + dtype = self._DATALESS_DTYPE + fill_value = self._DATALESS_FILLVALUE + else: + # Normal non-string data. + # NOTE: this includes byte-arrays (S1 only) : however these must + # use an actual cube dimension for the 'string dimension', which + # seriously limits the utility of DECODE_TO_STRINGS_ON_READ. + # TODO: also support netCDF variable-length strings ("string" type). + # Currently hit a **write error here**, being numpy object dtype ("O"). + if data.dtype.kind not in "iufSU" or ( + data.dtype.kind == "S" and data.dtype.itemsize != 1 + ): + # This is a type of data we don't "understand". + # NB this includes "Sxx" types other than "S1" : It seems that + # netCDF4 saves Sxx as variable-length strings. + # But we don't support that type in Iris. + msg = ( + f"Variable {cf_name!r} has unexpected dtype, {data.dtype!r}." + f"Data content arrays must be numeric, or contain " + "single-bytes (dtype 'S1'), or unicode strings (dtype 'U')." 
+ ) + raise ValueError(msg) + + element_type = type(element).__name__ + data = self._ensure_valid_dtype(data, element_type, element) + if not packing_controls: + dtype = data.dtype.newbyteorder("=") + else: + dtype = packing_controls["dtype"] # Check if this is a dim-coord. is_dimcoord = cube is not None and element in cube.dim_coords @@ -1874,7 +1927,7 @@ def _create_generic_cf_array_var( # Create the CF-netCDF variable. cf_var = self._dataset.createVariable( cf_name, - data.dtype.newbyteorder("="), + dtype, element_dims, fill_value=fill_value, **compression_kwargs, @@ -1891,12 +1944,18 @@ def _create_generic_cf_array_var( element, cf_var, cf_name, compression_kwargs=compression_kwargs ) - # Add the data to the CF-netCDF variable. - self._lazy_stream_data(data=data, cf_var=cf_var) - # Add names + units + # NOTE: *must* now do first, as we may need '_Encoding' set to write it ! self._set_cf_var_attributes(cf_var, element) + # Add the data to the CF-netCDF variable. + if not is_dataless: + if packing_controls: + # We must set packing attributes (if any), before assigning values. + for key, value in packing_controls["attributes"]: + _setncattr(cf_var, key, value) + self._lazy_stream_data(data=data, cf_var=cf_var) + return cf_name def _create_cf_cell_methods(self, cube, dimension_names): @@ -2243,9 +2302,9 @@ def _create_cf_grid_mapping(self, cube, cf_var_cube): cfvar = self._name_coord_map.name(coord) if not cfvar: # not found - create and store it: - cfvar = self._get_coord_variable_name(cube, coord) + cfvar = self._get_element_variable_name(cube, coord) self._name_coord_map.append( - cfvar, self._get_coord_variable_name(cube, coord) + cfvar, self._get_element_variable_name(cube, coord) ) cfvar_names.append(cfvar) @@ -2325,18 +2384,10 @@ def _create_cf_data_variable( # be removed. # Get the values in a form which is valid for the file format. 
is_dataless = cube.is_dataless() - if is_dataless: - data = None - else: - data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) - if is_dataless: - # The variable must have *some* dtype, and it must be maskable - dtype = self._DATALESS_DTYPE - fill_value = self._DATALESS_FILLVALUE - elif not packing: - dtype = data.dtype.newbyteorder("=") - else: + packing_controls = None + if packing and not is_dataless: + data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) if isinstance(packing, dict): if "dtype" not in packing: msg = "The dtype attribute is required for packing." @@ -2375,45 +2426,29 @@ def _create_cf_data_variable( else: add_offset = cmin + 2 ** (n - 1) * scale_factor - def set_packing_ncattrs(cfvar): - """Set netCDF packing attributes. - - NOTE: cfvar needs to be a _thread_safe_nc._ThreadSafeWrapper subclass. - - """ - assert hasattr(cfvar, "THREAD_SAFE_FLAG") - if packing: - if scale_factor: - _setncattr(cfvar, "scale_factor", scale_factor) - if add_offset: - _setncattr(cfvar, "add_offset", add_offset) - - cf_name = self._get_cube_variable_name(cube) - while cf_name in self._dataset.variables: - cf_name = self._increment_name(cf_name) + packing_controls = { + "dtype": dtype, + "attributes": [ + ("scale_factor", scale_factor), + ("add_offset", add_offset), + ], + } # Create the cube CF-netCDF data variable with data payload. 
- cf_var = self._dataset.createVariable( - cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs + cf_name = self._create_generic_cf_array_var( + cube, + dimension_names, + cube, + element_dims=dimension_names, + fill_value=fill_value, + compression_kwargs=kwargs, + packing_controls=packing_controls, + is_dataless=is_dataless, ) + cf_var = self._dataset.variables[cf_name] - if not is_dataless: - set_packing_ncattrs(cf_var) - self._lazy_stream_data(data=data, cf_var=cf_var) - - if cube.standard_name: - _setncattr(cf_var, "standard_name", cube.standard_name) - - if cube.long_name: - _setncattr(cf_var, "long_name", cube.long_name) - - if cube.units.is_udunits(): - _setncattr(cf_var, "units", str(cube.units)) - - # Add the CF-netCDF calendar attribute. - if cube.units.calendar: - _setncattr(cf_var, "calendar", cube.units.calendar) - + # Set general attrs: NB this part is cube-specific (not the same for components) + # - so 'set_cf_var_attributes' *doesn't* set these, if element is a Cube if iris.FUTURE.save_split_attrs: attr_names = cube.attributes.locals.keys() else: @@ -2509,7 +2544,7 @@ def _increment_name(self, varname): def _lazy_stream_data( self, data: np.typing.ArrayLike, - cf_var: CFVariable, + cf_var: threadsafe_nc.VariableWrapper, ) -> None: if hasattr(data, "shape") and data.shape == (1,) + cf_var.shape: # (Don't do this check for string data). @@ -2536,11 +2571,13 @@ def _lazy_stream_data( # later by a call to delayed_completion(). def store( data: np.typing.ArrayLike, - cf_var: CFVariable, + cf_var: threadsafe_nc.VariableWrapper, ) -> None: # Create a data-writeable object that we can stream into, which # encapsulates the file to be opened + variable to be written. - write_wrapper = _thread_safe_nc.NetCDFWriteProxy( + # Note: we do *not* support selectable string encoding for writes, + # so this never needs to be a _thread_safe_nc.NetCDFWriteProxy. 
+ write_wrapper = bytecoding_datasets.EncodedNetCDFWriteProxy( self.filepath, cf_var, self.file_write_lock ) # Add to the list of delayed writes, used in delayed_completion(). @@ -2550,7 +2587,7 @@ def store( # Real data is always written directly, i.e. not via lazy save. def store( data: np.typing.ArrayLike, - cf_var: CFVariable, + cf_var: threadsafe_nc.VariableWrapper, ) -> None: cf_var[:] = data # type: ignore[index] diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py new file mode 100644 index 0000000000..b6aa6dfe3d --- /dev/null +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -0,0 +1,716 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Integration tests for various uses of character/string arrays in netcdf file variables. + +This covers both the loading and saving of variables which are the content of +data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures. 
+""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +import numpy as np +from numpy.typing import ArrayLike +import pytest + +import iris +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube +import iris.exceptions +from iris.fileformats.netcdf import ( + DECODE_TO_STRINGS_ON_READ, + SUPPORTED_ENCODINGS, + _thread_safe_nc, +) + + +@pytest.fixture(scope="module") +def all_lazy_auxcoords(): + """Ensure that *all* aux-coords are loaded lazily, even really small ones.""" + old_minlazybytes = iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES + iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES = 0 + yield + iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES = old_minlazybytes + + +N_XDIM = 3 +N_CHARS_DIM = 64 + +NO_ENCODING_STR = "" +ALIAS_UTF8_STR = "UTF8" # an alternative acceptable form (should be written as-is) +TEST_ENCODINGS = [NO_ENCODING_STR, ALIAS_UTF8_STR] + SUPPORTED_ENCODINGS + + +# Common fixture to save with split-attrs ONLY in these tests +@pytest.fixture(scope="module", autouse=True) +def all_split_attrs(): + with iris.FUTURE.context(save_split_attrs=True): + yield + + +# +# Routines to convert between byte and string arrays. +# Independently defined here, to avoid relying on any code we are testing. +# +def convert_strings_to_chararray( + string_array_1d: ArrayLike, maxlen: int, encoding: str | None = None +) -> np.ndarray: + # Note: this is limited to 1-D arrays of strings. + # Could generalise that if needed, but for now this makes it simpler. + if encoding is None: + encoding = "ascii" + bbytes = [text.encode(encoding) for text in string_array_1d] + pad = b"\0" * maxlen + bbytes = [(x + pad)[:maxlen] for x in bbytes] + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) + return chararray + + +def convert_bytearray_to_strings( + byte_array: ArrayLike, encoding: str = "utf-8", string_length: int | None = None +) -> np.ndarray: + """Convert bytes to strings. 
+ + N.B. for now at least, we assume the string dim is **always the last one**. + """ + byte_array = np.asanyarray(byte_array) + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b or b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +@dataclass +class SamplefileDetails: + """Convenience container for information about a sample file.""" + + filepath: Path + datavar_data: ArrayLike + datavar_bytes: ArrayLike + stringcoord_data: ArrayLike + stringcoord_bytes: ArrayLike + numericcoord_data: ArrayLike + + +def make_testfile( + testfile_path: Path, + encoding_str: str, + coords_on_separate_dim: bool = False, + # If set, determines the "_Encoding" attrs content, including None --> no attr. + # Otherwise, they follow 'encoding_str', including NO_ENCODING_STR --> no attr. + encoding_attr: str | None = "", +) -> SamplefileDetails: + """Create a test netcdf file. + + Also returns content information for checking loaded results. 
+ """ + if encoding_str == NO_ENCODING_STR: + encoding = None + else: + encoding = encoding_str + + if encoding_attr == "": + encoding_attr = encoding + + data_is_ascii = encoding in (None, "ascii") + + numeric_values = np.arange(3.0) + if data_is_ascii: + coordvar_strings = ["mOnster", "London", "Amsterdam"] + datavar_strings = ["bun", "Eclair", "sandwich"] + else: + coordvar_strings = ["Münster", "London", "Amsterdam"] + datavar_strings = ["bun", "éclair", "sandwich"] + + coordvar_bytearray = convert_strings_to_chararray( + string_array_1d=coordvar_strings, maxlen=N_CHARS_DIM, encoding=encoding + ) + datavar_bytearray = convert_strings_to_chararray( + string_array_1d=datavar_strings, maxlen=N_CHARS_DIM, encoding=encoding + ) + + ds = _thread_safe_nc.DatasetWrapper(testfile_path, "w") + try: + ds.createDimension("x", N_XDIM) + ds.createDimension("nstr", N_CHARS_DIM) + if coords_on_separate_dim: + ds.createDimension("nstr2", N_CHARS_DIM) + v_xdim = ds.createVariable("x", int, dimensions=("x")) + v_xdim[:] = np.arange(N_XDIM) + + v_co = ds.createVariable( + "v_co", + "S1", + dimensions=( + "x", + "nstr2" if coords_on_separate_dim else "nstr", + ), + ) + v_co[:] = coordvar_bytearray + + if encoding_attr is not None: + v_co._Encoding = encoding_attr + + v_numeric = ds.createVariable( + "v_numeric", + float, + dimensions=("x",), + ) + v_numeric[:] = numeric_values + + v_datavar = ds.createVariable( + "v", + "S1", + dimensions=( + "x", + "nstr", + ), + ) + v_datavar[:] = datavar_bytearray + + if encoding_attr is not None: + v_datavar._Encoding = encoding_attr + + v_datavar.coordinates = "v_co v_numeric" + finally: + ds.close() + + return SamplefileDetails( + filepath=testfile_path, + datavar_data=datavar_strings, + datavar_bytes=datavar_bytearray, + stringcoord_data=coordvar_strings, + stringcoord_bytes=coordvar_bytearray, + numericcoord_data=numeric_values, + ) + + +@pytest.fixture(params=TEST_ENCODINGS) +def encoding(request): + return request.param + + +def 
load_problems_list(): + return [str(prob) for prob in iris.loading.LOAD_PROBLEMS.problems] + + +class TestReadEncodings: + """Test loading of testfiles with encoded string data.""" + + @pytest.fixture(autouse=True) + def _clear_load_problems(self): + iris.loading.LOAD_PROBLEMS.reset() + return + + @pytest.fixture(params=["coordsSameDim", "coordsOwnDim"]) + def use_separate_dims(self, request): + return request.param == "coordsOwnDim" + + @pytest.fixture + def readtest_path( + self, + encoding, + tmp_path, + use_separate_dims, + ) -> Iterable[SamplefileDetails]: + """Create a suitable valid testfile, and return expected string content.""" + if encoding == "": + filetag = "noencoding" + else: + filetag = encoding + dimtag = "diffdims" if use_separate_dims else "samedims" + tempfile_path = tmp_path / f"sample_stringdata_read_{filetag}_{dimtag}.nc" + return tempfile_path + + @pytest.fixture + def readtest_data( + self, + encoding, + readtest_path, + use_separate_dims, + ) -> SamplefileDetails: + """Create a suitable valid testfile, and return expected string content.""" + testdata = make_testfile( + testfile_path=readtest_path, + encoding_str=encoding, + coords_on_separate_dim=use_separate_dims, + ) + return testdata + + @pytest.fixture(params=["strings", "bytes"]) + def readmode(self, request): + return request.param + + def test_valid_encodings( + self, encoding, readtest_data: SamplefileDetails, readmode, use_separate_dims + ): + ( + testfile_path, + datavar_strings, + datavar_bytes, + coordvar_strings, + coordvar_bytes, + numeric_data, + ) = ( + readtest_data.filepath, + readtest_data.datavar_data, + readtest_data.datavar_bytes, + readtest_data.stringcoord_data, + readtest_data.stringcoord_bytes, + readtest_data.numericcoord_data, + ) + + if readmode == "bytes" and use_separate_dims == True: + msg = ( + "Unsupported load combination : character coordinates with a non-cube " + "string dimension can't attach to the cube, when read as bytes." 
+ ) + pytest.skip(msg) + + as_strings = readmode == "strings" + if as_strings: + # Regular load + cube = iris.load_cube(testfile_path) + expected_shape: tuple = (N_XDIM,) + else: + # Special NON-decoded read + with DECODE_TO_STRINGS_ON_READ.context(False): + cube = iris.load_cube(testfile_path) + expected_shape = (N_XDIM, N_CHARS_DIM) + + assert load_problems_list() == [] + assert cube.shape == expected_shape + + if as_strings: + if encoding == "utf-32": + expected_string_width = (N_CHARS_DIM // 4) - 1 + elif encoding == "utf-16": + expected_string_width = (N_CHARS_DIM) // 2 - 1 + else: + expected_string_width = N_CHARS_DIM + expected_dtype = f" SampleCubeDetails: + data_is_ascii = encoding_str in (NO_ENCODING_STR, "ascii") + + numeric_values = np.arange(3.0) + if data_is_ascii: + coordvar_strings = ["mOnster", "London", "Amsterdam"] + datavar_strings = ["bun", "Eclair", "sandwich"] + else: + coordvar_strings = ["Münster", "London", "Amsterdam"] + datavar_strings = ["bun", "éclair", "sandwich"] + + if not byte_data: + # Do our own conversion between intended byte dimension and string width + # N.B. 
N_CHARS_DIM is set big enough so the test strings will never overflow + charlen = N_CHARS_DIM + if encoding_str == "utf-32": + charlen = (charlen // 4) - 1 + elif encoding_str == "utf-16": + charlen = (charlen // 2) - 1 + strings_dtype = np.dtype(f"U{charlen}") + coordvar_array = np.array(coordvar_strings, dtype=strings_dtype) + datavar_array = np.array(datavar_strings, dtype=strings_dtype) + else: + write_encoding = encoding_str + if write_encoding == NO_ENCODING_STR: + write_encoding = "ascii" + coordvar_array = convert_strings_to_chararray( + coordvar_strings, maxlen=N_CHARS_DIM, encoding=write_encoding + ) + datavar_array = convert_strings_to_chararray( + datavar_strings, maxlen=N_CHARS_DIM, encoding=write_encoding + ) + + if lazy_data: + from iris._lazy_data import as_lazy_data + + datavar_array, coordvar_array = ( + as_lazy_data(arr) for arr in [datavar_array, coordvar_array] + ) + cube = Cube(datavar_array, var_name="v") + cube.add_dim_coord(DimCoord(np.arange(N_XDIM), var_name="x"), 0) + if encoding_str != NO_ENCODING_STR: + cube.attributes["_Encoding"] = encoding_str + co_x = AuxCoord(coordvar_array, var_name="v_co") + if encoding_str != NO_ENCODING_STR: + co_x.attributes["_Encoding"] = encoding_str + co_dims = (0, 1) if byte_data else (0,) + cube.add_aux_coord(co_x, co_dims) + + result = SampleCubeDetails( + cube=cube, + datavar_data=datavar_array, + stringcoord_data=coordvar_array, + ) + return result + + +class TestWriteEncodings: + """Test saving of testfiles with encoded string data. + + To avoid circularity, we generate and save *cube* data. 
+ """ + + @pytest.fixture(params=["allLazy", "smallReal"]) + def lazy_data(self, request, mocker): + is_lazy = request.param == "allLazy" + if is_lazy: + mocker.patch("iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES", 0) + return is_lazy + + @pytest.fixture(params=["dataAsStrings", "dataAsBytes"]) + def write_bytes(self, request): + return request.param == "dataAsBytes" + + @pytest.fixture + def writetest_path(self, encoding, write_bytes, lazy_data, tmp_path): + """Create a suitable test cube, with either string or byte content.""" + if encoding == "": + filetag = "noencoding" + else: + filetag = encoding + datatag = "writebytes" if write_bytes else "writestrings" + lazytag = "alllazy" if lazy_data else "smallreal" + tempfile_path = ( + tmp_path / f"sample_stringdata_write_{filetag}_{datatag}_{lazytag}.nc" + ) + return tempfile_path + + @pytest.fixture + def writetest_data(self, writetest_path, encoding, write_bytes, lazy_data): + """Create a suitable test cube + save to a file. + + Apply the given encoding to both coord and cube data. + Form the data as bytes, or as strings, depending on 'write_bytes'.' + """ + cube_info = make_testcube( + encoding_str=encoding, + byte_data=write_bytes, + lazy_data=lazy_data, + ) + cube_info.save_path = writetest_path + cube = cube_info.cube + iris.save(cube, writetest_path) + return cube_info + + def test_valid_encodings(self, encoding, writetest_data, write_bytes): + cube_info = writetest_data + cube, path = cube_info.cube, cube_info.save_path + + # N.B. file content should not depend on whether bytes or strings were written + vararray, coordarray = cube_info.datavar_data, cube_info.stringcoord_data + ds = _thread_safe_nc.DatasetWrapper(path) + ds.set_auto_chartostring(False) + v_main = ds.variables["v"] + v_co = ds.variables["v_co"] + assert v_main.shape == (N_XDIM, N_CHARS_DIM) + assert v_co.shape == (N_XDIM, N_CHARS_DIM) + assert v_main.dtype == "'\)." 
+ ) + with pytest.raises(ValueError, match=msg): + iris.save(cube, filepath) + + def test_write_stringobjects__fail(self, tmp_path): + string_array = np.array(["one", "four"], dtype="O") + cube = Cube(string_array) + filepath = tmp_path / "write_stringobjects.nc" + msg = ( + r"Variable 'unknown' has unexpected dtype, dtype\('O'\)." + "Data content arrays must be numeric, or contain single-bytes " + r"\(dtype 'S1'\), or unicode strings \(dtype 'U'\)." + ) + with pytest.raises(ValueError, match=msg): + iris.save(cube, filepath) + + def test_write_unexpected_dtype_itemsize(self, mocker, tmp_path): + # Test unexpected form of numpy character data. Not clear if this can actually + # happen, but we do have a runtime test for it, so this just exercises that. + mock_dtype = mocker.Mock(spec=np.dtype, kind="U", itemsize=3) + mock_data = mocker.MagicMock(spec=np.ndarray, dtype=mock_dtype) + mocker.patch("numpy.asarray", return_value=mock_data) + cube = Cube(mock_data) + filepath = tmp_path / "write_unexpected_dtype_itemsize.nc" + msg = ( + r"Unexpected numpy string 'dtype\.itemsize' for element 'unknown': " + r"'dtype\.itemsize = 3, expected a multiple of four \(always\)\." + ) + with pytest.raises(ValueError, match=msg): + iris.save(cube, filepath) + + +class TestSaveloadBadUnicodeAsBytes: + def test_save_load_bad_unicode(self, tmp_path): + filepath = tmp_path / "bad_unicode_utf8.nc" + test_string = "marré" + bytes_array = test_string.encode("utf8") + s1_array = np.array([bytes_array[i : i + 1] for i in range(len(bytes_array))]) + s1_array_bad_utf8 = s1_array[:-1] # invalid without the last byte + cube = Cube(s1_array_bad_utf8, attributes={"_Encoding": "utf8"}) + iris.save(cube, filepath) + # First check for error when reading back *normally* + msg = "could not be decoded with the 'utf-8' encoding" + with pytest.raises(ValueError, match=msg): + iris.load(filepath) + # .. 
but OK in byte-reading mode + with iris.fileformats.netcdf.DECODE_TO_STRINGS_ON_READ.context(False): + readback_cube = iris.load_cube(filepath) + assert readback_cube.dtype == "S1" + assert np.all(readback_cube.data == s1_array_bad_utf8) diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl index 8a8f481492..27d8f55a45 100644 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl @@ -5,10 +5,10 @@ dimensions: time = 360 ; variables: short air_temperature(time, latitude, longitude) ; - air_temperature:scale_factor = 0.00242575f ; - air_temperature:add_offset = 261.648f ; air_temperature:standard_name = "air_temperature" ; air_temperature:units = "K" ; + air_temperature:scale_factor = 0.00242575f ; + air_temperature:add_offset = 261.648f ; air_temperature:um_stash_source = "m01s03i236" ; air_temperature:cell_methods = "time: maximum (interval: 1 hour)" ; air_temperature:grid_mapping = "latitude_longitude" ; @@ -53,10 +53,10 @@ variables: precipitation_flux:grid_mapping = "latitude_longitude" ; precipitation_flux:coordinates = "forecast_period forecast_reference_time" ; ushort air_temperature_0(time, latitude, longitude) ; - air_temperature_0:scale_factor = 0.002014167f ; - air_temperature_0:add_offset = 176.7872f ; air_temperature_0:standard_name = "air_temperature" ; air_temperature_0:units = "K" ; + air_temperature_0:scale_factor = 0.002014167f ; + air_temperature_0:add_offset = 176.7872f ; air_temperature_0:um_stash_source = "m01s03i236" ; air_temperature_0:cell_methods = "time: minimum (interval: 1 hour)" ; air_temperature_0:grid_mapping = "latitude_longitude" ; diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl 
b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl index 3f2c909ce8..c85fd35efd 100644 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl @@ -5,10 +5,10 @@ dimensions: time = 360 ; variables: short air_temperature(time, latitude, longitude) ; - air_temperature:scale_factor = 0.00242575f ; - air_temperature:add_offset = 261.648f ; air_temperature:standard_name = "air_temperature" ; air_temperature:units = "K" ; + air_temperature:scale_factor = 0.00242575f ; + air_temperature:add_offset = 261.648f ; air_temperature:um_stash_source = "m01s03i236" ; air_temperature:cell_methods = "time: maximum (interval: 1 hour)" ; air_temperature:grid_mapping = "latitude_longitude" ; @@ -46,19 +46,19 @@ variables: height:standard_name = "height" ; height:positive = "up" ; short precipitation_flux(time, latitude, longitude) ; - precipitation_flux:scale_factor = 2.989738e-08f ; - precipitation_flux:add_offset = 0.0009796774f ; precipitation_flux:standard_name = "precipitation_flux" ; precipitation_flux:units = "kg m-2 s-1" ; + precipitation_flux:scale_factor = 2.989738e-08f ; + precipitation_flux:add_offset = 0.0009796774f ; precipitation_flux:um_stash_source = "m01s05i216" ; precipitation_flux:cell_methods = "time: mean (interval: 1 hour)" ; precipitation_flux:grid_mapping = "latitude_longitude" ; precipitation_flux:coordinates = "forecast_period forecast_reference_time" ; short air_temperature_0(time, latitude, longitude) ; - air_temperature_0:scale_factor = 0.002014167f ; - air_temperature_0:add_offset = 242.7874f ; air_temperature_0:standard_name = "air_temperature" ; air_temperature_0:units = "K" ; + air_temperature_0:scale_factor = 0.002014167f ; + air_temperature_0:add_offset = 242.7874f ; air_temperature_0:um_stash_source = "m01s03i236" ; air_temperature_0:cell_methods = "time: 
minimum (interval: 1 hour)" ; air_temperature_0:grid_mapping = "latitude_longitude" ; diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl index 83e7329575..ed89a25d9f 100644 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl @@ -4,10 +4,10 @@ dimensions: longitude = 96 ; variables: short air_temperature(latitude, longitude) ; - air_temperature:scale_factor = 0.001198068f ; - air_temperature:add_offset = 267.4006f ; air_temperature:standard_name = "air_temperature" ; air_temperature:units = "K" ; + air_temperature:scale_factor = 0.001198068f ; + air_temperature:add_offset = 267.4006f ; air_temperature:um_stash_source = "m01s03i236" ; air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; air_temperature:grid_mapping = "latitude_longitude" ; diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl index 83e7329575..ed89a25d9f 100644 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl @@ -4,10 +4,10 @@ dimensions: longitude = 96 ; variables: short air_temperature(latitude, longitude) ; - air_temperature:scale_factor = 0.001198068f ; - air_temperature:add_offset = 267.4006f ; air_temperature:standard_name = "air_temperature" ; air_temperature:units = "K" ; + air_temperature:scale_factor = 0.001198068f ; + air_temperature:add_offset = 267.4006f ; air_temperature:um_stash_source = "m01s03i236" ; air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; air_temperature:grid_mapping = "latitude_longitude" 
; diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl index 7b9114309e..eedad33e03 100644 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl @@ -4,10 +4,10 @@ dimensions: longitude = 96 ; variables: ubyte air_temperature(latitude, longitude) ; - air_temperature:scale_factor = 0.3079035f ; - air_temperature:add_offset = 228.1423f ; air_temperature:standard_name = "air_temperature" ; air_temperature:units = "K" ; + air_temperature:scale_factor = 0.3079035f ; + air_temperature:add_offset = 228.1423f ; air_temperature:um_stash_source = "m01s03i236" ; air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; air_temperature:grid_mapping = "latitude_longitude" ; diff --git a/lib/iris/tests/test_cf.py b/lib/iris/tests/test_cf.py index d6f49d9f95..ca066c6c7b 100644 --- a/lib/iris/tests/test_cf.py +++ b/lib/iris/tests/test_cf.py @@ -6,14 +6,30 @@ import contextlib import io +from typing import Iterable import pytest import iris import iris.fileformats.cf as cf +from iris.fileformats.netcdf import DECODE_TO_STRINGS_ON_READ +from iris.fileformats.netcdf.loader import _get_cf_var_data from iris.tests import _shared_utils +def fetch_cfvar_data(var, indices=()): + """Fetch variable data, or part of it. + + Optionally subindex it. If lazy, .compute() it. 
+ """ + if not isinstance(indices, Iterable): + indices = (indices,) + data = _get_cf_var_data(var)[*indices] + if hasattr(data, "compute"): + data = data.compute() + return data + + class TestCaching: def test_cached(self, mocker): # Make sure attribute access to the underlying netCDF4.Variable @@ -335,7 +351,10 @@ def _setup(self): "A1B-99999a-river-sep-2070-2099.nc", ) ) - self.cfr_start = cf.CFReader(filename) + # NOTE: this one should now be read without byte-to-string translation, + # since we no longer support a non-final string dimension. + with DECODE_TO_STRINGS_ON_READ.context(perform_decoding=False): + self.cfr_start = cf.CFReader(filename) filename = _shared_utils.get_data_path( ("NetCDF", "label_and_climate", "small_FC_167_mon_19601101.nc") @@ -354,7 +373,17 @@ def test_label_dim_start(self): assert sorted(cf_data_var.cf_group.labels.keys()) == ["region_name"] assert region_group.cf_label_dimensions(cf_data_var) == ("georegion",) - assert region_group.cf_label_data(cf_data_var)[0] == "Anglian" + # data was specifically read as bytes, owing to non-standard dimension order + sample_chars = fetch_cfvar_data(region_group, [slice(None), 0]) # first string + + def chars2string(chararray_1d): + # Translating byte arrays is slightly awkward + bytes = b"".join(b for b in chararray_1d if b != "b\0") + string = bytes.decode("ascii") # not testing encodings, expect only ascii + return string + + sample_string = chars2string(sample_chars) + assert sample_string == "Anglian" cf_data_var = self.cfr_start.cf_group["cdf_temp_dmax_tmean_abs"] @@ -362,7 +391,9 @@ def test_label_dim_start(self): assert sorted(cf_data_var.cf_group.labels.keys()) == ["region_name"] assert region_group.cf_label_dimensions(cf_data_var) == ("georegion",) - assert region_group.cf_label_data(cf_data_var)[0] == "Anglian" + sample_chars = fetch_cfvar_data(region_group, [slice(None), 0]) + sample_string = chars2string(sample_chars) + assert sample_string == "Anglian" def test_label_dim_end(self): 
cf_data_var = self.cfr_end.cf_group["tas"] @@ -378,26 +409,22 @@ def test_label_dim_end(self): "source", ] - assert self.cfr_end.cf_group.labels["experiment_id"].cf_label_dimensions( - cf_data_var - ) == ("ensemble",) - assert ( - self.cfr_end.cf_group.labels["experiment_id"].cf_label_data(cf_data_var)[0] - == "2005" - ) - - assert self.cfr_end.cf_group.labels["institution"].cf_label_dimensions( - cf_data_var - ) == ("ensemble",) - assert ( - self.cfr_end.cf_group.labels["institution"].cf_label_data(cf_data_var)[0] - == "ECMWF" - ) - - assert self.cfr_end.cf_group.labels["source"].cf_label_dimensions( - cf_data_var - ) == ("ensemble",) - assert ( - self.cfr_end.cf_group.labels["source"].cf_label_data(cf_data_var)[0] - == "IFS33R1/HOPE-E, Sys 1, Met 1, ENSEMBLES" - ) + var = self.cfr_end.cf_group.labels["experiment_id"] + assert var.cf_label_dimensions(cf_data_var) == ("ensemble",) + content = fetch_cfvar_data(var, 0) + expect = "2005".ljust(len(content)) + assert content == expect + + var = self.cfr_end.cf_group.labels["institution"] + assert var.cf_label_dimensions(cf_data_var) == ("ensemble",) + content = fetch_cfvar_data(var, 0) + expect = "ECMWF" + expect = expect.ljust(len(str(content))) # expand to full string width + assert content == expect + + var = self.cfr_end.cf_group.labels["source"] + assert var.cf_label_dimensions(cf_data_var) == ("ensemble",) + content = fetch_cfvar_data(var, 0) + expect = "IFS33R1/HOPE-E, Sys 1, Met 1, ENSEMBLES" + expect = expect.ljust(len(str(content))) # expand to full string width + assert content == expect diff --git a/lib/iris/tests/test_coding_standards.py b/lib/iris/tests/test_coding_standards.py index 2cb621c873..56ed2deca3 100644 --- a/lib/iris/tests/test_coding_standards.py +++ b/lib/iris/tests/test_coding_standards.py @@ -17,6 +17,7 @@ import iris from iris.tests import system_test +from iris.tests.unit.fileformats.netcdf import test_bytecoding_datasets LICENSE_TEMPLATE = """# Copyright Iris contributors # @@ -60,6 
+61,7 @@ def test_netcdf4_import(): Path(test_NetCDFWriteProxy.__file__), Path(system_test.__file__), Path(__file__), + Path(test_bytecoding_datasets.__file__), ] assert set(files_including_import) == set(expected) diff --git a/lib/iris/tests/test_load.py b/lib/iris/tests/test_load.py index 53433ee0d3..621b12d638 100644 --- a/lib/iris/tests/test_load.py +++ b/lib/iris/tests/test_load.py @@ -9,7 +9,7 @@ import pytest import iris -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets import iris.io from iris.tests import _shared_utils @@ -180,10 +180,10 @@ def test_net_cdf_dataset_call(self, mocker): filename = _shared_utils.get_data_path( ("NetCDF", "global", "xyt", "SMALL_total_column_co2.nc") ) - fake_dataset = _thread_safe_nc.DatasetWrapper(filename) + fake_dataset = _bytecoding_datasets.EncodedDataset(filename) dataset_loader = mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=fake_dataset, ) next(iris.io.load_http([self.url], callback=None)) diff --git a/lib/iris/tests/unit/cube/test_Cube.py b/lib/iris/tests/unit/cube/test_Cube.py index d91c7e81c0..756809e128 100644 --- a/lib/iris/tests/unit/cube/test_Cube.py +++ b/lib/iris/tests/unit/cube/test_Cube.py @@ -3620,6 +3620,16 @@ def test_data_bool_not_eq(self): cube2 = Cube([True, True]) assert cube1 != cube2 + def test_data_string_eq(self): + cube1 = Cube(["a", "b", "c"]) + cube2 = Cube(["a", "b", "c"]) + assert cube1 == cube2 + + def test_data_string_not_eq(self): + cube1 = Cube(["a", "b", "c"]) + cube2 = Cube(["a", "b", "d"]) + assert cube1 != cube2 + class Test__eq__meta: def test_ancillary_fail(self): diff --git a/lib/iris/tests/unit/fileformats/cf/test_CFReader.py b/lib/iris/tests/unit/fileformats/cf/test_CFReader.py index d0dd0175a2..f293c9d77f 100644 --- a/lib/iris/tests/unit/fileformats/cf/test_CFReader.py +++ 
b/lib/iris/tests/unit/fileformats/cf/test_CFReader.py @@ -78,7 +78,7 @@ def _setup(self, mocker): getncattr=getncattr, ) mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=dataset, ) @@ -141,7 +141,7 @@ def _setup(self, mocker): mocker.patch("iris.fileformats.cf.CFReader._build_cf_groups") mocker.patch("iris.fileformats.cf.CFReader._reset") mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=self.dataset, ) @@ -237,7 +237,7 @@ def _setup(self, mocker): # and building first level cf-groups for variables. mocker.patch("iris.fileformats.cf.CFReader._reset") mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=self.dataset, ) @@ -375,7 +375,7 @@ def _setup_class(self, mocker): # translations and building first level cf-groups for variables. 
mocker.patch("iris.fileformats.cf.CFReader._reset") mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=self.dataset, ) cf_reader = CFReader("dummy") diff --git a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py index a6d613eb9a..0d4b16e8da 100644 --- a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py +++ b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py @@ -41,7 +41,9 @@ def _setup(self, mocker): self.engine = mocker.Mock( cube=mocker.Mock(), - cf_var=mocker.Mock(dimensions=("foo", "bar"), cf_data=cf_data), + cf_var=mocker.Mock( + dimensions=("foo", "bar"), cf_data=cf_data, dtype=np.int32 + ), filename="DUMMY", cube_parts=dict(coordinates=[]), ) @@ -167,12 +169,12 @@ class TestDtype(MockerMixin): def _setup(self, mocker): # Create coordinate cf variables and pyke engine. points = np.arange(6).reshape(2, 3) - cf_data = mocker.MagicMock(_FillValue=None) + cf_data = mocker.MagicMock(_FillValue=None, shape=points.shape) cf_data.chunking = mocker.MagicMock(return_value=points.shape) self.engine = mocker.Mock( cube=mocker.Mock(), - cf_var=mocker.Mock(dimensions=("foo", "bar")), + cf_var=mocker.Mock(dimensions=("foo", "bar"), dtype=np.int32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) @@ -242,7 +244,7 @@ def _setup(self, mocker): # Create dummy pyke engine. 
self.engine = mocker.Mock( cube=mocker.Mock(), - cf_var=mocker.Mock(dimensions=("foo", "bar")), + cf_var=mocker.Mock(dimensions=("foo", "bar"), dtype=np.float32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) diff --git a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py index 9cf983d0a3..7e3155ad68 100644 --- a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py +++ b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py @@ -24,7 +24,7 @@ def _mixin_setup(self, mocker): # Create dummy pyke engine. self.engine = mocker.Mock( cube=mocker.Mock(), - cf_var=mocker.Mock(dimensions=("foo", "bar")), + cf_var=mocker.Mock(dimensions=("foo", "bar"), dtype=np.int32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) diff --git a/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py b/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py index f92e8288b7..8a05bb712d 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py +++ b/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py @@ -24,14 +24,15 @@ def _setup(self): self.expected_chunks = _optimum_chunksize(self.shape, self.shape) def _make(self, chunksizes=None, shape=None, dtype="i4", **extra_properties): + if shape is None: + shape = self.shape cf_data = self.mocker.MagicMock( _FillValue=None, __getitem__="", - dimensions=["dim_" + str(x) for x in range(len(shape or "1"))], + dimensions=["dim_" + str(x) for x in range(len(shape))], + shape=shape, ) cf_data.chunking = self.mocker.MagicMock(return_value=chunksizes) - if shape is None: - shape = self.shape if dtype is not str: # for testing VLen str arrays (dtype=`class `) dtype = np.dtype(dtype) cf_var = self.mocker.MagicMock( diff --git 
a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py index a2d2ff71ee..f8aec84c8e 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py @@ -30,6 +30,7 @@ from iris.coords import AncillaryVariable, AuxCoord, DimCoord from iris.cube import Cube from iris.fileformats.netcdf import Saver, _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets as ds_wrappers from iris.tests import _shared_utils from iris.tests._shared_utils import assert_CDL import iris.tests.stock as stock @@ -215,7 +216,7 @@ def test_big_endian(self, request, tmp_path): def test_zlib(self, mocker): cube = self._simple_cube(">f4") - api = mocker.patch("iris.fileformats.netcdf.saver._thread_safe_nc") + api = mocker.patch("iris.fileformats.netcdf.saver.bytecoding_datasets") # Define mocked default fill values to prevent deprecation warning (#4374). api.default_fillvals = collections.defaultdict(lambda: -99.0) # Mock the apparent dtype of mocked variables, to avoid an error. @@ -226,7 +227,7 @@ def test_zlib(self, mocker): # a fill-value report on a non-compliant variable in a non-file (!) 
with Saver("/dummy/path", "NETCDF4", compute=False) as saver: saver.write(cube, zlib=True) - dataset = api.DatasetWrapper.return_value + dataset = api.EncodedDataset.return_value create_var_call = mocker.call( "air_pressure_anomaly", np.dtype("float32"), @@ -257,9 +258,6 @@ def test_compression(self, mocker, tmp_path): ) cube.add_ancillary_variable(anc_coord, data_dims=data_dims) - patch = mocker.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable" - ) compression_kwargs = { "complevel": 9, "fletcher32": True, @@ -269,10 +267,20 @@ def test_compression(self, mocker, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4", compute=False) as saver: + tgt = ( + "iris.fileformats.netcdf.saver.bytecoding_datasets" + ".EncodedDataset.createVariable" + ) + createvar_spy = mocker.patch( + tgt, + # Use 'wraps' to allow the patched methods to function as normal + # - the patch object just acts as a 'spy' on its calls. + wraps=saver._dataset.createVariable, + ) saver.write(cube, **compression_kwargs) - assert 5 == patch.call_count - result = self._filter_compression_calls(patch, compression_kwargs) + assert 5 == createvar_spy.call_count + result = self._filter_compression_calls(createvar_spy, compression_kwargs) assert 3 == len(result) assert {cube.name(), aux_coord.name(), anc_coord.name()} == set(result) @@ -290,9 +298,6 @@ def test_non_compression__shape(self, mocker, tmp_path): ) cube.add_ancillary_variable(anc_coord, data_dims=data_dims[1]) - patch = mocker.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable" - ) compression_kwargs = { "complevel": 9, "fletcher32": True, @@ -302,11 +307,21 @@ def test_non_compression__shape(self, mocker, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4", compute=False) as saver: + tgt = ( + "iris.fileformats.netcdf.saver.bytecoding_datasets" + ".EncodedDataset.createVariable" + ) + createvar_spy = mocker.patch( + tgt, + # Use 'wraps' 
to allow the patched methods to function as normal + # - the patch object just acts as a 'spy' on its calls. + wraps=saver._dataset.createVariable, + ) saver.write(cube, **compression_kwargs) - assert 5 == patch.call_count + assert 5 == createvar_spy.call_count result = self._filter_compression_calls( - patch, compression_kwargs, mismatch=True + createvar_spy, compression_kwargs, mismatch=True ) assert 4 == len(result) # the aux coord and ancil variable are not compressed due to shape, and @@ -323,10 +338,6 @@ def test_non_compression__dtype(self, mocker, tmp_path): aux_coord = AuxCoord(data, var_name="non_compress_aux", units="1") cube.add_aux_coord(aux_coord, data_dims=data_dims) - patch = mocker.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable" - ) - patch.return_value = mocker.MagicMock(dtype=np.dtype("S1")) compression_kwargs = { "complevel": 9, "fletcher32": True, @@ -336,11 +347,21 @@ def test_non_compression__dtype(self, mocker, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4", compute=False) as saver: + tgt = ( + "iris.fileformats.netcdf.saver.bytecoding_datasets" + ".EncodedDataset.createVariable" + ) + createvar_spy = mocker.patch( + tgt, + # Use 'wraps' to allow the patched methods to function as normal + # - the patch object just acts as a 'spy' on its calls. 
+ wraps=saver._dataset.createVariable, + ) saver.write(cube, **compression_kwargs) - assert 4 == patch.call_count + assert 4 == createvar_spy.call_count result = self._filter_compression_calls( - patch, compression_kwargs, mismatch=True + createvar_spy, compression_kwargs, mismatch=True ) assert 3 == len(result) # the aux coord is not compressed due to its string dtype, and @@ -370,7 +391,7 @@ def test_default_unlimited_dimensions(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) assert not ds.dimensions["dim0"].isunlimited() assert not ds.dimensions["dim1"].isunlimited() ds.close() @@ -380,7 +401,7 @@ def test_no_unlimited_dimensions(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=None) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) for dim in ds.dimensions.values(): assert not dim.isunlimited() ds.close() @@ -402,7 +423,7 @@ def test_custom_unlimited_dimensions(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=unlimited_dimensions) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) for dim in unlimited_dimensions: assert ds.dimensions[dim].isunlimited() ds.close() @@ -411,7 +432,7 @@ def test_custom_unlimited_dimensions(self, tmp_path): coords = [cube.coord(dim) for dim in unlimited_dimensions] with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=coords) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) for dim in unlimited_dimensions: assert ds.dimensions[dim].isunlimited() ds.close() @@ -422,7 +443,7 @@ def test_reserved_attributes(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as 
saver: saver.write(cube) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) res = ds.getncattr("dimensions") ds.close() assert res == "something something_else" @@ -444,7 +465,7 @@ def test_dimensional_to_scalar(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) # Confirm that the only dimension is the one denoting the number # of bounds - have successfully saved the 2D bounds array into 1D. assert ["bnds"] == list(ds.dimensions.keys()) @@ -484,7 +505,7 @@ def _check_bounds_setting(self, climatological=False): saver._ensure_valid_dtype.return_value = self.mocker.Mock( shape=coord.bounds.shape, dtype=coord.bounds.dtype ) - var = self.mocker.MagicMock(spec=_thread_safe_nc.VariableWrapper) + var = self.mocker.MagicMock(spec=ds_wrappers.EncodedVariable) # Make the main call. Saver._create_cf_bounds(saver, coord, var, "time") @@ -525,7 +546,7 @@ def test_valid_range_saved(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) _shared_utils.assert_array_equal(ds.valid_range, vrange) ds.close() @@ -537,7 +558,7 @@ def test_valid_min_saved(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) _shared_utils.assert_array_equal(ds.valid_min, 1) ds.close() @@ -549,7 +570,7 @@ def test_valid_max_saved(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) _shared_utils.assert_array_equal(ds.valid_max, 2) 
ds.close() @@ -569,7 +590,7 @@ def test_valid_range_saved(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) _shared_utils.assert_array_equal(ds.variables["longitude"].valid_range, vrange) ds.close() @@ -581,7 +602,7 @@ def test_valid_min_saved(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) _shared_utils.assert_array_equal(ds.variables["longitude"].valid_min, 1) ds.close() @@ -593,7 +614,7 @@ def test_valid_max_saved(self, tmp_path): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) _shared_utils.assert_array_equal(ds.variables["longitude"].valid_max, 2) ds.close() @@ -627,7 +648,7 @@ def netCDF_var(cube, **kwargs): nc_path = tmp_path / "temp.nc" with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, **kwargs) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) (var,) = [ var for var in ds.variables.values() @@ -707,7 +728,7 @@ def _setup(self, mocker): ) ) _ = mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", dataset_class, ) diff --git a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py index 0f3a91fec2..8bfd05f06e 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py @@ -16,6 +16,7 @@ import numpy as np import pytest 
+import iris.fileformats.netcdf._bytecoding_datasets as bytecoding_datasets import iris.fileformats.netcdf._thread_safe_nc as threadsafe_nc from iris.fileformats.netcdf.saver import Saver @@ -29,7 +30,7 @@ def saver_patch(mocker): mock_dataset = mocker.MagicMock() mock_dataset_class = mocker.Mock(return_value=mock_dataset) # Mock the wrapper within the netcdf saver - target1 = "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper" + target1 = "iris.fileformats.netcdf.saver.bytecoding_datasets.DatasetWrapper" # Mock the real netCDF4.Dataset within the threadsafe-nc module, as this is # used by NetCDFDataProxy and NetCDFWriteProxy. target2 = "iris.fileformats.netcdf._thread_safe_nc.netCDF4.Dataset" @@ -52,9 +53,10 @@ def data_form(request) -> Iterator[bool]: return request.param @staticmethod - def saver(compute) -> Saver: + def saver(compute, data_form, tmp_path) -> Saver: # Create a test Saver object - return Saver(filename="", netcdf_format="NETCDF4", compute=compute) + filepath = tmp_path / f"tmp_{compute}_{data_form}.nc" + return Saver(filename=filepath, netcdf_format="NETCDF4", compute=compute) @staticmethod def mock_var(shape, with_data_array, mocker): @@ -68,6 +70,7 @@ def mock_var(shape, with_data_array, mocker): spec=threadsafe_nc.VariableWrapper, shape=tuple(shape), dtype=np.dtype(np.float32), + _contained_instance=mocker.Mock(dtype="f4"), **extra_properties, ) # Give the mock cf-var a name property, as required by '_lazy_stream_data'. 
@@ -76,9 +79,9 @@ def mock_var(shape, with_data_array, mocker): mock_cfvar.name = "" return mock_cfvar - def test_data_save(self, compute, data_form, mocker): + def test_data_save(self, compute, data_form, mocker, tmp_path): """Real data is transferred immediately, lazy data creates a delayed write.""" - saver = self.saver(compute=compute) + saver = self.saver(compute, data_form, tmp_path) data = np.arange(5.0) if data_form == "lazydata": diff --git a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py index e5783925b0..2ace3f4f86 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py @@ -398,12 +398,12 @@ def test_compression(self, check_save_cubes, mocker): # into the iris.fileformats.netcdf.saver. Also we want to check that the # compression kwargs are passed into the NetCDF4 createVariable method patch = mocker.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.createVariable", ) # No need to patch this NetCDF4 variable to compensate for the previous patch # on createVariable, which doesn't actually create the variable. 
mocker.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.variables" + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.variables" ) cube = make_cube(var_name=(var_name := "a")) compression_kwargs = { @@ -776,10 +776,10 @@ def test_compression(self, check_save_mesh, mocker): """ patch = mocker.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.createVariable", ) mocker.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.variables" + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.variables" ) mesh = make_mesh() compression_kwargs = { diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py new file mode 100644 index 0000000000..a3137612a1 --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -0,0 +1,559 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. 
+"""Unit tests for :mod:`iris.fileformats.netcdf._bytecoding_datasets` module.""" + +from pathlib import Path +import warnings + +import netCDF4 +import numpy as np +import pytest + +from iris import tests +from iris.exceptions import TranslationError +from iris.fileformats.netcdf._bytecoding_datasets import ( + SUPPORTED_ENCODINGS, + EncodedDataset, + EncodedGroup, + EncodedVariable, +) +from iris.fileformats.netcdf._thread_safe_nc import ( + DatasetWrapper, + GroupWrapper, + VariableWrapper, +) +import iris.tests._shared_utils as testutils +from iris.tests.stock.netcdf import ncgen_from_cdl +from iris.warnings import IrisCfLoadWarning, IrisCfSaveWarning + +# Note: for test options, include "no encoding" and an alias name +ENCODING_NONE = None +ENCODING_UTF8_ALIAS = "UTF8" +encoding_options = [ENCODING_NONE, ENCODING_UTF8_ALIAS] + SUPPORTED_ENCODINGS + +samples_3_ascii = np.array( + ["one", "", "seven"], # N.B. include empty! +) +samples_3_nonascii = np.array(["two", "", "épéé"]) + + +def strings_maxbytes(strings, encoding): + return max(len(string.encode(encoding)) for string in strings) + + +@pytest.fixture(params=encoding_options) +def encoding(request): + return request.param + + +@pytest.fixture(scope="module") +def tempdir(tmp_path_factory): + path = tmp_path_factory.mktemp("netcdf") + return path + + +def make_encoded_dataset( + path: Path, strlen: int, encoding: str | None = None +) -> EncodedDataset: + """Create a test EncodedDataset linked to an actual file. + + * strlen becomes the string dimension (i.e.
a number of *bytes*) + * a variable "vxs" is created + * If 'encoding' is given, the "vxs::_Encoding" attribute is created with this value + """ + ds = EncodedDataset(path, "w") + ds.createDimension("x", 3) + ds.createDimension("strlen", strlen) + v = ds.createVariable("vxs", "S1", ("x", "strlen")) + if encoding is not None: + v.setncattr("_Encoding", encoding) + return ds + + +def fetch_undecoded_var(path, varname): + # Open a path as a "normal" dataset, and return a given variable. + ds_normal = DatasetWrapper(path) + ds_normal._contained_instance.set_auto_chartostring(False) + v = ds_normal.variables[varname] + # Return a variable, rather than its data, so we can check attributes etc. + return v + + +def check_array_matching(arr1, arr2): + """Check for arrays matching shape, dtype and content.""" + assert arr1.shape == arr2.shape + assert arr1.dtype == arr2.dtype + assert np.all(arr1 == arr2) + + +def check_raw_content(path, varname, expected_byte_array): + v = fetch_undecoded_var(path, varname) + bytes_result = v[:] + check_array_matching(bytes_result, expected_byte_array) + + +def _make_bytearray_inner(data, bytewidth, encoding): + # Convert a (list of [lists of..]) strings or bytes to a + # (list of [lists of..]) length-1 bytes with an extra dimension. + if isinstance(data, str): + # Convert input strings to bytes + data = data.encode(encoding) + if isinstance(data, bytes): + # iterate over bytes to get a sequence of length-1 bytes (what np.array wants) + result = [data[i : i + 1] for i in range(len(data))] + # pad or truncate everything to the required bytewidth + result = (result + [b"\0"] * bytewidth)[:bytewidth] + else: + # If not string/bytes, expect the input to be a list. + # N.B.
the recursion is inefficient, but we don't care about that here + result = [_make_bytearray_inner(part, bytewidth, encoding) for part in data] + return result + + +def make_bytearray(data, bytewidth, encoding="ascii"): + """Convert bytes or lists of bytes into a numpy byte array. + + This is largely to avoid using "encode_stringarray_as_bytearray", since we don't + want to depend on that when we should be testing it. + So, it mostly replicates the function of that, but it does also support bytes in the + input. + """ + # First, Convert to a (list of [lists of]..) length-1 bytes objects + data = _make_bytearray_inner(data, bytewidth, encoding) + # We should now be able to create an array of single bytes. + result = np.array(data) + assert result.dtype == "S1" + return result + + +class TestWriteStrings: + """Test how string data is saved to a file. + + Mostly, we read back data as a "normal" dataset to avoid relying on the read code, + which is separately tested -- see 'TestReadStrings'. + """ + + def test_encodings(self, encoding, tempdir): + # Create a dataset with the variable + path = tempdir / f"test_bytecoded_writestrings_encoding_{encoding!s}.nc" + + if encoding in [None, "ascii"]: + writedata = samples_3_ascii + write_encoding = "ascii" + else: + writedata = samples_3_nonascii + write_encoding = encoding + + writedata = writedata.copy() # just for safety? + strlen = strings_maxbytes(writedata, write_encoding) + + ds_encoded = make_encoded_dataset(path, strlen, encoding) + v = ds_encoded.variables["vxs"] + + # Effectively, checks that we *can* write strings + v[:] = writedata + + # Close, re-open as an "ordinary" dataset, and check the raw content. 
+ ds_encoded.close() + expected_bytes = make_bytearray(writedata, strlen, write_encoding) + check_raw_content(path, "vxs", expected_bytes) + + # Check also that the "_Encoding" property is as expected + v = fetch_undecoded_var(path, "vxs") + result_attr = v.getncattr("_Encoding") if "_Encoding" in v.ncattrs() else None + assert result_attr == encoding + + def test_scalar(self, tempdir): + # Like 'test_write_strings', but the variable has *only* the string dimension. + path = tempdir / "test_bytecoded_writestrings_scalar.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",)) + + # Checks that we *can* write a string + v[:] = np.array("stuff", dtype=str) + + # Close, re-open as an "ordinary" dataset, and check the raw content. + ds_encoded.close() + expected_bytes = make_bytearray(b"stuff", strlen) + check_raw_content(path, "v0_scalar", expected_bytes) + + def test_multidim(self, tempdir): + # Like 'test_write_strings', but the variable has additional dimensions. + path = tempdir / "test_bytecoded_writestrings_multidim.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + ds_encoded.createDimension("y", 2) + v = ds_encoded.createVariable( + "vyxn", + "S1", + ( + "y", + "x", + "strlen", + ), + ) + + # Check that we *can* write a multidimensional string array + test_data = [ + ["one", "n", ""], + ["two", "xxxxx", "four"], + ] + v[:] = test_data + + # Close, re-open as an "ordinary" dataset, and check the raw content. 
+ ds_encoded.close() + expected_bytes = make_bytearray(test_data, strlen) + check_raw_content(path, "vyxn", expected_bytes) + + @pytest.mark.parametrize("encoding", [None, "ascii"]) + def test_write_encoding_failure(self, tempdir, encoding): + path = tempdir / f"test_bytecoded_writestrings_encoding_{encoding}_fail.nc" + ds = make_encoded_dataset(path, strlen=5, encoding=encoding) + v = ds.variables["vxs"] + encoding_name = encoding + if encoding_name == None: + encoding_name = "ascii" + msg = ( + "String data written to netcdf character variable 'vxs'.*" + f" could not be represented in encoding '{encoding_name}'. " + ) + with pytest.raises(ValueError, match=msg): + v[:] = samples_3_nonascii + + @pytest.mark.parametrize("mode", ["invalid", "unsupported"]) + def test_write_badencoding_ignore(self, tempdir, mode): + if mode == "invalid": + encoding = "" + else: + encoding = "latin1" # "latin1" is a real thing + path = tempdir / f"test_bytecoded_writestrings_badencoding_{encoding}_ignore.nc" + ds = make_encoded_dataset(path, strlen=5, encoding=encoding) + v = ds.variables["vxs"] + msg = ( + r"Ignoring unsupported encoding for netCDF variable 'vxs': " + f".*'{encoding}', is not recognised as one of the supported encodings" + ) + with pytest.warns(IrisCfSaveWarning, match=msg): + v[:] = samples_3_ascii # will work OK + + def test_overlength(self, tempdir): + # Check expected behaviour with over-length data + path = tempdir / "test_bytecoded_writestrings_overlength.nc" + strlen = 6 + ds = make_encoded_dataset(path, strlen=strlen, encoding="utf8") + v = ds.variables["vxs"] + msg = ( + r"String .* written into netcdf variable 'vxs' with encoding \'utf-8\' " + r"is 7 bytes long, which exceeds .* 6\. 
This can be fixed by " + ) + with pytest.raises(TranslationError, match=msg): + v[:] = ["1", "éclair", "two"] + + def test_overlength_splitcoding(self, tempdir): + # Check expected behaviour when non-ascii multibyte coding gets truncated + path = tempdir / "test_bytecoded_writestrings_overlength_splitcoding.nc" + strlen = 5 + ds = make_encoded_dataset(path, strlen=strlen, encoding="utf-8") + v = ds.variables["vxs"] + # Note: we must do the assignment as a single byte array, to avoid hitting the + # safety check for this exact problem : see previous check. + byte_arrays = [ + string.encode("utf-8")[:strlen] for string in ("1", "1234ü", "two") + ] + nd_bytes_array = np.array( + [ + [bytes[i : i + 1] if i < len(bytes) else b"\0" for i in range(strlen)] + for bytes in byte_arrays + ] + ) + v[:] = nd_bytes_array + # This creates a problem: it won't read back + msg = ( + "Character data in variable 'vxs' could not be decoded " + "with the 'utf-8' encoding." + ) + with pytest.raises(ValueError, match=msg): + v[:] + + # Check also that we *can* read the raw content. + ds.close() + expected_bytes = [ + b"1", + b"1234\xc3", # NOTE: truncated encoding + b"two", + ] + expected_bytearray = make_bytearray(expected_bytes, strlen) + check_raw_content(path, "vxs", expected_bytearray) + + +class TestWriteChars: + @pytest.mark.parametrize("write_form", ["strings", "bytes"]) + def test_write_chars(self, tempdir, write_form): + encoding = "utf-8" + write_strings = samples_3_nonascii + strlen = strings_maxbytes(write_strings, encoding) + write_bytes = make_bytearray(write_strings, strlen, encoding=encoding) + # NOTE: 'flexi' form util decides the width needs to be 7 !! + path = tempdir / f"test_bytecoded_writechars_{write_form}.nc" + ds = make_encoded_dataset(path, encoding=encoding, strlen=strlen) + v = ds.variables["vxs"] + + # assign in *either* way.. + if write_form == "strings": + v[:] = write_strings + else: + v[:] = write_bytes + + # .. 
the result should be the same + ds.close() + check_raw_content(path, "vxs", write_bytes) + + +class TestRead: + """Test how character data is read and converted to strings. + + N.B. many testcases here parallel the 'TestWriteStrings' : we are creating test + datafiles with 'make_encoded_dataset' and assigning raw bytes, as-per 'TestWriteChars'. + + We are mostly checking here that reading back produces string arrays as expected. + However, each testcase also reads and checks the "raw" byte content by re-opening + with a non-encoded _thread_safe_nc.DatasetWrapper, to check content is as expected. + """ + + @pytest.fixture(params=["strings", "bytes"]) + def readmode(self, request): + return request.param + + def undecoded_testvar(self, ds_encoded, varname: str): + path = ds_encoded.filepath() + ds_encoded.close() + ds = DatasetWrapper(path) + v = ds.variables[varname] + v.set_auto_chartostring(False) + return v + + def test_encodings(self, encoding, tempdir, readmode): + # Create a dataset with the variable + path = tempdir / f"test_bytecoded_read_encodings_{encoding!s}_{readmode}.nc" + + if encoding in [None, "ascii"]: + write_strings = samples_3_ascii + write_encoding = "ascii" + else: + write_strings = samples_3_nonascii + write_encoding = encoding + + write_strings = write_strings.copy() # just for safety? + strlen = strings_maxbytes(write_strings, write_encoding) + write_bytes = make_bytearray(write_strings, strlen, encoding=write_encoding) + + ds_encoded = make_encoded_dataset(path, strlen, encoding) + v = ds_encoded.variables["vxs"] + v[:] = write_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = write_strings + if encoding in ("utf-8", ENCODING_UTF8_ALIAS, "utf-16"): + # In these cases, with the given non-ascii sample data, the + # "default minimum string length" is overestimated.
+ if encoding in ["utf-8", ENCODING_UTF8_ALIAS]: + assert strlen == 7 + assert result.dtype == "U7" + # correct the result dtype to pass the write_strings comparison below + truncated_result = result.astype("U4") + elif encoding == "utf-16": + assert strlen == 10 + assert result.dtype == "U4" + # correct the result dtype to pass the write_strings comparison below + truncated_result = result.astype("U4") + # Also check that content is the same (i.e. not actually truncated) + assert np.all(truncated_result == result) + result = truncated_result + else: + # Close and re-open as "regular" dataset -- just to check "raw" byte content + v = self.undecoded_testvar(ds_encoded, "vxs") + result = v[:] + expected = write_bytes + + check_array_matching(result, expected) + + def test_scalar(self, tempdir, readmode): + # Like 'test_write_strings', but the variable has *only* the string dimension. + path = tempdir / f"test_bytecoded_read_scalar_{readmode}.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",)) + + data_string = "stuff" + data_bytes = make_bytearray(data_string, 5) + + # Checks that we *can* write a string + v[:] = data_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = np.array(data_string) + else: + # Test "raw" read --> byte array + v = self.undecoded_testvar(ds_encoded, "v0_scalar") + result = v[:] + expected = data_bytes + + check_array_matching(result, expected) + + def test_multidim(self, tempdir, readmode): + # Like 'test_write_strings', but the variable has additional dimensions. 
+ path = tempdir / f"test_bytecoded_read_multidim_{readmode}.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + ds_encoded.createDimension("y", 2) + v = ds_encoded.createVariable( + "vyxn", + "S1", + ( + "y", + "x", + "strlen", + ), + ) + + # Check that we *can* write a multidimensional string array + test_strings = [ + ["one", "n", ""], + ["two", "xxxxx", "four"], + ] + test_bytes = make_bytearray(test_strings, strlen) + v[:] = test_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = np.array(test_strings) + else: + # Test "raw" read --> byte array + v = self.undecoded_testvar(ds_encoded, "vyxn") + result = v[:] + expected = test_bytes + + check_array_matching(result, expected) + + def test_read_encoding_failure(self, tempdir, readmode): + path = tempdir / f"test_bytecoded_read_encoding_failure_{readmode}.nc" + strlen = 10 + ds_encoded = make_encoded_dataset(path, strlen=strlen, encoding="ascii") + v = ds_encoded.variables["vxs"] + test_utf8_bytes = make_bytearray( + samples_3_nonascii, bytewidth=strlen, encoding="utf-8" + ) + v[:] = test_utf8_bytes + + if readmode == "strings": + msg = ( + "Character data in variable 'vxs' could not be decoded " + "with the 'ascii' encoding." + ) + with pytest.raises(ValueError, match=msg): + v[:] + else: + v = self.undecoded_testvar(ds_encoded, "vxs") + result = v[:] # this ought to be ok! 
+ + assert np.all(result == test_utf8_bytes) + + @pytest.mark.parametrize("mode", ["invalid", "unsupported"]) + def test_read_badencoding_ignore(self, tempdir, mode): + if mode == "invalid": + encoding = "" + else: + encoding = "latin1" # "latin1" is a real thing + path = tempdir / f"test_bytecoded_read_badencoding_{encoding}_ignore.nc" + strlen = 10 + ds = make_encoded_dataset(path, strlen=strlen, encoding=encoding) + v = ds.variables["vxs"] + test_utf8_bytes = make_bytearray( + samples_3_nonascii, bytewidth=strlen, encoding="utf-8" + ) + v[:] = test_utf8_bytes + + msg = ( + r"Ignoring unsupported encoding for netCDF variable 'vxs': " + f".*'{encoding}', is not recognised as one of the supported encodings" + ) + with pytest.warns(IrisCfLoadWarning, match=msg): + # raises warning but succeeds, due to default read encoding of 'utf-8' + v[:] + + +class TestObjectTypes: + """Check that the types of dataset content objects are consistent.""" + + @pytest.fixture + def samplefile_path(self, tmp_path): + testpath = tmp_path / "test.nc" + ds = netCDF4.Dataset(testpath, "w") + ds.createDimension("x", 4) + grp_a = ds.createGroup("grp_a") + ds.createVariable("vx", float, ["x"]) + grp_a.createVariable("a_vx", int, ["x"]) + ds.close() + return testpath + + @pytest.fixture(params=["netCDF4", "unencoded", "encoded"]) + def classtype(self, request): + param = request.param + if param == "netCDF4": + self.dataset_class = netCDF4.Dataset + self.group_class = netCDF4.Group + self.variable_class = netCDF4.Variable + elif param == "unencoded": + self.dataset_class = DatasetWrapper + self.group_class = GroupWrapper + self.variable_class = VariableWrapper + else: + self.dataset_class = EncodedDataset + self.group_class = EncodedGroup + self.variable_class = EncodedVariable + return param + + def test_dataset_nonencoded_types(self, samplefile_path, classtype): + ds = self.dataset_class(samplefile_path) + try: + grps = ds.groups + grp_a = grps["grp_a"] + assert type(grp_a) is 
self.group_class + assert grps == {"grp_a": grp_a} + + var_vx = ds.variables["vx"] + assert type(var_vx) is self.variable_class + + var_a_vx = grp_a.variables["a_vx"] + assert type(var_a_vx) is self.variable_class + + finally: + ds.close() + + @pytest.mark.parametrize("is_on", [True, False], ids=["c2sOn", "c2sOff"]) + @pytest.mark.parametrize("component_type", ["ds", "var", "group"]) + def test_auto_chartostring(self, samplefile_path, classtype, component_type, is_on): + ds = self.dataset_class(samplefile_path) + var = ds.variables["vx"] + grp = ds.groups["grp_a"] + component = {"ds": ds, "var": var, "group": grp}[component_type] + if classtype == "encoded" and is_on: + # In this case cannot turn "on": expect error + msg = '"auto_chartostring" is not supported by Iris EncodedDataset' + with pytest.raises(TypeError, match=msg): + component.set_auto_chartostring(is_on) + else: + # Just check method exists + doesn't error. + component.set_auto_chartostring(is_on)