Original variable encodings are retained (pangeo-forge#465)
derekocallaghan committed Jan 12, 2023
1 parent 4b5f124 commit 496036a
Showing 4 changed files with 36 additions and 5 deletions.
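For context, the core mechanism this commit relies on is xarray's ability to include each variable's encoding in the dictionary form of a dataset, which dataset_to_schema() can then retain. A minimal sketch, assuming a recent xarray; the dataset and encoding values below are made up:

import numpy as np
import pandas as pd
import xarray as xr

# Illustrative dataset with an explicit time encoding
time = pd.date_range("2010-01-01", periods=3)
ds = xr.Dataset({"foo": ("time", np.arange(3.0))}, coords={"time": time})
ds.time.encoding = {"units": "days since 2010-01-01", "calendar": "proleptic_gregorian"}

# With encoding=True, each variable entry carries its encoding alongside
# dims/attrs/dtype/shape, which is what the updated dataset_to_schema() keeps.
d = ds.to_dict(data=False, encoding=True)
print(d["coords"]["time"]["encoding"])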
27 changes: 23 additions & 4 deletions pangeo_forge_recipes/aggregation.py
@@ -23,11 +23,17 @@ class XarraySchema(TypedDict):


def dataset_to_schema(ds: xr.Dataset) -> XarraySchema:
"""Convert the output of `dataset.to_dict(data=False)` to a schema
"""Convert the output of `dataset.to_dict(data=False, encoding=True)` to a schema
(Basically just adds chunks, which is not part of the Xarray output).
"""

d = ds.to_dict(data=False)
# Remove redundant encoding options
for v in ds.variables:
    for option in ["_FillValue", "source"]:
        # TODO: confirm it is okay to remove _FillValue
        if option in ds[v].encoding:
            del ds[v].encoding[option]
d = ds.to_dict(data=False, encoding=True)
return XarraySchema(
attrs=d.get("attrs"),
coords=d.get("coords"),
@@ -164,6 +170,8 @@ def _combine_vars(v1, v2, concat_dim, allow_both=False):
raise DatasetCombineError(f"Can't merge datasets with the same variable {vname}")
attrs = _combine_attrs(v1[vname]["attrs"], v2[vname]["attrs"])
dtype = _combine_dtype(v1[vname]["dtype"], v2[vname]["dtype"])
# Can combine encoding using the same approach as attrs
encoding = _combine_attrs(v1[vname]["encoding"], v2[vname]["encoding"])
(d1, s1), (d2, s2) = (
(v1[vname]["dims"], v1[vname]["shape"]),
(v2[vname]["dims"], v2[vname]["shape"]),
@@ -182,7 +190,13 @@ def _combine_vars(v1, v2, concat_dim, allow_both=False):
)
else:
shape.append(l1)
new_vars[vname] = {"dims": dims, "attrs": attrs, "dtype": dtype, "shape": tuple(shape)}
new_vars[vname] = {
    "dims": dims,
    "attrs": attrs,
    "dtype": dtype,
    "shape": tuple(shape),
    "encoding": encoding,
}

return new_vars


@@ -199,9 +213,14 @@ def _to_variable(template, target_chunks):
# Xarray will pick a time encoding for the dataset (e.g. "days since 1970-01-01")
# and this may not be compatible with the actual values in the time coordinate
# (which we don't know yet)
data = dsa.zeros(shape=shape, chunks=chunks, dtype=dtype)
# TODO: add more encoding
encoding = {"chunks": chunks}
# TODO: is the previous comment still relevant now that
# variable encoding will be used if available?
encoding = template.get("encoding", {})
encoding["chunks"] = chunks
return xr.Variable(dims=dims, data=data, attrs=template["attrs"], encoding=encoding)


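The comment above says encoding is combined "using the same approach as attrs". A rough sketch of that idea, purely illustrative (combine_encoding is a made-up name; the real conflict handling lives in _combine_attrs() and may differ in detail):

def combine_encoding(e1: dict, e2: dict) -> dict:
    # Keep keys from both fragments; complain if the same key disagrees.
    combined = dict(e1)
    for key, value in e2.items():
        if key in combined and combined[key] != value:
            raise ValueError(f"Inconsistent encoding for {key!r}: {combined[key]!r} vs {value!r}")
        combined[key] = value
    return combined

# Two fragments of the same variable with matching time encodings merge cleanly:
combine_encoding(
    {"units": "days since 2010-01-01", "calendar": "proleptic_gregorian"},
    {"units": "days since 2010-01-01", "calendar": "proleptic_gregorian"},
)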
6 changes: 6 additions & 0 deletions tests/data_generation.py
@@ -34,4 +34,10 @@ def make_ds(nt=10, non_dim_coords=False):
coords=coords,
attrs={"conventions": "CF 1.6"},
)

# Add time coord encoding
ds.time.encoding = {
    "units": f"days since {time[0]}",
    "calendar": "proleptic_gregorian",
}

return ds
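Why pin the time coordinate's encoding in the test data? A hedged sketch of the effect, assuming zarr is installed; the path and dates are made up:

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range("2010-01-01", periods=4)
ds = xr.Dataset({"bar": ("time", np.ones(4))}, coords={"time": time})
ds.time.encoding = {"units": f"days since {time[0]}", "calendar": "proleptic_gregorian"}

# Without an explicit encoding, xarray picks its own reference date when
# writing; with it, the stored units keep the epoch chosen above.
ds.to_zarr("example.zarr", mode="w")
print(xr.open_dataset("example.zarr", engine="zarr").time.encoding["units"])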
2 changes: 2 additions & 0 deletions tests/test_aggregation.py
@@ -31,6 +31,8 @@ def test_schema_to_template_ds(specified_chunks):
chunksize = var.chunksizes[dim]
expected_chunksize = _expected_chunks(size, specified_chunks.get(dim, None))
assert chunksize == expected_chunksize
# Confirm original time units have been preserved
assert ds.time.encoding['units'] == dst.time.encoding['units']
schema2 = dataset_to_schema(dst)
assert schema == schema2

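For orientation, the round trip that test_schema_to_template_ds exercises looks roughly like this; a sketch assuming the helper names shown above, with the make_ds import path simplified for illustration:

from pangeo_forge_recipes.aggregation import dataset_to_schema, schema_to_template_ds
from data_generation import make_ds  # tests/data_generation.py

ds = make_ds()
dst = schema_to_template_ds(dataset_to_schema(ds))
# With this change, the template dataset keeps the original time units.
assert ds.time.encoding["units"] == dst.time.encoding["units"]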
6 changes: 5 additions & 1 deletion tests/test_writers.py
@@ -17,7 +17,7 @@ def temp_store(tmp_path):
def test_store_dataset_fragment(temp_store):

ds = make_ds(non_dim_coords=True)
schema = ds.to_dict(data=False)
schema = ds.to_dict(data=False, encoding=True)
schema["chunks"] = {}

ds.to_zarr(temp_store)
@@ -138,3 +138,7 @@ def test_store_dataset_fragment(temp_store):
ds_target = xr.open_dataset(temp_store, engine="zarr").load()

xr.testing.assert_identical(ds, ds_target)
# assert_identical() doesn't check encoding
# Checking the original time encoding units should be sufficient
# Zarr retains the original "days since %Y-%m-%d" and removes the " %H:%M:%S" part
assert " ".join(ds.time.encoding["units"].split(" ")[0:-1]) == ds_target.time.encoding["units"]
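To make that last assertion concrete, a worked example with made-up values:

units = "days since 2010-01-01 00:00:00"  # e.g. what make_ds() sets via f"days since {time[0]}"
stored = "days since 2010-01-01"  # units reported back from the zarr store
assert " ".join(units.split(" ")[0:-1]) == stored  # the trailing "00:00:00" is dropped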
