From f3d74e82657ed4d4e6cf1d2aa5fdb93d0370e9e1 Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Sun, 25 Sep 2016 19:09:20 +0100 Subject: [PATCH 01/11] Disabled auto-caching dask arrays when pickling and when invoking the .values property. Added new method .compute(). --- doc/whats-new.rst | 15 ++++--- xarray/core/dataarray.py | 13 ++++++ xarray/core/dataset.py | 43 ++++++++++++++++-- xarray/core/variable.py | 51 +++++++++++++++++---- xarray/test/test_backends.py | 86 ++++++++++++++++++++++++++++++------ 5 files changed, 178 insertions(+), 30 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9d300741955..583e3062665 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,10 @@ Breaking changes merges will now succeed in cases that previously raised ``xarray.MergeError``. Set ``compat='broadcast_equals'`` to restore the previous default. +- Pickling an xarray object based on the dask backend, or reading its + :py:meth:`values` property, won't automatically convert the backend to numpy + in the original object anymore. + By `Guido Imperiale `_. Deprecations ~~~~~~~~~~~~ @@ -45,33 +49,32 @@ Deprecations Enhancements ~~~~~~~~~~~~ - Add checking of ``attr`` names and values when saving to netCDF, raising useful -error messages if they are invalid. (:issue:`911`). -By `Robin Wilson `_. - + error messages if they are invalid. (:issue:`911`). + By `Robin Wilson `_. - Added ability to save ``DataArray`` objects directly to netCDF files using :py:meth:`~xarray.DataArray.to_netcdf`, and to load directly from netCDF files using :py:func:`~xarray.open_dataarray` (:issue:`915`). These remove the need to convert a ``DataArray`` to a ``Dataset`` before saving as a netCDF file, and deals with names to ensure a perfect 'roundtrip' capability. By `Robin Wilson `_. - - Added the ``compat`` option ``'no_conflicts'`` to ``merge``, allowing the combination of xarray objects with disjoint (:issue:`742`) or overlapping (:issue:`835`) coordinates as long as all present data agrees. By `Johnnie Gray `_. See :ref:`combining.no_conflicts` for more details. - - It is now possible to set ``concat_dim=None`` explicitly in :py:func:`~xarray.open_mfdataset` to disable inferring a dimension along which to concatenate. By `Stephan Hoyer `_. +- Added :py:meth:`compute` method to :py:class:`DataArray`, :py:class:`Dataset`, + and :py:class:`Variable` as a non-destructive alternative to :py:meth:`load`. + By `Guido Imperiale `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 44d5865ad37..79fdd7c1100 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -561,6 +561,19 @@ def load(self): self._coords = new._coords return self + def compute(self): + """Manually trigger loading of this array's data from disk or a + remote source into memory and return a new array. The original is + left unaltered. + + Normally, it should not be necessary to call this method in user code, + because all xarray functions should either work on deferred data or + load data automatically. However, this method can be necessary when + working with many file objects on disk. + """ + new = self.copy(deep=False) + return new.load() + def copy(self, deep=True): """Returns a copy of this array. 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 45650d21501..475fcdab231 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -255,8 +255,11 @@ def load_store(cls, store, decoder=None):
         return obj
 
     def __getstate__(self):
-        """Always load data in-memory before pickling"""
-        self.load()
+        """Load data in-memory before pickling (except for Dask data)"""
+        for v in self.variables.values():
+            if not isinstance(v.data, dask_array_type):
+                v.load()
+
         # self.__dict__ is the default pickle object, we don't need to
         # implement our own __setstate__ method to make pickle work
         state = self.__dict__.copy()
@@ -306,7 +309,7 @@ def load(self):
         """
         # access .data to coerce everything to numpy or dask arrays
         all_data = dict((k, v.data) for k, v in self.variables.items())
-        lazy_data = dict((k, v) for k, v in all_data.items()
+        lazy_data = OrderedDict((k, v) for k, v in all_data.items()
                          if isinstance(v, dask_array_type))
         if lazy_data:
             import dask.array as da
@@ -319,6 +322,40 @@
 
         return self
 
+    def compute(self):
+        """Manually trigger loading of this dataset's data from disk or a
+        remote source into memory and return a new dataset. The original is
+        left unaltered.
+
+        Normally, it should not be necessary to call this method in user code,
+        because all xarray functions should either work on deferred data or
+        load data automatically. However, this method can be necessary when
+        working with many file objects on disk.
+        """
+        # Can't just do the below, because new.variables[k] is the
+        # same object as self.variables[k]!
+        # new = self.copy(deep=False)
+        # return new.load()
+
+        variables = OrderedDict((k, v.copy(deep=False))
+                                for (k, v) in self.variables.items())
+        lazy_data = OrderedDict((k, v.data) for k, v in variables.items()
+                                if isinstance(v.data, dask_array_type))
+
+        if lazy_data:
+            import dask.array as da
+
+            # evaluate all the dask arrays simultaneously
+            evaluated_data = da.compute(*lazy_data.values())
+
+            for k, data in zip(lazy_data, evaluated_data):
+                variables[k] = variables[k].copy(deep=False)
+                variables[k].data = data
+
+        # skip __init__ to avoid costly validation
+        return self._construct_direct(variables, self._coord_names.copy(),
+                                      self._dims.copy(), self._attrs_copy())
+
     @classmethod
     def _construct_direct(cls, variables, coord_names, dims=None, attrs=None,
                           file_obj=None):
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index ed8d15f03b5..2fa05c07404 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -272,9 +272,19 @@ def data(self, data):
         self._data = data
 
     def _data_cached(self):
-        if not isinstance(self._data, (np.ndarray, PandasIndexAdapter)):
-            self._data = np.asarray(self._data)
-        return self._data
+        """Load data into memory and return it.
+        Do not cache dask arrays automatically; that should
+        require an explicit load() call.
+        """
+        if isinstance(self._data, (np.ndarray, PandasIndexAdapter)):
+            new_data = self._data
+        else:
+            new_data = np.asarray(self._data)
+
+        if not isinstance(self._data, dask_array_type):
+            self._data = new_data
+
+        return new_data
 
     @property
     def _indexable_data(self):
@@ -288,11 +298,26 @@ def load(self):
         because all xarray functions should either work on deferred data or
         load data automatically.
         """
-        self._data_cached()
+        new_data = self._data_cached()
+        if isinstance(self._data, dask_array_type):
+            self._data = new_data
         return self
 
+    def compute(self):
+        """Manually trigger loading of this variable's data from disk or a
+        remote source into memory and return a new variable. The original is
+        left unaltered.
+
+        Normally, it should not be necessary to call this method in user code,
+        because all xarray functions should either work on deferred data or
+        load data automatically.
+        """
+        new = self.copy(deep=False)
+        return new.load()
+
     def __getstate__(self):
-        """Always cache data as an in-memory array before pickling"""
+        """Always cache data as an in-memory array before pickling
+        (with the exception of dask backend)"""
         self._data_cached()
         # self.__dict__ is the default pickle object, we don't need to
         # implement our own __setstate__ method to make pickle work
@@ -1091,9 +1116,19 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
                              type(self).__name__)
 
     def _data_cached(self):
-        if not isinstance(self._data, PandasIndexAdapter):
-            self._data = PandasIndexAdapter(self._data)
-        return self._data
+        """Load data into memory and return it.
+        Do not cache dask arrays automatically; that should
+        require an explicit load() call.
+        """
+        if isinstance(self._data, PandasIndexAdapter):
+            new_data = self._data
+        else:
+            new_data = PandasIndexAdapter(self._data)
+
+        if not isinstance(self._data, dask_array_type):
+            self._data = new_data
+
+        return new_data
 
     def __getitem__(self, key):
         key = self._item_key_to_tuple(key)
diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py
index eeb5561579b..8cae1b9b98c 100644
--- a/xarray/test/test_backends.py
+++ b/xarray/test/test_backends.py
@@ -149,6 +149,27 @@ def assert_loads(vars=None):
         actual = ds.load()
         self.assertDatasetAllClose(expected, actual)
 
+    def test_dataset_compute(self):
+        expected = create_test_data()
+
+        with self.roundtrip(expected) as actual:
+            # Test Dataset.compute()
+            for v in actual.variables.values():
+                self.assertFalse(v._in_memory)
+
+            computed = actual.compute()
+
+            # indexes are going to be loaded in memory anyway, even
+            # with compute(). This is by design.
+ for v in actual.data_vars.values(): + self.assertFalse(v._in_memory) + + for v in computed.variables.values(): + self.assertTrue(v._in_memory) + + self.assertDatasetAllClose(expected, actual) + self.assertDatasetAllClose(expected, computed) + def test_roundtrip_None_variable(self): expected = Dataset({None: (('x', 'y'), [[0, 1], [2, 3]])}) with self.roundtrip(expected) as actual: @@ -230,18 +251,6 @@ def test_roundtrip_coordinates(self): with self.roundtrip(expected) as actual: self.assertDatasetIdentical(expected, actual) - expected = original.copy() - expected.attrs['coordinates'] = 'something random' - with self.assertRaisesRegexp(ValueError, 'cannot serialize'): - with self.roundtrip(expected): - pass - - expected = original.copy(deep=True) - expected['foo'].attrs['coordinates'] = 'something random' - with self.assertRaisesRegexp(ValueError, 'cannot serialize'): - with self.roundtrip(expected): - pass - def test_roundtrip_boolean_dtype(self): original = create_boolean_data() self.assertEqual(original['x'].dtype, 'bool') @@ -872,7 +881,26 @@ def test_read_byte_attrs_as_unicode(self): @requires_dask @requires_scipy @requires_netCDF4 -class DaskTest(TestCase): +class DaskTest(TestCase, DatasetIOTestCases): + @contextlib.contextmanager + def create_store(self): + yield Dataset() + + @contextlib.contextmanager + def roundtrip(self, data, save_kwargs={}, open_kwargs={}): + yield data.chunk() + + def test_roundtrip_datetime_data(self): + # Override method in DatasetIOTestCases - remove not applicable save_kwds + times = pd.to_datetime(['2000-01-01', '2000-01-02', 'NaT']) + expected = Dataset({'t': ('t', times), 't0': times[0]}) + with self.roundtrip(expected) as actual: + self.assertDatasetIdentical(expected, actual) + + def test_write_store(self): + # Override method in DatasetIOTestCases - not applicable to dask + pass + def test_open_mfdataset(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp1: @@ -992,6 +1020,38 @@ def test_deterministic_names(self): self.assertIn(tmp, dask_name) self.assertEqual(original_names, repeat_names) + def test_dataarray_compute(self): + actual = DataArray([1,2]).chunk() + computed = actual.compute() + self.assertFalse(actual._in_memory) + self.assertTrue(computed._in_memory) + self.assertDataArrayAllClose(actual, computed) + + def test_pickle(self): + # Test that pickling/unpickling does not convert the dask + # backend to numpy + a1 = DataArray([1,2]).chunk() + self.assertFalse(a1._in_memory) + a2 = pickle.loads(pickle.dumps(a1)) + self.assertDataArrayIdentical(a1, a2) + self.assertFalse(a1._in_memory) + self.assertFalse(a2._in_memory) + + ds1 = Dataset({'a': [1,2]}).chunk() + self.assertFalse(ds1['a']._in_memory) + ds2 = pickle.loads(pickle.dumps(ds1)) + self.assertDatasetIdentical(ds1, ds2) + self.assertFalse(ds1['a']._in_memory) + self.assertFalse(ds2['a']._in_memory) + + def test_values(self): + # Test that invoking the values property does not convert the dask + # backend to numpy + a = DataArray([1,2]).chunk() + self.assertFalse(a._in_memory) + self.assertEquals(a.values.tolist(), [1, 2]) + self.assertFalse(a._in_memory) + @requires_scipy_or_netCDF4 @requires_pydap From 26a69971ef04afb04e64d61e3f2bd6bcc82c79be Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Fri, 30 Sep 2016 12:33:29 +0100 Subject: [PATCH 02/11] Minor tweaks --- doc/whats-new.rst | 9 +++++---- xarray/core/variable.py | 28 ++++++++++------------------ xarray/test/test_backends.py | 34 ++++------------------------------ 
xarray/test/test_dask.py | 27 +++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 52 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 583e3062665..c7d3a6d0878 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -26,8 +26,8 @@ Breaking changes ``xarray.MergeError``. Set ``compat='broadcast_equals'`` to restore the previous default. - Pickling an xarray object based on the dask backend, or reading its - :py:meth:`values` property, won't automatically convert the backend to numpy - in the original object anymore. + :py:meth:`values` property, won't automatically convert the array from dask + to numpy in the original object anymore. By `Guido Imperiale `_. Deprecations @@ -72,8 +72,9 @@ Enhancements :py:func:`~xarray.open_mfdataset` to disable inferring a dimension along which to concatenate. By `Stephan Hoyer `_. -- Added :py:meth:`compute` method to :py:class:`DataArray`, :py:class:`Dataset`, - and :py:class:`Variable` as a non-destructive alternative to :py:meth:`load`. +- Added methods :py:meth:`DataArray.compute`, :py:meth:`Dataset.compute`, and + :py:meth:`Variable.compute` as a non-mutating alternative to + :py:meth:`~DataArray.load`. By `Guido Imperiale `_. Bug fixes diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 2fa05c07404..1b0eb3f0dee 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -271,19 +271,20 @@ def data(self, data): "replacement data must match the Variable's shape") self._data = data + def _data_cast(self): + if isinstance(self._data, (np.ndarray, PandasIndexAdapter)): + return self._data + else: + return np.asarray(self._data) + def _data_cached(self): """Load data into memory and return it. Do not cache dask arrays automatically; that should require an explicit load() call. """ - if isinstance(self._data, (np.ndarray, PandasIndexAdapter)): - new_data = self._data - else: - new_data = np.asarray(self._data) - + new_data = self._data_cast() if not isinstance(self._data, dask_array_type): self._data = new_data - return new_data @property @@ -1115,20 +1116,11 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): raise ValueError('%s objects must be 1-dimensional' % type(self).__name__) - def _data_cached(self): - """Load data into memory and return it. - Do not cache dask arrays automatically; that should - require an explicit load() call. - """ + def _data_cast(self): if isinstance(self._data, PandasIndexAdapter): - new_data = self._data + return self._data else: - new_data = PandasIndexAdapter(self._data) - - if not isinstance(self._data, dask_array_type): - self._data = new_data - - return new_data + return PandasIndexAdapter(self._data) def __getitem__(self, key): key = self._item_key_to_tuple(key) diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index 8cae1b9b98c..0ff5cffec26 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -159,11 +159,8 @@ def test_dataset_compute(self): computed = actual.compute() - # indexes are going to be loaded in memory anyway, even - # with compute(). This is by design. for v in actual.data_vars.values(): self.assertFalse(v._in_memory) - for v in computed.variables.values(): self.assertTrue(v._in_memory) @@ -1021,38 +1018,15 @@ def test_deterministic_names(self): self.assertEqual(original_names, repeat_names) def test_dataarray_compute(self): + # Test DataArray.compute() on dask backend. 
+ # The test for Dataset.compute() is already in DatasetIOTestCases; + # however dask is the only tested backend which supports DataArrays actual = DataArray([1,2]).chunk() computed = actual.compute() self.assertFalse(actual._in_memory) self.assertTrue(computed._in_memory) self.assertDataArrayAllClose(actual, computed) - - def test_pickle(self): - # Test that pickling/unpickling does not convert the dask - # backend to numpy - a1 = DataArray([1,2]).chunk() - self.assertFalse(a1._in_memory) - a2 = pickle.loads(pickle.dumps(a1)) - self.assertDataArrayIdentical(a1, a2) - self.assertFalse(a1._in_memory) - self.assertFalse(a2._in_memory) - - ds1 = Dataset({'a': [1,2]}).chunk() - self.assertFalse(ds1['a']._in_memory) - ds2 = pickle.loads(pickle.dumps(ds1)) - self.assertDatasetIdentical(ds1, ds2) - self.assertFalse(ds1['a']._in_memory) - self.assertFalse(ds2['a']._in_memory) - - def test_values(self): - # Test that invoking the values property does not convert the dask - # backend to numpy - a = DataArray([1,2]).chunk() - self.assertFalse(a._in_memory) - self.assertEquals(a.values.tolist(), [1, 2]) - self.assertFalse(a._in_memory) - - + @requires_scipy_or_netCDF4 @requires_pydap class PydapTest(TestCase): diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 13a8817ce68..7af8aed4975 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -1,3 +1,4 @@ +import pickle import numpy as np import pandas as pd @@ -321,3 +322,29 @@ def test_dot(self): eager = self.eager_array.dot(self.eager_array[0]) lazy = self.lazy_array.dot(self.lazy_array[0]) self.assertLazyAndAllClose(eager, lazy) + + def test_dataarray_pickle(self): + # Test that pickling/unpickling does not convert the dask + # backend to numpy + a1 = DataArray([1,2]).chunk() + self.assertFalse(a1._in_memory) + a2 = pickle.loads(pickle.dumps(a1)) + self.assertDataArrayIdentical(a1, a2) + self.assertFalse(a1._in_memory) + self.assertFalse(a2._in_memory) + + def test_dataset_pickle(self): + ds1 = Dataset({'a': [1,2]}).chunk() + self.assertFalse(ds1['a']._in_memory) + ds2 = pickle.loads(pickle.dumps(ds1)) + self.assertDatasetIdentical(ds1, ds2) + self.assertFalse(ds1['a']._in_memory) + self.assertFalse(ds2['a']._in_memory) + + def test_values(self): + # Test that invoking the values property does not convert the dask + # backend to numpy + a = DataArray([1,2]).chunk() + self.assertFalse(a._in_memory) + self.assertEquals(a.values.tolist(), [1, 2]) + self.assertFalse(a._in_memory) From 03fbdd1a424991b1f2d6f6e2d304a7e751a03864 Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Fri, 30 Sep 2016 13:18:49 +0100 Subject: [PATCH 03/11] Simplified Dataset.copy() and Dataset.compute() --- xarray/core/dataset.py | 32 ++++---------------------------- xarray/test/test_dataset.py | 7 +++++-- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 475fcdab231..d060e845ceb 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -332,29 +332,8 @@ def compute(self): load data automatically. However, this method can be necessary when working with many file objects on disk. """ - # Can't just do the below, because new.variables[k] is the - # same object as self.variables[k]! 
-        # new = self.copy(deep=False)
-        # return new.load()
-
-        variables = OrderedDict((k, v.copy(deep=False))
-                                for (k, v) in self.variables.items())
-        lazy_data = OrderedDict((k, v.data) for k, v in variables.items()
-                                if isinstance(v.data, dask_array_type))
-
-        if lazy_data:
-            import dask.array as da
-
-            # evaluate all the dask arrays simultaneously
-            evaluated_data = da.compute(*lazy_data.values())
-
-            for k, data in zip(lazy_data, evaluated_data):
-                variables[k] = variables[k].copy(deep=False)
-                variables[k].data = data
-
-        # skip __init__ to avoid costly validation
-        return self._construct_direct(variables, self._coord_names.copy(),
-                                      self._dims.copy(), self._attrs_copy())
+        new = self.copy(deep=False)
+        return new.load()
 
     @classmethod
     def _construct_direct(cls, variables, coord_names, dims=None, attrs=None,
@@ -441,11 +420,8 @@ def copy(self, deep=False):
         Otherwise, a shallow copy is made, so each variable in the new dataset
         is also a variable in the original dataset.
         """
-        if deep:
-            variables = OrderedDict((k, v.copy(deep=True))
-                                    for k, v in iteritems(self._variables))
-        else:
-            variables = self._variables.copy()
+        variables = OrderedDict((k, v.copy(deep=deep))
+                                for k, v in iteritems(self._variables))
         # skip __init__ to avoid costly validation
         return self._construct_direct(variables, self._coord_names.copy(),
                                       self._dims.copy(), self._attrs_copy())
diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py
index b9798dbd611..bac07d68d0f 100644
--- a/xarray/test/test_dataset.py
+++ b/xarray/test/test_dataset.py
@@ -1287,10 +1287,13 @@ def test_copy(self):
 
         for copied in [data.copy(deep=False), copy(data)]:
             self.assertDatasetIdentical(data, copied)
-            for k in data:
+            # Note: IndexVariable objects with string dtype are always
+            # copied because of xarray.core.utils.safe_cast_to_index.
+            # Limiting the test to data variables.
+            for k in data.data_vars:
                 v0 = data.variables[k]
                 v1 = copied.variables[k]
-                self.assertIs(v0, v1)
+                assert source_ndarray(v0.data) is source_ndarray(v1.data)
             copied['foo'] = ('z', np.arange(5))
             self.assertNotIn('foo', data)

From b04167ce84f18724482435591e4eaa619f6ced9a Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Fri, 30 Sep 2016 18:24:43 +0100
Subject: [PATCH 04/11] Minor cleanup

---
 xarray/core/dataset.py   |  7 ++++---
 xarray/core/variable.py  |  6 ++----
 xarray/test/test_dask.py | 40 +++++++++------------------------------
 3 files changed, 15 insertions(+), 38 deletions(-)

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index d060e845ceb..30eaf167f6e 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -309,7 +309,7 @@ def load(self):
         """
         # access .data to coerce everything to numpy or dask arrays
         all_data = dict((k, v.data) for k, v in self.variables.items())
-        lazy_data = OrderedDict((k, v) for k, v in all_data.items()
+        lazy_data = dict((k, v) for k, v in all_data.items()
                          if isinstance(v, dask_array_type))
         if lazy_data:
             import dask.array as da
@@ -417,8 +417,9 @@ def copy(self, deep=False):
         """Returns a copy of this dataset.
 
         If `deep=True`, a deep copy is made of each of the component variables.
-        Otherwise, a shallow copy is made, so each variable in the new dataset
-        is also a variable in the original dataset.
+        Otherwise, a shallow copy of each of the component variables is made,
+        so that each variable in the new dataset shares underlying memory with
+        the original dataset.
""" variables = OrderedDict((k, v.copy(deep=deep)) for k, v in iteritems(self._variables)) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 1b0eb3f0dee..9c752365281 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -298,10 +298,8 @@ def load(self): Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or load data automatically. - """ - new_data = self._data_cached() - if isinstance(self._data, dask_array_type): - self._data = new_data + """ + self._data = self._data_cast() return self def compute(self): diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 7af8aed4975..9db710427d2 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -13,44 +13,22 @@ import dask.array as da -def _copy_at_variable_level(arg): - """We need to copy the argument at the level of xarray.Variable objects, so - that viewing its values does not trigger lazy loading. - """ - if isinstance(arg, Variable): - return arg.copy(deep=False) - elif isinstance(arg, DataArray): - ds = arg.to_dataset(name='__copied__') - return _copy_at_variable_level(ds)['__copied__'] - elif isinstance(arg, Dataset): - ds = arg.copy() - for k in list(ds): - ds._variables[k] = ds._variables[k].copy(deep=False) - return ds - else: - assert False - - class DaskTestCase(TestCase): def assertLazyAnd(self, expected, actual, test): - expected_copy = _copy_at_variable_level(expected) - actual_copy = _copy_at_variable_level(actual) + expected.name = None + actual.name = None with dask.set_options(get=dask.get): - test(actual_copy, expected_copy) - var = getattr(actual, 'variable', actual) - self.assertIsInstance(var.data, da.Array) + test(actual, expected) + if isinstance(actual, Dataset): + for var in actual.variables.values(): + self.assertIsInstance(var.data, da.Array) + else: + var = getattr(actual, 'variable', actual) + self.assertIsInstance(var.data, da.Array) @requires_dask class TestVariable(DaskTestCase): - def assertLazyAnd(self, expected, actual, test): - expected_copy = expected.copy(deep=False) - actual_copy = actual.copy(deep=False) - with dask.set_options(get=dask.get): - test(actual_copy, expected_copy) - var = getattr(actual, 'variable', actual) - self.assertIsInstance(var.data, da.Array) - def assertLazyAndIdentical(self, expected, actual): self.assertLazyAnd(expected, actual, self.assertVariableIdentical) From ca94cc7c0023f38725b8ce842c20a83e7c68f51a Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Sat, 1 Oct 2016 11:45:51 +0100 Subject: [PATCH 05/11] Cleaned up dask test --- xarray/test/test_dask.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 9db710427d2..b78907f5708 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -15,8 +15,6 @@ class DaskTestCase(TestCase): def assertLazyAnd(self, expected, actual, test): - expected.name = None - actual.name = None with dask.set_options(get=dask.get): test(actual, expected) if isinstance(actual, Dataset): @@ -178,6 +176,9 @@ def assertLazyAndIdentical(self, expected, actual): def assertLazyAndAllClose(self, expected, actual): self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose) + def assertLazyAndEqual(self, expected, actual): + self.assertLazyAnd(expected, actual, self.assertDataArrayEqual) + def setUp(self): self.values = np.random.randn(4, 6) self.data = da.from_array(self.values, chunks=(2, 2)) @@ 
-245,7 +246,7 @@ def test_to_dataset_roundtrip(self): v = self.lazy_array expected = u.assign_coords(x=u['x']) - self.assertLazyAndIdentical(expected, v.to_dataset('x').to_array('x')) + self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x')) def test_merge(self): @@ -254,7 +255,7 @@ def duplicate_and_merge(array): expected = duplicate_and_merge(self.eager_array) actual = duplicate_and_merge(self.lazy_array) - self.assertLazyAndIdentical(expected, actual) + self.assertLazyAndEqual(expected, actual) def test_ufuncs(self): u = self.eager_array @@ -267,9 +268,9 @@ def test_where_dispatching(self): x = da.from_array(a, 5) y = da.from_array(b, 5) expected = DataArray(a).where(b) - self.assertLazyAndIdentical(expected, DataArray(a).where(y)) - self.assertLazyAndIdentical(expected, DataArray(x).where(b)) - self.assertLazyAndIdentical(expected, DataArray(x).where(y)) + self.assertLazyAndEqual(expected, DataArray(a).where(y)) + self.assertLazyAndEqual(expected, DataArray(x).where(b)) + self.assertLazyAndEqual(expected, DataArray(x).where(y)) def test_simultaneous_compute(self): ds = Dataset({'foo': ('x', range(5)), @@ -294,7 +295,7 @@ def test_stack(self): expected = DataArray(data.reshape(2, -1), {'w': [0, 1], 'z': z}, dims=['w', 'z']) assert stacked.data.chunks == expected.data.chunks - self.assertLazyAndIdentical(expected, stacked) + self.assertLazyAndEqual(expected, stacked) def test_dot(self): eager = self.eager_array.dot(self.eager_array[0]) From e46b61f788152e45416ab68e012ab9cc9c3f03ee Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Sat, 1 Oct 2016 12:22:57 +0100 Subject: [PATCH 06/11] Integrate no_dask_resolve with dask_broadcast branches --- xarray/test/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 7f93ab18a50..1326d936249 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -332,5 +332,5 @@ def test_from_dask_variable(self): # Test array creation from Variable with dask backend. # This is used e.g. in broadcast() a = DataArray(self.lazy_array.variable) - self.assertLazyAndIdentical(self.lazy_array, a) + self.assertLazyAndEqual(self.lazy_array, a) From 90743f0bc51b45ab3ac9f91d9d534d303f6d6b1b Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Sat, 1 Oct 2016 18:37:20 +0100 Subject: [PATCH 07/11] Don't chunk coords --- xarray/core/dataset.py | 11 ++++++++++- xarray/test/test_backends.py | 6 +++--- xarray/test/test_dask.py | 32 ++++++++++++++++++++------------ xarray/test/test_dataset.py | 17 ++++++++++++----- 4 files changed, 45 insertions(+), 21 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 30eaf167f6e..4c0a8163b95 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -806,13 +806,19 @@ def chunks(self): array. 
""" chunks = {} - for v in self.variables.values(): + for v in self.data_vars.values(): if v.chunks is not None: new_chunks = list(zip(v.dims, v.chunks)) if any(chunk != chunks[d] for d, chunk in new_chunks if d in chunks): raise ValueError('inconsistent chunks') chunks.update(new_chunks) + if chunks: + # Add dims that are defined in the coords but are not in data_vars + for v in self.coords.values(): + for dim in v.dims: + if dim not in chunks: + chunks[dim] = (v.size,) return Frozen(SortedKeysDict(chunks)) def chunk(self, chunks=None, name_prefix='xarray-', token=None, @@ -865,6 +871,9 @@ def selkeys(dict_, keys): return dict((d, dict_[d]) for d in keys if d in dict_) def maybe_chunk(name, var, chunks): + if name not in self.data_vars: + return var + chunks = selkeys(chunks, var.dims) if not chunks: chunks = None diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index 0ff5cffec26..8d1a1e0f783 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -125,7 +125,7 @@ def assert_loads(vars=None): if vars is None: vars = expected with self.roundtrip(expected) as actual: - for v in actual.variables.values(): + for v in actual.data_vars.values(): self.assertFalse(v._in_memory) yield actual for k, v in actual.variables.items(): @@ -154,7 +154,7 @@ def test_dataset_compute(self): with self.roundtrip(expected) as actual: # Test Dataset.compute() - for v in actual.variables.values(): + for v in actual.data_vars.values(): self.assertFalse(v._in_memory) computed = actual.compute() @@ -1026,7 +1026,7 @@ def test_dataarray_compute(self): self.assertFalse(actual._in_memory) self.assertTrue(computed._in_memory) self.assertDataArrayAllClose(actual, computed) - + @requires_scipy_or_netCDF4 @requires_pydap class PydapTest(TestCase): diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 1326d936249..9ae83adddde 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -18,11 +18,19 @@ def assertLazyAnd(self, expected, actual, test): with dask.set_options(get=dask.get): test(actual, expected) if isinstance(actual, Dataset): - for var in actual.variables.values(): - self.assertIsInstance(var.data, da.Array) - else: - var = getattr(actual, 'variable', actual) - self.assertIsInstance(var.data, da.Array) + for k, v in actual.variables.items(): + if k in actual.data_vars: + self.assertIsInstance(var.data, da.Array) + else: + self.assertIsInstance(var.data, np.ndarray) + elif isinstance(actual, DataArray): + self.assertIsInstance(actual.data, da.Array) + for coord in actual.coords.values(): + self.assertIsInstance(coord.data, np.ndarray) + elif isinstance(actual, Variable): + self.assertIsInstance(actual.data, da.Array) + else: + assert False @requires_dask @@ -188,6 +196,7 @@ def setUp(self): def test_rechunk(self): chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2}) self.assertEqual(chunked.chunks, ((2,) * 2, (2,) * 3)) + self.assertLazyAndIdentical(self.lazy_array, chunked) def test_new_chunk(self): chunked = self.eager_array.chunk() @@ -306,19 +315,19 @@ def test_dataarray_pickle(self): # Test that pickling/unpickling does not convert the dask # backend to numpy a1 = DataArray([1,2]).chunk() - self.assertFalse(a1._in_memory) + self.assertFalse(a1._in_memory) a2 = pickle.loads(pickle.dumps(a1)) self.assertDataArrayIdentical(a1, a2) self.assertFalse(a1._in_memory) - self.assertFalse(a2._in_memory) + self.assertFalse(a2._in_memory) def test_dataset_pickle(self): - ds1 = Dataset({'a': [1,2]}).chunk() - 
self.assertFalse(ds1['a']._in_memory) + ds1 = Dataset({'a': DataArray([1,2])}).chunk() + self.assertFalse(ds1['a']._in_memory) ds2 = pickle.loads(pickle.dumps(ds1)) self.assertDatasetIdentical(ds1, ds2) self.assertFalse(ds1['a']._in_memory) - self.assertFalse(ds2['a']._in_memory) + self.assertFalse(ds2['a']._in_memory) def test_values(self): # Test that invoking the values property does not convert the dask @@ -326,11 +335,10 @@ def test_values(self): a = DataArray([1,2]).chunk() self.assertFalse(a._in_memory) self.assertEquals(a.values.tolist(), [1, 2]) - self.assertFalse(a._in_memory) + self.assertFalse(a._in_memory) def test_from_dask_variable(self): # Test array creation from Variable with dask backend. # This is used e.g. in broadcast() a = DataArray(self.lazy_array.variable) self.assertLazyAndEqual(self.lazy_array, a) - diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index bac07d68d0f..ada342b42ec 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -667,14 +667,21 @@ def test_chunk(self): self.assertEqual(data.chunks, {}) reblocked = data.chunk() - for v in reblocked.variables.values(): - self.assertIsInstance(v.data, da.Array) - expected_chunks = dict((d, (s,)) for d, s in data.dims.items()) + for k, v in reblocked.variables.items(): + if k in reblocked.data_vars: + self.assertIsInstance(v.data, da.Array) + else: + self.assertIsInstance(v.data, np.ndarray) + + expected_chunks = {'time': (20,), 'dim1': (8,), + 'dim2': (9,), 'dim3': (10,)} self.assertEqual(reblocked.chunks, expected_chunks) reblocked = data.chunk({'time': 5, 'dim1': 5, 'dim2': 5, 'dim3': 5}) - expected_chunks = {'time': (5,) * 4, 'dim1': (5, 3), - 'dim2': (5, 4), 'dim3': (5, 5)} + # time is not a dim in any of the data_vars, so it + # doesn't get chunked + expected_chunks = {'time': (20,), 'dim1': (5, 3), + 'dim2': (5, 4), 'dim3': (5, 5)} self.assertEqual(reblocked.chunks, expected_chunks) reblocked = data.chunk(expected_chunks) From ac8e0cb72ede763694663b0a58e7ef968d6d8bbc Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Thu, 6 Oct 2016 15:37:59 +0100 Subject: [PATCH 08/11] Added performance warning to release notes --- doc/whats-new.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b62d727e960..1a0bbcaf81a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -28,6 +28,9 @@ Breaking changes - Pickling an xarray object based on the dask backend, or reading its :py:meth:`values` property, won't automatically convert the array from dask to numpy in the original object anymore. + If a dask object is used as a coord of a :py:class:`~xarray.DataArray` or + :py:class:`~xarray.Dataset`, its values won't be automatically cached, likely + causing performance degradation. By `Guido Imperiale `_. Deprecations @@ -57,8 +60,6 @@ Enhancements to convert a ``DataArray`` to a ``Dataset`` before saving as a netCDF file, and deals with names to ensure a perfect 'roundtrip' capability. By `Robin Wilson `_. - ->>>>>>> master - Multi-index levels are now accessible as "virtual" coordinate variables, e.g., ``ds['time']`` can pull out the ``'time'`` level of a multi-index (see :ref:`coordinates`). ``sel`` also accepts providing multi-index levels @@ -78,7 +79,6 @@ Enhancements :py:meth:`Variable.compute` as a non-mutating alternative to :py:meth:`~DataArray.load`. By `Guido Imperiale `_. - - Adds DataArray and Dataset methods :py:meth:`~xarray.DataArray.cumsum` and :py:meth:`~xarray.DataArray.cumprod`. 
By `Phillip J. Wolfram `_. From e7f600c698084d47b797bbb73d1615eed2e310c7 Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Tue, 11 Oct 2016 00:01:59 +0100 Subject: [PATCH 09/11] Fix bug that caused dask array to be computed and then discarded when pickling --- xarray/core/variable.py | 5 +++-- xarray/test/test_dask.py | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 3fe3cb3d81a..b447b0b0f6b 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -301,7 +301,7 @@ def load(self): Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or load data automatically. - """ + """ self._data = self._data_cast() return self @@ -320,7 +320,8 @@ def compute(self): def __getstate__(self): """Always cache data as an in-memory array before pickling (with the exception of dask backend)""" - self._data_cached() + if not isinstance(self._data, dask_array_type): + self._data_cached() # self.__dict__ is the default pickle object, we don't need to # implement our own __setstate__ method to make pickle work return self.__dict__ diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 9b1d51c1966..19e72e5884c 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -309,20 +309,39 @@ def test_dot(self): lazy = self.lazy_array.dot(self.lazy_array[0]) self.assertLazyAndAllClose(eager, lazy) + def test_variable_pickle(self): + # Test that pickling/unpickling does not convert the dask + # backend to numpy + a1 = Variable(['x'], build_dask_array()) + a1.compute() + self.assertFalse(a1._in_memory) + self.assertEquals(kernel_call_count, 1) + a2 = pickle.loads(pickle.dumps(a1)) + self.assertEquals(kernel_call_count, 1) + self.assertVariableIdentical(a1, a2) + self.assertFalse(a1._in_memory) + self.assertFalse(a2._in_memory) + def test_dataarray_pickle(self): # Test that pickling/unpickling does not convert the dask # backend to numpy - a1 = DataArray([1,2]).chunk() + a1 = DataArray(build_dask_array()) + a1.compute() self.assertFalse(a1._in_memory) + self.assertEquals(kernel_call_count, 1) a2 = pickle.loads(pickle.dumps(a1)) + self.assertEquals(kernel_call_count, 1) self.assertDataArrayIdentical(a1, a2) self.assertFalse(a1._in_memory) self.assertFalse(a2._in_memory) def test_dataset_pickle(self): - ds1 = Dataset({'a': DataArray([1,2])}).chunk() + ds1 = Dataset({'a': DataArray(build_dask_array())}) + ds1.compute() self.assertFalse(ds1['a']._in_memory) + self.assertEquals(kernel_call_count, 1) ds2 = pickle.loads(pickle.dumps(ds1)) + self.assertEquals(kernel_call_count, 1) self.assertDatasetIdentical(ds1, ds2) self.assertFalse(ds1['a']._in_memory) self.assertFalse(ds2['a']._in_memory) @@ -340,3 +359,20 @@ def test_from_dask_variable(self): # This is used e.g. in broadcast() a = DataArray(self.lazy_array.variable) self.assertLazyAndEqual(self.lazy_array, a) + + +kernel_call_count = 0 +def kernel(): + """Dask kernel to test pickling/unpickling. + Must be global to make it pickleable. 
+    """
+    global kernel_call_count
+    kernel_call_count += 1
+    return np.ones(1)
+
+def build_dask_array():
+    global kernel_call_count
+    kernel_call_count = 0
+    return dask.array.Array(
+        dask={('foo', 0): (kernel, )}, name='foo',
+        chunks=((1,),), dtype=int)

From 27b091604dbb3be89f3deb3e3fe7ec057dd46986 Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Sat, 12 Nov 2016 17:23:08 +0000
Subject: [PATCH 10/11] Eagerly cache IndexVariables only

Eagerly cache only IndexVariables (i.e. coords that are in dims).
Coords that are not in dims are chunked and not cached.
---
 doc/whats-new.rst            |  4 ++--
 xarray/core/dataset.py       | 19 ++++++-------------
 xarray/core/variable.py      |  5 +++++
 xarray/test/test_backends.py | 17 +++++++++++------
 xarray/test/test_dask.py     | 13 ++++++++-----
 xarray/test/test_dataset.py  | 12 +++++-------
 6 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 7bb3ba0d7c0..23ac6445c98 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -29,8 +29,8 @@ Breaking changes
   :py:meth:`values` property, won't automatically convert the array from dask
   to numpy in the original object anymore.
   If a dask object is used as a coord of a :py:class:`~xarray.DataArray` or
-  :py:class:`~xarray.Dataset`, its values won't be automatically cached, likely
-  causing performance degradation.
+  :py:class:`~xarray.Dataset`, its values will still be automatically cached,
+  but only if it's used to index a dim (e.g. it's used for alignment).
   By `Guido Imperiale `_.
 
 Deprecations
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index f3aef2f2286..3b1d19606d4 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -829,19 +829,12 @@ def chunks(self):
         array.
         """
         chunks = {}
-        for v in self.data_vars.values():
+        for v in self.variables.values():
             if v.chunks is not None:
-                new_chunks = list(zip(v.dims, v.chunks))
-                if any(chunk != chunks[d] for d, chunk in new_chunks
-                       if d in chunks):
-                    raise ValueError('inconsistent chunks')
-                chunks.update(new_chunks)
-        if chunks:
-            # Add dims that are defined in the coords but are not in data_vars
-            for v in self.coords.values():
-                for dim in v.dims:
-                    if dim not in chunks:
-                        chunks[dim] = (v.size,)
+                for dim, c in zip(v.dims, v.chunks):
+                    if dim in chunks and c != chunks[dim]:
+                        raise ValueError('inconsistent chunks')
+                    chunks[dim] = c
         return Frozen(SortedKeysDict(chunks))
 
     def chunk(self, chunks=None, name_prefix='xarray-', token=None,
@@ -894,7 +887,7 @@ def selkeys(dict_, keys):
             return dict((d, dict_[d]) for d in keys if d in dict_)
 
         def maybe_chunk(name, var, chunks):
-            if name not in self.data_vars:
+            if name in self.dims:
                 return var
 
             chunks = selkeys(chunks, var.dims)
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index b3ebcab0afe..7e8b13c08d3 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -1100,6 +1100,11 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
             raise ValueError('%s objects must be 1-dimensional' %
                              type(self).__name__)
 
+    def _data_cached(self):
+        # Unlike in Variable._data_cached, always eagerly resolve dask arrays
+        self._data = self._data_cast()
+        return self._data
+
     def _data_cast(self):
         if isinstance(self._data, PandasIndexAdapter):
             return self._data
diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py
index 01cc0269f49..524fe50d7f9 100644
--- a/xarray/test/test_backends.py
+++ b/xarray/test/test_backends.py
@@ -128,8 +128,10 @@ def assert_loads(vars=None):
             if vars is None:
                 vars = expected
             with self.roundtrip(expected) as actual:
-                for v in actual.data_vars.values():
-                    self.assertFalse(v._in_memory)
+                for k, v in actual.variables.items():
+                    # IndexVariables are eagerly cached into memory
+                    if k not in actual.dims:
+                        self.assertFalse(v._in_memory)
                 yield actual
             for k, v in actual.variables.items():
                 if k in vars:
@@ -157,13 +159,16 @@ def test_dataset_compute(self):
 
         with self.roundtrip(expected) as actual:
             # Test Dataset.compute()
-            for v in actual.data_vars.values():
-                self.assertFalse(v._in_memory)
+            for k, v in actual.variables.items():
+                # IndexVariables are eagerly cached
+                if k not in actual.dims:
+                    self.assertFalse(v._in_memory)
 
             computed = actual.compute()
 
-            for v in actual.data_vars.values():
-                self.assertFalse(v._in_memory)
+            for k, v in actual.variables.items():
+                if k not in actual.dims:
+                    self.assertFalse(v._in_memory)
             for v in computed.variables.values():
                 self.assertTrue(v._in_memory)
diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py
index 9485347802c..69a434a4f72 100644
--- a/xarray/test/test_dask.py
+++ b/xarray/test/test_dask.py
@@ -24,14 +24,17 @@ def assertLazyAnd(self, expected, actual, test):
             test(actual, expected)
         if isinstance(actual, Dataset):
             for k, v in actual.variables.items():
-                if k in actual.data_vars:
-                    self.assertIsInstance(v.data, da.Array)
-                else:
+                if k in actual.dims:
                     self.assertIsInstance(v.data, np.ndarray)
+                else:
+                    self.assertIsInstance(v.data, da.Array)
         elif isinstance(actual, DataArray):
             self.assertIsInstance(actual.data, da.Array)
-            for coord in actual.coords.values():
-                self.assertIsInstance(coord.data, np.ndarray)
+            for k, v in actual.coords.items():
+                if k in actual.dims:
+                    self.assertIsInstance(v.data, np.ndarray)
+                else:
+                    self.assertIsInstance(v.data, da.Array)
         elif isinstance(actual, Variable):
             self.assertIsInstance(actual.data, da.Array)
         else:
diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py
index 254059cb128..57a18f0c5cc 100644
--- a/xarray/test/test_dataset.py
+++ b/xarray/test/test_dataset.py
@@ -673,20 +673,18 @@ def test_chunk(self):
 
         reblocked = data.chunk()
         for k, v in reblocked.variables.items():
-            if k in reblocked.data_vars:
-                self.assertIsInstance(v.data, da.Array)
-            else:
+            if k in reblocked.dims:
                 self.assertIsInstance(v.data, np.ndarray)
+            else:
+                self.assertIsInstance(v.data, da.Array)
 
-        expected_chunks = {'time': (20,), 'dim1': (8,),
-                           'dim2': (9,), 'dim3': (10,)}
+        expected_chunks = {'dim1': (8,), 'dim2': (9,), 'dim3': (10,)}
         self.assertEqual(reblocked.chunks, expected_chunks)
 
         reblocked = data.chunk({'time': 5, 'dim1': 5, 'dim2': 5, 'dim3': 5})
         # time is not a dim in any of the data_vars, so it
         # doesn't get chunked
-        expected_chunks = {'time': (20,), 'dim1': (5, 3),
-                           'dim2': (5, 4), 'dim3': (5, 5)}
+        expected_chunks = {'dim1': (5, 3), 'dim2': (5, 4), 'dim3': (5, 5)}
         self.assertEqual(reblocked.chunks, expected_chunks)
 
         reblocked = data.chunk(expected_chunks)

From 376200a48e642b1a8097085eee3f47a1792b3787 Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Sun, 13 Nov 2016 18:19:55 +0000
Subject: [PATCH 11/11] Load IndexVariable.data into memory in init

Make IndexVariables eagerly load their data into memory (from disk or
dask) as soon as they're created.
---
 doc/whats-new.rst            |  2 +-
 xarray/core/dataset.py       |  3 ---
 xarray/core/variable.py      | 21 ++++++++++++---------
 xarray/test/test_backends.py | 14 ++++++++++----
 xarray/test/test_dataset.py  | 21 +++++++++++++++++----
 xarray/test/test_variable.py |  6 ++----
 6 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 23ac6445c98..31048f333ab 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -29,7 +29,7 @@ Breaking changes
   :py:meth:`values` property, won't automatically convert the array from dask
   to numpy in the original object anymore.
   If a dask object is used as a coord of a :py:class:`~xarray.DataArray` or
-  :py:class:`~xarray.Dataset`, its values will still be automatically cached,
+  :py:class:`~xarray.Dataset`, its values are eagerly computed and cached,
   but only if it's used to index a dim (e.g. it's used for alignment).
   By `Guido Imperiale `_.
 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 3b1d19606d4..282409cf4ca 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -887,9 +887,6 @@ def selkeys(dict_, keys):
             return dict((d, dict_[d]) for d in keys if d in dict_)
 
         def maybe_chunk(name, var, chunks):
-            if name in self.dims:
-                return var
-
             chunks = selkeys(chunks, var.dims)
             if not chunks:
                 chunks = None
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 7e8b13c08d3..1b6f5b55dda 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -1100,16 +1100,19 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
             raise ValueError('%s objects must be 1-dimensional' %
                              type(self).__name__)
 
-    def _data_cached(self):
-        # Unlike in Variable._data_cached, always eagerly resolve dask arrays
-        self._data = self._data_cast()
-        return self._data
+        # Unlike in Variable, always eagerly load values into memory
+        if not isinstance(self._data, PandasIndexAdapter):
+            self._data = PandasIndexAdapter(self._data)
 
-    def _data_cast(self):
-        if isinstance(self._data, PandasIndexAdapter):
-            return self._data
-        else:
-            return PandasIndexAdapter(self._data)
+    @Variable.data.setter
+    def data(self, data):
+        Variable.data.fset(self, data)
+        if not isinstance(self._data, PandasIndexAdapter):
+            self._data = PandasIndexAdapter(self._data)
+
+    def chunk(self, chunks=None, name=None, lock=False):
+        # Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk()
+        return self.copy(deep=False)
 
     def __getitem__(self, key):
         key = self._item_key_to_tuple(key)
diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py
index 524fe50d7f9..65e0f3d51ac 100644
--- a/xarray/test/test_backends.py
+++ b/xarray/test/test_backends.py
@@ -129,8 +129,10 @@ def assert_loads(vars=None):
                 vars = expected
             with self.roundtrip(expected) as actual:
                 for k, v in actual.variables.items():
-                    # IndexVariables are eagerly cached into memory
-                    if k not in actual.dims:
+                    # IndexVariables are eagerly loaded into memory
+                    if k in actual.dims:
+                        self.assertTrue(v._in_memory)
+                    else:
                         self.assertFalse(v._in_memory)
                 yield actual
             for k, v in actual.variables.items():
@@ -161,13 +163,17 @@ def test_dataset_compute(self):
             # Test Dataset.compute()
             for k, v in actual.variables.items():
                 # IndexVariables are eagerly cached
-                if k not in actual.dims:
+                if k in actual.dims:
+                    self.assertTrue(v._in_memory)
+                else:
                     self.assertFalse(v._in_memory)
 
             computed = actual.compute()
 
             for k, v in actual.variables.items():
-                if k not in actual.dims:
+                if k in actual.dims:
+                    self.assertTrue(v._in_memory)
+                else:
                     self.assertFalse(v._in_memory)
             for v in computed.variables.values():
                 self.assertTrue(v._in_memory)
diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py
index 57a18f0c5cc..67af48e897c 100644
--- a/xarray/test/test_dataset.py
+++ b/xarray/test/test_dataset.py
@@ -56,11 +56,24 @@ def create_test_multiindex():
 
 
 class InaccessibleVariableDataStore(backends.InMemoryDataStore):
+    def __init__(self, writer=None):
+        super(InaccessibleVariableDataStore, self).__init__(writer)
+        self._indexvars = set()
+
+    def store(self, variables, attributes, check_encoding_set=frozenset()):
+        super(InaccessibleVariableDataStore, self).store(
+            variables, attributes, check_encoding_set)
+        for k, v in variables.items():
+            if isinstance(v, IndexVariable):
+                self._indexvars.add(k)
+
     def get_variables(self):
-        def lazy_inaccessible(x):
-            data = indexing.LazilyIndexedArray(InaccessibleArray(x.values))
-            return Variable(x.dims, data, x.attrs)
-        return dict((k, lazy_inaccessible(v)) for
+        def lazy_inaccessible(k, v):
+            if k in self._indexvars:
+                return v
+            data = indexing.LazilyIndexedArray(InaccessibleArray(v.values))
+            return Variable(v.dims, data, v.attrs)
+        return dict((k, lazy_inaccessible(k, v)) for
                     k, v in iteritems(self._variables))
diff --git a/xarray/test/test_variable.py b/xarray/test/test_variable.py
index 8693ed39f65..e79d30eeb6e 100644
--- a/xarray/test/test_variable.py
+++ b/xarray/test/test_variable.py
@@ -1052,13 +1052,11 @@ def test_multiindex_default_level_names(self):
 
     def test_data(self):
         x = IndexVariable('x', np.arange(3.0))
-        # data should be initially saved as an ndarray
-        self.assertIs(type(x._data), np.ndarray)
+        self.assertIsInstance(x._data, PandasIndexAdapter)
+        self.assertIsInstance(x.data, np.ndarray)
         self.assertEqual(float, x.dtype)
         self.assertArrayEqual(np.arange(3), x)
         self.assertEqual(float, x.values.dtype)
-        # after inspecting x.values, the IndexVariable value will be saved as an Index
-        self.assertIsInstance(x._data, PandasIndexAdapter)
        with self.assertRaisesRegexp(TypeError, 'cannot be modified'):
             x[:] = 0
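
Usage sketch (illustration only, not part of the patches above). The snippet
below shows the semantics the series converges on. It is a minimal sketch,
assuming an xarray build that includes these patches and a working dask
install; the private `_in_memory` checks used in the tests are replaced here
by plain type inspection, and the names `ds`, `loaded` and `ds2` are
illustrative:

    import pickle

    import numpy as np
    import xarray as xr

    # chunk() turns the numpy payload of each data variable into a lazy
    # dask array; index coordinates stay eagerly loaded (patches 10-11).
    ds = xr.Dataset({'a': ('x', np.arange(4))}).chunk({'x': 2})

    # compute() (patches 01-03) returns a new, fully loaded dataset and
    # leaves the original untouched; load() instead works in place.
    loaded = ds.compute()
    print(type(loaded['a'].data))   # numpy.ndarray
    print(type(ds['a'].data))       # dask.array.Array - unchanged

    # Pickling now round-trips the dask graph instead of computing it
    # (patches 01 and 09): the unpickled dataset is still lazy.
    ds2 = pickle.loads(pickle.dumps(ds))
    print(type(ds2['a'].data))      # dask.array.Array

    # Reading .values computes the data for the caller but no longer
    # caches the result back onto the original object.
    print(ds['a'].values)           # [0 1 2 3]
    print(type(ds['a'].data))       # still dask.array.Array

The split between a mutating load() and a non-mutating compute() mirrors the
convention used by dask itself, where computing returns new results rather
than modifying collections in place.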