From f3d74e82657ed4d4e6cf1d2aa5fdb93d0370e9e1 Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Sun, 25 Sep 2016 19:09:20 +0100 Subject: [PATCH 01/11] Disabled auto-caching dask arrays when pickling and when invoking the .values property. Added new method .compute(). --- doc/whats-new.rst | 15 ++++--- xarray/core/dataarray.py | 13 ++++++ xarray/core/dataset.py | 43 ++++++++++++++++-- xarray/core/variable.py | 51 +++++++++++++++++---- xarray/test/test_backends.py | 86 ++++++++++++++++++++++++++++++------ 5 files changed, 178 insertions(+), 30 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9d300741955..583e3062665 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,10 @@ Breaking changes merges will now succeed in cases that previously raised ``xarray.MergeError``. Set ``compat='broadcast_equals'`` to restore the previous default. +- Pickling an xarray object based on the dask backend, or reading its + :py:meth:`values` property, won't automatically convert the backend to numpy + in the original object anymore. + By `Guido Imperiale `_. Deprecations ~~~~~~~~~~~~ @@ -45,33 +49,32 @@ Deprecations Enhancements ~~~~~~~~~~~~ - Add checking of ``attr`` names and values when saving to netCDF, raising useful -error messages if they are invalid. (:issue:`911`). -By `Robin Wilson `_. - + error messages if they are invalid. (:issue:`911`). + By `Robin Wilson `_. - Added ability to save ``DataArray`` objects directly to netCDF files using :py:meth:`~xarray.DataArray.to_netcdf`, and to load directly from netCDF files using :py:func:`~xarray.open_dataarray` (:issue:`915`). These remove the need to convert a ``DataArray`` to a ``Dataset`` before saving as a netCDF file, and deals with names to ensure a perfect 'roundtrip' capability. By `Robin Wilson `_. - - Added the ``compat`` option ``'no_conflicts'`` to ``merge``, allowing the combination of xarray objects with disjoint (:issue:`742`) or overlapping (:issue:`835`) coordinates as long as all present data agrees. By `Johnnie Gray `_. See :ref:`combining.no_conflicts` for more details. - - It is now possible to set ``concat_dim=None`` explicitly in :py:func:`~xarray.open_mfdataset` to disable inferring a dimension along which to concatenate. By `Stephan Hoyer `_. +- Added :py:meth:`compute` method to :py:class:`DataArray`, :py:class:`Dataset`, + and :py:class:`Variable` as a non-destructive alternative to :py:meth:`load`. + By `Guido Imperiale `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 44d5865ad37..79fdd7c1100 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -561,6 +561,19 @@ def load(self): self._coords = new._coords return self + def compute(self): + """Manually trigger loading of this array's data from disk or a + remote source into memory and return a new array. The original is + left unaltered. + + Normally, it should not be necessary to call this method in user code, + because all xarray functions should either work on deferred data or + load data automatically. However, this method can be necessary when + working with many file objects on disk. + """ + new = self.copy(deep=False) + return new.load() + def copy(self, deep=True): """Returns a copy of this array. 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 45650d21501..475fcdab231 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -255,8 +255,11 @@ def load_store(cls, store, decoder=None):
         return obj
 
     def __getstate__(self):
-        """Always load data in-memory before pickling"""
-        self.load()
+        """Load data in-memory before pickling (except for Dask data)"""
+        for v in self.variables.values():
+            if not isinstance(v.data, dask_array_type):
+                v.load()
+
         # self.__dict__ is the default pickle object, we don't need to
         # implement our own __setstate__ method to make pickle work
         state = self.__dict__.copy()
@@ -306,7 +309,7 @@ def load(self):
         """
         # access .data to coerce everything to numpy or dask arrays
         all_data = dict((k, v.data) for k, v in self.variables.items())
-        lazy_data = dict((k, v) for k, v in all_data.items()
+        lazy_data = OrderedDict((k, v) for k, v in all_data.items()
                          if isinstance(v, dask_array_type))
         if lazy_data:
             import dask.array as da
@@ -319,6 +322,40 @@
 
         return self
 
+    def compute(self):
+        """Manually trigger loading of this dataset's data from disk or a
+        remote source into memory and return a new dataset. The original is
+        left unaltered.
+
+        Normally, it should not be necessary to call this method in user code,
+        because all xarray functions should either work on deferred data or
+        load data automatically. However, this method can be necessary when
+        working with many file objects on disk.
+        """
+        # Can't just do the below, because new.variables[k] is the
+        # same object as self.variables[k]!
+        # new = self.copy(deep=False)
+        # return new.load()
+
+        variables = OrderedDict((k, v.copy(deep=False))
+                                for (k, v) in self.variables.items())
+        lazy_data = OrderedDict((k, v.data) for k, v in variables.items()
+                                if isinstance(v.data, dask_array_type))
+
+        if lazy_data:
+            import dask.array as da
+
+            # evaluate all the dask arrays simultaneously
+            evaluated_data = da.compute(*lazy_data.values())
+
+            for k, data in zip(lazy_data, evaluated_data):
+                variables[k] = variables[k].copy(deep=False)
+                variables[k].data = data
+
+        # skip __init__ to avoid costly validation
+        return self._construct_direct(variables, self._coord_names.copy(),
+                                      self._dims.copy(), self._attrs_copy())
+
     @classmethod
     def _construct_direct(cls, variables, coord_names, dims=None, attrs=None,
                           file_obj=None):
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index ed8d15f03b5..2fa05c07404 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -272,9 +272,19 @@ def data(self, data):
         self._data = data
 
     def _data_cached(self):
-        if not isinstance(self._data, (np.ndarray, PandasIndexAdapter)):
-            self._data = np.asarray(self._data)
-        return self._data
+        """Load data into memory and return it.
+        Do not cache dask arrays automatically; that should
+        require an explicit load() call.
+        """
+        if isinstance(self._data, (np.ndarray, PandasIndexAdapter)):
+            new_data = self._data
+        else:
+            new_data = np.asarray(self._data)
+
+        if not isinstance(self._data, dask_array_type):
+            self._data = new_data
+
+        return new_data
 
     @property
     def _indexable_data(self):
@@ -288,11 +298,26 @@ def load(self):
         because all xarray functions should either work on deferred data or
         load data automatically.
         """
-        self._data_cached()
+        new_data = self._data_cached()
+        if isinstance(self._data, dask_array_type):
+            self._data = new_data
         return self
 
+    def compute(self):
+        """Manually trigger loading of this variable's data from disk or a
+        remote source into memory and return a new variable. The original is
+        left unaltered.
+
+        Normally, it should not be necessary to call this method in user code,
+        because all xarray functions should either work on deferred data or
+        load data automatically.
+        """
+        new = self.copy(deep=False)
+        return new.load()
+
     def __getstate__(self):
-        """Always cache data as an in-memory array before pickling"""
+        """Always cache data as an in-memory array before pickling
+        (with the exception of dask backend)"""
         self._data_cached()
         # self.__dict__ is the default pickle object, we don't need to
         # implement our own __setstate__ method to make pickle work
@@ -1091,9 +1116,19 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
                              type(self).__name__)
 
     def _data_cached(self):
-        if not isinstance(self._data, PandasIndexAdapter):
-            self._data = PandasIndexAdapter(self._data)
-        return self._data
+        """Load data into memory and return it.
+        Do not cache dask arrays automatically; that should
+        require an explicit load() call.
+        """
+        if isinstance(self._data, PandasIndexAdapter):
+            new_data = self._data
+        else:
+            new_data = PandasIndexAdapter(self._data)
+
+        if not isinstance(self._data, dask_array_type):
+            self._data = new_data
+
+        return new_data
 
     def __getitem__(self, key):
         key = self._item_key_to_tuple(key)
diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py
index eeb5561579b..8cae1b9b98c 100644
--- a/xarray/test/test_backends.py
+++ b/xarray/test/test_backends.py
@@ -149,6 +149,27 @@ def assert_loads(vars=None):
         actual = ds.load()
         self.assertDatasetAllClose(expected, actual)
 
+    def test_dataset_compute(self):
+        expected = create_test_data()
+
+        with self.roundtrip(expected) as actual:
+            # Test Dataset.compute()
+            for v in actual.variables.values():
+                self.assertFalse(v._in_memory)
+
+            computed = actual.compute()
+
+            # indexes are going to be loaded in memory anyway, even
+            # with compute(). This is by design.
+ for v in actual.data_vars.values(): + self.assertFalse(v._in_memory) + + for v in computed.variables.values(): + self.assertTrue(v._in_memory) + + self.assertDatasetAllClose(expected, actual) + self.assertDatasetAllClose(expected, computed) + def test_roundtrip_None_variable(self): expected = Dataset({None: (('x', 'y'), [[0, 1], [2, 3]])}) with self.roundtrip(expected) as actual: @@ -230,18 +251,6 @@ def test_roundtrip_coordinates(self): with self.roundtrip(expected) as actual: self.assertDatasetIdentical(expected, actual) - expected = original.copy() - expected.attrs['coordinates'] = 'something random' - with self.assertRaisesRegexp(ValueError, 'cannot serialize'): - with self.roundtrip(expected): - pass - - expected = original.copy(deep=True) - expected['foo'].attrs['coordinates'] = 'something random' - with self.assertRaisesRegexp(ValueError, 'cannot serialize'): - with self.roundtrip(expected): - pass - def test_roundtrip_boolean_dtype(self): original = create_boolean_data() self.assertEqual(original['x'].dtype, 'bool') @@ -872,7 +881,26 @@ def test_read_byte_attrs_as_unicode(self): @requires_dask @requires_scipy @requires_netCDF4 -class DaskTest(TestCase): +class DaskTest(TestCase, DatasetIOTestCases): + @contextlib.contextmanager + def create_store(self): + yield Dataset() + + @contextlib.contextmanager + def roundtrip(self, data, save_kwargs={}, open_kwargs={}): + yield data.chunk() + + def test_roundtrip_datetime_data(self): + # Override method in DatasetIOTestCases - remove not applicable save_kwds + times = pd.to_datetime(['2000-01-01', '2000-01-02', 'NaT']) + expected = Dataset({'t': ('t', times), 't0': times[0]}) + with self.roundtrip(expected) as actual: + self.assertDatasetIdentical(expected, actual) + + def test_write_store(self): + # Override method in DatasetIOTestCases - not applicable to dask + pass + def test_open_mfdataset(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp1: @@ -992,6 +1020,38 @@ def test_deterministic_names(self): self.assertIn(tmp, dask_name) self.assertEqual(original_names, repeat_names) + def test_dataarray_compute(self): + actual = DataArray([1,2]).chunk() + computed = actual.compute() + self.assertFalse(actual._in_memory) + self.assertTrue(computed._in_memory) + self.assertDataArrayAllClose(actual, computed) + + def test_pickle(self): + # Test that pickling/unpickling does not convert the dask + # backend to numpy + a1 = DataArray([1,2]).chunk() + self.assertFalse(a1._in_memory) + a2 = pickle.loads(pickle.dumps(a1)) + self.assertDataArrayIdentical(a1, a2) + self.assertFalse(a1._in_memory) + self.assertFalse(a2._in_memory) + + ds1 = Dataset({'a': [1,2]}).chunk() + self.assertFalse(ds1['a']._in_memory) + ds2 = pickle.loads(pickle.dumps(ds1)) + self.assertDatasetIdentical(ds1, ds2) + self.assertFalse(ds1['a']._in_memory) + self.assertFalse(ds2['a']._in_memory) + + def test_values(self): + # Test that invoking the values property does not convert the dask + # backend to numpy + a = DataArray([1,2]).chunk() + self.assertFalse(a._in_memory) + self.assertEquals(a.values.tolist(), [1, 2]) + self.assertFalse(a._in_memory) + @requires_scipy_or_netCDF4 @requires_pydap From 26a69971ef04afb04e64d61e3f2bd6bcc82c79be Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Fri, 30 Sep 2016 12:33:29 +0100 Subject: [PATCH 02/11] Minor tweaks --- doc/whats-new.rst | 9 +++++---- xarray/core/variable.py | 28 ++++++++++------------------ xarray/test/test_backends.py | 34 ++++------------------------------ 
xarray/test/test_dask.py | 27 +++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 52 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 583e3062665..c7d3a6d0878 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -26,8 +26,8 @@ Breaking changes ``xarray.MergeError``. Set ``compat='broadcast_equals'`` to restore the previous default. - Pickling an xarray object based on the dask backend, or reading its - :py:meth:`values` property, won't automatically convert the backend to numpy - in the original object anymore. + :py:meth:`values` property, won't automatically convert the array from dask + to numpy in the original object anymore. By `Guido Imperiale `_. Deprecations @@ -72,8 +72,9 @@ Enhancements :py:func:`~xarray.open_mfdataset` to disable inferring a dimension along which to concatenate. By `Stephan Hoyer `_. -- Added :py:meth:`compute` method to :py:class:`DataArray`, :py:class:`Dataset`, - and :py:class:`Variable` as a non-destructive alternative to :py:meth:`load`. +- Added methods :py:meth:`DataArray.compute`, :py:meth:`Dataset.compute`, and + :py:meth:`Variable.compute` as a non-mutating alternative to + :py:meth:`~DataArray.load`. By `Guido Imperiale `_. Bug fixes diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 2fa05c07404..1b0eb3f0dee 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -271,19 +271,20 @@ def data(self, data): "replacement data must match the Variable's shape") self._data = data + def _data_cast(self): + if isinstance(self._data, (np.ndarray, PandasIndexAdapter)): + return self._data + else: + return np.asarray(self._data) + def _data_cached(self): """Load data into memory and return it. Do not cache dask arrays automatically; that should require an explicit load() call. """ - if isinstance(self._data, (np.ndarray, PandasIndexAdapter)): - new_data = self._data - else: - new_data = np.asarray(self._data) - + new_data = self._data_cast() if not isinstance(self._data, dask_array_type): self._data = new_data - return new_data @property @@ -1115,20 +1116,11 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): raise ValueError('%s objects must be 1-dimensional' % type(self).__name__) - def _data_cached(self): - """Load data into memory and return it. - Do not cache dask arrays automatically; that should - require an explicit load() call. - """ + def _data_cast(self): if isinstance(self._data, PandasIndexAdapter): - new_data = self._data + return self._data else: - new_data = PandasIndexAdapter(self._data) - - if not isinstance(self._data, dask_array_type): - self._data = new_data - - return new_data + return PandasIndexAdapter(self._data) def __getitem__(self, key): key = self._item_key_to_tuple(key) diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index 8cae1b9b98c..0ff5cffec26 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -159,11 +159,8 @@ def test_dataset_compute(self): computed = actual.compute() - # indexes are going to be loaded in memory anyway, even - # with compute(). This is by design. for v in actual.data_vars.values(): self.assertFalse(v._in_memory) - for v in computed.variables.values(): self.assertTrue(v._in_memory) @@ -1021,38 +1018,15 @@ def test_deterministic_names(self): self.assertEqual(original_names, repeat_names) def test_dataarray_compute(self): + # Test DataArray.compute() on dask backend. 
+ # The test for Dataset.compute() is already in DatasetIOTestCases; + # however dask is the only tested backend which supports DataArrays actual = DataArray([1,2]).chunk() computed = actual.compute() self.assertFalse(actual._in_memory) self.assertTrue(computed._in_memory) self.assertDataArrayAllClose(actual, computed) - - def test_pickle(self): - # Test that pickling/unpickling does not convert the dask - # backend to numpy - a1 = DataArray([1,2]).chunk() - self.assertFalse(a1._in_memory) - a2 = pickle.loads(pickle.dumps(a1)) - self.assertDataArrayIdentical(a1, a2) - self.assertFalse(a1._in_memory) - self.assertFalse(a2._in_memory) - - ds1 = Dataset({'a': [1,2]}).chunk() - self.assertFalse(ds1['a']._in_memory) - ds2 = pickle.loads(pickle.dumps(ds1)) - self.assertDatasetIdentical(ds1, ds2) - self.assertFalse(ds1['a']._in_memory) - self.assertFalse(ds2['a']._in_memory) - - def test_values(self): - # Test that invoking the values property does not convert the dask - # backend to numpy - a = DataArray([1,2]).chunk() - self.assertFalse(a._in_memory) - self.assertEquals(a.values.tolist(), [1, 2]) - self.assertFalse(a._in_memory) - - + @requires_scipy_or_netCDF4 @requires_pydap class PydapTest(TestCase): diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 13a8817ce68..7af8aed4975 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -1,3 +1,4 @@ +import pickle import numpy as np import pandas as pd @@ -321,3 +322,29 @@ def test_dot(self): eager = self.eager_array.dot(self.eager_array[0]) lazy = self.lazy_array.dot(self.lazy_array[0]) self.assertLazyAndAllClose(eager, lazy) + + def test_dataarray_pickle(self): + # Test that pickling/unpickling does not convert the dask + # backend to numpy + a1 = DataArray([1,2]).chunk() + self.assertFalse(a1._in_memory) + a2 = pickle.loads(pickle.dumps(a1)) + self.assertDataArrayIdentical(a1, a2) + self.assertFalse(a1._in_memory) + self.assertFalse(a2._in_memory) + + def test_dataset_pickle(self): + ds1 = Dataset({'a': [1,2]}).chunk() + self.assertFalse(ds1['a']._in_memory) + ds2 = pickle.loads(pickle.dumps(ds1)) + self.assertDatasetIdentical(ds1, ds2) + self.assertFalse(ds1['a']._in_memory) + self.assertFalse(ds2['a']._in_memory) + + def test_values(self): + # Test that invoking the values property does not convert the dask + # backend to numpy + a = DataArray([1,2]).chunk() + self.assertFalse(a._in_memory) + self.assertEquals(a.values.tolist(), [1, 2]) + self.assertFalse(a._in_memory) From 03fbdd1a424991b1f2d6f6e2d304a7e751a03864 Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Fri, 30 Sep 2016 13:18:49 +0100 Subject: [PATCH 03/11] Simplified Dataset.copy() and Dataset.compute() --- xarray/core/dataset.py | 32 ++++---------------------------- xarray/test/test_dataset.py | 7 +++++-- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 475fcdab231..d060e845ceb 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -332,29 +332,8 @@ def compute(self): load data automatically. However, this method can be necessary when working with many file objects on disk. """ - # Can't just do the below, because new.variables[k] is the - # same object as self.variables[k]! 
-        # new = self.copy(deep=False)
-        # return new.load()
-
-        variables = OrderedDict((k, v.copy(deep=False))
-                                for (k, v) in self.variables.items())
-        lazy_data = OrderedDict((k, v.data) for k, v in variables.items()
-                                if isinstance(v.data, dask_array_type))
-
-        if lazy_data:
-            import dask.array as da
-
-            # evaluate all the dask arrays simultaneously
-            evaluated_data = da.compute(*lazy_data.values())
-
-            for k, data in zip(lazy_data, evaluated_data):
-                variables[k] = variables[k].copy(deep=False)
-                variables[k].data = data
-
-        # skip __init__ to avoid costly validation
-        return self._construct_direct(variables, self._coord_names.copy(),
-                                      self._dims.copy(), self._attrs_copy())
+        new = self.copy(deep=False)
+        return new.load()
 
     @classmethod
     def _construct_direct(cls, variables, coord_names, dims=None, attrs=None,
@@ -441,11 +420,8 @@ def copy(self, deep=False):
         Otherwise, a shallow copy is made, so each variable in the new dataset
         is also a variable in the original dataset.
         """
-        if deep:
-            variables = OrderedDict((k, v.copy(deep=True))
-                                    for k, v in iteritems(self._variables))
-        else:
-            variables = self._variables.copy()
+        variables = OrderedDict((k, v.copy(deep=deep))
+                                for k, v in iteritems(self._variables))
         # skip __init__ to avoid costly validation
         return self._construct_direct(variables, self._coord_names.copy(),
                                       self._dims.copy(), self._attrs_copy())
diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py
index b9798dbd611..bac07d68d0f 100644
--- a/xarray/test/test_dataset.py
+++ b/xarray/test/test_dataset.py
@@ -1287,10 +1287,13 @@ def test_copy(self):
 
         for copied in [data.copy(deep=False), copy(data)]:
             self.assertDatasetIdentical(data, copied)
-            for k in data:
+            # Note: IndexVariable objects with string dtype are always
+            # copied because of xarray.core.utils.safe_cast_to_index.
+            # Limiting the test to data variables.
+            for k in data.data_vars:
                 v0 = data.variables[k]
                 v1 = copied.variables[k]
-                self.assertIs(v0, v1)
+                assert source_ndarray(v0.data) is source_ndarray(v1.data)
             copied['foo'] = ('z', np.arange(5))
             self.assertNotIn('foo', data)

From b04167ce84f18724482435591e4eaa619f6ced9a Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Fri, 30 Sep 2016 18:24:43 +0100
Subject: [PATCH 04/11] Minor cleanup

---
 xarray/core/dataset.py   |  7 ++++---
 xarray/core/variable.py  |  6 ++----
 xarray/test/test_dask.py | 40 +++++++++------------------------------
 3 files changed, 15 insertions(+), 38 deletions(-)

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index d060e845ceb..30eaf167f6e 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -309,7 +309,7 @@ def load(self):
         """
         # access .data to coerce everything to numpy or dask arrays
         all_data = dict((k, v.data) for k, v in self.variables.items())
-        lazy_data = OrderedDict((k, v) for k, v in all_data.items()
+        lazy_data = dict((k, v) for k, v in all_data.items()
                          if isinstance(v, dask_array_type))
         if lazy_data:
             import dask.array as da
@@ -417,8 +417,9 @@ def copy(self, deep=False):
         """Returns a copy of this dataset.
 
         If `deep=True`, a deep copy is made of each of the component variables.
-        Otherwise, a shallow copy is made, so each variable in the new dataset
-        is also a variable in the original dataset.
+        Otherwise, a shallow copy of each of the component variables is made,
+        so that each variable in the new dataset shares underlying memory with
+        the original dataset.
""" variables = OrderedDict((k, v.copy(deep=deep)) for k, v in iteritems(self._variables)) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 1b0eb3f0dee..9c752365281 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -298,10 +298,8 @@ def load(self): Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or load data automatically. - """ - new_data = self._data_cached() - if isinstance(self._data, dask_array_type): - self._data = new_data + """ + self._data = self._data_cast() return self def compute(self): diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 7af8aed4975..9db710427d2 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -13,44 +13,22 @@ import dask.array as da -def _copy_at_variable_level(arg): - """We need to copy the argument at the level of xarray.Variable objects, so - that viewing its values does not trigger lazy loading. - """ - if isinstance(arg, Variable): - return arg.copy(deep=False) - elif isinstance(arg, DataArray): - ds = arg.to_dataset(name='__copied__') - return _copy_at_variable_level(ds)['__copied__'] - elif isinstance(arg, Dataset): - ds = arg.copy() - for k in list(ds): - ds._variables[k] = ds._variables[k].copy(deep=False) - return ds - else: - assert False - - class DaskTestCase(TestCase): def assertLazyAnd(self, expected, actual, test): - expected_copy = _copy_at_variable_level(expected) - actual_copy = _copy_at_variable_level(actual) + expected.name = None + actual.name = None with dask.set_options(get=dask.get): - test(actual_copy, expected_copy) - var = getattr(actual, 'variable', actual) - self.assertIsInstance(var.data, da.Array) + test(actual, expected) + if isinstance(actual, Dataset): + for var in actual.variables.values(): + self.assertIsInstance(var.data, da.Array) + else: + var = getattr(actual, 'variable', actual) + self.assertIsInstance(var.data, da.Array) @requires_dask class TestVariable(DaskTestCase): - def assertLazyAnd(self, expected, actual, test): - expected_copy = expected.copy(deep=False) - actual_copy = actual.copy(deep=False) - with dask.set_options(get=dask.get): - test(actual_copy, expected_copy) - var = getattr(actual, 'variable', actual) - self.assertIsInstance(var.data, da.Array) - def assertLazyAndIdentical(self, expected, actual): self.assertLazyAnd(expected, actual, self.assertVariableIdentical) From ca94cc7c0023f38725b8ce842c20a83e7c68f51a Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Sat, 1 Oct 2016 11:45:51 +0100 Subject: [PATCH 05/11] Cleaned up dask test --- xarray/test/test_dask.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 9db710427d2..b78907f5708 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -15,8 +15,6 @@ class DaskTestCase(TestCase): def assertLazyAnd(self, expected, actual, test): - expected.name = None - actual.name = None with dask.set_options(get=dask.get): test(actual, expected) if isinstance(actual, Dataset): @@ -178,6 +176,9 @@ def assertLazyAndIdentical(self, expected, actual): def assertLazyAndAllClose(self, expected, actual): self.assertLazyAnd(expected, actual, self.assertDataArrayAllClose) + def assertLazyAndEqual(self, expected, actual): + self.assertLazyAnd(expected, actual, self.assertDataArrayEqual) + def setUp(self): self.values = np.random.randn(4, 6) self.data = da.from_array(self.values, chunks=(2, 2)) @@ 
-245,7 +246,7 @@ def test_to_dataset_roundtrip(self): v = self.lazy_array expected = u.assign_coords(x=u['x']) - self.assertLazyAndIdentical(expected, v.to_dataset('x').to_array('x')) + self.assertLazyAndEqual(expected, v.to_dataset('x').to_array('x')) def test_merge(self): @@ -254,7 +255,7 @@ def duplicate_and_merge(array): expected = duplicate_and_merge(self.eager_array) actual = duplicate_and_merge(self.lazy_array) - self.assertLazyAndIdentical(expected, actual) + self.assertLazyAndEqual(expected, actual) def test_ufuncs(self): u = self.eager_array @@ -267,9 +268,9 @@ def test_where_dispatching(self): x = da.from_array(a, 5) y = da.from_array(b, 5) expected = DataArray(a).where(b) - self.assertLazyAndIdentical(expected, DataArray(a).where(y)) - self.assertLazyAndIdentical(expected, DataArray(x).where(b)) - self.assertLazyAndIdentical(expected, DataArray(x).where(y)) + self.assertLazyAndEqual(expected, DataArray(a).where(y)) + self.assertLazyAndEqual(expected, DataArray(x).where(b)) + self.assertLazyAndEqual(expected, DataArray(x).where(y)) def test_simultaneous_compute(self): ds = Dataset({'foo': ('x', range(5)), @@ -294,7 +295,7 @@ def test_stack(self): expected = DataArray(data.reshape(2, -1), {'w': [0, 1], 'z': z}, dims=['w', 'z']) assert stacked.data.chunks == expected.data.chunks - self.assertLazyAndIdentical(expected, stacked) + self.assertLazyAndEqual(expected, stacked) def test_dot(self): eager = self.eager_array.dot(self.eager_array[0]) From e46b61f788152e45416ab68e012ab9cc9c3f03ee Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Sat, 1 Oct 2016 12:22:57 +0100 Subject: [PATCH 06/11] Integrate no_dask_resolve with dask_broadcast branches --- xarray/test/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 7f93ab18a50..1326d936249 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -332,5 +332,5 @@ def test_from_dask_variable(self): # Test array creation from Variable with dask backend. # This is used e.g. in broadcast() a = DataArray(self.lazy_array.variable) - self.assertLazyAndIdentical(self.lazy_array, a) + self.assertLazyAndEqual(self.lazy_array, a) From 90743f0bc51b45ab3ac9f91d9d534d303f6d6b1b Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Sat, 1 Oct 2016 18:37:20 +0100 Subject: [PATCH 07/11] Don't chunk coords --- xarray/core/dataset.py | 11 ++++++++++- xarray/test/test_backends.py | 6 +++--- xarray/test/test_dask.py | 32 ++++++++++++++++++++------------ xarray/test/test_dataset.py | 17 ++++++++++++----- 4 files changed, 45 insertions(+), 21 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 30eaf167f6e..4c0a8163b95 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -806,13 +806,19 @@ def chunks(self): array. 
""" chunks = {} - for v in self.variables.values(): + for v in self.data_vars.values(): if v.chunks is not None: new_chunks = list(zip(v.dims, v.chunks)) if any(chunk != chunks[d] for d, chunk in new_chunks if d in chunks): raise ValueError('inconsistent chunks') chunks.update(new_chunks) + if chunks: + # Add dims that are defined in the coords but are not in data_vars + for v in self.coords.values(): + for dim in v.dims: + if dim not in chunks: + chunks[dim] = (v.size,) return Frozen(SortedKeysDict(chunks)) def chunk(self, chunks=None, name_prefix='xarray-', token=None, @@ -865,6 +871,9 @@ def selkeys(dict_, keys): return dict((d, dict_[d]) for d in keys if d in dict_) def maybe_chunk(name, var, chunks): + if name not in self.data_vars: + return var + chunks = selkeys(chunks, var.dims) if not chunks: chunks = None diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index 0ff5cffec26..8d1a1e0f783 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -125,7 +125,7 @@ def assert_loads(vars=None): if vars is None: vars = expected with self.roundtrip(expected) as actual: - for v in actual.variables.values(): + for v in actual.data_vars.values(): self.assertFalse(v._in_memory) yield actual for k, v in actual.variables.items(): @@ -154,7 +154,7 @@ def test_dataset_compute(self): with self.roundtrip(expected) as actual: # Test Dataset.compute() - for v in actual.variables.values(): + for v in actual.data_vars.values(): self.assertFalse(v._in_memory) computed = actual.compute() @@ -1026,7 +1026,7 @@ def test_dataarray_compute(self): self.assertFalse(actual._in_memory) self.assertTrue(computed._in_memory) self.assertDataArrayAllClose(actual, computed) - + @requires_scipy_or_netCDF4 @requires_pydap class PydapTest(TestCase): diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 1326d936249..9ae83adddde 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -18,11 +18,19 @@ def assertLazyAnd(self, expected, actual, test): with dask.set_options(get=dask.get): test(actual, expected) if isinstance(actual, Dataset): - for var in actual.variables.values(): - self.assertIsInstance(var.data, da.Array) - else: - var = getattr(actual, 'variable', actual) - self.assertIsInstance(var.data, da.Array) + for k, v in actual.variables.items(): + if k in actual.data_vars: + self.assertIsInstance(var.data, da.Array) + else: + self.assertIsInstance(var.data, np.ndarray) + elif isinstance(actual, DataArray): + self.assertIsInstance(actual.data, da.Array) + for coord in actual.coords.values(): + self.assertIsInstance(coord.data, np.ndarray) + elif isinstance(actual, Variable): + self.assertIsInstance(actual.data, da.Array) + else: + assert False @requires_dask @@ -188,6 +196,7 @@ def setUp(self): def test_rechunk(self): chunked = self.eager_array.chunk({'x': 2}).chunk({'y': 2}) self.assertEqual(chunked.chunks, ((2,) * 2, (2,) * 3)) + self.assertLazyAndIdentical(self.lazy_array, chunked) def test_new_chunk(self): chunked = self.eager_array.chunk() @@ -306,19 +315,19 @@ def test_dataarray_pickle(self): # Test that pickling/unpickling does not convert the dask # backend to numpy a1 = DataArray([1,2]).chunk() - self.assertFalse(a1._in_memory) + self.assertFalse(a1._in_memory) a2 = pickle.loads(pickle.dumps(a1)) self.assertDataArrayIdentical(a1, a2) self.assertFalse(a1._in_memory) - self.assertFalse(a2._in_memory) + self.assertFalse(a2._in_memory) def test_dataset_pickle(self): - ds1 = Dataset({'a': [1,2]}).chunk() - 
self.assertFalse(ds1['a']._in_memory) + ds1 = Dataset({'a': DataArray([1,2])}).chunk() + self.assertFalse(ds1['a']._in_memory) ds2 = pickle.loads(pickle.dumps(ds1)) self.assertDatasetIdentical(ds1, ds2) self.assertFalse(ds1['a']._in_memory) - self.assertFalse(ds2['a']._in_memory) + self.assertFalse(ds2['a']._in_memory) def test_values(self): # Test that invoking the values property does not convert the dask @@ -326,11 +335,10 @@ def test_values(self): a = DataArray([1,2]).chunk() self.assertFalse(a._in_memory) self.assertEquals(a.values.tolist(), [1, 2]) - self.assertFalse(a._in_memory) + self.assertFalse(a._in_memory) def test_from_dask_variable(self): # Test array creation from Variable with dask backend. # This is used e.g. in broadcast() a = DataArray(self.lazy_array.variable) self.assertLazyAndEqual(self.lazy_array, a) - diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index bac07d68d0f..ada342b42ec 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -667,14 +667,21 @@ def test_chunk(self): self.assertEqual(data.chunks, {}) reblocked = data.chunk() - for v in reblocked.variables.values(): - self.assertIsInstance(v.data, da.Array) - expected_chunks = dict((d, (s,)) for d, s in data.dims.items()) + for k, v in reblocked.variables.items(): + if k in reblocked.data_vars: + self.assertIsInstance(v.data, da.Array) + else: + self.assertIsInstance(v.data, np.ndarray) + + expected_chunks = {'time': (20,), 'dim1': (8,), + 'dim2': (9,), 'dim3': (10,)} self.assertEqual(reblocked.chunks, expected_chunks) reblocked = data.chunk({'time': 5, 'dim1': 5, 'dim2': 5, 'dim3': 5}) - expected_chunks = {'time': (5,) * 4, 'dim1': (5, 3), - 'dim2': (5, 4), 'dim3': (5, 5)} + # time is not a dim in any of the data_vars, so it + # doesn't get chunked + expected_chunks = {'time': (20,), 'dim1': (5, 3), + 'dim2': (5, 4), 'dim3': (5, 5)} self.assertEqual(reblocked.chunks, expected_chunks) reblocked = data.chunk(expected_chunks) From ac8e0cb72ede763694663b0a58e7ef968d6d8bbc Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Thu, 6 Oct 2016 15:37:59 +0100 Subject: [PATCH 08/11] Added performance warning to release notes --- doc/whats-new.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b62d727e960..1a0bbcaf81a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -28,6 +28,9 @@ Breaking changes - Pickling an xarray object based on the dask backend, or reading its :py:meth:`values` property, won't automatically convert the array from dask to numpy in the original object anymore. + If a dask object is used as a coord of a :py:class:`~xarray.DataArray` or + :py:class:`~xarray.Dataset`, its values won't be automatically cached, likely + causing performance degradation. By `Guido Imperiale `_. Deprecations @@ -57,8 +60,6 @@ Enhancements to convert a ``DataArray`` to a ``Dataset`` before saving as a netCDF file, and deals with names to ensure a perfect 'roundtrip' capability. By `Robin Wilson `_. - ->>>>>>> master - Multi-index levels are now accessible as "virtual" coordinate variables, e.g., ``ds['time']`` can pull out the ``'time'`` level of a multi-index (see :ref:`coordinates`). ``sel`` also accepts providing multi-index levels @@ -78,7 +79,6 @@ Enhancements :py:meth:`Variable.compute` as a non-mutating alternative to :py:meth:`~DataArray.load`. By `Guido Imperiale `_. - - Adds DataArray and Dataset methods :py:meth:`~xarray.DataArray.cumsum` and :py:meth:`~xarray.DataArray.cumprod`. 
By `Phillip J. Wolfram `_. From e7f600c698084d47b797bbb73d1615eed2e310c7 Mon Sep 17 00:00:00 2001 From: Guido Imperiale Date: Tue, 11 Oct 2016 00:01:59 +0100 Subject: [PATCH 09/11] Fix bug that caused dask array to be computed and then discarded when pickling --- xarray/core/variable.py | 5 +++-- xarray/test/test_dask.py | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 3fe3cb3d81a..b447b0b0f6b 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -301,7 +301,7 @@ def load(self): Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or load data automatically. - """ + """ self._data = self._data_cast() return self @@ -320,7 +320,8 @@ def compute(self): def __getstate__(self): """Always cache data as an in-memory array before pickling (with the exception of dask backend)""" - self._data_cached() + if not isinstance(self._data, dask_array_type): + self._data_cached() # self.__dict__ is the default pickle object, we don't need to # implement our own __setstate__ method to make pickle work return self.__dict__ diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py index 9b1d51c1966..19e72e5884c 100644 --- a/xarray/test/test_dask.py +++ b/xarray/test/test_dask.py @@ -309,20 +309,39 @@ def test_dot(self): lazy = self.lazy_array.dot(self.lazy_array[0]) self.assertLazyAndAllClose(eager, lazy) + def test_variable_pickle(self): + # Test that pickling/unpickling does not convert the dask + # backend to numpy + a1 = Variable(['x'], build_dask_array()) + a1.compute() + self.assertFalse(a1._in_memory) + self.assertEquals(kernel_call_count, 1) + a2 = pickle.loads(pickle.dumps(a1)) + self.assertEquals(kernel_call_count, 1) + self.assertVariableIdentical(a1, a2) + self.assertFalse(a1._in_memory) + self.assertFalse(a2._in_memory) + def test_dataarray_pickle(self): # Test that pickling/unpickling does not convert the dask # backend to numpy - a1 = DataArray([1,2]).chunk() + a1 = DataArray(build_dask_array()) + a1.compute() self.assertFalse(a1._in_memory) + self.assertEquals(kernel_call_count, 1) a2 = pickle.loads(pickle.dumps(a1)) + self.assertEquals(kernel_call_count, 1) self.assertDataArrayIdentical(a1, a2) self.assertFalse(a1._in_memory) self.assertFalse(a2._in_memory) def test_dataset_pickle(self): - ds1 = Dataset({'a': DataArray([1,2])}).chunk() + ds1 = Dataset({'a': DataArray(build_dask_array())}) + ds1.compute() self.assertFalse(ds1['a']._in_memory) + self.assertEquals(kernel_call_count, 1) ds2 = pickle.loads(pickle.dumps(ds1)) + self.assertEquals(kernel_call_count, 1) self.assertDatasetIdentical(ds1, ds2) self.assertFalse(ds1['a']._in_memory) self.assertFalse(ds2['a']._in_memory) @@ -340,3 +359,20 @@ def test_from_dask_variable(self): # This is used e.g. in broadcast() a = DataArray(self.lazy_array.variable) self.assertLazyAndEqual(self.lazy_array, a) + + +kernel_call_count = 0 +def kernel(): + """Dask kernel to test pickling/unpickling. + Must be global to make it pickleable. 
+    """
+    global kernel_call_count
+    kernel_call_count += 1
+    return np.ones(1)
+
+def build_dask_array():
+    global kernel_call_count
+    kernel_call_count = 0
+    return dask.array.Array(
+        dask={('foo', 0): (kernel, )}, name='foo',
+        chunks=((1,),), dtype=int)

From 27b091604dbb3be89f3deb3e3fe7ec057dd46986 Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Sat, 12 Nov 2016 17:23:08 +0000
Subject: [PATCH 10/11] Eagerly cache IndexVariables only

Eagerly cache only IndexVariables (i.e. coords that are in dims).
Coords that are not in dims are chunked and not cached.
---
 doc/whats-new.rst            |  4 ++--
 xarray/core/dataset.py       | 19 ++++++-------------
 xarray/core/variable.py      |  5 +++++
 xarray/test/test_backends.py | 17 +++++++++++------
 xarray/test/test_dask.py     | 13 ++++++++-----
 xarray/test/test_dataset.py  | 12 +++++-------
 6 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 7bb3ba0d7c0..23ac6445c98 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -29,8 +29,8 @@ Breaking changes
   :py:meth:`values` property, won't automatically convert the array from dask
   to numpy in the original object anymore.
   If a dask object is used as a coord of a :py:class:`~xarray.DataArray` or
-  :py:class:`~xarray.Dataset`, its values won't be automatically cached, likely
-  causing performance degradation.
+  :py:class:`~xarray.Dataset`, its values will still be automatically cached,
+  but only if it's used to index a dim (e.g. it's used for alignment).
   By `Guido Imperiale `_.
 
 Deprecations
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index f3aef2f2286..3b1d19606d4 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -829,19 +829,12 @@ def chunks(self):
         array.
         """
         chunks = {}
-        for v in self.data_vars.values():
+        for v in self.variables.values():
             if v.chunks is not None:
-                new_chunks = list(zip(v.dims, v.chunks))
-                if any(chunk != chunks[d] for d, chunk in new_chunks
-                       if d in chunks):
-                    raise ValueError('inconsistent chunks')
-                chunks.update(new_chunks)
-        if chunks:
-            # Add dims that are defined in the coords but are not in data_vars
-            for v in self.coords.values():
-                for dim in v.dims:
-                    if dim not in chunks:
-                        chunks[dim] = (v.size,)
+                for dim, c in zip(v.dims, v.chunks):
+                    if dim in chunks and c != chunks[dim]:
+                        raise ValueError('inconsistent chunks')
+                    chunks[dim] = c
         return Frozen(SortedKeysDict(chunks))
 
     def chunk(self, chunks=None, name_prefix='xarray-', token=None,
@@ -894,7 +887,7 @@ def selkeys(dict_, keys):
             return dict((d, dict_[d]) for d in keys if d in dict_)
 
         def maybe_chunk(name, var, chunks):
-            if name not in self.data_vars:
+            if name in self.dims:
                 return var
 
             chunks = selkeys(chunks, var.dims)
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index b3ebcab0afe..7e8b13c08d3 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -1100,6 +1100,11 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
             raise ValueError('%s objects must be 1-dimensional' %
                              type(self).__name__)
 
+    def _data_cached(self):
+        # Unlike in Variable._data_cached, always eagerly resolve dask arrays
+        self._data = self._data_cast()
+        return self._data
+
     def _data_cast(self):
         if isinstance(self._data, PandasIndexAdapter):
             return self._data
diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py
index 01cc0269f49..524fe50d7f9 100644
--- a/xarray/test/test_backends.py
+++ b/xarray/test/test_backends.py
@@ -128,8 +128,10 @@ def assert_loads(vars=None):
             if vars is None:
                 vars = expected
             with self.roundtrip(expected) as actual:
-                for v in actual.data_vars.values():
-                    self.assertFalse(v._in_memory)
+                for k, v in actual.variables.items():
+                    # IndexVariables are eagerly cached into memory
+                    if k not in actual.dims:
+                        self.assertFalse(v._in_memory)
                 yield actual
             for k, v in actual.variables.items():
                 if k in vars:
@@ -157,13 +159,16 @@ def test_dataset_compute(self):
 
         with self.roundtrip(expected) as actual:
             # Test Dataset.compute()
-            for v in actual.data_vars.values():
-                self.assertFalse(v._in_memory)
+            for k, v in actual.variables.items():
+                # IndexVariables are eagerly cached
+                if k not in actual.dims:
+                    self.assertFalse(v._in_memory)
 
             computed = actual.compute()
 
-            for v in actual.data_vars.values():
-                self.assertFalse(v._in_memory)
+            for k, v in actual.variables.items():
+                if k not in actual.dims:
+                    self.assertFalse(v._in_memory)
             for v in computed.variables.values():
                 self.assertTrue(v._in_memory)
diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py
index 9485347802c..69a434a4f72 100644
--- a/xarray/test/test_dask.py
+++ b/xarray/test/test_dask.py
@@ -24,14 +24,17 @@ def assertLazyAnd(self, expected, actual, test):
             test(actual, expected)
         if isinstance(actual, Dataset):
             for k, v in actual.variables.items():
-                if k in actual.data_vars:
-                    self.assertIsInstance(v.data, da.Array)
-                else:
+                if k in actual.dims:
                     self.assertIsInstance(v.data, np.ndarray)
+                else:
+                    self.assertIsInstance(v.data, da.Array)
         elif isinstance(actual, DataArray):
             self.assertIsInstance(actual.data, da.Array)
-            for coord in actual.coords.values():
-                self.assertIsInstance(coord.data, np.ndarray)
+            for k, v in actual.coords.items():
+                if k in actual.dims:
+                    self.assertIsInstance(v.data, np.ndarray)
+                else:
+                    self.assertIsInstance(v.data, da.Array)
         elif isinstance(actual, Variable):
             self.assertIsInstance(actual.data, da.Array)
         else:
diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py
index 254059cb128..57a18f0c5cc 100644
--- a/xarray/test/test_dataset.py
+++ b/xarray/test/test_dataset.py
@@ -673,20 +673,18 @@ def test_chunk(self):
 
         reblocked = data.chunk()
         for k, v in reblocked.variables.items():
-            if k in reblocked.data_vars:
-                self.assertIsInstance(v.data, da.Array)
-            else:
+            if k in reblocked.dims:
                 self.assertIsInstance(v.data, np.ndarray)
+            else:
+                self.assertIsInstance(v.data, da.Array)
 
-        expected_chunks = {'time': (20,), 'dim1': (8,),
-                           'dim2': (9,), 'dim3': (10,)}
+        expected_chunks = {'dim1': (8,), 'dim2': (9,), 'dim3': (10,)}
         self.assertEqual(reblocked.chunks, expected_chunks)
 
         reblocked = data.chunk({'time': 5, 'dim1': 5, 'dim2': 5, 'dim3': 5})
         # time is not a dim in any of the data_vars, so it
         # doesn't get chunked
-        expected_chunks = {'time': (20,), 'dim1': (5, 3),
-                           'dim2': (5, 4), 'dim3': (5, 5)}
+        expected_chunks = {'dim1': (5, 3), 'dim2': (5, 4), 'dim3': (5, 5)}
         self.assertEqual(reblocked.chunks, expected_chunks)
 
         reblocked = data.chunk(expected_chunks)

From 376200a48e642b1a8097085eee3f47a1792b3787 Mon Sep 17 00:00:00 2001
From: Guido Imperiale
Date: Sun, 13 Nov 2016 18:19:55 +0000
Subject: [PATCH 11/11] Load IndexVariable.data into memory in init

Make IndexVariables eagerly load their data into memory (from disk or
dask) as soon as they're created.
---
 doc/whats-new.rst            |  2 +-
 xarray/core/dataset.py       |  3 ---
 xarray/core/variable.py      | 21 ++++++++++++---------
 xarray/test/test_backends.py | 14 ++++++++++----
 xarray/test/test_dataset.py  | 21 +++++++++++++++++----
 xarray/test/test_variable.py |  6 ++----
 6 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 23ac6445c98..31048f333ab 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -29,7 +29,7 @@ Breaking changes
   :py:meth:`values` property, won't automatically convert the array from dask
   to numpy in the original object anymore.
   If a dask object is used as a coord of a :py:class:`~xarray.DataArray` or
-  :py:class:`~xarray.Dataset`, its values will still be automatically cached,
+  :py:class:`~xarray.Dataset`, its values are eagerly computed and cached,
   but only if it's used to index a dim (e.g. it's used for alignment).
   By `Guido Imperiale `_.
 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 3b1d19606d4..282409cf4ca 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -887,9 +887,6 @@ def selkeys(dict_, keys):
             return dict((d, dict_[d]) for d in keys if d in dict_)
 
         def maybe_chunk(name, var, chunks):
-            if name in self.dims:
-                return var
-
             chunks = selkeys(chunks, var.dims)
             if not chunks:
                 chunks = None
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 7e8b13c08d3..1b6f5b55dda 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -1100,16 +1100,19 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
             raise ValueError('%s objects must be 1-dimensional' %
                              type(self).__name__)
 
-    def _data_cached(self):
-        # Unlike in Variable._data_cached, always eagerly resolve dask arrays
-        self._data = self._data_cast()
-        return self._data
+        # Unlike in Variable, always eagerly load values into memory
+        if not isinstance(self._data, PandasIndexAdapter):
+            self._data = PandasIndexAdapter(self._data)
 
-    def _data_cast(self):
-        if isinstance(self._data, PandasIndexAdapter):
-            return self._data
-        else:
-            return PandasIndexAdapter(self._data)
+    @Variable.data.setter
+    def data(self, data):
+        Variable.data.fset(self, data)
+        if not isinstance(self._data, PandasIndexAdapter):
+            self._data = PandasIndexAdapter(self._data)
+
+    def chunk(self, chunks=None, name=None, lock=False):
+        # Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk()
+        return self.copy(deep=False)
 
     def __getitem__(self, key):
         key = self._item_key_to_tuple(key)
diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py
index 524fe50d7f9..65e0f3d51ac 100644
--- a/xarray/test/test_backends.py
+++ b/xarray/test/test_backends.py
@@ -129,8 +129,10 @@ def assert_loads(vars=None):
                 vars = expected
             with self.roundtrip(expected) as actual:
                 for k, v in actual.variables.items():
-                    # IndexVariables are eagerly cached into memory
-                    if k not in actual.dims:
+                    # IndexVariables are eagerly loaded into memory
+                    if k in actual.dims:
+                        self.assertTrue(v._in_memory)
+                    else:
                         self.assertFalse(v._in_memory)
                 yield actual
             for k, v in actual.variables.items():
@@ -161,13 +163,17 @@ def test_dataset_compute(self):
             # Test Dataset.compute()
             for k, v in actual.variables.items():
                 # IndexVariables are eagerly cached
-                if k not in actual.dims:
+                if k in actual.dims:
+                    self.assertTrue(v._in_memory)
+                else:
                     self.assertFalse(v._in_memory)
 
             computed = actual.compute()
 
             for k, v in actual.variables.items():
-                if k not in actual.dims:
+                if k in actual.dims:
+                    self.assertTrue(v._in_memory)
+                else:
                     self.assertFalse(v._in_memory)
             for v in computed.variables.values():
                 self.assertTrue(v._in_memory)
diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py
index 57a18f0c5cc..67af48e897c 100644
--- a/xarray/test/test_dataset.py
+++ b/xarray/test/test_dataset.py
@@ -56,11 +56,24 @@ def create_test_multiindex():
 
 
 class InaccessibleVariableDataStore(backends.InMemoryDataStore):
+    def __init__(self, writer=None):
+        super(InaccessibleVariableDataStore, self).__init__(writer)
+        self._indexvars = set()
+
+    def store(self, variables, attributes, check_encoding_set=frozenset()):
+        super(InaccessibleVariableDataStore, self).store(
+            variables, attributes, check_encoding_set)
+        for k, v in variables.items():
+            if isinstance(v, IndexVariable):
+                self._indexvars.add(k)
+
     def get_variables(self):
-        def lazy_inaccessible(x):
-            data = indexing.LazilyIndexedArray(InaccessibleArray(x.values))
-            return Variable(x.dims, data, x.attrs)
-        return dict((k, lazy_inaccessible(v)) for
+        def lazy_inaccessible(k, v):
+            if k in self._indexvars:
+                return v
+            data = indexing.LazilyIndexedArray(InaccessibleArray(v.values))
+            return Variable(v.dims, data, v.attrs)
+        return dict((k, lazy_inaccessible(k, v)) for
                     k, v in iteritems(self._variables))
diff --git a/xarray/test/test_variable.py b/xarray/test/test_variable.py
index 8693ed39f65..e79d30eeb6e 100644
--- a/xarray/test/test_variable.py
+++ b/xarray/test/test_variable.py
@@ -1052,13 +1052,11 @@ def test_multiindex_default_level_names(self):
 
     def test_data(self):
         x = IndexVariable('x', np.arange(3.0))
-        # data should be initially saved as an ndarray
-        self.assertIs(type(x._data), np.ndarray)
+        self.assertIsInstance(x._data, PandasIndexAdapter)
+        self.assertIsInstance(x.data, np.ndarray)
         self.assertEqual(float, x.dtype)
         self.assertArrayEqual(np.arange(3), x)
         self.assertEqual(float, x.values.dtype)
-        # after inspecting x.values, the IndexVariable value will be saved as an Index
-        self.assertIsInstance(x._data, PandasIndexAdapter)
        with self.assertRaisesRegexp(TypeError, 'cannot be modified'):
             x[:] = 0
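
Usage sketch (illustration only, not part of the patches above). The snippet
below shows the semantics the series converges on. It is a minimal sketch,
assuming an xarray build that includes these patches and a working dask
install; the private `_in_memory` checks used in the tests are replaced here
by plain type inspection, and the names `ds`, `loaded` and `ds2` are
illustrative:

    import pickle

    import numpy as np
    import xarray as xr

    # chunk() turns the numpy payload of each data variable into a lazy
    # dask array; index coordinates stay eagerly loaded (patches 10-11).
    ds = xr.Dataset({'a': ('x', np.arange(4))}).chunk({'x': 2})

    # compute() (patches 01-03) returns a new, fully loaded dataset and
    # leaves the original untouched; load() instead works in place.
    loaded = ds.compute()
    print(type(loaded['a'].data))   # numpy.ndarray
    print(type(ds['a'].data))       # dask.array.Array - unchanged

    # Pickling now round-trips the dask graph instead of computing it
    # (patches 01 and 09): the unpickled dataset is still lazy.
    ds2 = pickle.loads(pickle.dumps(ds))
    print(type(ds2['a'].data))      # dask.array.Array

    # Reading .values computes the data for the caller but no longer
    # caches the result back onto the original object.
    print(ds['a'].values)           # [0 1 2 3]
    print(type(ds['a'].data))       # still dask.array.Array

The split between a mutating load() and a non-mutating compute() mirrors the
convention used by dask itself, where computing returns new results rather
than modifying collections in place.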