Commit

Cloud Fetch e2e tests (#154)
* Cloud Fetch e2e tests

Signed-off-by: Matthew Kim <11141331+mattdeekay@users.noreply.github.com>

* Test case works for e2-dogfood shared unity catalog

Signed-off-by: Matthew Kim <11141331+mattdeekay@users.noreply.github.com>

* Moving test to LargeQueriesSuite and setting catalog to hive_metastore

Signed-off-by: Matthew Kim <11141331+mattdeekay@users.noreply.github.com>

* Align default value of buffer_size_bytes in driver tests

Signed-off-by: Matthew Kim <11141331+mattdeekay@users.noreply.github.com>

* Adding comment to specify what's needed to run successfully

Signed-off-by: Matthew Kim <11141331+mattdeekay@users.noreply.github.com>

---------

Signed-off-by: Matthew Kim <11141331+mattdeekay@users.noreply.github.com>
mattdeekay authored Jul 7, 2023
1 parent 5a34a4a commit 759401c
Showing 1 changed file with 33 additions and 1 deletion.
34 changes: 33 additions & 1 deletion tests/e2e/test_driver.py
@@ -1,3 +1,4 @@
+import itertools
 from contextlib import contextmanager
 from collections import OrderedDict
 import datetime
@@ -52,6 +53,7 @@ def __init__(self, method_name):
         # If running in local mode, just use environment variables for params.
         self.arguments = os.environ if get_args_from_env else {}
         self.arraysize = 1000
+        self.buffer_size_bytes = 104857600

     def connection_params(self, arguments):
         params = {
@@ -84,7 +86,7 @@ def connection(self, extra_params=()):
     @contextmanager
     def cursor(self, extra_params=()):
         with self.connection(extra_params) as conn:
-            cursor = conn.cursor(arraysize=self.arraysize)
+            cursor = conn.cursor(arraysize=self.arraysize, buffer_size_bytes=self.buffer_size_bytes)
             try:
                 yield cursor
             finally:
@@ -104,6 +106,36 @@ def get_some_rows(self, cursor, fetchmany_size):
         else:
             return None

+    @skipUnless(pysql_supports_arrow(), 'needs arrow support')
+    def test_cloud_fetch(self):
+        # This test can take several minutes to run
+        limits = [100000, 300000]
+        threads = [10, 25]
+        self.arraysize = 100000
+        # This test requires a large table with many rows to properly initiate cloud fetch.
+        # e2-dogfood host > hive_metastore catalog > main schema has such a table called store_sales.
+        # If this table is deleted or this test is run on a different host, a different table may need to be used.
+        base_query = "SELECT * FROM store_sales WHERE ss_sold_date_sk = 2452234 "
+        for num_limit, num_threads, lz4_compression in itertools.product(limits, threads, [True, False]):
+            with self.subTest(num_limit=num_limit, num_threads=num_threads, lz4_compression=lz4_compression):
+                cf_result, noop_result = None, None
+                query = base_query + "LIMIT " + str(num_limit)
+                with self.cursor({
+                    "use_cloud_fetch": True,
+                    "max_download_threads": num_threads,
+                    "catalog": "hive_metastore"
+                }) as cursor:
+                    cursor.execute(query)
+                    cf_result = cursor.fetchall()
+                with self.cursor({
+                    "catalog": "hive_metastore"
+                }) as cursor:
+                    cursor.execute(query)
+                    noop_result = cursor.fetchall()
+                assert len(cf_result) == len(noop_result)
+                for i in range(len(cf_result)):
+                    assert cf_result[i] == noop_result[i]
+

 # Exclude Retry tests because they require specific setups, and LargeQueries too slow for core
 # tests
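For readers who want to reproduce what the new test exercises outside the e2e harness, here is a minimal standalone sketch of the same flow: run one large query with Cloud Fetch enabled, run it again without, and check that both paths return identical rows. It is an illustration only, not part of this commit: the hostname, HTTP path, token, and query target are placeholders you must supply, and it assumes that use_cloud_fetch and max_download_threads are accepted as databricks.sql.connect keyword arguments, as the test's extra connection params suggest.

# Standalone sketch (not part of this commit) mirroring test_cloud_fetch.
# Assumes use_cloud_fetch / max_download_threads are accepted by sql.connect,
# as the extra connection params in the test suggest.
from databricks import sql

# Placeholder connection details -- replace with values for your workspace.
CONN_ARGS = dict(
    server_hostname="your-workspace.cloud.databricks.com",
    http_path="/sql/1.0/warehouses/your-warehouse-id",
    access_token="your-personal-access-token",
    catalog="hive_metastore",
)

# A query large enough to trigger Cloud Fetch result links; the committed test
# relies on the store_sales table available on the e2-dogfood host.
QUERY = "SELECT * FROM store_sales WHERE ss_sold_date_sk = 2452234 LIMIT 100000"


def fetch_rows(use_cloud_fetch: bool):
    """Run QUERY and return all rows, optionally via Cloud Fetch."""
    extra = {"use_cloud_fetch": True, "max_download_threads": 10} if use_cloud_fetch else {}
    with sql.connect(**CONN_ARGS, **extra) as conn:
        with conn.cursor() as cursor:
            cursor.execute(QUERY)
            return cursor.fetchall()


cloud_fetch_rows = fetch_rows(use_cloud_fetch=True)
baseline_rows = fetch_rows(use_cloud_fetch=False)
assert cloud_fetch_rows == baseline_rows  # both paths should return the same data

The committed test itself runs with the rest of the e2e suite in tests/e2e/test_driver.py; as the added comment notes, it needs a host whose hive_metastore catalog exposes a suitably large store_sales table.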
