From 48733519a46cd3c77dd07ea5cd058828c1c131c0 Mon Sep 17 00:00:00 2001
From: sft-managed
 <u00u7rh1e72hXfsipJ357@rl-dgx2-r11-u7-rapids-dgx202.raplab.nvidia.com>
Date: Wed, 9 Mar 2022 17:49:11 -0800
Subject: [PATCH 1/6] Enable read_text with dask_cudf using byte_range

---
 python/dask_cudf/dask_cudf/__init__.py        |  2 +-
 python/dask_cudf/dask_cudf/io/__init__.py     |  1 +
 .../dask_cudf/dask_cudf/io/tests/sample.pgn   | 53 +++++++++++++++++
 .../dask_cudf/dask_cudf/io/tests/test_text.py | 20 +++++++
 python/dask_cudf/dask_cudf/io/text.py         | 57 +++++++++++++++++++
 5 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 python/dask_cudf/dask_cudf/io/tests/sample.pgn
 create mode 100644 python/dask_cudf/dask_cudf/io/tests/test_text.py
 create mode 100644 python/dask_cudf/dask_cudf/io/text.py

diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py
index e5d85debf6e..c301714ed14 100644
--- a/python/dask_cudf/dask_cudf/__init__.py
+++ b/python/dask_cudf/dask_cudf/__init__.py
@@ -6,7 +6,7 @@
 from . import backends
 from .core import DataFrame, Series, concat, from_cudf, from_dask_dataframe
 from .groupby import groupby_agg
-from .io import read_csv, read_json, read_orc, to_orc
+from .io import read_csv, read_json, read_orc, to_orc, read_text
 
 try:
     from .io import read_parquet
diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py
index f495107e219..85d85b7d1d1 100644
--- a/python/dask_cudf/dask_cudf/io/__init__.py
+++ b/python/dask_cudf/dask_cudf/io/__init__.py
@@ -1,6 +1,7 @@
 from .csv import read_csv
 from .json import read_json
 from .orc import read_orc, to_orc
+from .text import read_text
 
 try:
     from .parquet import read_parquet, to_parquet
diff --git a/python/dask_cudf/dask_cudf/io/tests/sample.pgn b/python/dask_cudf/dask_cudf/io/tests/sample.pgn
new file mode 100644
index 00000000000..e927c24926d
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/io/tests/sample.pgn
@@ -0,0 +1,53 @@
+[Event "Rated Bullet tournament https://lichess.org/tournament/IaRkDsvp"]
+[Site "https://lichess.org/r0cYFhsy"]
+[White "GreatGig"]
+[Black "hackattack"]
+[Result "0-1"]
+[UTCDate "2016.04.30"]
+[UTCTime "22:00:03"]
+[WhiteElo "1777"]
+[BlackElo "1809"]
+[WhiteRatingDiff "-11"]
+[BlackRatingDiff "+11"]
+[ECO "B01"]
+[Opening "Scandinavian Defense: Mieses-Kotroc Variation"]
+[TimeControl "60+0"]
+[Termination "Time forfeit"]
+
+1. e4 d5 2. exd5 Qxd5 3. Nc3 Qd8 4. d4 Nf6 5. Nf3 Bg4 6. h3 Bxf3 7. gxf3 c6 8. Bg2 Nbd7 9. Be3 e6 10. Qd2 Nd5 11. Nxd5 cxd5 12. O-O-O Be7 13. c3 Qc7 14. Kb1 O-O-O 15. f4 Kb8 16. Rhg1 Ka8 17. Bh1 g6 18. h4 Bxh4 19. f3 Be7 20. Qc2 Nf6 21. Bg2 Nh5 22. Bh3 Nxf4 23. Bxf4 Qxf4 24. Rdf1 Qd6 25. Rg4 Rdf8 26. Rfg1 f5 27. R4g2 Bf6 28. Rg3 Rfg8 29. Bf1 Rg7 30. Bd3 Rhg8 31. Qh2 Qb8 32. Qg2 Qc8 33. f4 Qc6 34. Qf2 Bh4 35. Rxg6 Bxf2 36. Rxg7 Rxg7 37. Rxg7 a6 38. Rg8+ Ka7 39. Rh8 Qd7 40. Rxh7 Qxh7 0-1
+
+[Event "Rated Bullet tournament https://lichess.org/tournament/IaRkDsvp"]
+[Site "https://lichess.org/s7lpBNiu"]
+[White "kh447"]
+[Black "blueskyminer23"]
+[Result "0-1"]
+[UTCDate "2016.04.30"]
+[UTCTime "22:00:03"]
+[WhiteElo "2025"]
+[BlackElo "2046"]
+[WhiteRatingDiff "-12"]
+[BlackRatingDiff "+11"]
+[ECO "D94"]
+[Opening "Gruenfeld Defense: Three Knights Variation, Paris Variation"]
+[TimeControl "60+0"]
+[Termination "Time forfeit"]
+
+1. d4 Nf6 2. c4 g6 3. Nc3 d5 4. Nf3 Bg7 5. e3 O-O 6. Bd3 c5 7. cxd5 Nxd5 8. O-O Nc6 9. Qe2 Bg4 10. dxc5 Nxc3 11. bxc3 Bxf3 12. gxf3 Bxc3 13. Rb1 Rb8 14. Kh1 Qd5 15. Rg1 Qxc5 16. Qc2 Qa5 17. Qb3 Bg7 18. Qc2 Ne5 19. Be4 Rfc8 20. Qe2 f5 21. Bc2 Qd5 22. e4 Qc6 23. Bb3+ Kh8 24. Bf4 fxe4 25. fxe4 Nd7 26. Rbc1 Qf6 27. Bxb8 Rxb8 28. Rg3 Ne5 29. Rcg1 h6 30. f4 Qxf4 31. Rxg6 Nxg6 32. Qg2 Qe5 33. Bc2 Kg8 34. Qe2 Rf8 35. Qf3 Rf7 36. Qe2 Rf6 37. Qg2 Re6 0-1
+
+[Event "Rated Bullet tournament https://lichess.org/tournament/IaRkDsvp"]
+[Site "https://lichess.org/9CTXrWUB"]
+[White "Demis115"]
+[Black "churrosagogo"]
+[Result "1-0"]
+[UTCDate "2016.04.30"]
+[UTCTime "22:00:03"]
+[WhiteElo "1944"]
+[BlackElo "2007"]
+[WhiteRatingDiff "+14"]
+[BlackRatingDiff "-13"]
+[ECO "C28"]
+[Opening "Bishop's Opening: Vienna Hybrid"]
+[TimeControl "60+0"]
+[Termination "Normal"]
+
+1. e4 e5 2. Nc3 Nc6 3. Bc4 Nf6 4. d3 Bc5 5. Be3 O-O 6. Bxc5 d6 7. Be3 a6 8. Nge2 b5 9. Bb3 b4 10. Nd5 Na5 11. Bg5 Nxb3 12. axb3 Be6 13. Bxf6 gxf6 14. Ng3 Bxd5 15. exd5 Qd7 16. O-O Kh8 17. Qe2 f5 18. Qh5 f4 19. Nf5 Rg8 20. Nh6 Rg6 21. Rfe1 Rag8 22. g3 f5 23. Qxf5 Qg7 24. Nf7+ Qxf7 25. Qxf7 fxg3 26. hxg3 R6g7 27. Qf6 1-0
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py
new file mode 100644
index 00000000000..0b3df51e1dc
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+import os
+import pytest
+
+import cudf
+import dask_cudf
+
+import dask.dataframe as dd
+
+cur_dir = os.path.dirname(__file__)
+text_file = os.path.join(cur_dir, "sample.pgn")
+
+
+@pytest.mark.parametrize("file", [text_file, [text_file]])
+@pytest.mark.parametrize("chunksize", [12, "50 B", None])
+def test_read_text(file, chunksize):
+    df1 = cudf.read_text(text_file, delimiter='"]')
+    df2 = dask_cudf.read_text(file, chunksize=chunksize, delimiter='"]')
+    dd.assert_eq(df1, df2, check_index=False)
diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py
new file mode 100644
index 00000000000..1e39a74880d
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/io/text.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+import os
+from glob import glob
+
+from dask.base import tokenize
+
+import cudf
+
+from dask.utils import apply, parse_bytes
+
+import dask.dataframe as dd
+
+
+def read_text(path, chunksize="256 MiB", **kwargs):
+
+    if isinstance(chunksize, str):
+        chunksize = parse_bytes(chunksize)
+
+    if isinstance(path, list):
+        filenames = path
+    elif isinstance(path, str):
+        filenames = sorted(glob(path))
+    elif hasattr(path, "__fspath__"):
+        filenames = sorted(glob(path.__fspath__()))
+    else:
+        raise TypeError(f"Path type not understood:{type(path)}")
+
+    if not filenames:
+        msg = f"A file in: {filenames} does not exist."
+        raise FileNotFoundError(msg)
+
+    name = "read-text-" + tokenize(path, tokenize, **kwargs)
+
+    if chunksize:
+        dsk = {}
+        i = 0
+        for fn in filenames:
+            size = os.path.getsize(fn)
+            for start in range(0, size, chunksize):
+                kwargs1 = kwargs.copy()
+                kwargs1["byte_range"] = (
+                    start,
+                    chunksize - 1,
+                )  # specify which chunk of the file we care about
+
+                dsk[(name, i)] = (apply, cudf.read_text, [fn], kwargs1)
+                i += 1
+    else:
+        dsk = {
+            (name, i): (apply, cudf.read_text, [fn], kwargs)
+            for i, fn in enumerate(filenames)
+        }
+
+    meta = cudf.Series([], dtype="O")
+    divisions = [None] * (len(dsk) + 1)
+    return dd.core.new_dd_object(dsk, name, meta, divisions)

From 83a6a574555f31e5f13ebce4238e2fa9a60a2430 Mon Sep 17 00:00:00 2001
From: sft-managed
 <u00u7rh1e72hXfsipJ357@rl-dgx2-r11-u7-rapids-dgx202.raplab.nvidia.com>
Date: Wed, 9 Mar 2022 18:10:48 -0800
Subject: [PATCH 2/6] Fix copyright

---
 python/dask_cudf/dask_cudf/__init__.py    | 2 ++
 python/dask_cudf/dask_cudf/io/__init__.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py
index c301714ed14..a1ae812e37c 100644
--- a/python/dask_cudf/dask_cudf/__init__.py
+++ b/python/dask_cudf/dask_cudf/__init__.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+
 from dask.dataframe import from_delayed
 
 import cudf
diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py
index 85d85b7d1d1..76bb2ea99b4 100644
--- a/python/dask_cudf/dask_cudf/io/__init__.py
+++ b/python/dask_cudf/dask_cudf/io/__init__.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+
 from .csv import read_csv
 from .json import read_json
 from .orc import read_orc, to_orc

From 721f9f951ee361d98758431a47c7e46298b75229 Mon Sep 17 00:00:00 2001
From: sft-managed
 <u00u7rh1e72hXfsipJ357@rl-dgx2-r11-u7-rapids-dgx202.raplab.nvidia.com>
Date: Thu, 10 Mar 2022 09:27:08 -0800
Subject: [PATCH 3/6] Move sample data to data directory

---
 python/dask_cudf/dask_cudf/__init__.py              |   2 +-
 .../dask_cudf/io/tests/{ => data}/sample.orc        | Bin
 .../dask_cudf/io/tests/{ => data}/sample.pgn        |   0
 python/dask_cudf/dask_cudf/io/tests/test_orc.py     |   2 +-
 python/dask_cudf/dask_cudf/io/tests/test_text.py    |   8 +++++---
 python/dask_cudf/dask_cudf/io/text.py               |   6 ++----
 6 files changed, 9 insertions(+), 9 deletions(-)
 rename python/dask_cudf/dask_cudf/io/tests/{ => data}/sample.orc (100%)
 rename python/dask_cudf/dask_cudf/io/tests/{ => data}/sample.pgn (100%)

diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py
index a1ae812e37c..5e3a9342c25 100644
--- a/python/dask_cudf/dask_cudf/__init__.py
+++ b/python/dask_cudf/dask_cudf/__init__.py
@@ -8,7 +8,7 @@
 from . import backends
 from .core import DataFrame, Series, concat, from_cudf, from_dask_dataframe
 from .groupby import groupby_agg
-from .io import read_csv, read_json, read_orc, to_orc, read_text
+from .io import read_csv, read_json, read_orc, read_text, to_orc
 
 try:
     from .io import read_parquet
diff --git a/python/dask_cudf/dask_cudf/io/tests/sample.orc b/python/dask_cudf/dask_cudf/io/tests/data/sample.orc
similarity index 100%
rename from python/dask_cudf/dask_cudf/io/tests/sample.orc
rename to python/dask_cudf/dask_cudf/io/tests/data/sample.orc
diff --git a/python/dask_cudf/dask_cudf/io/tests/sample.pgn b/python/dask_cudf/dask_cudf/io/tests/data/sample.pgn
similarity index 100%
rename from python/dask_cudf/dask_cudf/io/tests/sample.pgn
rename to python/dask_cudf/dask_cudf/io/tests/data/sample.pgn
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
index d8ac9e52fd8..4dbc8f52665 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
@@ -13,7 +13,7 @@
 # import pyarrow.orc as orc
 
 cur_dir = os.path.dirname(__file__)
-sample_orc = os.path.join(cur_dir, "sample.orc")
+sample_orc = os.path.join(cur_dir, "data/sample.orc")
 
 
 def test_read_orc_defaults():
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py
index 0b3df51e1dc..2ef6576818f 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_text.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py
@@ -1,15 +1,17 @@
 # Copyright (c) 2022, NVIDIA CORPORATION.
 
 import os
+
 import pytest
 
+import dask.dataframe as dd
+
 import cudf
-import dask_cudf
 
-import dask.dataframe as dd
+import dask_cudf
 
 cur_dir = os.path.dirname(__file__)
-text_file = os.path.join(cur_dir, "sample.pgn")
+text_file = os.path.join(cur_dir, "data/sample.pgn")
 
 
 @pytest.mark.parametrize("file", [text_file, [text_file]])
diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py
index 1e39a74880d..5582e16b384 100644
--- a/python/dask_cudf/dask_cudf/io/text.py
+++ b/python/dask_cudf/dask_cudf/io/text.py
@@ -3,13 +3,11 @@
 import os
 from glob import glob
 
+import dask.dataframe as dd
 from dask.base import tokenize
-
-import cudf
-
 from dask.utils import apply, parse_bytes
 
-import dask.dataframe as dd
+import cudf
 
 
 def read_text(path, chunksize="256 MiB", **kwargs):

From fd81f73f340959be8580358d09ad1ac9e8b4826e Mon Sep 17 00:00:00 2001
From: sft-managed
 <u00u7rh1e72hXfsipJ357@rl-dgx2-r11-u7-rapids-dgx202.raplab.nvidia.com>
Date: Thu, 10 Mar 2022 09:35:53 -0800
Subject: [PATCH 4/6] Fix copyright

---
 python/dask_cudf/dask_cudf/io/tests/test_orc.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
index 4dbc8f52665..f02adcc1cec 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+
 import glob
 import os
 from datetime import datetime, timezone

From 2d74d42403c83af864018498e22b478fc9165701 Mon Sep 17 00:00:00 2001
From: sft-managed
 <u00u7rh1e72hXfsipJ357@rl-dgx2-r11-u7-rapids-dgx202.raplab.nvidia.com>
Date: Tue, 15 Mar 2022 08:43:29 -0700
Subject: [PATCH 5/6] Create subdirectories in data directory

---
 .../dask_cudf/io/tests/data/{ => orc}/sample.orc    | Bin
 .../dask_cudf/io/tests/data/{ => text}/sample.pgn   |   0
 python/dask_cudf/dask_cudf/io/tests/test_orc.py     |   2 +-
 python/dask_cudf/dask_cudf/io/tests/test_text.py    |   2 +-
 4 files changed, 2 insertions(+), 2 deletions(-)
 rename python/dask_cudf/dask_cudf/io/tests/data/{ => orc}/sample.orc (100%)
 rename python/dask_cudf/dask_cudf/io/tests/data/{ => text}/sample.pgn (100%)

diff --git a/python/dask_cudf/dask_cudf/io/tests/data/sample.orc b/python/dask_cudf/dask_cudf/io/tests/data/orc/sample.orc
similarity index 100%
rename from python/dask_cudf/dask_cudf/io/tests/data/sample.orc
rename to python/dask_cudf/dask_cudf/io/tests/data/orc/sample.orc
diff --git a/python/dask_cudf/dask_cudf/io/tests/data/sample.pgn b/python/dask_cudf/dask_cudf/io/tests/data/text/sample.pgn
similarity index 100%
rename from python/dask_cudf/dask_cudf/io/tests/data/sample.pgn
rename to python/dask_cudf/dask_cudf/io/tests/data/text/sample.pgn
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
index f02adcc1cec..f19396a9b37 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
@@ -15,7 +15,7 @@
 # import pyarrow.orc as orc
 
 cur_dir = os.path.dirname(__file__)
-sample_orc = os.path.join(cur_dir, "data/sample.orc")
+sample_orc = os.path.join(cur_dir, "data/orc/sample.orc")
 
 
 def test_read_orc_defaults():
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py
index 2ef6576818f..2b3215dd725 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_text.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py
@@ -11,7 +11,7 @@
 import dask_cudf
 
 cur_dir = os.path.dirname(__file__)
-text_file = os.path.join(cur_dir, "data/sample.pgn")
+text_file = os.path.join(cur_dir, "data/text/sample.pgn")
 
 
 @pytest.mark.parametrize("file", [text_file, [text_file]])

From 48596233d40af947b535c8361babd804cd0f5d14 Mon Sep 17 00:00:00 2001
From: sft-managed
 <u00u7rh1e72hXfsipJ357@rl-dgx2-r11-u7-rapids-dgx202.raplab.nvidia.com>
Date: Mon, 21 Mar 2022 21:33:34 -0700
Subject: [PATCH 6/6] Add a test for byte_range

---
 python/dask_cudf/dask_cudf/io/tests/test_text.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py
index 2b3215dd725..a14eec1fea9 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_text.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py
@@ -20,3 +20,13 @@ def test_read_text(file, chunksize):
     df1 = cudf.read_text(text_file, delimiter='"]')
     df2 = dask_cudf.read_text(file, chunksize=chunksize, delimiter='"]')
     dd.assert_eq(df1, df2, check_index=False)
+
+
+@pytest.mark.parametrize("offset", [0, 100, 250, 500, 1000])
+@pytest.mark.parametrize("size", [64, 128, 256])
+def test_read_text_byte_range(offset, size):
+    df1 = cudf.read_text(text_file, delimiter=".", byte_range=(offset, size))
+    df2 = dask_cudf.read_text(
+        text_file, chunksize=None, delimiter=".", byte_range=(offset, size)
+    )
+    dd.assert_eq(df1, df2, check_index=False)