From 48733519a46cd3c77dd07ea5cd058828c1c131c0 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 9 Mar 2022 17:49:11 -0800 Subject: [PATCH 1/6] Enable read_text with dask_cudf using byte_range --- python/dask_cudf/dask_cudf/__init__.py | 2 +- python/dask_cudf/dask_cudf/io/__init__.py | 1 + .../dask_cudf/dask_cudf/io/tests/sample.pgn | 53 +++++++++++++++++ .../dask_cudf/dask_cudf/io/tests/test_text.py | 20 +++++++ python/dask_cudf/dask_cudf/io/text.py | 57 +++++++++++++++++++ 5 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 python/dask_cudf/dask_cudf/io/tests/sample.pgn create mode 100644 python/dask_cudf/dask_cudf/io/tests/test_text.py create mode 100644 python/dask_cudf/dask_cudf/io/text.py diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index e5d85debf6e..c301714ed14 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -6,7 +6,7 @@ from . import backends from .core import DataFrame, Series, concat, from_cudf, from_dask_dataframe from .groupby import groupby_agg -from .io import read_csv, read_json, read_orc, to_orc +from .io import read_csv, read_json, read_orc, to_orc, read_text try: from .io import read_parquet diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index f495107e219..85d85b7d1d1 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,6 +1,7 @@ from .csv import read_csv from .json import read_json from .orc import read_orc, to_orc +from .text import read_text try: from .parquet import read_parquet, to_parquet diff --git a/python/dask_cudf/dask_cudf/io/tests/sample.pgn b/python/dask_cudf/dask_cudf/io/tests/sample.pgn new file mode 100644 index 00000000000..e927c24926d --- /dev/null +++ b/python/dask_cudf/dask_cudf/io/tests/sample.pgn @@ -0,0 +1,53 @@ +[Event "Rated Bullet tournament https://lichess.org/tournament/IaRkDsvp"] +[Site "https://lichess.org/r0cYFhsy"] +[White "GreatGig"] +[Black "hackattack"] +[Result "0-1"] +[UTCDate "2016.04.30"] +[UTCTime "22:00:03"] +[WhiteElo "1777"] +[BlackElo "1809"] +[WhiteRatingDiff "-11"] +[BlackRatingDiff "+11"] +[ECO "B01"] +[Opening "Scandinavian Defense: Mieses-Kotroc Variation"] +[TimeControl "60+0"] +[Termination "Time forfeit"] + +1. e4 d5 2. exd5 Qxd5 3. Nc3 Qd8 4. d4 Nf6 5. Nf3 Bg4 6. h3 Bxf3 7. gxf3 c6 8. Bg2 Nbd7 9. Be3 e6 10. Qd2 Nd5 11. Nxd5 cxd5 12. O-O-O Be7 13. c3 Qc7 14. Kb1 O-O-O 15. f4 Kb8 16. Rhg1 Ka8 17. Bh1 g6 18. h4 Bxh4 19. f3 Be7 20. Qc2 Nf6 21. Bg2 Nh5 22. Bh3 Nxf4 23. Bxf4 Qxf4 24. Rdf1 Qd6 25. Rg4 Rdf8 26. Rfg1 f5 27. R4g2 Bf6 28. Rg3 Rfg8 29. Bf1 Rg7 30. Bd3 Rhg8 31. Qh2 Qb8 32. Qg2 Qc8 33. f4 Qc6 34. Qf2 Bh4 35. Rxg6 Bxf2 36. Rxg7 Rxg7 37. Rxg7 a6 38. Rg8+ Ka7 39. Rh8 Qd7 40. Rxh7 Qxh7 0-1 + +[Event "Rated Bullet tournament https://lichess.org/tournament/IaRkDsvp"] +[Site "https://lichess.org/s7lpBNiu"] +[White "kh447"] +[Black "blueskyminer23"] +[Result "0-1"] +[UTCDate "2016.04.30"] +[UTCTime "22:00:03"] +[WhiteElo "2025"] +[BlackElo "2046"] +[WhiteRatingDiff "-12"] +[BlackRatingDiff "+11"] +[ECO "D94"] +[Opening "Gruenfeld Defense: Three Knights Variation, Paris Variation"] +[TimeControl "60+0"] +[Termination "Time forfeit"] + +1. d4 Nf6 2. c4 g6 3. Nc3 d5 4. Nf3 Bg7 5. e3 O-O 6. Bd3 c5 7. cxd5 Nxd5 8. O-O Nc6 9. Qe2 Bg4 10. dxc5 Nxc3 11. bxc3 Bxf3 12. gxf3 Bxc3 13. Rb1 Rb8 14. Kh1 Qd5 15. Rg1 Qxc5 16. Qc2 Qa5 17. Qb3 Bg7 18. Qc2 Ne5 19. Be4 Rfc8 20. Qe2 f5 21. Bc2 Qd5 22. e4 Qc6 23. Bb3+ Kh8 24. Bf4 fxe4 25. fxe4 Nd7 26. Rbc1 Qf6 27. Bxb8 Rxb8 28. Rg3 Ne5 29. Rcg1 h6 30. f4 Qxf4 31. Rxg6 Nxg6 32. Qg2 Qe5 33. Bc2 Kg8 34. Qe2 Rf8 35. Qf3 Rf7 36. Qe2 Rf6 37. Qg2 Re6 0-1 + +[Event "Rated Bullet tournament https://lichess.org/tournament/IaRkDsvp"] +[Site "https://lichess.org/9CTXrWUB"] +[White "Demis115"] +[Black "churrosagogo"] +[Result "1-0"] +[UTCDate "2016.04.30"] +[UTCTime "22:00:03"] +[WhiteElo "1944"] +[BlackElo "2007"] +[WhiteRatingDiff "+14"] +[BlackRatingDiff "-13"] +[ECO "C28"] +[Opening "Bishop's Opening: Vienna Hybrid"] +[TimeControl "60+0"] +[Termination "Normal"] + +1. e4 e5 2. Nc3 Nc6 3. Bc4 Nf6 4. d3 Bc5 5. Be3 O-O 6. Bxc5 d6 7. Be3 a6 8. Nge2 b5 9. Bb3 b4 10. Nd5 Na5 11. Bg5 Nxb3 12. axb3 Be6 13. Bxf6 gxf6 14. Ng3 Bxd5 15. exd5 Qd7 16. O-O Kh8 17. Qe2 f5 18. Qh5 f4 19. Nf5 Rg8 20. Nh6 Rg6 21. Rfe1 Rag8 22. g3 f5 23. Qxf5 Qg7 24. Nf7+ Qxf7 25. Qxf7 fxg3 26. hxg3 R6g7 27. Qf6 1-0 diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py new file mode 100644 index 00000000000..0b3df51e1dc --- /dev/null +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import os +import pytest + +import cudf +import dask_cudf + +import dask.dataframe as dd + +cur_dir = os.path.dirname(__file__) +text_file = os.path.join(cur_dir, "sample.pgn") + + +@pytest.mark.parametrize("file", [text_file, [text_file]]) +@pytest.mark.parametrize("chunksize", [12, "50 B", None]) +def test_read_text(file, chunksize): + df1 = cudf.read_text(text_file, delimiter='"]') + df2 = dask_cudf.read_text(file, chunksize=chunksize, delimiter='"]') + dd.assert_eq(df1, df2, check_index=False) diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py new file mode 100644 index 00000000000..1e39a74880d --- /dev/null +++ b/python/dask_cudf/dask_cudf/io/text.py @@ -0,0 +1,57 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import os +from glob import glob + +from dask.base import tokenize + +import cudf + +from dask.utils import apply, parse_bytes + +import dask.dataframe as dd + + +def read_text(path, chunksize="256 MiB", **kwargs): + + if isinstance(chunksize, str): + chunksize = parse_bytes(chunksize) + + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood:{type(path)}") + + if not filenames: + msg = f"A file in: {filenames} does not exist." + raise FileNotFoundError(msg) + + name = "read-text-" + tokenize(path, tokenize, **kwargs) + + if chunksize: + dsk = {} + i = 0 + for fn in filenames: + size = os.path.getsize(fn) + for start in range(0, size, chunksize): + kwargs1 = kwargs.copy() + kwargs1["byte_range"] = ( + start, + chunksize - 1, + ) # specify which chunk of the file we care about + + dsk[(name, i)] = (apply, cudf.read_text, [fn], kwargs1) + i += 1 + else: + dsk = { + (name, i): (apply, cudf.read_text, [fn], kwargs) + for i, fn in enumerate(filenames) + } + + meta = cudf.Series([], dtype="O") + divisions = [None] * (len(dsk) + 1) + return dd.core.new_dd_object(dsk, name, meta, divisions) From 83a6a574555f31e5f13ebce4238e2fa9a60a2430 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 9 Mar 2022 18:10:48 -0800 Subject: [PATCH 2/6] Fix copyright --- python/dask_cudf/dask_cudf/__init__.py | 2 ++ python/dask_cudf/dask_cudf/io/__init__.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index c301714ed14..a1ae812e37c 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -1,3 +1,5 @@ +# Copyright (c) 2018-2022, NVIDIA CORPORATION. + from dask.dataframe import from_delayed import cudf diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 85d85b7d1d1..76bb2ea99b4 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,3 +1,5 @@ +# Copyright (c) 2018-2022, NVIDIA CORPORATION. + from .csv import read_csv from .json import read_json from .orc import read_orc, to_orc From 721f9f951ee361d98758431a47c7e46298b75229 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 10 Mar 2022 09:27:08 -0800 Subject: [PATCH 3/6] Move sample data to data directory --- python/dask_cudf/dask_cudf/__init__.py | 2 +- .../dask_cudf/io/tests/{ => data}/sample.orc | Bin .../dask_cudf/io/tests/{ => data}/sample.pgn | 0 python/dask_cudf/dask_cudf/io/tests/test_orc.py | 2 +- python/dask_cudf/dask_cudf/io/tests/test_text.py | 8 +++++--- python/dask_cudf/dask_cudf/io/text.py | 6 ++---- 6 files changed, 9 insertions(+), 9 deletions(-) rename python/dask_cudf/dask_cudf/io/tests/{ => data}/sample.orc (100%) rename python/dask_cudf/dask_cudf/io/tests/{ => data}/sample.pgn (100%) diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index a1ae812e37c..5e3a9342c25 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -8,7 +8,7 @@ from . import backends from .core import DataFrame, Series, concat, from_cudf, from_dask_dataframe from .groupby import groupby_agg -from .io import read_csv, read_json, read_orc, to_orc, read_text +from .io import read_csv, read_json, read_orc, read_text, to_orc try: from .io import read_parquet diff --git a/python/dask_cudf/dask_cudf/io/tests/sample.orc b/python/dask_cudf/dask_cudf/io/tests/data/sample.orc similarity index 100% rename from python/dask_cudf/dask_cudf/io/tests/sample.orc rename to python/dask_cudf/dask_cudf/io/tests/data/sample.orc diff --git a/python/dask_cudf/dask_cudf/io/tests/sample.pgn b/python/dask_cudf/dask_cudf/io/tests/data/sample.pgn similarity index 100% rename from python/dask_cudf/dask_cudf/io/tests/sample.pgn rename to python/dask_cudf/dask_cudf/io/tests/data/sample.pgn diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index d8ac9e52fd8..4dbc8f52665 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -13,7 +13,7 @@ # import pyarrow.orc as orc cur_dir = os.path.dirname(__file__) -sample_orc = os.path.join(cur_dir, "sample.orc") +sample_orc = os.path.join(cur_dir, "data/sample.orc") def test_read_orc_defaults(): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index 0b3df51e1dc..2ef6576818f 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -1,15 +1,17 @@ # Copyright (c) 2022, NVIDIA CORPORATION. import os + import pytest +import dask.dataframe as dd + import cudf -import dask_cudf -import dask.dataframe as dd +import dask_cudf cur_dir = os.path.dirname(__file__) -text_file = os.path.join(cur_dir, "sample.pgn") +text_file = os.path.join(cur_dir, "data/sample.pgn") @pytest.mark.parametrize("file", [text_file, [text_file]]) diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py index 1e39a74880d..5582e16b384 100644 --- a/python/dask_cudf/dask_cudf/io/text.py +++ b/python/dask_cudf/dask_cudf/io/text.py @@ -3,13 +3,11 @@ import os from glob import glob +import dask.dataframe as dd from dask.base import tokenize - -import cudf - from dask.utils import apply, parse_bytes -import dask.dataframe as dd +import cudf def read_text(path, chunksize="256 MiB", **kwargs): From fd81f73f340959be8580358d09ad1ac9e8b4826e Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 10 Mar 2022 09:35:53 -0800 Subject: [PATCH 4/6] Fix copyright --- python/dask_cudf/dask_cudf/io/tests/test_orc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index 4dbc8f52665..f02adcc1cec 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -1,3 +1,5 @@ +# Copyright (c) 2018-2022, NVIDIA CORPORATION. + import glob import os from datetime import datetime, timezone From 2d74d42403c83af864018498e22b478fc9165701 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 15 Mar 2022 08:43:29 -0700 Subject: [PATCH 5/6] Create subdirectories in data directory --- .../dask_cudf/io/tests/data/{ => orc}/sample.orc | Bin .../dask_cudf/io/tests/data/{ => text}/sample.pgn | 0 python/dask_cudf/dask_cudf/io/tests/test_orc.py | 2 +- python/dask_cudf/dask_cudf/io/tests/test_text.py | 2 +- 4 files changed, 2 insertions(+), 2 deletions(-) rename python/dask_cudf/dask_cudf/io/tests/data/{ => orc}/sample.orc (100%) rename python/dask_cudf/dask_cudf/io/tests/data/{ => text}/sample.pgn (100%) diff --git a/python/dask_cudf/dask_cudf/io/tests/data/sample.orc b/python/dask_cudf/dask_cudf/io/tests/data/orc/sample.orc similarity index 100% rename from python/dask_cudf/dask_cudf/io/tests/data/sample.orc rename to python/dask_cudf/dask_cudf/io/tests/data/orc/sample.orc diff --git a/python/dask_cudf/dask_cudf/io/tests/data/sample.pgn b/python/dask_cudf/dask_cudf/io/tests/data/text/sample.pgn similarity index 100% rename from python/dask_cudf/dask_cudf/io/tests/data/sample.pgn rename to python/dask_cudf/dask_cudf/io/tests/data/text/sample.pgn diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index f02adcc1cec..f19396a9b37 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -15,7 +15,7 @@ # import pyarrow.orc as orc cur_dir = os.path.dirname(__file__) -sample_orc = os.path.join(cur_dir, "data/sample.orc") +sample_orc = os.path.join(cur_dir, "data/orc/sample.orc") def test_read_orc_defaults(): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index 2ef6576818f..2b3215dd725 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -11,7 +11,7 @@ import dask_cudf cur_dir = os.path.dirname(__file__) -text_file = os.path.join(cur_dir, "data/sample.pgn") +text_file = os.path.join(cur_dir, "data/text/sample.pgn") @pytest.mark.parametrize("file", [text_file, [text_file]]) From 48596233d40af947b535c8361babd804cd0f5d14 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 21 Mar 2022 21:33:34 -0700 Subject: [PATCH 6/6] Add a test for byte_range --- python/dask_cudf/dask_cudf/io/tests/test_text.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index 2b3215dd725..a14eec1fea9 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -20,3 +20,13 @@ def test_read_text(file, chunksize): df1 = cudf.read_text(text_file, delimiter='"]') df2 = dask_cudf.read_text(file, chunksize=chunksize, delimiter='"]') dd.assert_eq(df1, df2, check_index=False) + + +@pytest.mark.parametrize("offset", [0, 100, 250, 500, 1000]) +@pytest.mark.parametrize("size", [64, 128, 256]) +def test_read_text_byte_range(offset, size): + df1 = cudf.read_text(text_file, delimiter=".", byte_range=(offset, size)) + df2 = dask_cudf.read_text( + text_file, chunksize=None, delimiter=".", byte_range=(offset, size) + ) + dd.assert_eq(df1, df2, check_index=False)