From d708172db0fdd3c55cce92f541ac676675b7e57b Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 14 Dec 2020 09:15:41 -0800 Subject: [PATCH 01/23] API Overhaul First draft of the API overhauls changes. Adds most core functionality, including defining workflow graphs with a ColumnGroup class, the workflow and dataset changes , most operators converted to use the new api, etc. --- nvtabular/__init__.py | 5 +- nvtabular/column_group.py | 230 +++++ nvtabular/io/dataset.py | 158 +++- nvtabular/ops/__init__.py | 10 +- nvtabular/ops/bucketize.py | 43 +- nvtabular/ops/categorify.py | 144 +--- nvtabular/ops/clip.py | 29 +- nvtabular/ops/column_similarity.py | 85 +- nvtabular/ops/difference_lag.py | 23 +- nvtabular/ops/dropna.py | 19 +- nvtabular/ops/fill.py | 83 +- nvtabular/ops/filter.py | 15 +- nvtabular/ops/groupby_statistics.py | 41 +- nvtabular/ops/hash_bucket.py | 38 +- nvtabular/ops/hashed_cross.py | 43 +- nvtabular/ops/join_external.py | 22 +- nvtabular/ops/join_groupby.py | 9 +- nvtabular/ops/lambdaop.py | 29 +- nvtabular/ops/logop.py | 18 +- nvtabular/ops/median.py | 61 -- nvtabular/ops/minmax.py | 72 -- nvtabular/ops/moments.py | 57 -- nvtabular/ops/normalize.py | 89 +- nvtabular/ops/operator.py | 90 +- nvtabular/ops/rename.py | 48 ++ nvtabular/ops/stat_operator.py | 22 +- nvtabular/ops/target_encoding.py | 101 +-- nvtabular/ops/transform_operator.py | 119 --- nvtabular/workflow.py | 1163 +++----------------------- tests/unit/test_column_similarity.py | 26 +- tests/unit/test_dask_nvt.py | 117 +-- tests/unit/test_io.py | 73 +- tests/unit/test_ops.py | 43 +- tests/unit/test_s3.py | 11 +- tests/unit/test_workflow.py | 135 +-- 35 files changed, 1058 insertions(+), 2213 deletions(-) create mode 100644 nvtabular/column_group.py delete mode 100644 nvtabular/ops/median.py delete mode 100644 nvtabular/ops/minmax.py create mode 100644 nvtabular/ops/rename.py delete mode 100644 nvtabular/ops/transform_operator.py diff --git a/nvtabular/__init__.py b/nvtabular/__init__.py index 7c0b532c69f..b04e7a9763b 100644 --- a/nvtabular/__init__.py +++ b/nvtabular/__init__.py @@ -15,7 +15,7 @@ # import warnings -from . import io, workflow # noqa +from . import column_group, io, workflow # noqa from ._version import get_versions # suppress some warnings with cudf warning about column ordering with dlpack @@ -24,11 +24,12 @@ warnings.filterwarnings("ignore", module="numba.cuda.envvars") +ColumnGroup = column_group.ColumnGroup Workflow = workflow.Workflow Dataset = io.dataset.Dataset -__all__ = ["Workflow", "Dataset"] +__all__ = ["Workflow", "Dataset", "ColumnGroup"] # cudf warns about column ordering with dlpack methods, ignore it warnings.filterwarnings("ignore", module="cudf.io.dlpack") diff --git a/nvtabular/column_group.py b/nvtabular/column_group.py new file mode 100644 index 00000000000..cc7eb655ce3 --- /dev/null +++ b/nvtabular/column_group.py @@ -0,0 +1,230 @@ +# +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import collections.abc + +from nvtabular.ops import LambdaOp, Operator + + +class ColumnGroup: + """A ColumnGroup is a group of columns that you want to apply the same transformations to. + ColumnGroup's can be transformed by shifting operators on to them, which returns a new + ColumnGroup with the transformations applied. This lets you define a graph of operations + that makes up your workflow + + Parameters + ---------- + columns: list of str + The columns to select from the input Dataset + """ + + def __init__(self, columns): + if isinstance(columns, str): + columns = [columns] + + self.columns = columns + self.parents = [] + self.children = [] + self.op = None + self.kind = None + + def __rshift__(self, operator): + """Transforms this ColumnGroup by applying an Operator + + Parameters + ----------- + operators: Operator or callable + + Returns + ------- + ColumnGroup + """ + if isinstance(operator, type) and issubclass(operator, Operator): + # handle case where an operator class is passed + operator = operator() + elif callable(operator): + # implicit lambdaop conversion. + operator = LambdaOp(operator) + + if not isinstance(operator, Operator): + raise ValueError(f"Expected operator or callable, got {operator.__class__}") + + child = ColumnGroup(operator.output_column_names(self.columns)) + child.parents = [self] + self.children.append(child) + child.op = operator + + dependencies = operator.dependencies() + if dependencies: + if not isinstance(dependencies, collections.abc.Sequence): + dependencies = [dependencies] + + for dependency in dependencies: + if not isinstance(dependency, ColumnGroup): + dependency = ColumnGroup(dependency) + dependency.children.append(child) + child.parents.append(dependency) + + return child + + def __add__(self, other): + """Adds columns from this ColumnGroup with another to return a new ColumnGroup + + Parameters + ----------- + other: ColumnGroup or str or list of str + + Returns + ------- + ColumnGroup + """ + if isinstance(other, str): + other = ColumnGroup([other]) + elif isinstance(other, collections.abc.Sequence): + other = ColumnGroup(other) + + # check if there are any columns with the same name in both column groups + overlap = set(self.columns).intersection(other.columns) + if overlap: + raise ValueError(f"duplicate column names found: {overlap}") + + child = ColumnGroup(self.columns + other.columns) + child.parents = [self, other] + child.kind = "+" + self.children.append(child) + other.children.append(child) + return child + + # handle the "column_name" + ColumnGroup case + __radd__ = __add__ + + def __sub__(self, other): + """Adds columns from this ColumnGroup with another to return a new ColumnGroup + + Parameters + ----------- + other: ColumnGroup or str or list of str + + Returns + ------- + ColumnGroup + """ + if isinstance(other, ColumnGroup): + to_remove = set(other.columns) + elif isinstance(other, str): + to_remove = {other} + elif isinstance(other, collections.abc.Sequence): + to_remove = set(other) + else: + raise ValueError(f"Expected ColumnGroup, str, or list of str. 
Got {other.__class__}") + new_columns = [c for c in self.columns if c not in to_remove] + child = ColumnGroup(new_columns) + child.parents = [self] + self.children.append(child) + child.kind = f"- {list(to_remove)}" + return child + + def __repr__(self): + output = " output" if not self.children else "" + return f"" + + @property + def label(self): + if self.op: + return str(self.op.__class__.__name__) + elif self.kind: + return self.kind + elif not self.parents: + return f"input cols=[{self._cols_repr}]" + else: + return "??" + + @property + def _cols_repr(self): + cols = ", ".join(self.columns[:3]) + if len(self.columns) > 3: + cols += "..." + return cols + + @property + def graph(self): + return _to_graphviz(self) + + +def iter_nodes(nodes): + queue = nodes[:] + while queue: + current = queue.pop() + yield current + # TODO: deduplicate nodes? + for parent in current.parents: + queue.append(parent) + + +def _to_graphviz(column_group): + """ converts a columngroup to a GraphViz DiGraph object useful for display in notebooks """ + from graphviz import Digraph + + column_group = _merge_add_nodes(column_group) + graph = Digraph() + + # get all the nodes from parents of this columngroup + # and add edges between each of them + allnodes = list(set(iter_nodes([column_group]))) + node_ids = {v: str(k) for k, v in enumerate(allnodes)} + for node, nodeid in node_ids.items(): + graph.node(nodeid, node.label) + for parent in node.parents: + graph.edge(node_ids[parent], nodeid) + + # add a single 'output' node representing the final state + output_node_id = str(len(allnodes)) + graph.node(output_node_id, f"output cols=[{column_group._cols_repr}]") + graph.edge(node_ids[column_group], output_node_id) + return graph + + +def _merge_add_nodes(graph): + """ merges repeat '+' nodes, leading to nicer looking outputs """ + # lets take a copy to avoid mutating the input + import copy + + graph = copy.deepcopy(graph) + + queue = [graph] + while queue: + current = queue.pop() + if current.kind == "+": + changed = True + while changed: + changed = False + parents = [] + for i, parent in enumerate(current.parents): + if parent.kind == "+" and len(parent.children) == 1: + changed = True + # disconnect parent, point all the grandparents at current instead + parents.extend(parent.parents) + for grandparent in parent.parents: + grandparent.children = [ + current if child == parent else child + for child in grandparent.children + ] + else: + parents.append(parent) + current.parents = parents + + queue.extend(current.parents) + + return graph diff --git a/nvtabular/io/dataset.py b/nvtabular/io/dataset.py index 1655a58db33..e4503dac984 100644 --- a/nvtabular/io/dataset.py +++ b/nvtabular/io/dataset.py @@ -27,10 +27,14 @@ from dask.dataframe.core import new_dd_object from dask.highlevelgraph import HighLevelGraph from dask.utils import parse_bytes +from fsspec.core import get_fs_token_paths from fsspec.utils import stringify_path +from nvtabular.io.shuffle import _check_shuffle_arg + from ..utils import device_mem_size from .csv import CSVDatasetEngine +from .dask import _ddf_to_dataset from .dataframe_engine import DataFrameDatasetEngine from .parquet import ParquetDatasetEngine @@ -173,9 +177,11 @@ def __init__( part_mem_fraction=None, storage_options=None, dtypes=None, + client=None, **kwargs, ): self.dtypes = dtypes + self.client = client if isinstance(path_or_source, (dask.dataframe.DataFrame, cudf.DataFrame, pd.DataFrame)): # User is passing in a .DataFrame # Use DataFrameDatasetEngine @@ -298,7 +304,7 @@ def 
to_iter(self, columns=None, indices=None, shuffle=False, seed=None): each iteration. Parameters - ----------- + ---------- columns : str or list(str); default None Columns to include in each `DataFrame`. If not specified, the outputs will contain all known columns in the Dataset. @@ -323,6 +329,156 @@ def to_iter(self, columns=None, indices=None, shuffle=False, seed=None): self.to_ddf(columns=columns, shuffle=shuffle, seed=seed), indices=indices ) + def to_parquet( + self, output_path, shuffle=None, out_files_per_proc=None, num_threads=0, dtypes=None + ): + """Writes out to a parquet dataset + + Parameters + ---------- + output_path : string + Path to write processed/shuffled output data + shuffle : nvt.io.Shuffle enum + How to shuffle the output dataset. Shuffling is only + performed if the data is written to disk. For all options, + other than `None` (which means no shuffling), the partitions + of the underlying dataset/ddf will be randomly ordered. If + `PER_PARTITION` is specified, each worker/process will also + shuffle the rows within each partition before splitting and + appending the data to a number (`out_files_per_proc`) of output + files. Output files are distinctly mapped to each worker process. + If `PER_WORKER` is specified, each worker will follow the same + procedure as `PER_PARTITION`, but will re-shuffle each file after + all data is persisted. This results in a full shuffle of the + data processed by each worker. To improve performace, this option + currently uses host-memory `BytesIO` objects for the intermediate + persist stage. The `FULL` option is not yet implemented. + out_files_per_proc : integer + Number of files to create (per process) after + shuffling the data + num_threads : integer + Number of IO threads to use for writing the output dataset. + For `0` (default), no dedicated IO threads will be used. + dtypes : dict + Dictionary containing desired datatypes for output columns. + Keys are column names, values are datatypes. + """ + shuffle = _check_shuffle_arg(shuffle) + ddf = self.to_ddf(shuffle=shuffle) + + if dtypes: + _meta = _set_dtypes(ddf._meta, dtypes) + ddf = ddf.map_partitions(_set_dtypes, dtypes, meta=_meta) + + fs = get_fs_token_paths(output_path)[0] + fs.mkdirs(output_path, exist_ok=True) + if shuffle or out_files_per_proc: + # Output dask_cudf DataFrame to dataset + _ddf_to_dataset( + ddf, + fs, + output_path, + shuffle, + out_files_per_proc, + [], + [], + [], + "parquet", + self.client, + num_threads, + ) + return + + # Default (shuffle=None and out_files_per_proc=None) + # Just use `dask_cudf.to_parquet` + fut = ddf.to_parquet(output_path, compression=None, write_index=False, compute=False) + if self.client is None: + fut.compute(scheduler="synchronous") + else: + fut.compute() + + def to_hugectr( + self, + output_path, + cats, + conts, + labels, + shuffle=None, + out_files_per_proc=None, + num_threads=0, + dtypes=None, + ): + """Writes out to a parquet dataset + + Parameters + ---------- + output_path : string + Path to write processed/shuffled output data + cats : list of str + List of categorical columns + conts : list of str + List of continuous columns + labels : list of str + List of label columns + shuffle : nvt.io.Shuffle, optional + How to shuffle the output dataset. Shuffling is only + performed if the data is written to disk. For all options, + other than `None` (which means no shuffling), the partitions + of the underlying dataset/ddf will be randomly ordered. 
If + `PER_PARTITION` is specified, each worker/process will also + shuffle the rows within each partition before splitting and + appending the data to a number (`out_files_per_proc`) of output + files. Output files are distinctly mapped to each worker process. + If `PER_WORKER` is specified, each worker will follow the same + procedure as `PER_PARTITION`, but will re-shuffle each file after + all data is persisted. This results in a full shuffle of the + data processed by each worker. To improve performace, this option + currently uses host-memory `BytesIO` objects for the intermediate + persist stage. The `FULL` option is not yet implemented. + out_files_per_proc : integer + Number of files to create (per process) after + shuffling the data + num_threads : integer + Number of IO threads to use for writing the output dataset. + For `0` (default), no dedicated IO threads will be used. + dtypes : dict + Dictionary containing desired datatypes for output columns. + Keys are column names, values are datatypes. + """ + shuffle = _check_shuffle_arg(shuffle) + shuffle = _check_shuffle_arg(shuffle) + ddf = self.to_ddf(shuffle=shuffle) + if dtypes: + _meta = _set_dtypes(ddf._meta, dtypes) + ddf = ddf.map_partitions(_set_dtypes, dtypes, meta=_meta) + + fs = get_fs_token_paths(output_path)[0] + fs.mkdirs(output_path, exist_ok=True) + if shuffle or out_files_per_proc: + # Output dask_cudf DataFrame to dataset + _ddf_to_dataset( + ddf, + fs, + output_path, + shuffle, + out_files_per_proc, + cats, + conts, + labels, + "hugectr", + self.client, + num_threads, + ) + return + + # Default (shuffle=None and out_files_per_proc=None) + # Just use `dask_cudf.to_parquet` + fut = ddf.to_parquet(output_path, compression=None, write_index=False, compute=False) + if self.client is None: + fut.compute(scheduler="synchronous") + else: + fut.compute() + @property def num_rows(self): return self.engine.num_rows diff --git a/nvtabular/ops/__init__.py b/nvtabular/ops/__init__.py index a7ee1dfdc1f..aece2a6fb89 100644 --- a/nvtabular/ops/__init__.py +++ b/nvtabular/ops/__init__.py @@ -17,24 +17,20 @@ # alias submodules here to avoid breaking everything with moving to submodules # flake8: noqa from .bucketize import Bucketize -from .categorify import Categorify, SetBuckets, _get_embedding_order, get_embedding_sizes +from .categorify import Categorify, _get_embedding_order, get_embedding_sizes from .clip import Clip from .difference_lag import DifferenceLag from .dropna import Dropna from .fill import FillMedian, FillMissing from .filter import Filter -from .groupby_statistics import GroupbyStatistics from .hash_bucket import HashBucket from .hashed_cross import HashedCross from .join_external import JoinExternal from .join_groupby import JoinGroupby from .lambdaop import LambdaOp from .logop import LogOp -from .median import Median -from .minmax import MinMax -from .moments import Moments from .normalize import Normalize, NormalizeMinMax -from .operator import ALL, CAT, CONT, Operator +from .operator import Operator +from .rename import Rename from .stat_operator import StatOperator from .target_encoding import TargetEncoding -from .transform_operator import DFOperator, TransformOperator diff --git a/nvtabular/ops/bucketize.py b/nvtabular/ops/bucketize.py index 8376471c629..6e5e27e6ab2 100644 --- a/nvtabular/ops/bucketize.py +++ b/nvtabular/ops/bucketize.py @@ -16,46 +16,29 @@ from nvtx import annotate from six import callable -from .operator import CAT, CONT -from .transform_operator import TransformOperator +from .operator 
import Operator -class Bucketize(TransformOperator): +class Bucketize(Operator): """""" - default_in = CONT - default_out = CAT - - def __init__(self, boundaries, columns=None, **kwargs): - if isinstance(boundaries, dict): - columns = [i for i in boundaries.keys()] - self.boundaries = boundaries + def __init__(self, boundaries): + # transform boundaries into a lookup function on column names + if isinstance(boundaries, int): + self.boundaries = lambda col: boundaries + elif isinstance(boundaries, dict): + self.boundaries = lambda col: boundaries[col] elif callable(boundaries): self.boundaries = boundaries - elif isinstance(boundaries, (tuple, list)): - if any([isinstance(x, (tuple, list)) for x in boundaries]): - assert all([isinstance(x, (tuple, list)) for x in boundaries]) - assert columns is not None - assert len(columns) == len(boundaries) - self.boundaries = {col: b for col, b in zip(columns, boundaries)} - else: - self.boundaries = lambda name: boundaries else: raise TypeError( - "`boundaries` must be dict, callable, or iterable, got type {}".format( - type(boundaries) - ) + "`boundaries` must be dict, callable, or int, got type {}".format(type(boundaries)) ) - super().__init__(columns=columns, **kwargs) - - @annotate("HashedCross_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): - cont_names = target_columns - if callable(self.boundaries): - boundaries = {name: self.boundaries(name) for name in cont_names} - else: - boundaries = self.boundaries + super().__init__() + @annotate("Bucketize_op", color="darkgreen", domain="nvt_python") + def transform(self, columns, gdf: cudf.DataFrame): + boundaries = {name: self.boundaries(name) for name in columns} new_gdf = cudf.DataFrame() for col, b in boundaries.items(): # TODO: should just be using cupy.digitize but it's not in 7.8 diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index 78a3a5703c4..d1e96a8213e 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -27,6 +27,7 @@ from dask.base import tokenize from dask.core import flatten from dask.dataframe.core import _concat +from dask.delayed import Delayed from dask.highlevelgraph import HighLevelGraph from fsspec.core import get_fs_token_paths from nvtx import annotate @@ -34,13 +35,10 @@ from nvtabular.worker import fetch_table_data, get_worker_cache -from .groupby_statistics import GroupbyStatistics -from .operator import CAT from .stat_operator import StatOperator -from .transform_operator import DFOperator -class Categorify(DFOperator): +class Categorify(StatOperator): """ Most of the data set will contain categorical features, and these variables are typically stored as text values. @@ -115,21 +113,12 @@ class Categorify(DFOperator): a dictionary with column names as keys and frequency limit as value. If dictionary is used, all columns targeted must be included in the dictionary. - columns : list of str or list(str), default None - Categorical columns (or multi-column "groups") to target for this op. - If None, the operation will target all known categorical columns. - If columns contains 1+ list(str) elements, the columns within each - list/group will be encoded according to the `encode_type` setting. encode_type : {"joint", "combo"}, default "joint" If "joint", the columns within any multi-column group will be jointly encoded. If "combo", the combination of values will be encoded as a new column. 
Note that replacement is not allowed for "combo", because the same column name can be included in multiple groups. - replace : bool, default True - Replaces the transformed column with the original input. - Note that this does not apply to multi-column groups with - `encoded_type="combo"`. tree_width : dict or int, optional Passed to `GroupbyStatistics` dependency. out_path : str, optional @@ -161,14 +150,9 @@ class Categorify(DFOperator): will be transformed. """ - default_in = CAT - default_out = CAT - def __init__( self, freq_threshold=0, - columns=None, - replace=True, out_path=None, tree_width=None, na_sentinel=None, @@ -213,35 +197,13 @@ def __init__( # For case (3), we also use this "storage name" to signify the name of # the file with the required "combination" groupby statistics. self.storage_name = {} - if isinstance(columns, str): - columns = [columns] - if isinstance(columns, list): - # User passed in a list of column groups. We need to figure out - # if this list contains any multi-column groups, and if there - # are any (obvious) problems with these groups - self.column_groups = columns - columns = list(set(flatten(columns, container=list))) - columns_all = list(flatten(columns, container=list)) - if sorted(columns_all) != sorted(columns) and encode_type == "joint": - # If we are doing "joint" encoding, there must be unique mapping - # between input column names and column groups. Otherwise, more - # than one unique-value table could be used to encode the same - # column. - raise ValueError("Same column name included in multiple groups.") - for group in self.column_groups: - if isinstance(group, list) and len(group) > 1: - # For multi-column groups, we concatenate column names - # to get the "group" name. - name = _make_name(*group, sep=self.name_sep) - for col in group: - self.storage_name[col] = name # Only support two kinds of multi-column encoding if encode_type not in ("joint", "combo"): raise ValueError(f"encode_type={encode_type} not supported.") # Other self-explanatory intialization - super().__init__(columns=columns, replace=replace) + super().__init__() self.freq_threshold = freq_threshold or 0 self.out_path = out_path or "./" self.tree_width = tree_width @@ -249,9 +211,9 @@ def __init__( self.dtype = dtype self.on_host = on_host self.cat_cache = cat_cache - self.stat_name = "categories" self.encode_type = encode_type self.search_sorted = search_sorted + self.categories = {} if self.search_sorted and self.freq_threshold: raise ValueError( @@ -262,7 +224,6 @@ def __init__( "For hashing num_buckets should be an int > 1, otherwise set num_buckets=None." 
) elif isinstance(num_buckets, dict): - columns = list(num_buckets) self.num_buckets = num_buckets elif isinstance(num_buckets, int) or num_buckets is None: self.num_buckets = num_buckets @@ -271,50 +232,37 @@ def __init__( "`num_buckets` must be dict or int, got type {}".format(type(num_buckets)) ) - @property - def req_stats(self): - stats = [ - GroupbyStatistics( - columns=self.column_groups or self.columns, - concat_groups=self.encode_type == "joint", - cont_names=[], - stats=[], - freq_threshold=self.freq_threshold, - tree_width=self.tree_width, - out_path=self.out_path, - on_host=self.on_host, - stat_name=self.stat_name, - name_sep=self.name_sep, - ) - ] - if self.num_buckets: - stats += [ - SetBuckets( - columns=self.column_groups or self.columns, - num_buckets=self.num_buckets, - freq_limit=self.freq_threshold, - encode_type=self.encode_type, - ) - ] - return stats + @annotate("Categorify_transform", color="darkgreen", domain="nvt_python") + def fit(self, columns, ddf): + dsk, key = _category_stats( + ddf, + columns, + [], + [], + self.out_path, + self.freq_threshold, + self.tree_width, + self.on_host, + concat_groups=self.encode_type == "joint", + name_sep=self.name_sep, + ) + return Delayed(key, dsk) + + def fit_finalize(self, dask_stats): + for col in dask_stats: + self.categories[col] = dask_stats[col] - @annotate("Categorify_op", color="darkgreen", domain="nvt_python") - def apply_op( + @annotate("Categorify_transform", color="darkgreen", domain="nvt_python") + def transform( self, + columns, gdf: cudf.DataFrame, - columns_ctx: dict, - input_cols, - target_cols=["base"], - stats_context={}, ): new_gdf = gdf.copy(deep=False) - target_columns = self.get_columns(columns_ctx, input_cols, target_cols) if isinstance(self.freq_threshold, dict): - assert all(x in self.freq_threshold for x in target_columns) - if not target_columns: - return new_gdf + assert all(x in self.freq_threshold for x in columns) - if self.column_groups and not self.encode_type == "joint": + if self.encode_type == "combo": # Case (3) - We want to track multi- and single-column groups separately # when we are NOT performing a joint encoding. This is because # there is not a 1-to-1 mapping for columns in multi-col groups. @@ -322,20 +270,17 @@ def apply_op( # multi-column groups only, and use `cat_names` to store the # string representation of both single- and multi-column groups. # - cat_names, multi_col_group = _get_multicolumn_names( - self.column_groups, gdf.columns, self.name_sep - ) + cat_names, multi_col_group = _get_multicolumn_names(columns, gdf.columns, self.name_sep) else: # Case (1) & (2) - Simple 1-to-1 mapping multi_col_group = {} - cat_names = [name for name in target_columns if name in gdf.columns] + cat_names = columns # Encode each column-group separately for name in cat_names: - new_col = f"{name}_{self._id}" - # Use the column-group `list` directly (not the string name) use_name = multi_col_group.get(name, name) + # Storage name may be different than group for case (2) # Only use the "aliased" `storage_name` if we are dealing with # a multi-column group, or if we are doing joint encoding @@ -343,13 +288,16 @@ def apply_op( storage_name = self.storage_name.get(name, name) else: storage_name = name - path = stats_context[self.stat_name][storage_name] + + path = self.categories[storage_name] + """ TODO ?? 
if not self.column_groups and _is_list_col([name], gdf): if "mh" not in columns_ctx["categorical"]: columns_ctx["categorical"]["mh"] = [] if name not in columns_ctx["categorical"]["mh"]: columns_ctx["categorical"]["mh"].append(name) - new_gdf[new_col] = _encode( + """ + new_gdf[name] = _encode( use_name, storage_name, path, @@ -365,18 +313,16 @@ def apply_op( cat_names=cat_names, ) if self.dtype: - new_gdf[new_col] = new_gdf[new_col].astype(self.dtype, copy=False) + new_gdf[name] = new_gdf[name].astype(self.dtype, copy=False) - # Deal with replacement - if self.replace: - for name in cat_names: - new_col = f"{name}_{self._id}" - new_gdf[name] = new_gdf[new_col] - new_gdf.drop(columns=[new_col], inplace=True) - - self.update_columns_ctx(columns_ctx, input_cols, new_gdf.columns, target_columns) return new_gdf + def output_column_names(self, columns): + if self.encode_type == "combo": + cat_names, _ = _get_multicolumn_names(columns, columns, self.name_sep) + return cat_names + return list(flatten(columns)) + def _get_embedding_order(cat_names): """Returns a consistent sorder order for categorical variables @@ -434,8 +380,8 @@ def _get_embeddings_dask(paths, cat_names, buckets=None, freq_limit=0): return embeddings -def _emb_sz_rule(n_cat: int) -> int: - return n_cat, int(min(16, round(1.6 * n_cat ** 0.56))) +def _emb_sz_rule(n_cat: int, minimum_size=16, maximum_size=512) -> int: + return n_cat, min(max(minimum_size, round(1.6 * n_cat ** 0.56)), maximum_size) def _make_name(*args, sep="_"): diff --git a/nvtabular/ops/clip.py b/nvtabular/ops/clip.py index edfe3bb4440..7af12fa0d0c 100644 --- a/nvtabular/ops/clip.py +++ b/nvtabular/ops/clip.py @@ -16,11 +16,10 @@ import cudf from nvtx import annotate -from .operator import CONT -from .transform_operator import TransformOperator +from .operator import Operator -class Clip(TransformOperator): +class Clip(Operator): """ This operation clips continuous values so that they are within a min/max bound. For instance by setting the min value to 0, you can replace all negative values with 0. @@ -28,7 +27,7 @@ class Clip(TransformOperator): # clip all continuous columns to be positive only, and then take the log of the clipped # columns - workflow.add_cont_feature([Clip(min_value=0), LogOp()]) + columns = nvt.ColumnGroup(CONT_NAMES) >> Clip(min_value=0) >> LogOp() Parameters ---------- @@ -38,34 +37,20 @@ class Clip(TransformOperator): max_value : float, default None The maximum value to clip values to: values greater than this will be replaced with this value. Specifying ``None`` means don't apply a maximum threshold. - columns : list of str, default None - Continuous columns to target for this op. If None, the operation will target all known - continuous columns. - replace : bool, default False - Whether to replace existing columns or create new ones. 
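+
+    A rough sketch of combining ``Clip`` with other column groups under the new
+    API (illustrative only; ``CONT_NAMES`` and ``CAT_NAMES`` are assumed lists of
+    column names, and the ``Workflow`` construction assumes the reworked Workflow
+    accepts the final ColumnGroup)::
+
+        import nvtabular as nvt
+
+        conts = nvt.ColumnGroup(CONT_NAMES) >> Clip(min_value=0, max_value=1e6)
+        cats = nvt.ColumnGroup(CAT_NAMES) >> nvt.ops.Categorify()
+        workflow = nvt.Workflow(conts + cats)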
""" - default_in = CONT - default_out = CONT - - def __init__(self, min_value=None, max_value=None, columns=None, replace=True): + def __init__(self, min_value=None, max_value=None): if min_value is None and max_value is None: raise ValueError("Must specify a min or max value to clip to") - super().__init__(columns=columns, replace=replace) + super().__init__() self.min_value = min_value self.max_value = max_value @annotate("Clip_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): - cont_names = target_columns - if not cont_names: - return gdf - - z_gdf = gdf[cont_names] - z_gdf.columns = [f"{col}_{self._id}" for col in z_gdf.columns] + def transform(self, columns: list, gdf: cudf.DataFrame): + z_gdf = gdf[columns] if self.min_value is not None: z_gdf[z_gdf < self.min_value] = self.min_value if self.max_value is not None: z_gdf[z_gdf > self.max_value] = self.max_value - return z_gdf diff --git a/nvtabular/ops/column_similarity.py b/nvtabular/ops/column_similarity.py index 6b774c79451..51ce7999408 100644 --- a/nvtabular/ops/column_similarity.py +++ b/nvtabular/ops/column_similarity.py @@ -22,11 +22,10 @@ from cupyx.scipy.sparse import coo_matrix from nvtx import annotate -from .operator import CAT, CONT -from .transform_operator import TransformOperator +from .operator import Operator -class ColumnSimilarity(TransformOperator): +class ColumnSimilarity(Operator): """Calculates the similarity between two columns using tf-idf, cosine or inner product as the distance metric. For each row, this calculates the distance between the two columns by looking up features for those columns in a sparse matrix, @@ -46,68 +45,48 @@ class ColumnSimilarity(TransformOperator): Parameters ----------- - name : str - Name of the output column - a : str - Name of the first column to calculate similarity for - a_features : csr_matrix - Sparse feature matrix for the 'a' column - b : str - Name of the second column to calculate similarity for - b_features : csr_matrix, optional - Sparse feature matrix for the 'b' column. If not given will use the - same feature matrix as for 'a' (for example when calculating document-document distances) + left_features : csr_matrix + Sparse feature matrix for the left column + right_features : csr_matrix, optional + Sparse feature matrix for the right column in each pair. If not given will use the + same feature matrix as for the left (for example when calculating document-document i + distances) on_device : bool Whether to compute on the GPU or CPU. Computing on the GPU will be - faster, but requires that the a_features/b_features sparse matrices + faster, but requires that the left_features/right_features sparse matrices fit into GPU memory. 
""" - default_in = CAT - default_out = CONT - - def __init__( - self, name, a_col, a_features, b_col, b_features=None, metric="tfidf", on_device=True - ): - super(ColumnSimilarity, self).__init__(columns=[a_col, b_col], replace=False) - self.name = name - self.a_col = a_col - self.b_col = b_col - - self.a_features = _convert_features(a_features, metric, on_device) - self.b_features = ( - _convert_features(b_features, metric, on_device) - if b_features is not None - else self.a_features + def __init__(self, left_features, right_features=None, metric="tfidf", on_device=True): + super(ColumnSimilarity, self).__init__() + + self.left_features = _convert_features(left_features, metric, on_device) + self.right_features = ( + _convert_features(right_features, metric, on_device) + if right_features is not None + else self.left_features ) self.on_device = on_device @annotate("ColumnSimilarity_op", color="darkgreen", domain="nvt_python") - def apply_op( - self, - gdf: cudf.DataFrame, - columns_ctx: dict, - input_cols, - target_cols=["base"], - stats_context=None, - ): - a = gdf[self.a_col].values if self.on_device else gdf[self.a_col].values_host - b = gdf[self.b_col].values if self.on_device else gdf[self.b_col].values_host - - if len(a) and len(b): - similarities = row_wise_inner_product( - a, self.a_features, b, self.b_features, self.on_device - ) - else: - similarities = [] - gdf[self.name] = similarities + def transform(self, columns, gdf: cudf.DataFrame): + names = self.output_column_names(columns) + for name, (left, right) in zip(names, columns): + a = gdf[left].values if self.on_device else gdf[left].values_host + b = gdf[right].values if self.on_device else gdf[right].values_host + + if len(a) and len(b): + similarities = row_wise_inner_product( + a, self.left_features, b, self.right_features, self.on_device + ) + else: + similarities = [] + gdf[name] = similarities - columns_ctx[input_cols][self._id] = [self.name] return gdf - @property - def _id(self): - return f"{self.__class__.__name__}_{self.name}" + def output_column_names(self, columns): + return [f"{a}_{b}_sim" for a, b in columns] def row_wise_inner_product(a, a_features, b, b_features, on_device=True): diff --git a/nvtabular/ops/difference_lag.py b/nvtabular/ops/difference_lag.py index d00a60910f9..5b01f1ae65d 100644 --- a/nvtabular/ops/difference_lag.py +++ b/nvtabular/ops/difference_lag.py @@ -16,11 +16,10 @@ import cudf from nvtx import annotate -from .operator import CONT -from .transform_operator import TransformOperator +from .operator import Operator -class DifferenceLag(TransformOperator): +class DifferenceLag(Operator): """Calculates the difference between two consecutive rows of the dataset. For instance, this operator can calculate the time since a user last had another interaction. @@ -57,16 +56,13 @@ class DifferenceLag(TransformOperator): Whether to replace existing columns or create new ones. 
""" - default_in = CONT - default_out = CONT - def __init__(self, partition_cols, shift=1, columns=None, replace=False): super(DifferenceLag, self).__init__(columns=columns, replace=replace) self.partition_cols = partition_cols self.shifts = [shift] if isinstance(shift, int) else shift @annotate("DifferenceLag_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): + def transform(self, columns, gdf: cudf.DataFrame): # compute a mask indicating partition boundaries, handling multiple partition_cols # represent partition boundaries by None values output = {} @@ -76,6 +72,15 @@ def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None mask = mask.all(axis=1) mask[mask == False] = None # noqa - for col in target_columns: - output[f"{col}_{self._id}_{shift}"] = (gdf[col] - gdf[col].shift(shift)) * mask + for col in columns: + output[self._column_name(col, shift)] = (gdf[col] - gdf[col].shift(shift)) * mask return cudf.DataFrame(output) + + def dependencies(self): + return self.partition_cols + + def output_column_names(self, columns): + return [self._column_name(col, shift) for shift in self.shifts for col in columns] + + def _column_name(self, col, shift): + return f"{col}_difference_lag_{shift}" diff --git a/nvtabular/ops/dropna.py b/nvtabular/ops/dropna.py index 6a8245d5615..46d6fcdf06a 100644 --- a/nvtabular/ops/dropna.py +++ b/nvtabular/ops/dropna.py @@ -16,11 +16,10 @@ import cudf from nvtx import annotate -from .operator import ALL -from .transform_operator import TransformOperator +from .operator import Operator -class Dropna(TransformOperator): +class Dropna(Operator): """ This operation detects missing values, and filters out rows with null values. @@ -44,20 +43,12 @@ class Dropna(TransformOperator): for null values. """ - default_in = ALL - default_out = ALL - @annotate("Dropna_op", color="darkgreen", domain="nvt_python") - def apply_op( + def transform( self, + columns, gdf: cudf.DataFrame, - columns_ctx: dict, - input_cols, - target_cols=["base"], - stats_context=None, ): - target_columns = self.get_columns(columns_ctx, input_cols, target_cols) - new_gdf = gdf.dropna(subset=target_columns or None) + new_gdf = gdf.dropna(subset=columns or None) new_gdf.reset_index(drop=True, inplace=True) - self.update_columns_ctx(columns_ctx, input_cols, new_gdf.columns, target_columns) return new_gdf diff --git a/nvtabular/ops/fill.py b/nvtabular/ops/fill.py index 65f6498e139..c00fc75abe5 100644 --- a/nvtabular/ops/fill.py +++ b/nvtabular/ops/fill.py @@ -16,12 +16,11 @@ import cudf from nvtx import annotate -from .median import Median -from .operator import CONT -from .transform_operator import DFOperator +from .operator import Operator +from .stat_operator import StatOperator -class FillMissing(DFOperator): +class FillMissing(Operator): """ This operation replaces missing values with a constant pre-defined value @@ -42,35 +41,18 @@ class FillMissing(DFOperator): ----------- fill_val : float, default 0 The constant value to replace missing values with. - columns : list of str, default None - Continuous columns to target for this op. If None, the operation will target all known - continuous columns. - replace : bool, default True - Whether to replace existing columns or create new ones. 
""" - default_in = CONT - default_out = CONT - - def __init__(self, fill_val=0, columns=None, replace=True): - super().__init__(columns=columns, replace=replace) + def __init__(self, fill_val=0): + super().__init__() self.fill_val = fill_val - @property - def req_stats(self): - return [] - @annotate("FillMissing_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): - cont_names = target_columns - if not cont_names: - return gdf - z_gdf = gdf[cont_names].fillna(self.fill_val) - z_gdf.columns = [f"{col}_{self._id}" for col in z_gdf.columns] - return z_gdf + def transform(self, columns, gdf: cudf.DataFrame): + return gdf[columns].fillna(self.fill_val) -class FillMedian(DFOperator): +class FillMedian(StatOperator): """ This operation replaces missing values with the median value for the column. @@ -85,31 +67,28 @@ class FillMedian(DFOperator): # Add FillMedian to the workflow for continuous columns proc.add_cont_feature(nvt.ops.FillMedian()) - - Parameters - ----------- - columns : list of str, default None - Continuous columns to target for this op. If None, the operation will target all known - continuous columns. - replace : bool, default True - Whether to replace existing columns or create new ones. """ - default_in = CONT - default_out = CONT - - @property - def req_stats(self): - return [Median(columns=self.columns)] - - @annotate("FillMedian_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): - if not target_columns: - return gdf - - new_gdf = cudf.DataFrame() - for col in target_columns: - stat_val = stats_context["medians"][col] - new_gdf[col] = gdf[col].fillna(stat_val) - new_gdf.columns = [f"{col}_{self._id}" for col in new_gdf.columns] - return new_gdf + def __init__(self): + super().__init__() + self.medians = {} + + @annotate("FillMedian_transform", color="darkgreen", domain="nvt_python") + def transform(self, columns, gdf: cudf.DataFrame): + if not self.medians: + raise RuntimeError("need to call 'fit' before running transform") + + for col in columns: + gdf[col] = gdf[col].fillna(self.medians[col]) + return gdf + + @annotate("FillMedian_fit", color="green", domain="nvt_python") + def fit(self, columns, ddf): + # TODO: Use `method="tidigest"` when crick supports device + dask_stats = ddf[columns].quantile(q=0.5, method="dask") + return dask_stats + + @annotate("FillMedian_finalize", color="green", domain="nvt_python") + def fit_finalize(self, dask_stats): + for col in dask_stats.index.values_host: + self.medians[col] = float(dask_stats[col]) diff --git a/nvtabular/ops/filter.py b/nvtabular/ops/filter.py index 5fc5469a43d..e289196b60c 100644 --- a/nvtabular/ops/filter.py +++ b/nvtabular/ops/filter.py @@ -16,11 +16,10 @@ import cudf from nvtx import annotate -from .operator import ALL -from .transform_operator import TransformOperator +from .operator import Operator -class Filter(TransformOperator): +class Filter(Operator): """ Filters rows from the dataset. This works by taking a callable that takes a dataframe, and returns a dataframe with unwanted rows filtered out. @@ -36,11 +35,8 @@ class Filter(TransformOperator): dataframe with unwanted rows filtered out. """ - default_in = ALL - default_out = ALL - def __init__(self, f): - super().__init__(replace=True) + super().__init__() if f is None: raise ValueError("f cannot be None. 
Filter op applies f to dataframe") self.f = f @@ -48,11 +44,8 @@ def __init__(self, f): @annotate("Filter_op", color="darkgreen", domain="nvt_python") def apply_op( self, + columns, gdf: cudf.DataFrame, - columns_ctx: dict, - input_cols, - target_cols=["base"], - stats_context=None, ): filtered = self.f(gdf) if isinstance(filtered, cudf.DataFrame): diff --git a/nvtabular/ops/groupby_statistics.py b/nvtabular/ops/groupby_statistics.py index e1782f7c937..b7dec9d143d 100644 --- a/nvtabular/ops/groupby_statistics.py +++ b/nvtabular/ops/groupby_statistics.py @@ -15,7 +15,6 @@ # import cupy import numpy as np -from dask.core import flatten from dask.delayed import Delayed from . import categorify as nvt_cat @@ -85,37 +84,18 @@ def __init__( self, cont_names=None, stats=None, - columns=None, fold_groups=None, tree_width=None, out_path=None, on_host=True, freq_threshold=None, - stat_name=None, concat_groups=False, name_sep="_", fold_name="__fold__", fold_seed=42, kfold=None, ): - # Set column_groups if the user has passed in a list of columns - self.column_groups = None - if isinstance(columns, str): - columns = [columns] - if isinstance(columns, list): - self.column_groups = columns - columns = list(set(flatten(columns, container=list))) - - # Add fold_groups to columns - if fold_groups and kfold > 1: - fold_groups = [fold_groups] if isinstance(fold_groups, str) else fold_groups - columns = columns or [] - self.column_groups = self.column_groups or [] - for col in list(set(flatten(fold_groups, container=list))): - if col not in columns: - columns.append(col) - - super(GroupbyStatistics, self).__init__(columns) + super(GroupbyStatistics, self).__init__() self.cont_names = cont_names or [] self.stats = stats or [] self.categories = {} @@ -123,8 +103,6 @@ def __init__( self.on_host = on_host self.freq_threshold = freq_threshold self.out_path = out_path or "./" - self.stat_name = stat_name or "categories" - self.op_name = "GroupbyStatistics-" + self.stat_name self.concat_groups = concat_groups self.name_sep = name_sep self.kfold = kfold or 3 @@ -132,23 +110,14 @@ def __init__( self.fold_seed = fold_seed self.fold_groups = fold_groups - @property - def _id(self): - c_id = self._id_set - if not self._id_set: - c_id = str(self.op_name) - return c_id - - def stat_logic(self, ddf, columns_ctx, input_cols, target_cols): - if self.column_groups is None: - col_groups = self.get_columns(columns_ctx, input_cols, target_cols) - else: - col_groups = self.column_groups.copy() + def fit(self, columns, ddf): supported_ops = ["count", "sum", "mean", "std", "var", "min", "max"] for op in self.stats: if op not in supported_ops: raise ValueError(op + " operation is not supported.") + # TODO: move all this 'fold' stuff into TargetEncoding + col_groups = columns if self.fold_groups and self.kfold > 1: # Add new fold column if necessary if self.fold_name not in ddf.columns: @@ -176,6 +145,7 @@ def _add_fold(s, kfold, fold_seed): self._ddf_out = ddf # Add new col_groups with fold + for group in self.fold_groups: if isinstance(group, list): col_groups.append([self.fold_name] + group) @@ -197,7 +167,6 @@ def _add_fold(s, kfold, fold_seed): self.freq_threshold, self.tree_width, self.on_host, - stat_name=self.stat_name, concat_groups=self.concat_groups, name_sep=self.name_sep, ) diff --git a/nvtabular/ops/hash_bucket.py b/nvtabular/ops/hash_bucket.py index f0075ae6201..651ecd91617 100644 --- a/nvtabular/ops/hash_bucket.py +++ b/nvtabular/ops/hash_bucket.py @@ -17,12 +17,11 @@ from cudf.utils.dtypes import is_list_dtype from 
nvtx import annotate -from .categorify import SetBuckets, _encode_list_column -from .operator import CAT -from .transform_operator import DFOperator +from .categorify import _encode_list_column +from .operator import Operator -class HashBucket(DFOperator): +class HashBucket(Operator): """ This op maps categorical columns to a contiguous integer range by first hashing the column then modulating by the number of @@ -93,10 +92,7 @@ class HashBucket(DFOperator): an `int`. """ - default_in = CAT - default_out = CAT - - def __init__(self, num_buckets, columns=None, **kwargs): + def __init__(self, num_buckets): if isinstance(num_buckets, dict): columns = [i for i in num_buckets.keys()] self.num_buckets = num_buckets @@ -112,32 +108,18 @@ def __init__(self, num_buckets, columns=None, **kwargs): type(num_buckets) ) ) - super(HashBucket, self).__init__(columns=columns, **kwargs) + super(HashBucket, self).__init__() @annotate("HashBucket_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): - cat_names = target_columns + def transform(self, columns, gdf: cudf.DataFrame): if isinstance(self.num_buckets, int): - num_buckets = {name: self.num_buckets for name in cat_names} + num_buckets = {name: self.num_buckets for name in columns} else: num_buckets = self.num_buckets - new_gdf = cudf.DataFrame() for col, nb in num_buckets.items(): - new_col = f"{col}_{self._id}" if is_list_dtype(gdf[col].dtype): - encoded = _encode_list_column(gdf[col], gdf[col].list.leaves.hash_values() % nb) + gdf[col] = _encode_list_column(gdf[col], gdf[col].list.leaves.hash_values() % nb) else: - encoded = gdf[col].hash_values() % nb - - new_gdf[new_col] = encoded - return new_gdf - - @property - def req_stats(self): - return [ - SetBuckets( - columns=self.columns, - num_buckets=self.num_buckets, - ) - ] + gdf[col] = gdf[col].hash_values() % nb + return gdf diff --git a/nvtabular/ops/hashed_cross.py b/nvtabular/ops/hashed_cross.py index d9df21cfbe7..e2839bd183e 100644 --- a/nvtabular/ops/hashed_cross.py +++ b/nvtabular/ops/hashed_cross.py @@ -16,42 +16,25 @@ import cudf from nvtx import annotate -from .operator import CAT -from .transform_operator import TransformOperator +from .operator import Operator -class HashedCross(TransformOperator): - """""" - - default_in = CAT - default_out = CAT - - def __init__(self, crosses, num_buckets=None, **kwargs): - if isinstance(crosses, dict): - cross_sets = list(crosses.keys()) - num_buckets = [crosses[c] for c in cross_sets] - crosses = cross_sets - else: - if num_buckets is None: - raise ValueError("Must provide `num_buckets` if crosses is not dict") - assert len(num_buckets) == len(crosses) - assert all([isinstance(c, (tuple, list)) for c in crosses]) - assert all([len(c) > 1 for c in crosses]) - - kwargs["replace"] = False - kwargs["columns"] = list(set([i for j in crosses for i in j])) - super().__init__(**kwargs) - - self.crosses = crosses +class HashedCross(Operator): + def __init__(self, num_buckets): + super().__init__() self.num_buckets = num_buckets @annotate("HashedCross_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): + def op_logic(self, columns, gdf: cudf.DataFrame): new_gdf = cudf.DataFrame() - for columns, bucket_size in zip(self.crosses, self.num_buckets): + for cross in columns: val = 0 - for column in columns: + for column in cross: val ^= gdf[column].hash_values() # or however we want to do this aggregation - val = 
val % bucket_size - new_gdf["_X_".join(columns)] = val + # TODO: support different size buckets per cross + val = val % self.bucket_size + new_gdf["_X_".join(cross)] = val return new_gdf + + def output_column_names(self, columns): + return ["_X_".join(cross) for cross in columns] diff --git a/nvtabular/ops/join_external.py b/nvtabular/ops/join_external.py index ea536d0b157..2c0631a09e4 100644 --- a/nvtabular/ops/join_external.py +++ b/nvtabular/ops/join_external.py @@ -20,11 +20,10 @@ from nvtabular.worker import fetch_table_data, get_worker_cache -from .operator import ALL -from .transform_operator import TransformOperator +from .operator import Operator -class JoinExternal(TransformOperator): +class JoinExternal(Operator): """ Join each dataset partition to an external table. For performance reasons, only "left" and "inner" join transformations are supported. @@ -77,9 +76,6 @@ class JoinExternal(TransformOperator): if the data is originally stored on disk. """ - default_in = ALL - default_out = ALL - def __init__( self, df_ext, @@ -144,15 +140,11 @@ def _ext(self): return _ext - def apply_op( + def transform( self, + columns, gdf: cudf.DataFrame, - columns_ctx: dict, - input_cols, - target_cols=["base"], - stats_context=None, ): - target_columns = self.get_columns(columns_ctx, input_cols, target_cols) tmp = "__tmp__" # Temporary column for sorting gdf[tmp] = cupy.arange(len(gdf), dtype="int32") new_gdf = gdf.merge(self._ext, left_on=self.on, right_on=self.on_ext, how=self.how) @@ -160,9 +152,13 @@ def apply_op( new_gdf.drop(columns=[tmp], inplace=True) gdf.drop(columns=[tmp], inplace=True) new_gdf.reset_index(drop=True, inplace=True) - self.update_columns_ctx(columns_ctx, input_cols, new_gdf.columns, target_columns) return new_gdf + def output_column_names(self, columns): + if self.ext_columns: + return columns + self.ext_columns + return columns + self._ext.columns + def _detect_format(data): """Utility to detect the format of `data`""" diff --git a/nvtabular/ops/join_groupby.py b/nvtabular/ops/join_groupby.py index a2896cc5eeb..064a4d4a2cb 100644 --- a/nvtabular/ops/join_groupby.py +++ b/nvtabular/ops/join_groupby.py @@ -19,11 +19,11 @@ from . import categorify as nvt_cat from .groupby_statistics import GroupbyStatistics -from .operator import CAT -from .transform_operator import DFOperator +from .stat_operator import StatOperator -class JoinGroupby(DFOperator): +# TODO: statoperator new api +class JoinGroupby(StatOperator): """ One of the ways to create new features is to calculate the basic statistics of the data that is grouped by categorical @@ -74,9 +74,6 @@ class JoinGroupby(DFOperator): for multi-column groups. """ - default_in = CAT - default_out = CAT - def __init__( self, cont_names=None, diff --git a/nvtabular/ops/lambdaop.py b/nvtabular/ops/lambdaop.py index 3e86064f9b2..2da4c0fa8ff 100644 --- a/nvtabular/ops/lambdaop.py +++ b/nvtabular/ops/lambdaop.py @@ -18,11 +18,10 @@ import cudf from nvtx import annotate -from .operator import ALL -from .transform_operator import TransformOperator +from .operator import Operator -class LambdaOp(TransformOperator): +class LambdaOp(Operator): """ LambdaOp allows you to apply row level functions to an NVTabular workflow. @@ -76,33 +75,19 @@ class LambdaOp(TransformOperator): Whether to replace existing columns or create new ones. 
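+
+    A rough usage sketch under the new API (illustrative only; ``"price"`` is
+    an assumed column name). Note that a bare callable shifted onto a
+    ColumnGroup is converted to a ``LambdaOp`` implicitly::
+
+        import nvtabular as nvt
+
+        # explicit construction
+        scaled = nvt.ColumnGroup(["price"]) >> LambdaOp(lambda col: col * 100)
+
+        # implicit conversion of a bare callable
+        scaled = nvt.ColumnGroup(["price"]) >> (lambda col: col * 100)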
""" - default_in = ALL - default_out = ALL - - def __init__(self, op_name, f, columns=None, replace=True): - super().__init__(columns=columns, replace=replace) - if op_name is None: - raise ValueError("op_name cannot be None. It is required for naming the column.") + def __init__(self, f): + super().__init__() if f is None: raise ValueError("f cannot be None. LambdaOp op applies f to dataframe") self.f = f - self.op_name = op_name self._param_count = len(signature(self.f).parameters) if self._param_count not in (1, 2): raise ValueError("lambda function must accept either one or two parameters") - @property - def _id(self): - c_id = self._id_set - if not self._id_set: - c_id = str(self.op_name) - return c_id - @annotate("DFLambda_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): + def transform(self, columns, gdf: cudf.DataFrame): new_gdf = cudf.DataFrame() - - for col in target_columns: + for col in columns: if self._param_count == 2: new_gdf[col] = self.f(gdf[col], gdf) elif self._param_count == 1: @@ -110,6 +95,4 @@ def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None else: # shouldn't ever happen, raise RuntimeError(f"unhandled lambda param count {self._param_count}") - - new_gdf.columns = [f"{col}_{self.op_name}" for col in new_gdf.columns] return new_gdf diff --git a/nvtabular/ops/logop.py b/nvtabular/ops/logop.py index 6225fcc2c90..73ba45892f4 100644 --- a/nvtabular/ops/logop.py +++ b/nvtabular/ops/logop.py @@ -17,11 +17,10 @@ import numpy as np from nvtx import annotate -from .operator import CONT -from .transform_operator import TransformOperator +from .operator import Operator -class LogOp(TransformOperator): +class LogOp(Operator): """ This operator calculates the log of continuous columns. Note that to handle the common case of zerofilling null values, this @@ -48,15 +47,6 @@ class LogOp(TransformOperator): Whether to replace existing columns or create new ones. """ - default_in = CONT - default_out = CONT - @annotate("LogOp_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): - cont_names = target_columns - if not cont_names: - return gdf - new_gdf = np.log(gdf[cont_names].astype(np.float32) + 1) - new_cols = [f"{col}_{self._id}" for col in new_gdf.columns] - new_gdf.columns = new_cols - return new_gdf + def transform(self, columns, gdf: cudf.DataFrame): + return np.log(gdf[columns].astype(np.float32) + 1) diff --git a/nvtabular/ops/median.py b/nvtabular/ops/median.py deleted file mode 100644 index 23eb442b213..00000000000 --- a/nvtabular/ops/median.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from nvtx import annotate - -from .stat_operator import StatOperator - - -class Median(StatOperator): - """ - This operation calculates median of features. 
- - Parameters - ----------- - columns : - fill : float, default None - batch_medians : list, default None - medians : list, default None - """ - - def __init__(self, columns=None, fill=None, batch_medians=None, medians=None): - super().__init__(columns=columns) - self.fill = fill - self.batch_medians = batch_medians if batch_medians is not None else {} - self.medians = medians if medians is not None else {} - - @annotate("Median_op", color="green", domain="nvt_python") - def stat_logic(self, ddf, columns_ctx, input_cols, target_cols): - cols = self.get_columns(columns_ctx, input_cols, target_cols) - # TODO: Use `method="tidigest"` when crick supports device - dask_stats = ddf[cols].quantile(q=0.5, method="dask") - return dask_stats - - @annotate("Median_finalize", color="green", domain="nvt_python") - def finalize(self, dask_stats): - for col in dask_stats.index.values_host: - self.medians[col] = float(dask_stats[col]) - - def registered_stats(self): - return ["medians"] - - def stats_collected(self): - result = [("medians", self.medians)] - return result - - def clear(self): - self.batch_medians = {} - self.medians = {} - return diff --git a/nvtabular/ops/minmax.py b/nvtabular/ops/minmax.py deleted file mode 100644 index 72e41d3e020..00000000000 --- a/nvtabular/ops/minmax.py +++ /dev/null @@ -1,72 +0,0 @@ -# -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from nvtx import annotate - -from .stat_operator import StatOperator - - -class MinMax(StatOperator): - """ - MinMax operation calculates min and max statistics of features. 
- - Parameters - ----------- - columns : - batch_mins : list of float, default None - batch_maxs : list of float, default None - mins : list of float, default None - maxs : list of float, default None - """ - - def __init__(self, columns=None, batch_mins=None, batch_maxs=None, mins=None, maxs=None): - super().__init__(columns=columns) - self.batch_mins = batch_mins if batch_mins is not None else {} - self.batch_maxs = batch_maxs if batch_maxs is not None else {} - self.mins = mins if mins is not None else {} - self.maxs = maxs if maxs is not None else {} - - @annotate("MinMax_op", color="green", domain="nvt_python") - def stat_logic(self, ddf, columns_ctx, input_cols, target_cols): - cols = self.get_columns(columns_ctx, input_cols, target_cols) - dask_stats = {} - dask_stats["mins"] = ddf[cols].min() - dask_stats["maxs"] = ddf[cols].max() - return dask_stats - - @annotate("MinMax_finalize", color="green", domain="nvt_python") - def finalize(self, stats): - for col in stats["mins"].index.values_host: - self.mins[col] = stats["mins"][col] - self.maxs[col] = stats["maxs"][col] - - def registered_stats(self): - return ["mins", "maxs", "batch_mins", "batch_maxs"] - - def stats_collected(self): - result = [ - ("mins", self.mins), - ("maxs", self.maxs), - ("batch_mins", self.batch_mins), - ("batch_maxs", self.batch_maxs), - ] - return result - - def clear(self): - self.batch_mins = {} - self.batch_maxs = {} - self.mins = {} - self.maxs = {} - return diff --git a/nvtabular/ops/moments.py b/nvtabular/ops/moments.py index e4f14a8d06f..0691fa424b9 100644 --- a/nvtabular/ops/moments.py +++ b/nvtabular/ops/moments.py @@ -22,63 +22,6 @@ from dask.dataframe.core import _concat from dask.delayed import Delayed from dask.highlevelgraph import HighLevelGraph -from nvtx import annotate - -from .stat_operator import StatOperator - - -class Moments(StatOperator): - """ - Moments operation calculates some of the statistics of features including - mean, variance, standarded deviation, and count. 
- - Parameters - ----------- - columns : - counts : list of float, default None - means : list of float, default None - varis : list of float, default None - stds : list of float, default None - """ - - def __init__(self, columns=None, counts=None, means=None, varis=None, stds=None): - super().__init__(columns=columns) - self.counts = counts if counts is not None else {} - self.means = means if means is not None else {} - self.varis = varis if varis is not None else {} - self.stds = stds if stds is not None else {} - - @annotate("Moments_op", color="green", domain="nvt_python") - def stat_logic(self, ddf, columns_ctx, input_cols, target_cols): - cols = self.get_columns(columns_ctx, input_cols, target_cols) - return _custom_moments(ddf[cols]) - - @annotate("Moments_finalize", color="green", domain="nvt_python") - def finalize(self, dask_stats): - for col in dask_stats.index: - self.counts[col] = float(dask_stats["count"].loc[col]) - self.means[col] = float(dask_stats["mean"].loc[col]) - self.stds[col] = float(dask_stats["std"].loc[col]) - self.varis[col] = float(dask_stats["var"].loc[col]) - - def registered_stats(self): - return ["means", "stds", "vars", "counts"] - - def stats_collected(self): - result = [ - ("means", self.means), - ("stds", self.stds), - ("vars", self.varis), - ("counts", self.counts), - ] - return result - - def clear(self): - self.counts = {} - self.means = {} - self.varis = {} - self.stds = {} - return def _custom_moments(ddf, split_every=32): diff --git a/nvtabular/ops/normalize.py b/nvtabular/ops/normalize.py index b80f58219fa..de4cf009d38 100644 --- a/nvtabular/ops/normalize.py +++ b/nvtabular/ops/normalize.py @@ -16,13 +16,11 @@ import cudf from nvtx import annotate -from .minmax import MinMax -from .moments import Moments -from .operator import CONT -from .transform_operator import DFOperator +from .moments import _custom_moments +from .stat_operator import StatOperator -class Normalize(DFOperator): +class Normalize(StatOperator): """ Standardizing the features around 0 with a standard deviation of 1 is a common technique to compare measurements that have @@ -52,34 +50,31 @@ class Normalize(DFOperator): Whether to replace existing columns or create new ones. 
""" - default_in = CONT - default_out = CONT + def __init__(self): + super().__init__() + self.means = {} + self.stds = {} - @property - def req_stats(self): - return [Moments(columns=self.columns)] + @annotate("Normalize_fit", color="green", domain="nvt_python") + def fit(self, columns, ddf): + return _custom_moments(ddf[columns]) + + def fit_finalize(self, dask_stats): + for col in dask_stats.index: + self.means[col] = float(dask_stats["mean"].loc[col]) + self.stds[col] = float(dask_stats["std"].loc[col]) @annotate("Normalize_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): - cont_names = target_columns - if not cont_names or not stats_context["stds"]: - return - gdf = self.apply_mean_std(gdf, stats_context, cont_names) - return gdf - - def apply_mean_std(self, gdf, stats_context, cont_names): + def transform(self, columns, gdf: cudf.DataFrame): new_gdf = cudf.DataFrame() - for name in cont_names: - if stats_context["stds"][name] > 0: - new_col = f"{name}_{self._id}" - new_gdf[new_col] = (gdf[name] - stats_context["means"][name]) / ( - stats_context["stds"][name] - ) - new_gdf[new_col] = new_gdf[new_col].astype("float32") + for name in columns: + if self.stds[name] > 0: + new_gdf[name] = (gdf[name] - self.means[name]) / (self.stds[name]) + new_gdf[name] = new_gdf[name].astype("float32") return new_gdf -class NormalizeMinMax(DFOperator): +class NormalizeMinMax(StatOperator): """ Standardizing the features around 0 with a standard deviation of 1 is a common technique to compare measurements that have @@ -109,29 +104,31 @@ class NormalizeMinMax(DFOperator): Whether to replace existing columns or create new ones. """ - default_in = CONT - default_out = CONT - - @property - def req_stats(self): - return [MinMax(columns=self.columns)] + def __init__(self): + self.mins = {} + self.maxs = {} @annotate("NormalizeMinMax_op", color="darkgreen", domain="nvt_python") - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): - cont_names = target_columns - if not cont_names or not stats_context["mins"]: - return - gdf = self.apply_min_max(gdf, stats_context, cont_names) - return gdf - - def apply_min_max(self, gdf, stats_context, cont_names): + def transform(self, columns, gdf: cudf.DataFrame): new_gdf = cudf.DataFrame() - for name in cont_names: - dif = stats_context["maxs"][name] - stats_context["mins"][name] - new_col = f"{name}_{self._id}" + for name in columns: + dif = self.maxs[name] - self.mins[name] if dif > 0: - new_gdf[new_col] = (gdf[name] - stats_context["mins"][name]) / dif + new_gdf[name] = (gdf[name] - self.mins[name]) / dif elif dif == 0: - new_gdf[new_col] = gdf[name] / (2 * gdf[name]) - new_gdf[new_col] = new_gdf[new_col].astype("float32") + new_gdf[name] = gdf[name] / (2 * gdf[name]) + new_gdf[name] = new_gdf[name].astype("float32") return new_gdf + + @annotate("MinMax_op", color="green", domain="nvt_python") + def fit(self, columns, ddf): + return { + "mins": ddf[columns].min(), + "maxs": ddf[columns].max(), + } + + @annotate("MinMax_finalize", color="green", domain="nvt_python") + def fit_finalize(self, dask_stats): + for col in dask_stats["mins"].index.values_host: + self.mins[col] = dask_stats["mins"][col] + self.maxs[col] = dask_stats["maxs"][col] diff --git a/nvtabular/ops/operator.py b/nvtabular/ops/operator.py index 9d8889f649a..8099262d75c 100644 --- a/nvtabular/ops/operator.py +++ b/nvtabular/ops/operator.py @@ -13,11 +13,6 @@ # See the License for the specific 
language governing permissions and # limitations under the License. # -import warnings - -CONT = "continuous" -CAT = "categorical" -ALL = "all" class Operator: @@ -25,38 +20,53 @@ class Operator: Base class for all operator classes. """ - def __init__(self, columns=None): - self.columns = columns - self._id_set = None - - @property - def _id(self): - c_id = self._id_set - if not self._id_set: - c_id = str(self.__class__.__name__) - return c_id - - def _set_id(self, id_to_set): - # only set one time - if not self._id_set: - self._id_set = id_to_set - else: - warnings.warn( - f"trying to reset operator id, for operator: {self._id_set} to {id_to_set}" - ) - - def describe(self): - raise NotImplementedError("All operators must have a desription.") - - def get_columns(self, cols_ctx, cols_grp, target_cols): - # providing any operator with direct list of columns overwrites cols dict - # burden on user to ensure columns exist in dataset (as discussed) - if self.columns: - return self.columns - tar_cols = [] - for tar in target_cols: - if tar in cols_ctx[cols_grp].keys(): - tar_cols = tar_cols + cols_ctx[cols_grp][tar] - if len(tar_cols) < 1: - tar_cols = cols_ctx[cols_grp]["base"] - return tar_cols + def transform(self, columns, gdf): + """Transform the dataframe by applying this operator to the set of input columns + + Parameters + ----------- + columns: list of str or list of list of str + The columns to apply this operator to + gdf: Dataframe + A cudf dataframe that this operator will work on + + Returns + ------- + Dataframe + Returns a transformed dataframe for this operator + + """ + raise NotImplementedError + + def output_column_names(self, columns): + """Given a set of columns names returns the names of the transformed columns this + operator will produce + + Parameters + ----------- + columns: list of str, or list of list of str + The columns to apply this operator to + + Returns + ------- + list of str, or list of list of str + The names of columns produced by this operator + + """ + return columns + + def dependencies(self): + """Defines an optional list of column dependencies for this operator. This lets you consume columns + that aren't part of the main transformation workflow. + + Returns + ------- + str, list of str, ColumnGroup or None + Extra dependencies of this operator. Defaults to None + """ + return None + + def __rrshift__(self, other): + import nvtabular + + return nvtabular.ColumnGroup(other) >> self diff --git a/nvtabular/ops/rename.py b/nvtabular/ops/rename.py new file mode 100644 index 00000000000..aaf562f5973 --- /dev/null +++ b/nvtabular/ops/rename.py @@ -0,0 +1,48 @@ +# +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
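The slimmed-down Operator base class above reduces a transform op to a transform(columns, gdf) method, an optional output_column_names hook, and an optional dependencies declaration, while __rrshift__ lets a plain list of column names be shifted onto an operator instance. Below is a hedged sketch of a custom operator written against this interface; the Square op and its column names are hypothetical and not part of the patch.

import cudf
from nvtabular.ops import Operator

class Square(Operator):
    # hypothetical op: squares each input column and suffixes the output name
    def transform(self, columns, gdf: cudf.DataFrame):
        new_gdf = cudf.DataFrame()
        for col in columns:
            new_gdf[f"{col}_squared"] = gdf[col] ** 2
        return new_gdf

    def output_column_names(self, columns):
        return [f"{col}_squared" for col in columns]

# __rrshift__ converts the plain list of column names into a ColumnGroup
squared = ["x", "y"] >> Square()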
+# +from .operator import Operator + + +class Rename(Operator): + """This operation renames columns, either by using a user defined lambda function to + transform column names, or by appending a postfix string to every column name + + Parameters + ---------- + f : callable, optional + Function that takes a column name and returns a new column name + postfix : str, optional + If set each column name in the output will have this string appended to it + """ + + def __init__(self, f=None, postfix=None): + if not f and postfix is None: + raise ValueError("must specify either f or postfix for Rename op") + + self.postfix = postfix + self.f = f + + def transform(self, columns, gdf): + gdf.columns = self.output_column_names(columns) + return gdf + + def output_column_names(self, columns): + if self.f: + return [self.f(col) for col in columns] + elif self.postfix: + return [col + self.postfix for col in columns] + else: + raise RuntimeError("invalid rename op state found") diff --git a/nvtabular/ops/stat_operator.py b/nvtabular/ops/stat_operator.py index 69ee1f870ce..01b9f614955 100644 --- a/nvtabular/ops/stat_operator.py +++ b/nvtabular/ops/stat_operator.py @@ -18,33 +18,23 @@ class StatOperator(Operator): """ - Base class for statistical operator classes. + Base class for statistical operator classes. This adds a 'fit' and 'finalize' method + on top of """ - def __init__(self, columns=None): - super(StatOperator, self).__init__(columns) + def __init__(self): + super(StatOperator, self).__init__() self._ddf_out = None - def stat_logic(self, ddf, columns_ctx, input_cols, target_cols): + def fit(self, columns, ddf): raise NotImplementedError( """The dask operations needed to return a dictionary of uncomputed statistics.""" ) - def finalize(self, dask_stats): + def fit_finalize(self, dask_stats): raise NotImplementedError( """Follow-up operations to convert dask statistics in to member variables""" ) - def registered_stats(self): - raise NotImplementedError( - """Should return a list of statistics this operator will collect. - The list is comprised of simple string values.""" - ) - - def stats_collected(self): - raise NotImplementedError( - """Should return a list of tuples of name and statistics operator.""" - ) - def clear(self): raise NotImplementedError("""zero and reinitialize all relevant statistical properties""") diff --git a/nvtabular/ops/target_encoding.py b/nvtabular/ops/target_encoding.py index fd075059583..c2492ed9ff0 100644 --- a/nvtabular/ops/target_encoding.py +++ b/nvtabular/ops/target_encoding.py @@ -17,13 +17,10 @@ import cupy from . import categorify as nvt_cat -from .groupby_statistics import GroupbyStatistics -from .moments import Moments -from .operator import ALL -from .transform_operator import DFOperator +from .stat_operator import StatOperator -class TargetEncoding(DFOperator): +class TargetEncoding(StatOperator): """ Target encoding is a common feature-engineering technique for categorical columns in tabular datasets. For each categorical group, @@ -63,26 +60,18 @@ class TargetEncoding(DFOperator): proc.add_feature( TargetEncoding( cat_groups = ['cat1', 'cat2', ['cat2','cat3']], - cont_target = LABEL_COLUMNS, + target = LABEL_COLUMNS, kfold = 5, p_smooth = 20) ) Parameters ----------- - cat_groups : list of column-groups - Columns, or column groups, to target encode. A single encoding - will include multiple categorical columns if the column names are - enclosed within two layers of square brackets. 
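StatOperator now splits statistics gathering into fit, which returns an uncomputed dask result, and fit_finalize, which turns the computed result into operator state; the converted Normalize op earlier in this patch follows the same pattern. Here is a rough sketch of a custom StatOperator that reuses the quantile logic from the removed Median op; the MedianFill name and its behaviour are assumptions for illustration only.

import cudf
from nvtabular.ops import StatOperator

class MedianFill(StatOperator):
    # hypothetical op: fills nulls with each column's median
    def __init__(self):
        super().__init__()
        self.medians = {}

    def fit(self, columns, ddf):
        # return a lazy dask result; Workflow.fit computes it and passes the
        # computed value on to fit_finalize
        return ddf[columns].quantile(q=0.5, method="dask")

    def fit_finalize(self, dask_stats):
        for col in dask_stats.index.values_host:
            self.medians[col] = float(dask_stats[col])

    def transform(self, columns, gdf: cudf.DataFrame):
        for col in columns:
            gdf[col] = gdf[col].fillna(self.medians[col])
        return gdf

filled = ["price", "age"] >> MedianFill()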
For example, - `["a", "b"]` means "a" and "b" will be separately encoded, while - `[["a", "b"]]` means they will be encoded as a single group. - Note that the same column can be used for multiple encodings. - For example, `["a", ["a", "b"]]` is valid. - cont_target : str + target : str Continuous target column to use for the encoding of cat_groups. The same continuous target will be used for all `cat_groups`. target_mean : float - Global mean of the cont_target column to use for encoding. + Global mean of the target column to use for encoding. Supplying this value up-front will improve performance. kfold : int, default 3 Number of cross-validation folds to use while gathering @@ -100,8 +89,6 @@ class TargetEncoding(DFOperator): elements must be unique). out_dtype : str, default is problem-specific dtype of output target-encoding columns. - replace : bool, default False - This parameter is ignored tree_width : dict or int, optional Passed to `GroupbyStatistics` dependency. out_path : str, optional @@ -113,20 +100,15 @@ class TargetEncoding(DFOperator): for multi-column groups. """ - default_in = ALL - default_out = ALL - def __init__( self, - cat_groups, - cont_target, + target, target_mean=None, kfold=None, fold_seed=42, p_smooth=20, out_col=None, out_dtype=None, - replace=False, tree_width=None, cat_cache="host", out_path=None, @@ -135,13 +117,14 @@ def __init__( stat_name=None, drop_folds=True, ): - super().__init__(replace=replace) - # Make sure cat_groups is a list of lists - self.cat_groups = cat_groups if isinstance(cat_groups, list) else [cat_groups] - for i in range(len(self.cat_groups)): - if not isinstance(self.cat_groups[i], list): - self.cat_groups[i] = [self.cat_groups[i]] - self.cont_target = [cont_target] if isinstance(cont_target, str) else cont_target + super().__init__() + + self.target = [target] if isinstance(target, str) else target + self.dependency = self.target + + if hasattr(self.target, "columns"): + self.target = self.target.columns + self.target_mean = target_mean self.kfold = kfold or 3 self.fold_seed = fold_seed @@ -156,33 +139,21 @@ def __init__( self.drop_folds = drop_folds self.stat_name = stat_name or "te_stats" - @property - def req_stats(self): - stats = [] - if self.target_mean is None: - stats.append(Moments(columns=self.cont_target)) - stats.append( - GroupbyStatistics( - columns=self.cat_groups, - concat_groups=False, - cont_names=self.cont_target, - stats=["count", "sum"], - tree_width=self.tree_width, - out_path=self.out_path, - on_host=self.on_host, - stat_name=self.stat_name, - name_sep=self.name_sep, - fold_name="__fold__", - kfold=self.kfold, - fold_seed=self.fold_seed, - fold_groups=self.cat_groups, - ) - ) - return stats + # TODO: fit/fit_finalize methods + + def dependencies(self): + return self.dependency + + def output_column_names(self, columns): + ret = [] + for cat in columns: + cat = [cat] if isinstance(cat, str) else cat + ret.extend(self._make_te_name(cat)) + return ret def _make_te_name(self, cat_group): tag = nvt_cat._make_name(*cat_group, sep=self.name_sep) - return [f"TE_{tag}_{x}" for x in self.cont_target] + return [f"TE_{tag}_{x}" for x in self.target] def _op_group_logic(self, cat_group, gdf, stats_context, y_mean, fit_folds, group_ind): @@ -193,8 +164,8 @@ def _op_group_logic(self, cat_group, gdf, stats_context, y_mean, fit_folds, grou out_col = self.out_col[group_ind] out_col = [out_col] if isinstance(out_col, str) else out_col # ToDo Test - if len(out_col) != len(self.cont_target): - raise ValueError("out_col and 
cont_target are different sizes.") + if len(out_col) != len(self.target): + raise ValueError("out_col and target are different sizes.") else: out_col = self._make_te_name(cat_group) @@ -210,7 +181,7 @@ def _op_group_logic(self, cat_group, gdf, stats_context, y_mean, fit_folds, grou agg_each_fold = nvt_cat._read_groupby_stat_df( path_folds, storage_name_folds, self.cat_cache ) - agg_each_fold.columns = cols + ["count_y"] + [x + "_sum_y" for x in self.cont_target] + agg_each_fold.columns = cols + ["count_y"] + [x + "_sum_y" for x in self.target] else: cols = cat_group @@ -218,12 +189,12 @@ def _op_group_logic(self, cat_group, gdf, stats_context, y_mean, fit_folds, grou storage_name_all = nvt_cat._make_name(*cat_group, sep=self.name_sep) path_all = stats_context[self.stat_name][storage_name_all] agg_all = nvt_cat._read_groupby_stat_df(path_all, storage_name_all, self.cat_cache) - agg_all.columns = cat_group + ["count_y_all"] + [x + "_sum_y_all" for x in self.cont_target] + agg_all.columns = cat_group + ["count_y_all"] + [x + "_sum_y_all" for x in self.target] if fit_folds: agg_each_fold = agg_each_fold.merge(agg_all, on=cat_group, how="left") agg_each_fold["count_y_all"] = agg_each_fold["count_y_all"] - agg_each_fold["count_y"] - for i, x in enumerate(self.cont_target): + for i, x in enumerate(self.target): agg_each_fold[x + "_sum_y_all"] = ( agg_each_fold[x + "_sum_y_all"] - agg_each_fold[x + "_sum_y"] ) @@ -233,19 +204,19 @@ def _op_group_logic(self, cat_group, gdf, stats_context, y_mean, fit_folds, grou agg_each_fold = agg_each_fold.drop( ["count_y_all", "count_y"] - + [x + "_sum_y" for x in self.cont_target] - + [x + "_sum_y_all" for x in self.cont_target], + + [x + "_sum_y" for x in self.target] + + [x + "_sum_y_all" for x in self.target], axis=1, ) tran_gdf = gdf[cols + [tmp]].merge(agg_each_fold, on=cols, how="left") del agg_each_fold else: - for i, x in enumerate(self.cont_target): + for i, x in enumerate(self.target): agg_all[out_col[i]] = (agg_all[x + "_sum_y_all"] + self.p_smooth * y_mean[x]) / ( agg_all["count_y_all"] + self.p_smooth ) agg_all = agg_all.drop( - ["count_y_all"] + [x + "_sum_y_all" for x in self.cont_target], axis=1 + ["count_y_all"] + [x + "_sum_y_all" for x in self.target], axis=1 ) tran_gdf = gdf[cols + [tmp]].merge(agg_all, on=cols, how="left") del agg_all @@ -253,7 +224,7 @@ def _op_group_logic(self, cat_group, gdf, stats_context, y_mean, fit_folds, grou # TODO: There is no need to perform the `agg_each_fold.merge(agg_all, ...)` merge # for every partition. We can/should cache the result for better performance. - for i, x in enumerate(self.cont_target): + for i, x in enumerate(self.target): tran_gdf[out_col[i]] = tran_gdf[out_col[i]].fillna(y_mean[x]) if self.out_dtype is not None: tran_gdf[out_col] = tran_gdf[out_col].astype(self.out_dtype) diff --git a/nvtabular/ops/transform_operator.py b/nvtabular/ops/transform_operator.py deleted file mode 100644 index 14bae4c5b61..00000000000 --- a/nvtabular/ops/transform_operator.py +++ /dev/null @@ -1,119 +0,0 @@ -# -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
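With this change TargetEncoding takes the continuous target directly (declared through dependencies()) and reads its categorical groups from the column group it is shifted onto. Its fit/fit_finalize conversion is still marked TODO at this point in the draft, so the sketch below only shows how the graph would be declared; the column names are illustrative.

from nvtabular.ops import TargetEncoding

# single columns and multi-column groups can be mixed; "label" becomes a
# dependency column group feeding the same node
te_features = ["cat1", ["cat2", "cat3"]] >> TargetEncoding("label", kfold=5, p_smooth=20)
# produces TE_cat1_label and TE_cat2_cat3_label output columns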
-# See the License for the specific language governing permissions and -# limitations under the License. -# -import cudf - -from .operator import Operator - - -class TransformOperator(Operator): - """ - Base class for transformer operator classes. - """ - - default_in = None - default_out = None - - def __init__(self, columns=None, preprocessing=True, replace=True): - super().__init__(columns=columns) - self.preprocessing = preprocessing - self.replace = replace - self.delim = None - - def out_columns(self, tar_cols, extra_cols, delim): - new_cols = [] - if not self.replace: - new_cols = [f"{col}{delim}{self._id}" for col in tar_cols] - return new_cols + tar_cols, extra_cols - - def get_default_in(self): - if self.default_in is None: - raise NotImplementedError( - "default_in columns have not been specified for this operator" - ) - return self.default_in - - def get_default_out(self): - if self.default_out is None: - raise NotImplementedError( - "default_out columns have not been specified for this operator" - ) - return self.default_out - - def update_columns_ctx(self, columns_ctx, input_cols, new_cols, origin_targets, pro=False): - """ - columns_ctx: columns context, belonging to the container workflow object - input_cols: input columns; columns actioned on origin columns context key - new_cols: new columns; new columns generated by operator to be added to columns context - ---- - This function generalizes the action of updating the columns context dictionary - of the container workflow object, after an operator has created new columns via a - new transformation of a subset or entire dataset. - """ - - new_key = self._id - - columns_ctx[input_cols][new_key] = [] - if self.replace and self.preprocessing: - # not making new columns instead using old ones - # must reference original target with new operator for chaining - columns_ctx[input_cols][new_key] = origin_targets - return - columns_ctx[input_cols][new_key] = list(new_cols) - if not self.preprocessing and self._id not in columns_ctx["final"]["ctx"][input_cols]: - if "base" in columns_ctx["final"]["ctx"][input_cols]: - columns_ctx["final"]["ctx"][input_cols].remove("base") - columns_ctx["final"]["ctx"][input_cols].append(self._id) - - def apply_op( - self, - gdf: cudf.DataFrame, - columns_ctx: dict, - input_cols, - target_cols=["base"], - stats_context=None, - ): - target_columns = self.get_columns(columns_ctx, input_cols, target_cols) - new_gdf = self.op_logic(gdf, target_columns, stats_context=stats_context) - self.update_columns_ctx(columns_ctx, input_cols, new_gdf.columns, target_columns) - return self.assemble_new_df(gdf, new_gdf, target_columns) - - def assemble_new_df(self, origin_gdf, new_gdf, target_columns): - if self.replace and self.preprocessing and target_columns: - if new_gdf.shape[0] < origin_gdf.shape[0]: - return new_gdf - else: - origin_gdf[target_columns] = new_gdf - return origin_gdf - return cudf.concat([origin_gdf, new_gdf], axis=1) - - def op_logic(self, gdf, target_columns, stats_context=None): - raise NotImplementedError( - """Must implement transform in the op_logic method, - The return value must be a dataframe with all required - transforms.""" - ) - - -class DFOperator(TransformOperator): - """ - Base class for data frame operator classes. 
- """ - - @property - def req_stats(self): - raise NotImplementedError( - "Should consist of a list of identifiers, that should map to available statistics" - ) diff --git a/nvtabular/workflow.py b/nvtabular/workflow.py index 94879405ed1..c95f6158163 100644 --- a/nvtabular/workflow.py +++ b/nvtabular/workflow.py @@ -13,1078 +13,179 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import collections import logging -import time -import warnings +import cudf import dask -import dask_cudf -import yaml -from fsspec.core import get_fs_token_paths +from dask.core import flatten -from nvtabular.io.dask import _ddf_to_dataset -from nvtabular.io.dataset import Dataset, _set_dtypes -from nvtabular.io.shuffle import Shuffle, _check_shuffle_arg -from nvtabular.io.writer_factory import writer_factory -from nvtabular.ops import DFOperator, Operator, StatOperator, TransformOperator +from nvtabular.column_group import ColumnGroup, iter_nodes +from nvtabular.io.dataset import Dataset +from nvtabular.ops import StatOperator from nvtabular.worker import clean_worker_cache -LOG = logging.getLogger("nvtabular") +# import yaml + +LOG = logging.getLogger("nvtabular") -class BaseWorkflow: +class Workflow: """ - BaseWorkflow class organizes and runs all the feature engineering - and pre-processing operators for your workflow. + Workflow organizes and runs all feature engineering and preprocessing operators for your + workflow. Parameters - ----------- - cat_names : list of str - Names of the categorical columns. - cont_names : list of str - Names of the continuous columns. - label_name : list of str - Names of the label column. - config : object + ---------- + columns_group: ColumnGroup + The graph of transformations this workflow should apply + client: Dask.client, optional + The Dask client to use for multi-gpu processing """ - def __init__(self, cat_names=None, cont_names=None, label_name=None, config=None, delim="_"): - self.phases = [] - - self.columns_ctx = {} - self.columns_ctx["all"] = {} - self.columns_ctx["continuous"] = {} - self.columns_ctx["categorical"] = {} - self.columns_ctx["label"] = {} - self.columns_ctx["all"]["base"] = cont_names + cat_names + label_name - self.columns_ctx["continuous"]["base"] = cont_names - self.columns_ctx["categorical"]["base"] = cat_names - self.columns_ctx["label"]["base"] = label_name - - self.stats = {} - self.current_file_num = 0 - self.delim = delim - self.timings = {"write_df": 0.0, "preproc_apply": 0.0} - self.ops_in = [] - if config: - self.load_config(config) - else: - # create blank config and for later fill in - self.config = get_new_config() - - self.clear_stats() - - def _register_ops(self, operators): - if not isinstance(operators, list): - operators = [operators] - # order matters!!! 
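In place of the removed cat_names/cont_names/label_name/config constructor, a Workflow is now built from a ColumnGroup graph plus an optional Dask client for multi-GPU execution. A small sketch of how a workflow might now be assembled; the feature names are made up, and Categorify is assumed here to be available under the new operator interface.

import nvtabular as nvt
from nvtabular.ops import Categorify, LogOp, Normalize

cat_features = ["user_id", "item_id"] >> Categorify()
cont_features = ["price", "age"] >> LogOp() >> Normalize()

workflow = nvt.Workflow(cat_features + cont_features + ["label"])
# or, with a dask.distributed client for multi-gpu processing:
# workflow = nvt.Workflow(cat_features + cont_features + ["label"], client=client)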
- if "full" not in self.columns_ctx: - self.columns_ctx["full"] = {} - # full current will always be the most up to date version of columns, - # based on operators added - self.columns_ctx["full"]["base"] = (self.columns_ctx["all"]["base"],) - current = (self.columns_ctx["all"]["base"],) - # for all what does not exist in - for op, col_focus, chain_on_ops, child, parent in operators: - target_cols = op.get_columns(self.columns_ctx, col_focus, chain_on_ops) - # grab the input target columns and the input extra columns - extra_cols = [] - cur_extra_cols = [] if len(current) < 2 else current[1] - full_list = current[0] + cur_extra_cols - for col in full_list: - if col not in target_cols: - extra_cols.append(col) - current = self._create_full_col_ctx_entry(op, target_cols, extra_cols, parent=parent) - self._reduce(self.columns_ctx["full"]) - - def _reduce(self, full_dict): - self._remove_dupes(full_dict) - # this will guide phase placement - self._analyze_placement(full_dict) - - def _analyze_placement(self, full_dict): - # detect num collisions for each op_id to find correct placement. - self.placement = {} - for op_id, cols_ops in full_dict.items(): - if op_id in "base": - continue - in_tar_cols, _, _, _, _, _ = cols_ops - in_tar_cols = in_tar_cols if in_tar_cols else [] - self.placement[op_id] = self._detect_num_col_collisions(in_tar_cols.copy(), op_id) - - def _remove_dupes(self, full_dict): - remove_keys = [] - for op_id, cols_ops in full_dict.items(): - parent = None - if op_id not in "base": - in_tar_cols, in_extra_cols, op, parent, fin_tar_cols, fin_extra_cols = cols_ops - if not parent: - continue - if parent not in full_dict: - remove_keys.append(op_id) - for key in remove_keys: - del full_dict[key] - - def _detect_cols_collision(self, columns, op_id, index=0): - """ - Given a list of columns find the task AFTER which all - columns in list exists and return that task. - """ - action_cols = columns - # start at the index provided, scan forward. - keys = list(self.columns_ctx["full"].keys()) - keys = keys[index:] - for idx, k in enumerate(keys): - if k == op_id: - # reach yourself is end, and last found location - return k, index - if k in "base": - fin_tar_cols = self.columns_ctx["full"][k][0] - else: - ( - in_tar_cols, - in_extra_cols, - op, - parent, - fin_tar_cols, - fin_extra_cols, - ) = self.columns_ctx["full"][k] - # if anything is inside, find it and remove it - # found col, cannot remove in mid iteration... 
causes issues - found = set(fin_tar_cols).intersection(action_cols) - action_cols = [col for col in action_cols if col not in found] - # if empty found end - if not action_cols: - return k, idx + index - raise ValueError(f"Unknown columns found: {action_cols}") - - def _detect_num_col_collisions(self, columns, op_id): - """ - Detect the number of times you see all columns in tasks, before getting - to self in task list - """ - current_op = None - index = 0 - indexes = [] - while current_op != op_id: - current_op, index = self._detect_cols_collision(columns.copy(), op_id, index=index) - indexes.append((current_op, index)) - index = index + 1 - return indexes - - def _check_op_count(self, op): - if op._id_set is None: - count = self._get_op_count(op._id) - # reset id based on count of op in workflow already - op_id = f"{op._id}{self.delim}{str(count + 1)}" - op._set_id(op_id) - - def _create_full_col_ctx_entry(self, op, target_cols, extra_cols, parent=None): - if isinstance(parent, Operator): - parent = parent._id - tup_rep = None - # requires target columns, extra columns (target+extra == all columns in df) and delim - if isinstance(op, TransformOperator): - fin_tar_cols, fin_extra_cols = op.out_columns(target_cols, extra_cols, self.delim) - tup_rep = target_cols, extra_cols, op, parent, fin_tar_cols, fin_extra_cols - # for stat ops, which do not change data in columns - if not tup_rep: - tup_rep = target_cols, extra_cols, op, parent, target_cols, extra_cols - self.columns_ctx["full"][op._id] = tup_rep - return tup_rep - - def _get_op_count(self, op_id): - return sum(1 for op in self.ops_in if op_id in op) - - def _create_phases(self): - # create new ordering based on placement and full_dict keys list - ordered_ops = self._find_order() - phases = [] - excess = ordered_ops - while excess: - phase, excess = self._create_phase(excess) - phases.append(phase) - return phases - - def _find_order(self): - ops_ordered = [] - ops_origin = list(self.columns_ctx["full"].keys()).copy() - ops_not_added = ops_origin[1:] - for op_focus in ops_origin: - ops_added, ops_not_added = self._find_order_single(op_focus, ops_not_added) - ops_ordered.append(ops_added) - ops_ordered.append(ops_not_added) - res_list = [] - for ops_set in ops_ordered: - res_list = res_list + ops_set - return res_list - - def _find_order_single(self, op_focus, op_ids): - op_ordered, not_added, parents_ref = [], [], [] - for op_id in op_ids: - place = self.placement[op_id] - # k is op._id, v is all finds... 
need second to last - if len(place) > 1: - op_after, idx = place[-2] - else: - op_after, idx = place[-1] - op_task = self.columns_ctx["full"][op_id] - target_cols, extra_cols, op, parent, fin_tar_cols, fin_extra_cols = op_task - if parent: - parents_ref.append(parent) - if op_after in op_focus and op_id not in parents_ref and op_after not in parents_ref: - # op has no requirements move to front - op_ordered.append(op_id) - else: - not_added.append(op_id) - return op_ordered, not_added - - def _create_phase(self, op_ordered): - # given the correctly ordered op_task list (full_dict), - # decide index splits for individual phases - parents_ref = [] - for idx, op_id in enumerate(op_ordered): - target_cols, extra_cols, op, parent, fin_tar_cols, fin_extra_cols = self.columns_ctx[ - "full" - ][op_id] - if op._id in parents_ref: - return op_ordered[:idx], op_ordered[idx:] - if parent: - parents_ref.append(parent) - - return op_ordered, [] - - def _get_target_cols(self, operators): - # all operators in a list are chained therefore based on parent in list - if type(operators) is list: - target_cols = operators[0].get_default_in() - else: - target_cols = operators.get_default_in() - return target_cols - - def _config_add_ops(self, operators, phase): - """ - This function serves to translate the operator list api into backend - ready dependency dictionary. - - Parameters - ---------- - operators: list - list of operators or single operator to be added into the - preprocessing phase - phase: - identifier for feature engineering FE or preprocessing PP - """ - target_cols = self._get_target_cols(operators) - if not target_cols or ( - target_cols in self.columns_ctx and not self.columns_ctx[target_cols]["base"] - ): - warnings.warn(f"Did not add operators: {operators}, target columns is empty.") - return - if phase in self.config and target_cols in self.config[phase]: - for op in operators: - self._check_op_count(op) - self.config[phase][target_cols].append(operators) - return - - warnings.warn(f"No main key {phase} or sub key {target_cols} found in config") - - def op_default_check(self, operators, default_in): - if not type(operators) is list: - operators = [operators] - for op in operators: - if op.default_in != default_in and op.default_in != "all": - warnings.warn( - f"{op._id} was not added. This op is not designed for use" - f" with {default_in} columns" - ) - operators.remove(op) - return operators - - def add_feature(self, operators): - """ - Adds feature engineering operator(s), while mapping - to the correct columns given operator dependencies. - - Parameters - ----------- - operators : object - list of operators or single operator to be - added into the feature engineering phase - """ - if not isinstance(operators, list): - operators = [operators] - self._config_add_ops(operators, "FE") - - def add_cat_feature(self, operators): - """ - Adds categorical feature engineering operator(s), while mapping - to the correct columns given operator dependencies. - - Parameters - ----------- - operators : object - list of categorical operators or single operator to be - added into the feature engineering phase - """ - - operators = self.op_default_check(operators, "categorical") - if operators: - self.add_feature(operators) - - def add_cont_feature(self, operators): - - """ - Adds continuous feature engineering operator(s) - to the workflow. 
- - Parameters - ----------- - operators : object - continuous objects such as FillMissing, Clip and LogOp - """ - - operators = self.op_default_check(operators, "continuous") - if operators: - self.add_feature(operators) - - def add_cat_preprocess(self, operators): - - """ - Adds categorical pre-processing operator(s) - to the workflow. - - Parameters - ----------- - operators : object - categorical objects such as Categorify - """ - - operators = self.op_default_check(operators, "categorical") - if operators: - self.add_preprocess(operators) - - def add_cont_preprocess(self, operators): - - """ - Adds continuous pre-processing operator(s) - to the workflow. - - Parameters - ----------- - operators : object - continuous objects such as Normalize - """ - - operators = self.op_default_check(operators, "continuous") - if operators: - self.add_preprocess(operators) - - def add_preprocess(self, operators): - - """ - Adds preprocessing operator(s), while mapping - to the correct columns given operator dependencies. - - Parameters - ----------- - operators : object - list of operators or single operator, Op/s to be - added into the preprocessing phase - """ - # must add last operator from FE for get_default_in - target_cols = self._get_target_cols(operators) - if self.config["FE"][target_cols]: - op_to_add = self.config["FE"][target_cols][-1] - else: - op_to_add = [] - if type(op_to_add) is list and op_to_add: - op_to_add = op_to_add[-1] - if op_to_add: - op_to_add = [op_to_add] - if type(operators) is list: - op_to_add = op_to_add + operators - else: - op_to_add.append(operators) - self._config_add_ops(op_to_add, "PP") - - def finalize(self): - """ - When using operator list api, this allows the user to declare they - have finished adding all operators and are ready to start processing - data. - """ - self.load_config(self.config) - - def load_config(self, config, pro=False): - """ - This function extracts all the operators from the given phases and produces a - set of phases with necessary operators to complete configured pipeline. - - Parameters - ---------- - config : dict - this object contains the phases and user specified operators - pro: bool - signals if config should be parsed via dependency dictionary or - operator list api - """ - # separate FE and PP - if not pro: - config = self._compile_dict_from_list(config) - task_sets = {} - master_task_list = [] - for task_set in config.keys(): - task_sets[task_set] = self._build_tasks(config[task_set], task_set, master_task_list) - master_task_list = master_task_list + task_sets[task_set] - - self._register_ops(master_task_list.copy()) - phases = self._create_phases() - self.phases = self.translate(master_task_list, phases) - self._create_final_col_refs(task_sets) + def __init__(self, column_group: ColumnGroup, client=None): + self.column_group = column_group + self.client = client - def translate(self, mtl, phases): - real_phases = [] - for phase in phases: - real_phase = [] - for op_id in phase: - for op_task in mtl: - op = op_task[0] - if op._id == op_id: - real_phase.append(op_task) - break - real_phases.append(real_phase) - return real_phases + def transform(self, dataset): + """Transforms the dataset by applying the graph of operators to it. Requires the 'fit' + method to have already been called, or calculated statistics to be loaded from disk - def _compile_dict_from_list(self, config): - """ - This function retrieves all the operators from the different keys in - the config object. 
+ This method returns a Dataset object, with the transformations lazily loaded. None + of the actual computation will happen until the produced Dataset is consumed, or + written out to disk. Parameters ----------- - config : dict - this dictionary has phases(key) and the corresponding list of operators for - each phase. - """ - ret = {} - for phase, task_list in config.items(): - ret[phase] = {} - for k, v in task_list.items(): - tasks = [] - for obj in v: - if not isinstance(obj, collections.abc.Sequence): - obj = [obj] - for idx, op in enumerate(obj): - tasks.append((op, [obj[idx - 1]._id] if idx > 0 else [])) - - ret[phase][k] = tasks - return ret - - def _create_final_col_refs(self, task_sets): - """ - This function creates a reference of all the operators whose produced - columns will be available in the final set of columns. First step in - creating the final columns list. - """ - - if "final" in self.columns_ctx.keys(): - return - final = {} - # all preprocessing tasks have a parent operator, it could be None - # task (operator, main_columns_class, col_sub_key, required_operators) - for task in task_sets["PP"]: - # an operator cannot exist twice - if not task[1] in final.keys(): - final[task[1]] = [] - # detect incorrect dependency loop - for x in final[task[1]]: - if x in task[2]: - final[task[1]].remove(x) - # stats dont create columns so id would not be in columns ctx - if not task[0].__class__.__base__ == StatOperator: - final[task[1]].append(task[0]._id) - # add labels too specific because not specifically required in init - final["label"] = [] - for col_ctx in self.columns_ctx["label"].values(): - if not final["label"]: - final["label"] = ["base"] - else: - final["label"] = final["label"] + col_ctx - # if no operators run in preprocessing we grab base columns - if "continuous" not in final: - # set base columns - final["continuous"] = ["base"] - if "categorical" not in final: - final["categorical"] = ["base"] - if "all" not in final: - final["all"] = ["base"] - self.columns_ctx["final"] = {} - self.columns_ctx["final"]["ctx"] = final + dataset: Dataset - def create_final_cols(self): - """ - This function creates an entry in the columns context dictionary, - not the references to the operators. In this method we detail all - operator references with actual column names, and create a list. - The entry represents the final columns that should be in finalized - dataframe. + Returns + ------- + Dataset """ - # still adding double need to stop that - final_ctx = {} - for key, ctx_list in self.columns_ctx["final"]["ctx"].items(): - to_add = None - for ctx in ctx_list: - if ctx not in self.columns_ctx[key].keys(): - ctx = "base" - to_add = ( - self.columns_ctx[key][ctx] - if not to_add - else to_add + self.columns_ctx[key][ctx] - ) - if key not in final_ctx.keys(): - final_ctx[key] = to_add - else: - final_ctx[key] = final_ctx[key] + to_add - self.columns_ctx["final"]["cols"] = final_ctx + self._clear_worker_cache() + ddf = dataset.to_ddf(columns=self._input_columns()) + return Dataset(_transform_ddf(ddf, self.column_group), client=self.client) - def get_final_cols_names(self, col_type): - """ - Returns all the column names after preprocessing and feature - engineering. 
+ def fit(self, dataset): + """Calculates statistics for this workflow on the input dataset Parameters ----------- - col_type : str - """ - col_names = [] - for c_names in self.columns_ctx[col_type].values(): - for name in c_names: - if name not in col_names: - col_names.append(name) - return col_names - - def _build_tasks(self, task_dict: dict, task_set, master_task_list): - """ - task_dict: the task dictionary retrieved from the config - Based on input config information - """ - # task format = (operator, main_columns_class, col_sub_key, required_operators, parent) - dep_tasks = [] - for cols, task_list in task_dict.items(): - for target_op, dep_grp in task_list: - if isinstance(target_op, DFOperator): - # check that the required stat is grabbed - # for all necessary parents - for opo in target_op.req_stats: - self._check_op_count(opo) - self.ops_in.append(opo._id) - dep_grp = dep_grp if dep_grp else ["base"] - dep_tasks.append((opo, cols, dep_grp, [], target_op)) - # after req stats handle target_op - self.ops_in.append(target_op._id) - dep_grp = dep_grp if dep_grp else ["base"] - req_ops = [] if not hasattr(target_op, "req_stats") else target_op.req_stats - dep_tasks.append((target_op, cols, dep_grp, req_ops, [])) - return dep_tasks - - def _run_trans_ops_for_phase(self, gdf, tasks): - for task in tasks: - op, cols_grp, target_cols, _, _ = task - if isinstance(op, DFOperator): - gdf = op.apply_op(gdf, self.columns_ctx, cols_grp, target_cols, self.stats) - elif isinstance(op, TransformOperator): - gdf = op.apply_op(gdf, self.columns_ctx, cols_grp, target_cols=target_cols) - return gdf - - def apply_ops( - self, gdf, start_phase=None, end_phase=None, writer=None, output_path=None, dtypes=None - ): - """ - gdf: cudf dataframe - Controls the application of registered preprocessing phase op - tasks, can only be used after apply has been performed - """ - # put phases that you want to run represented in a slice - # dont run stat_ops in apply - # run the PP ops - start = start_phase if start_phase else 0 - end = end_phase if end_phase else len(self.phases) - for phase_index in range(start, end): - start = time.time() - gdf = self._run_trans_ops_for_phase(gdf, self.phases[phase_index]) - self.timings["preproc_apply"] += time.time() - start - if phase_index == len(self.phases) - 1 and writer and output_path: - - if writer.need_cal_col_names: - cat_names = self.get_final_cols_names("categorical") - cont_names = self.get_final_cols_names("continuous") - label_names = self.get_final_cols_names("label") - writer.set_col_names(labels=label_names, cats=cat_names, conts=cont_names) - writer.need_cal_col_names = False - start_write = time.time() - # Special dtype conversion - gdf = _set_dtypes(gdf, dtypes) - writer.add_data(gdf) - self.timings["write_df"] += time.time() - start_write - return gdf - - def _update_statistics(self, stat_op): - stats = [stat for stat in stat_op.registered_stats() if stat in self.stats.keys()] - if not stats: - # add if doesnt exist - self.stats.update(stat_op.stats_collected()) - else: - # if it does exist, append to it - for key, val in stat_op.stats_collected(): - self.stats[key].update(val) - - def save_stats(self, path): - main_obj = {} - stats_drop = {} - for name, stat in self.stats.items(): - if name not in stats_drop.keys(): - stats_drop[name] = stat - main_obj["stats"] = stats_drop - main_obj["columns_ctx"] = {} - for key in self.columns_ctx.keys(): - if "full" != key: - main_obj["columns_ctx"][key] = self.columns_ctx[key] - self.columns_ctx - with open(path, "w") 
as outfile: - yaml.safe_dump(main_obj, outfile, default_flow_style=False) - - def load_stats(self, path): - def _set_stats(self, stats_dict): - for key, stat in stats_dict.items(): - self.stats[key] = stat - - with open(path, "r") as infile: - main_obj = yaml.safe_load(infile) - _set_stats(self, main_obj["stats"]) - self.columns_ctx = main_obj["columns_ctx"] - - def clear_stats(self): - self.stats = {} - - -def get_new_config(): - """ - boiler config object, to be filled in with targeted operator tasks - """ - config = {} - config["FE"] = {} - config["FE"]["all"] = [] - config["FE"]["continuous"] = [] - config["FE"]["categorical"] = [] - config["PP"] = {} - config["PP"]["all"] = [] - config["PP"]["continuous"] = [] - config["PP"]["categorical"] = [] - return config - - -class Workflow(BaseWorkflow): - """ - Dask-based NVTabular Workflow Class - """ - - def __init__(self, client=None, **kwargs): - super().__init__(**kwargs) - self.ddf = None - self.client = client - self._shuffle_parts = False - self._base_phase = 0 + dataset: Dataset + The input dataset to calculate statistics for. If there is a train/test split this + data should be the training dataset only. + """ + self._clear_worker_cache() + ddf = dataset.to_ddf(columns=self._input_columns()) + + # Get a dictionary mapping all StatOperators we need to fit to a set of any dependant + # StatOperators (having StatOperators that depend on the output of other StatOperators + # means that will have multiple phases in the fit cycle here) + def get_stat_ops(nodes): + return set(node for node in iter_nodes(nodes) if isinstance(node.op, StatOperator)) + + stat_ops = {op: get_stat_ops(op.parents) for op in get_stat_ops([self.column_group])} + + while stat_ops: + # get all the StatOperators that we can currently call fit on (no outstanding + # dependencies) + current_phase = [op for op, dependencies in stat_ops.items() if not dependencies] + if not current_phase: + # this shouldn't happen, but lets not infinite loop just in case + raise RuntimeError("failed to find dependency-free StatOperator to fit") + + stats, ops = [], [] + for column_group in current_phase: + # apply transforms necessary for the inputs to the current column group, ignoring + # the transforms from the statop itself + transformed_ddf = _transform_ddf(ddf, column_group, parent_only=True) + + input_column_names = [ + col for parent in column_group.parents for col in parent.columns + ] + stats.append(column_group.op.fit(input_column_names, transformed_ddf)) + ops.append(column_group.op) - def set_ddf(self, ddf, shuffle=None): - if isinstance(ddf, (dask_cudf.DataFrame, Dataset)): - self.ddf = ddf - if shuffle is not None: - self._shuffle_parts = shuffle - else: - raise TypeError("ddf type not supported.") - - def get_ddf(self): - if self.ddf is None: - raise ValueError("No dask_cudf frame available.") - elif isinstance(self.ddf, Dataset): - # Right now we can't distinguish between input columns and generated columns - # in the dataset, we don't limit the columm set right now in the to_ddf call - # (https://github.com/NVIDIA/NVTabular/issues/409 ) - return self.ddf.to_ddf(shuffle=self._shuffle_parts) - return self.ddf - - @staticmethod - def _aggregated_op(gdf, ops): - for op in ops: - columns_ctx, cols_grp, target_cols, logic, stats_context = op - gdf = logic(gdf, columns_ctx, cols_grp, target_cols, stats_context) - return gdf - - def _aggregated_dask_transform(self, ddf, transforms): - # Assuming order of transforms corresponds to dependency ordering - meta = ddf._meta - for 
transform in transforms: - columns_ctx, cols_grp, target_cols, logic, stats_context = transform - meta = logic(meta, columns_ctx, cols_grp, target_cols, stats_context) - return ddf.map_partitions(self.__class__._aggregated_op, transforms, meta=meta) - - def exec_phase(self, phase_index, record_stats=True, update_ddf=True): - """ - Gather necessary column statistics in single pass. - Execute statistics for one phase only (given by phase index), - but (laxily) perform all transforms for current and previous phases. - """ - transforms = [] - - # Need to perform all transforms up to, and including, - # the current phase (not only the current phase). We do this - # so that we can avoid persisitng intermediate transforms - # needed for statistics. - phases = range(self._base_phase, phase_index + 1) - for ind in phases: - for task in self.phases[ind]: - op, cols_grp, target_cols, _, _ = task - if isinstance(op, TransformOperator): - stats_context = self.stats if isinstance(op, DFOperator) else None - logic = op.apply_op - transforms.append( - (self.columns_ctx, cols_grp, target_cols, logic, stats_context) - ) - elif not isinstance(op, StatOperator): - raise ValueError("Unknown Operator Type") - - # Perform transforms as single dask task (per ddf partition) - _ddf = self.get_ddf() - if transforms: - _ddf = self._aggregated_dask_transform(_ddf, transforms) - - stats = [] - if record_stats: - for task in self.phases[phase_index]: - op, cols_grp, target_cols, _, _ = task - if isinstance(op, StatOperator): - stats.append((op.stat_logic(_ddf, self.columns_ctx, cols_grp, target_cols), op)) - # TODO: Don't want to update the internal ddf here if we can - # avoid it. It may be better to just add the new column? - if op._ddf_out is not None: - self.set_ddf(op._ddf_out) - # We are updating the internal `ddf`, so we shouldn't - # redo transforms up to this phase in later phases. - self._base_phase = phase_index - - # Compute statistics if necessary - if stats: if self.client: - for r in self.client.compute(stats): - computed_stats, op = r.result() - op.finalize(computed_stats) - self._update_statistics(op) - op.clear() + results = [r.result() for r in self.client.compute(stats)] else: - for r in dask.compute(stats, scheduler="synchronous")[0]: - computed_stats, op = r - op.finalize(computed_stats) - self._update_statistics(op) - op.clear() - del stats + results = dask.compute(stats, schedule="synchronous")[0] - # Update interal ddf. - # Cancel futures and delete _ddf if allowed. - if transforms and update_ddf: - self.set_ddf(_ddf) - else: - if self.client: - self.client.cancel(_ddf) - del _ddf + for computed_stats, op in zip(results, ops): + op.fit_finalize(computed_stats) - def apply( - self, - dataset, - apply_offline=True, - record_stats=True, - shuffle=None, - output_path="./ds_export", - output_format="parquet", - out_files_per_proc=None, - num_io_threads=0, - dtypes=None, - ): - """ - Runs all the preprocessing and feature engineering operators. - Also, shuffles the data if a `shuffle` option is specified. + # Remove all the operators we processed in this phase, and remove + # from the dependencies of other ops too + for stat_op in current_phase: + stat_ops.pop(stat_op) + for dependencies in stat_ops.values(): + dependencies.difference_update(current_phase) + + def fit_transform(self, dataset): + """Convenience method to both fit the workflow and transform the dataset in a single + call. 
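Workflow.fit, shown above, resolves StatOperators into phases: a statistics op is only fit once any StatOperators it depends on have been fit, so their transforms can be applied to produce its input. A contrived sketch of a graph that yields two fit phases, assuming FillMedian has also been moved onto the fit/fit_finalize interface (that conversion is not shown in this excerpt):

import nvtabular as nvt
from nvtabular.ops import FillMedian, Normalize

# Normalize's mean/std are computed on the median-filled values, so FillMedian
# is fit (and applied) in the first phase and Normalize in a second phase
conts = ["price", "age"] >> FillMedian() >> Normalize()

workflow = nvt.Workflow(conts)
workflow.fit(nvt.Dataset("/path/to/train", engine="parquet"))  # illustrative path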
Equivalent to calling workflow.fit(dataset) followed by workflow.transform(dataset) Parameters ----------- - dataset : object - apply_offline : boolean - Runs operators in offline mode or not - record_stats : boolean - Record the stats in file or not. Only available - for apply_offline=True - shuffle : nvt.io.Shuffle enum - How to shuffle the output dataset. Shuffling is only - performed if the data is written to disk. For all options, - other than `None` (which means no shuffling), the partitions - of the underlying dataset/ddf will be randomly ordered. If - `PER_PARTITION` is specified, each worker/process will also - shuffle the rows within each partition before splitting and - appending the data to a number (`out_files_per_proc`) of output - files. Output files are distinctly mapped to each worker process. - If `PER_WORKER` is specified, each worker will follow the same - procedure as `PER_PARTITION`, but will re-shuffle each file after - all data is persisted. This results in a full shuffle of the - data processed by each worker. To improve performace, this option - currently uses host-memory `BytesIO` objects for the intermediate - persist stage. The `FULL` option is not yet implemented. - output_path : string - Path to write processed/shuffled output data - output_format : {"parquet", "hugectr", None} - Output format to write processed/shuffled data. If None, - no output dataset will be written (and shuffling skipped). - out_files_per_proc : integer - Number of files to create (per process) after - shuffling the data - num_io_threads : integer - Number of IO threads to use for writing the output dataset. - For `0` (default), no dedicated IO threads will be used. - dtypes : dict - Dictionary containing desired datatypes for output columns. - Keys are column names, values are datatypes. 
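The apply()/iterate_online entry points being deleted here give way to the fit/transform/fit_transform methods above, with transformation output coming back as a lazy Dataset. A hedged end-to-end sketch of the new flow; paths and column names are illustrative, and the final write simply goes through dask_cudf's to_parquet rather than the dedicated shuffling writers.

import nvtabular as nvt
from nvtabular.ops import LogOp, Normalize

features = (["price", "spend"] >> LogOp() >> Normalize()) + ["label"]
workflow = nvt.Workflow(features)

train = nvt.Dataset("/data/train", engine="parquet")
valid = nvt.Dataset("/data/valid", engine="parquet")

train_out = workflow.fit_transform(train)   # fit statistics on train, then transform it
valid_out = workflow.transform(valid)       # reuse the fitted statistics

# nothing is computed until the lazy Datasets are consumed or written out
valid_out.to_ddf().to_parquet("/data/valid_out", write_index=False)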
- """ - - # Check shuffle argument - shuffle = _check_shuffle_arg(shuffle) - - # If no tasks have been loaded then we need to load internal config - if not self.phases: - self.finalize() - - # Gather statstics (if apply_offline), and/or transform - # and write out processed data - if apply_offline: - self.build_and_process_graph( - dataset, - output_path=output_path, - record_stats=record_stats, - shuffle=shuffle, - output_format=output_format, - out_files_per_proc=out_files_per_proc, - num_io_threads=num_io_threads, - dtypes=dtypes, - ) - else: - self.iterate_online( - dataset, - output_path=output_path, - shuffle=shuffle, - output_format=output_format, - out_files_per_proc=out_files_per_proc, - num_io_threads=num_io_threads, - dtypes=dtypes, - ) - - def iterate_online( - self, - dataset, - end_phase=None, - output_path=None, - shuffle=None, - output_format=None, - out_files_per_proc=None, - apply_ops=True, - num_io_threads=0, - dtypes=None, - ): - """Iterate through dataset and (optionally) apply/shuffle/write.""" - # Check shuffle argument - shuffle = _check_shuffle_arg(shuffle) + dataset: Dataset - # Check if we have a (supported) writer - output_path = output_path or "./" - output_path = str(output_path) - writer = writer_factory( - output_format, - output_path, - out_files_per_proc, - shuffle, - bytes_io=(shuffle == Shuffle.PER_WORKER), - num_threads=num_io_threads, - ) - - # Iterate through dataset, apply ops, and write out processed data - if apply_ops: - columns = self.columns_ctx["all"]["base"] - for gdf in dataset.to_iter(shuffle=(shuffle is not None), columns=columns): - self.apply_ops(gdf, output_path=output_path, writer=writer, dtypes=dtypes) - - # Close writer and write general/specialized metadata - if writer: - general_md, special_md = writer.close() - - # Note that we "could" have the special and general metadata - # written during `writer.close()` (just above) for the single-GPU case. - # Instead, the metadata logic is separated from the `Writer` object to - # simplify multi-GPU integration. When using Dask, we cannot assume - # that the "shared" metadata files can/will be written by the same - # process that writes the data. - writer.write_special_metadata(special_md, writer.fs, output_path) - writer.write_general_metadata(general_md, writer.fs, output_path) - - def update_stats(self, dataset, end_phase=None): - """Collect statistics only.""" - self.build_and_process_graph(dataset, end_phase=end_phase, record_stats=True) - - def build_and_process_graph( - self, - dataset, - end_phase=None, - output_path=None, - record_stats=True, - shuffle=None, - output_format=None, - out_files_per_proc=None, - apply_ops=True, - num_io_threads=0, - dtypes=None, - ): - """Build Dask-task graph for workflow. - - Full graph is only executed if `output_format` is specified. 
+ Returns + ------- + Dataset """ - # Check shuffle argument - shuffle = _check_shuffle_arg(shuffle) + self.fit(dataset) + return self.transform(dataset) - end = end_phase if end_phase else len(self.phases) - - if output_format not in ("parquet", "hugectr", None): - raise ValueError(f"Output format {output_format} not yet supported with Dask.") + def _input_columns(self): + input_nodes = set(node for node in iter_nodes([self.column_group]) if not node.parents) + return list(set(col for node in input_nodes for col in flatten(node.columns))) + def _clear_worker_cache(self): # Clear worker caches to be "safe" if self.client: self.client.run(clean_worker_cache) else: clean_worker_cache() - self.set_ddf(dataset, shuffle=(shuffle is not None)) - if apply_ops: - self._base_phase = 0 # Set _base_phase - for idx, _ in enumerate(self.phases[:end]): - self.exec_phase(idx, record_stats=record_stats, update_ddf=(idx == (end - 1))) - self._base_phase = 0 # Re-Set _base_phase - - if dtypes: - ddf = self.get_ddf() - _meta = _set_dtypes(ddf._meta, dtypes) - self.set_ddf(ddf.map_partitions(_set_dtypes, dtypes, meta=_meta)) - - if output_format: - output_path = output_path or "./" - output_path = str(output_path) - self.ddf_to_dataset( - output_path, - output_format=output_format, - shuffle=shuffle, - out_files_per_proc=out_files_per_proc, - num_threads=num_io_threads, - ) - def write_to_dataset( - self, - path, - dataset, - apply_ops=False, - out_files_per_proc=None, - shuffle=None, - output_format="parquet", - iterate=False, - nfiles=None, - num_io_threads=0, - dtypes=None, - ): - """Write data to shuffled parquet dataset. - - Assumes statistics are already gathered. - """ - # Check shuffle argument - shuffle = _check_shuffle_arg(shuffle) - - if nfiles: - warnings.warn("nfiles is deprecated. Use out_files_per_proc") - if out_files_per_proc is None: - out_files_per_proc = nfiles - out_files_per_proc = out_files_per_proc or 1 - - path = str(path) - if iterate: - self.iterate_online( - dataset, - output_path=path, - shuffle=shuffle, - output_format=output_format, - out_files_per_proc=out_files_per_proc, - apply_ops=apply_ops, - num_io_threads=num_io_threads, - dtypes=dtypes, - ) - else: - self.build_and_process_graph( - dataset, - output_path=path, - record_stats=False, - shuffle=shuffle, - output_format=output_format, - out_files_per_proc=out_files_per_proc, - apply_ops=apply_ops, - num_io_threads=num_io_threads, - dtypes=dtypes, - ) - - def ddf_to_dataset( - self, - output_path, - shuffle=None, - out_files_per_proc=None, - output_format="parquet", - num_threads=0, - ): - """Dask-based dataset output. - - Currently supports parquet only. 
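A minimal sketch of the replacement flow, using placeholder paths and the column names exercised by the updated tests below; the removed apply/write_to_dataset/ddf_to_dataset entry points give way to fit_transform plus writing from the returned Dataset:

    import nvtabular as nvt
    from nvtabular import ops

    # build the transformation graph, then fit statistics and transform lazily
    conts = ["x", "y"] >> ops.FillMissing() >> ops.Normalize()
    workflow = nvt.Workflow(conts + ["label"])

    transformed = workflow.fit_transform(nvt.Dataset("data.parquet"))  # placeholder path

    # writing (and optional shuffling) now happens from the transformed Dataset
    transformed.to_parquet(
        output_path="./processed",
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        out_files_per_proc=2,
    )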
- """ - if output_format not in ("parquet", "hugectr"): - raise ValueError("Only parquet/hugectr output supported with Dask.") - ddf = self.get_ddf() - fs = get_fs_token_paths(output_path)[0] - fs.mkdirs(output_path, exist_ok=True) - if shuffle or out_files_per_proc: - - cat_names = self.get_final_cols_names("categorical") - cont_names = self.get_final_cols_names("continuous") - label_names = self.get_final_cols_names("label") - - # Output dask_cudf DataFrame to dataset - _ddf_to_dataset( - ddf, - fs, - output_path, - shuffle, - out_files_per_proc, - cat_names, - cont_names, - label_names, - output_format, - self.client, - num_threads, - ) - return - - # Default (shuffle=None and out_files_per_proc=None) - # Just use `dask_cudf.to_parquet` - fut = ddf.to_parquet(output_path, compression=None, write_index=False, compute=False) - if self.client is None: - fut.compute(scheduler="synchronous") - else: - fut.compute() +def _transform_ddf(ddf, column_group, parent_only=False): + return ddf.map_partitions( + lambda gdf: _transform_partition(gdf, column_group, parent_only), + meta=cudf.DataFrame({k: [] for k in column_group.columns}), + ) + + +def _transform_partition(root_gdf, column_group, parent_only=False): + """ Transforms a single partition by appyling all operators in a ColumnGroup """ + # collect dependencies recursively if we have parents + if column_group.parents: + gdf = cudf.DataFrame() + for parent in column_group.parents: + parent_gdf = _transform_partition(root_gdf, parent) + for column in parent.columns: + gdf[column] = parent_gdf[column] + else: + # otherwise select the input from the root gdf + gdf = root_gdf[list(flatten(column_group.columns))] + + # apply the operator if necessary + if column_group.op and not parent_only: + input_column_names = [col for parent in column_group.parents for col in parent.columns] + try: + gdf = column_group.op.transform(input_column_names, gdf) + except Exception: + LOG.exception("Failed to transform operator %s", column_group.op) + raise + + # dask needs output to be in the same order defined as meta, reorder partitions here + # this also selects columns (handling the case of removing columns from the output using + # "-" overload) + output = cudf.DataFrame() + if column_group.op: + print("transform", column_group.columns, column_group.op, column_group.op.encode_type) + print(gdf) + print(column_group.columns) + for column in column_group.columns: + output[column] = gdf[column] + return output diff --git a/tests/unit/test_column_similarity.py b/tests/unit/test_column_similarity.py index e6130d08bb9..f9d56fc302e 100644 --- a/tests/unit/test_column_similarity.py +++ b/tests/unit/test_column_similarity.py @@ -36,13 +36,14 @@ def test_column_similarity(on_device, metric): ) input_df = cudf.DataFrame({"left": [0, 0, 0, 0, 4], "right": [0, 1, 2, 3, 5]}) - op = ColumnSimilarity("output", "left", categories, "right", metric=metric, on_device=on_device) - workflow = nvtabular.Workflow(cat_names=["left", "right"], cont_names=[], label_name=[]) - workflow.add_feature(op) - workflow.apply(nvtabular.Dataset(input_df), output_path=None) - df = workflow.get_ddf().compute() - output = df.output.values + sim_features = [["left", "right"]] >> ColumnSimilarity( + categories, metric=metric, on_device=on_device + ) + workflow = nvtabular.Workflow(sim_features) + + df = workflow.transform(nvtabular.Dataset(input_df)).to_ddf().compute() + output = df["left_right_sim"].values if metric in ("tfidf", "cosine"): # distance from document 0 to itself should be 1, since these 
metrics are fully normalized assert float(output[0]) == pytest.approx(1) @@ -54,12 +55,11 @@ def test_column_similarity(on_device, metric): assert output[4] != 0 # make sure that we can operate multiple times on the same matrix correctly - op = ColumnSimilarity( - "output", "left", categories, "right", metric="inner", on_device=on_device + sim_features = [["left", "right"]] >> ColumnSimilarity( + categories, metric="inner", on_device=on_device ) + workflow = nvtabular.Workflow(sim_features) + + df = workflow.transform(nvtabular.Dataset(input_df)).to_ddf().compute() - workflow = nvtabular.Workflow(cat_names=["left", "right"], cont_names=[], label_name=[]) - workflow.add_feature(op) - workflow.apply(nvtabular.Dataset(df), output_path=None) - df = workflow.get_ddf().compute() - assert float(df.output.values[0]) == pytest.approx(3) + assert float(df["left_right_sim"].values[0]) == pytest.approx(3) diff --git a/tests/unit/test_dask_nvt.py b/tests/unit/test_dask_nvt.py index 2acc79614a8..806caf5ad8a 100644 --- a/tests/unit/test_dask_nvt.py +++ b/tests/unit/test_dask_nvt.py @@ -71,35 +71,32 @@ def test_dask_workflow_api_dlrm( cont_names = ["x", "y", "id"] label_name = ["label"] - processor = Workflow( - client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name + cats = cat_names >> ops.Categorify( + freq_threshold=freq_threshold, out_path=str(tmpdir), cat_cache=cat_cache, on_host=on_host ) - processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()]) - processor.add_preprocess( - ops.Categorify( - freq_threshold=freq_threshold, - out_path=str(tmpdir), - cat_cache=cat_cache, - on_host=on_host, - ) - ) - processor.finalize() + conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp() + + workflow = Workflow(cats + conts + label_name, client=client) if engine in ("parquet", "csv"): dataset = Dataset(paths, part_mem_fraction=part_mem_fraction) else: dataset = Dataset(paths, names=allcols_csv, part_mem_fraction=part_mem_fraction) + output_path = os.path.join(tmpdir, "processed") - processor.apply(dataset, output_path=output_path, shuffle=shuffle) + + transformed = workflow.fit_transform(dataset) + transformed.to_parquet(output_path=output_path, shuffle=shuffle) # Can still access the final ddf if we didn't shuffle if not shuffle: - result = processor.get_ddf().compute() + result = transformed.to_ddf().compute() assert len(df0) == len(result) assert result["x"].min() == 0.0 assert result["x"].isna().sum() == 0 assert result["y"].min() == 0.0 + assert result["y"].isna().sum() == 0 # Check category counts @@ -124,7 +121,6 @@ def test_dask_workflow_api_dlrm( assert_eq(result[col], df_disk[col]) else: - # Read back from disk df_disk = dask_cudf.read_parquet(output_path, index=False).compute() assert len(df0) == len(df_disk) @@ -230,80 +226,6 @@ def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use assert "name-string_x_sum" in result.columns -@pytest.mark.parametrize("engine", ["parquet"]) -def test_dask_minmax_dummyop(client, tmpdir, datasets, engine): - - paths = glob.glob(str(datasets[engine]) + "/*." 
+ engine.split("-")[0]) - cat_names = ["name-cat", "name-string"] - cont_names = ["x", "y", "id"] - label_name = ["label"] - - class DummyOp(ops.DFOperator): - - default_in, default_out = "continuous", "continuous" - - @property - def req_stats(self): - return [ops.MinMax()] - - def op_logic(self, *args, **kwargs): - return _dummy_op_logic(*args, _id=self._id, **kwargs) - - processor = Workflow( - client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name - ) - processor.add_preprocess(DummyOp()) - processor.finalize() - - dataset = Dataset(paths, engine) - processor.apply(dataset) - result = processor.get_ddf().compute() - - assert math.isclose(result.x.min(), processor.stats["mins"]["x"], rel_tol=1e-3) - assert math.isclose(result.y.min(), processor.stats["mins"]["y"], rel_tol=1e-3) - assert math.isclose(result.id.min(), processor.stats["mins"]["id"], rel_tol=1e-3) - assert math.isclose(result.x.max(), processor.stats["maxs"]["x"], rel_tol=1e-3) - assert math.isclose(result.y.max(), processor.stats["maxs"]["y"], rel_tol=1e-3) - assert math.isclose(result.id.max(), processor.stats["maxs"]["id"], rel_tol=1e-3) - - -@pytest.mark.parametrize("engine", ["parquet"]) -def test_dask_median_dummyop(client, tmpdir, datasets, engine): - - paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0]) - cat_names = ["name-cat", "name-string"] - cont_names = ["x", "y", "id"] - label_name = ["label"] - - class DummyOp(ops.DFOperator): - - default_in, default_out = "continuous", "continuous" - - @property - def req_stats(self): - return [ops.Median()] - - def op_logic(self, *args, **kwargs): - return _dummy_op_logic(*args, _id=self._id, **kwargs) - - processor = Workflow( - client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name - ) - processor.add_preprocess(DummyOp()) - processor.finalize() - - dataset = Dataset(paths, engine) - processor.apply(dataset) - result = processor.get_ddf().compute() - - # TODO: Improve the accuracy! 
"tidigest" with crick could help, - # but current version seems to have cupy/numpy problems here - medians = result[cont_names].quantile(q=0.5) - assert math.isclose(medians["x"], processor.stats["medians"]["x"], abs_tol=1e-1) - assert math.isclose(medians["y"], processor.stats["medians"]["y"], abs_tol=1e-1) - assert math.isclose(medians["id"], processor.stats["medians"]["id"], rel_tol=1e-2) - - @pytest.mark.parametrize("engine", ["parquet"]) def test_dask_normalize(client, tmpdir, datasets, engine): @@ -316,24 +238,19 @@ def test_dask_normalize(client, tmpdir, datasets, engine): cont_names = ["x", "y", "id"] label_name = ["label"] - processor = Workflow( - client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name - ) - processor.add_preprocess([ops.FillMissing(), ops.Normalize()]) - processor.finalize() + normalize = ops.Normalize() + conts = cont_names >> ops.FillMissing() >> normalize + workflow = Workflow(conts + cat_names + label_name, client=client) dataset = Dataset(paths, engine) - processor.apply(dataset) - result = processor.get_ddf().compute() + result = workflow.fit_transform(dataset).to_ddf().compute() # Make sure we collected accurate statistics means = df0[cont_names].mean() stds = df0[cont_names].std() - counts = df0[cont_names].count() for name in cont_names: - assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3) - assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3) - assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3) + assert math.isclose(means[name], normalize.means[name], rel_tol=1e-3) + assert math.isclose(stds[name], normalize.stds[name], rel_tol=1e-3) # New (normalized) means should all be close to zero new_means = result[cont_names].mean() diff --git a/tests/unit/test_io.py b/tests/unit/test_io.py index cd34b7107c6..6b45671414c 100644 --- a/tests/unit/test_io.py +++ b/tests/unit/test_io.py @@ -117,30 +117,27 @@ def test_hugectr( outdir = tmpdir + "/hugectr" os.mkdir(outdir) - # process data - processor = nvt.Workflow( - client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_names - ) - processor.add_feature( - [ - ops.FillMissing(columns=op_columns), - ops.Clip(min_value=0, columns=op_columns), - ops.LogOp(), - ] - ) - processor.add_preprocess(ops.Normalize()) - processor.add_preprocess(ops.Categorify()) - processor.finalize() - - # apply the workflow and write out the dataset - processor.apply( - dataset, - output_path=outdir, - out_files_per_proc=nfiles, - output_format=output_format, - shuffle=None, - num_io_threads=num_io_threads, - ) + conts = nvt.ColumnGroup(cont_names) >> ops.Normalize + cats = nvt.ColumnGroup(cat_names) >> ops.Categorify + + workflow = nvt.Workflow(conts + cats + label_names) + transformed = workflow.fit_transform(dataset) + + if output_format == "hugectr": + transformed.to_hugectr( + cats=cat_names, + conts=cont_names, + labels=label_names, + output_path=outdir, + out_files_per_proc=nfiles, + num_thread=num_io_threads, + ) + else: + transformed.to_parquet( + output_path=outdir, + out_files_per_proc=nfiles, + num_threads=num_io_threads, + ) # Check for _file_list.txt assert os.path.isfile(outdir + "/_file_list.txt") @@ -241,18 +238,12 @@ def test_multifile_parquet(tmpdir, dataset, df, engine, num_io_threads, nfiles, cont_names = ["x", "y"] label_names = ["label"] columns = cat_names + cont_names + label_names + workflow = nvt.Workflow(nvt.ColumnGroup(columns)) outdir = str(tmpdir.mkdir("out")) - - processor = 
nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names) - processor.finalize() - processor.apply( - nvt.Dataset(df), - output_format="parquet", - output_path=outdir, - out_files_per_proc=nfiles, - num_io_threads=num_io_threads, - shuffle=shuffle, + transformed = workflow.transform(nvt.Dataset(df)) + transformed.to_parquet( + output_path=outdir, num_threads=num_io_threads, shuffle=shuffle, out_files_per_proc=nfiles ) # Check that our output data is exactly the same @@ -283,15 +274,11 @@ def test_parquet_lists(tmpdir, freq_threshold, shuffle, out_files_per_proc): df.to_parquet(filename) cat_names = ["Authors", "Engaging User"] - cont_names = [] - label_name = ["Post"] - - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - processor.add_preprocess(ops.Categorify(out_path=str(output_dir))) - processor.finalize() - processor.apply( - nvt.Dataset(filename), - output_format="parquet", + cats = nvt.ColumnGroup(cat_names) >> ops.Categorify(out_path=str(output_dir)) + workflow = nvt.Workflow(cats) + + transformed = workflow.fit_transform(nvt.Dataset(filename)) + transformed.to_parquet( output_path=output_dir, shuffle=shuffle, out_files_per_proc=out_files_per_proc, diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py index 3799b0bc1bc..fc5a024ad63 100644 --- a/tests/unit/test_ops.py +++ b/tests/unit/test_ops.py @@ -728,14 +728,13 @@ def test_categorify_lists(tmpdir, freq_threshold): } ) cat_names = ["Authors", "Engaging User"] - cont_names = [] label_name = ["Post"] - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - processor.add_preprocess(ops.Categorify(out_path=str(tmpdir), freq_threshold=freq_threshold)) - processor.finalize() - processor.apply(nvt.Dataset(df), output_format=None) - df_out = processor.get_ddf().compute(scheduler="synchronous") + cat_features = cat_names >> ops.Categorify(out_path=str(tmpdir), freq_threshold=freq_threshold) + + workflow = nvt.Workflow(cat_features + label_name) + df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute() + print(df_out) # Columns are encoded independently if freq_threshold < 2: @@ -744,10 +743,9 @@ def test_categorify_lists(tmpdir, freq_threshold): assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 0], [0, 2], [2]] -@pytest.mark.parametrize("groups", [[["Author", "Engaging User"]], None]) +@pytest.mark.parametrize("cat_names", [[["Author", "Engaging User"]], ["Author", "Engaging User"]]) @pytest.mark.parametrize("kind", ["joint", "combo"]) -def test_categorify_multi(tmpdir, groups, kind): - +def test_categorify_multi(tmpdir, cat_names, kind): df = pd.DataFrame( { "Author": ["User_A", "User_E", "User_B", "User_C"], @@ -756,18 +754,15 @@ def test_categorify_multi(tmpdir, groups, kind): } ) - cat_names = ["Author", "Engaging User"] - cont_names = [] label_name = ["Post"] - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) + cats = cat_names >> ops.Categorify(out_path=str(tmpdir), encode_type=kind) - processor.add_preprocess(ops.Categorify(columns=groups, out_path=str(tmpdir), encode_type=kind)) - processor.finalize() - processor.apply(nvt.Dataset(df), output_format=None) - df_out = processor.get_ddf().compute(scheduler="synchronous") + workflow = nvt.Workflow(cats + label_name) + + df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous") - if groups: + if len(cat_names) == 1: if kind == "joint": # Columns are encoded jointly assert 
df_out["Author"].to_arrow().to_pylist() == [1, 5, 2, 3] @@ -782,7 +777,7 @@ def test_categorify_multi(tmpdir, groups, kind): def test_categorify_multi_combo(tmpdir): - groups = [["Author", "Engaging User"], ["Author"], "Engaging User"] + cat_names = [["Author", "Engaging User"], ["Author"], "Engaging User"] kind = "combo" df = pd.DataFrame( { @@ -792,16 +787,10 @@ def test_categorify_multi_combo(tmpdir): } ) - cat_names = ["Author", "Engaging User"] - cont_names = [] label_name = ["Post"] - - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - - processor.add_preprocess(ops.Categorify(columns=groups, out_path=str(tmpdir), encode_type=kind)) - processor.finalize() - processor.apply(nvt.Dataset(df), output_format=None) - df_out = processor.get_ddf().compute(scheduler="synchronous") + cats = cat_names >> ops.Categorify(out_path=str(tmpdir), encode_type=kind) + workflow = nvt.Workflow(cats + label_name) + df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous") # Column combinations are encoded assert df_out["Author"].to_arrow().to_pylist() == [1, 4, 2, 3] diff --git a/tests/unit/test_s3.py b/tests/unit/test_s3.py index 0286f4e1bcc..1d9e4afd8f6 100644 --- a/tests/unit/test_s3.py +++ b/tests/unit/test_s3.py @@ -70,11 +70,8 @@ def test_s3_dataset(s3, paths, engine, df): cont_names = ["x", "y", "id"] label_name = ["label"] - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) + conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp() + cats = cat_names >> ops.Categorify(cat_cache="host") - processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()]) - processor.add_preprocess(ops.Normalize()) - processor.add_preprocess(ops.Categorify(cat_cache="host")) - processor.finalize() - - processor.update_stats(dataset) + processor = nvt.Workflow(conts + cats + label_name) + processor.fit(dataset) diff --git a/tests/unit/test_workflow.py b/tests/unit/test_workflow.py index ac3243cd7e9..b6946ca7e71 100644 --- a/tests/unit/test_workflow.py +++ b/tests/unit/test_workflow.py @@ -26,45 +26,35 @@ from pandas.api.types import is_integer_dtype import nvtabular as nvt +from nvtabular import ColumnGroup, Dataset, Workflow from nvtabular import ops as ops -from nvtabular.io import Dataset from tests.conftest import get_cats, mycols_csv @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @pytest.mark.parametrize("engine", ["parquet", "csv", "csv-no-header"]) @pytest.mark.parametrize("dump", [True, False]) -@pytest.mark.parametrize("op_columns", [["x"], None]) @pytest.mark.parametrize("use_client", [True, False]) -def test_gpu_workflow_api( - tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, op_columns, use_client -): +def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, use_client): cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"] cont_names = ["x", "y", "id"] label_name = ["label"] - processor = nvt.Workflow( - cat_names=cat_names, - cont_names=cont_names, - label_name=label_name, - client=client if use_client else None, - ) + norms = ops.Normalize() + cat_features = cat_names >> ops.Categorify(cat_cache="host") + cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp >> norms - processor.add_feature( - [ops.FillMissing(), ops.Clip(min_value=0, columns=op_columns), ops.LogOp()] + workflow = Workflow( + cat_features + cont_features + label_name, 
client=client if use_client else None ) - processor.add_preprocess(ops.Normalize()) - processor.add_preprocess(ops.Categorify(cat_cache="host")) - processor.finalize() - assert len(processor.phases) == 2 - processor.update_stats(dataset) + workflow.fit(dataset) if dump: config_file = tmpdir + "/temp.yaml" - processor.save_stats(config_file) - processor.clear_stats() - processor.load_stats(config_file) + workflow.save_stats(config_file) + workflow.clear_stats() + workflow.load_stats(config_file) def get_norms(tar: cudf.Series): gdf = tar.fillna(0) @@ -73,27 +63,27 @@ def get_norms(tar: cudf.Series): return gdf # Check mean and std - No good right now we have to add all other changes; Clip, Log - - if not op_columns: - assert math.isclose(get_norms(df.y).mean(), processor.stats["means"]["y"], rel_tol=1e-1) - assert math.isclose(get_norms(df.y).std(), processor.stats["stds"]["y"], rel_tol=1e-1) - assert math.isclose(get_norms(df.x).mean(), processor.stats["means"]["x"], rel_tol=1e-1) - assert math.isclose(get_norms(df.x).std(), processor.stats["stds"]["x"], rel_tol=1e-1) + assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-1) + assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-1) + assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-1) + assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-1) # Check that categories match if engine == "parquet": cats_expected0 = df["name-cat"].unique().values_host - cats0 = get_cats(processor, "name-cat") + cats0 = get_cats(workflow, "name-cat") # adding the None entry as a string because of move from gpu assert cats0.tolist() == [None] + cats_expected0.tolist() cats_expected1 = df["name-string"].unique().values_host - cats1 = get_cats(processor, "name-string") + cats1 = get_cats(workflow, "name-string") # adding the None entry as a string because of move from gpu assert cats1.tolist() == [None] + cats_expected1.tolist() # Write to new "shuffled" and "processed" dataset - processor.write_to_dataset( - tmpdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True + workflow.transform(dataset).to_parquet( + tmpdir, + out_files_per_proc=10, + shuffle=nvt.io.Shuffle.PER_PARTITION, ) dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac) @@ -125,27 +115,13 @@ def test_spec_set(tmpdir, client): } ) - p = nvt.Workflow( - cat_names=["ad_id", "source_id", "platform"], - cont_names=["cont"], - label_name=["clicked"], - client=client, - ) - p.add_feature(ops.FillMissing()) - p.add_feature(ops.Normalize()) - p.add_feature(ops.Categorify()) - p.add_feature( - ops.TargetEncoding( - cat_groups=["ad_id", "source_id", "platform"], - cont_target="clicked", - kfold=5, - fold_seed=42, - p_smooth=20, - ) - ) + cats = ColumnGroup(["ad_id", "source_id", "platform"]) + cat_features = cats >> ops.Categorify + cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize + te_features = cats >> ops.TargetEncoding("clicked", kfold=5, fold_seed=42, p_smooth=20) - p.apply(nvt.Dataset(gdf_test), record_stats=True) - assert p.stats + p = Workflow(cat_features + cont_features + te_features, client=client) + p.fit_transform(nvt.Dataset(gdf_test), record_stats=True).to_ddf().compute() @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @@ -312,13 +288,10 @@ def test_parquet_output(client, use_client, tmpdir, shuffle): columns = ["a"] dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1) - processor = 
nvt.Workflow( - cat_names=[], cont_names=columns, label_name=[], client=client if use_client else None - ) - processor.add_preprocess(ops.Normalize()) - processor.finalize() - processor.apply( - dataset, output_path=out_path, shuffle=shuffle, out_files_per_proc=out_files_per_proc + + workflow = nvt.Workflow(columns >> ops.Normalize(), client=client if use_client else None) + workflow.fit_transform(dataset).to_parquet( + output_path=out_path, shuffle=shuffle, out_files_per_proc=out_files_per_proc ) # Check that the number of output files is correct @@ -402,15 +375,12 @@ def test_chaining_1(): ) df["cont01"][:10] = None - workflow = nvt.Workflow( - cat_names=["cat01"], cont_names=["cont01", "cont02"], label_name=["label"] - ) - workflow.add_cont_feature(nvt.ops.FillMissing(columns=["cont01"], replace=True)) - workflow.add_cont_preprocess(nvt.ops.NormalizeMinMax(columns=["cont01", "cont02"])) - workflow.finalize() + cont1 = "cont01" >> ops.FillMissing() + conts = cont1 + "cont02" >> ops.NormalizeMinMax() + workflow = Workflow(conts + "cat01" + "label") + + result = workflow.fit_transform(Dataset(df)).to_ddf().compute() - workflow.apply(nvt.Dataset(df), output_path=None) - result = workflow.get_ddf().compute() assert result["cont01"].max() <= 1.0 assert result["cont02"].max() <= 1.0 @@ -578,3 +548,36 @@ def test_workflow_generate_columns(tmpdir, use_parquet): # just make sure this owrks without errors workflow.apply(dataset, output_path=out_path) + + +def test_transform_geolocation(): + raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split() + data = cudf.DataFrame({"geo_location": raw}) + + geo_location = ColumnGroup(["geo_location"]) + state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state") + country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country") + geo_features = state + country + geo_location >> ops.HashBucket(num_buckets=100) + + # for this workflow we don't have any statoperators, so we can get away without fitting + workflow = Workflow(geo_features) + transformed = workflow.transform(Dataset(data)).to_ddf().compute() + + expected = cudf.DataFrame() + expected["geo_location_state"] = data["geo_location"].str.slice(0, 5).hash_values() % 100 + expected["geo_location_country"] = data["geo_location"].str.slice(0, 2).hash_values() % 100 + expected["geo_location"] = data["geo_location"].hash_values() % 100 + assert_eq(expected, transformed) + + +def test_fit_simple(): + data = cudf.DataFrame({"x": [0, 1, 2, None, 0, 1, 2], "y": [None, 3, 4, 5, 3, 4, 5]}) + dataset = Dataset(data) + + workflow = Workflow(["x", "y"] >> ops.FillMedian >> (lambda x: x * x)) + + workflow.fit(dataset) + transformed = workflow.transform(dataset).to_ddf().compute() + + expected = cudf.DataFrame({"x": [0, 1, 4, 1, 0, 1, 4], "y": [16, 9, 16, 25, 9, 16, 25]}) + assert_eq(expected, transformed) From abc0cb4c3d3a56ed0e27f86e5ce6faaeff16f9e5 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 14 Dec 2020 09:43:19 -0800 Subject: [PATCH 02/23] remove debug print statement --- nvtabular/workflow.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nvtabular/workflow.py b/nvtabular/workflow.py index c95f6158163..4c8df4b889e 100644 --- a/nvtabular/workflow.py +++ b/nvtabular/workflow.py @@ -182,10 +182,6 @@ def _transform_partition(root_gdf, column_group, parent_only=False): # this also selects columns (handling the case of removing columns from the output using # "-" overload) output = cudf.DataFrame() - if column_group.op: - 
print("transform", column_group.columns, column_group.op, column_group.op.encode_type) - print(gdf) - print(column_group.columns) for column in column_group.columns: output[column] = gdf[column] return output From b69ad67dc9450ae43632998fe60cdb1c393c1fe0 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 14 Dec 2020 13:37:51 -0800 Subject: [PATCH 03/23] Fix test_io unittest Also partially fix some tests inside test_workflow --- nvtabular/io/dask.py | 3 +++ tests/conftest.py | 21 +++++++++++------- tests/unit/test_io.py | 7 +++--- tests/unit/test_workflow.py | 44 ++++++++++++++++--------------------- 4 files changed, 39 insertions(+), 36 deletions(-) diff --git a/nvtabular/io/dask.py b/nvtabular/io/dask.py index 55cc885c71d..51ef0d637d0 100644 --- a/nvtabular/io/dask.py +++ b/nvtabular/io/dask.py @@ -136,6 +136,9 @@ def _finish_dataset(client, ddf, output_path, fs, output_format): general_md, special_md = _worker_finish(output_path) # Write metadata on client + if not isinstance(output_path, str): + output_path = str(output_path) + wc, fs = _writer_cls_factory(output_format, output_path) wc.write_general_metadata(general_md, fs, output_path) wc.write_special_metadata(special_md, fs, output_path) diff --git a/tests/conftest.py b/tests/conftest.py index 83992e3422f..23331165dcb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -173,11 +173,16 @@ def dataset(request, paths, engine): return nvtabular.Dataset(paths, part_mem_fraction=gpu_memory_frac, **kwargs) -def get_cats(processor, col, stat_name="categories"): - if isinstance(processor, nvtabular.workflow.Workflow): - filename = processor.stats[stat_name][col] - gdf = cudf.read_parquet(filename) - gdf.reset_index(drop=True, inplace=True) - return gdf[col].values_host - else: - return processor.stats["encoders"][col].get_cats().values_host +def get_cats(workflow, col, stat_name="categories"): + # figure out the categorify node from the workflow graph + cats = [ + cg.op + for cg in nvtabular.column_group.iter_nodes([workflow.column_group]) + if isinstance(cg.op, nvtabular.ops.Categorify) + ] + if len(cats) != 1: + raise RuntimeError("Found {} categorical ops, expected 1", len(cats)) + filename = cats[0].categories[col] + gdf = cudf.read_parquet(filename) + gdf.reset_index(drop=True, inplace=True) + return gdf[col].values_host diff --git a/tests/unit/test_io.py b/tests/unit/test_io.py index 6b45671414c..be2678e9d17 100644 --- a/tests/unit/test_io.py +++ b/tests/unit/test_io.py @@ -116,6 +116,7 @@ def test_hugectr( ext = "" outdir = tmpdir + "/hugectr" os.mkdir(outdir) + outdir = str(outdir) conts = nvt.ColumnGroup(cont_names) >> ops.Normalize cats = nvt.ColumnGroup(cat_names) >> ops.Categorify @@ -130,7 +131,7 @@ def test_hugectr( labels=label_names, output_path=outdir, out_files_per_proc=nfiles, - num_thread=num_io_threads, + num_threads=num_io_threads, ) else: transformed.to_parquet( @@ -274,8 +275,8 @@ def test_parquet_lists(tmpdir, freq_threshold, shuffle, out_files_per_proc): df.to_parquet(filename) cat_names = ["Authors", "Engaging User"] - cats = nvt.ColumnGroup(cat_names) >> ops.Categorify(out_path=str(output_dir)) - workflow = nvt.Workflow(cats) + cats = cat_names >> ops.Categorify(out_path=str(output_dir)) + workflow = nvt.Workflow(cats + "Post") transformed = workflow.fit_transform(nvt.Dataset(filename)) transformed.to_parquet( diff --git a/tests/unit/test_workflow.py b/tests/unit/test_workflow.py index b6946ca7e71..d29d852818b 100644 --- a/tests/unit/test_workflow.py +++ b/tests/unit/test_workflow.py @@ -51,6 +51,7 
@@ def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine, workflow.fit(dataset) if dump: + # TODO: load/save stats config_file = tmpdir + "/temp.yaml" workflow.save_stats(config_file) workflow.clear_stats() @@ -121,7 +122,7 @@ def test_spec_set(tmpdir, client): te_features = cats >> ops.TargetEncoding("clicked", kfold=5, fold_seed=42, p_smooth=20) p = Workflow(cat_features + cont_features + te_features, client=client) - p.fit_transform(nvt.Dataset(gdf_test), record_stats=True).to_ddf().compute() + p.fit_transform(nvt.Dataset(gdf_test)).to_ddf().compute() @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @@ -132,50 +133,43 @@ def test_gpu_workflow(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump cont_names = ["x", "y", "id"] label_name = ["label"] - config = nvt.workflow.get_new_config() - config["FE"]["continuous"] = [ops.FillMissing(), ops.Clip(min_value=0)] - config["PP"]["continuous"] = [[ops.FillMissing(), ops.Clip(min_value=0), ops.Normalize()]] - config["PP"]["categorical"] = [ops.Categorify()] - - processor = nvt.Workflow( - cat_names=cat_names, - cont_names=cont_names, - label_name=label_name, - config=config, - client=client, - ) + norms = ops.Normalize() + conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms + cats = cat_names >> ops.Categorify() + workflow = nvt.Workflow(conts + cats + label_name) - processor.update_stats(dataset) + workflow.fit(dataset) if dump: + # TODO: serialization config_file = tmpdir + "/temp.yaml" - processor.save_stats(config_file) - processor.clear_stats() - processor.load_stats(config_file) + workflow.save_stats(config_file) + workflow.clear_stats() + workflow.load_stats(config_file) def get_norms(tar: cudf.Series): gdf = tar.fillna(0) gdf = gdf * (gdf >= 0).astype("int") return gdf - assert math.isclose(get_norms(df.x).mean(), processor.stats["means"]["x"], rel_tol=1e-4) - assert math.isclose(get_norms(df.y).mean(), processor.stats["means"]["y"], rel_tol=1e-4) - assert math.isclose(get_norms(df.x).std(), processor.stats["stds"]["x"], rel_tol=1e-3) - assert math.isclose(get_norms(df.y).std(), processor.stats["stds"]["y"], rel_tol=1e-3) + assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-4) + assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-4) + assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-3) + assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-3) # Check that categories match if engine == "parquet": cats_expected0 = df["name-cat"].unique().values_host - cats0 = get_cats(processor, "name-cat") + cats0 = get_cats(workflow, "name-cat") # adding the None entry as a string because of move from gpu assert cats0.tolist() == [None] + cats_expected0.tolist() cats_expected1 = df["name-string"].unique().values_host - cats1 = get_cats(processor, "name-string") + cats1 = get_cats(workflow, "name-string") # adding the None entry as a string because of move from gpu assert cats1.tolist() == [None] + cats_expected1.tolist() # Write to new "shuffled" and "processed" dataset - processor.write_to_dataset( - tmpdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True + workflow.transform(dataset).to_parquet( + output_path=tmpdir, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION ) dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac) From 2e918cda00f211803759c120a1cc909178938e5b Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 
14 Dec 2020 16:19:07 -0800 Subject: [PATCH 04/23] Handle multi-column joint/combo categorify --- nvtabular/column_group.py | 27 ++++++++++++--- nvtabular/ops/categorify.py | 41 +++++++++++++++++++--- nvtabular/workflow.py | 68 ++++++++++++++++++++----------------- tests/unit/test_ops.py | 26 +++++--------- 4 files changed, 104 insertions(+), 58 deletions(-) diff --git a/nvtabular/column_group.py b/nvtabular/column_group.py index cc7eb655ce3..0caa1a378df 100644 --- a/nvtabular/column_group.py +++ b/nvtabular/column_group.py @@ -15,6 +15,8 @@ # import collections.abc +from dask.core import flatten + from nvtabular.ops import LambdaOp, Operator @@ -26,15 +28,17 @@ class ColumnGroup: Parameters ---------- - columns: list of str - The columns to select from the input Dataset + columns: list of (str or tuple of str) + The columns to select from the input Dataset. The elements of this list are strings + indicating the column names in most cases, but can also be tuples of strings + for feature crosses. """ def __init__(self, columns): if isinstance(columns, str): - columns = [columns] - - self.columns = columns + self.columns = [columns] + else: + self.columns = [_convert_col(col) for col in columns] self.parents = [] self.children = [] self.op = None @@ -140,6 +144,10 @@ def __repr__(self): output = " output" if not self.children else "" return f"" + @property + def flattened_columns(self): + return list(flatten(self.columns, container=tuple)) + @property def label(self): if self.op: @@ -228,3 +236,12 @@ def _merge_add_nodes(graph): queue.extend(current.parents) return graph + + +def _convert_col(col): + if isinstance(col, (str, tuple)): + return col + elif isinstance(col, list): + return tuple(col) + else: + raise ValueError("Invalid column value for ColumnGroup: %s", col) diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index d1e96a8213e..d830384165e 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -1,4 +1,3 @@ -# # Copyright (c) 2020, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -234,6 +233,28 @@ def __init__( @annotate("Categorify_transform", color="darkgreen", domain="nvt_python") def fit(self, columns, ddf): + # User passed in a list of column groups. We need to figure out + # if this list contains any multi-column groups, and if there + # are any (obvious) problems with these groups + columns_uniq = list(set(flatten(columns, container=tuple))) + columns_all = list(flatten(columns, container=tuple)) + if sorted(columns_all) != sorted(columns_uniq) and self.encode_type == "joint": + # If we are doing "joint" encoding, there must be unique mapping + # between input column names and column groups. Otherwise, more + # than one unique-value table could be used to encode the same + # column. + raise ValueError("Same column name included in multiple groups.") + + for group in columns: + if isinstance(group, tuple) and len(group) > 1: + # For multi-column groups, we concatenate column names + # to get the "group" name. 
+ name = _make_name(*group, sep=self.name_sep) + for col in group: + self.storage_name[col] = name + + # convert tuples to lists + columns = [list(c) if isinstance(c, tuple) else c for c in columns] dsk, key = _category_stats( ddf, columns, @@ -274,7 +295,7 @@ def transform( else: # Case (1) & (2) - Simple 1-to-1 mapping multi_col_group = {} - cat_names = columns + cat_names = list(flatten(columns, container=tuple)) # Encode each column-group separately for name in cat_names: @@ -284,11 +305,15 @@ def transform( # Storage name may be different than group for case (2) # Only use the "aliased" `storage_name` if we are dealing with # a multi-column group, or if we are doing joint encoding + if use_name != name or self.encode_type == "joint": storage_name = self.storage_name.get(name, name) else: storage_name = name + if isinstance(use_name, tuple): + use_name = list(use_name) + path = self.categories[storage_name] """ TODO ?? if not self.column_groups and _is_list_col([name], gdf): @@ -321,7 +346,7 @@ def output_column_names(self, columns): if self.encode_type == "combo": cat_names, _ = _get_multicolumn_names(columns, columns, self.name_sep) return cat_names - return list(flatten(columns)) + return list(flatten(columns, container=tuple)) def _get_embedding_order(cat_names): @@ -400,6 +425,8 @@ def _top_level_groupby( output = {} k = 0 for i, cat_col_group in enumerate(cat_col_groups): + if isinstance(cat_col_group, tuple): + cat_col_group = list(cat_col_group) if isinstance(cat_col_group, str): cat_col_group = [cat_col_group] @@ -466,6 +493,8 @@ def _mid_level_groupby( ): if isinstance(col_group, str): col_group = [col_group] + elif isinstance(col_group, tuple): + col_group = list(col_group) if concat_groups and len(col_group) > 1: col_group = [_make_name(*col_group, sep=name_sep)] @@ -652,6 +681,9 @@ def _groupby_to_disk( tw = {} for col in col_groups: col = [col] if isinstance(col, str) else col + if isinstance(col, tuple): + col = list(col) + col_str = _make_name(*col, sep=name_sep) if tree_width is None: tw[col_str] = 8 @@ -798,7 +830,6 @@ def _encode( encode_type="joint", cat_names=None, ): - if isinstance(buckets, int): buckets = {name: buckets for name in cat_names} @@ -890,7 +921,7 @@ def _get_multicolumn_names(column_groups, gdf_columns, name_sep): cat_names = [] multi_col_group = {} for col_group in column_groups: - if isinstance(col_group, list): + if isinstance(col_group, (list, tuple)): name = _make_name(*col_group, sep=name_sep) if name not in cat_names: cat_names.append(name) diff --git a/nvtabular/workflow.py b/nvtabular/workflow.py index 4c8df4b889e..63eefe0035b 100644 --- a/nvtabular/workflow.py +++ b/nvtabular/workflow.py @@ -99,7 +99,7 @@ def get_stat_ops(nodes): for column_group in current_phase: # apply transforms necessary for the inputs to the current column group, ignoring # the transforms from the statop itself - transformed_ddf = _transform_ddf(ddf, column_group, parent_only=True) + transformed_ddf = _transform_ddf(ddf, column_group.parents) input_column_names = [ col for parent in column_group.parents for col in parent.columns @@ -139,7 +139,7 @@ def fit_transform(self, dataset): def _input_columns(self): input_nodes = set(node for node in iter_nodes([self.column_group]) if not node.parents) - return list(set(col for node in input_nodes for col in flatten(node.columns))) + return list(set(col for node in input_nodes for col in node.flattened_columns)) def _clear_worker_cache(self): # Clear worker caches to be "safe" @@ -149,39 +149,45 @@ def 
_clear_worker_cache(self): clean_worker_cache() -def _transform_ddf(ddf, column_group, parent_only=False): +def _transform_ddf(ddf, column_groups): + if isinstance(column_groups, ColumnGroup): + column_groups = [column_groups] + + columns = list(flatten(cg.flattened_columns for cg in column_groups)) + return ddf.map_partitions( - lambda gdf: _transform_partition(gdf, column_group, parent_only), - meta=cudf.DataFrame({k: [] for k in column_group.columns}), + lambda gdf: _transform_partition(gdf, column_groups), + meta=cudf.DataFrame({k: [] for k in columns}), ) -def _transform_partition(root_gdf, column_group, parent_only=False): +def _transform_partition(root_gdf, column_groups): """ Transforms a single partition by appyling all operators in a ColumnGroup """ - # collect dependencies recursively if we have parents - if column_group.parents: - gdf = cudf.DataFrame() - for parent in column_group.parents: - parent_gdf = _transform_partition(root_gdf, parent) - for column in parent.columns: - gdf[column] = parent_gdf[column] - else: - # otherwise select the input from the root gdf - gdf = root_gdf[list(flatten(column_group.columns))] - - # apply the operator if necessary - if column_group.op and not parent_only: - input_column_names = [col for parent in column_group.parents for col in parent.columns] - try: - gdf = column_group.op.transform(input_column_names, gdf) - except Exception: - LOG.exception("Failed to transform operator %s", column_group.op) - raise - - # dask needs output to be in the same order defined as meta, reorder partitions here - # this also selects columns (handling the case of removing columns from the output using - # "-" overload) output = cudf.DataFrame() - for column in column_group.columns: - output[column] = gdf[column] + for column_group in column_groups: + # collect dependencies recursively if we have parents + if column_group.parents: + gdf = cudf.DataFrame() + for parent in column_group.parents: + parent_gdf = _transform_partition(root_gdf, [parent]) + for column in parent.flattened_columns: + gdf[column] = parent_gdf[column] + else: + # otherwise select the input from the root gdf + gdf = root_gdf[column_group.flattened_columns] + + # apply the operator if necessary + if column_group.op: + input_column_names = [col for parent in column_group.parents for col in parent.columns] + try: + gdf = column_group.op.transform(input_column_names, gdf) + except Exception: + LOG.exception("Failed to transform operator %s", column_group.op) + raise + + # dask needs output to be in the same order defined as meta, reorder partitions here + # this also selects columns (handling the case of removing columns from the output using + # "-" overload) + for column in column_group.flattened_columns: + output[column] = gdf[column] return output diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py index fc5a024ad63..2f321520feb 100644 --- a/tests/unit/test_ops.py +++ b/tests/unit/test_ops.py @@ -734,7 +734,6 @@ def test_categorify_lists(tmpdir, freq_threshold): workflow = nvt.Workflow(cat_features + label_name) df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute() - print(df_out) # Columns are encoded independently if freq_threshold < 2: @@ -835,23 +834,16 @@ def test_categorify_freq_limit(tmpdir, freq_limit, buckets, search_sort): if (not search_sort and isfreqthr) or (search_sort and not isfreqthr): cat_names = ["Author", "Engaging User"] - cont_names = [] - label_name = [] - - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, 
label_name=label_name) - - processor.add_preprocess( - ops.Categorify( - columns=cat_names, - freq_threshold=freq_limit, - out_path=str(tmpdir), - search_sorted=search_sort, - num_buckets=buckets, - ) + + cats = cat_names >> ops.Categorify( + freq_threshold=freq_limit, + out_path=str(tmpdir), + search_sorted=search_sort, + num_buckets=buckets, ) - processor.finalize() - processor.apply(nvt.Dataset(df), output_format=None) - df_out = processor.get_ddf().compute(scheduler="synchronous") + + workflow = nvt.Workflow(cats) + df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous") if freq_limit and not buckets: # Column combinations are encoded From 23a4eb9feb733a4d76b33b93c79fba80d7955daa Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 14 Dec 2020 23:00:06 -0800 Subject: [PATCH 05/23] Update JoinGroupby --- nvtabular/column_group.py | 11 ++++ nvtabular/ops/filter.py | 2 +- nvtabular/ops/join_groupby.py | 105 +++++++++++++++++++--------------- nvtabular/workflow.py | 11 ++-- tests/unit/test_dask_nvt.py | 40 ++++--------- tests/unit/test_ops.py | 16 ++---- 6 files changed, 92 insertions(+), 93 deletions(-) diff --git a/nvtabular/column_group.py b/nvtabular/column_group.py index 0caa1a378df..3a2939f6aec 100644 --- a/nvtabular/column_group.py +++ b/nvtabular/column_group.py @@ -43,6 +43,7 @@ def __init__(self, columns): self.children = [] self.op = None self.kind = None + self.dependencies = None def __rshift__(self, operator): """Transforms this ColumnGroup by applying an Operator @@ -72,6 +73,7 @@ def __rshift__(self, operator): dependencies = operator.dependencies() if dependencies: + child.dependencies = set() if not isinstance(dependencies, collections.abc.Sequence): dependencies = [dependencies] @@ -80,6 +82,7 @@ def __rshift__(self, operator): dependency = ColumnGroup(dependency) dependency.children.append(child) child.parents.append(dependency) + child.dependencies.add(dependency) return child @@ -148,6 +151,14 @@ def __repr__(self): def flattened_columns(self): return list(flatten(self.columns, container=tuple)) + @property + def input_column_names(self): + """ returns the names of columns in the main chain """ + dependencies = self.dependencies or set() + return [ + col for parent in self.parents for col in parent.columns if parent not in dependencies + ] + @property def label(self): if self.op: diff --git a/nvtabular/ops/filter.py b/nvtabular/ops/filter.py index e289196b60c..3ee88547f03 100644 --- a/nvtabular/ops/filter.py +++ b/nvtabular/ops/filter.py @@ -42,7 +42,7 @@ def __init__(self, f): self.f = f @annotate("Filter_op", color="darkgreen", domain="nvt_python") - def apply_op( + def transform( self, columns, gdf: cudf.DataFrame, diff --git a/nvtabular/ops/join_groupby.py b/nvtabular/ops/join_groupby.py index 064a4d4a2cb..6077a2f952e 100644 --- a/nvtabular/ops/join_groupby.py +++ b/nvtabular/ops/join_groupby.py @@ -15,14 +15,12 @@ # import cudf import cupy -from dask.core import flatten +from dask.delayed import Delayed from . 
import categorify as nvt_cat -from .groupby_statistics import GroupbyStatistics from .stat_operator import StatOperator -# TODO: statoperator new api class JoinGroupby(StatOperator): """ One of the ways to create new features is to calculate @@ -78,7 +76,6 @@ def __init__( self, cont_names=None, stats=["count"], - columns=None, tree_width=None, cat_cache="host", out_path=None, @@ -86,64 +83,64 @@ def __init__( name_sep="_", stat_name=None, ): - self.column_groups = None + super().__init__() + self.storage_name = {} self.name_sep = name_sep - if isinstance(columns, str): - columns = [columns] - if isinstance(columns, list): - self.column_groups = columns - columns = list(set(flatten(columns, container=list))) - for group in self.column_groups: - if isinstance(group, list) and len(group) > 1: - name = nvt_cat._make_name(*group, sep=self.name_sep) - for col in group: - self.storage_name[col] = name - - super().__init__(columns=columns, replace=False) self.cont_names = cont_names self.stats = stats self.tree_width = tree_width self.out_path = out_path self.on_host = on_host self.cat_cache = cat_cache - self.stat_name = stat_name or "gb_categories" - - @property - def req_stats(self): - return [ - GroupbyStatistics( - columns=self.column_groups or self.columns, - concat_groups=False, - cont_names=self.cont_names, - stats=self.stats, - tree_width=self.tree_width, - out_path=self.out_path, - on_host=self.on_host, - stat_name=self.stat_name, - name_sep=self.name_sep, - ) - ] + self.categories = {} + + supported_ops = ["count", "sum", "mean", "std", "var", "min", "max"] + for op in self.stats: + if op not in supported_ops: + raise ValueError(op + " operation is not supported.") + + def fit(self, columns, ddf): + if isinstance(columns, list): + for group in columns: + if isinstance(group, (list, tuple)) and len(group) > 1: + name = nvt_cat._make_name(*group, sep=self.name_sep) + for col in group: + self.storage_name[col] = name + + dsk, key = nvt_cat._category_stats( + ddf, + columns, + self.cont_names, + self.stats, + self.out_path, + 0, + self.tree_width, + self.on_host, + concat_groups=False, + name_sep=self.name_sep, + ) + return Delayed(key, dsk) - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): + def fit_finalize(self, dask_stats): + for col in dask_stats: + self.categories[col] = dask_stats[col] + def transform(self, columns, gdf: cudf.DataFrame): new_gdf = cudf.DataFrame() tmp = "__tmp__" # Temporary column for sorting gdf[tmp] = cupy.arange(len(gdf), dtype="int32") - if self.column_groups: - cat_names, multi_col_group = nvt_cat._get_multicolumn_names( - self.column_groups, gdf.columns, self.name_sep - ) - else: - multi_col_group = {} - cat_names = [name for name in target_columns if name in gdf.columns] + + cat_names, multi_col_group = nvt_cat._get_multicolumn_names( + columns, gdf.columns, self.name_sep + ) for name in cat_names: storage_name = self.storage_name.get(name, name) name = multi_col_group.get(name, name) - path = stats_context[self.stat_name][storage_name] - selection_l = name.copy() if isinstance(name, list) else [name] - selection_r = name if isinstance(name, list) else [storage_name] + path = self.categories[storage_name] + selection_l = list(name) if isinstance(name, tuple) else [name] + selection_r = list(name) if isinstance(name, tuple) else [storage_name] stat_gdf = nvt_cat._read_groupby_stat_df(path, storage_name, self.cat_cache) tran_gdf = gdf[selection_l + [tmp]].merge( @@ -155,3 +152,21 @@ def op_logic(self, gdf: cudf.DataFrame, 
target_columns: list, stats_context=None new_gdf[new_cols] = tran_gdf[new_cols].reset_index(drop=True) gdf.drop(columns=[tmp], inplace=True) return new_gdf + + def dependencies(self): + return self.cont_names + + def output_column_names(self, columns): + # TODO: the names here are defined in categorify/mid_level_groupby + # refactor to have a common implementation + output = [] + for name in columns: + if isinstance(name, (tuple, list)): + name = nvt_cat._make_name(*name, sep=self.name_sep) + for cont in self.cont_names: + for stat in self.stats: + if stat == "count": + output.append(f"{name}_{stat}") + else: + output.append(f"{name}_{cont}_{stat}") + return output diff --git a/nvtabular/workflow.py b/nvtabular/workflow.py index 63eefe0035b..1878b04dc85 100644 --- a/nvtabular/workflow.py +++ b/nvtabular/workflow.py @@ -101,11 +101,9 @@ def get_stat_ops(nodes): # the transforms from the statop itself transformed_ddf = _transform_ddf(ddf, column_group.parents) - input_column_names = [ - col for parent in column_group.parents for col in parent.columns - ] - stats.append(column_group.op.fit(input_column_names, transformed_ddf)) - ops.append(column_group.op) + op = column_group.op + stats.append(op.fit(column_group.input_column_names, transformed_ddf)) + ops.append(op) if self.client: results = [r.result() for r in self.client.compute(stats)] @@ -178,9 +176,8 @@ def _transform_partition(root_gdf, column_groups): # apply the operator if necessary if column_group.op: - input_column_names = [col for parent in column_group.parents for col in parent.columns] try: - gdf = column_group.op.transform(input_column_names, gdf) + gdf = column_group.op.transform(column_group.input_column_names, gdf) except Exception: LOG.exception("Failed to transform operator %s", column_group.op) raise diff --git a/tests/unit/test_dask_nvt.py b/tests/unit/test_dask_nvt.py index 806caf5ad8a..a2372ef5bb1 100644 --- a/tests/unit/test_dask_nvt.py +++ b/tests/unit/test_dask_nvt.py @@ -24,7 +24,7 @@ import pytest from dask.dataframe import assert_eq -from nvtabular import Dataset, Workflow +from nvtabular import ColumnGroup, Dataset, Workflow from nvtabular import ops as ops from nvtabular.io.shuffle import Shuffle from tests.conftest import allcols_csv, mycols_csv, mycols_pq @@ -138,20 +138,13 @@ def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction): cont_names = ["x", "y", "id"] label_name = ["label"] - processor = Workflow( - client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name + features = cat_names >> ops.JoinGroupby( + cont_names=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir) ) - processor.add_preprocess( - ops.JoinGroupby( - cont_names=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir) - ) - ) - processor.finalize() - dataset = Dataset(paths, part_mem_fraction=part_mem_fraction) - processor.apply(dataset) - result = processor.get_ddf().compute(scheduler="synchronous") + workflow = Workflow(features + cat_names + cont_names + label_name, client=client) + result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous") # Validate result assert len(df0) == len(result) @@ -195,32 +188,21 @@ def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction): @pytest.mark.parametrize("part_mem_fraction", [0.01]) @pytest.mark.parametrize("use_client", [True, False]) def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client): - engine = "parquet" paths = glob.glob(str(datasets[engine]) 
+ "/*." + engine.split("-")[0]) cat_names = ["name-cat", "name-string"] cont_names = ["x", "y", "id"] - label_name = ["label"] - processor = Workflow( - client=client if use_client else None, - cat_names=cat_names, - cont_names=cont_names, - label_name=label_name, + cats = ColumnGroup(cat_names) + cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True) + groupby_features = cats >> ops.JoinGroupby( + cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir) ) - processor.add_preprocess(ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)) - - processor.add_cat_feature( - ops.JoinGroupby(cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)) - ) - - processor.finalize() + workflow = Workflow(cat_features + groupby_features, client=client) dataset = Dataset(paths, part_mem_fraction=part_mem_fraction) - - processor.apply(dataset, output_path=str(tmpdir)) - result = processor.get_ddf().compute() + result = workflow.fit_transform(dataset).to_ddf().compute() assert "name-cat_x_sum" in result.columns assert "name-string_x_sum" in result.columns diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py index 2f321520feb..c11c17cfaa8 100644 --- a/tests/unit/test_ops.py +++ b/tests/unit/test_ops.py @@ -884,18 +884,12 @@ def test_joingroupby_multi(tmpdir, groups): } ) - cat_names = ["Author", "Engaging-User"] - cont_names = ["Cost"] - label_name = ["Post"] - - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - - processor.add_preprocess( - ops.JoinGroupby(columns=groups, out_path=str(tmpdir), stats=["sum"], cont_names=["Cost"]) + groupby_features = groups >> ops.JoinGroupby( + out_path=str(tmpdir), stats=["sum"], cont_names=["Cost"] ) - processor.finalize() - processor.apply(nvt.Dataset(df), output_format=None) - df_out = processor.get_ddf().compute(scheduler="synchronous") + workflow = nvt.Workflow(groupby_features + "Post") + + df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute() if isinstance(groups, list): # Join on ["Author", "Engaging-User"] From 711efd84d927b9a430442493f82f3504c9f77ddf Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 15 Dec 2020 08:55:57 -0800 Subject: [PATCH 06/23] Fix differencelag --- nvtabular/ops/difference_lag.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/nvtabular/ops/difference_lag.py b/nvtabular/ops/difference_lag.py index 5b01f1ae65d..e457dcc0f06 100644 --- a/nvtabular/ops/difference_lag.py +++ b/nvtabular/ops/difference_lag.py @@ -49,15 +49,10 @@ class DifferenceLag(Operator): shift : int, default 1 The number of rows to look backwards when computing the difference lag. Negative values indicate the number of rows to look forwards, making this compute the lead instead of lag. - columns : list of str, default None - Continuous columns to target for this op. If None, the operation will target all known - continuous columns. - replace: bool, default False - Whether to replace existing columns or create new ones. 
""" - def __init__(self, partition_cols, shift=1, columns=None, replace=False): - super(DifferenceLag, self).__init__(columns=columns, replace=replace) + def __init__(self, partition_cols, shift=1): + super(DifferenceLag, self).__init__() self.partition_cols = partition_cols self.shifts = [shift] if isinstance(shift, int) else shift From fd3f35ab0b98abd417502e187ea5349e07ad70f9 Mon Sep 17 00:00:00 2001 From: rnyak Date: Tue, 15 Dec 2020 19:06:56 -0500 Subject: [PATCH 07/23] add dependencies method (#498) --- nvtabular/ops/lambdaop.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nvtabular/ops/lambdaop.py b/nvtabular/ops/lambdaop.py index 2da4c0fa8ff..d52be9f7e7e 100644 --- a/nvtabular/ops/lambdaop.py +++ b/nvtabular/ops/lambdaop.py @@ -75,7 +75,7 @@ class LambdaOp(Operator): Whether to replace existing columns or create new ones. """ - def __init__(self, f): + def __init__(self, f, dependency=None): super().__init__() if f is None: raise ValueError("f cannot be None. LambdaOp op applies f to dataframe") @@ -83,6 +83,7 @@ def __init__(self, f): self._param_count = len(signature(self.f).parameters) if self._param_count not in (1, 2): raise ValueError("lambda function must accept either one or two parameters") + self.dependency = dependency @annotate("DFLambda_op", color="darkgreen", domain="nvt_python") def transform(self, columns, gdf: cudf.DataFrame): @@ -96,3 +97,6 @@ def transform(self, columns, gdf: cudf.DataFrame): # shouldn't ever happen, raise RuntimeError(f"unhandled lambda param count {self._param_count}") return new_gdf + + def dependencies(self): + return self.dependency From 9b78a46ad98e4747babe7f02b4d794bfb57162eb Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 15 Dec 2020 22:15:05 -0800 Subject: [PATCH 08/23] Convert TargetEncoding op --- nvtabular/ops/stat_operator.py | 1 - nvtabular/ops/target_encoding.py | 108 ++++++++++++++++++++++++++----- nvtabular/workflow.py | 8 ++- tests/unit/test_ops.py | 57 +++++----------- 4 files changed, 114 insertions(+), 60 deletions(-) diff --git a/nvtabular/ops/stat_operator.py b/nvtabular/ops/stat_operator.py index 01b9f614955..d90455b5ff2 100644 --- a/nvtabular/ops/stat_operator.py +++ b/nvtabular/ops/stat_operator.py @@ -24,7 +24,6 @@ class StatOperator(Operator): def __init__(self): super(StatOperator, self).__init__() - self._ddf_out = None def fit(self, columns, ddf): raise NotImplementedError( diff --git a/nvtabular/ops/target_encoding.py b/nvtabular/ops/target_encoding.py index c2492ed9ff0..a3354ecdf26 100644 --- a/nvtabular/ops/target_encoding.py +++ b/nvtabular/ops/target_encoding.py @@ -15,8 +15,11 @@ # import cudf import cupy +import numpy as np +from dask.delayed import Delayed from . import categorify as nvt_cat +from .moments import _custom_moments from .stat_operator import StatOperator @@ -132,14 +135,62 @@ def __init__( self.out_col = [out_col] if isinstance(out_col, str) else out_col self.out_dtype = out_dtype self.tree_width = tree_width - self.out_path = out_path + self.out_path = out_path or "./" self.on_host = on_host self.cat_cache = cat_cache self.name_sep = name_sep self.drop_folds = drop_folds - self.stat_name = stat_name or "te_stats" + self.fold_name = "__fold__" + self.stats = {} + self.means = {} # TODO: just update target_mean? 
+ + def fit(self, columns, ddf): + moments = None + if self.target_mean is None: + # calcualte the mean if we don't have it already + moments = _custom_moments(ddf[self.target]) + + col_groups = columns[:] + if self.kfold > 1: + # Add new fold column if necessary + if self.fold_name not in ddf.columns: + ddf[self.fold_name] = ddf.index.map_partitions( + _add_fold, + self.kfold, + self.fold_seed, + meta=_add_fold(ddf._meta.index, self.kfold, self.fold_seed), + ) + + # Add new col_groups with fold + for group in columns: + if isinstance(group, tuple): + group = list(group) + if isinstance(group, list): + col_groups.append([self.fold_name] + group) + else: + col_groups.append([self.fold_name, group]) + + dsk, key = nvt_cat._category_stats( + ddf, + col_groups, + self.target, + ["count", "sum"], + self.out_path, + 0, + self.tree_width, + self.on_host, + concat_groups=False, + name_sep=self.name_sep, + ) + return Delayed(key, dsk), moments + + def fit_finalize(self, dask_stats): + for col, value in dask_stats[0].items(): + self.stats[col] = value + print(self.stats) - # TODO: fit/fit_finalize methods + for col in dask_stats[1].index: + self.means[col] = float(dask_stats[1]["mean"].loc[col]) def dependencies(self): return self.dependency @@ -149,14 +200,17 @@ def output_column_names(self, columns): for cat in columns: cat = [cat] if isinstance(cat, str) else cat ret.extend(self._make_te_name(cat)) + + if self.kfold > 1 and not self.drop_folds: + ret.append(self.fold_name) + return ret def _make_te_name(self, cat_group): tag = nvt_cat._make_name(*cat_group, sep=self.name_sep) return [f"TE_{tag}_{x}" for x in self.target] - def _op_group_logic(self, cat_group, gdf, stats_context, y_mean, fit_folds, group_ind): - + def _op_group_logic(self, cat_group, gdf, y_mean, fit_folds, group_ind): # Define name of new TE column if isinstance(self.out_col, list): if group_ind >= len(self.out_col): @@ -177,7 +231,7 @@ def _op_group_logic(self, cat_group, gdf, stats_context, y_mean, fit_folds, grou # Groupby Aggregation for each fold cols = ["__fold__"] + cat_group storage_name_folds = nvt_cat._make_name(*cols, sep=self.name_sep) - path_folds = stats_context[self.stat_name][storage_name_folds] + path_folds = self.stats[storage_name_folds] agg_each_fold = nvt_cat._read_groupby_stat_df( path_folds, storage_name_folds, self.cat_cache ) @@ -187,7 +241,7 @@ def _op_group_logic(self, cat_group, gdf, stats_context, y_mean, fit_folds, grou # Groupby Aggregation for all data storage_name_all = nvt_cat._make_name(*cat_group, sep=self.name_sep) - path_all = stats_context[self.stat_name][storage_name_all] + path_all = self.stats[storage_name_all] agg_all = nvt_cat._read_groupby_stat_df(path_all, storage_name_all, self.cat_cache) agg_all.columns = cat_group + ["count_y_all"] + [x + "_sum_y_all" for x in self.target] @@ -238,31 +292,51 @@ def _op_group_logic(self, cat_group, gdf, stats_context, y_mean, fit_folds, grou new_gdf.index = gdf.index return new_gdf - def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None): - + def transform(self, columns, gdf: cudf.DataFrame): # Add temporary column for sorting tmp = "__tmp__" gdf[tmp] = cupy.arange(len(gdf), dtype="int32") - # Only perform "fit" if fold column is present - fit_folds = "__fold__" in gdf.columns + fit_folds = self.kfold > 1 + if fit_folds: + gdf[self.fold_name] = _add_fold(gdf.index, self.kfold, self.fold_seed) # Need mean of contiuous target column - y_mean = self.target_mean or stats_context["means"] + y_mean = self.target_mean or 
self.means # Loop over categorical-column groups and apply logic new_gdf = None - for ind, cat_group in enumerate(self.cat_groups): + for ind, cat_group in enumerate(columns): + if isinstance(cat_group, tuple): + cat_group = list(cat_group) + elif isinstance(cat_group, str): + cat_group = [cat_group] + if new_gdf is None: - new_gdf = self._op_group_logic( - cat_group, gdf, stats_context, y_mean, fit_folds, ind - ) + new_gdf = self._op_group_logic(cat_group, gdf, y_mean, fit_folds, ind) else: - _df = self._op_group_logic(cat_group, gdf, stats_context, y_mean, fit_folds, ind) + _df = self._op_group_logic(cat_group, gdf, y_mean, fit_folds, ind) new_gdf = cudf.concat([new_gdf, _df], axis=1) # Drop temporary columns gdf.drop( columns=[tmp, "__fold__"] if fit_folds and self.drop_folds else [tmp], inplace=True ) + if fit_folds and not self.drop_folds: + new_gdf[self.fold_name] = gdf[self.fold_name] return new_gdf + + +def _add_fold(s, kfold, fold_seed=None): + """Deterministically computes a '__fold__' column, given an optional + random seed""" + typ = np.min_scalar_type(kfold * 2) + if fold_seed is None: + # If we don't have a specific seed, + # just use a simple modulo-based mapping + fold = cupy.arange(len(s), dtype=typ) + cupy.mod(fold, kfold, out=fold) + return fold + else: + state = cupy.random.RandomState(fold_seed) + return state.choice(cupy.arange(kfold, dtype=typ), len(s)) diff --git a/nvtabular/workflow.py b/nvtabular/workflow.py index 1878b04dc85..8e501171330 100644 --- a/nvtabular/workflow.py +++ b/nvtabular/workflow.py @@ -102,8 +102,12 @@ def get_stat_ops(nodes): transformed_ddf = _transform_ddf(ddf, column_group.parents) op = column_group.op - stats.append(op.fit(column_group.input_column_names, transformed_ddf)) - ops.append(op) + try: + stats.append(op.fit(column_group.input_column_names, transformed_ddf)) + ops.append(op) + except Exception: + LOG.exception("Failed to fit operator %s", column_group.op) + raise if self.client: results = [r.result() for r in self.client.compute(stats)] diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py index c11c17cfaa8..449e6d77d5e 100644 --- a/tests/unit/test_ops.py +++ b/tests/unit/test_ops.py @@ -195,30 +195,19 @@ def test_target_encode(tmpdir, cat_groups, kfold, fold_seed): ) df = dask_cudf.from_cudf(df, npartitions=3) - cat_names = ["Author", "Engaging-User"] cont_names = ["Cost"] - label_name = ["Post"] - - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()]) - processor.add_preprocess( - ops.TargetEncoding( - cat_groups, - "Cost", # cont_target - out_path=str(tmpdir), - kfold=kfold, - out_col="test_name", - out_dtype="float32", - fold_seed=fold_seed, - drop_folds=False, # Keep folds to validate - ) + te_features = cat_groups >> ops.TargetEncoding( + cont_names, + out_path=str(tmpdir), + kfold=kfold, + out_dtype="float32", + fold_seed=fold_seed, + drop_folds=False, # Keep folds to validate ) - processor.finalize() - processor.apply(nvt.Dataset(df), output_format=None) - df_out = processor.get_ddf().compute(scheduler="synchronous") - assert "test_name" in df_out.columns - assert df_out["test_name"].dtype == "float32" + cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp() + workflow = nvt.Workflow(te_features + cont_features + ["Author", "Engaging-User"]) + df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous") if kfold > 1: # Cat columns are 
unique. @@ -229,7 +218,7 @@ def test_target_encode(tmpdir, cat_groups, kfold, fold_seed): else: name = "__fold___Author_Engaging-User" cols = ["__fold__", "Author", "Engaging-User"] - check = cudf.io.read_parquet(processor.stats["te_stats"][name]) + check = cudf.io.read_parquet(te_features.op.stats[name]) check = check[cols].sort_values(cols).reset_index(drop=True) df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True) assert_eq(check, df_out_check) @@ -245,26 +234,14 @@ def test_target_encode_multi(tmpdir, npartitions): df = cudf.DataFrame({"cat": cat_1, "cat2": cat_2, "num": num_1, "num_2": num_2}) df = dask_cudf.from_cudf(df, npartitions=npartitions) - cat_names = ["cat", "cat2"] - cont_names = ["num", "num_2"] - label_name = [] - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - cat_groups = ["cat", "cat2", ["cat", "cat2"]] - - processor.add_preprocess( - ops.TargetEncoding( - cat_groups, - ["num", "num_2"], # cont_target - out_path=str(tmpdir), - kfold=1, - p_smooth=5, - out_dtype="float32", - ) + te_features = cat_groups >> ops.TargetEncoding( + ["num", "num_2"], out_path=str(tmpdir), kfold=1, p_smooth=5, out_dtype="float32" ) - processor.finalize() - processor.apply(nvt.Dataset(df), output_format=None) - df_out = processor.get_ddf().compute(scheduler="synchronous") + + workflow = nvt.Workflow(te_features) + + df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous") assert "TE_cat_cat2_num" in df_out.columns assert "TE_cat_num" in df_out.columns From 5c28e853ff43827fa80f0d430b27a3d75ddf5996 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 15 Dec 2020 22:33:29 -0800 Subject: [PATCH 09/23] Update nvtabular/workflow.py Co-authored-by: Richard (Rick) Zamora --- nvtabular/workflow.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nvtabular/workflow.py b/nvtabular/workflow.py index 8e501171330..c511498b6a3 100644 --- a/nvtabular/workflow.py +++ b/nvtabular/workflow.py @@ -24,8 +24,6 @@ from nvtabular.ops import StatOperator from nvtabular.worker import clean_worker_cache -# import yaml - LOG = logging.getLogger("nvtabular") From 6a84e7dc65a6fc6598742eb2b870b9d8e8c09609 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 15 Dec 2020 22:33:43 -0800 Subject: [PATCH 10/23] Update nvtabular/workflow.py Co-authored-by: Richard (Rick) Zamora --- nvtabular/workflow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nvtabular/workflow.py b/nvtabular/workflow.py index c511498b6a3..adb7c9a4eea 100644 --- a/nvtabular/workflow.py +++ b/nvtabular/workflow.py @@ -156,7 +156,8 @@ def _transform_ddf(ddf, column_groups): columns = list(flatten(cg.flattened_columns for cg in column_groups)) return ddf.map_partitions( - lambda gdf: _transform_partition(gdf, column_groups), + _transform_partition, + column_groups, meta=cudf.DataFrame({k: [] for k in columns}), ) From b21ffd8a0166572ae5daf1b608a421976db552a7 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 16 Dec 2020 14:54:18 -0800 Subject: [PATCH 11/23] Remove workflow code from dataloaders We should be doing online transforms like ```KerasSequenceLoader(workflow.transform(dataset), ...``` instead of ```KerasSequenceLoader(dataset, workflows=[workflow], ...``` now --- nvtabular/loader/backend.py | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/nvtabular/loader/backend.py b/nvtabular/loader/backend.py index 3f74dcffe72..05c3557af56 100644 --- a/nvtabular/loader/backend.py +++ 
b/nvtabular/loader/backend.py @@ -23,7 +23,6 @@ from nvtabular.io.shuffle import _shuffle_gdf from nvtabular.ops import _get_embedding_order -from nvtabular.workflow import BaseWorkflow def _num_steps(num_samples, step_size): @@ -155,21 +154,6 @@ def get_batch_div_chunk(self, chunks, batch_size): return chunks, spill -def _validate_workflows(workflows, cat_names, cont_names, label_names): - assert all([isinstance(w, BaseWorkflow) for w in workflows]) - # TODO: commenting out until it's clearer what the - # columns in workflow.columns_cts["final"]["ctx"] mean - # for workflow in workflows: - # assert set(workflow.columns_ctx["categorical"]["base"]) == set(cat_names) - # assert set(workflow.columns_ctx["continuous"]["base"]) == set(cont_names) - # assert set(workflow.columns_ctx["label"]["base"]) == set(label_names) - - # cat_names = workflow.columns_ctx["final"]["ctx"]["categorical"] - # cont_names = workflow.columns_ctx["final"]["ctx"]["continuous"] - # label_name = workflow.columns_ctx["final"]["ctx"]["label"][0] - return workflows - - # TODO: implement as metaclass and assign methods to children # to avoid having to do Dataset. calls? class DataLoader: @@ -184,15 +168,12 @@ def __init__( batch_size, shuffle, parts_per_chunk=1, - workflows=None, devices=None, ): self.data = dataset self.indices = cp.arange(dataset.to_ddf().npartitions) devices = devices or [0] - workflows = workflows or [] - self.workflows = _validate_workflows(workflows, cat_names, cont_names, label_names) self.cat_names = cat_names or [] self.cont_names = cont_names or [] @@ -298,20 +279,7 @@ def _get_next_batch(self): batch = next(self._batch_itr) return batch - def map(self, workflow): - """ - Map an NVTabular Workflow on to a data loader to do - online preprocessing - """ - workflows = self.workflows + [workflow] - self.workflows = _validate_workflows( - workflows, self.cat_names, self.cont_names, self.label_names - ) - # TODO: update cat/cont/label names after - def make_tensors(self, gdf, use_nnz=False): - for workflow in self.workflows: - gdf = workflow.apply_ops(gdf) split_idx = self._get_segment_lengths(len(gdf)) # map from big chunk to framework-specific tensors From f216edf6186baf2423a632d8a43e0355163a038a Mon Sep 17 00:00:00 2001 From: bschifferer Date: Wed, 16 Dec 2020 23:58:56 +0100 Subject: [PATCH 12/23] Unittest ops + bugfix in Bucketize (#496) * test_minmix * updates test * unittest ops --- nvtabular/ops/bucketize.py | 3 +- nvtabular/ops/hashed_cross.py | 17 +- nvtabular/ops/join_external.py | 8 +- tests/unit/test_ops.py | 770 +++++++++------------------------ 4 files changed, 215 insertions(+), 583 deletions(-) diff --git a/nvtabular/ops/bucketize.py b/nvtabular/ops/bucketize.py index 6e5e27e6ab2..11a5dfbcc8e 100644 --- a/nvtabular/ops/bucketize.py +++ b/nvtabular/ops/bucketize.py @@ -45,6 +45,5 @@ def transform(self, columns, gdf: cudf.DataFrame): val = 0 for boundary in b: val += (gdf[col] >= boundary).astype("int") - new_col = f"{col}_{self._id}" - new_gdf[new_col] = val + new_gdf[col] = val return new_gdf diff --git a/nvtabular/ops/hashed_cross.py b/nvtabular/ops/hashed_cross.py index e2839bd183e..f37601d4e50 100644 --- a/nvtabular/ops/hashed_cross.py +++ b/nvtabular/ops/hashed_cross.py @@ -25,16 +25,15 @@ def __init__(self, num_buckets): self.num_buckets = num_buckets @annotate("HashedCross_op", color="darkgreen", domain="nvt_python") - def op_logic(self, columns, gdf: cudf.DataFrame): + def transform(self, columns, gdf: cudf.DataFrame): new_gdf = cudf.DataFrame() - for cross in columns: - val = 0 
- for column in cross: - val ^= gdf[column].hash_values() # or however we want to do this aggregation - # TODO: support different size buckets per cross - val = val % self.bucket_size - new_gdf["_X_".join(cross)] = val + val = 0 + for column in columns: + val ^= gdf[column].hash_values() # or however we want to do this aggregation + # TODO: support different size buckets per cross + val = val % self.num_buckets + new_gdf["_X_".join(columns)] = val return new_gdf def output_column_names(self, columns): - return ["_X_".join(cross) for cross in columns] + return ["_X_".join(columns)] diff --git a/nvtabular/ops/join_external.py b/nvtabular/ops/join_external.py index 2c0631a09e4..773ef78ec2c 100644 --- a/nvtabular/ops/join_external.py +++ b/nvtabular/ops/join_external.py @@ -88,7 +88,7 @@ def __init__( cache="host", **kwargs, ): - super().__init__(replace=False) + super(JoinExternal).__init__() self.on = on self.df_ext = df_ext self.on_ext = on_ext or self.on @@ -155,9 +155,9 @@ def transform( return new_gdf def output_column_names(self, columns): - if self.ext_columns: - return columns + self.ext_columns - return columns + self._ext.columns + if self.columns_ext: + return list(set(columns + self.columns_ext)) + return list(set(columns + list(self._ext.columns))) def _detect_format(data): diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py index 449e6d77d5e..20214db89c1 100644 --- a/tests/unit/test_ops.py +++ b/tests/unit/test_ops.py @@ -14,7 +14,6 @@ # limitations under the License. # import math -import os import string import cudf @@ -27,158 +26,30 @@ import nvtabular as nvt import nvtabular.io +from nvtabular import ColumnGroup from nvtabular import ops as ops -from tests.conftest import get_cats, mycols_csv, mycols_pq +from tests.conftest import mycols_csv, mycols_pq @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @pytest.mark.parametrize("engine", ["parquet", "csv", "csv-no-header"]) # TODO: dask workflow doesn't support min/max on string columns, so won't work # with op_columns=None -@pytest.mark.parametrize("op_columns", [["x"]]) -def test_minmax(tmpdir, client, df, dataset, gpu_memory_frac, engine, op_columns): - cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"] - cont_names = ["x", "y"] - label_name = ["label"] - - config = nvtabular.workflow.get_new_config() - config["PP"]["all"] = [ops.MinMax(columns=op_columns)] - - processor = nvtabular.Workflow( - cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config - ) - processor.update_stats(dataset) - x_min = df["x"].min() - - assert x_min == pytest.approx(processor.stats["mins"]["x"], 1e-2) - x_max = df["x"].max() - assert x_max == pytest.approx(processor.stats["maxs"]["x"], 1e-2) - if not op_columns: - name_min = min(df["name-string"].tolist()) - name_max = max(df["name-string"].tolist()) - assert name_min == processor.stats["mins"]["name-string"] - y_max = df["y"].max() - y_min = df["y"].min() - assert y_max == processor.stats["maxs"]["y"] - assert name_max == processor.stats["maxs"]["name-string"] - assert y_min == processor.stats["mins"]["y"] - - -@pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) -@pytest.mark.parametrize("engine", ["parquet", "csv", "csv-no-header"]) -@pytest.mark.parametrize("op_columns", [["x"], None]) -def test_moments(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns): - cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"] - cont_names = ["x", "y", "id"] - label_name = ["label"] - - config = 
nvt.workflow.get_new_config() - config["PP"]["continuous"] = [ops.Moments(columns=op_columns)] - - processor = nvtabular.Workflow( - cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config - ) - - processor.update_stats(dataset) - - assert df.x.count() == processor.stats["counts"]["x"] - assert df.x.count() == 4321 - - # Check mean and std - assert math.isclose(df.x.mean(), processor.stats["means"]["x"], rel_tol=1e-4) - assert math.isclose(df.x.std(), processor.stats["stds"]["x"], rel_tol=1e-3) - if not op_columns: - assert math.isclose(df.y.mean(), processor.stats["means"]["y"], rel_tol=1e-4) - assert math.isclose(df.id.mean(), processor.stats["means"]["id"], rel_tol=1e-4) - - assert math.isclose(df.y.std(), processor.stats["stds"]["y"], rel_tol=1e-3) - assert math.isclose(df.id.std(), processor.stats["stds"]["id"], rel_tol=1e-3) - - -@pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) -@pytest.mark.parametrize("engine", ["parquet", "csv", "csv-no-header"]) -@pytest.mark.parametrize("op_columns", [["name-string"], None]) -def test_encoder(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns): - cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"] - cont_names = ["x", "y", "id"] - label_name = ["label"] - - encoder = ops.GroupbyStatistics(columns=op_columns) - config = nvt.workflow.get_new_config() - config["PP"]["categorical"] = [encoder] - - processor = nvt.Workflow( - cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config - ) - processor.update_stats(dataset) - - if engine == "parquet" and not op_columns: - cats_expected0 = df["name-cat"].unique().values_host - cats0 = get_cats(processor, "name-cat") - assert cats0.tolist() == [None] + cats_expected0.tolist() - - cats_expected1 = df["name-string"].unique().values_host - cats1 = get_cats(processor, "name-string") - assert cats1.tolist() == [None] + cats_expected1.tolist() - - -@pytest.mark.parametrize("engine", ["parquet"]) -@pytest.mark.parametrize("groups", [[["name-cat", "name-string"], "name-cat"], "name-string"]) -@pytest.mark.parametrize("concat_groups", [True, False]) -def test_multicolumn_cats(tmpdir, df, dataset, engine, groups, concat_groups): - cat_names = ["name-cat", "name-string"] - cont_names = ["x", "y", "id"] - label_name = ["label"] - - encoder = ops.GroupbyStatistics( - columns=groups, - cont_names=None if concat_groups else ["x"], - stats=None if concat_groups else ["count", "mean"], - out_path=str(tmpdir), - concat_groups=concat_groups, - ) - config = nvt.workflow.get_new_config() - config["PP"]["categorical"] = [encoder] - - processor = nvt.Workflow( - cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config - ) - processor.update_stats(dataset) - - groups = [groups] if isinstance(groups, str) else groups - for group in groups: - group = [group] if isinstance(group, str) else group - prefix = "unique." if concat_groups else "cat_stats." 
- fn = prefix + "_".join(group) + ".parquet" - cudf.read_parquet(os.path.join(tmpdir, "categories", fn)) - - -@pytest.mark.parametrize("engine", ["parquet"]) -@pytest.mark.parametrize("groups", [[["name-cat", "name-string"]], "name-string"]) -@pytest.mark.parametrize("kfold", [3]) -def test_groupby_folds(tmpdir, df, dataset, engine, groups, kfold): - cat_names = ["name-cat", "name-string"] - cont_names = ["x", "y", "id"] - label_name = ["label"] - - gb_stats = ops.GroupbyStatistics( - columns=None, - out_path=str(tmpdir), - kfold=kfold, - fold_groups=groups, - stats=["count", "sum"], - cont_names=["y"], - ) - config = nvt.workflow.get_new_config() - config["PP"]["categorical"] = [gb_stats] - - processor = nvt.Workflow( - cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config - ) - processor.update_stats(dataset) - for group, path in processor.stats["categories"].items(): - df = cudf.read_parquet(path) - assert "__fold__" in df.columns +@pytest.mark.parametrize("op_columns", [["x"], ["x", "y"]]) +def test_normalize_minmax(tmpdir, client, df, dataset, gpu_memory_frac, engine, op_columns): + cont_features = op_columns >> ops.NormalizeMinMax() + processor = nvtabular.Workflow(cont_features) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() + for col in op_columns: + col_min = df[col].min() + assert col_min == pytest.approx(processor.column_group.op.mins[col], 1e-2) + col_max = df[col].max() + assert col_max == pytest.approx(processor.column_group.op.maxs[col], 1e-2) + df[col] = (df[col] - processor.column_group.op.mins[col]) / ( + processor.column_group.op.maxs[col] - processor.column_group.op.mins[col] + ) + assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2) @pytest.mark.parametrize("cat_groups", ["Author", [["Author", "Engaging-User"]]]) @@ -260,45 +131,27 @@ def test_target_encode_multi(tmpdir, npartitions): @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @pytest.mark.parametrize("engine", ["parquet", "csv", "csv-no-header"]) -@pytest.mark.parametrize("op_columns", [["x"], None]) -def test_median(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns): - cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"] - cont_names = ["x", "y", "id"] - label_name = ["label"] - - config = nvt.workflow.get_new_config() - config["PP"]["continuous"] = [ops.Median(columns=op_columns)] - - processor = nvt.Workflow( - cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config - ) - - processor.update_stats(dataset) - - # Check median (TODO: Improve the accuracy) - x_median = df.x.dropna().quantile(0.5, interpolation="linear") - assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1) - if not op_columns: - y_median = df.y.dropna().quantile(0.5, interpolation="linear") - id_median = df.id.dropna().quantile(0.5, interpolation="linear") - assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1) - assert math.isclose(id_median, processor.stats["medians"]["id"], rel_tol=1e1) +@pytest.mark.parametrize("op_columns", [["x"], ["x", "y"]]) +def test_fill_median(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns): + cont_features = op_columns >> nvt.ops.FillMedian() + processor = nvt.Workflow(cont_features) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() + for col in op_columns: + col_median = df[col].dropna().quantile(0.5, interpolation="linear") + assert math.isclose(col_median, 
processor.column_group.op.medians[col], rel_tol=1e1) + assert np.all((df[col].fillna(col_median) - new_gdf[col]).abs().values <= 1e-2) @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @pytest.mark.parametrize("engine", ["parquet", "csv", "csv-no-header"]) -@pytest.mark.parametrize("op_columns", [["x"], None]) +@pytest.mark.parametrize("op_columns", [["x"], ["x", "y"]]) def test_log(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns): - cont_names = ["x", "y", "id"] - log_op = ops.LogOp(columns=op_columns) - - columns_ctx = {} - columns_ctx["continuous"] = {} - columns_ctx["continuous"]["base"] = cont_names - - for gdf in dataset.to_iter(): - new_gdf = log_op.apply_op(gdf, columns_ctx, "continuous") - assert new_gdf[cont_names] == np.log(gdf[cont_names].astype(np.float32)) + cont_features = op_columns >> nvt.ops.LogOp() + processor = nvt.Workflow(cont_features) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() + assert new_gdf[op_columns] == np.log(df[op_columns].astype(np.float32)) @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @@ -311,23 +164,18 @@ def test_hash_bucket(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns): num_buckets = 10 else: num_buckets = {column: 10 for column in op_columns} - hash_bucket_op = ops.HashBucket(num_buckets) - columns_ctx = {} - columns_ctx["categorical"] = {} - columns_ctx["categorical"]["base"] = cat_names + hash_features = cat_names >> ops.HashBucket(num_buckets) + processor = nvt.Workflow(hash_features) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() # check sums for determinancy - checksums = [] - for gdf in dataset.to_iter(): - new_gdf = hash_bucket_op.apply_op(gdf, columns_ctx, "categorical") - assert np.all(new_gdf[cat_names].values >= 0) - assert np.all(new_gdf[cat_names].values <= 9) - checksums.append(new_gdf[cat_names].sum().values) - - for checksum, gdf in zip(checksums, dataset.to_iter()): - new_gdf = hash_bucket_op.apply_op(gdf, columns_ctx, "categorical") - assert np.all(new_gdf[cat_names].sum().values == checksum) + assert np.all(new_gdf[cat_names].values >= 0) + assert np.all(new_gdf[cat_names].values <= 9) + checksum = new_gdf[cat_names].sum().values + new_gdf = processor.transform(dataset).to_ddf().compute() + np.all(new_gdf[cat_names].sum().values == checksum) def test_hash_bucket_lists(tmpdir): @@ -339,193 +187,120 @@ def test_hash_bucket_lists(tmpdir): } ) cat_names = ["Authors"] # , "Engaging User"] - cont_names = [] - label_name = ["Post"] - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - processor.add_preprocess(ops.HashBucket(num_buckets=10)) - processor.finalize() - processor.apply(nvt.Dataset(df), output_format=None) - df_out = processor.get_ddf().compute(scheduler="synchronous") + dataset = nvt.Dataset(df) + hash_features = cat_names >> ops.HashBucket(num_buckets=10) + processor = nvt.Workflow(hash_features) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() # check to make sure that the same strings are hashed the same - authors = df_out["Authors"].to_arrow().to_pylist() + authors = new_gdf["Authors"].to_arrow().to_pylist() assert authors[0][0] == authors[1][0] # 'User_A' assert authors[2][1] == authors[3][0] # 'User_C' - # make sure we get the embedding sizes - assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == 10 + # ToDo: make sure we get the embedding sizes + # assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == 10 
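The same pattern also applies when `HashBucket` is given a per-column mapping rather than a single integer; a short sketch (the bucket counts and the second column are illustrative, and `df` is assumed to hold both columns):

```python
import nvtabular as nvt
from nvtabular import ops

# Per-column bucket counts; list (multi-hot) columns such as "Authors" are
# hashed element-wise, just like the single-integer case above.
hashed = nvt.ColumnGroup(["Authors", "Engaging User"]) >> ops.HashBucket(
    {"Authors": 10, "Engaging User": 20}
)
workflow = nvt.Workflow(hashed)
hashed_gdf = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()
```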
@pytest.mark.parametrize("engine", ["parquet"]) def test_fill_missing(tmpdir, df, dataset, engine): - op = nvt.ops.FillMissing(42) - cont_names = ["x", "y"] - columns_ctx = {} - columns_ctx["continuous"] = {} - columns_ctx["continuous"]["base"] = cont_names + cont_features = cont_names >> nvt.ops.FillMissing(fill_val=42) + for col in cont_names: idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2)) df[col].iloc[idx] = None - transformed = cudf.concat([op.apply_op(df, columns_ctx, "continuous")]) - assert_eq(transformed[cont_names], df[cont_names].fillna(42)) + df = df.reset_index() + dataset = nvt.Dataset(df) + processor = nvt.Workflow(cont_features) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() + for col in cont_names: + assert np.all((df[col].fillna(42) - new_gdf[col]).abs().values <= 1e-2) + assert new_gdf[col].isna().sum() == 0 @pytest.mark.parametrize("engine", ["parquet"]) def test_dropna(tmpdir, df, dataset, engine): - dropna = ops.Dropna() columns = mycols_pq if engine == "parquet" else mycols_csv + dropna_features = columns >> ops.Dropna() - columns_ctx = {} - columns_ctx["all"] = {} - columns_ctx["all"]["base"] = columns - - for gdf in dataset.to_iter(): - new_gdf = dropna.apply_op(gdf, columns_ctx, "all") - assert new_gdf.columns.all() == gdf.columns.all() - assert new_gdf.isnull().all().sum() < 1, "null values exist" + processor = nvt.Workflow(dropna_features) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() + assert new_gdf.columns.all() == df.columns.all() + assert new_gdf.isnull().all().sum() < 1, "null values exist" @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @pytest.mark.parametrize("engine", ["parquet", "csv", "csv-no-header"]) -@pytest.mark.parametrize("op_columns", [["x"], None]) +@pytest.mark.parametrize("op_columns", [["x"], ["x", "y"]]) def test_normalize(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns): - cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"] - cont_names = ["x", "y"] - label_name = ["label"] - - config = nvt.workflow.get_new_config() - config["PP"]["continuous"] = [ops.Moments(columns=op_columns)] - - processor = nvtabular.Workflow( - cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config - ) - - processor.update_stats(dataset) - - op = ops.Normalize() - - columns_ctx = {} - columns_ctx["continuous"] = {} - columns_ctx["continuous"]["base"] = op_columns or cont_names - - new_gdf = op.apply_op(df, columns_ctx, "continuous", stats_context=processor.stats) - df["x"] = (df["x"] - processor.stats["means"]["x"]) / processor.stats["stds"]["x"] - assert new_gdf["x"].equals(df["x"]) + cont_features = op_columns >> ops.Normalize() + processor = nvtabular.Workflow(cont_features) + processor.fit(dataset) + + new_gdf = processor.transform(dataset).to_ddf().compute() + for col in op_columns: + assert math.isclose(df[col].mean(), processor.column_group.op.means[col], rel_tol=1e-4) + assert math.isclose(df[col].std(), processor.column_group.op.stds[col], rel_tol=1e-4) + df[col] = (df[col] - processor.column_group.op.means[col]) / processor.column_group.op.stds[ + col + ] + assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2) @pytest.mark.parametrize("gpu_memory_frac", [0.1]) @pytest.mark.parametrize("engine", ["parquet"]) -@pytest.mark.parametrize("op_columns", [["x"], None]) +@pytest.mark.parametrize("op_columns", [["x"]]) def test_normalize_upcastfloat64(tmpdir, dataset, gpu_memory_frac, engine, 
op_columns): df = cudf.DataFrame( {"x": [1.9e10, 2.3e16, 3.4e18, 1.6e19], "label": [1, 0, 1, 0]}, dtype="float32" ) - cat_names = [] - cont_names = ["x"] - label_name = ["label"] - - config = nvt.workflow.get_new_config() - config["PP"]["continuous"] = [ops.Moments(columns=op_columns)] - - processor = nvtabular.Workflow( - cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config - ) - - processor.update_stats(dataset) - - op = ops.Normalize() - - columns_ctx = {} - columns_ctx["continuous"] = {} - columns_ctx["continuous"]["base"] = op_columns or cont_names - - new_gdf = op.apply_op(df, columns_ctx, "continuous", stats_context=processor.stats) - df["x"] = (df["x"] - processor.stats["means"]["x"]) / processor.stats["stds"]["x"] - assert new_gdf["x"].equals(df["x"]) - - -@pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) -@pytest.mark.parametrize("engine", ["parquet", "csv", "csv-no-header"]) -@pytest.mark.parametrize("op_columns", [["x"], None]) -def test_normalize_minmax(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns): - cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"] - cont_names = ["x", "y"] - label_name = ["label"] - - config = nvt.workflow.get_new_config() - config["PP"]["continuous"] = [ops.MinMax()] - - processor = nvtabular.Workflow( - cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config - ) - - processor.update_stats(dataset) + cont_features = op_columns >> ops.Normalize() + processor = nvtabular.Workflow(cont_features) + dataset = nvt.Dataset(df) + processor.fit(dataset) - op = ops.NormalizeMinMax() + new_gdf = processor.transform(dataset).to_ddf().compute() - columns_ctx = {} - columns_ctx["continuous"] = {} - columns_ctx["continuous"]["base"] = cont_names - - new_gdf = op.apply_op(df, columns_ctx, "continuous", stats_context=processor.stats) - df["x"] = (df["x"] - processor.stats["mins"]["x"]) / ( - processor.stats["maxs"]["x"] - processor.stats["mins"]["x"] - ) - assert new_gdf["x"].equals(df["x"]) + for col in op_columns: + assert math.isclose(df[col].mean(), processor.column_group.op.means[col], rel_tol=1e-4) + assert math.isclose(df[col].std(), processor.column_group.op.stds[col], rel_tol=1e-4) + df[col] = (df[col] - processor.column_group.op.means[col]) / processor.column_group.op.stds[ + col + ] + assert np.all((df[col] - new_gdf[col]).abs().values <= 1e-2) @pytest.mark.parametrize("gpu_memory_frac", [0.1]) @pytest.mark.parametrize("engine", ["parquet"]) def test_lambdaop(tmpdir, df, dataset, gpu_memory_frac, engine, client): - cat_names = ["name-cat", "name-string"] - cont_names = ["x", "y"] - label_name = ["label"] - columns = mycols_pq if engine == "parquet" else mycols_csv - df_copy = df.copy() - config = nvt.workflow.get_new_config() - - processor = nvtabular.Workflow( - cat_names=cat_names, - cont_names=cont_names, - label_name=label_name, - config=config, - client=client, - ) - - columns_ctx = {} - columns_ctx["continuous"] = {} - columns_ctx["continuous"]["base"] = cont_names - columns_ctx["all"] = {} - columns_ctx["all"]["base"] = columns - # Substring # Replacement - op = ops.LambdaOp( - op_name="slice", - f=lambda col: col.str.slice(1, 3), - columns=["name-cat", "name-string"], - replace=True, - ) + substring = ColumnGroup(["name-cat", "name-string"]) >> (lambda col: col.str.slice(1, 3)) + processor = nvtabular.Workflow(substring) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() - new_gdf = op.apply_op(df, columns_ctx, 
"all", stats_context=None) assert new_gdf["name-cat"].equals(df_copy["name-cat"].str.slice(1, 3)) assert new_gdf["name-string"].equals(df_copy["name-string"].str.slice(1, 3)) - # No Replacement - df = df_copy.copy() - op = ops.LambdaOp( - op_name="slice", - f=lambda col: col.str.slice(1, 3), - columns=["name-cat", "name-string"], - replace=False, + # No Replacement from old API (skipped for other examples) + substring = ( + ColumnGroup(["name-cat", "name-string"]) + >> (lambda col: col.str.slice(1, 3)) + >> ops.Rename(postfix="_slice") ) - new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None) + processor = nvtabular.Workflow(["name-cat", "name-string"] + substring) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() + assert new_gdf["name-cat_slice"].equals(df_copy["name-cat"].str.slice(1, 3)) assert new_gdf["name-string_slice"].equals(df_copy["name-string"].str.slice(1, 3)) assert new_gdf["name-cat"].equals(df_copy["name-cat"]) @@ -533,166 +308,44 @@ def test_lambdaop(tmpdir, df, dataset, gpu_memory_frac, engine, client): # Replace # Replacement - df = df_copy.copy() - op = ops.LambdaOp( - op_name="replace", - f=lambda col: col.str.replace("e", "XX"), - columns=["name-cat", "name-string"], - replace=True, - ) + oplambda = ColumnGroup(["name-cat", "name-string"]) >> (lambda col: col.str.replace("e", "XX")) + processor = nvtabular.Workflow(oplambda) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() - new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None) assert new_gdf["name-cat"].equals(df_copy["name-cat"].str.replace("e", "XX")) assert new_gdf["name-string"].equals(df_copy["name-string"].str.replace("e", "XX")) - # No Replacement - df = df_copy.copy() - op = ops.LambdaOp( - op_name="replace", - f=lambda col: col.str.replace("e", "XX"), - columns=["name-cat", "name-string"], - replace=False, - ) - new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None) - assert new_gdf["name-cat_replace"].equals(df_copy["name-cat"].str.replace("e", "XX")) - assert new_gdf["name-string_replace"].equals(df_copy["name-string"].str.replace("e", "XX")) - assert new_gdf["name-cat"].equals(df_copy["name-cat"]) - assert new_gdf["name-string"].equals(df_copy["name-string"]) - # astype # Replacement - df = df_copy.copy() - op = ops.LambdaOp( - op_name="astype", f=lambda col: col.astype(float), columns=["id"], replace=True - ) - new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None) + oplambda = ColumnGroup(["id"]) >> (lambda col: col.astype(float)) + processor = nvtabular.Workflow(oplambda) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() + assert new_gdf["id"].dtype == "float64" # Workflow # Replacement - import glob - - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - - processor.add_preprocess( - [ - ops.LambdaOp( - op_name="slice", - f=lambda col: col.astype(str).str.slice(0, 1), - columns=["name-cat"], - replace=True, - ), - ops.Categorify(), - ] - ) - processor.finalize() - processor.update_stats(dataset) - outdir = tmpdir.mkdir("out1") - processor.write_to_dataset( - outdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True + oplambda = ( + ColumnGroup(["name-cat"]) + >> (lambda col: col.astype(str).str.slice(0, 1)) + >> ops.Categorify() ) + processor = nvtabular.Workflow(oplambda) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() + assert 
is_integer_dtype(new_gdf["name-cat"].dtype) - dataset_2 = nvtabular.io.Dataset( - glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac + oplambda = ( + ColumnGroup(["name-cat", "name-string"]) >> ops.Categorify() >> (lambda col: col + 100) ) - df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0) - assert is_integer_dtype(df_pp["name-cat"].dtype) + processor = nvtabular.Workflow(oplambda) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - - processor.add_preprocess( - [ - ops.Categorify(), - ops.LambdaOp(op_name="add100", f=lambda col: col + 100, replace=True), - ] - ) - processor.finalize() - processor.update_stats(dataset) - outdir = tmpdir.mkdir("out2") - processor.write_to_dataset( - outdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True - ) - - dataset_2 = nvtabular.io.Dataset( - glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac - ) - df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0) - assert is_integer_dtype(df_pp["name-cat"].dtype) - assert np.sum(df_pp["name-cat"] < 100) == 0 - - # Workflow - # No Replacement - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - - processor.add_preprocess( - [ - ops.LambdaOp( - op_name="slice", - f=lambda col: col.astype(str).str.slice(0, 1), - columns=["name-cat"], - replace=False, - ), - ops.Categorify(), - ] - ) - processor.finalize() - processor.update_stats(dataset) - outdir = tmpdir.mkdir("out3") - processor.write_to_dataset( - outdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True - ) - dataset_2 = nvtabular.io.Dataset( - glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac - ) - df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0) - - assert df_pp["name-cat"].dtype == "O" - print(df_pp) - assert is_integer_dtype(df_pp["name-cat_slice"].dtype) - assert np.sum(df_pp["name-cat_slice"] == 0) == 0 - - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - - processor.add_preprocess( - [ - ops.Categorify(), - ops.LambdaOp(op_name="add100", f=lambda col: col + 100, replace=False), - ] - ) - processor.finalize() - processor.update_stats(dataset) - outdir = tmpdir.mkdir("out4") - processor.write_to_dataset( - outdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True - ) - - dataset_2 = nvtabular.io.Dataset( - glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac - ) - df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0) - assert is_integer_dtype(df_pp["name-cat_add100"].dtype) - assert np.sum(df_pp["name-cat_add100"] < 100) == 0 - - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - - processor.add_preprocess( - [ - ops.LambdaOp(op_name="mul0", f=lambda col: col * 0, columns=["x"], replace=False), - ops.LambdaOp(op_name="add100", f=lambda col: col + 100, replace=False), - ] - ) - processor.finalize() - processor.update_stats(dataset) - outdir = tmpdir.mkdir("out5") - processor.write_to_dataset( - outdir, dataset, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION, apply_ops=True - ) - - dataset_2 = nvtabular.io.Dataset( - glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac - ) - df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0) - assert np.sum(df_pp["x_mul0_add100"] 
< 100) == 0 + assert is_integer_dtype(new_gdf["name-cat"].dtype) + assert np.sum(new_gdf["name-cat"] < 100) == 0 @pytest.mark.parametrize("freq_threshold", [0, 1, 2]) @@ -910,11 +563,12 @@ def test_join_external(tmpdir, df, dataset, engine, kind_ext, cache, how, drop_d # Define Op on = "id" + columns_left = list(df.columns) columns_ext = ["id", "new_col", "new_col_2"] df_ext_check = df_ext_check[columns_ext] if drop_duplicates: df_ext_check.drop_duplicates(ignore_index=True, inplace=True) - merge_op = ops.JoinExternal( + joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal( df_ext, on, how=how, @@ -922,61 +576,58 @@ def test_join_external(tmpdir, df, dataset, engine, kind_ext, cache, how, drop_d cache=cache, drop_duplicates_ext=drop_duplicates, ) - columns = mycols_pq if engine == "parquet" else mycols_csv - columns_ctx = {} - columns_ctx["all"] = {} - columns_ctx["all"]["base"] = columns - # Iterate, apply op, and check result - for gdf in dataset.to_iter(): - new_gdf = merge_op.apply_op(gdf, columns_ctx, "all") - check_gdf = gdf.merge(df_ext_check, how=how, on=on) - assert len(check_gdf) == len(new_gdf) - assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all() - assert gdf["id"].all() == new_gdf["id"].all() - assert "new_col_2" in new_gdf.columns - assert "new_col_3" not in new_gdf.columns + gdf = df.reset_index() + dataset = nvt.Dataset(gdf) + processor = nvt.Workflow(joined) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute().reset_index() + + check_gdf = gdf.merge(df_ext_check, how=how, on=on) + assert len(check_gdf) == len(new_gdf) + assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all() + assert gdf["id"].all() == new_gdf["id"].all() + assert "new_col_2" in new_gdf.columns + assert "new_col_3" not in new_gdf.columns @pytest.mark.parametrize("gpu_memory_frac", [0.1]) @pytest.mark.parametrize("engine", ["parquet"]) def test_filter(tmpdir, df, dataset, gpu_memory_frac, engine, client): - cont_names = ["x", "y"] - - columns = mycols_pq if engine == "parquet" else mycols_csv - columns_ctx = {} - columns_ctx["all"] = {} - columns_ctx["all"]["base"] = columns - - filter_op = ops.Filter(f=lambda df: df[df["y"] > 0.5]) - new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns) - assert new_gdf.columns.all() == df.columns.all() + filtered = cont_names >> ops.Filter(f=lambda df: df[df["y"] > 0.5]) + processor = nvtabular.Workflow(filtered) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute().reset_index() + filter_df = df[df["y"] > 0.5].reset_index() + for col in cont_names: + assert np.all((new_gdf[col] - filter_df[col]).abs().values <= 1e-2) # return isnull() rows - columns_ctx["continuous"] = {} - columns_ctx["continuous"]["base"] = cont_names - for col in cont_names: idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2)) df[col].iloc[idx] = None - filter_op = ops.Filter(f=lambda df: df[df.x.isnull()]) - new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns) - assert new_gdf.columns.all() == df.columns.all() + dataset = nvt.Dataset(df) + filtered = cont_names >> ops.Filter(f=lambda df: df[df.x.isnull()]) + processor = nvtabular.Workflow(filtered) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() assert new_gdf.shape[0] < df.shape[0], "null values do not exist" # again testing filtering by returning a series rather than a df - filter_op = ops.Filter(f=lambda df: df.x.isnull()) - new_gdf = filter_op.apply_op(df, 
columns_ctx, "all", target_cols=columns) - assert new_gdf.columns.all() == df.columns.all() + filtered = cont_names >> ops.Filter(f=lambda df: df.x.isnull()) + processor = nvtabular.Workflow(filtered) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() assert new_gdf.shape[0] < df.shape[0], "null values do not exist" # if the filter returns an invalid type we should get an exception immediately # (rather than causing problems downstream in the workflow) - filter_op = ops.Filter(f=lambda df: "some invalid value") + filtered = cont_names >> ops.Filter(f=lambda df: "some invalid value") + processor = nvtabular.Workflow(filtered) with pytest.raises(ValueError): - filter_op.apply_op(df, columns_ctx, "all", target_cols=columns) + new_gdf = processor.transform(dataset).to_ddf().compute() def test_difference_lag(): @@ -984,78 +635,61 @@ def test_difference_lag(): {"userid": [0, 0, 0, 1, 1, 2], "timestamp": [1000, 1005, 1100, 2000, 2001, 3000]} ) - columns = ["userid", "timestamp"] - columns_ctx = {} - columns_ctx["all"] = {} - columns_ctx["all"]["base"] = columns - - op = ops.DifferenceLag("userid", shift=[1, -1], columns=["timestamp"]) - new_gdf = op.apply_op(df, columns_ctx, "all", target_cols=["timestamp"]) + diff_features = ["timestamp"] >> ops.DifferenceLag(partition_cols=["userid"], shift=[1, -1]) + dataset = nvt.Dataset(df) + processor = nvtabular.Workflow(diff_features) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() - assert new_gdf["timestamp_DifferenceLag_1"][0] is None - assert new_gdf["timestamp_DifferenceLag_1"][1] == 5 - assert new_gdf["timestamp_DifferenceLag_1"][2] == 95 - assert new_gdf["timestamp_DifferenceLag_1"][3] is None + assert new_gdf["timestamp_difference_lag_1"][0] is None + assert new_gdf["timestamp_difference_lag_1"][1] == 5 + assert new_gdf["timestamp_difference_lag_1"][2] == 95 + assert new_gdf["timestamp_difference_lag_1"][3] is None - assert new_gdf["timestamp_DifferenceLag_-1"][0] == -5 - assert new_gdf["timestamp_DifferenceLag_-1"][1] == -95 - assert new_gdf["timestamp_DifferenceLag_-1"][2] is None - assert new_gdf["timestamp_DifferenceLag_-1"][3] == -1 - assert new_gdf["timestamp_DifferenceLag_-1"][5] is None + assert new_gdf["timestamp_difference_lag_-1"][0] == -5 + assert new_gdf["timestamp_difference_lag_-1"][1] == -95 + assert new_gdf["timestamp_difference_lag_-1"][2] is None + assert new_gdf["timestamp_difference_lag_-1"][3] == -1 + assert new_gdf["timestamp_difference_lag_-1"][5] is None @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @pytest.mark.parametrize("engine", ["parquet", "csv", "csv-no-header"]) -@pytest.mark.parametrize("use_dict", [True, False]) -def test_hashed_cross(tmpdir, df, dataset, gpu_memory_frac, engine, use_dict): +def test_hashed_cross(tmpdir, df, dataset, gpu_memory_frac, engine): # TODO: add tests for > 2 features, multiple crosses, etc. 
- cat_names = ("name-string", "id") + cat_names = ["name-string", "id"] num_buckets = 10 - if use_dict: - hashed_cross_op = ops.HashedCross({cat_names: num_buckets}) - else: - hashed_cross_op = ops.HashedCross([cat_names], [num_buckets]) - - columns_ctx = {} - columns_ctx["categorical"] = {} - columns_ctx["categorical"]["base"] = list(cat_names) + hashed_cross = cat_names >> ops.HashedCross(num_buckets) + dataset = nvt.Dataset(df) + processor = nvtabular.Workflow(hashed_cross) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() # check sums for determinancy - checksums = [] - for gdf in dataset.to_iter(): - new_gdf = hashed_cross_op.apply_op(gdf, columns_ctx, "categorical") - new_column_name = "_X_".join(cat_names) - assert np.all(new_gdf[new_column_name].values >= 0) - assert np.all(new_gdf[new_column_name].values <= 9) - checksums.append(new_gdf[new_column_name].sum()) - - for checksum, gdf in zip(checksums, dataset.to_iter()): - new_gdf = hashed_cross_op.apply_op(gdf, columns_ctx, "categorical") - assert new_gdf[new_column_name].sum() == checksum + new_column_name = "_X_".join(cat_names) + assert np.all(new_gdf[new_column_name].values >= 0) + assert np.all(new_gdf[new_column_name].values <= 9) + checksum = new_gdf[new_column_name].sum() + new_gdf = processor.transform(dataset).to_ddf().compute() + assert new_gdf[new_column_name].sum() == checksum @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) @pytest.mark.parametrize("engine", ["parquet", "csv", "csv-no-header"]) -@pytest.mark.parametrize("use_dict", [True, False]) -def test_bucketized(tmpdir, df, dataset, gpu_memory_frac, engine, use_dict): +def test_bucketized(tmpdir, df, dataset, gpu_memory_frac, engine): cont_names = ["x", "y"] boundaries = [[-1, 0, 1], [-4, 100]] - if use_dict: - bucketize_op = ops.Bucketize( - {name: boundary for name, boundary in zip(cont_names, boundaries)} - ) - else: - bucketize_op = ops.Bucketize(boundaries, cont_names) - - columns_ctx = {} - columns_ctx["continuous"] = {} - columns_ctx["continuous"]["base"] = list(cont_names) - for gdf in dataset.to_iter(): - new_gdf = bucketize_op.apply_op(gdf, columns_ctx, "continuous") - for col, bs in zip(cont_names, boundaries): - assert np.all(new_gdf[col].values >= 0) - assert np.all(new_gdf[col].values <= len(bs)) - # TODO: add checks for correctness here that don't just - # repeat the existing logic + bucketize_op = ops.Bucketize({name: boundary for name, boundary in zip(cont_names, boundaries)}) + + bucket_features = cont_names >> bucketize_op + processor = nvtabular.Workflow(bucket_features) + processor.fit(dataset) + new_gdf = processor.transform(dataset).to_ddf().compute() + + for col, bs in zip(cont_names, boundaries): + assert np.all(new_gdf[col].values >= 0) + assert np.all(new_gdf[col].values <= len(bs)) + # TODO: add checks for correctness here that don't just + # repeat the existing logic From b44dfa6ecfdfd2e981c0ae397510f838f03adbdc Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 16 Dec 2020 15:35:46 -0800 Subject: [PATCH 13/23] First draft get_embedding_sizes support Re-add get_embedding_sizes . 
Note that this changes how we support multi-hot columns here (sizes are returned same as single hot, and we don't use this method to distinguish between multi and singlehot columns) --- nvtabular/ops/categorify.py | 84 ++++++------------------------------ nvtabular/ops/hash_bucket.py | 21 ++++----- tests/unit/test_ops.py | 3 +- 3 files changed, 23 insertions(+), 85 deletions(-) diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index d830384165e..b0e2d6f4fac 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -348,6 +348,9 @@ def output_column_names(self, columns): return cat_names return list(flatten(columns, container=tuple)) + def get_embedding_sizes(self, columns): + return _get_embeddings_dask(self.categories, columns, self.num_buckets, self.freq_threshold) + def _get_embedding_order(cat_names): """Returns a consistent sorder order for categorical variables @@ -361,33 +364,17 @@ def _get_embedding_order(cat_names): def get_embedding_sizes(workflow): - mh_cols = None - cols = _get_embedding_order(workflow.columns_ctx["categorical"]["base"]) - buckets = None - freq = 0 - # when frequency hashing is applied, - # this will return embedding shape:(num_buckets+cardinality, emb_dim) - if "buckets" in workflow.stats.keys() and "freq_limit" in workflow.stats.keys(): - buckets = workflow.stats["buckets"] - freq = workflow.stats["freq_limit"] - # when only hashing is applied, this will return embedding shape as (num_buckets, emb_dim) - elif "buckets" in workflow.stats.keys(): - buckets = workflow.stats["buckets"] - - # if we have hash buckets, but no categories just use the buckets - if buckets and "categories" not in workflow.stats: - return {col: _emb_sz_rule(num_rows) for col, num_rows in buckets.items()} - - if "mh" not in workflow.columns_ctx["categorical"]: - return _get_embeddings_dask(workflow.stats["categories"], cols, buckets, freq) - else: - mh_cols = _get_embedding_order(workflow.columns_ctx["categorical"]["mh"]) - for col in mh_cols: - cols.remove(col) - res = _get_embeddings_dask(workflow.stats["categories"], cols, buckets, freq) - if mh_cols: - res = res, _get_embeddings_dask(workflow.stats["categories"], mh_cols, buckets, freq) - return res + """ Returns a dictionary of best embedding sizes from the workflow """ + # TODO: do we need to distinguish multihot columns here? (if so why? 
) + queue = [workflow.column_group] + output = {} + while queue: + current = queue.pop() + if current.op and hasattr(current.op, "get_embedding_sizes"): + output.update(current.op.get_embedding_sizes(current.columns)) + if current.kind == "+" or current.kind.startswith("-"): + queue.extend(current.parents) + return output def _get_embeddings_dask(paths, cat_names, buckets=None, freq_limit=0): @@ -969,46 +956,3 @@ def _hash_bucket(gdf, num_buckets, col, encode_type="joint"): val = val % nb encoded = val return encoded - - -class SetBuckets(StatOperator): - def __init__(self, columns=None, num_buckets=None, freq_limit=0, encode_type="joint"): - if isinstance(columns, list): - columns = list(set(flatten(columns, container=list))) - super().__init__(columns=columns) - self.num_buckets = num_buckets - self.freq_limit = freq_limit - self.encode_type = encode_type - - @annotate("SetBuckets_op", color="green", domain="nvt_python") - def stat_logic(self, ddf, columns_ctx, input_cols, target_cols): - cols = self.get_columns(columns_ctx, input_cols, target_cols) - if isinstance(self.num_buckets, int) and self.encode_type == "joint": - self.num_buckets = {name: self.num_buckets for name in cols} - elif isinstance(self.num_buckets, int) and self.encode_type == "combo": - buckets = {} - for group in cols: - if isinstance(group, list) and len(group) > 1: - # For multi-column groups, we concatenate column names. - name = _make_name(*group, sep="_") - buckets[name] = self.num_buckets - elif isinstance(group, str): - buckets[group] = self.num_buckets - self.num_buckets = buckets - return self.num_buckets - - @annotate("SetBuckets_finalize", color="green", domain="nvt_python") - def finalize(self, dask_stats): - self.num_buckets = dask_stats - - def registered_stats(self): - return ["buckets", "freq_limit"] - - def stats_collected(self): - result = [("buckets", self.num_buckets), ("freq_limit", self.freq_limit)] - return result - - def clear(self): - self.num_buckets = {} - self.freq_limit = {} - return diff --git a/nvtabular/ops/hash_bucket.py b/nvtabular/ops/hash_bucket.py index 651ecd91617..c586b2d8340 100644 --- a/nvtabular/ops/hash_bucket.py +++ b/nvtabular/ops/hash_bucket.py @@ -17,7 +17,7 @@ from cudf.utils.dtypes import is_list_dtype from nvtx import annotate -from .categorify import _encode_list_column +from .categorify import _emb_sz_rule, _encode_list_column from .operator import Operator @@ -83,23 +83,11 @@ class HashBucket(Operator): explicit mappings from a column name to a number of buckets. In this case, only the columns specified in the keys of `num_buckets` will be transformed. - columns: list of str or None - Column names to apply hash bucket transformation to. Ignored if - `num_buckets` is a `dict`. If `num_buckets` is given as a list, - `columns` must not be None and have the same length. If left - as None, transformation will be applied to all categorical - columns. Note that this case is only used if `num_buckets` is - an `int`. 
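For reference, a minimal sketch (not part of the patch) of how the reworked `get_embedding_sizes` helper is expected to be called under the new API, using illustrative toy data and column names::

    import cudf
    import nvtabular as nvt
    from nvtabular import ops

    # toy data; the column names are illustrative only
    df = cudf.DataFrame({"userID": [1, 2, 2, 3], "itemID": [10, 11, 10, 12]})

    cats = ["userID", "itemID"] >> ops.Categorify()
    workflow = nvt.Workflow(cats)
    workflow.fit(nvt.Dataset(df))

    # walks the column-group graph and collects sizes from any operator that
    # implements get_embedding_sizes (Categorify, HashBucket, ...)
    embedding_sizes = nvt.ops.get_embedding_sizes(workflow)
    print(embedding_sizes)  # roughly {"userID": (cardinality, embedding_dim), ...}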
""" def __init__(self, num_buckets): if isinstance(num_buckets, dict): - columns = [i for i in num_buckets.keys()] self.num_buckets = num_buckets - elif isinstance(num_buckets, (tuple, list)): - assert columns is not None - assert len(columns) == len(num_buckets) - self.num_buckets = {col: nb for col, nb in zip(columns, num_buckets)} elif isinstance(num_buckets, int): self.num_buckets = num_buckets else: @@ -123,3 +111,10 @@ def transform(self, columns, gdf: cudf.DataFrame): else: gdf[col] = gdf[col].hash_values() % nb return gdf + + def get_embedding_sizes(self, columns): + if isinstance(self.num_buckets, int): + embedding_size = _emb_sz_rule(self.num_buckets) + return {col: embedding_size for col in columns} + else: + return {col: _emb_sz_rule(self.num_buckets[col]) for col in columns} diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py index 20214db89c1..db135883612 100644 --- a/tests/unit/test_ops.py +++ b/tests/unit/test_ops.py @@ -199,8 +199,7 @@ def test_hash_bucket_lists(tmpdir): assert authors[0][0] == authors[1][0] # 'User_A' assert authors[2][1] == authors[3][0] # 'User_C' - # ToDo: make sure we get the embedding sizes - # assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == 10 + assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == 10 @pytest.mark.parametrize("engine", ["parquet"]) From 27b4e332c37491fd988c3469bb7ccd15766de48a Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 16 Dec 2020 15:45:42 -0800 Subject: [PATCH 14/23] isort --- nvtabular/workflow.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nvtabular/workflow.py b/nvtabular/workflow.py index adb7c9a4eea..863614a5e92 100644 --- a/nvtabular/workflow.py +++ b/nvtabular/workflow.py @@ -24,7 +24,6 @@ from nvtabular.ops import StatOperator from nvtabular.worker import clean_worker_cache - LOG = logging.getLogger("nvtabular") From a95a4d9b6b004b2490627d753622e7e2ed6d43cf Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 16 Dec 2020 15:51:40 -0800 Subject: [PATCH 15/23] Remove groupbystatistics --- nvtabular/ops/categorify.py | 14 ++- nvtabular/ops/groupby_statistics.py | 188 ---------------------------- nvtabular/ops/join_groupby.py | 14 ++- nvtabular/ops/target_encoding.py | 23 ++-- 4 files changed, 36 insertions(+), 203 deletions(-) delete mode 100644 nvtabular/ops/groupby_statistics.py diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index b0e2d6f4fac..1fcf8523308 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -119,11 +119,19 @@ class Categorify(StatOperator): "combo", because the same column name can be included in multiple groups. tree_width : dict or int, optional - Passed to `GroupbyStatistics` dependency. + Tree width of the hash-based groupby reduction for each categorical + column. High-cardinality columns may require a large `tree_width`, + while low-cardinality columns can likely use `tree_width=1`. + If passing a dict, each key and value should correspond to the column + name and width, respectively. The default value is 8 for all columns. out_path : str, optional - Passed to `GroupbyStatistics` dependency. + Root directory where groupby statistics will be written out in + parquet format. on_host : bool, default True - Passed to `GroupbyStatistics` dependency. + Whether to convert cudf data to pandas between tasks in the hash-based + groupby reduction. The extra host <-> device data movement can reduce + performance. 
However, using `on_host=True` typically improves stability + (by avoiding device-level memory pressure). na_sentinel : default 0 Label to use for null-category mapping cat_cache : {"device", "host", "disk"} or dict diff --git a/nvtabular/ops/groupby_statistics.py b/nvtabular/ops/groupby_statistics.py deleted file mode 100644 index b7dec9d143d..00000000000 --- a/nvtabular/ops/groupby_statistics.py +++ /dev/null @@ -1,188 +0,0 @@ -# -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import cupy -import numpy as np -from dask.delayed import Delayed - -from . import categorify as nvt_cat -from .stat_operator import StatOperator - - -class GroupbyStatistics(StatOperator): - """ - Uses groupby aggregation to determine the unique groups of a categorical - feature and calculates the desired statistics of requested continuous - features (along with the count of rows in each group). The statistics - for each category will be written to a distinct parquet file, and a - dictionary of paths will be returned as the final "statistics". - - Parameters - ----------- - cont_names : list of str - The continuous column names to calculate statistics for - (for each unique group in each column in `columns`) - stats : list of str, default [] - List of statistics to calculate for each unique group. Note - that "count" corresponds to the group itself, while all - other statistics correspond to a specific continuous column. - Supported statistics include ["count", "sum", "mean", "std", "var", "min", "max"]. - columns : list of str or list(str), default None - Categorical columns (or "column groups") to collect statistics for. - If None, the operation will target all known categorical columns. - fold_groups : list, default None - List of groups to to perform a groupby aggregation with an additional - "fold" column (typically for cross-validation). - concat_groups : bool, default False - Applies only if there are list elements in the ``columns`` input. If True, - the values within these column groups will be concatenated, and the - new (temporary) columns will be used to perform the groupby. The purpose of - this option is to enable multiple columns to be label-encoded jointly. - (see Categorify). Note that this option is only allowed for the "count" - statistics (with cont_names == None). - tree_width : dict or int, optional - Tree width of the hash-based groupby reduction for each categorical - column. High-cardinality columns may require a large `tree_width`, - while low-cardinality columns can likely use `tree_width=1`. - If passing a dict, each key and value should correspond to the column - name and width, respectively. The default value is 8 for all columns. - out_path : str, optional - Root directory where groupby statistics will be written out in - parquet format. - freq_threshold : int, default 0 - Categories with a `count` statistic less than this number will - be omitted from the `GroupbyStatistics` output. 
- on_host : bool, default True - Whether to convert cudf data to pandas between tasks in the hash-based - groupby reduction. The extra host <-> device data movement can reduce - performance. However, using `on_host=True` typically improves stability - (by avoiding device-level memory pressure). - name_sep : str, default "_" - String separator to use between concatenated column names - for multi-column groups. - fold_name : str, default "__fold__" - Name of the fold column to use for all groups in `fold_groups`. - fold_seed : int, default 42 - Random seed to use for cupy-based fold assignment. - kfold : str, default 3 - Number of cross-validation folds to use for all groups in `fold_groups`. - """ - - def __init__( - self, - cont_names=None, - stats=None, - fold_groups=None, - tree_width=None, - out_path=None, - on_host=True, - freq_threshold=None, - concat_groups=False, - name_sep="_", - fold_name="__fold__", - fold_seed=42, - kfold=None, - ): - super(GroupbyStatistics, self).__init__() - self.cont_names = cont_names or [] - self.stats = stats or [] - self.categories = {} - self.tree_width = tree_width or 8 - self.on_host = on_host - self.freq_threshold = freq_threshold - self.out_path = out_path or "./" - self.concat_groups = concat_groups - self.name_sep = name_sep - self.kfold = kfold or 3 - self.fold_name = fold_name - self.fold_seed = fold_seed - self.fold_groups = fold_groups - - def fit(self, columns, ddf): - supported_ops = ["count", "sum", "mean", "std", "var", "min", "max"] - for op in self.stats: - if op not in supported_ops: - raise ValueError(op + " operation is not supported.") - - # TODO: move all this 'fold' stuff into TargetEncoding - col_groups = columns - if self.fold_groups and self.kfold > 1: - # Add new fold column if necessary - if self.fold_name not in ddf.columns: - - def _add_fold(s, kfold, fold_seed): - typ = np.min_scalar_type(kfold * 2) - if fold_seed is None: - # If we don't have a specific seed, - # just use a simple modulo-based mapping - fold = cupy.arange(len(s), dtype=typ) - cupy.mod(fold, kfold, out=fold) - return fold - else: - cupy.random.seed(fold_seed) - return cupy.random.choice(cupy.arange(kfold, dtype=typ), len(s)) - - ddf[self.fold_name] = ddf.index.map_partitions( - _add_fold, - self.kfold, - self.fold_seed, - meta=_add_fold(ddf._meta.index, self.kfold, self.fold_seed), - ) - - # Specify to workflow that the ddf has been updated - self._ddf_out = ddf - - # Add new col_groups with fold - - for group in self.fold_groups: - if isinstance(group, list): - col_groups.append([self.fold_name] + group) - else: - col_groups.append([self.fold_name, group]) - - # Make sure concat - if self.concat_groups: - raise ValueError("cannot use concat_groups=True with folds.") - - agg_cols = self.cont_names - agg_list = self.stats - dsk, key = nvt_cat._category_stats( - ddf, - col_groups, - agg_cols, - agg_list, - self.out_path, - self.freq_threshold, - self.tree_width, - self.on_host, - concat_groups=self.concat_groups, - name_sep=self.name_sep, - ) - return Delayed(key, dsk) - - def finalize(self, dask_stats): - for col in dask_stats: - self.categories[col] = dask_stats[col] - - def registered_stats(self): - return [self.stat_name] - - def stats_collected(self): - result = [(self.stat_name, self.categories)] - return result - - def clear(self): - self.categories = {} - return diff --git a/nvtabular/ops/join_groupby.py b/nvtabular/ops/join_groupby.py index 6077a2f952e..36767755615 100644 --- a/nvtabular/ops/join_groupby.py +++ b/nvtabular/ops/join_groupby.py @@ 
-62,11 +62,19 @@ class JoinGroupby(StatOperator): Categorical columns (or multi-column "groups") to target for this op. If None, the operation will target all known categorical columns. tree_width : dict or int, optional - Passed to `GroupbyStatistics` dependency. + Tree width of the hash-based groupby reduction for each categorical + column. High-cardinality columns may require a large `tree_width`, + while low-cardinality columns can likely use `tree_width=1`. + If passing a dict, each key and value should correspond to the column + name and width, respectively. The default value is 8 for all columns. out_path : str, optional - Passed to `GroupbyStatistics` dependency. + Root directory where groupby statistics will be written out in + parquet format. on_host : bool, default True - Passed to `GroupbyStatistics` dependency. + Whether to convert cudf data to pandas between tasks in the hash-based + groupby reduction. The extra host <-> device data movement can reduce + performance. However, using `on_host=True` typically improves stability + (by avoiding device-level memory pressure). name_sep : str, default "_" String separator to use between concatenated column names for multi-column groups. diff --git a/nvtabular/ops/target_encoding.py b/nvtabular/ops/target_encoding.py index a3354ecdf26..e341823a65e 100644 --- a/nvtabular/ops/target_encoding.py +++ b/nvtabular/ops/target_encoding.py @@ -77,13 +77,9 @@ class TargetEncoding(StatOperator): Global mean of the target column to use for encoding. Supplying this value up-front will improve performance. kfold : int, default 3 - Number of cross-validation folds to use while gathering - statistics (during `GroupbyStatistics`). + Number of cross-validation folds to use while gathering statistics. fold_seed : int, default 42 Random seed to use for cupy-based fold assignment. - drop_folds : bool, default True - Whether to drop the "__fold__" column created by the - `GroupbyStatistics` dependency (after the transformation). p_smooth : int, default 20 Smoothing factor. out_col : str or list of str, default is problem-specific @@ -93,14 +89,24 @@ class TargetEncoding(StatOperator): out_dtype : str, default is problem-specific dtype of output target-encoding columns. tree_width : dict or int, optional - Passed to `GroupbyStatistics` dependency. + Tree width of the hash-based groupby reduction for each categorical + column. High-cardinality columns may require a large `tree_width`, + while low-cardinality columns can likely use `tree_width=1`. + If passing a dict, each key and value should correspond to the column + name and width, respectively. The default value is 8 for all columns. out_path : str, optional - Passed to `GroupbyStatistics` dependency. + Root directory where category statistics will be written out in + parquet format. on_host : bool, default True - Passed to `GroupbyStatistics` dependency. + Whether to convert cudf data to pandas between tasks in the hash-based + groupby reduction. The extra host <-> device data movement can reduce + performance. However, using `on_host=True` typically improves stability + (by avoiding device-level memory pressure). name_sep : str, default "_" String separator to use between concatenated column names for multi-column groups. + drop_folds : bool, default True + Whether to drop the "__fold__" column created. This is really only useful for unittests. 
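With `GroupbyStatistics` removed, these options are now configured on the consuming operator directly. A minimal sketch, assuming a toy `nvt.Dataset` named `train_dataset` that contains columns `cat1`, `cat2` and `num1`::

    import nvtabular as nvt
    from nvtabular import ops

    # options that were previously forwarded to a separate GroupbyStatistics
    # dependency are now passed straight to the operator itself
    groupby_features = ["cat1", "cat2"] >> ops.JoinGroupby(
        cont_names=["num1"],
        stats=["count", "mean"],
        tree_width=4,        # width of the hash-based groupby reduction
        out_path="./stats",  # where the parquet statistics are written
        on_host=True,        # trade some speed for stability
    )

    workflow = nvt.Workflow(groupby_features)
    workflow.fit(train_dataset)
    new_gdf = workflow.transform(train_dataset).to_ddf().compute()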
""" def __init__( @@ -117,7 +123,6 @@ def __init__( out_path=None, on_host=True, name_sep="_", - stat_name=None, drop_folds=True, ): super().__init__() From 0e55c2ad534c99afb4001de8117c7e47f85d091b Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 16 Dec 2020 21:25:06 -0800 Subject: [PATCH 16/23] implement serialization of statistics add save_stats/load_stats/clear_stats methods to the workflow, with each statoperator getting called as appropiate --- nvtabular/ops/categorify.py | 15 ++++++- nvtabular/ops/fill.py | 9 ++++ nvtabular/ops/join_groupby.py | 10 +++++ nvtabular/ops/normalize.py | 22 ++++++++++ nvtabular/ops/stat_operator.py | 13 +++++- nvtabular/ops/target_encoding.py | 13 +++++- nvtabular/workflow.py | 72 ++++++++++++++++++++++++++++++-- 7 files changed, 145 insertions(+), 9 deletions(-) diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index 1fcf8523308..d443cf3c011 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -239,7 +239,7 @@ def __init__( "`num_buckets` must be dict or int, got type {}".format(type(num_buckets)) ) - @annotate("Categorify_transform", color="darkgreen", domain="nvt_python") + @annotate("Categorify_fit", color="darkgreen", domain="nvt_python") def fit(self, columns, ddf): # User passed in a list of column groups. We need to figure out # if this list contains any multi-column groups, and if there @@ -281,6 +281,15 @@ def fit_finalize(self, dask_stats): for col in dask_stats: self.categories[col] = dask_stats[col] + def save(self): + return self.categories + + def load(self, data): + self.categories = data + + def clear(self): + self.categories = {} + @annotate("Categorify_transform", color="darkgreen", domain="nvt_python") def transform( self, @@ -380,7 +389,9 @@ def get_embedding_sizes(workflow): current = queue.pop() if current.op and hasattr(current.op, "get_embedding_sizes"): output.update(current.op.get_embedding_sizes(current.columns)) - if current.kind == "+" or current.kind.startswith("-"): + elif not current.op: + # only follow parents if its not an operator node (which could + # transform meaning of the get_embedding_sizes queue.extend(current.parents) return output diff --git a/nvtabular/ops/fill.py b/nvtabular/ops/fill.py index c00fc75abe5..9e7bc97c299 100644 --- a/nvtabular/ops/fill.py +++ b/nvtabular/ops/fill.py @@ -92,3 +92,12 @@ def fit(self, columns, ddf): def fit_finalize(self, dask_stats): for col in dask_stats.index.values_host: self.medians[col] = float(dask_stats[col]) + + def save(self): + return self.medians + + def load(self, data): + self.medians = data + + def clear(self): + self.medians = {} diff --git a/nvtabular/ops/join_groupby.py b/nvtabular/ops/join_groupby.py index 36767755615..6b219d23a35 100644 --- a/nvtabular/ops/join_groupby.py +++ b/nvtabular/ops/join_groupby.py @@ -178,3 +178,13 @@ def output_column_names(self, columns): else: output.append(f"{name}_{cont}_{stat}") return output + + def save(self): + return [self.categories, self.storage_name] + + def load(self, stats): + self.categories, self.storage_name = stats + + def clear(self): + self.categories = {} + self.storage_name = {} diff --git a/nvtabular/ops/normalize.py b/nvtabular/ops/normalize.py index de4cf009d38..34237cbea92 100644 --- a/nvtabular/ops/normalize.py +++ b/nvtabular/ops/normalize.py @@ -73,6 +73,17 @@ def transform(self, columns, gdf: cudf.DataFrame): new_gdf[name] = new_gdf[name].astype("float32") return new_gdf + def save(self): + return {"means": self.means, "stds": self.stds} + + def 
load(self, data): + self.means = data["means"] + self.stds = data["stds"] + + def clear(self): + self.means = {} + self.stds = {} + class NormalizeMinMax(StatOperator): """ @@ -132,3 +143,14 @@ def fit_finalize(self, dask_stats): for col in dask_stats["mins"].index.values_host: self.mins[col] = dask_stats["mins"][col] self.maxs[col] = dask_stats["maxs"][col] + + def save(self): + return {"mins": self.mins, "maxs": self.maxs} + + def load(self, data): + self.mins = data["mins"] + self.maxs = data["maxs"] + + def clear(self): + self.mins = {} + self.maxs = {} diff --git a/nvtabular/ops/stat_operator.py b/nvtabular/ops/stat_operator.py index d90455b5ff2..aa01eb5d79d 100644 --- a/nvtabular/ops/stat_operator.py +++ b/nvtabular/ops/stat_operator.py @@ -35,5 +35,16 @@ def fit_finalize(self, dask_stats): """Follow-up operations to convert dask statistics in to member variables""" ) + def save(self): + """Returns a json-able representation of the statistics for this object. This + is usually called by the workflow rather than diretly""" + raise NotImplementedError(".save isn't implemented for this op!") + + def load(self, data): + """Loads statistics from a json-able blob of data. This is usually called + by the workflow rather than called directly""" + raise NotImplementedError(".load isn't implemented for this op!") + def clear(self): - raise NotImplementedError("""zero and reinitialize all relevant statistical properties""") + """ zero and reinitialize all relevant statistical properties""" + raise NotImplementedError(".clear isn't implemented for this op!") diff --git a/nvtabular/ops/target_encoding.py b/nvtabular/ops/target_encoding.py index e341823a65e..994cd193f40 100644 --- a/nvtabular/ops/target_encoding.py +++ b/nvtabular/ops/target_encoding.py @@ -192,8 +192,6 @@ def fit(self, columns, ddf): def fit_finalize(self, dask_stats): for col, value in dask_stats[0].items(): self.stats[col] = value - print(self.stats) - for col in dask_stats[1].index: self.means[col] = float(dask_stats[1]["mean"].loc[col]) @@ -211,6 +209,17 @@ def output_column_names(self, columns): return ret + def save(self): + return {"stats": self.stats, "means": self.means} + + def load(self, data): + self.stats = data["stats"] + self.means = data["means"] + + def clear(self): + self.stats = {} + self.means = {} + def _make_te_name(self, cat_group): tag = nvt_cat._make_name(*cat_group, sep=self.name_sep) return [f"TE_{tag}_{x}" for x in self.target] diff --git a/nvtabular/workflow.py b/nvtabular/workflow.py index 863614a5e92..36a35be8175 100644 --- a/nvtabular/workflow.py +++ b/nvtabular/workflow.py @@ -17,6 +17,7 @@ import cudf import dask +import yaml from dask.core import flatten from nvtabular.column_group import ColumnGroup, iter_nodes @@ -79,10 +80,7 @@ def fit(self, dataset): # Get a dictionary mapping all StatOperators we need to fit to a set of any dependant # StatOperators (having StatOperators that depend on the output of other StatOperators # means that will have multiple phases in the fit cycle here) - def get_stat_ops(nodes): - return set(node for node in iter_nodes(nodes) if isinstance(node.op, StatOperator)) - - stat_ops = {op: get_stat_ops(op.parents) for op in get_stat_ops([self.column_group])} + stat_ops = {op: _get_stat_ops(op.parents) for op in _get_stat_ops([self.column_group])} while stat_ops: # get all the StatOperators that we can currently call fit on (no outstanding @@ -136,6 +134,68 @@ def fit_transform(self, dataset): self.fit(dataset) return self.transform(dataset) + def save_stats(self, path): 
+ node_ids = {} + output_data = [] + + def add_node(node): + if node in node_ids: + return node_ids[node] + + data = { + "columns": node.columns, + } + if node.parents: + data["name"] = node.label + data["parents"] = [add_node(parent) for parent in node.parents] + else: + data["name"] = "input" + + if isinstance(node.op, StatOperator): + data["stats"] = node.op.save() + + nodeid = len(output_data) + data["id"] = nodeid + node_ids[node] = nodeid + output_data.append(data) + return nodeid + + # recursively save each operator, providing enough context + # to (columns/labels etc) to load again + add_node(self.column_group) + with open(path, "w") as outfile: + yaml.safe_dump(output_data, outfile, default_flow_style=False) + + def load_stats(self, path): + def load_node(nodeid, node): + saved = nodes[nodeid] + if "parents" not in saved: + return + + if node.label != saved["name"]: + raise ValueError( + "Failed to load saved statistics: names %s != %s" % (node.label, saved["name"]) + ) + if node.columns != saved["columns"]: + raise ValueError( + "Failed to load saved statistics: columns %s != %s" + % (node.columns, saved["column"]) + ) + + if isinstance(node.op, StatOperator): + node.op.load(saved["stats"]) + + for parentid, parent in zip(saved["parents"], node.parents): + load_node(parentid, parent) + + # recursively load each operator in the graph + nodes = yaml.safe_load(open(path)) + load_node(nodes[-1]["id"], self.column_group) + + def clear_stats(self): + for stat in _get_stat_ops([self.column_group]): + stat.op.clear() + def _input_columns(self): input_nodes = set(node for node in iter_nodes([self.column_group]) if not node.parents) return list(set(col for node in input_nodes for col in node.flattened_columns)) @@ -161,6 +221,10 @@ def _transform_ddf(ddf, column_groups): ) +def _get_stat_ops(nodes): + return set(node for node in iter_nodes(nodes) if isinstance(node.op, StatOperator)) + + def _transform_partition(root_gdf, column_groups): """ Transforms a single partition by appyling all operators in a ColumnGroup """ output = cudf.DataFrame() From ee9367c168eab47395685683cdb771d2e8803d81 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 16 Dec 2020 22:14:56 -0800 Subject: [PATCH 17/23] Fix TF dataloader unittests --- nvtabular/loader/tensorflow.py | 6 ------ nvtabular/loader/torch.py | 1 - tests/unit/test_tf_dataloader.py | 23 ++++++++++------------- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/nvtabular/loader/tensorflow.py b/nvtabular/loader/tensorflow.py index 1dc349a404f..0961bb19128 100644 --- a/nvtabular/loader/tensorflow.py +++ b/nvtabular/loader/tensorflow.py @@ -175,10 +175,6 @@ class KerasSequenceLoader(tf.keras.utils.Sequence, DataLoader): the last chunk in a dataset, which will, in general, be smaller). Larger chunk sizes will lead to more efficieny and randomness, but require more memory. - - workflows: list(nvtabular.Workflow) - `nvtabular.Workflow`s for applying online preprocessing. Must - be consistent with the dataset schema specified by `feature_columns` - or `cat_names` and `cont_names` - devices: None Which GPU devices to load from. 
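A usage sketch for the new serialization hooks (the column names are illustrative, and `train_dataset` is assumed to be an existing `nvt.Dataset`)::

    import nvtabular as nvt
    from nvtabular import ops

    cats = ["userID", "itemID"] >> ops.Categorify()
    conts = ["price"] >> ops.FillMedian() >> ops.Normalize()

    workflow = nvt.Workflow(cats + conts + ["label"])
    workflow.fit(train_dataset)

    # persist the fitted statistics (categories, medians, means/stds, ...) to yaml
    workflow.save_stats("stats.yaml")

    # later, or in another process: rebuild the same graph, then reload the
    # statistics instead of refitting
    workflow.clear_stats()
    workflow.load_stats("stats.yaml")
    new_gdf = workflow.transform(train_dataset).to_ddf().compute()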
Ignored for now - parts_per_chunk: int @@ -203,7 +199,6 @@ def __init__( engine=None, shuffle=True, buffer_size=0.1, - workflows=None, devices=None, parts_per_chunk=1, reader_kwargs=None, @@ -229,7 +224,6 @@ def __init__( batch_size, shuffle, parts_per_chunk=parts_per_chunk, - workflows=workflows, devices=devices, ) diff --git a/nvtabular/loader/torch.py b/nvtabular/loader/torch.py index 1549e60dd49..d7c367fb2a0 100644 --- a/nvtabular/loader/torch.py +++ b/nvtabular/loader/torch.py @@ -80,7 +80,6 @@ def __init__( batch_size, shuffle, parts_per_chunk=parts_per_chunk, - workflows=None, devices=devices, ) diff --git a/tests/unit/test_tf_dataloader.py b/tests/unit/test_tf_dataloader.py index 6f8cc3682a3..73dd18d2ee4 100644 --- a/tests/unit/test_tf_dataloader.py +++ b/tests/unit/test_tf_dataloader.py @@ -81,14 +81,15 @@ def test_tf_gpu_dl(tmpdir, paths, use_paths, dataset, batch_size, gpu_memory_fra columns = cont_names + cat_names - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - processor.add_feature([ops.FillMedian()]) - processor.add_feature(ops.Normalize()) - processor.add_preprocess(ops.Categorify()) - processor.finalize() + conts = cont_names >> ops.FillMedian() >> ops.Normalize() + cats = cat_names >> ops.Categorify() + + workflow = nvt.Workflow(conts + cats + label_name) + workflow.fit(dataset) + workflow.transform(dataset).to_parquet(tmpdir + "/processed") data_itr = tf_dataloader.KerasSequenceLoader( - paths if use_paths else dataset, + str(tmpdir + "/processed"), # workflow.transform(dataset), cat_names=cat_names, cont_names=cont_names, batch_size=batch_size, @@ -98,8 +99,6 @@ def test_tf_gpu_dl(tmpdir, paths, use_paths, dataset, batch_size, gpu_memory_fra shuffle=False, ) _ = tf.random.uniform((1,)) - processor.update_stats(dataset) - data_itr.map(processor) rows = 0 for idx in range(len(data_itr)): @@ -188,19 +187,17 @@ def test_mh_support(tmpdir, batch_size): cont_names = ["Embedding"] label_name = ["Post"] - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - processor.add_preprocess(ops.HashBucket(num_buckets=10)) - processor.finalize() + cats = cat_names >> ops.HashBucket(num_buckets=10) + workflow = nvt.Workflow(cats + cont_names + label_name) data_itr = tf_dataloader.KerasSequenceLoader( - nvt.Dataset(df), + workflow.transform(nvt.Dataset(df)), cat_names=cat_names, cont_names=cont_names, label_names=label_name, batch_size=batch_size, shuffle=False, ) - data_itr.map(processor) idx = 0 for X, y in data_itr: From 7bf624fb7fbe67391de43d18b0e7fdd716fb96ef Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 16 Dec 2020 23:25:29 -0800 Subject: [PATCH 18/23] test_torch_dataloader fixes --- tests/unit/test_torch_dataloader.py | 78 ++++++++++++----------------- 1 file changed, 32 insertions(+), 46 deletions(-) diff --git a/tests/unit/test_torch_dataloader.py b/tests/unit/test_torch_dataloader.py index d8f864597ca..99332f71402 100644 --- a/tests/unit/test_torch_dataloader.py +++ b/tests/unit/test_torch_dataloader.py @@ -51,27 +51,26 @@ def test_gpu_file_iterator_ds(df, dataset, batch, engine): @pytest.mark.parametrize("cont_names", [["x", "y", "id"], []]) @pytest.mark.parametrize("label_name", [["label"], []]) def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name): - # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over - # empty cats/conts - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, 
label_name=label_name) + features = [] if cont_names: - processor.add_feature([ops.FillMedian()]) - processor.add_feature(ops.Normalize()) + features.append(cont_names >> ops.FillMedian() >> ops.Normalize()) if cat_names: - processor.add_feature(ops.Categorify()) + features.append(cat_names >> ops.Categorify()) + + # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over + # empty cats/conts + graph = sum(features, nvt.ColumnGroup(label_name)) + if not graph.columns: + # if we don't have conts/cats/labels we're done + return + + processor = nvt.Workflow(sum(features, nvt.ColumnGroup(label_name))) output_train = os.path.join(tmpdir, "train/") os.mkdir(output_train) - processor.apply( - dataset, - apply_offline=True, - record_stats=True, - shuffle=nvt.io.Shuffle.PER_PARTITION, - output_format=None, - ) - df_out = processor.get_ddf().compute(scheduler="synchronous") + df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous") data_itr = torch_dataloader.TorchAsyncItr( nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1 @@ -96,19 +95,15 @@ def test_gpu_dl(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, devi cont_names = ["x", "y", "id"] label_name = ["label"] - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) + conts = cont_names >> ops.FillMedian() >> ops.Normalize() + cats = cat_names >> ops.Categorify() - processor.add_feature([ops.FillMedian()]) - processor.add_feature(ops.Normalize()) - processor.add_preprocess(ops.Categorify()) + processor = nvt.Workflow(conts + cats + label_name) output_train = os.path.join(tmpdir, "train/") os.mkdir(output_train) - processor.apply( - dataset, - apply_offline=True, - record_stats=True, + processor.fit_transform(dataset).to_parquet( shuffle=nvt.io.Shuffle.PER_PARTITION, output_path=output_train, out_files_per_proc=2, @@ -168,23 +163,18 @@ def test_kill_dl(tmpdir, df, dataset, part_mem_fraction, engine): cont_names = ["x", "y", "id"] label_name = ["label"] - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) + conts = cont_names >> ops.FillMedian() >> ops.Normalize() + cats = cat_names >> ops.Categorify() - processor.add_feature([ops.FillMedian()]) - processor.add_feature(ops.Normalize()) - processor.add_feature(ops.Categorify()) + processor = nvt.Workflow(conts + cats + label_name) output_train = os.path.join(tmpdir, "train/") os.mkdir(output_train) - processor.finalize() - - processor.apply( - dataset, - apply_offline=True, - record_stats=True, + processor.fit_transform(dataset).to_parquet( shuffle=nvt.io.Shuffle.PER_PARTITION, output_path=output_train, + out_files_per_proc=2, ) tar_paths = [ @@ -239,11 +229,10 @@ def test_mh_support(tmpdir): cont_names = [] label_name = ["Post"] - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - processor.add_preprocess(ops.HashBucket(num_buckets=10)) - processor.finalize() - processor.apply(nvt.Dataset(df), output_format=None) - df_out = processor.get_ddf().compute(scheduler="synchronous") + cats = cat_names >> ops.HashBucket(num_buckets=10) + + processor = nvt.Workflow(cats + label_name) + df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous") # check to make sure that the same strings are hashed the same authors = df_out["Authors"].to_arrow().to_pylist() @@ -282,15 +271,12 @@ def test_mh_model_support(tmpdir): label_name = ["Post"] out_path = 
os.path.join(tmpdir, "train/") os.mkdir(out_path) - processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name) - processor.add_preprocess(ops.Normalize()) - processor.add_preprocess(ops.Categorify()) - processor.finalize() - processor.apply( - nvt.Dataset(df), - record_stats=True, - ) - df_out = processor.get_ddf().compute(scheduler="synchronous") + + cats = cat_names >> ops.Categorify() + conts = cont_names >> ops.Normalize() + + processor = nvt.Workflow(cats + conts + label_name) + df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute() data_itr = torch_dataloader.TorchAsyncItr( nvt.Dataset(df_out), cats=cat_names, From 4c9918612c6ec5bfcf36a9517fd066ff6f6b647a Mon Sep 17 00:00:00 2001 From: Benedikt Schifferer Date: Thu, 17 Dec 2020 10:42:13 +0000 Subject: [PATCH 19/23] doc strings --- nvtabular/ops/bucketize.py | 20 ++++++++++++- nvtabular/ops/categorify.py | 50 ++++++++++++++------------------ nvtabular/ops/difference_lag.py | 5 ++-- nvtabular/ops/dropna.py | 15 ++-------- nvtabular/ops/fill.py | 12 ++------ nvtabular/ops/filter.py | 3 +- nvtabular/ops/hash_bucket.py | 33 ++++----------------- nvtabular/ops/join_external.py | 30 +++++++------------ nvtabular/ops/join_groupby.py | 23 ++++----------- nvtabular/ops/logop.py | 18 +++--------- nvtabular/ops/normalize.py | 36 +++++------------------ nvtabular/ops/rename.py | 6 ++++ nvtabular/ops/target_encoding.py | 29 +++++++++--------- 13 files changed, 106 insertions(+), 174 deletions(-) diff --git a/nvtabular/ops/bucketize.py b/nvtabular/ops/bucketize.py index 11a5dfbcc8e..905555a3e38 100644 --- a/nvtabular/ops/bucketize.py +++ b/nvtabular/ops/bucketize.py @@ -20,7 +20,25 @@ class Bucketize(Operator): - """""" + """This operation transforms continuous features into categorical features. 
+ The outputs are bins based on the boundaries + + Example usage:: + + # + cont_names = ['cont1', 'cont2'] + boundaries = { + 'cont1': [-50, 0, 50], + 'cont2': [0, 25, 50, 75, 100] + } + bucketize_op = cont_names >> ops.Bucketize(boundaries) + processor = nvt.Workflow(bucketize_op) + + Parameters + ---------- + boundaries : int, dict or callable + Defines how to transform the continous values into bins + """ def __init__(self, boundaries): # transform boundaries into a lookup function on column names diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index d443cf3c011..37b2ecb59dd 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -47,15 +47,13 @@ class Categorify(StatOperator): Example usage:: - # Initialize the workflow - proc = nvt.Workflow( - cat_names=CATEGORICAL_COLUMNS, - cont_names=CONTINUOUS_COLUMNS, - label_name=LABEL_COLUMNS - ) + # Define pipeline + cat_features = CATEGORICAL_COLUMNS >> nvt.ops.Categorify(freq_threshold=10) - # Add Categorify for categorical columns to the workflow - proc.add_cat_preprocess(nvt.ops.Categorify(freq_threshold=10)) + # Initialize the workflow and execute it + proc = nvt.Workflow(cat_features) + proc.fit(dataset) + proc.transform(dataset).to_parquet('./test/') Example for frequency hashing:: @@ -65,21 +63,18 @@ class Categorify(StatOperator): 'productID': [100, 101, 102, 101, 102, 103, 103], 'label': [0, 0, 1, 1, 1, 0, 0] }) + CATEGORICAL_COLUMNS = ['author', 'productID'] - # Initialize the workflow - proc = nvt.Workflow( - cat_names=['author', 'productID'], - cont_names=[], - label_name=['label'] - ) - - # Add num_buckets param and freq_threshold param - proc.add_cat_preprocess(nvt.ops.Categorify( + # Define pipeline + cat_features = CATEGORICAL_COLUMNS >> nvt.ops.Categorify( freq_threshold={"author": 3, "productID": 2}, num_buckets={"author": 10, "productID": 20}) ) - # Apply workflow - proc.apply(nvt.Dataset(df), record_stats=True, output_path='./test/') + + # Initialize the workflow and execute it + proc = nvt.Workflow(cat_features) + proc.fit(dataset) + proc.transform(dataset).to_parquet('./test/') Example with multi-hot:: @@ -90,18 +85,15 @@ class Categorify(StatOperator): 'categories': [['Cat A', 'Cat B'], ['Cat C'], ['Cat A', 'Cat C', 'Cat D']], 'label': [0,0,1] }) + CATEGORICAL_COLUMNS = ['userID', 'productID', 'categories'] - # Initialize the workflow - proc = nvt.Workflow( - cat_names=['userID', 'productID', 'categories'], - cont_names=[], - label_name=['label'] - ) + # Define pipeline + cat_features = CATEGORICAL_COLUMNS >> nvt.ops.Categorify() - # Add Categorify for categorical columns to the workflow - proc.add_preprocess(nvt.ops.Categorify()) - # Apply workflow - proc.apply(nvt.Dataset(df), record_stats=True, output_path='./test/') + # Initialize the workflow and execute it + proc = nvt.Workflow(cat_features) + proc.fit(dataset) + proc.transform(dataset).to_parquet('./test/') Parameters ----------- diff --git a/nvtabular/ops/difference_lag.py b/nvtabular/ops/difference_lag.py index e457dcc0f06..e3f9e3c303d 100644 --- a/nvtabular/ops/difference_lag.py +++ b/nvtabular/ops/difference_lag.py @@ -36,11 +36,12 @@ class DifferenceLag(Operator): # create a new nvtabular dataset on the partitioned/sorted values dataset = nvtabular.Dataset(ddf) - Once passed an appropiate dataset, this operator can be added to a nvtabular workflow to + Once passed an appropiate dataset, this operator can be used to create a workflow to compute the lagged difference within a partition:: # compute the delta in 
timestamp for each users session - workflow.add_feature(DifferenceLag('userid', columns=["timestamp"])) + diff_features = ["quantity"] >> ops.DifferenceLag(partition_cols=["userid"], shift=[1, -1]) + processor = nvtabular.Workflow(diff_features) Parameters ----------- diff --git a/nvtabular/ops/dropna.py b/nvtabular/ops/dropna.py index 46d6fcdf06a..cce3e8aed2b 100644 --- a/nvtabular/ops/dropna.py +++ b/nvtabular/ops/dropna.py @@ -25,22 +25,13 @@ class Dropna(Operator): Example usage:: - # Initialize the workflow - proc = nvt.Workflow( - cat_names=CATEGORICAL_COLUMNS, - cont_names=CONTINUOUS_COLUMNS, - label_name=LABEL_COLUMNS - ) - - # Add Dropna to the workflow and specify which columns to apply to + # Use Dropna to define a NVTabular workflow # Default is None and will check all columns - proc.add_preprocess(nvt.ops.Dropna(columns=['cat1', 'num1'])) + dropna_features = ['cat1', 'num1'] >> ops.Dropna() >> ... + processor = nvtabular.Workflow(dropna_features) Parameters ---------- - columns : list of str, default None - Columns to target for this op. If None, this operator will check all columns - for null values. """ @annotate("Dropna_op", color="darkgreen", domain="nvt_python") diff --git a/nvtabular/ops/fill.py b/nvtabular/ops/fill.py index 9e7bc97c299..4e89c65a7f9 100644 --- a/nvtabular/ops/fill.py +++ b/nvtabular/ops/fill.py @@ -26,16 +26,10 @@ class FillMissing(Operator): Example usage:: - # Initialize the workflow - proc = nvt.Workflow( - cat_names=CATEGORICAL_COLUMNS, - cont_names=CONTINUOUS_COLUMNS, - label_name=LABEL_COLUMNS - ) - - # Add FillMissing to the workflow for continuous columns and specify the fill value + # Use FillMissing to define a workflow for continuous columns and specify the fill value # Default is 0 - proc.add_cont_feature(nvt.ops.FillMissing(fill_val=100)) + cont_features = ['cont1', 'cont2', 'cont3'] >> ops.FillMissing() >> ... + processor = nvtabular.Workflow(cont_features) Parameters ----------- diff --git a/nvtabular/ops/filter.py b/nvtabular/ops/filter.py index 3ee88547f03..1d2de6fcde8 100644 --- a/nvtabular/ops/filter.py +++ b/nvtabular/ops/filter.py @@ -26,7 +26,8 @@ class Filter(Operator): For example to filter out all rows that have a negative value in the ``a`` column:: - op = Filter(lambda df: df["a"] >=0) + filtered = cont_names >> ops.Filter(f=lambda df: df["a"] >=0) + processor = nvtabular.Workflow(filtered) Parameters ----------- diff --git a/nvtabular/ops/hash_bucket.py b/nvtabular/ops/hash_bucket.py index c586b2d8340..fce46f9783f 100644 --- a/nvtabular/ops/hash_bucket.py +++ b/nvtabular/ops/hash_bucket.py @@ -30,34 +30,14 @@ class HashBucket(Operator): Example usage:: cat_names = ["feature_a", "feature_b"] - cont_names = ... - label_name = ... 
- workflow = nvt.Workflow( - cat_names=cat_names, cont_names=cont_names, label_name=label_names - ) # this will hash both features a and b to 100 buckets - op = nvt.ops.HashBucket(100) + hash_features = cat_names >> ops.HashBucket({"feature_a": 100, "feature_b": 50}) + processor = nvtabular.Workflow(hash_features) - # for different numbers of buckets per feature, initialize with a dict - op = nvt.ops.HashBucket({"feature_a": 100, "feature_b": 50}) - - # or, equivalently - op = nvt.ops.HashBucket( - num_buckets=[100, 50], columns=["feature_a", "feature_b"] - ) - - workflow.add_cat_preprocess(op) The output of this op would be:: - workflow.finalize() - gdf = cudf.DataFrame({ - "feature_a": [101588, 2214177, 92855], - "feature_b": ["foo", "bar", "baz"] - }) - workflow.apply_ops(gdf) - feature_a feature_b 0 90 11 1 70 40 @@ -71,15 +51,12 @@ class HashBucket(Operator): Parameters ---------- - num_buckets : int, list of int, or dictionary:{column: num_hash_buckets} + num_buckets : int or dictionary:{column: num_hash_buckets} Column-wise modulo to apply after hash function. Note that this means that the corresponding value will be the categorical cardinality of the transformed categorical feature. If given as an int, that value - will be used as the number of "hash buckets" for every feature. If - a list is provided, it must be of the same length as `columns` (which - should not be `None`), and the values will correspond to the number - of buckets to use for the feature specified at the same index in - `columns`. If a dictionary is passed, it will be used to specify + will be used as the number of "hash buckets" for every feature. + If a dictionary is passed, it will be used to specify explicit mappings from a column name to a number of buckets. In this case, only the columns specified in the keys of `num_buckets` will be transformed. diff --git a/nvtabular/ops/join_external.py b/nvtabular/ops/join_external.py index 773ef78ec2c..cf56c035204 100644 --- a/nvtabular/ops/join_external.py +++ b/nvtabular/ops/join_external.py @@ -30,28 +30,20 @@ class JoinExternal(Operator): Example usage:: - # Initialize the workflow - proc = nvt.Workflow( - cat_names=CATEGORICAL_COLUMNS, - cont_names=CONTINUOUS_COLUMNS, - label_name=LABEL_COLUMNS - ) - # Load dataset which should be joined to the main dataset df_external = cudf.read_parquet('external.parquet') - # Add JoinExternal to the workflow - proc.add_preprocess( - nvt.ops.JoinExternal( - df_external, - on=['key1', 'key2'], - on_ext=['key1_ext', 'key2_ext'], - how='left', - columns_ext=['key1_ext', 'key2_ext', 'cat1', 'cat2', 'num1'], - kind_ext='cudf', - cache='device' - ) - ) + # Use JoinExternal to define a NVTabular workflow + joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal( + df_ext, + on=['key1', 'key2'], + on_ext=['key1_ext', 'key2_ext'], + how='left', + columns_ext=['key1_ext', 'key2_ext', 'cat1', 'cat2', 'num1'], + kind_ext='cudf', + cache='device' + ) >> ... 
+ processor = nvtabular.Workflow(joined) Parameters ----------- diff --git a/nvtabular/ops/join_groupby.py b/nvtabular/ops/join_groupby.py index 6b219d23a35..2768d0eaad0 100644 --- a/nvtabular/ops/join_groupby.py +++ b/nvtabular/ops/join_groupby.py @@ -33,20 +33,11 @@ class JoinGroupby(StatOperator): Example usage:: - # Initialize the workflow - proc = nvt.Workflow( - cat_names=CATEGORICAL_COLUMNS, - cont_names=CONTINUOUS_COLUMNS, - label_name=LABEL_COLUMNS - ) - - # Add JoinGroupby to the workflow - proc.add_feature( - JoinGroupby( - columns=['cat1', 'cat2', 'cat3'], # columns which are groupby - cont_names=['num1'], # continuous column, which the statistics are applied to - stats=['sum','count']), # statistics, which are applied + # Use JoinGroupby to define a NVTabular workflow + groupby_features = ['cat1', 'cat2', 'cat3'] >> ops.JoinGroupby( + out_path=str(tmpdir), stats=['sum','count'], cont_names=['num1'] ) + processor = nvtabular.Workflow(groupby_features) Parameters ----------- @@ -58,15 +49,14 @@ class JoinGroupby(StatOperator): that "count" corresponds to the group itself, while all other statistics correspond to a specific continuous column. Supported statistics include ["count", "sum", "mean", "std", "var"]. - columns : list of str or list(str), default None - Categorical columns (or multi-column "groups") to target for this op. - If None, the operation will target all known categorical columns. tree_width : dict or int, optional Tree width of the hash-based groupby reduction for each categorical column. High-cardinality columns may require a large `tree_width`, while low-cardinality columns can likely use `tree_width=1`. If passing a dict, each key and value should correspond to the column name and width, respectively. The default value is 8 for all columns. + cat_cache: ToDo Describe + TEXT out_path : str, optional Root directory where groupby statistics will be written out in parquet format. @@ -89,7 +79,6 @@ def __init__( out_path=None, on_host=True, name_sep="_", - stat_name=None, ): super().__init__() diff --git a/nvtabular/ops/logop.py b/nvtabular/ops/logop.py index 73ba45892f4..d7d4471bb0d 100644 --- a/nvtabular/ops/logop.py +++ b/nvtabular/ops/logop.py @@ -28,23 +28,13 @@ class LogOp(Operator): Example usage:: - # Initialize the workflow - proc = nvt.Workflow( - cat_names=CATEGORICAL_COLUMNS, - cont_names=CONTINUOUS_COLUMNS, - label_name=LABEL_COLUMNS - ) - - # Add LogOp to the workflow for continuous columns - proc.add_cont_feature(nvt.ops.LogOp()) + # Use LogOp to define NVTabular workflow + cont_features = cont_names >> nvt.ops.LogOp() >> ... + processor = nvt.Workflow(cont_features) Parameters ---------- - columns : list of str, default None - Continuous columns to target for this op. If None, the operation will target all known - continuous columns. - replace : bool, default False - Whether to replace existing columns or create new ones. 
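Taken together, the updated docstrings describe pipelines that are built by chaining operators with `>>`. A small end-to-end sketch combining several of them (the column names and the `train_dataset` object are assumed, not part of the patch)::

    import nvtabular as nvt
    from nvtabular import ops

    cont_names = ["num1", "num2"]
    cont_features = (
        cont_names
        >> ops.FillMissing(fill_val=0)
        >> ops.LogOp()
        >> ops.Rename(postfix="_log")
        >> ops.Normalize()
    )

    workflow = nvt.Workflow(cont_features)
    # fit gathers the statistics Normalize needs; transform applies the whole chain
    new_gdf = workflow.fit_transform(train_dataset).to_ddf().compute()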
+ """ @annotate("LogOp_op", color="darkgreen", domain="nvt_python") diff --git a/nvtabular/ops/normalize.py b/nvtabular/ops/normalize.py index 34237cbea92..0bc63efbdd1 100644 --- a/nvtabular/ops/normalize.py +++ b/nvtabular/ops/normalize.py @@ -31,23 +31,13 @@ class Normalize(StatOperator): Example usage:: - # Initialize the workflow - proc = nvt.Workflow( - cat_names=CATEGORICAL_COLUMNS, - cont_names=CONTINUOUS_COLUMNS, - label_name=LABEL_COLUMNS - ) - - # Add Normalize to the workflow for continuous columns - proc.add_cont_feature(nvt.ops.Normalize()) + # Use Normalize to define a NVTabular workflow + cont_features = CONTINUOUS_COLUMNS >> ops.Normalize() + processor = nvtabular.Workflow(cont_features) Parameters ---------- - columns : list of str, default None - Continuous columns to target for this op. If None, the operation will target all known - continuous columns. - replace : bool, default False - Whether to replace existing columns or create new ones. + """ def __init__(self): @@ -96,23 +86,13 @@ class NormalizeMinMax(StatOperator): Example usage:: - # Initialize the workflow - proc = nvt.Workflow( - cat_names=CATEGORICAL_COLUMNS, - cont_names=CONTINUOUS_COLUMNS, - label_name=LABEL_COLUMNS - ) - - # Add NormalizeMinMax to the workflow for continuous columns - proc.add_cont_feature(nvt.ops.NormalizeMinMax()) + # Use NormalizeMinMax to define a NVTabular workflow + cont_features = CONTINUOUS_COLUMNS >> ops.NormalizeMinMax() + processor = nvtabular.Workflow(cont_features) Parameters ---------- - columns : list of str, default None - Continuous columns to target for this op. If None, the operation will target all known - continuous columns. - replace : bool, default False - Whether to replace existing columns or create new ones. + """ def __init__(self): diff --git a/nvtabular/ops/rename.py b/nvtabular/ops/rename.py index aaf562f5973..3b59e62376e 100644 --- a/nvtabular/ops/rename.py +++ b/nvtabular/ops/rename.py @@ -20,6 +20,12 @@ class Rename(Operator): """This operation renames columns, either by using a user defined lambda function to transform column names, or by appending a postfix string to every column name + Example usage:: + + # Rename columns after LogOp + cont_features = cont_names >> nvt.ops.LogOp() >> Rename(postfix='_log') + processor = nvt.Workflow(cont_features) + Parameters ---------- f : callable, optional diff --git a/nvtabular/ops/target_encoding.py b/nvtabular/ops/target_encoding.py index 994cd193f40..0035c7e4ec4 100644 --- a/nvtabular/ops/target_encoding.py +++ b/nvtabular/ops/target_encoding.py @@ -52,21 +52,20 @@ class TargetEncoding(StatOperator): Example usage:: - # Initialize the workflow - proc = nvt.Workflow( - cat_names=CATEGORICAL_COLUMNS, - cont_names=CONTINUOUS_COLUMNS, - label_name=LABEL_COLUMNS - ) - - # Add TE op to the workflow - proc.add_feature( - TargetEncoding( - cat_groups = ['cat1', 'cat2', ['cat2','cat3']], - target = LABEL_COLUMNS, - kfold = 5, - p_smooth = 20) + # First, we can transform the label columns to binary targets + LABEL_COLUMNS = ['label1', 'label2'] + labels = nvt.ColumnGroup(LABEL_COLUMNS) >> (lambda col: (col>0).astype('int8')) + # We target encode cat1, cat2 and the cross columns cat1 x cat2 + target_encode = ( + ['cat1', 'cat2', ['cat2','cat3']] >> + nvt.ops.TargetEncoding( + labels, + kfold=5, + p_smooth=20, + out_dtype="float32", + ) ) + processor = nvt.Workflow(target_encode) Parameters ----------- @@ -94,6 +93,8 @@ class TargetEncoding(StatOperator): while low-cardinality columns can likely use `tree_width=1`. 
If passing a dict, each key and value should correspond to the column name and width, respectively. The default value is 8 for all columns. + cat_cache : ToDo Describe + Text out_path : str, optional Root directory where category statistics will be written out in parquet format. From 4125637199a95cdd4bf49d8c1d153ee6399dc395 Mon Sep 17 00:00:00 2001 From: Benedikt Schifferer Date: Tue, 31 Aug 2021 19:09:50 +0000 Subject: [PATCH 20/23] tfrecords to parquet --- .../tensorflow/TFRecords-To-Parquet.ipynb | 987 ++++++++++++++++++ .../framework_utils/tensorflow/__init__.py | 1 + .../tensorflow/tfrecords_to_parquet.py | 139 +++ 3 files changed, 1127 insertions(+) create mode 100644 examples/tensorflow/TFRecords-To-Parquet.ipynb create mode 100644 nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py diff --git a/examples/tensorflow/TFRecords-To-Parquet.ipynb b/examples/tensorflow/TFRecords-To-Parquet.ipynb new file mode 100644 index 00000000000..4f664f679f2 --- /dev/null +++ b/examples/tensorflow/TFRecords-To-Parquet.ipynb @@ -0,0 +1,987 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1d4a2a17", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================" + ] + }, + { + "cell_type": "markdown", + "id": "7da4cfc5", + "metadata": {}, + "source": [ + "\n", + "\n", + "# TensorFlow: Convert TFRecords to Parquet files\n", + "\n", + "## TFRecords\n", + "\n", + "[TFRecords](https://www.tensorflow.org/tutorials/load_data/tfrecord) are a popular file format to store data for deep learning training with TensorFlow. It is a \"simple format for storing a sequence of binary records\". In many cases the dataset is too large for the host memory and the dataset is converted into (multiple) tfrecords file to disk. TensorFlow's ecosystem enables to stream the tfrecords from disk to train the model without requiring to load the full dataset.

\n", + "That sounds great, but there are some disadvantages when working with tabular dataset. TFRecords stores the dataset as key, values. In other domains, such as computer vision, this representation is efficient as the key is `image` and the values are a the pixels. For an RGB image with 200x200 resoultion, there are 120000 (200x200x3) values. In a tabular dataset, a feature is often a single number and therefore, there is a significant overhead for using a key in each example. **In some of our experiments, we experienced that tfrecords can be ~4-5x larger than `parquet` files for the same dataset.**\n", + "

\n", + "[Parquet](https://en.wikipedia.org/wiki/Apache_Parquet) is another file format to store data. It is a free and open-source data storage format in the Hadoop ecosystem. Many popular systems, such as Spark or Pandas, support to read and write parquet files. \n", + "

\n", + "We developed [NVTabular Data Loaders](https://nvidia.github.io/NVTabular/main/training/index.html) as a customized data loader, fully operating on the GPU. It reads the data from disk into the GPU memory and prepares the next batch on the GPU. Therefore, we do not have any CPU-GPU communication. Our data loader leverages parquet files to reduce the disk pressure. **In our experiments, we experienced that the native data loader is the bottleneck in training tabular deep learning models and by changing the native data loader to NVTabular Data Loader, we saw a 8-9x speed-up.**\n", + "\n", + "### Convert TFRecords to Parquet files\n", + "That is a lot of background information. In many cases, we saw that users have their dataset stored as tfrecords files. In this notebook, we provide a tfrecords to parquet examples. Users can transform their dataset to parquet and be able to experiment with NVTabular data loader." + ] + }, + { + "cell_type": "markdown", + "id": "9a8f4dcd", + "metadata": {}, + "source": [ + "## Create a Synthetic Dataset" + ] + }, + { + "cell_type": "markdown", + "id": "243a5cbd", + "metadata": {}, + "source": [ + "First, we will create a synthetic dataset. Afterwards, we will convert the synthetic data to a tfrecord file. The synthetic dataset contains `continuous features`, `categorical features`, `continuous features in a list with variable length`, `categorical features in a list with variable length` and the `label`.

\n", + "The features of a list have variable length, which are often used in session-based recommender systems. For example, the last page views in a session and sessions have different lengths." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "58949777", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "deeafde3", + "metadata": {}, + "outputs": [], + "source": [ + "def create_synthetic_df(\n", + " N_CONT_FEATURES, N_CAT_FEATURES, N_CONT_LIST_FEATURES, N_CAT_LIST_FEATURES, N_ROWS\n", + "):\n", + " dict_features = {}\n", + " for icont in range(N_CONT_FEATURES):\n", + " dict_features[\"cont\" + str(icont)] = np.random.uniform(-1, 1, size=N_ROWS)\n", + " for icat in range(N_CAT_FEATURES):\n", + " dict_features[\"cat\" + str(icat)] = np.random.choice(list(range(10)), size=N_ROWS)\n", + " for icontlist in range(N_CONT_LIST_FEATURES):\n", + " feature_list = []\n", + " for irow in range(N_ROWS):\n", + " n_elements = np.random.choice(list(range(20)))\n", + " feature_list.append(np.random.uniform(-1, 1, size=n_elements).tolist())\n", + " dict_features[\"cont_list\" + str(icontlist)] = feature_list\n", + " for icatlist in range(N_CAT_LIST_FEATURES):\n", + " feature_list = []\n", + " for irow in range(N_ROWS):\n", + " n_elements = np.random.choice(list(range(20)))\n", + " feature_list.append(np.random.choice(list(range(10)), size=n_elements).tolist())\n", + " dict_features[\"cat_list\" + str(icatlist)] = feature_list\n", + " dict_features[\"label\"] = np.random.choice(list(range(2)), size=N_ROWS)\n", + " df = pd.DataFrame(dict_features)\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "id": "fda49c3f", + "metadata": {}, + "source": [ + "We can configure the size of the dataset and numbers of features of the different type. As this is just a example, we use only 20,000 rows." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0b141d03", + "metadata": {}, + "outputs": [], + "source": [ + "N_ROWS = 20000\n", + "N_CONT_FEATURES = 5\n", + "N_CAT_FEATURES = 7\n", + "N_CONT_LIST_FEATURES = 2\n", + "N_CAT_LIST_FEATURES = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6616a87b", + "metadata": {}, + "outputs": [], + "source": [ + "df = create_synthetic_df(\n", + " N_CONT_FEATURES, N_CAT_FEATURES, N_CONT_LIST_FEATURES, N_CAT_LIST_FEATURES, N_ROWS\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "22d66e48", + "metadata": {}, + "source": [ + "We can take a look on the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e023dca6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cont0cont1cont2cont3cont4cat0cat1cat2cat3cat4cat5cat6cont_list0cont_list1cat_list0cat_list1cat_list2label
0-0.598272-0.665893-0.3151250.702566-0.3238495352483[0.5072592597602565, -0.47816687381189427, -0....[][4, 2, 5, 1, 7, 6, 4, 8, 8, 7, 6, 0, 3, 0, 9, ...[0, 1, 5, 9, 8, 1, 9, 3, 4, 5, 9, 1, 8, 0, 9, ...[1, 4, 5, 6, 8, 3, 5, 9, 0, 6, 0, 2, 1]0
1-0.3027630.102246-0.1970200.157954-0.1080249487736[-0.08574677514984685, -0.4996899616827948, -0...[0.23373279277235692, 0.9775817910065407, 0.14...[2, 5, 0, 8, 6, 0, 2, 8, 4, 7, 2, 9, 7][9, 1, 0, 6, 5, 1, 5, 3, 8, 1, 3, 6, 8, 7, 4][6, 2, 5, 9, 4, 7, 4, 8, 7, 0, 7, 2, 6, 5, 3, ...1
20.9409490.0285010.7742530.323579-0.4176908972828[-0.21767810959793432, 0.7070469796179955, -0....[0.8299352235765285, -0.0961479916392276, 0.60...[2, 4, 5, 9, 4, 0][2][1, 4, 1, 0, 1, 5, 2, 0, 8, 3]1
3-0.6496650.836329-0.965379-0.6898150.9460579419214[-0.9109288944751086, 0.017280738609947832, -0...[0.47963783275236826][8, 3, 9, 9, 5, 5, 2, 7, 8, 7, 0, 4, 8, 8, 3][3, 9, 8, 1, 6, 3, 4, 2, 1, 2, 3, 2, 7, 6, 1][0, 4, 5, 9, 4, 6, 0, 8, 6, 9, 5, 2, 6]1
40.786037-0.5502520.824308-0.0516280.7422938061593[-0.9196272581988192, 0.26653671490649056][-0.34703584253681274, -0.38656340762419905, -...[8][7, 4][2, 6]1
\n", + "
" + ], + "text/plain": [ + " cont0 cont1 cont2 cont3 cont4 cat0 cat1 cat2 cat3 \\\n", + "0 -0.598272 -0.665893 -0.315125 0.702566 -0.323849 5 3 5 2 \n", + "1 -0.302763 0.102246 -0.197020 0.157954 -0.108024 9 4 8 7 \n", + "2 0.940949 0.028501 0.774253 0.323579 -0.417690 8 9 7 2 \n", + "3 -0.649665 0.836329 -0.965379 -0.689815 0.946057 9 4 1 9 \n", + "4 0.786037 -0.550252 0.824308 -0.051628 0.742293 8 0 6 1 \n", + "\n", + " cat4 cat5 cat6 cont_list0 \\\n", + "0 4 8 3 [0.5072592597602565, -0.47816687381189427, -0.... \n", + "1 7 3 6 [-0.08574677514984685, -0.4996899616827948, -0... \n", + "2 8 2 8 [-0.21767810959793432, 0.7070469796179955, -0.... \n", + "3 2 1 4 [-0.9109288944751086, 0.017280738609947832, -0... \n", + "4 5 9 3 [-0.9196272581988192, 0.26653671490649056] \n", + "\n", + " cont_list1 \\\n", + "0 [] \n", + "1 [0.23373279277235692, 0.9775817910065407, 0.14... \n", + "2 [0.8299352235765285, -0.0961479916392276, 0.60... \n", + "3 [0.47963783275236826] \n", + "4 [-0.34703584253681274, -0.38656340762419905, -... \n", + "\n", + " cat_list0 \\\n", + "0 [4, 2, 5, 1, 7, 6, 4, 8, 8, 7, 6, 0, 3, 0, 9, ... \n", + "1 [2, 5, 0, 8, 6, 0, 2, 8, 4, 7, 2, 9, 7] \n", + "2 [2, 4, 5, 9, 4, 0] \n", + "3 [8, 3, 9, 9, 5, 5, 2, 7, 8, 7, 0, 4, 8, 8, 3] \n", + "4 [8] \n", + "\n", + " cat_list1 \\\n", + "0 [0, 1, 5, 9, 8, 1, 9, 3, 4, 5, 9, 1, 8, 0, 9, ... \n", + "1 [9, 1, 0, 6, 5, 1, 5, 3, 8, 1, 3, 6, 8, 7, 4] \n", + "2 [2] \n", + "3 [3, 9, 8, 1, 6, 3, 4, 2, 1, 2, 3, 2, 7, 6, 1] \n", + "4 [7, 4] \n", + "\n", + " cat_list2 label \n", + "0 [1, 4, 5, 6, 8, 3, 5, 9, 0, 6, 0, 2, 1] 0 \n", + "1 [6, 2, 5, 9, 4, 7, 4, 8, 7, 0, 7, 2, 6, 5, 3, ... 1 \n", + "2 [1, 4, 1, 0, 1, 5, 2, 0, 8, 3] 1 \n", + "3 [0, 4, 5, 9, 4, 6, 0, 8, 6, 9, 5, 2, 6] 1 \n", + "4 [2, 6] 1 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6a49b022", + "metadata": {}, + "outputs": [], + "source": [ + "CONTINUOUS_COLUMNS = [\"cont\" + str(i) for i in range(N_CONT_FEATURES)]\n", + "CATEGORICAL_COLUMNS = [\"cat\" + str(i) for i in range(N_CAT_FEATURES)]\n", + "CONTINUOUS_LIST_COLUMNS = [\"cont_list\" + str(i) for i in range(N_CONT_LIST_FEATURES)]\n", + "CATEGORICAL_LIST_COLUMNS = [\"cat_list\" + str(i) for i in range(N_CAT_LIST_FEATURES)]\n", + "LABEL_COLUMNS = [\"label\"]" + ] + }, + { + "cell_type": "markdown", + "id": "bb33cb9b", + "metadata": {}, + "source": [ + "## Convert the Synthetic Dataset into TFRecords" + ] + }, + { + "cell_type": "markdown", + "id": "5a8b05b0", + "metadata": {}, + "source": [ + "After we created the synthetic dataset, we store it to tfrecords." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "055a8dae", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2021-08-31 18:57:50.981790: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n" + ] + } + ], + "source": [ + "import tensorflow as tf" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f8f502ff", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import multiprocessing as mp\n", + "from itertools import repeat\n", + "\n", + "\n", + "def transform_tfrecords(\n", + " df,\n", + " PATH,\n", + " CONTINUOUS_COLUMNS,\n", + " CATEGORICAL_COLUMNS,\n", + " CONTINUOUS_LIST_COLUMNS,\n", + " CATEGORICAL_LIST_COLUMNS,\n", + " LABEL_COLUMNS,\n", + "):\n", + " write_dir = os.path.dirname(PATH)\n", + " if not os.path.exists(write_dir):\n", + " os.makedirs(write_dir)\n", + " file_idx, example_idx = 0, 0\n", + " writer = get_writer(write_dir, file_idx)\n", + " column_names = [\n", + " CONTINUOUS_COLUMNS,\n", + " CATEGORICAL_COLUMNS + LABEL_COLUMNS,\n", + " CONTINUOUS_LIST_COLUMNS,\n", + " CATEGORICAL_LIST_COLUMNS,\n", + " ]\n", + " with mp.Pool(8, pool_initializer, column_names) as pool:\n", + " data = []\n", + " for col_names in column_names:\n", + " if len(col_names) == 0:\n", + " data.append(repeat(None))\n", + " else:\n", + " data.append(df[col_names].values)\n", + " data = zip(*data)\n", + " record_map = pool.imap(build_and_serialize_example, data, chunksize=200)\n", + " for record in record_map:\n", + " writer.write(record)\n", + " example_idx += 1\n", + " writer.close()\n", + "\n", + "\n", + "def pool_initializer(num_cols, cat_cols, num_list_cols, cat_list_cols):\n", + " global numeric_columns\n", + " global categorical_columns\n", + " global numeric_list_columns\n", + " global categorical_list_columns\n", + " numeric_columns = num_cols\n", + " categorical_columns = cat_cols\n", + " numeric_list_columns = num_list_cols\n", + " categorical_list_columns = cat_list_cols\n", + "\n", + "\n", + "def build_and_serialize_example(data):\n", + " numeric_values, categorical_values, numeric_list_values, categorical_list_values = data\n", + " feature = {}\n", + " if numeric_values is not None:\n", + " feature.update(\n", + " {\n", + " col: tf.train.Feature(float_list=tf.train.FloatList(value=[val]))\n", + " for col, val in zip(numeric_columns, numeric_values)\n", + " }\n", + " )\n", + " if categorical_values is not None:\n", + " feature.update(\n", + " {\n", + " col: tf.train.Feature(int64_list=tf.train.Int64List(value=[val]))\n", + " for col, val in zip(categorical_columns, categorical_values)\n", + " }\n", + " )\n", + " if numeric_list_values is not None:\n", + " feature.update(\n", + " {\n", + " col: tf.train.Feature(float_list=tf.train.FloatList(value=val))\n", + " for col, val in zip(numeric_list_columns, numeric_list_values)\n", + " }\n", + " )\n", + " if categorical_list_values is not None:\n", + " feature.update(\n", + " {\n", + " col: tf.train.Feature(int64_list=tf.train.Int64List(value=val))\n", + " for col, val in zip(categorical_list_columns, categorical_list_values)\n", + " }\n", + " )\n", + " return tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()\n", + "\n", + "\n", + "def get_writer(write_dir, file_idx):\n", + " filename = str(file_idx).zfill(5) + \".tfrecords\"\n", + " return tf.io.TFRecordWriter(os.path.join(write_dir, filename))" + ] + }, + { + "cell_type": "markdown", + "id": 
"f0430ce5", + "metadata": {}, + "source": [ + "We define the output path." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0ca623b3", + "metadata": {}, + "outputs": [], + "source": [ + "PATH = \"/raid/tfrecord-test/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2619480a", + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf $PATH\n", + "!mkdir $PATH" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b88f1b42", + "metadata": {}, + "outputs": [], + "source": [ + "transform_tfrecords(\n", + " df,\n", + " PATH,\n", + " CONTINUOUS_COLUMNS,\n", + " CATEGORICAL_COLUMNS,\n", + " CONTINUOUS_LIST_COLUMNS,\n", + " CATEGORICAL_LIST_COLUMNS,\n", + " LABEL_COLUMNS,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "25ad1044", + "metadata": {}, + "source": [ + "We can check the file." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "31362c7e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "00000.tfrecords\r\n" + ] + } + ], + "source": [ + "!ls $PATH" + ] + }, + { + "cell_type": "markdown", + "id": "69fc385f", + "metadata": {}, + "source": [ + "## Convert TFRecords to parquet files" + ] + }, + { + "cell_type": "markdown", + "id": "3aafe8a0", + "metadata": {}, + "source": [ + "Now, we have a dataset in the tfrecords format. Let's use the `convert_tfrecords_to_parquet` function to convert a tfrecord file into parquet." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "62fa679c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "\n", + "from nvtabular.framework_utils.tensorflow.tfrecords_to_parquet import convert_tfrecords_to_parquet" + ] + }, + { + "cell_type": "markdown", + "id": "1e59596b", + "metadata": {}, + "source": [ + "Let's select all TFRecords in the folder." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "fd930951", + "metadata": {}, + "outputs": [], + "source": [ + "filenames = glob.glob(PATH + \"/*.tfrecords\")" + ] + }, + { + "cell_type": "markdown", + "id": "3eab6554", + "metadata": {}, + "source": [ + "Let's call the `convert_tfrecords_to_parquet`.

\n", + "Some details about the parameters:\n", + "* `compression_type` is the compression type of the tfrecords. Options: `\"\"` (no compression), `\"ZLIB\"`, or `\"GZIP\"`\n", + "* `chunks` defines how many data points per `parquet` file should be saved. It splits a tfrecords into multiple parquet files.\n", + "* `convert_lists` defines, if feature lists should be converted into muliple feature columns. Even single dataframe series are 1 dimensional arrays when converted back from tfrecords to parquet. " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "854f2aa3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2021-08-31 18:58:05.260995: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2021-08-31 18:58:05.263090: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:0b:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0\n", + "coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s\n", + "2021-08-31 18:58:05.263118: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2021-08-31 18:58:05.263162: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2021-08-31 18:58:05.263196: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2021-08-31 18:58:05.263230: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2021-08-31 18:58:05.263263: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2021-08-31 18:58:05.263309: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2021-08-31 18:58:05.263342: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2021-08-31 18:58:05.263379: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2021-08-31 18:58:05.267016: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2021-08-31 18:58:05.267818: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2021-08-31 18:58:05.277016: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:0b:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0\n", + "coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s\n", + "2021-08-31 18:58:05.280616: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2021-08-31 18:58:05.280669: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2021-08-31 18:58:06.732386: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 
edge matrix:\n", + "2021-08-31 18:58:06.732444: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2021-08-31 18:58:06.732455: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2021-08-31 18:58:06.738142: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 27675 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:0b:00.0, compute capability: 7.0)\n", + "2021-08-31 18:58:06.804145: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "2021-08-31 18:58:06.825396: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2194940000 Hz\n", + "20000it [00:12, 1592.98it/s]\n" + ] + } + ], + "source": [ + "convert_tfrecords_to_parquet(\n", + " filenames=filenames, output_dir=PATH, compression_type=\"\", chunks=1000, convert_lists=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c6a3881c", + "metadata": {}, + "source": [ + "## Let's take a look" + ] + }, + { + "cell_type": "markdown", + "id": "897c4ea3", + "metadata": {}, + "source": [ + "We can see that `convert_tfrecords_to_parquet` created multiple files per `tfrecord` depending on the chunk size." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "dab31264", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/raid/tfrecord-test/00000_1.parquet',\n", + " '/raid/tfrecord-test/00000_9.parquet',\n", + " '/raid/tfrecord-test/00000_14.parquet',\n", + " '/raid/tfrecord-test/00000_2.parquet',\n", + " '/raid/tfrecord-test/00000_17.parquet',\n", + " '/raid/tfrecord-test/00000_6.parquet',\n", + " '/raid/tfrecord-test/00000_0.parquet',\n", + " '/raid/tfrecord-test/00000_19.parquet',\n", + " '/raid/tfrecord-test/00000_3.parquet',\n", + " '/raid/tfrecord-test/00000_15.parquet',\n", + " '/raid/tfrecord-test/00000_7.parquet',\n", + " '/raid/tfrecord-test/00000_4.parquet',\n", + " '/raid/tfrecord-test/00000_11.parquet',\n", + " '/raid/tfrecord-test/00000_8.parquet',\n", + " '/raid/tfrecord-test/00000_12.parquet',\n", + " '/raid/tfrecord-test/00000_18.parquet',\n", + " '/raid/tfrecord-test/00000_5.parquet',\n", + " '/raid/tfrecord-test/00000_16.parquet',\n", + " '/raid/tfrecord-test/00000_13.parquet',\n", + " '/raid/tfrecord-test/00000_10.parquet']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filenames = glob.glob(PATH + \"/*.parquet\")\n", + "filenames" + ] + }, + { + "cell_type": "markdown", + "id": "453e26eb", + "metadata": {}, + "source": [ + "If we load the first file, we cann see, that it has the same structure as our original synthetic dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b2ce99e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat0cat1cat2cat3cat4cat5cat6cat_list0cat_list1cat_list2cont0cont1cont2cont3cont4cont_list0cont_list1label
06421658[7, 3, 5, 8, 1, 9, 8, 0, 2, 3, 6, 4, 2, 3, 0, ...[7, 8, 3, 3, 9, 3, 7, 7, 1, 3, 2, 8, 2, 2, 9, ...[9, 9, 1, 9, 2, 0, 5, 1, 2, 4, 7]0.616503-0.5622990.2949240.3681520.460910[0.10967133, 0.7335732, -0.32737544, 0.5034231...[0.37586957, 0.57677925, -0.49405763, -0.38141...0
17652270[7, 0, 6, 2, 6, 1, 8, 4, 8, 4, 3, 3, 7, 1, 5][][6, 1, 0, 2, 4, 8]0.356238-0.9966740.638569-0.873461-0.549512[0.038814984][0.9645752, 0.841208, 0.23322387, -0.000990165...0
27703698[2, 8, 5, 3, 2, 9, 4, 0, 8, 6, 0, 5, 9, 5, 4][7, 6, 0, 8, 2, 4, 7, 5][3, 0, 8, 9, 5, 4]-0.639743-0.815482-0.808328-0.9683450.844720[-0.3283896, 0.95472634, -0.8193472, 0.9815109...[-0.2411973, 0.21095003, -0.4520857, -0.961866...1
36489027[8, 4, 3, 7, 7, 6, 7, 4, 4, 1, 8, 8, 8, 9, 4][4, 0, 8, 0][]0.465874-0.8708470.1970970.254493-0.363290[-0.88873893, 0.45026976, 0.93019474, 0.345771...[0.95485276, 0.21282451, 0.32634658, 0.0435548...0
47256097[2, 2, 6, 5, 6, 7, 6, 2, 0, 5, 8][1, 4, 2, 9, 6, 3, 2][5, 8, 0, 0, 2, 7, 8, 6, 5, 8, 9, 8, 7, 5, 6, ...0.6572150.6971090.743880-0.172813-0.301696[-0.67113024, 0.15799437, -0.3753272, 0.132746...[-0.78169054, 0.8858877, 0.10161541, 0.1666716...1
\n", + "
" + ], + "text/plain": [ + " cat0 cat1 cat2 cat3 cat4 cat5 cat6 \\\n", + "0 6 4 2 1 6 5 8 \n", + "1 7 6 5 2 2 7 0 \n", + "2 7 7 0 3 6 9 8 \n", + "3 6 4 8 9 0 2 7 \n", + "4 7 2 5 6 0 9 7 \n", + "\n", + " cat_list0 \\\n", + "0 [7, 3, 5, 8, 1, 9, 8, 0, 2, 3, 6, 4, 2, 3, 0, ... \n", + "1 [7, 0, 6, 2, 6, 1, 8, 4, 8, 4, 3, 3, 7, 1, 5] \n", + "2 [2, 8, 5, 3, 2, 9, 4, 0, 8, 6, 0, 5, 9, 5, 4] \n", + "3 [8, 4, 3, 7, 7, 6, 7, 4, 4, 1, 8, 8, 8, 9, 4] \n", + "4 [2, 2, 6, 5, 6, 7, 6, 2, 0, 5, 8] \n", + "\n", + " cat_list1 \\\n", + "0 [7, 8, 3, 3, 9, 3, 7, 7, 1, 3, 2, 8, 2, 2, 9, ... \n", + "1 [] \n", + "2 [7, 6, 0, 8, 2, 4, 7, 5] \n", + "3 [4, 0, 8, 0] \n", + "4 [1, 4, 2, 9, 6, 3, 2] \n", + "\n", + " cat_list2 cont0 cont1 \\\n", + "0 [9, 9, 1, 9, 2, 0, 5, 1, 2, 4, 7] 0.616503 -0.562299 \n", + "1 [6, 1, 0, 2, 4, 8] 0.356238 -0.996674 \n", + "2 [3, 0, 8, 9, 5, 4] -0.639743 -0.815482 \n", + "3 [] 0.465874 -0.870847 \n", + "4 [5, 8, 0, 0, 2, 7, 8, 6, 5, 8, 9, 8, 7, 5, 6, ... 0.657215 0.697109 \n", + "\n", + " cont2 cont3 cont4 \\\n", + "0 0.294924 0.368152 0.460910 \n", + "1 0.638569 -0.873461 -0.549512 \n", + "2 -0.808328 -0.968345 0.844720 \n", + "3 0.197097 0.254493 -0.363290 \n", + "4 0.743880 -0.172813 -0.301696 \n", + "\n", + " cont_list0 \\\n", + "0 [0.10967133, 0.7335732, -0.32737544, 0.5034231... \n", + "1 [0.038814984] \n", + "2 [-0.3283896, 0.95472634, -0.8193472, 0.9815109... \n", + "3 [-0.88873893, 0.45026976, 0.93019474, 0.345771... \n", + "4 [-0.67113024, 0.15799437, -0.3753272, 0.132746... \n", + "\n", + " cont_list1 label \n", + "0 [0.37586957, 0.57677925, -0.49405763, -0.38141... 0 \n", + "1 [0.9645752, 0.841208, 0.23322387, -0.000990165... 0 \n", + "2 [-0.2411973, 0.21095003, -0.4520857, -0.961866... 1 \n", + "3 [0.95485276, 0.21282451, 0.32634658, 0.0435548... 0 \n", + "4 [-0.78169054, 0.8858877, 0.10161541, 0.1666716... 1 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_parquet(filenames[0])\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nvtabular/framework_utils/tensorflow/__init__.py b/nvtabular/framework_utils/tensorflow/__init__.py index bf3c16a024b..8c1e61cead3 100644 --- a/nvtabular/framework_utils/tensorflow/__init__.py +++ b/nvtabular/framework_utils/tensorflow/__init__.py @@ -16,3 +16,4 @@ # flake8: noqa from .feature_column_utils import make_feature_column_workflow +from .tfrecords_to_parquet import convert_tfrecords_to_parquet diff --git a/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py b/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py new file mode 100644 index 00000000000..d56ab56ad34 --- /dev/null +++ b/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py @@ -0,0 +1,139 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import gc + +import pandas as pd + +# Some of the functions are copied or inspired by +# https://github.com/schipiga/pandas-tfrecords/ +import tensorflow as tf +from tqdm import tqdm + + +def convert_tfrecords_to_parquet( + filenames, output_dir, compression_type="", chunks=100000, convert_lists=False +): + """ + Converts tfrecord files to parquet file format + + Parameters + ---------- + filenames: list + List of tfrecord filenames, which should be converted + output_dir: str + Output path where the parquet files will be stored + compression_type: str + Compression type of the tfrecords. Options: `""` (no compression), `"ZLIB"`, or `"GZIP"` + chunks: int + Split tfrecords into multiple parquet files + convert_lists: Boolean + Output of tfrecords are lists. Set True to convert lists with fixed length to + individual columns in the output dataframe + + """ + # TODO: provide row_groupby_size parameter for parquet files + # TODO: optimize parquet files + + for file in filenames: + dataset = tf.data.TFRecordDataset(file, compression_type=compression_type) + features = _detect_schema(dataset) + parser = _read_example(features) + parsed = dataset.map(parser) + _to_parquet(parsed, file, output_dir, chunks, convert_lists) + + +def _read_example(features): + # https://github.com/schipiga/pandas-tfrecords/blob/master/pandas_tfrecords/from_tfrecords.py + def parse(serialized): + example = tf.io.parse_single_example(serialized, features=features) + return example + + return parse + + +def _get_feature_type(feature=None, type_=None): + # https://github.com/schipiga/pandas-tfrecords/blob/master/pandas_tfrecords/from_tfrecords.py + if type_: + return { + int: tf.int64, + float: tf.float32, + str: tf.string, + bytes: tf.string, + }[type_] + + if feature: + if feature.HasField("int64_list"): + return tf.int64 + if feature.HasField("float_list"): + return tf.float32 + if feature.HasField("bytes_list"): + return tf.string + + +def _detect_schema(dataset): + # by https://github.com/schipiga/pandas-tfrecords/blob/master/pandas_tfrecords/from_tfrecords.py + features = {} + + serialized = next(iter(dataset.map(lambda serialized: serialized))) + seq_ex = tf.train.SequenceExample.FromString(serialized.numpy()) + + if seq_ex.context.feature: + for key, feature in seq_ex.context.feature.items(): + features[key] = tf.io.FixedLenSequenceFeature( + (), _get_feature_type(feature=feature), allow_missing=True + ) + + return features + + +def _to_parquet(tfrecords, file, output_dir, chunks, convert_lists): + out = [] + i = 0 + j = 0 + for tfrecord in tqdm(tfrecords): + row = {key: val.numpy() for key, val in tfrecord.items()} + out.append(row) + i += 1 + if i == chunks: + df = pd.DataFrame(out) + if convert_lists: + df = _convert_lists(df) + df.to_parquet( + output_dir + file.split("/")[-1].split(".")[0] + "_" + str(j) + ".parquet" + ) + i = 0 + out = [] + j += 1 + del df + gc.collect() + if len(out) > 0: + df = pd.DataFrame(out) + if convert_lists: + df = _convert_lists(df) + df.to_parquet(output_dir + file.split("/")[-1].split(".")[0] + "_" + str(j) + ".parquet") + del df + + +def _convert_lists(df): + for 
col in df.columns: + series_length = df[col].apply(lambda x: len(x)) + if series_length.var() == 0 and series_length.min() > 0: + if series_length.max() == 1: + df[col] = df[col].apply(lambda x: x[0]) + else: + for i in range(series_length.max()): + df[col + "_" + str(i)] = df[col].apply(lambda x: x[i]) + return df From f7669bafdb6cd53b05316baa242a2da87cc8ac62 Mon Sep 17 00:00:00 2001 From: Benedikt Schifferer Date: Wed, 1 Sep 2021 23:37:44 +0000 Subject: [PATCH 21/23] leverage pandas-tfrecords --- .../tensorflow/TFRecords-To-Parquet.ipynb | 2019 ++++++++++++++--- .../tensorflow/tfrecords_to_parquet.py | 37 +- 2 files changed, 1761 insertions(+), 295 deletions(-) diff --git a/examples/tensorflow/TFRecords-To-Parquet.ipynb b/examples/tensorflow/TFRecords-To-Parquet.ipynb index 4f664f679f2..5d2ea62791c 100644 --- a/examples/tensorflow/TFRecords-To-Parquet.ipynb +++ b/examples/tensorflow/TFRecords-To-Parquet.ipynb @@ -45,6 +45,1501 @@ "That is a lot of background information. In many cases, we saw that users have their dataset stored as tfrecords files. In this notebook, we provide a tfrecords to parquet examples. Users can transform their dataset to parquet and be able to experiment with NVTabular data loader." ] }, + { + "cell_type": "markdown", + "id": "096a7716", + "metadata": {}, + "source": [ + "We leverage the library pandas-tfrecords." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "35e6c8d4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Value for scheme.platlib does not match. Please report this to \n", + "distutils: /usr/local/lib/python3.8/dist-packages\n", + "sysconfig: /usr/lib/python3.8/site-packages\u001b[0m\n", + "\u001b[33mWARNING: Value for scheme.purelib does not match. Please report this to \n", + "distutils: /usr/local/lib/python3.8/dist-packages\n", + "sysconfig: /usr/lib/python3.8/site-packages\u001b[0m\n", + "\u001b[33mWARNING: Value for scheme.headers does not match. Please report this to \n", + "distutils: /usr/local/include/python3.8/UNKNOWN\n", + "sysconfig: /usr/include/python3.8/UNKNOWN\u001b[0m\n", + "\u001b[33mWARNING: Value for scheme.scripts does not match. Please report this to \n", + "distutils: /usr/local/bin\n", + "sysconfig: /usr/bin\u001b[0m\n", + "\u001b[33mWARNING: Value for scheme.data does not match. 
Please report this to \n", + "distutils: /usr/local\n", + "sysconfig: /usr\u001b[0m\n", + "\u001b[33mWARNING: Additional context:\n", + "user = False\n", + "home = None\n", + "root = None\n", + "prefix = None\u001b[0m\n", + "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", + "Collecting pandas-tfrecords==0.1.5\n", + " Downloading pandas_tfrecords-0.1.5-py3-none-any.whl (7.0 kB)\n", + "Requirement already satisfied: pandas==1.2.4 in /usr/local/lib/python3.8/dist-packages (from pandas-tfrecords==0.1.5) (1.2.4)\n", + "Collecting tensorflow==2.5.0\n", + " Downloading tensorflow-2.5.0-cp38-cp38-manylinux2010_x86_64.whl (454.4 MB)\n",
|███████████████████████████████▍| 445.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▍| 445.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.9 MB 85.6 MB/s eta 0:00:01" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "\u001b[K |███████████████████████████████▍| 445.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 445.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▍| 446.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▍| 446.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▌| 446.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 446.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▌| 447.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▌| 447.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 447.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▌| 448.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▌| 448.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▋| 448.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▋| 448.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 448.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▋| 449.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", + "\u001b[K 
|████████████████████████████████| 454.4 MB 85.6 MB/s \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[?25hRequirement already satisfied: numpy>=1.16.5 in /usr/local/lib/python3.8/dist-packages (from pandas-tfrecords==0.1.5) (1.19.5)\n", + "Requirement already satisfied: s3fs==2021.6.0 in /usr/local/lib/python3.8/dist-packages (from pandas-tfrecords==0.1.5) (2021.6.0)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas==1.2.4->pandas-tfrecords==0.1.5) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas==1.2.4->pandas-tfrecords==0.1.5) (2021.1)\n", + "Requirement already satisfied: fsspec==2021.06.0 in /usr/local/lib/python3.8/dist-packages (from s3fs==2021.6.0->pandas-tfrecords==0.1.5) (2021.6.0)\n", + "Requirement already satisfied: aiobotocore>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from s3fs==2021.6.0->pandas-tfrecords==0.1.5) (1.4.0)\n", + "Requirement already satisfied: keras-preprocessing~=1.1.2 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.1.2)\n", + "Requirement already satisfied: flatbuffers~=1.12.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5)
(1.12)\n", + "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.17.3)\n", + "Requirement already satisfied: typing-extensions~=3.7.4 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.7.4.3)\n", + "Requirement already satisfied: astunparse~=1.6.3 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.6.3)\n", + "Requirement already satisfied: six~=1.15.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.15.0)\n", + "Requirement already satisfied: termcolor~=1.1.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.1.0)\n", + "Requirement already satisfied: wrapt~=1.12.1 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.12.1)\n", + "Requirement already satisfied: gast==0.4.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.4.0)\n", + "Requirement already satisfied: h5py~=3.1.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.1.0)\n", + "Requirement already satisfied: keras-nightly~=2.5.0.dev in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.5.0.dev2021032900)\n", + "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.12.0)\n", + "Requirement already satisfied: wheel~=0.35 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.36.2)\n", + "Collecting tensorflow-estimator<2.6.0,>=2.5.0rc0\n", + " Downloading tensorflow_estimator-2.5.0-py2.py3-none-any.whl (462 kB)\n", + "\u001b[K |████████████████████████████████| 462 kB 66.9 MB/s eta 0:00:01\n", + "\u001b[?25hCollecting grpcio~=1.34.0\n", + " Downloading grpcio-1.34.1-cp38-cp38-manylinux2014_x86_64.whl (4.0 MB)\n", + "\u001b[K |████████████████████████████████| 4.0 MB 54.3 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: opt-einsum~=3.3.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.3.0)\n", + "Requirement already satisfied: tensorboard~=2.5 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.6.0)\n", + "Requirement already satisfied: google-pasta~=0.2 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.2.0)\n", + "Requirement already satisfied: botocore<1.20.107,>=1.20.106 in /usr/local/lib/python3.8/dist-packages (from aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (1.20.106)\n", + "Requirement already satisfied: aioitertools>=0.5.1 in /usr/local/lib/python3.8/dist-packages (from aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (0.8.0)\n", + "Requirement already satisfied: aiohttp>=3.3.1 in /usr/local/lib/python3.8/dist-packages (from aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (3.7.4.post0)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (1.6.3)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (21.2.0)\n", + "Requirement already 
satisfied: async-timeout<4.0,>=3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (3.0.1)\n", + "Requirement already satisfied: chardet<5.0,>=2.0 in /usr/lib/python3/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (3.0.4)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (5.1.0)\n", + "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/local/lib/python3.8/dist-packages (from botocore<1.20.107,>=1.20.106->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (1.26.6)\n", + "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.8/dist-packages/jmespath-0.10.0-py3.8.egg (from botocore<1.20.107,>=1.20.106->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (0.10.0)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.0.1)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.4.4)\n", + "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.6.1)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (57.4.0)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.8.0)\n", + "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.33.1)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.3.4)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.26.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.2.8)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (4.7.2)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (4.2.2)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.8/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.3.0)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.8/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.4.8)\n", + "Requirement already satisfied: idna<4,>=2.5 in 
/usr/local/lib/python3.8/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.2)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.0.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2021.5.30)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.8/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.1.1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing collected packages: grpcio, tensorflow-estimator, tensorflow, pandas-tfrecords\n", + " Attempting uninstall: grpcio\n", + " Found existing installation: grpcio 1.39.0\n", + " Uninstalling grpcio-1.39.0:\n", + " Successfully uninstalled grpcio-1.39.0\n", + "\u001b[33m WARNING: Value for scheme.headers does not match. Please report this to \n", + " distutils: /usr/local/include/python3.8/grpcio\n", + " sysconfig: /usr/include/python3.8/grpcio\u001b[0m\n", + " Attempting uninstall: tensorflow-estimator\n", + " Found existing installation: tensorflow-estimator 2.6.0\n", + " Uninstalling tensorflow-estimator-2.6.0:\n", + " Successfully uninstalled tensorflow-estimator-2.6.0\n", + "\u001b[33m WARNING: Value for scheme.headers does not match. Please report this to \n", + " distutils: /usr/local/include/python3.8/tensorflow-estimator\n", + " sysconfig: /usr/include/python3.8/tensorflow-estimator\u001b[0m\n", + " Attempting uninstall: tensorflow\n", + " Found existing installation: tensorflow 2.6.0\n", + " Uninstalling tensorflow-2.6.0:\n", + " Successfully uninstalled tensorflow-2.6.0\n", + "\u001b[33m WARNING: Value for scheme.headers does not match. Please report this to \n", + " distutils: /usr/local/include/python3.8/tensorflow\n", + " sysconfig: /usr/include/python3.8/tensorflow\u001b[0m\n", + "\u001b[33m WARNING: Value for scheme.headers does not match. Please report this to \n", + " distutils: /usr/local/include/python3.8/pandas-tfrecords\n", + " sysconfig: /usr/include/python3.8/pandas-tfrecords\u001b[0m\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "tensorflow-transform 1.3.0 requires tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,<2.7,>=1.15.2, but you have tensorflow 2.5.0 which is incompatible.\n", + "tensorflow-transform 1.3.0 requires tfx-bsl<1.4.0,>=1.3.0, but you have tfx-bsl 1.2.0 which is incompatible.\n", + "tensorflow-serving-api 2.6.0 requires tensorflow<3,>=2.6.0, but you have tensorflow 2.5.0 which is incompatible.\n", + "tensorflow-gpu 2.4.2 requires gast==0.3.3, but you have gast 0.4.0 which is incompatible.\n", + "tensorflow-gpu 2.4.2 requires grpcio~=1.32.0, but you have grpcio 1.34.1 which is incompatible.\n", + "tensorflow-gpu 2.4.2 requires h5py~=2.10.0, but you have h5py 3.1.0 which is incompatible.\n", + "tensorflow-gpu 2.4.2 requires tensorflow-estimator<2.5.0,>=2.4.0, but you have tensorflow-estimator 2.5.0 which is incompatible.\n", + "grpcio-channelz 1.39.0 requires grpcio>=1.39.0, but you have grpcio 1.34.1 which is incompatible.\u001b[0m\n", + "Successfully installed grpcio-1.34.1 pandas-tfrecords-0.1.5 tensorflow-2.5.0 tensorflow-estimator-2.5.0\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", + "\u001b[33mWARNING: You are using pip version 21.2.1; however, version 21.2.4 is available.\n", + "You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install pandas-tfrecords==0.1.5" + ] + }, { "cell_type": "markdown", "id": "9a8f4dcd", @@ -64,7 +1559,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "58949777", "metadata": {}, "outputs": [], @@ -75,7 +1570,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "deeafde3", "metadata": {}, "outputs": [], @@ -115,7 +1610,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "0b141d03", "metadata": {}, "outputs": [], @@ -129,7 +1624,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "6616a87b", "metadata": {}, "outputs": [], @@ -149,7 +1644,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "e023dca6", "metadata": {}, "outputs": [ @@ -197,108 +1692,108 @@ " \n", " \n", " 0\n", - " -0.598272\n", - " -0.665893\n", - " -0.315125\n", - " 0.702566\n", - " -0.323849\n", - " 5\n", - " 3\n", - " 5\n", - " 2\n", - " 4\n", + " -0.397172\n", + " 0.532571\n", + " 0.759104\n", + " 0.594407\n", + " -0.524947\n", " 8\n", - " 3\n", - " [0.5072592597602565, -0.47816687381189427, -0....\n", + " 7\n", + " 2\n", + " 1\n", + " 0\n", + " 5\n", + " 0\n", + " [-0.6707850610776214, -0.05450009496947694, -0...\n", + " [0.5190079351542354, -0.3838151710281379, 0.44...\n", + " [1, 0, 1]\n", " []\n", - " [4, 2, 5, 1, 7, 6, 4, 8, 8, 7, 6, 0, 3, 0, 9, ...\n", - " [0, 1, 5, 9, 8, 1, 9, 3, 4, 5, 9, 1, 8, 0, 9, ...\n", - " [1, 4, 5, 6, 8, 3, 5, 9, 0, 6, 0, 2, 1]\n", + " [7, 1, 9, 8, 0, 8, 6, 7, 2, 8]\n", " 0\n", " \n", " \n", " 1\n", - " -0.302763\n", - " 0.102246\n", - " -0.197020\n", - " 0.157954\n", - " -0.108024\n", - " 9\n", - " 4\n", - " 8\n", - " 7\n", - " 7\n", + " 0.862493\n", + " -0.109889\n", + " -0.603772\n", + " 0.057867\n", + " 0.467439\n", + " 0\n", " 3\n", - " 6\n", - " [-0.08574677514984685, -0.4996899616827948, -0...\n", - " [0.23373279277235692, 
0.9775817910065407, 0.14...\n", - " [2, 5, 0, 8, 6, 0, 2, 8, 4, 7, 2, 9, 7]\n", - " [9, 1, 0, 6, 5, 1, 5, 3, 8, 1, 3, 6, 8, 7, 4]\n", - " [6, 2, 5, 9, 4, 7, 4, 8, 7, 0, 7, 2, 6, 5, 3, ...\n", - " 1\n", + " 8\n", + " 8\n", + " 4\n", + " 0\n", + " 9\n", + " [-0.05660807519951194, -0.07055138523986693, 0...\n", + " [-0.6486521298228771, -0.8936482314761995, -0....\n", + " [7, 1, 3, 8, 8, 9, 8, 9, 0, 5, 8, 4, 6, 4, 8, 0]\n", + " [1, 0, 5, 4, 3]\n", + " [3, 0, 7, 4, 8, 6, 6]\n", + " 0\n", " \n", " \n", " 2\n", - " 0.940949\n", - " 0.028501\n", - " 0.774253\n", - " 0.323579\n", - " -0.417690\n", - " 8\n", - " 9\n", + " -0.044618\n", + " -0.087187\n", + " 0.230690\n", + " -0.509112\n", + " -0.697787\n", + " 0\n", " 7\n", + " 0\n", " 2\n", - " 8\n", - " 2\n", - " 8\n", - " [-0.21767810959793432, 0.7070469796179955, -0....\n", - " [0.8299352235765285, -0.0961479916392276, 0.60...\n", - " [2, 4, 5, 9, 4, 0]\n", - " [2]\n", - " [1, 4, 1, 0, 1, 5, 2, 0, 8, 3]\n", - " 1\n", + " 5\n", + " 5\n", + " 9\n", + " [-0.8096748535850016, 0.5605113724404849, -0.1...\n", + " [-0.6596240142354801, 0.8900409553874395, -0.9...\n", + " [9, 1, 2, 5, 3, 2, 1]\n", + " [4, 3, 9, 7, 3, 9, 4, 0, 8, 0, 1, 4, 0, 1, 2, ...\n", + " [8, 4, 6, 6, 7, 3, 0]\n", + " 0\n", " \n", " \n", " 3\n", - " -0.649665\n", - " 0.836329\n", - " -0.965379\n", - " -0.689815\n", - " 0.946057\n", + " 0.326872\n", + " 0.843668\n", + " 0.784894\n", + " -0.827104\n", + " 0.754604\n", " 9\n", - " 4\n", " 1\n", - " 9\n", " 2\n", - " 1\n", - " 4\n", - " [-0.9109288944751086, 0.017280738609947832, -0...\n", - " [0.47963783275236826]\n", - " [8, 3, 9, 9, 5, 5, 2, 7, 8, 7, 0, 4, 8, 8, 3]\n", - " [3, 9, 8, 1, 6, 3, 4, 2, 1, 2, 3, 2, 7, 6, 1]\n", - " [0, 4, 5, 9, 4, 6, 0, 8, 6, 9, 5, 2, 6]\n", + " 9\n", + " 8\n", + " 5\n", + " 6\n", + " [-0.6567031654231053, 0.7465280775584306, -0.4...\n", + " [0.3959883972561651, 0.6893892305272766, 0.037...\n", + " [4, 0, 8, 4, 1, 7, 1, 9, 3, 9, 9, 8, 4]\n", + " [1, 4, 1, 0, 6, 3, 9, 9, 5, 7, 3, 5, 9]\n", + " [9, 5, 4, 1, 8, 0, 9, 6, 0, 6, 4, 7, 6, 5, 8, ...\n", " 1\n", " \n", " \n", " 4\n", - " 0.786037\n", - " -0.550252\n", - " 0.824308\n", - " -0.051628\n", - " 0.742293\n", - " 8\n", - " 0\n", - " 6\n", - " 1\n", + " 0.790391\n", + " -0.497946\n", + " 0.393088\n", + " 0.511822\n", + " 0.178066\n", + " 4\n", " 5\n", - " 9\n", - " 3\n", - " [-0.9196272581988192, 0.26653671490649056]\n", - " [-0.34703584253681274, -0.38656340762419905, -...\n", - " [8]\n", - " [7, 4]\n", - " [2, 6]\n", " 1\n", + " 4\n", + " 7\n", + " 6\n", + " 4\n", + " [-0.7939317698805488, -0.3741631460119641, -0....\n", + " [-0.5244749482733557, 0.8530664847999185]\n", + " [7, 4, 3, 8, 3]\n", + " [5]\n", + " [2, 4, 6, 3, 9, 4, 6]\n", + " 0\n", " \n", " \n", "\n", @@ -306,49 +1801,49 @@ ], "text/plain": [ " cont0 cont1 cont2 cont3 cont4 cat0 cat1 cat2 cat3 \\\n", - "0 -0.598272 -0.665893 -0.315125 0.702566 -0.323849 5 3 5 2 \n", - "1 -0.302763 0.102246 -0.197020 0.157954 -0.108024 9 4 8 7 \n", - "2 0.940949 0.028501 0.774253 0.323579 -0.417690 8 9 7 2 \n", - "3 -0.649665 0.836329 -0.965379 -0.689815 0.946057 9 4 1 9 \n", - "4 0.786037 -0.550252 0.824308 -0.051628 0.742293 8 0 6 1 \n", + "0 -0.397172 0.532571 0.759104 0.594407 -0.524947 8 7 2 1 \n", + "1 0.862493 -0.109889 -0.603772 0.057867 0.467439 0 3 8 8 \n", + "2 -0.044618 -0.087187 0.230690 -0.509112 -0.697787 0 7 0 2 \n", + "3 0.326872 0.843668 0.784894 -0.827104 0.754604 9 1 2 9 \n", + "4 0.790391 -0.497946 0.393088 0.511822 0.178066 4 5 1 4 \n", "\n", " cat4 cat5 cat6 cont_list0 \\\n", - "0 4 8 3 
[0.5072592597602565, -0.47816687381189427, -0.... \n", - "1 7 3 6 [-0.08574677514984685, -0.4996899616827948, -0... \n", - "2 8 2 8 [-0.21767810959793432, 0.7070469796179955, -0.... \n", - "3 2 1 4 [-0.9109288944751086, 0.017280738609947832, -0... \n", - "4 5 9 3 [-0.9196272581988192, 0.26653671490649056] \n", + "0 0 5 0 [-0.6707850610776214, -0.05450009496947694, -0... \n", + "1 4 0 9 [-0.05660807519951194, -0.07055138523986693, 0... \n", + "2 5 5 9 [-0.8096748535850016, 0.5605113724404849, -0.1... \n", + "3 8 5 6 [-0.6567031654231053, 0.7465280775584306, -0.4... \n", + "4 7 6 4 [-0.7939317698805488, -0.3741631460119641, -0.... \n", "\n", " cont_list1 \\\n", - "0 [] \n", - "1 [0.23373279277235692, 0.9775817910065407, 0.14... \n", - "2 [0.8299352235765285, -0.0961479916392276, 0.60... \n", - "3 [0.47963783275236826] \n", - "4 [-0.34703584253681274, -0.38656340762419905, -... \n", + "0 [0.5190079351542354, -0.3838151710281379, 0.44... \n", + "1 [-0.6486521298228771, -0.8936482314761995, -0.... \n", + "2 [-0.6596240142354801, 0.8900409553874395, -0.9... \n", + "3 [0.3959883972561651, 0.6893892305272766, 0.037... \n", + "4 [-0.5244749482733557, 0.8530664847999185] \n", "\n", - " cat_list0 \\\n", - "0 [4, 2, 5, 1, 7, 6, 4, 8, 8, 7, 6, 0, 3, 0, 9, ... \n", - "1 [2, 5, 0, 8, 6, 0, 2, 8, 4, 7, 2, 9, 7] \n", - "2 [2, 4, 5, 9, 4, 0] \n", - "3 [8, 3, 9, 9, 5, 5, 2, 7, 8, 7, 0, 4, 8, 8, 3] \n", - "4 [8] \n", + " cat_list0 \\\n", + "0 [1, 0, 1] \n", + "1 [7, 1, 3, 8, 8, 9, 8, 9, 0, 5, 8, 4, 6, 4, 8, 0] \n", + "2 [9, 1, 2, 5, 3, 2, 1] \n", + "3 [4, 0, 8, 4, 1, 7, 1, 9, 3, 9, 9, 8, 4] \n", + "4 [7, 4, 3, 8, 3] \n", "\n", " cat_list1 \\\n", - "0 [0, 1, 5, 9, 8, 1, 9, 3, 4, 5, 9, 1, 8, 0, 9, ... \n", - "1 [9, 1, 0, 6, 5, 1, 5, 3, 8, 1, 3, 6, 8, 7, 4] \n", - "2 [2] \n", - "3 [3, 9, 8, 1, 6, 3, 4, 2, 1, 2, 3, 2, 7, 6, 1] \n", - "4 [7, 4] \n", + "0 [] \n", + "1 [1, 0, 5, 4, 3] \n", + "2 [4, 3, 9, 7, 3, 9, 4, 0, 8, 0, 1, 4, 0, 1, 2, ... \n", + "3 [1, 4, 1, 0, 6, 3, 9, 9, 5, 7, 3, 5, 9] \n", + "4 [5] \n", "\n", " cat_list2 label \n", - "0 [1, 4, 5, 6, 8, 3, 5, 9, 0, 6, 0, 2, 1] 0 \n", - "1 [6, 2, 5, 9, 4, 7, 4, 8, 7, 0, 7, 2, 6, 5, 3, ... 1 \n", - "2 [1, 4, 1, 0, 1, 5, 2, 0, 8, 3] 1 \n", - "3 [0, 4, 5, 9, 4, 6, 0, 8, 6, 9, 5, 2, 6] 1 \n", - "4 [2, 6] 1 " + "0 [7, 1, 9, 8, 0, 8, 6, 7, 2, 8] 0 \n", + "1 [3, 0, 7, 4, 8, 6, 6] 0 \n", + "2 [8, 4, 6, 6, 7, 3, 0] 0 \n", + "3 [9, 5, 4, 1, 8, 0, 9, 6, 0, 6, 4, 7, 6, 5, 8, ... 
1 \n", + "4 [2, 4, 6, 3, 9, 4, 6] 0 " ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -359,7 +1854,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "6a49b022", "metadata": {}, "outputs": [], @@ -389,7 +1884,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "055a8dae", "metadata": {}, "outputs": [ @@ -397,7 +1892,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-08-31 18:57:50.981790: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n" + "2021-09-01 23:34:16.262872: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n" ] } ], @@ -407,7 +1902,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "f8f502ff", "metadata": {}, "outputs": [], @@ -512,7 +2007,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "0ca623b3", "metadata": {}, "outputs": [], @@ -522,7 +2017,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "2619480a", "metadata": {}, "outputs": [], @@ -533,7 +2028,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "b88f1b42", "metadata": {}, "outputs": [], @@ -559,7 +2054,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "31362c7e", "metadata": {}, "outputs": [ @@ -593,7 +2088,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "62fa679c", "metadata": {}, "outputs": [], @@ -613,7 +2108,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "fd930951", "metadata": {}, "outputs": [], @@ -635,7 +2130,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "854f2aa3", "metadata": {}, "outputs": [ @@ -643,33 +2138,33 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-08-31 18:58:05.260995: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", - "2021-08-31 18:58:05.263090: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "2021-09-01 23:34:39.728120: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2021-09-01 23:34:39.730165: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", "pciBusID: 0000:0b:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0\n", "coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s\n", - "2021-08-31 18:58:05.263118: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", - "2021-08-31 18:58:05.263162: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", - "2021-08-31 18:58:05.263196: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", - "2021-08-31 18:58:05.263230: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", - "2021-08-31 18:58:05.263263: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", - "2021-08-31 
18:58:05.263309: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", - "2021-08-31 18:58:05.263342: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", - "2021-08-31 18:58:05.263379: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", - "2021-08-31 18:58:05.267016: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", - "2021-08-31 18:58:05.267818: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "2021-09-01 23:34:39.730197: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2021-09-01 23:34:39.730269: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2021-09-01 23:34:39.730305: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2021-09-01 23:34:39.730341: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2021-09-01 23:34:39.730377: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2021-09-01 23:34:39.730429: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2021-09-01 23:34:39.730466: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2021-09-01 23:34:39.730511: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2021-09-01 23:34:39.733991: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2021-09-01 23:34:39.735034: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2021-08-31 18:58:05.277016: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "2021-09-01 23:34:39.744239: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", "pciBusID: 0000:0b:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0\n", "coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s\n", - "2021-08-31 18:58:05.280616: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", - "2021-08-31 18:58:05.280669: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", - "2021-08-31 18:58:06.732386: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", - "2021-08-31 18:58:06.732444: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", - "2021-08-31 18:58:06.732455: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N 
\n", - "2021-08-31 18:58:06.738142: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 27675 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:0b:00.0, compute capability: 7.0)\n", - "2021-08-31 18:58:06.804145: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", - "2021-08-31 18:58:06.825396: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2194940000 Hz\n", - "20000it [00:12, 1592.98it/s]\n" + "2021-09-01 23:34:39.747918: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2021-09-01 23:34:39.747963: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2021-09-01 23:34:41.243588: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2021-09-01 23:34:41.243665: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2021-09-01 23:34:41.243675: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2021-09-01 23:34:41.250181: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 26435 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:0b:00.0, compute capability: 7.0)\n", + "2021-09-01 23:34:41.318728: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "2021-09-01 23:34:41.341392: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2194940000 Hz\n", + "20000it [00:13, 1444.97it/s]\n" ] } ], @@ -697,7 +2192,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "dab31264", "metadata": {}, "outputs": [ @@ -726,7 +2221,7 @@ " '/raid/tfrecord-test/00000_10.parquet']" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -746,7 +2241,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "b2ce99e0", "metadata": {}, "outputs": [ @@ -794,108 +2289,108 @@ " \n", " \n", " 0\n", - " 6\n", - " 4\n", - " 2\n", + " 7\n", " 1\n", + " 1\n", + " 6\n", " 6\n", " 5\n", - " 8\n", - " [7, 3, 5, 8, 1, 9, 8, 0, 2, 3, 6, 4, 2, 3, 0, ...\n", - " [7, 8, 3, 3, 9, 3, 7, 7, 1, 3, 2, 8, 2, 2, 9, ...\n", - " [9, 9, 1, 9, 2, 0, 5, 1, 2, 4, 7]\n", - " 0.616503\n", - " -0.562299\n", - " 0.294924\n", - " 0.368152\n", - " 0.460910\n", - " [0.10967133, 0.7335732, -0.32737544, 0.5034231...\n", - " [0.37586957, 0.57677925, -0.49405763, -0.38141...\n", + " 5\n", + " [0, 9, 2]\n", + " [7, 9, 0, 1, 1, 3, 5, 5]\n", + " [5, 3, 5, 1, 8, 4, 2, 3, 5, 1, 6, 7, 5, 5, 1, ...\n", + " 0.949754\n", + " -0.748696\n", + " 0.835813\n", + " -0.444217\n", + " 0.907264\n", + " [-0.11095038, -0.26764223, -0.7053523, -0.8485...\n", + " [-0.059809078, -0.022873674, -0.7533006, -0.20...\n", " 0\n", " \n", " \n", " 1\n", - " 7\n", - " 6\n", " 5\n", + " 4\n", " 2\n", - " 2\n", - " 7\n", " 0\n", - " [7, 0, 6, 2, 6, 1, 8, 4, 8, 4, 3, 3, 7, 1, 5]\n", - " []\n", - " [6, 1, 0, 2, 4, 8]\n", - " 0.356238\n", - " -0.996674\n", - " 0.638569\n", - " -0.873461\n", - " -0.549512\n", - " [0.038814984]\n", - " [0.9645752, 0.841208, 0.23322387, -0.000990165...\n", + " 0\n", + " 6\n", + " 8\n", + " [8, 6, 5, 3, 
6, 1, 9, 8, 1, 8, 5, 8, 7, 0, 5, ...\n", + " [3, 8, 0, 0, 8, 1, 6, 0]\n", + " [2, 8, 2, 1, 1, 9, 0, 2, 9, 5, 1, 6, 4, 3, 6, 4]\n", + " 0.483914\n", + " -0.067152\n", + " 0.717157\n", + " -0.530880\n", + " -0.458823\n", + " [-0.63131815, -0.39890215, 0.2429156, -0.36485...\n", + " [-0.15147838, -0.50006855, -0.28108948, -0.313...\n", " 0\n", " \n", " \n", " 2\n", - " 7\n", - " 7\n", " 0\n", - " 3\n", + " 2\n", + " 0\n", " 6\n", - " 9\n", - " 8\n", - " [2, 8, 5, 3, 2, 9, 4, 0, 8, 6, 0, 5, 9, 5, 4]\n", - " [7, 6, 0, 8, 2, 4, 7, 5]\n", - " [3, 0, 8, 9, 5, 4]\n", - " -0.639743\n", - " -0.815482\n", - " -0.808328\n", - " -0.968345\n", - " 0.844720\n", - " [-0.3283896, 0.95472634, -0.8193472, 0.9815109...\n", - " [-0.2411973, 0.21095003, -0.4520857, -0.961866...\n", - " 1\n", + " 3\n", + " 7\n", + " 0\n", + " [5, 5, 3, 3, 3, 1, 3, 5, 3, 0, 1, 6, 2, 5, 3, ...\n", + " [0, 2, 3, 7, 1, 4, 0, 8, 8, 2, 5, 6]\n", + " [7, 3, 9, 4, 0, 2, 4, 1, 7, 3, 3]\n", + " -0.623489\n", + " -0.409438\n", + " 0.376929\n", + " 0.011878\n", + " 0.246547\n", + " [0.5586857, 0.69241923, -0.47191077, 0.3478763...\n", + " [0.20118928, 0.254733, -0.59162587, -0.8194611...\n", + " 0\n", " \n", " \n", " 3\n", - " 6\n", + " 5\n", + " 5\n", + " 4\n", " 4\n", - " 8\n", - " 9\n", - " 0\n", - " 2\n", " 7\n", - " [8, 4, 3, 7, 7, 6, 7, 4, 4, 1, 8, 8, 8, 9, 4]\n", - " [4, 0, 8, 0]\n", - " []\n", - " 0.465874\n", - " -0.870847\n", - " 0.197097\n", - " 0.254493\n", - " -0.363290\n", - " [-0.88873893, 0.45026976, 0.93019474, 0.345771...\n", - " [0.95485276, 0.21282451, 0.32634658, 0.0435548...\n", " 0\n", + " 1\n", + " [4, 2, 6, 6, 1, 4, 7, 7, 1, 8, 2, 4, 8, 8, 0]\n", + " [9, 8, 4, 7, 0, 4, 5, 7, 1, 4, 8, 3, 1, 1, 5, ...\n", + " [6, 4, 5, 9, 8, 3, 2, 9, 0, 7, 0, 1, 3, 9, 0, ...\n", + " 0.015761\n", + " 0.559979\n", + " 0.580781\n", + " -0.555637\n", + " 0.755852\n", + " [0.102869034, 0.78194165, -0.5718406, 0.529719...\n", + " [0.48339945, -0.43163055, -0.04833159, 0.37615...\n", + " 1\n", " \n", " \n", " 4\n", - " 7\n", - " 2\n", - " 5\n", " 6\n", - " 0\n", - " 9\n", - " 7\n", - " [2, 2, 6, 5, 6, 7, 6, 2, 0, 5, 8]\n", - " [1, 4, 2, 9, 6, 3, 2]\n", - " [5, 8, 0, 0, 2, 7, 8, 6, 5, 8, 9, 8, 7, 5, 6, ...\n", - " 0.657215\n", - " 0.697109\n", - " 0.743880\n", - " -0.172813\n", - " -0.301696\n", - " [-0.67113024, 0.15799437, -0.3753272, 0.132746...\n", - " [-0.78169054, 0.8858877, 0.10161541, 0.1666716...\n", " 1\n", + " 3\n", + " 4\n", + " 2\n", + " 7\n", + " 8\n", + " [2, 4, 3, 3, 3, 0, 3, 6, 4, 7, 9, 0, 3, 5, 5]\n", + " []\n", + " [6, 7, 0, 6, 2, 8, 2, 5, 3, 9, 9, 3, 5]\n", + " -0.602396\n", + " -0.190180\n", + " 0.340079\n", + " -0.125164\n", + " -0.455232\n", + " [-0.6019757, 0.43704405]\n", + " [0.3293641, -0.97888887, -0.96398735, -0.61759...\n", + " 0\n", " \n", " \n", "\n", @@ -903,56 +2398,56 @@ ], "text/plain": [ " cat0 cat1 cat2 cat3 cat4 cat5 cat6 \\\n", - "0 6 4 2 1 6 5 8 \n", - "1 7 6 5 2 2 7 0 \n", - "2 7 7 0 3 6 9 8 \n", - "3 6 4 8 9 0 2 7 \n", - "4 7 2 5 6 0 9 7 \n", + "0 7 1 1 6 6 5 5 \n", + "1 5 4 2 0 0 6 8 \n", + "2 0 2 0 6 3 7 0 \n", + "3 5 5 4 4 7 0 1 \n", + "4 6 1 3 4 2 7 8 \n", "\n", " cat_list0 \\\n", - "0 [7, 3, 5, 8, 1, 9, 8, 0, 2, 3, 6, 4, 2, 3, 0, ... \n", - "1 [7, 0, 6, 2, 6, 1, 8, 4, 8, 4, 3, 3, 7, 1, 5] \n", - "2 [2, 8, 5, 3, 2, 9, 4, 0, 8, 6, 0, 5, 9, 5, 4] \n", - "3 [8, 4, 3, 7, 7, 6, 7, 4, 4, 1, 8, 8, 8, 9, 4] \n", - "4 [2, 2, 6, 5, 6, 7, 6, 2, 0, 5, 8] \n", + "0 [0, 9, 2] \n", + "1 [8, 6, 5, 3, 6, 1, 9, 8, 1, 8, 5, 8, 7, 0, 5, ... \n", + "2 [5, 5, 3, 3, 3, 1, 3, 5, 3, 0, 1, 6, 2, 5, 3, ... 
\n", + "3 [4, 2, 6, 6, 1, 4, 7, 7, 1, 8, 2, 4, 8, 8, 0] \n", + "4 [2, 4, 3, 3, 3, 0, 3, 6, 4, 7, 9, 0, 3, 5, 5] \n", "\n", " cat_list1 \\\n", - "0 [7, 8, 3, 3, 9, 3, 7, 7, 1, 3, 2, 8, 2, 2, 9, ... \n", - "1 [] \n", - "2 [7, 6, 0, 8, 2, 4, 7, 5] \n", - "3 [4, 0, 8, 0] \n", - "4 [1, 4, 2, 9, 6, 3, 2] \n", + "0 [7, 9, 0, 1, 1, 3, 5, 5] \n", + "1 [3, 8, 0, 0, 8, 1, 6, 0] \n", + "2 [0, 2, 3, 7, 1, 4, 0, 8, 8, 2, 5, 6] \n", + "3 [9, 8, 4, 7, 0, 4, 5, 7, 1, 4, 8, 3, 1, 1, 5, ... \n", + "4 [] \n", "\n", " cat_list2 cont0 cont1 \\\n", - "0 [9, 9, 1, 9, 2, 0, 5, 1, 2, 4, 7] 0.616503 -0.562299 \n", - "1 [6, 1, 0, 2, 4, 8] 0.356238 -0.996674 \n", - "2 [3, 0, 8, 9, 5, 4] -0.639743 -0.815482 \n", - "3 [] 0.465874 -0.870847 \n", - "4 [5, 8, 0, 0, 2, 7, 8, 6, 5, 8, 9, 8, 7, 5, 6, ... 0.657215 0.697109 \n", + "0 [5, 3, 5, 1, 8, 4, 2, 3, 5, 1, 6, 7, 5, 5, 1, ... 0.949754 -0.748696 \n", + "1 [2, 8, 2, 1, 1, 9, 0, 2, 9, 5, 1, 6, 4, 3, 6, 4] 0.483914 -0.067152 \n", + "2 [7, 3, 9, 4, 0, 2, 4, 1, 7, 3, 3] -0.623489 -0.409438 \n", + "3 [6, 4, 5, 9, 8, 3, 2, 9, 0, 7, 0, 1, 3, 9, 0, ... 0.015761 0.559979 \n", + "4 [6, 7, 0, 6, 2, 8, 2, 5, 3, 9, 9, 3, 5] -0.602396 -0.190180 \n", "\n", " cont2 cont3 cont4 \\\n", - "0 0.294924 0.368152 0.460910 \n", - "1 0.638569 -0.873461 -0.549512 \n", - "2 -0.808328 -0.968345 0.844720 \n", - "3 0.197097 0.254493 -0.363290 \n", - "4 0.743880 -0.172813 -0.301696 \n", + "0 0.835813 -0.444217 0.907264 \n", + "1 0.717157 -0.530880 -0.458823 \n", + "2 0.376929 0.011878 0.246547 \n", + "3 0.580781 -0.555637 0.755852 \n", + "4 0.340079 -0.125164 -0.455232 \n", "\n", " cont_list0 \\\n", - "0 [0.10967133, 0.7335732, -0.32737544, 0.5034231... \n", - "1 [0.038814984] \n", - "2 [-0.3283896, 0.95472634, -0.8193472, 0.9815109... \n", - "3 [-0.88873893, 0.45026976, 0.93019474, 0.345771... \n", - "4 [-0.67113024, 0.15799437, -0.3753272, 0.132746... \n", + "0 [-0.11095038, -0.26764223, -0.7053523, -0.8485... \n", + "1 [-0.63131815, -0.39890215, 0.2429156, -0.36485... \n", + "2 [0.5586857, 0.69241923, -0.47191077, 0.3478763... \n", + "3 [0.102869034, 0.78194165, -0.5718406, 0.529719... \n", + "4 [-0.6019757, 0.43704405] \n", "\n", " cont_list1 label \n", - "0 [0.37586957, 0.57677925, -0.49405763, -0.38141... 0 \n", - "1 [0.9645752, 0.841208, 0.23322387, -0.000990165... 0 \n", - "2 [-0.2411973, 0.21095003, -0.4520857, -0.961866... 1 \n", - "3 [0.95485276, 0.21282451, 0.32634658, 0.0435548... 0 \n", - "4 [-0.78169054, 0.8858877, 0.10161541, 0.1666716... 1 " + "0 [-0.059809078, -0.022873674, -0.7533006, -0.20... 0 \n", + "1 [-0.15147838, -0.50006855, -0.28108948, -0.313... 0 \n", + "2 [0.20118928, 0.254733, -0.59162587, -0.8194611... 0 \n", + "3 [0.48339945, -0.43163055, -0.04833159, 0.37615... 1 \n", + "4 [0.3293641, -0.97888887, -0.96398735, -0.61759... 
0 " ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } diff --git a/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py b/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py index d56ab56ad34..b3e02ae2df0 100644 --- a/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py +++ b/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py @@ -16,10 +16,8 @@ import gc import pandas as pd - -# Some of the functions are copied or inspired by -# https://github.com/schipiga/pandas-tfrecords/ import tensorflow as tf +from pandas_tfrecords.from_tfrecords import _get_feature_type, read_example from tqdm import tqdm @@ -50,41 +48,14 @@ def convert_tfrecords_to_parquet( for file in filenames: dataset = tf.data.TFRecordDataset(file, compression_type=compression_type) features = _detect_schema(dataset) - parser = _read_example(features) + parser = read_example(features) parsed = dataset.map(parser) _to_parquet(parsed, file, output_dir, chunks, convert_lists) -def _read_example(features): - # https://github.com/schipiga/pandas-tfrecords/blob/master/pandas_tfrecords/from_tfrecords.py - def parse(serialized): - example = tf.io.parse_single_example(serialized, features=features) - return example - - return parse - - -def _get_feature_type(feature=None, type_=None): - # https://github.com/schipiga/pandas-tfrecords/blob/master/pandas_tfrecords/from_tfrecords.py - if type_: - return { - int: tf.int64, - float: tf.float32, - str: tf.string, - bytes: tf.string, - }[type_] - - if feature: - if feature.HasField("int64_list"): - return tf.int64 - if feature.HasField("float_list"): - return tf.float32 - if feature.HasField("bytes_list"): - return tf.string - - def _detect_schema(dataset): - # by https://github.com/schipiga/pandas-tfrecords/blob/master/pandas_tfrecords/from_tfrecords.py + # inspired by + # https://github.com/schipiga/pandas-tfrecords/blob/master/pandas_tfrecords/from_tfrecords.py features = {} serialized = next(iter(dataset.map(lambda serialized: serialized))) From 390dacba0a7b8549956b081e2d69aac0c3de8016 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 10 Sep 2021 18:33:18 +0000 Subject: [PATCH 22/23] write to one parquet --- .../tensorflow/TFRecords-To-Parquet.ipynb | 2043 +++-------------- .../tensorflow/tfrecords_to_parquet.py | 35 +- 2 files changed, 370 insertions(+), 1708 deletions(-) diff --git a/examples/tensorflow/TFRecords-To-Parquet.ipynb b/examples/tensorflow/TFRecords-To-Parquet.ipynb index 5d2ea62791c..3b4aba397e2 100644 --- a/examples/tensorflow/TFRecords-To-Parquet.ipynb +++ b/examples/tensorflow/TFRecords-To-Parquet.ipynb @@ -84,1452 +84,66 @@ "root = None\n", "prefix = None\u001b[0m\n", "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", - "Collecting pandas-tfrecords==0.1.5\n", - " Downloading pandas_tfrecords-0.1.5-py3-none-any.whl (7.0 kB)\n", + "Requirement already satisfied: pandas-tfrecords==0.1.5 in /usr/local/lib/python3.8/dist-packages (0.1.5)\n", "Requirement already satisfied: pandas==1.2.4 in /usr/local/lib/python3.8/dist-packages (from pandas-tfrecords==0.1.5) (1.2.4)\n", - "Collecting tensorflow==2.5.0\n", - " Downloading tensorflow-2.5.0-cp38-cp38-manylinux2010_x86_64.whl (454.4 MB)\n", - "\u001b[K |██████████████████████████▍ | 374.7 MB 94.2 MB/s eta 0:00:01" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "IOPub data rate exceeded.\n", - "The notebook server will temporarily stop sending output\n", - "to the client in order to avoid 
crashing it.\n",
      "To change this limit, set the config variable\n",
      "`--NotebookApp.iopub_data_rate_limit`.\n",
      "\n",
      "Current values:\n",
      "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n",
      "NotebookApp.rate_limit_window=3.0 (secs)\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r",
      "\u001b[K 
|███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▋| 449.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 449.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r", - "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 450.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|███████████████████████████████▊| 451.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▊| 451.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|███████████████████████████████▉| 451.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 451.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|███████████████████████████████▉| 452.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|███████████████████████████████▉| 452.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |███████████████████████████████▉| 452.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|████████████████████████████████| 452.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 452.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.3 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|████████████████████████████████| 453.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.4 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.5 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.6 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.7 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
|████████████████████████████████| 453.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.8 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 453.9 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.0 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.1 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K |████████████████████████████████| 454.2 MB 85.6 MB/s eta 0:00:01\r", - "\u001b[K 
[diff of notebook pip-install output cells elided: download progress bars (~454 MB at 85.6 MB/s), "Requirement already satisfied" checks for the pandas-tfrecords==0.1.5 dependency tree (tensorflow==2.5.0, s3fs==2021.6.0, pandas==1.2.4, and their transitive dependencies), and installation of grpcio, tensorflow-estimator, tensorflow, and pandas-tfrecords, ending with scheme.headers uninstall warnings and a pip dependency-resolver ERROR]
This behaviour is the source of the following dependency conflicts.\n", - "tensorflow-transform 1.3.0 requires tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,<2.7,>=1.15.2, but you have tensorflow 2.5.0 which is incompatible.\n", - "tensorflow-transform 1.3.0 requires tfx-bsl<1.4.0,>=1.3.0, but you have tfx-bsl 1.2.0 which is incompatible.\n", - "tensorflow-serving-api 2.6.0 requires tensorflow<3,>=2.6.0, but you have tensorflow 2.5.0 which is incompatible.\n", - "tensorflow-gpu 2.4.2 requires gast==0.3.3, but you have gast 0.4.0 which is incompatible.\n", - "tensorflow-gpu 2.4.2 requires grpcio~=1.32.0, but you have grpcio 1.34.1 which is incompatible.\n", - "tensorflow-gpu 2.4.2 requires h5py~=2.10.0, but you have h5py 3.1.0 which is incompatible.\n", - "tensorflow-gpu 2.4.2 requires tensorflow-estimator<2.5.0,>=2.4.0, but you have tensorflow-estimator 2.5.0 which is incompatible.\n", - "grpcio-channelz 1.39.0 requires grpcio>=1.39.0, but you have grpcio 1.34.1 which is incompatible.\u001b[0m\n", - "Successfully installed grpcio-1.34.1 pandas-tfrecords-0.1.5 tensorflow-2.5.0 tensorflow-estimator-2.5.0\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.2.8)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (4.7.2)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (4.2.2)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.8/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.3.0)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.8/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.4.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2021.5.30)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.0.3)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.2)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.8/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.1.1)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", "\u001b[33mWARNING: You are using pip version 21.2.1; however, version 21.2.4 is available.\n", "You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" @@ -1692,107 +306,107 @@ " \n", " \n", " 0\n", - " -0.397172\n", - " 0.532571\n", - " 0.759104\n", - " 0.594407\n", - " -0.524947\n", - " 8\n", - " 7\n", - " 2\n", - " 1\n", + " 0.495186\n", + " -0.156929\n", + " 0.924507\n", + " 0.831834\n", + " -0.562554\n", + " 9\n", " 0\n", " 5\n", - " 0\n", - " [-0.6707850610776214, -0.05450009496947694, -0...\n", - " [0.5190079351542354, -0.3838151710281379, 0.44...\n", - " [1, 0, 1]\n", - " []\n", - " [7, 1, 9, 8, 0, 8, 6, 7, 2, 8]\n", + " 8\n", + " 9\n", + " 5\n", + " 3\n", + " [-0.08782642389819229, 0.8463009855676584, -0....\n", + " [0.7467091963784316, 0.6435026054221511, -0.14...\n", + " [1, 6, 3]\n", + " [4, 7, 9, 6, 4, 7, 6, 6, 1, 0, 2, 7, 6, 9]\n", + " [8, 6, 7, 4, 1, 2, 2]\n", " 0\n", " \n", " \n", " 1\n", - " 0.862493\n", - " -0.109889\n", - " -0.603772\n", - " 0.057867\n", - " 0.467439\n", - " 0\n", - " 3\n", - " 8\n", - " 8\n", + " -0.868427\n", + " -0.867583\n", + " 0.225373\n", + " 0.649436\n", + " 0.544803\n", " 4\n", - " 0\n", + " 5\n", + " 3\n", + " 1\n", " 9\n", - " [-0.05660807519951194, -0.07055138523986693, 0...\n", - " [-0.6486521298228771, -0.8936482314761995, -0....\n", - " [7, 1, 3, 8, 8, 9, 8, 9, 0, 5, 8, 4, 6, 4, 8, 0]\n", - " [1, 0, 5, 4, 3]\n", - " [3, 0, 7, 4, 8, 6, 6]\n", - " 0\n", + " 4\n", + " 6\n", + " [0.038860353982820284, 0.19487974734607683, -0...\n", + " [0.23944333485883984, -0.9628970811058808, -0....\n", + " [4, 1, 2]\n", + " [7, 5, 5, 8, 9, 0, 4, 2, 9, 0, 5]\n", + " [5, 5, 7]\n", + " 1\n", " \n", " \n", " 2\n", - " -0.044618\n", - " -0.087187\n", - " 0.230690\n", - " -0.509112\n", - " -0.697787\n", - " 0\n", - " 7\n", + " 0.972268\n", + " -0.650685\n", + " -0.689674\n", + " -0.780836\n", + " 0.677578\n", + " 1\n", + " 4\n", " 0\n", - " 2\n", - " 5\n", - " 5\n", - " 9\n", - " [-0.8096748535850016, 0.5605113724404849, -0.1...\n", - " [-0.6596240142354801, 0.8900409553874395, -0.9...\n", - " [9, 1, 2, 5, 3, 2, 1]\n", - " [4, 3, 9, 7, 3, 9, 4, 0, 8, 0, 1, 4, 0, 1, 2, ...\n", - " [8, 4, 6, 6, 7, 3, 0]\n", + " 3\n", " 0\n", + " 3\n", + " 4\n", + " [0.48108993016978885, -0.49063530543434286, 0....\n", + " [0.9539496388151523]\n", + " []\n", + " [6, 5, 6, 2, 1, 2, 0, 1, 1, 7, 7, 8, 4, 1]\n", + " [3, 1, 4, 0, 2, 8, 5, 3, 8, 4, 7, 7, 7, 5, 7]\n", + " 1\n", " \n", " \n", " 3\n", - " 0.326872\n", - " 0.843668\n", - " 0.784894\n", - " -0.827104\n", - " 0.754604\n", - " 9\n", - " 1\n", - " 2\n", - " 9\n", + " 0.982108\n", + " 0.806546\n", + " 0.150100\n", + " 0.488589\n", + " 0.353447\n", + " 3\n", " 8\n", - " 5\n", " 6\n", - " [-0.6567031654231053, 0.7465280775584306, -0.4...\n", - " [0.3959883972561651, 0.6893892305272766, 0.037...\n", - " [4, 0, 8, 4, 1, 7, 1, 9, 3, 9, 9, 8, 4]\n", - " [1, 4, 1, 0, 6, 3, 9, 9, 5, 7, 3, 5, 9]\n", - " [9, 5, 4, 1, 8, 0, 9, 6, 0, 6, 4, 7, 6, 5, 8, ...\n", + " 1\n", + " 4\n", + " 1\n", + " 3\n", + " [-0.3854225587686282, 0.5811189366242433, 0.08...\n", + " [0.8969990392704366, -0.958170926962973, 0.622...\n", + " [6, 2, 5, 5, 9, 9, 8, 2, 6, 4, 5, 4]\n", + " [8]\n", + " [6, 6, 8, 4, 0, 8]\n", " 1\n", " \n", " \n", " 4\n", - " 0.790391\n", - " -0.497946\n", - " 0.393088\n", - " 0.511822\n", - " 0.178066\n", - " 4\n", - " 5\n", - " 1\n", - " 4\n", - " 7\n", + " -0.044900\n", + " 0.653093\n", 
+ " 0.775169\n", + " 0.230217\n", + " -0.280215\n", " 6\n", - " 4\n", - " [-0.7939317698805488, -0.3741631460119641, -0....\n", - " [-0.5244749482733557, 0.8530664847999185]\n", - " [7, 4, 3, 8, 3]\n", - " [5]\n", - " [2, 4, 6, 3, 9, 4, 6]\n", + " 9\n", + " 3\n", + " 7\n", + " 9\n", + " 5\n", + " 9\n", + " []\n", + " [0.9450709782415434, -0.07168300021759921]\n", + " [2]\n", + " [7, 2, 3, 0, 6, 8, 4, 8, 5, 1, 9, 5, 8, 6, 6, 9]\n", + " [3, 2, 8, 0, 6, 2, 3, 2, 0, 9, 1, 7]\n", " 0\n", " \n", " \n", @@ -1801,46 +415,46 @@ ], "text/plain": [ " cont0 cont1 cont2 cont3 cont4 cat0 cat1 cat2 cat3 \\\n", - "0 -0.397172 0.532571 0.759104 0.594407 -0.524947 8 7 2 1 \n", - "1 0.862493 -0.109889 -0.603772 0.057867 0.467439 0 3 8 8 \n", - "2 -0.044618 -0.087187 0.230690 -0.509112 -0.697787 0 7 0 2 \n", - "3 0.326872 0.843668 0.784894 -0.827104 0.754604 9 1 2 9 \n", - "4 0.790391 -0.497946 0.393088 0.511822 0.178066 4 5 1 4 \n", + "0 0.495186 -0.156929 0.924507 0.831834 -0.562554 9 0 5 8 \n", + "1 -0.868427 -0.867583 0.225373 0.649436 0.544803 4 5 3 1 \n", + "2 0.972268 -0.650685 -0.689674 -0.780836 0.677578 1 4 0 3 \n", + "3 0.982108 0.806546 0.150100 0.488589 0.353447 3 8 6 1 \n", + "4 -0.044900 0.653093 0.775169 0.230217 -0.280215 6 9 3 7 \n", "\n", " cat4 cat5 cat6 cont_list0 \\\n", - "0 0 5 0 [-0.6707850610776214, -0.05450009496947694, -0... \n", - "1 4 0 9 [-0.05660807519951194, -0.07055138523986693, 0... \n", - "2 5 5 9 [-0.8096748535850016, 0.5605113724404849, -0.1... \n", - "3 8 5 6 [-0.6567031654231053, 0.7465280775584306, -0.4... \n", - "4 7 6 4 [-0.7939317698805488, -0.3741631460119641, -0.... \n", + "0 9 5 3 [-0.08782642389819229, 0.8463009855676584, -0.... \n", + "1 9 4 6 [0.038860353982820284, 0.19487974734607683, -0... \n", + "2 0 3 4 [0.48108993016978885, -0.49063530543434286, 0.... \n", + "3 4 1 3 [-0.3854225587686282, 0.5811189366242433, 0.08... \n", + "4 9 5 9 [] \n", "\n", " cont_list1 \\\n", - "0 [0.5190079351542354, -0.3838151710281379, 0.44... \n", - "1 [-0.6486521298228771, -0.8936482314761995, -0.... \n", - "2 [-0.6596240142354801, 0.8900409553874395, -0.9... \n", - "3 [0.3959883972561651, 0.6893892305272766, 0.037... \n", - "4 [-0.5244749482733557, 0.8530664847999185] \n", + "0 [0.7467091963784316, 0.6435026054221511, -0.14... \n", + "1 [0.23944333485883984, -0.9628970811058808, -0.... \n", + "2 [0.9539496388151523] \n", + "3 [0.8969990392704366, -0.958170926962973, 0.622... \n", + "4 [0.9450709782415434, -0.07168300021759921] \n", "\n", - " cat_list0 \\\n", - "0 [1, 0, 1] \n", - "1 [7, 1, 3, 8, 8, 9, 8, 9, 0, 5, 8, 4, 6, 4, 8, 0] \n", - "2 [9, 1, 2, 5, 3, 2, 1] \n", - "3 [4, 0, 8, 4, 1, 7, 1, 9, 3, 9, 9, 8, 4] \n", - "4 [7, 4, 3, 8, 3] \n", + " cat_list0 \\\n", + "0 [1, 6, 3] \n", + "1 [4, 1, 2] \n", + "2 [] \n", + "3 [6, 2, 5, 5, 9, 9, 8, 2, 6, 4, 5, 4] \n", + "4 [2] \n", "\n", - " cat_list1 \\\n", - "0 [] \n", - "1 [1, 0, 5, 4, 3] \n", - "2 [4, 3, 9, 7, 3, 9, 4, 0, 8, 0, 1, 4, 0, 1, 2, ... \n", - "3 [1, 4, 1, 0, 6, 3, 9, 9, 5, 7, 3, 5, 9] \n", - "4 [5] \n", + " cat_list1 \\\n", + "0 [4, 7, 9, 6, 4, 7, 6, 6, 1, 0, 2, 7, 6, 9] \n", + "1 [7, 5, 5, 8, 9, 0, 4, 2, 9, 0, 5] \n", + "2 [6, 5, 6, 2, 1, 2, 0, 1, 1, 7, 7, 8, 4, 1] \n", + "3 [8] \n", + "4 [7, 2, 3, 0, 6, 8, 4, 8, 5, 1, 9, 5, 8, 6, 6, 9] \n", "\n", - " cat_list2 label \n", - "0 [7, 1, 9, 8, 0, 8, 6, 7, 2, 8] 0 \n", - "1 [3, 0, 7, 4, 8, 6, 6] 0 \n", - "2 [8, 4, 6, 6, 7, 3, 0] 0 \n", - "3 [9, 5, 4, 1, 8, 0, 9, 6, 0, 6, 4, 7, 6, 5, 8, ... 
1 \n", - "4 [2, 4, 6, 3, 9, 4, 6] 0 " + " cat_list2 label \n", + "0 [8, 6, 7, 4, 1, 2, 2] 0 \n", + "1 [5, 5, 7] 1 \n", + "2 [3, 1, 4, 0, 2, 8, 5, 3, 8, 4, 7, 7, 7, 5, 7] 1 \n", + "3 [6, 6, 8, 4, 0, 8] 1 \n", + "4 [3, 2, 8, 0, 6, 2, 3, 2, 0, 9, 1, 7] 0 " ] }, "execution_count": 7, @@ -1892,7 +506,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-09-01 23:34:16.262872: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n" + "2021-09-10 18:30:07.714270: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n" ] } ], @@ -2131,6 +745,45 @@ { "cell_type": "code", "execution_count": 17, + "id": "d249b965", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/raid/tfrecord-test/00000.tfrecords']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filenames" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "96eb5b32", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rm: cannot remove '/raid/tfrecord-test/00000.parquet': No such file or directory\r\n" + ] + } + ], + "source": [ + "!rm -r /raid/tfrecord-test/00000.parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "id": "854f2aa3", "metadata": {}, "outputs": [ @@ -2138,33 +791,33 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-09-01 23:34:39.728120: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", - "2021-09-01 23:34:39.730165: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "2021-09-10 18:30:22.756211: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2021-09-10 18:30:22.758160: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", "pciBusID: 0000:0b:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0\n", "coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s\n", - "2021-09-01 23:34:39.730197: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", - "2021-09-01 23:34:39.730269: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", - "2021-09-01 23:34:39.730305: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", - "2021-09-01 23:34:39.730341: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", - "2021-09-01 23:34:39.730377: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", - "2021-09-01 23:34:39.730429: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", - "2021-09-01 23:34:39.730466: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", - "2021-09-01 23:34:39.730511: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", - "2021-09-01 23:34:39.733991: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu 
devices: 0\n", - "2021-09-01 23:34:39.735034: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "2021-09-10 18:30:22.758196: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2021-09-10 18:30:22.758265: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2021-09-10 18:30:22.758308: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2021-09-10 18:30:22.758350: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2021-09-10 18:30:22.758389: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2021-09-10 18:30:22.758446: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2021-09-10 18:30:22.758487: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2021-09-10 18:30:22.758533: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2021-09-10 18:30:22.762003: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2021-09-10 18:30:22.763646: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2021-09-01 23:34:39.744239: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "2021-09-10 18:30:22.773431: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", "pciBusID: 0000:0b:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0\n", "coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s\n", - "2021-09-01 23:34:39.747918: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", - "2021-09-01 23:34:39.747963: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", - "2021-09-01 23:34:41.243588: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", - "2021-09-01 23:34:41.243665: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", - "2021-09-01 23:34:41.243675: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", - "2021-09-01 23:34:41.250181: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 26435 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:0b:00.0, compute capability: 7.0)\n", - "2021-09-01 23:34:41.318728: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", - "2021-09-01 23:34:41.341392: I 
tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2194940000 Hz\n", - "20000it [00:13, 1444.97it/s]\n" + "2021-09-10 18:30:22.776783: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2021-09-10 18:30:22.776834: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2021-09-10 18:30:24.163702: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2021-09-10 18:30:24.163757: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2021-09-10 18:30:24.163767: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2021-09-10 18:30:24.168933: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 26826 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:0b:00.0, compute capability: 7.0)\n", + "2021-09-10 18:30:24.235412: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "2021-09-10 18:30:24.257477: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2194940000 Hz\n", + "20000it [00:12, 1593.04it/s]\n" ] } ], @@ -2192,36 +845,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "id": "dab31264", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['/raid/tfrecord-test/00000_1.parquet',\n", - " '/raid/tfrecord-test/00000_9.parquet',\n", - " '/raid/tfrecord-test/00000_14.parquet',\n", - " '/raid/tfrecord-test/00000_2.parquet',\n", - " '/raid/tfrecord-test/00000_17.parquet',\n", - " '/raid/tfrecord-test/00000_6.parquet',\n", - " '/raid/tfrecord-test/00000_0.parquet',\n", - " '/raid/tfrecord-test/00000_19.parquet',\n", - " '/raid/tfrecord-test/00000_3.parquet',\n", - " '/raid/tfrecord-test/00000_15.parquet',\n", - " '/raid/tfrecord-test/00000_7.parquet',\n", - " '/raid/tfrecord-test/00000_4.parquet',\n", - " '/raid/tfrecord-test/00000_11.parquet',\n", - " '/raid/tfrecord-test/00000_8.parquet',\n", - " '/raid/tfrecord-test/00000_12.parquet',\n", - " '/raid/tfrecord-test/00000_18.parquet',\n", - " '/raid/tfrecord-test/00000_5.parquet',\n", - " '/raid/tfrecord-test/00000_16.parquet',\n", - " '/raid/tfrecord-test/00000_13.parquet',\n", - " '/raid/tfrecord-test/00000_10.parquet']" + "['/raid/tfrecord-test/00000.parquet']" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2241,8 +875,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "b2ce99e0", + "execution_count": 22, + "id": "0bd30a89", "metadata": {}, "outputs": [ { @@ -2289,107 +923,107 @@ " \n", " \n", " 0\n", - " 7\n", - " 1\n", - " 1\n", - " 6\n", - " 6\n", + " 9\n", + " 0\n", " 5\n", + " 8\n", + " 9\n", " 5\n", - " [0, 9, 2]\n", - " [7, 9, 0, 1, 1, 3, 5, 5]\n", - " [5, 3, 5, 1, 8, 4, 2, 3, 5, 1, 6, 7, 5, 5, 1, ...\n", - " 0.949754\n", - " -0.748696\n", - " 0.835813\n", - " -0.444217\n", - " 0.907264\n", - " [-0.11095038, -0.26764223, -0.7053523, -0.8485...\n", - " [-0.059809078, -0.022873674, -0.7533006, -0.20...\n", + " 3\n", + " [1, 6, 3]\n", + " [4, 7, 9, 6, 4, 7, 6, 6, 1, 0, 2, 7, 6, 9]\n", + " [8, 6, 7, 4, 1, 2, 2]\n", + " 0.495186\n", + " -0.156929\n", + " 0.924507\n", + " 0.831834\n", + " -0.562554\n", + " [-0.08782642, 0.84630096, -0.34404242, -0.7675...\n", + " [0.74670917, 0.6435026, 
-0.14159574, -0.590101...\n", " 0\n", " \n", " \n", " 1\n", + " 4\n", " 5\n", + " 3\n", + " 1\n", + " 9\n", " 4\n", - " 2\n", - " 0\n", - " 0\n", " 6\n", - " 8\n", - " [8, 6, 5, 3, 6, 1, 9, 8, 1, 8, 5, 8, 7, 0, 5, ...\n", - " [3, 8, 0, 0, 8, 1, 6, 0]\n", - " [2, 8, 2, 1, 1, 9, 0, 2, 9, 5, 1, 6, 4, 3, 6, 4]\n", - " 0.483914\n", - " -0.067152\n", - " 0.717157\n", - " -0.530880\n", - " -0.458823\n", - " [-0.63131815, -0.39890215, 0.2429156, -0.36485...\n", - " [-0.15147838, -0.50006855, -0.28108948, -0.313...\n", - " 0\n", + " [4, 1, 2]\n", + " [7, 5, 5, 8, 9, 0, 4, 2, 9, 0, 5]\n", + " [5, 5, 7]\n", + " -0.868427\n", + " -0.867583\n", + " 0.225373\n", + " 0.649436\n", + " 0.544803\n", + " [0.038860355, 0.19487974, -0.63031155, 0.36691...\n", + " [0.23944333, -0.96289706, -0.7723948, 0.347194...\n", + " 1\n", " \n", " \n", " 2\n", + " 1\n", + " 4\n", " 0\n", - " 2\n", - " 0\n", - " 6\n", " 3\n", - " 7\n", - " 0\n", - " [5, 5, 3, 3, 3, 1, 3, 5, 3, 0, 1, 6, 2, 5, 3, ...\n", - " [0, 2, 3, 7, 1, 4, 0, 8, 8, 2, 5, 6]\n", - " [7, 3, 9, 4, 0, 2, 4, 1, 7, 3, 3]\n", - " -0.623489\n", - " -0.409438\n", - " 0.376929\n", - " 0.011878\n", - " 0.246547\n", - " [0.5586857, 0.69241923, -0.47191077, 0.3478763...\n", - " [0.20118928, 0.254733, -0.59162587, -0.8194611...\n", " 0\n", + " 3\n", + " 4\n", + " []\n", + " [6, 5, 6, 2, 1, 2, 0, 1, 1, 7, 7, 8, 4, 1]\n", + " [3, 1, 4, 0, 2, 8, 5, 3, 8, 4, 7, 7, 7, 5, 7]\n", + " 0.972268\n", + " -0.650685\n", + " -0.689674\n", + " -0.780836\n", + " 0.677578\n", + " [0.48108992, -0.4906353, 0.5207957, -0.5258586...\n", + " [0.95394963]\n", + " 1\n", " \n", " \n", " 3\n", - " 5\n", - " 5\n", - " 4\n", + " 3\n", + " 8\n", + " 6\n", + " 1\n", " 4\n", - " 7\n", - " 0\n", " 1\n", - " [4, 2, 6, 6, 1, 4, 7, 7, 1, 8, 2, 4, 8, 8, 0]\n", - " [9, 8, 4, 7, 0, 4, 5, 7, 1, 4, 8, 3, 1, 1, 5, ...\n", - " [6, 4, 5, 9, 8, 3, 2, 9, 0, 7, 0, 1, 3, 9, 0, ...\n", - " 0.015761\n", - " 0.559979\n", - " 0.580781\n", - " -0.555637\n", - " 0.755852\n", - " [0.102869034, 0.78194165, -0.5718406, 0.529719...\n", - " [0.48339945, -0.43163055, -0.04833159, 0.37615...\n", + " 3\n", + " [6, 2, 5, 5, 9, 9, 8, 2, 6, 4, 5, 4]\n", + " [8]\n", + " [6, 6, 8, 4, 0, 8]\n", + " 0.982108\n", + " 0.806546\n", + " 0.150100\n", + " 0.488589\n", + " 0.353447\n", + " [-0.38542256, 0.58111894, 0.08629591, -0.63986...\n", + " [0.89699906, -0.95817095, 0.62256795, 0.141688...\n", " 1\n", " \n", " \n", " 4\n", " 6\n", - " 1\n", + " 9\n", " 3\n", - " 4\n", - " 2\n", " 7\n", - " 8\n", - " [2, 4, 3, 3, 3, 0, 3, 6, 4, 7, 9, 0, 3, 5, 5]\n", + " 9\n", + " 5\n", + " 9\n", + " [2]\n", + " [7, 2, 3, 0, 6, 8, 4, 8, 5, 1, 9, 5, 8, 6, 6, 9]\n", + " [3, 2, 8, 0, 6, 2, 3, 2, 0, 9, 1, 7]\n", + " -0.044900\n", + " 0.653093\n", + " 0.775169\n", + " 0.230217\n", + " -0.280215\n", " []\n", - " [6, 7, 0, 6, 2, 8, 2, 5, 3, 9, 9, 3, 5]\n", - " -0.602396\n", - " -0.190180\n", - " 0.340079\n", - " -0.125164\n", - " -0.455232\n", - " [-0.6019757, 0.43704405]\n", - " [0.3293641, -0.97888887, -0.96398735, -0.61759...\n", + " [0.945071, -0.071683]\n", " 0\n", " \n", " \n", @@ -2398,64 +1032,87 @@ ], "text/plain": [ " cat0 cat1 cat2 cat3 cat4 cat5 cat6 \\\n", - "0 7 1 1 6 6 5 5 \n", - "1 5 4 2 0 0 6 8 \n", - "2 0 2 0 6 3 7 0 \n", - "3 5 5 4 4 7 0 1 \n", - "4 6 1 3 4 2 7 8 \n", + "0 9 0 5 8 9 5 3 \n", + "1 4 5 3 1 9 4 6 \n", + "2 1 4 0 3 0 3 4 \n", + "3 3 8 6 1 4 1 3 \n", + "4 6 9 3 7 9 5 9 \n", "\n", - " cat_list0 \\\n", - "0 [0, 9, 2] \n", - "1 [8, 6, 5, 3, 6, 1, 9, 8, 1, 8, 5, 8, 7, 0, 5, ... 
\n", - "2 [5, 5, 3, 3, 3, 1, 3, 5, 3, 0, 1, 6, 2, 5, 3, ... \n", - "3 [4, 2, 6, 6, 1, 4, 7, 7, 1, 8, 2, 4, 8, 8, 0] \n", - "4 [2, 4, 3, 3, 3, 0, 3, 6, 4, 7, 9, 0, 3, 5, 5] \n", + " cat_list0 \\\n", + "0 [1, 6, 3] \n", + "1 [4, 1, 2] \n", + "2 [] \n", + "3 [6, 2, 5, 5, 9, 9, 8, 2, 6, 4, 5, 4] \n", + "4 [2] \n", "\n", - " cat_list1 \\\n", - "0 [7, 9, 0, 1, 1, 3, 5, 5] \n", - "1 [3, 8, 0, 0, 8, 1, 6, 0] \n", - "2 [0, 2, 3, 7, 1, 4, 0, 8, 8, 2, 5, 6] \n", - "3 [9, 8, 4, 7, 0, 4, 5, 7, 1, 4, 8, 3, 1, 1, 5, ... \n", - "4 [] \n", + " cat_list1 \\\n", + "0 [4, 7, 9, 6, 4, 7, 6, 6, 1, 0, 2, 7, 6, 9] \n", + "1 [7, 5, 5, 8, 9, 0, 4, 2, 9, 0, 5] \n", + "2 [6, 5, 6, 2, 1, 2, 0, 1, 1, 7, 7, 8, 4, 1] \n", + "3 [8] \n", + "4 [7, 2, 3, 0, 6, 8, 4, 8, 5, 1, 9, 5, 8, 6, 6, 9] \n", "\n", - " cat_list2 cont0 cont1 \\\n", - "0 [5, 3, 5, 1, 8, 4, 2, 3, 5, 1, 6, 7, 5, 5, 1, ... 0.949754 -0.748696 \n", - "1 [2, 8, 2, 1, 1, 9, 0, 2, 9, 5, 1, 6, 4, 3, 6, 4] 0.483914 -0.067152 \n", - "2 [7, 3, 9, 4, 0, 2, 4, 1, 7, 3, 3] -0.623489 -0.409438 \n", - "3 [6, 4, 5, 9, 8, 3, 2, 9, 0, 7, 0, 1, 3, 9, 0, ... 0.015761 0.559979 \n", - "4 [6, 7, 0, 6, 2, 8, 2, 5, 3, 9, 9, 3, 5] -0.602396 -0.190180 \n", + " cat_list2 cont0 cont1 \\\n", + "0 [8, 6, 7, 4, 1, 2, 2] 0.495186 -0.156929 \n", + "1 [5, 5, 7] -0.868427 -0.867583 \n", + "2 [3, 1, 4, 0, 2, 8, 5, 3, 8, 4, 7, 7, 7, 5, 7] 0.972268 -0.650685 \n", + "3 [6, 6, 8, 4, 0, 8] 0.982108 0.806546 \n", + "4 [3, 2, 8, 0, 6, 2, 3, 2, 0, 9, 1, 7] -0.044900 0.653093 \n", "\n", " cont2 cont3 cont4 \\\n", - "0 0.835813 -0.444217 0.907264 \n", - "1 0.717157 -0.530880 -0.458823 \n", - "2 0.376929 0.011878 0.246547 \n", - "3 0.580781 -0.555637 0.755852 \n", - "4 0.340079 -0.125164 -0.455232 \n", + "0 0.924507 0.831834 -0.562554 \n", + "1 0.225373 0.649436 0.544803 \n", + "2 -0.689674 -0.780836 0.677578 \n", + "3 0.150100 0.488589 0.353447 \n", + "4 0.775169 0.230217 -0.280215 \n", "\n", " cont_list0 \\\n", - "0 [-0.11095038, -0.26764223, -0.7053523, -0.8485... \n", - "1 [-0.63131815, -0.39890215, 0.2429156, -0.36485... \n", - "2 [0.5586857, 0.69241923, -0.47191077, 0.3478763... \n", - "3 [0.102869034, 0.78194165, -0.5718406, 0.529719... \n", - "4 [-0.6019757, 0.43704405] \n", + "0 [-0.08782642, 0.84630096, -0.34404242, -0.7675... \n", + "1 [0.038860355, 0.19487974, -0.63031155, 0.36691... \n", + "2 [0.48108992, -0.4906353, 0.5207957, -0.5258586... \n", + "3 [-0.38542256, 0.58111894, 0.08629591, -0.63986... \n", + "4 [] \n", "\n", " cont_list1 label \n", - "0 [-0.059809078, -0.022873674, -0.7533006, -0.20... 0 \n", - "1 [-0.15147838, -0.50006855, -0.28108948, -0.313... 0 \n", - "2 [0.20118928, 0.254733, -0.59162587, -0.8194611... 0 \n", - "3 [0.48339945, -0.43163055, -0.04833159, 0.37615... 1 \n", - "4 [0.3293641, -0.97888887, -0.96398735, -0.61759... 0 " + "0 [0.74670917, 0.6435026, -0.14159574, -0.590101... 0 \n", + "1 [0.23944333, -0.96289706, -0.7723948, 0.347194... 1 \n", + "2 [0.95394963] 1 \n", + "3 [0.89699906, -0.95817095, 0.62256795, 0.141688... 
1 \n", + "4 [0.945071, -0.071683] 0 " ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "import pandas as pd\n", + "\n", "df = pd.read_parquet(filenames[0])\n", "df.head()" ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "b2ce99e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(20000, 18)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] } ], "metadata": { diff --git a/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py b/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py index b3e02ae2df0..48ca0d07c13 100644 --- a/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py +++ b/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py @@ -15,8 +15,10 @@ import gc -import pandas as pd +import cudf import tensorflow as tf +from cudf.core.column.lists import is_list_dtype +from cudf.io.parquet import ParquetWriter from pandas_tfrecords.from_tfrecords import _get_feature_type, read_example from tqdm import tqdm @@ -36,7 +38,7 @@ def convert_tfrecords_to_parquet( compression_type: str Compression type of the tfrecords. Options: `""` (no compression), `"ZLIB"`, or `"GZIP"` chunks: int - Split tfrecords into multiple parquet files + Chunks to convert tfrecords into parquet convert_lists: Boolean Output of tfrecords are lists. Set True to convert lists with fixed length to individual columns in the output dataframe @@ -74,37 +76,40 @@ def _to_parquet(tfrecords, file, output_dir, chunks, convert_lists): out = [] i = 0 j = 0 + w = ParquetWriter(output_dir + file.split("/")[-1].split(".")[0] + ".parquet") for tfrecord in tqdm(tfrecords): row = {key: val.numpy() for key, val in tfrecord.items()} out.append(row) i += 1 if i == chunks: - df = pd.DataFrame(out) + df = cudf.DataFrame(out) if convert_lists: df = _convert_lists(df) - df.to_parquet( - output_dir + file.split("/")[-1].split(".")[0] + "_" + str(j) + ".parquet" - ) + w.write_table(df) i = 0 out = [] j += 1 del df gc.collect() if len(out) > 0: - df = pd.DataFrame(out) + df = cudf.DataFrame(out) if convert_lists: df = _convert_lists(df) - df.to_parquet(output_dir + file.split("/")[-1].split(".")[0] + "_" + str(j) + ".parquet") + w.write_table(df) del df + gc.collect() + w.close() def _convert_lists(df): for col in df.columns: - series_length = df[col].apply(lambda x: len(x)) - if series_length.var() == 0 and series_length.min() > 0: - if series_length.max() == 1: - df[col] = df[col].apply(lambda x: x[0]) - else: - for i in range(series_length.max()): - df[col + "_" + str(i)] = df[col].apply(lambda x: x[i]) + if is_list_dtype(df[col]): + series_length = df[col].list.len() + if series_length.var() == 0 and series_length.min() > 0: + if series_length.max() == 1: + df[col] = df[col].list.get(0) + else: + for i in range(series_length.max()): + df[col + "_" + str(i)] = df[col].list.get(i) + df.drop([col], axis=1, inplace=True) return df From 797e575c85ffac7d5041615714c03125cf85225f Mon Sep 17 00:00:00 2001 From: root Date: Tue, 14 Sep 2021 13:48:41 +0000 Subject: [PATCH 23/23] updates --- .../tensorflow/TFRecords-To-Parquet.ipynb | 568 +++++++++--------- .../framework_utils/tensorflow/__init__.py | 1 - .../tensorflow/tfrecords_to_parquet.py | 4 - 3 files changed, 274 insertions(+), 299 deletions(-) diff --git a/examples/tensorflow/TFRecords-To-Parquet.ipynb b/examples/tensorflow/TFRecords-To-Parquet.ipynb index 3b4aba397e2..52e9834da99 
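The tfrecords_to_parquet.py hunks above replace the per-chunk pandas parquet files with a single cudf ParquetWriter that appends one row group per chunk, and flatten fixed-length list columns through cudf's .list accessor. A minimal, self-contained sketch of those two patterns follows; the toy DataFrames, column names, and the /tmp/example.parquet output path are illustrative assumptions, not part of the patch.

import cudf
from cudf.io.parquet import ParquetWriter

# Stream several chunks into one parquet file: each write_table() call
# appends a row group, and close() finalizes the file footer.
writer = ParquetWriter("/tmp/example.parquet")  # illustrative path
for chunk_id in range(2):
    chunk = cudf.DataFrame({"cont0": [0.1, 0.2, 0.3], "cat0": [1, 2, 3]})
    writer.write_table(chunk)
writer.close()

# Flatten a list column whose rows all share the same non-zero length,
# mirroring the _convert_lists logic: one output column per list position.
df = cudf.DataFrame({"cat_list0": [[1, 2], [3, 4], [5, 6]]})
lengths = df["cat_list0"].list.len()
if lengths.var() == 0 and lengths.min() > 0:
    for i in range(int(lengths.max())):
        df["cat_list0_" + str(i)] = df["cat_list0"].list.get(i)
    df.drop(["cat_list0"], axis=1, inplace=True)

Because every chunk now goes through the same writer, the notebook's later glob over the output directory finds a single 00000.parquet file instead of the twenty 00000_*.parquet shards produced before.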
100644 --- a/examples/tensorflow/TFRecords-To-Parquet.ipynb +++ b/examples/tensorflow/TFRecords-To-Parquet.ipynb @@ -86,62 +86,62 @@ "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", "Requirement already satisfied: pandas-tfrecords==0.1.5 in /usr/local/lib/python3.8/dist-packages (0.1.5)\n", "Requirement already satisfied: pandas==1.2.4 in /usr/local/lib/python3.8/dist-packages (from pandas-tfrecords==0.1.5) (1.2.4)\n", - "Requirement already satisfied: tensorflow==2.5.0 in /usr/local/lib/python3.8/dist-packages (from pandas-tfrecords==0.1.5) (2.5.0)\n", "Requirement already satisfied: numpy>=1.16.5 in /usr/local/lib/python3.8/dist-packages (from pandas-tfrecords==0.1.5) (1.19.5)\n", "Requirement already satisfied: s3fs==2021.6.0 in /usr/local/lib/python3.8/dist-packages (from pandas-tfrecords==0.1.5) (2021.6.0)\n", - "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas==1.2.4->pandas-tfrecords==0.1.5) (2021.1)\n", + "Requirement already satisfied: tensorflow==2.5.0 in /usr/local/lib/python3.8/dist-packages (from pandas-tfrecords==0.1.5) (2.5.0)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas==1.2.4->pandas-tfrecords==0.1.5) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas==1.2.4->pandas-tfrecords==0.1.5) (2021.1)\n", "Requirement already satisfied: aiobotocore>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from s3fs==2021.6.0->pandas-tfrecords==0.1.5) (1.4.1)\n", "Requirement already satisfied: fsspec==2021.06.0 in /usr/local/lib/python3.8/dist-packages (from s3fs==2021.6.0->pandas-tfrecords==0.1.5) (2021.6.0)\n", - "Requirement already satisfied: gast==0.4.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.4.0)\n", + "Requirement already satisfied: wrapt~=1.12.1 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.12.1)\n", + "Requirement already satisfied: keras-preprocessing~=1.1.2 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.1.2)\n", "Requirement already satisfied: astunparse~=1.6.3 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.6.3)\n", "Requirement already satisfied: google-pasta~=0.2 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.2.0)\n", - "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.17.3)\n", - "Requirement already satisfied: keras-nightly~=2.5.0.dev in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.5.0.dev2021032900)\n", - "Requirement already satisfied: keras-preprocessing~=1.1.2 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.1.2)\n", - "Requirement already satisfied: h5py~=3.1.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.1.0)\n", - "Requirement already satisfied: termcolor~=1.1.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.1.0)\n", - "Requirement already satisfied: wheel~=0.35 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.36.2)\n", - "Requirement already satisfied: opt-einsum~=3.3.0 in 
/usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.3.0)\n", - "Requirement already satisfied: six~=1.15.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.15.0)\n", "Requirement already satisfied: typing-extensions~=3.7.4 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.7.4.3)\n", "Requirement already satisfied: flatbuffers~=1.12.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.12)\n", - "Requirement already satisfied: tensorflow-estimator<2.6.0,>=2.5.0rc0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.5.0)\n", - "Requirement already satisfied: wrapt~=1.12.1 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.12.1)\n", + "Requirement already satisfied: gast==0.4.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.4.0)\n", "Requirement already satisfied: tensorboard~=2.5 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.5.0)\n", - "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.12.0)\n", "Requirement already satisfied: grpcio~=1.34.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.34.1)\n", - "Requirement already satisfied: aiohttp>=3.3.1 in /usr/local/lib/python3.8/dist-packages (from aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (3.7.4.post0)\n", + "Requirement already satisfied: tensorflow-estimator<2.6.0,>=2.5.0rc0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.5.0)\n", + "Requirement already satisfied: six~=1.15.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.15.0)\n", + "Requirement already satisfied: opt-einsum~=3.3.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.3.0)\n", + "Requirement already satisfied: h5py~=3.1.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.1.0)\n", + "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.12.0)\n", + "Requirement already satisfied: termcolor~=1.1.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.1.0)\n", + "Requirement already satisfied: keras-nightly~=2.5.0.dev in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.5.0.dev2021032900)\n", + "Requirement already satisfied: wheel~=0.35 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.36.2)\n", + "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.8/dist-packages (from tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.17.3)\n", "Requirement already satisfied: botocore<1.20.107,>=1.20.106 in /usr/local/lib/python3.8/dist-packages (from aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (1.20.106)\n", "Requirement already satisfied: aioitertools>=0.5.1 in /usr/local/lib/python3.8/dist-packages (from aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (0.8.0)\n", - "Requirement already satisfied: chardet<5.0,>=2.0 in /usr/lib/python3/dist-packages 
(from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (3.0.4)\n", - "Requirement already satisfied: async-timeout<4.0,>=3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (3.0.1)\n", + "Requirement already satisfied: aiohttp>=3.3.1 in /usr/local/lib/python3.8/dist-packages (from aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (3.7.4.post0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (5.1.0)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (1.6.3)\n", + "Requirement already satisfied: async-timeout<4.0,>=3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (3.0.1)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (21.2.0)\n", - "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/local/lib/python3.8/dist-packages (from botocore<1.20.107,>=1.20.106->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (1.26.6)\n", + "Requirement already satisfied: chardet<5.0,>=2.0 in /usr/lib/python3/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (3.0.4)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp>=3.3.1->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (1.6.3)\n", "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.8/dist-packages/jmespath-0.10.0-py3.8.egg (from botocore<1.20.107,>=1.20.106->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (0.10.0)\n", - "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.26.0)\n", + "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/local/lib/python3.8/dist-packages (from botocore<1.20.107,>=1.20.106->aiobotocore>=1.0.1->s3fs==2021.6.0->pandas-tfrecords==0.1.5) (1.26.6)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.8.0)\n", "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.6.1)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.26.0)\n", "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.33.1)\n", - "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.0.1)\n", "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (57.4.0)\n", - "Requirement already satisfied: 
tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.8.0)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.0.1)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.3.4)\n", "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.4.4)\n", - "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.8/dist-packages (from tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.3.4)\n" + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.2.8)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (4.7.2)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (4.2.2)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.8/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.3.0)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.2.8)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (4.7.2)\n", - "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (4.2.2)\n", - "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.8/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (1.3.0)\n", "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.8/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (0.4.8)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2021.5.30)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2.0.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (2021.5.30)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.2)\n", 
"Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.8/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5.0->pandas-tfrecords==0.1.5) (3.1.1)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", @@ -306,107 +306,107 @@ " \n", " \n", " 0\n", - " 0.495186\n", - " -0.156929\n", - " 0.924507\n", - " 0.831834\n", - " -0.562554\n", + " -0.121288\n", + " -0.357718\n", + " -0.431527\n", + " -0.627783\n", + " 0.191935\n", + " 6\n", + " 3\n", " 9\n", " 0\n", - " 5\n", - " 8\n", - " 9\n", - " 5\n", - " 3\n", - " [-0.08782642389819229, 0.8463009855676584, -0....\n", - " [0.7467091963784316, 0.6435026054221511, -0.14...\n", - " [1, 6, 3]\n", - " [4, 7, 9, 6, 4, 7, 6, 6, 1, 0, 2, 7, 6, 9]\n", - " [8, 6, 7, 4, 1, 2, 2]\n", + " 4\n", + " 7\n", + " 0\n", + " [0.1692354035839727, -0.46975096709111264, 0.3...\n", + " [-0.019300394422056444, -0.8409590858237292, 0...\n", + " []\n", + " []\n", + " [1, 4, 7, 7, 2, 6, 7, 9, 7, 1, 9, 8, 9]\n", " 0\n", " \n", " \n", " 1\n", - " -0.868427\n", - " -0.867583\n", - " 0.225373\n", - " 0.649436\n", - " 0.544803\n", - " 4\n", - " 5\n", - " 3\n", + " -0.886083\n", + " -0.689626\n", + " -0.799476\n", + " -0.756402\n", + " -0.530262\n", + " 7\n", + " 6\n", " 1\n", - " 9\n", " 4\n", - " 6\n", - " [0.038860353982820284, 0.19487974734607683, -0...\n", - " [0.23944333485883984, -0.9628970811058808, -0....\n", - " [4, 1, 2]\n", - " [7, 5, 5, 8, 9, 0, 4, 2, 9, 0, 5]\n", - " [5, 5, 7]\n", " 1\n", + " 2\n", + " 6\n", + " [-0.8062752683244594, -0.4870759806510412, -0....\n", + " [0.6996719923900598, -0.8914946563343444, -0.8...\n", + " [0, 0, 0, 5, 0, 9, 8, 6, 3, 1, 2, 3, 7, 5, 8, 5]\n", + " [6, 7, 9, 1, 1, 0, 2, 7, 8, 9, 4, 0, 9, 2, 7, ...\n", + " [3, 5, 2, 3, 9, 2]\n", + " 0\n", " \n", " \n", " 2\n", - " 0.972268\n", - " -0.650685\n", - " -0.689674\n", - " -0.780836\n", - " 0.677578\n", + " 0.635865\n", + " -0.678903\n", + " -0.753119\n", + " 0.295134\n", + " 0.006677\n", + " 8\n", " 1\n", - " 4\n", + " 5\n", + " 9\n", " 0\n", + " 4\n", " 3\n", + " [0.3535400387801406, 0.04545628080492481, 0.66...\n", + " [-0.002535584894796994, -0.46898774509715535, ...\n", + " [6, 4, 9, 7, 4, 0, 7]\n", + " [2, 8, 9]\n", + " [4]\n", " 0\n", - " 3\n", - " 4\n", - " [0.48108993016978885, -0.49063530543434286, 0....\n", - " [0.9539496388151523]\n", - " []\n", - " [6, 5, 6, 2, 1, 2, 0, 1, 1, 7, 7, 8, 4, 1]\n", - " [3, 1, 4, 0, 2, 8, 5, 3, 8, 4, 7, 7, 7, 5, 7]\n", - " 1\n", " \n", " \n", " 3\n", - " 0.982108\n", - " 0.806546\n", - " 0.150100\n", - " 0.488589\n", - " 0.353447\n", - " 3\n", + " 0.274878\n", + " 0.534065\n", + " 0.766480\n", + " -0.117808\n", + " 0.939361\n", + " 2\n", + " 7\n", " 8\n", - " 6\n", - " 1\n", - " 4\n", - " 1\n", - " 3\n", - " [-0.3854225587686282, 0.5811189366242433, 0.08...\n", - " [0.8969990392704366, -0.958170926962973, 0.622...\n", - " [6, 2, 5, 5, 9, 9, 8, 2, 6, 4, 5, 4]\n", - " [8]\n", - " [6, 6, 8, 4, 0, 8]\n", + " 7\n", + " 8\n", + " 8\n", + " 7\n", + " [-0.7778180828602796, -0.7450791973905229, 0.7...\n", + " [-0.28844878640240035, -0.2848391337111471, 0....\n", + " [0, 3, 3, 8, 0, 5, 6, 0, 6, 1, 1, 7, 6, 8, 8]\n", + " [3, 5, 0, 1, 7, 9, 9, 1]\n", + " [7, 5, 7, 8, 8, 0, 9, 0, 3, 9, 0, 0, 0, 2, 0, ...\n", " 1\n", " \n", " \n", " 4\n", - " -0.044900\n", - " 0.653093\n", - " 0.775169\n", - " 
0.230217\n", - " -0.280215\n", - " 6\n", + " -0.519931\n", + " -0.692767\n", + " 0.405410\n", + " 0.555309\n", + " 0.494168\n", " 9\n", + " 4\n", " 3\n", - " 7\n", - " 9\n", - " 5\n", + " 2\n", " 9\n", + " 0\n", + " 0\n", + " [0.9219042955449821, -0.2829414261782328, 0.74...\n", + " [-0.31399014027475736, 0.20048294938775935, -0...\n", " []\n", - " [0.9450709782415434, -0.07168300021759921]\n", - " [2]\n", - " [7, 2, 3, 0, 6, 8, 4, 8, 5, 1, 9, 5, 8, 6, 6, 9]\n", - " [3, 2, 8, 0, 6, 2, 3, 2, 0, 9, 1, 7]\n", + " [6, 7, 4, 1, 4, 4, 1, 5]\n", + " [2, 2, 7, 6, 0, 1, 0, 9, 8, 8, 7, 7]\n", " 0\n", " \n", " \n", @@ -415,46 +415,46 @@ ], "text/plain": [ " cont0 cont1 cont2 cont3 cont4 cat0 cat1 cat2 cat3 \\\n", - "0 0.495186 -0.156929 0.924507 0.831834 -0.562554 9 0 5 8 \n", - "1 -0.868427 -0.867583 0.225373 0.649436 0.544803 4 5 3 1 \n", - "2 0.972268 -0.650685 -0.689674 -0.780836 0.677578 1 4 0 3 \n", - "3 0.982108 0.806546 0.150100 0.488589 0.353447 3 8 6 1 \n", - "4 -0.044900 0.653093 0.775169 0.230217 -0.280215 6 9 3 7 \n", + "0 -0.121288 -0.357718 -0.431527 -0.627783 0.191935 6 3 9 0 \n", + "1 -0.886083 -0.689626 -0.799476 -0.756402 -0.530262 7 6 1 4 \n", + "2 0.635865 -0.678903 -0.753119 0.295134 0.006677 8 1 5 9 \n", + "3 0.274878 0.534065 0.766480 -0.117808 0.939361 2 7 8 7 \n", + "4 -0.519931 -0.692767 0.405410 0.555309 0.494168 9 4 3 2 \n", "\n", " cat4 cat5 cat6 cont_list0 \\\n", - "0 9 5 3 [-0.08782642389819229, 0.8463009855676584, -0.... \n", - "1 9 4 6 [0.038860353982820284, 0.19487974734607683, -0... \n", - "2 0 3 4 [0.48108993016978885, -0.49063530543434286, 0.... \n", - "3 4 1 3 [-0.3854225587686282, 0.5811189366242433, 0.08... \n", - "4 9 5 9 [] \n", + "0 4 7 0 [0.1692354035839727, -0.46975096709111264, 0.3... \n", + "1 1 2 6 [-0.8062752683244594, -0.4870759806510412, -0.... \n", + "2 0 4 3 [0.3535400387801406, 0.04545628080492481, 0.66... \n", + "3 8 8 7 [-0.7778180828602796, -0.7450791973905229, 0.7... \n", + "4 9 0 0 [0.9219042955449821, -0.2829414261782328, 0.74... \n", "\n", " cont_list1 \\\n", - "0 [0.7467091963784316, 0.6435026054221511, -0.14... \n", - "1 [0.23944333485883984, -0.9628970811058808, -0.... \n", - "2 [0.9539496388151523] \n", - "3 [0.8969990392704366, -0.958170926962973, 0.622... \n", - "4 [0.9450709782415434, -0.07168300021759921] \n", + "0 [-0.019300394422056444, -0.8409590858237292, 0... \n", + "1 [0.6996719923900598, -0.8914946563343444, -0.8... \n", + "2 [-0.002535584894796994, -0.46898774509715535, ... \n", + "3 [-0.28844878640240035, -0.2848391337111471, 0.... \n", + "4 [-0.31399014027475736, 0.20048294938775935, -0... \n", "\n", - " cat_list0 \\\n", - "0 [1, 6, 3] \n", - "1 [4, 1, 2] \n", - "2 [] \n", - "3 [6, 2, 5, 5, 9, 9, 8, 2, 6, 4, 5, 4] \n", - "4 [2] \n", + " cat_list0 \\\n", + "0 [] \n", + "1 [0, 0, 0, 5, 0, 9, 8, 6, 3, 1, 2, 3, 7, 5, 8, 5] \n", + "2 [6, 4, 9, 7, 4, 0, 7] \n", + "3 [0, 3, 3, 8, 0, 5, 6, 0, 6, 1, 1, 7, 6, 8, 8] \n", + "4 [] \n", "\n", - " cat_list1 \\\n", - "0 [4, 7, 9, 6, 4, 7, 6, 6, 1, 0, 2, 7, 6, 9] \n", - "1 [7, 5, 5, 8, 9, 0, 4, 2, 9, 0, 5] \n", - "2 [6, 5, 6, 2, 1, 2, 0, 1, 1, 7, 7, 8, 4, 1] \n", - "3 [8] \n", - "4 [7, 2, 3, 0, 6, 8, 4, 8, 5, 1, 9, 5, 8, 6, 6, 9] \n", + " cat_list1 \\\n", + "0 [] \n", + "1 [6, 7, 9, 1, 1, 0, 2, 7, 8, 9, 4, 0, 9, 2, 7, ... 
\n", + "2 [2, 8, 9] \n", + "3 [3, 5, 0, 1, 7, 9, 9, 1] \n", + "4 [6, 7, 4, 1, 4, 4, 1, 5] \n", "\n", - " cat_list2 label \n", - "0 [8, 6, 7, 4, 1, 2, 2] 0 \n", - "1 [5, 5, 7] 1 \n", - "2 [3, 1, 4, 0, 2, 8, 5, 3, 8, 4, 7, 7, 7, 5, 7] 1 \n", - "3 [6, 6, 8, 4, 0, 8] 1 \n", - "4 [3, 2, 8, 0, 6, 2, 3, 2, 0, 9, 1, 7] 0 " + " cat_list2 label \n", + "0 [1, 4, 7, 7, 2, 6, 7, 9, 7, 1, 9, 8, 9] 0 \n", + "1 [3, 5, 2, 3, 9, 2] 0 \n", + "2 [4] 0 \n", + "3 [7, 5, 7, 8, 8, 0, 9, 0, 3, 9, 0, 0, 0, 2, 0, ... 1 \n", + "4 [2, 2, 7, 6, 0, 1, 0, 9, 8, 8, 7, 7] 0 " ] }, "execution_count": 7, @@ -506,7 +506,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-09-10 18:30:07.714270: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n" + "2021-09-14 13:44:22.062009: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n" ] } ], @@ -766,24 +766,6 @@ { "cell_type": "code", "execution_count": 18, - "id": "96eb5b32", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "rm: cannot remove '/raid/tfrecord-test/00000.parquet': No such file or directory\r\n" - ] - } - ], - "source": [ - "!rm -r /raid/tfrecord-test/00000.parquet" - ] - }, - { - "cell_type": "code", - "execution_count": 19, "id": "854f2aa3", "metadata": {}, "outputs": [ @@ -791,33 +773,33 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-09-10 18:30:22.756211: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", - "2021-09-10 18:30:22.758160: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "2021-09-14 13:44:35.832870: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2021-09-14 13:44:35.834890: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", "pciBusID: 0000:0b:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0\n", "coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s\n", - "2021-09-10 18:30:22.758196: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", - "2021-09-10 18:30:22.758265: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", - "2021-09-10 18:30:22.758308: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", - "2021-09-10 18:30:22.758350: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", - "2021-09-10 18:30:22.758389: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", - "2021-09-10 18:30:22.758446: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", - "2021-09-10 18:30:22.758487: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", - "2021-09-10 18:30:22.758533: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", - "2021-09-10 18:30:22.762003: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", - 
"2021-09-10 18:30:22.763646: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "2021-09-14 13:44:35.834919: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2021-09-14 13:44:35.834964: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2021-09-14 13:44:35.834998: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2021-09-14 13:44:35.835031: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2021-09-14 13:44:35.835064: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2021-09-14 13:44:35.835111: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2021-09-14 13:44:35.835144: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2021-09-14 13:44:35.835180: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2021-09-14 13:44:35.838915: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2021-09-14 13:44:35.839664: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2021-09-10 18:30:22.773431: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "2021-09-14 13:44:35.848336: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", "pciBusID: 0000:0b:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0\n", "coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s\n", - "2021-09-10 18:30:22.776783: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", - "2021-09-10 18:30:22.776834: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", - "2021-09-10 18:30:24.163702: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", - "2021-09-10 18:30:24.163757: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", - "2021-09-10 18:30:24.163767: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", - "2021-09-10 18:30:24.168933: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 26826 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:0b:00.0, compute capability: 7.0)\n", - "2021-09-10 18:30:24.235412: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", - "2021-09-10 18:30:24.257477: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 
2194940000 Hz\n", - "20000it [00:12, 1593.04it/s]\n" + "2021-09-14 13:44:35.852172: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2021-09-14 13:44:35.852216: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2021-09-14 13:44:37.236098: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2021-09-14 13:44:37.236140: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2021-09-14 13:44:37.236150: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2021-09-14 13:44:37.241900: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 30677 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:0b:00.0, compute capability: 7.0)\n", + "2021-09-14 13:44:37.301800: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "2021-09-14 13:44:37.305671: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2195025000 Hz\n", + "20000it [00:12, 1558.83it/s]\n" ] } ], @@ -845,7 +827,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "dab31264", "metadata": {}, "outputs": [ @@ -855,7 +837,7 @@ "['/raid/tfrecord-test/00000.parquet']" ] }, - "execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -875,7 +857,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "id": "0bd30a89", "metadata": {}, "outputs": [ @@ -923,107 +905,107 @@ " \n", " \n", " 0\n", + " 6\n", + " 3\n", " 9\n", " 0\n", - " 5\n", - " 8\n", - " 9\n", - " 5\n", - " 3\n", - " [1, 6, 3]\n", - " [4, 7, 9, 6, 4, 7, 6, 6, 1, 0, 2, 7, 6, 9]\n", - " [8, 6, 7, 4, 1, 2, 2]\n", - " 0.495186\n", - " -0.156929\n", - " 0.924507\n", - " 0.831834\n", - " -0.562554\n", - " [-0.08782642, 0.84630096, -0.34404242, -0.7675...\n", - " [0.74670917, 0.6435026, -0.14159574, -0.590101...\n", + " 4\n", + " 7\n", + " 0\n", + " []\n", + " []\n", + " [1, 4, 7, 7, 2, 6, 7, 9, 7, 1, 9, 8, 9]\n", + " -0.121288\n", + " -0.357718\n", + " -0.431527\n", + " -0.627783\n", + " 0.191935\n", + " [0.16923541, -0.46975097, 0.36240318, -0.05831...\n", + " [-0.019300394, -0.8409591, 0.6081534, 0.050789...\n", " 0\n", " \n", " \n", " 1\n", - " 4\n", - " 5\n", - " 3\n", + " 7\n", + " 6\n", " 1\n", - " 9\n", " 4\n", - " 6\n", - " [4, 1, 2]\n", - " [7, 5, 5, 8, 9, 0, 4, 2, 9, 0, 5]\n", - " [5, 5, 7]\n", - " -0.868427\n", - " -0.867583\n", - " 0.225373\n", - " 0.649436\n", - " 0.544803\n", - " [0.038860355, 0.19487974, -0.63031155, 0.36691...\n", - " [0.23944333, -0.96289706, -0.7723948, 0.347194...\n", " 1\n", + " 2\n", + " 6\n", + " [0, 0, 0, 5, 0, 9, 8, 6, 3, 1, 2, 3, 7, 5, 8, 5]\n", + " [6, 7, 9, 1, 1, 0, 2, 7, 8, 9, 4, 0, 9, 2, 7, ...\n", + " [3, 5, 2, 3, 9, 2]\n", + " -0.886083\n", + " -0.689626\n", + " -0.799476\n", + " -0.756402\n", + " -0.530262\n", + " [-0.80627525, -0.48707598, -0.6516318, 0.87470...\n", + " [0.699672, -0.89149463, -0.8134837, -0.9065274...\n", + " 0\n", " \n", " \n", " 2\n", + " 8\n", " 1\n", - " 4\n", + " 5\n", + " 9\n", " 0\n", + " 4\n", " 3\n", + " [6, 4, 9, 7, 4, 0, 7]\n", + " [2, 8, 9]\n", + " [4]\n", + " 0.635865\n", + " -0.678903\n", + " -0.753119\n", + " 0.295134\n", + " 0.006677\n", + " [0.35354003, 0.04545628, 0.6673933, 
0.4735813,...\n", + " [-0.0025355848, -0.46898773, -0.07290607, -0.8...\n", " 0\n", - " 3\n", - " 4\n", - " []\n", - " [6, 5, 6, 2, 1, 2, 0, 1, 1, 7, 7, 8, 4, 1]\n", - " [3, 1, 4, 0, 2, 8, 5, 3, 8, 4, 7, 7, 7, 5, 7]\n", - " 0.972268\n", - " -0.650685\n", - " -0.689674\n", - " -0.780836\n", - " 0.677578\n", - " [0.48108992, -0.4906353, 0.5207957, -0.5258586...\n", - " [0.95394963]\n", - " 1\n", " \n", " \n", " 3\n", - " 3\n", + " 2\n", + " 7\n", " 8\n", - " 6\n", - " 1\n", - " 4\n", - " 1\n", - " 3\n", - " [6, 2, 5, 5, 9, 9, 8, 2, 6, 4, 5, 4]\n", - " [8]\n", - " [6, 6, 8, 4, 0, 8]\n", - " 0.982108\n", - " 0.806546\n", - " 0.150100\n", - " 0.488589\n", - " 0.353447\n", - " [-0.38542256, 0.58111894, 0.08629591, -0.63986...\n", - " [0.89699906, -0.95817095, 0.62256795, 0.141688...\n", + " 7\n", + " 8\n", + " 8\n", + " 7\n", + " [0, 3, 3, 8, 0, 5, 6, 0, 6, 1, 1, 7, 6, 8, 8]\n", + " [3, 5, 0, 1, 7, 9, 9, 1]\n", + " [7, 5, 7, 8, 8, 0, 9, 0, 3, 9, 0, 0, 0, 2, 0, ...\n", + " 0.274878\n", + " 0.534065\n", + " 0.766480\n", + " -0.117808\n", + " 0.939361\n", + " [-0.7778181, -0.7450792, 0.7001909, -0.7610098...\n", + " [-0.28844878, -0.28483912, 0.18376812, 0.32782...\n", " 1\n", " \n", " \n", " 4\n", - " 6\n", " 9\n", + " 4\n", " 3\n", - " 7\n", - " 9\n", - " 5\n", + " 2\n", " 9\n", - " [2]\n", - " [7, 2, 3, 0, 6, 8, 4, 8, 5, 1, 9, 5, 8, 6, 6, 9]\n", - " [3, 2, 8, 0, 6, 2, 3, 2, 0, 9, 1, 7]\n", - " -0.044900\n", - " 0.653093\n", - " 0.775169\n", - " 0.230217\n", - " -0.280215\n", + " 0\n", + " 0\n", " []\n", - " [0.945071, -0.071683]\n", + " [6, 7, 4, 1, 4, 4, 1, 5]\n", + " [2, 2, 7, 6, 0, 1, 0, 9, 8, 8, 7, 7]\n", + " -0.519931\n", + " -0.692767\n", + " 0.405410\n", + " 0.555309\n", + " 0.494168\n", + " [0.92190427, -0.28294143, 0.7465968, 0.5406436...\n", + " [-0.31399015, 0.20048295, -0.8439063, -0.46556...\n", " 0\n", " \n", " \n", @@ -1032,70 +1014,68 @@ ], "text/plain": [ " cat0 cat1 cat2 cat3 cat4 cat5 cat6 \\\n", - "0 9 0 5 8 9 5 3 \n", - "1 4 5 3 1 9 4 6 \n", - "2 1 4 0 3 0 3 4 \n", - "3 3 8 6 1 4 1 3 \n", - "4 6 9 3 7 9 5 9 \n", + "0 6 3 9 0 4 7 0 \n", + "1 7 6 1 4 1 2 6 \n", + "2 8 1 5 9 0 4 3 \n", + "3 2 7 8 7 8 8 7 \n", + "4 9 4 3 2 9 0 0 \n", "\n", - " cat_list0 \\\n", - "0 [1, 6, 3] \n", - "1 [4, 1, 2] \n", - "2 [] \n", - "3 [6, 2, 5, 5, 9, 9, 8, 2, 6, 4, 5, 4] \n", - "4 [2] \n", + " cat_list0 \\\n", + "0 [] \n", + "1 [0, 0, 0, 5, 0, 9, 8, 6, 3, 1, 2, 3, 7, 5, 8, 5] \n", + "2 [6, 4, 9, 7, 4, 0, 7] \n", + "3 [0, 3, 3, 8, 0, 5, 6, 0, 6, 1, 1, 7, 6, 8, 8] \n", + "4 [] \n", "\n", - " cat_list1 \\\n", - "0 [4, 7, 9, 6, 4, 7, 6, 6, 1, 0, 2, 7, 6, 9] \n", - "1 [7, 5, 5, 8, 9, 0, 4, 2, 9, 0, 5] \n", - "2 [6, 5, 6, 2, 1, 2, 0, 1, 1, 7, 7, 8, 4, 1] \n", - "3 [8] \n", - "4 [7, 2, 3, 0, 6, 8, 4, 8, 5, 1, 9, 5, 8, 6, 6, 9] \n", + " cat_list1 \\\n", + "0 [] \n", + "1 [6, 7, 9, 1, 1, 0, 2, 7, 8, 9, 4, 0, 9, 2, 7, ... \n", + "2 [2, 8, 9] \n", + "3 [3, 5, 0, 1, 7, 9, 9, 1] \n", + "4 [6, 7, 4, 1, 4, 4, 1, 5] \n", "\n", - " cat_list2 cont0 cont1 \\\n", - "0 [8, 6, 7, 4, 1, 2, 2] 0.495186 -0.156929 \n", - "1 [5, 5, 7] -0.868427 -0.867583 \n", - "2 [3, 1, 4, 0, 2, 8, 5, 3, 8, 4, 7, 7, 7, 5, 7] 0.972268 -0.650685 \n", - "3 [6, 6, 8, 4, 0, 8] 0.982108 0.806546 \n", - "4 [3, 2, 8, 0, 6, 2, 3, 2, 0, 9, 1, 7] -0.044900 0.653093 \n", + " cat_list2 cont0 cont1 \\\n", + "0 [1, 4, 7, 7, 2, 6, 7, 9, 7, 1, 9, 8, 9] -0.121288 -0.357718 \n", + "1 [3, 5, 2, 3, 9, 2] -0.886083 -0.689626 \n", + "2 [4] 0.635865 -0.678903 \n", + "3 [7, 5, 7, 8, 8, 0, 9, 0, 3, 9, 0, 0, 0, 2, 0, ... 
0.274878 0.534065 \n", + "4 [2, 2, 7, 6, 0, 1, 0, 9, 8, 8, 7, 7] -0.519931 -0.692767 \n", "\n", " cont2 cont3 cont4 \\\n", - "0 0.924507 0.831834 -0.562554 \n", - "1 0.225373 0.649436 0.544803 \n", - "2 -0.689674 -0.780836 0.677578 \n", - "3 0.150100 0.488589 0.353447 \n", - "4 0.775169 0.230217 -0.280215 \n", + "0 -0.431527 -0.627783 0.191935 \n", + "1 -0.799476 -0.756402 -0.530262 \n", + "2 -0.753119 0.295134 0.006677 \n", + "3 0.766480 -0.117808 0.939361 \n", + "4 0.405410 0.555309 0.494168 \n", "\n", " cont_list0 \\\n", - "0 [-0.08782642, 0.84630096, -0.34404242, -0.7675... \n", - "1 [0.038860355, 0.19487974, -0.63031155, 0.36691... \n", - "2 [0.48108992, -0.4906353, 0.5207957, -0.5258586... \n", - "3 [-0.38542256, 0.58111894, 0.08629591, -0.63986... \n", - "4 [] \n", + "0 [0.16923541, -0.46975097, 0.36240318, -0.05831... \n", + "1 [-0.80627525, -0.48707598, -0.6516318, 0.87470... \n", + "2 [0.35354003, 0.04545628, 0.6673933, 0.4735813,... \n", + "3 [-0.7778181, -0.7450792, 0.7001909, -0.7610098... \n", + "4 [0.92190427, -0.28294143, 0.7465968, 0.5406436... \n", "\n", " cont_list1 label \n", - "0 [0.74670917, 0.6435026, -0.14159574, -0.590101... 0 \n", - "1 [0.23944333, -0.96289706, -0.7723948, 0.347194... 1 \n", - "2 [0.95394963] 1 \n", - "3 [0.89699906, -0.95817095, 0.62256795, 0.141688... 1 \n", - "4 [0.945071, -0.071683] 0 " + "0 [-0.019300394, -0.8409591, 0.6081534, 0.050789... 0 \n", + "1 [0.699672, -0.89149463, -0.8134837, -0.9065274... 0 \n", + "2 [-0.0025355848, -0.46898773, -0.07290607, -0.8... 0 \n", + "3 [-0.28844878, -0.28483912, 0.18376812, 0.32782... 1 \n", + "4 [-0.31399015, 0.20048295, -0.8439063, -0.46556... 0 " ] }, - "execution_count": 22, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import pandas as pd\n", - "\n", "df = pd.read_parquet(filenames[0])\n", "df.head()" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "id": "b2ce99e0", "metadata": {}, "outputs": [ @@ -1105,7 +1085,7 @@ "(20000, 18)" ] }, - "execution_count": 23, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } diff --git a/nvtabular/framework_utils/tensorflow/__init__.py b/nvtabular/framework_utils/tensorflow/__init__.py index 8c1e61cead3..bf3c16a024b 100644 --- a/nvtabular/framework_utils/tensorflow/__init__.py +++ b/nvtabular/framework_utils/tensorflow/__init__.py @@ -16,4 +16,3 @@ # flake8: noqa from .feature_column_utils import make_feature_column_workflow -from .tfrecords_to_parquet import convert_tfrecords_to_parquet diff --git a/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py b/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py index 48ca0d07c13..682f554c697 100644 --- a/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py +++ b/nvtabular/framework_utils/tensorflow/tfrecords_to_parquet.py @@ -44,8 +44,6 @@ def convert_tfrecords_to_parquet( individual columns in the output dataframe """ - # TODO: provide row_groupby_size parameter for parquet files - # TODO: optimize parquet files for file in filenames: dataset = tf.data.TFRecordDataset(file, compression_type=compression_type) @@ -75,7 +73,6 @@ def _detect_schema(dataset): def _to_parquet(tfrecords, file, output_dir, chunks, convert_lists): out = [] i = 0 - j = 0 w = ParquetWriter(output_dir + file.split("/")[-1].split(".")[0] + ".parquet") for tfrecord in tqdm(tfrecords): row = {key: val.numpy() for key, val in tfrecord.items()} @@ -88,7 +85,6 @@ def _to_parquet(tfrecords, file, output_dir, chunks, convert_lists): 
w.write_table(df) i = 0 out = [] - j += 1 del df gc.collect() if len(out) > 0: