-
Notifications
You must be signed in to change notification settings - Fork 891
/
char_types.hpp
185 lines (174 loc) · 6.86 KB
/
char_types.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
namespace cudf {
namespace strings {
/**
* @addtogroup strings_types
* @{
*/
/**
 * @brief Character type values.
 * These types can be or'd to check for any combination of types.
 *
 * This cannot be turned into an enum class because or'd entries can
 * result in values that are not in the class. For example,
 * combining NUMERIC|SPACE is a valid, reasonable combination but
 * does not match to any explicitly named enumerator.
 */
enum string_character_types : uint32_t {
  DECIMAL    = 1 << 0,                             ///< all decimal characters
  NUMERIC    = 1 << 1,                             ///< all numeric characters
  DIGIT      = 1 << 2,                             ///< all digit characters
  ALPHA      = 1 << 3,                             ///< all alphabetic characters
  SPACE      = 1 << 4,                             ///< all space characters
  UPPER      = 1 << 5,                             ///< all upper case characters
  LOWER      = 1 << 6,                             ///< all lower case characters
  ALPHANUM   = DECIMAL | NUMERIC | DIGIT | ALPHA,  ///< all alphanumeric characters
  CASE_TYPES = UPPER | LOWER,                      ///< all case-able characters
  ALL_TYPES  = ALPHANUM | CASE_TYPES | SPACE       ///< all character types
};

/**
 * @brief OR operator for combining `string_character_types` values.
 *
 * Declared `constexpr` (and therefore implicitly `inline`) because it is defined
 * in a header: a plain non-inline definition would be emitted by every translation
 * unit that includes this file and break the one-definition rule at link time.
 *
 * @param lhs Left-hand set of character types.
 * @param rhs Right-hand set of character types.
 * @return Value whose bits are the union of the bits set in `lhs` and `rhs`.
 */
constexpr string_character_types operator|(string_character_types lhs, string_character_types rhs)
{
  using flag_bits = std::underlying_type_t<string_character_types>;
  return static_cast<string_character_types>(static_cast<flag_bits>(lhs) |
                                             static_cast<flag_bits>(rhs));
}

/**
 * @brief Compound OR-assignment operator for `string_character_types` values.
 *
 * Declared `constexpr` (implicitly `inline`) for the same header-definition
 * reason as `operator|`.
 *
 * @param lhs Set of character types updated in place with the bits of `rhs`.
 * @param rhs Set of character types to add to `lhs`.
 * @return Reference to `lhs` after the update.
 */
constexpr string_character_types& operator|=(string_character_types& lhs,
                                             string_character_types rhs)
{
  using flag_bits = std::underlying_type_t<string_character_types>;
  lhs = static_cast<string_character_types>(static_cast<flag_bits>(lhs) |
                                            static_cast<flag_bits>(rhs));
  return lhs;
}
/**
* @brief Returns a boolean column identifying string entries in which all
* characters are of the type specified.
*
* The output row entry will be set to false if the corresponding string element
* is empty or has at least one character not of the specified type. If all
* characters fit the type then true is set in that output row entry.
*
* To ignore all but specific types, set `verify_types` to those types
* which should be checked; characters of any other type are then ignored.
* Otherwise, the default `ALL_TYPES` will verify that all
* characters match `types`.
*
* @code{.pseudo}
* Example:
* s = ['ab', 'a b', 'a7', 'a B']
* b1 = all_characters_of_type(s, LOWER)
* b1 is [true, false, false, false]
* b2 = all_characters_of_type(s, LOWER, LOWER|UPPER)
* b2 is [true, true, true, false]
* @endcode
*
* In `b2` only upper and lower case characters are verified, so the space in
* 'a b' and the digit in 'a7' are ignored, while the 'B' in 'a B' is checked
* and is not LOWER.
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param types The character types to check in each string.
* @param verify_types Only verify against these character types.
* Default `ALL_TYPES` means return `true`
* iff all characters match `types`.
* @param mr Resource for allocating device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> all_characters_of_type(
strings_column_view const& strings,
string_character_types types,
string_character_types verify_types = string_character_types::ALL_TYPES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to integers.
*
* The output row entry will be set to `true` if the corresponding string element
* is not empty and all of its characters are in [-+0-9].
*
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'A', '+7']
* b = is_integer(s)
* b is [true, true, false, false, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Resource for allocating device memory.
* @return New column of boolean results for each string.
*/
// NOTE(review): the earlier wording said "has at least one character in [-+0-9]",
// which contradicts the summary above ("all characters"). Whether a sign is
// accepted only in the leading position is not visible from this header --
// confirm against the implementation.
std::unique_ptr<column> is_integer(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
/**
* @brief Returns `true` if all strings contain only
* characters that are valid for conversion to integers.
*
* This function will return `true` if every string element
* is not empty and all of its characters are in [-+0-9].
*
* Any null entry or empty string will cause this function to return `false`.
*
* @param strings Strings instance for this operation.
* @param mr Resource for allocating device memory.
* @return `true` if all strings are valid.
*/
// NOTE(review): the earlier wording said each element "has at least one character
// in [-+0-9]"; presumably the per-string check is the same one `is_integer`
// applies -- confirm against the implementation.
bool all_integer(strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to floats.
*
* The output row entry will be set to `true` if the corresponding string element
* is not empty and all of its characters are valid for float conversion,
* i.e. are in [-+0-9eE.].
*
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'A', '+7', '8.9', '3.7e+5']
* b = is_float(s)
* b is [true, true, false, false, true, true, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Resource for allocating device memory.
* @return New column of boolean results for each string.
*/
// NOTE(review): the earlier wording said "has at least one character in
// [-+0-9eE.]", which contradicts the summary above ("all characters"). Positional
// rules (sign, exponent, decimal point) and any special spellings accepted are
// not visible from this header -- confirm against the implementation.
std::unique_ptr<column> is_float(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
/**
* @brief Returns `true` if all strings contain only
* characters that are valid for conversion to floats.
*
* This function will return `true` if every string element
* is not empty and all of its characters are valid for float conversion,
* i.e. are in [-+0-9eE.].
*
* Any null entry or empty string will cause this function to return `false`.
*
* @param strings Strings instance for this operation.
* @param mr Resource for allocating device memory.
* @return `true` if all strings are valid.
*/
// NOTE(review): the earlier wording said each element "has at least one character
// in [-+0-9eE.]"; presumably the per-string check is the same one `is_float`
// applies -- confirm against the implementation.
bool all_float(strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf