Skip to content

Commit

Permalink
Add numeric_finite_median_approx to mlio insights
Browse files Browse the repository at this point in the history
  • Loading branch information
wiltonwu committed Sep 23, 2020
1 parent a1a923e commit e6a137f
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/mlio-py/mlio/contrib/insights/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ Each column in this Example will print a dictionary like the following:
'numeric_finite_mean': '5.863193',
'numeric_finite_min': '4.600000',
'numeric_finite_max': '7.700000',
'numeric_finite_median_approx': '6.0',
'example_value': '5.1',
'string_cardinality': 16,
'string_captured_unique_values': {'6.5': 5760, '6.4': 5760, '5.7': 5760, '6.1': 5760, '5': 5760, '5.6': 11520, '6.7': 5760, '4.6': 5760, '5.9': 5760, '7.7': 11520, '6.2': 5760, '5.8': 11520, '5.4': 5760, '4.7': 5760, '4.9': 5760, '5.1': 5755},
Expand All @@ -88,6 +89,7 @@ The following information on each column is available:
- `numeric_finite_mean`: the average of finite (non-infinite) numeric values seen.
- `numeric_finite_min`: the minimum finite numeric value seen.
- `numeric_finite_max`: the maximum finite numeric value seen.
- `numeric_finite_median_approx`: the approximate median, computed from a sample of up to 10000 of the finite numeric values seen.

**String Analysis**

Expand Down
6 changes: 6 additions & 0 deletions src/mlio-py/mlio/contrib/insights/column_analyzer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

namespace pymlio {

// Upper bound on the number of numeric values retained per column for the
// approximate median. Declared as std::size_t so that it compares against
// std::vector::size() without a signed/unsigned conversion.
static constexpr std::size_t MAX_SAMPLE_SIZE = 10000;

Column_analyzer::Column_analyzer(std::vector<Column_analysis> &columns,
const std::vector<std::string> &null_like_values,
const std::unordered_set<std::size_t> &capture_columns,
Expand Down Expand Up @@ -98,6 +100,10 @@ void Column_analyzer::analyze(const mlio::Example &example) const
numeric_column_sum += as_float;
numeric_column_count++;

if (stats.numeric_column_sample.size() < MAX_SAMPLE_SIZE) {
stats.numeric_column_sample.push_back(as_float);
}

if ((std::abs(std::round(as_float) - as_float) <= 1.0e-5)) {
stats.numeric_int_count++;
}
Expand Down
13 changes: 13 additions & 0 deletions src/mlio-py/mlio/contrib/insights/column_statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,18 @@ class Column_analysis {
return static_cast<std::size_t>(std::round(str_cardinality_estimator_.estimate()));
}

public:
/// Estimates the median of the sampled numeric values.
///
/// The estimate is computed over `numeric_column_sample` only, so it is
/// exact for the sampled values but only approximate for the column as a
/// whole whenever the column holds more values than were sampled.
///
/// Selection is done in place with std::nth_element, so this call changes
/// the order of the sample even though the method is const.
///
/// @return NaN if no values were sampled; the middle element for an
///         odd-sized sample; the mean of the two middle elements for an
///         even-sized sample.
double estimate_median_approx() const
{
    if (numeric_column_sample.empty()) {
        return std::nan("");
    }

    const std::size_t count = numeric_column_sample.size();
    const auto mid = numeric_column_sample.begin() +
                     static_cast<std::vector<double>::difference_type>(count / 2);

    // Partial selection: puts the upper-middle element at `mid`, with every
    // element before it no greater than it.
    std::nth_element(numeric_column_sample.begin(), mid, numeric_column_sample.end());
    const double upper = *mid;
    if (count % 2 != 0) {
        return upper;
    }

    // Even-sized sample: the lower-middle element is the largest value in the
    // left partition, which is non-empty because count >= 2 here.
    const double lower = *std::max_element(numeric_column_sample.begin(), mid);

    // Halve before adding so that two large finite values cannot overflow.
    return 0.5 * lower + 0.5 * upper;
}

public:
std::string column_name;

Expand All @@ -70,6 +82,7 @@ class Column_analysis {

private:
hll::HyperLogLog str_cardinality_estimator_;
// Sample of numeric values used by estimate_median_approx(). Declared
// `mutable` because that const member reorders it in place via
// std::nth_element.
mutable std::vector<double> numeric_column_sample{};
};

struct data_analysis {
Expand Down
1 change: 1 addition & 0 deletions src/mlio-py/mlio/contrib/insights/module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ PYBIND11_MODULE(insights, m)
result["string_captured_unique_values"] = self.str_captured_unique_values;
result["string_captured_unique_values_overflowed"] =
self.str_captured_unique_values_overflowed;
result["numeric_finite_median_approx"] = self.estimate_median_approx();

return result;
});
Expand Down

0 comments on commit e6a137f

Please sign in to comment.