Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved str.count() algorithm using the Knuth-Morris-Pratt algorithm #2530

Merged
merged 2 commits into from
Feb 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions integration_tests/test_str_attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,26 @@ def find():
assert s2.find("we") == -1
assert "".find("") == 0

# Checks str.count() on one fixed text for the patterns "ABC" (expected 4)
# and "AB" (expected 6). Each pattern is tried with the receiver given as a
# variable and as a string literal, and with the argument given as a variable
# and as a string literal, so all four call forms are covered.
def count():
s: str
sub: str
s = "ABC ABCDAB ABCDABCDABDE"
# Receiver is a variable; argument is a variable, then a literal.
sub = "ABC"
assert s.count(sub) == 4
assert s.count("ABC") == 4

sub = "AB"
assert s.count(sub) == 6
assert s.count("AB") == 6

# Receiver is a string literal; argument is a variable, then a literal.
# NOTE(review): presumably the literal/literal form is folded at compile
# time while the other forms call the runtime helper - confirm.
sub = "ABC"
assert "ABC ABCDAB ABCDABCDABDE".count(sub) == 4
assert "ABC ABCDAB ABCDABCDABDE".count("ABC") == 4

sub = "AB"
assert "ABC ABCDAB ABCDABCDABDE".count(sub) == 6
assert "ABC ABCDAB ABCDABCDABDE".count("AB") == 6


def startswith():
s: str
Expand Down Expand Up @@ -307,6 +327,7 @@ def check():
strip()
swapcase()
find()
count()
startswith()
endswith()
partition()
Expand Down
38 changes: 38 additions & 0 deletions src/libasr/asr_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -4242,6 +4242,44 @@ static inline int KMP_string_match(std::string &s_var, std::string &sub) {
return res;
}

// Counts the non-overlapping occurrences of `sub` in `s_var`, scanning from
// left to right with the Knuth-Morris-Pratt algorithm. This matches the
// semantics of Python's str.count(): "aaaa".count("aa") is 2, not 3.
//
// s_var: the text to search in.
// sub:   the pattern to search for.
// Returns the number of non-overlapping matches. An empty pattern matches
// before every character and once at the end, i.e. s_var.size() + 1 times.
static inline int KMP_string_match_count(std::string &s_var, std::string &sub) {
    int str_len = static_cast<int>(s_var.size());
    int sub_len = static_cast<int>(sub.size());
    if (sub_len == 0) {
        return str_len + 1;
    }

    // lps[k] is the length of the longest proper prefix of sub[0..k] that is
    // also a suffix of sub[0..k]; it tells the matcher how far to fall back
    // in the pattern after a mismatch without re-reading the text.
    std::vector<int> lps(sub_len, 0);
    for (int i = 1, len = 0; i < sub_len;) {
        if (sub[i] == sub[len]) {
            lps[i++] = ++len;
        } else if (len != 0) {
            len = lps[len - 1];
        } else {
            lps[i++] = 0;
        }
    }

    int count = 0;
    // The loop stops as soon as the remaining text is shorter than the part
    // of the pattern that is still unmatched, which also keeps `i` in range.
    for (int i = 0, j = 0; (str_len - i) >= (sub_len - j);) {
        if (sub[j] == s_var[i]) {
            j++;
            i++;
        }
        if (j == sub_len) {
            count++;
            // Restart the pattern from scratch so the next match cannot
            // reuse characters of this one (falling back to lps[j - 1]
            // here would count overlapping occurrences).
            j = 0;
        } else if (i < str_len && sub[j] != s_var[i]) {
            if (j != 0) {
                j = lps[j - 1];
            } else {
                i++;
            }
        }
    }
    return count;
}

static inline void visit_expr_list(Allocator &al, Vec<ASR::call_arg_t>& exprs,
Vec<ASR::expr_t*>& exprs_vec) {
LCOMPILERS_ASSERT(exprs_vec.reserve_called);
Expand Down
39 changes: 37 additions & 2 deletions src/lpython/semantics/python_ast_to_asr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6872,13 +6872,13 @@ class BodyVisitor : public CommonVisitor<BodyVisitor> {
}
} else if (attr_name == "find") {
if (args.size() != 1) {
throw SemanticError("str.find() takes one arguments",
throw SemanticError("str.find() takes one argument",
loc);
}
ASR::expr_t *arg = args[0].m_value;
ASR::ttype_t *type = ASRUtils::expr_type(arg);
if (!ASRUtils::is_character(*type)) {
throw SemanticError("str.find() takes one arguments of type: str",
throw SemanticError("str.find() takes one argument of type: str",
arg->base.loc);
}
if (ASRUtils::expr_value(arg) != nullptr) {
Expand All @@ -6905,6 +6905,41 @@ class BodyVisitor : public CommonVisitor<BodyVisitor> {
tmp = make_call_helper(al, fn_div, current_scope, args, "_lpython_str_find", loc);
}
return;
} else if (attr_name == "count") {
if (args.size() != 1) {
throw SemanticError("str.count() takes one argument",
loc);
}
ASR::expr_t *arg = args[0].m_value;
ASR::ttype_t *type = ASRUtils::expr_type(arg);
if (!ASRUtils::is_character(*type)) {
throw SemanticError("str.count() takes one argument of type: str",
arg->base.loc);
}
if (ASRUtils::expr_value(arg) != nullptr) {
ASR::StringConstant_t* sub_str_con = ASR::down_cast<ASR::StringConstant_t>(arg);
std::string sub = sub_str_con->m_s;
int res = ASRUtils::KMP_string_match_count(s_var, sub);
tmp = ASR::make_IntegerConstant_t(al, loc, res,
ASRUtils::TYPE(ASR::make_Integer_t(al,loc, 4)));
} else {
ASR::symbol_t *fn_div = resolve_intrinsic_function(loc, "_lpython_str_count");
Vec<ASR::call_arg_t> args;
args.reserve(al, 1);
ASR::call_arg_t str_arg;
str_arg.loc = loc;
ASR::ttype_t *str_type = ASRUtils::TYPE(ASR::make_Character_t(al, loc,
1, s_var.size(), nullptr));
str_arg.m_value = ASRUtils::EXPR(
ASR::make_StringConstant_t(al, loc, s2c(al, s_var), str_type));
ASR::call_arg_t sub_arg;
sub_arg.loc = loc;
sub_arg.m_value = arg;
args.push_back(al, str_arg);
args.push_back(al, sub_arg);
tmp = make_call_helper(al, fn_div, current_scope, args, "_lpython_str_count", loc);
}
return;
} else if (attr_name == "rstrip") {
if (args.size() != 0) {
throw SemanticError("str.rstrip() takes no arguments",
Expand Down
51 changes: 42 additions & 9 deletions src/runtime/lpython_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,17 +637,50 @@ def _lpython_str_capitalize(x: str) -> str:


@overload
def _lpython_str_count(s: str, sub: str) -> i32:
    # Returns the number of non-overlapping occurrences of `sub` in `s`,
    # scanning from left to right with the Knuth-Morris-Pratt algorithm.
    # An empty `sub` yields len(s) + 1, as CPython's str.count() does.
    s_len: i32
    sub_len: i32
    _len: i32
    count: i32
    i: i32
    j: i32
    lps: list[i32] = []
    s_len = len(s)
    sub_len = len(sub)

    if sub_len == 0:
        return s_len + 1

    count = 0

    # lps[k] is the length of the longest proper prefix of sub[0..k] that is
    # also a suffix of it; it is the fallback position after a mismatch.
    for i in range(sub_len):
        lps.append(0)

    i = 1
    _len = 0
    while i < sub_len:
        if sub[i] == sub[_len]:
            _len += 1
            lps[i] = _len
            i += 1
        else:
            if _len != 0:
                _len = lps[_len - 1]
            else:
                lps[i] = 0
                i += 1

    j = 0
    i = 0
    # Stop once the remaining text is shorter than the unmatched part of the
    # pattern; this also keeps `i` a valid index into `s`.
    while (s_len - i) >= (sub_len - j):
        if sub[j] == s[i]:
            i += 1
            j += 1
        if j == sub_len:
            count += 1
            # Restart the pattern so matches never share characters;
            # falling back to lps[j - 1] would count overlapping matches.
            j = 0
        elif i < s_len and sub[j] != s[i]:
            if j != 0:
                j = lps[j - 1]
            else:
                i = i + 1

    return count

Expand Down
Loading