Skip to content

Commit

Permalink
scoring for multi-char repeats + tests
Browse files Browse the repository at this point in the history
  • Loading branch information
lowe committed Sep 14, 2015
1 parent bff60d5 commit ceaac0c
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 19 deletions.
15 changes: 12 additions & 3 deletions src/matching.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -312,20 +312,29 @@ matching =
# aabaab in aabaabaabaab.
# run an anchored lazy match on greedy's repeated string
# to find the shortest repeated string
repeated_string = lazy_anchored.exec(match[0])[1]
base_token = lazy_anchored.exec(match[0])[1]
else
# lazy beats greedy for 'aaaaa'
# greedy: [aaaa, aa]
# lazy: [aaaaa, a]
match = lazy_match
repeated_string = match[1]
base_token = match[1]
[i, j] = [match.index, match.index + match[0].length - 1]
# recursively match and score the base string
base_analysis = scoring.minimum_entropy_match_sequence(
base_token
@omnimatch base_token
)
base_matches = base_analysis.match_sequence
base_entropy = base_analysis.entropy
matches.push
pattern: 'repeat'
i: i
j: j
token: match[0]
repeated_string: repeated_string
base_token: base_token
base_entropy: base_entropy
base_matches: base_matches
lastIndex = j + 1
matches

Expand Down
4 changes: 2 additions & 2 deletions src/scoring.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ scoring =
match.entropy = entropy_functions[match.pattern].call this, match

# Entropy estimate for a match whose pattern is 'repeat'.
# NOTE(review): this is a flattened diff hunk ("2 additions & 2 deletions") with
# the +/- markers lost, so two alternative bodies appear back to back below.
repeat_entropy: (match) ->
# Body 1 (presumably the removed lines): lg of the token's brute-force
# cardinality multiplied by the token length.
cardinality = @calc_bruteforce_cardinality match.token
@lg (cardinality * match.token.length)
# Body 2 (presumably the added lines): entropy already computed for the base
# token, plus lg of how many times the base token fits in the full token.
num_repeats = match.token.length / match.base_token.length
match.base_entropy + @lg num_repeats

sequence_entropy: (match) ->
first_chr = match.token.charAt(0)
Expand Down
14 changes: 7 additions & 7 deletions test/test-matching.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -367,45 +367,45 @@ test 'repeat matching', (t) ->
matches = matching.repeat_match password
msg = "matches embedded repeat patterns"
check_matches msg, t, matches, 'repeat', [pattern], [[i, j]],
repeated_string: ['&']
base_token: ['&']

for length in [3, 12]
for chr in ['a', 'Z', '4', '&']
pattern = Array(length + 1).join(chr)
matches = matching.repeat_match pattern
msg = "matches repeats with base character '#{chr}'"
check_matches msg, t, matches, 'repeat', [pattern], [[0, pattern.length - 1]],
repeated_string: [chr]
base_token: [chr]

matches = matching.repeat_match 'BBB1111aaaaa@@@@@@'
patterns = ['BBB','1111','aaaaa','@@@@@@']
msg = 'matches multiple adjacent repeats'
check_matches msg, t, matches, 'repeat', patterns, [[0, 2],[3, 6],[7, 11],[12, 17]],
repeated_string: ['B', '1', 'a', '@']
base_token: ['B', '1', 'a', '@']

matches = matching.repeat_match '2818BBBbzsdf1111@*&@!aaaaaEUDA@@@@@@1729'
msg = 'matches multiple repeats with non-repeats in-between'
check_matches msg, t, matches, 'repeat', patterns, [[4, 6],[12, 15],[21, 25],[30, 35]],
repeated_string: ['B', '1', 'a', '@']
base_token: ['B', '1', 'a', '@']

# test multi-character repeats
pattern = 'abab'
matches = matching.repeat_match pattern
msg = 'matches multi-character repeat pattern'
check_matches msg, t, matches, 'repeat', [pattern], [[0, pattern.length - 1]],
repeated_string: ['ab']
base_token: ['ab']

pattern = 'aabaab'
matches = matching.repeat_match pattern
msg = 'matches aabaab as a repeat instead of the aa prefix'
check_matches msg, t, matches, 'repeat', [pattern], [[0, pattern.length - 1]],
repeated_string: ['aab']
base_token: ['aab']

pattern = 'abababab'
matches = matching.repeat_match pattern
msg = 'identifies ab as repeat string, even though abab is also repeated'
check_matches msg, t, matches, 'repeat', [pattern], [[0, pattern.length - 1]],
repeated_string: ['ab']
base_token: ['ab']
t.end()


Expand Down
24 changes: 17 additions & 7 deletions test/test-scoring.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -194,14 +194,24 @@ test 'calc_entropy', (t) ->
t.end()

# Checks scoring.repeat_entropy against an expected value for several tokens.
# NOTE(review): flattened diff hunk ("17 additions & 7 deletions") with the +/-
# markers lost; two versions of the case table and loop body are interleaved.
test 'repeat entropy', (t) ->
# Table version 1: each case is [token, hard-coded expected entropy].
for [token, entropy] in [
[ 'aa', lg(26 * 2) ]
[ '999', lg(10 * 3) ]
[ '$$$$', lg(33 * 4) ]
# Table version 2: each case is [token, base token that the token repeats].
for [token, base_token] in [
[ 'aa', 'a' ]
[ '999', '9' ]
[ '$$$$', '$' ]
[ 'abab', 'ab']
[ 'batterystaplebatterystaplebatterystaple', 'batterystaple']
]
# Loop body for table version 1: the match carries only the token and the
# result is compared with the hard-coded entropy.
match = token: token
msg = "the repeat pattern '#{token}' has entropy of #{entropy}"
t.equal scoring.repeat_entropy(match), entropy, msg
# Loop body for table version 2: score the base token on its own via
# omnimatch + minimum_entropy_match_sequence and keep its entropy.
base_entropy = scoring.minimum_entropy_match_sequence(
base_token
matching.omnimatch(base_token)
).entropy
# The match object supplies the three fields this version of the test sets.
match =
token: token
base_token: base_token
base_entropy: base_entropy
# Expected value: base entropy plus lg of the repeat count.
expected_entropy = base_entropy + lg(match.token.length / match.base_token.length)
msg = "the repeat pattern '#{token}' has entropy of #{expected_entropy}"
t.equal scoring.repeat_entropy(match), expected_entropy, msg
t.end()

test 'sequence entropy', (t) ->
Expand Down

0 comments on commit ceaac0c

Please sign in to comment.