scoring for multi-char repeats + tests

dropbox · Sep 14, 2015 · ceaac0c · ceaac0c
1 parent bff60d5
commit ceaac0c
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 19 deletions.
diff --git a/src/matching.coffee b/src/matching.coffee
@@ -312,20 +312,29 @@ matching =
         # aabaab in aabaabaabaab.
         # run an anchored lazy match on greedy's repeated string
         # to find the shortest repeated string
-        repeated_string = lazy_anchored.exec(match[0])[1]
+        base_token = lazy_anchored.exec(match[0])[1]
       else
         # lazy beats greedy for 'aaaaa'
         #   greedy: [aaaa,  aa]
         #   lazy:   [aaaaa, a]
         match = lazy_match
-        repeated_string = match[1]
+        base_token = match[1]
       [i, j] = [match.index, match.index + match[0].length - 1]
+      # recursively match and score the base string
+      base_analysis = scoring.minimum_entropy_match_sequence(
+        base_token
+        @omnimatch base_token
+      )
+      base_matches = base_analysis.match_sequence
+      base_entropy = base_analysis.entropy
       matches.push
         pattern: 'repeat'
         i: i
         j: j
         token: match[0]
-        repeated_string: repeated_string
+        base_token: base_token
+        base_entropy: base_entropy
+        base_matches: base_matches
       lastIndex = j + 1
     matches
 

diff --git a/src/scoring.coffee b/src/scoring.coffee
@@ -140,8 +140,8 @@ scoring =
     match.entropy = entropy_functions[match.pattern].call this, match
 
   repeat_entropy: (match) ->
-    cardinality = @calc_bruteforce_cardinality match.token
-    @lg (cardinality * match.token.length)
+    num_repeats = match.token.length / match.base_token.length
+    match.base_entropy + @lg num_repeats
 
   sequence_entropy: (match) ->
     first_chr = match.token.charAt(0)

diff --git a/test/test-matching.coffee b/test/test-matching.coffee
@@ -367,45 +367,45 @@ test 'repeat matching', (t) ->
     matches = matching.repeat_match password
     msg = "matches embedded repeat patterns"
     check_matches msg, t, matches, 'repeat', [pattern], [[i, j]],
-      repeated_string: ['&']
+      base_token: ['&']
 
   for length in [3, 12]
     for chr in ['a', 'Z', '4', '&']
       pattern = Array(length + 1).join(chr)
       matches = matching.repeat_match pattern
       msg = "matches repeats with base character '#{chr}'"
       check_matches msg, t, matches, 'repeat', [pattern], [[0, pattern.length - 1]],
-        repeated_string: [chr]
+        base_token: [chr]
 
   matches = matching.repeat_match 'BBB1111aaaaa@@@@@@'
   patterns = ['BBB','1111','aaaaa','@@@@@@']
   msg = 'matches multiple adjacent repeats'
   check_matches msg, t, matches, 'repeat', patterns, [[0, 2],[3, 6],[7, 11],[12, 17]],
-    repeated_string: ['B', '1', 'a', '@']
+    base_token: ['B', '1', 'a', '@']
 
   matches = matching.repeat_match '2818BBBbzsdf1111@*&@!aaaaaEUDA@@@@@@1729'
   msg = 'matches multiple repeats with non-repeats in-between'
   check_matches msg, t, matches, 'repeat', patterns, [[4, 6],[12, 15],[21, 25],[30, 35]],
-    repeated_string: ['B', '1', 'a', '@']
+    base_token: ['B', '1', 'a', '@']
 
   # test multi-character repeats
   pattern = 'abab'
   matches = matching.repeat_match pattern
   msg = 'matches multi-character repeat pattern'
   check_matches msg, t, matches, 'repeat', [pattern], [[0, pattern.length - 1]],
-    repeated_string: ['ab']
+    base_token: ['ab']
 
   pattern = 'aabaab'
   matches = matching.repeat_match pattern
   msg = 'matches aabaab as a repeat instead of the aa prefix'
   check_matches msg, t, matches, 'repeat', [pattern], [[0, pattern.length - 1]],
-    repeated_string: ['aab']
+    base_token: ['aab']
 
   pattern = 'abababab'
   matches = matching.repeat_match pattern
   msg = 'identifies ab as repeat string, even though abab is also repeated'
   check_matches msg, t, matches, 'repeat', [pattern], [[0, pattern.length - 1]],
-    repeated_string: ['ab']
+    base_token: ['ab']
   t.end()
 
 

diff --git a/test/test-scoring.coffee b/test/test-scoring.coffee
@@ -194,14 +194,24 @@ test 'calc_entropy', (t) ->
   t.end()
 
 test 'repeat entropy', (t) ->
-  for [token, entropy] in [
-    [ 'aa',   lg(26 * 2) ]
-    [ '999',  lg(10 * 3) ]
-    [ '$$$$', lg(33 * 4) ]
+  for [token, base_token] in [
+    [ 'aa',   'a' ]
+    [ '999',  '9' ]
+    [ '$$$$', '$' ]
+    [ 'abab', 'ab']
+    [ 'batterystaplebatterystaplebatterystaple', 'batterystaple']
     ]
-    match = token: token
-    msg = "the repeat pattern '#{token}' has entropy of #{entropy}"
-    t.equal scoring.repeat_entropy(match), entropy, msg
+    base_entropy = scoring.minimum_entropy_match_sequence(
+      base_token
+      matching.omnimatch(base_token)
+    ).entropy
+    match =
+      token: token
+      base_token: base_token
+      base_entropy: base_entropy
+    expected_entropy = base_entropy + lg(match.token.length / match.base_token.length)
+    msg = "the repeat pattern '#{token}' has entropy of #{expected_entropy}"
+    t.equal scoring.repeat_entropy(match), expected_entropy, msg
   t.end()
 
 test 'sequence entropy', (t) ->