Initial migration from parent, liblevenshtein project

universal-automata · Mar 29, 2014 · a149d5a · a149d5a
commit a149d5a
Show file tree

Hide file tree

Showing 19 changed files with 2,145 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,10 @@
+*.sw?
+wiki
+.gradle/*
+node_modules/*
+build/*
+lib/*
+coverage/*
+docs/*
+.sass-cache/*
+npm-debug.log
diff --git a/Cakefile b/Cakefile
@@ -0,0 +1,99 @@
+require 'coffee-script/register' #-> Register the .coffee extension
+wrench = require 'wrench'
+
+fs            = require 'fs'
+{print}       = require 'sys'
+{spawn, exec} = require 'child_process'
+
+# Compiles every CoffeeScript file under src/ into build/ and, once the
+# compiler exits successfully, concatenates the compiled files into the three
+# distributable bundles.
+#
+# watch    - optional; when true the compiler is started with `-w`.  It may be
+#            omitted, in which case the first argument is taken as `callback`.
+# callback - optional; invoked without arguments after all three bundles have
+#            been written.
+#
+# Throws an Error if the compiler exits with a non-zero status, or if a bundle
+# cannot be opened or closed.
+build = (watch, callback) ->
+  if typeof watch is 'function'
+    callback = watch
+    watch = false
+  options = ['-c', '-o', 'build', 'src']
+  options.unshift '-w' if watch
+
+  coffee = spawn "#{__dirname}/node_modules/coffee-script/bin/coffee", options
+  coffee.stdout.on 'data', (data) -> print data.toString()
+  coffee.stderr.on 'data', (data) -> print data.toString()
+  coffee.on 'exit', (status) ->
+    throw new Error("An unexpected error occurred") if status isnt 0
+
+    # Files written by this function.  They live in build/ next to the
+    # compiler output, so they must never be fed back into the full bundle.
+    bundle_paths = [
+      'build/liblevenshtein.js'
+      'build/levenshtein-transducer.js'
+      'build/levenshtein-distance.js'
+    ]
+
+    # Counts outstanding work; the callback fires when the count returns to 0.
+    countdown =
+      count: 0
+      increment: () -> ++ @count
+      decrement: () -> @callback() if 0 is (-- @count) and @callback
+      callback: callback
+
+    # Guard held while the concatenations are being scheduled, so that the
+    # callback cannot fire before every bundle has been registered.
+    countdown.increment()
+
+    # Writes the license banner followed by the contents of `files` to `path`.
+    concat_files = (path, files) ->
+      fs.open path, 'w', null, (error, fd) ->
+        throw error if error
+
+        fs.writeSync fd, '/**\n'
+        fs.writeSync fd, '@license\n'
+        # NOTE: a relative path is resolved against the current working
+        # directory of the process, not against this Cakefile.
+        fs.writeSync fd, fs.readFileSync('../LICENSE', 'utf8')
+        fs.writeSync fd, '\n'
+        fs.writeSync fd, '*/\n'
+
+        for file in files
+          fs.writeSync fd, fs.readFileSync(file, 'utf8')
+          fs.writeSync fd, '\n'
+
+        # All writes above are synchronous, so the bundle is complete here;
+        # the countdown does not wait for the close to finish.
+        fs.close fd, (error) -> throw error if error
+        countdown.decrement()
+
+    countdown.increment()
+    concat_files 'build/liblevenshtein.js', do ->
+      lib_files = []
+      for file in wrench.readdirSyncRecursive('build')
+        continue unless /\.js$/.test(file)
+        # Skip artifacts of a previous run: minified output and the bundles
+        # themselves.  Including them would duplicate code in the bundle.
+        continue if /\.min\.js$/.test(file)
+        path = "build/#{file}"
+        continue if path in bundle_paths
+        lib_files.push(path)
+      lib_files
+
+    countdown.increment()
+    concat_files 'build/levenshtein-transducer.js', [
+      'build/collection/dawg.js'
+      'build/collection/max-heap.js'
+      'build/levenshtein/transducer.js'
+      'build/levenshtein/builder.js'
+    ]
+
+    countdown.increment()
+    concat_files 'build/levenshtein-distance.js', [
+      'build/levenshtein/distance.js'
+    ]
+
+    # Release the guard taken before scheduling.
+    countdown.decrement()
+
+# Runs Docco over every .coffee file found (recursively) under src/ and
+# forwards its output to the console.
+#
+# Throws an Error if Docco exits with a non-zero status, mirroring how `build`
+# treats a failed compiler run.
+task 'docs', 'Generate annotated source code with Docco', ->
+  src_files = []
+  for file in wrench.readdirSyncRecursive('src')
+    path = "src/#{file}"
+    src_files.push(path) if /\.coffee$/.test(path)
+  docco = spawn "#{__dirname}/node_modules/docco/bin/docco", src_files
+  docco.stdout.on 'data', (data) -> print data.toString()
+  docco.stderr.on 'data', (data) -> print data.toString()
+  # The previous handler called a `callback` that is not defined in this
+  # scope, so it never did anything and failures went unnoticed.
+  docco.on 'exit', (status) ->
+    throw new Error("An unexpected error occurred") if status isnt 0
+
+# One-shot compile and bundle.
+task 'build', 'Compile CoffeeScript source files', -> build()
+
+# Build first, then hand the bundles to the Gradle `minify` task and relay
+# whatever Gradle prints.
+task 'minify', 'Builds and minifies liblevenshtein.js', ->
+  build ->
+    gradle = spawn 'gradle', ['minify']
+    for stream in [gradle.stdout, gradle.stderr]
+      stream.on 'data', (chunk) -> print chunk.toString()
+    return
+
+# Same as `build`, but the compiler is started with its watch flag.
+task 'watch', 'Recompile CoffeeScript source files when modified', -> build true
+
+# Builds the project, then runs nodeunit's default reporter over test/ and
+# every directory nested beneath it (in sorted order).
+task 'test', 'Run the test suite', ->
+  build ->
+    nodeunit = require 'nodeunit'
+    # The test paths below are relative, so run from the Cakefile's directory.
+    process.chdir __dirname
+    suites = ['test']
+    for entry in wrench.readdirSyncRecursive('test')
+      candidate = "test/#{entry}"
+      suites.push candidate if fs.lstatSync(candidate).isDirectory()
+    nodeunit.reporters.default.run suites.sort()
+
diff --git a/build.gradle b/build.gradle
@@ -0,0 +1,48 @@
+apply plugin: 'java'
+
+repositories {
+    mavenCentral()
+}
+
+dependencies {
+  runtime 'com.google.javascript:closure-compiler:v20131014'
+}
+
+// Each minify_* task below runs the Closure Compiler command-line runner
+// (taken from this project's runtime classpath) with ADVANCED_OPTIMIZATIONS
+// on one bundle in build/ and writes the result next to it as *.min.js.
+
+// Minifies build/liblevenshtein.js into build/liblevenshtein.min.js.
+task minify_liblevenshtein(type: JavaExec) {
+  main = 'com.google.javascript.jscomp.CommandLineRunner'
+  classpath = sourceSets.main.runtimeClasspath
+  args = [
+    '--compilation_level', 'ADVANCED_OPTIMIZATIONS',
+    // Output wrapper is currently disabled.
+    //'--output_wrapper', '(function(){"use strict";%output%}());',
+    '--js_output_file', 'build/liblevenshtein.min.js',
+    '--js', 'build/liblevenshtein.js'
+  ]
+}
+
+// Minifies build/levenshtein-transducer.js into
+// build/levenshtein-transducer.min.js.
+task minify_levenshtein_transducer(type: JavaExec) {
+  main = 'com.google.javascript.jscomp.CommandLineRunner'
+  classpath = sourceSets.main.runtimeClasspath
+  args = [
+    '--compilation_level', 'ADVANCED_OPTIMIZATIONS',
+    // Output wrapper is currently disabled.
+    //'--output_wrapper', '(function(){"use strict";%output%}());',
+    '--js_output_file', 'build/levenshtein-transducer.min.js',
+    '--js', 'build/levenshtein-transducer.js'
+  ]
+}
+
+// Minifies build/levenshtein-distance.js into
+// build/levenshtein-distance.min.js.
+task minify_levenshtein_distance(type: JavaExec) {
+  main = 'com.google.javascript.jscomp.CommandLineRunner'
+  classpath = sourceSets.main.runtimeClasspath
+  args = [
+    '--compilation_level', 'ADVANCED_OPTIMIZATIONS',
+    // Output wrapper is currently disabled.
+    //'--output_wrapper', '(function(){"use strict";%output%}());',
+    '--js_output_file', 'build/levenshtein-distance.min.js',
+    '--js', 'build/levenshtein-distance.js'
+  ]
+}
+
+// Aggregate task: depends on all three minify_* tasks above.  The Cakefile's
+// `minify` task invokes it as `gradle minify`.
+task minify(dependsOn: [
+  'minify_liblevenshtein',
+  'minify_levenshtein_transducer',
+  'minify_levenshtein_distance'
+])
diff --git a/package.json b/package.json
@@ -0,0 +1,28 @@
+{
+  "name": "liblevenshtein",
+  "description": "Various utilities regarding Levenshtein transducers.",
+  "author": "Dylon Edwards",
+  "version": "2.0.1",
+  "licenses": [
+    {
+      "type": "MIT",
+      "url": "https://github.com/dylon/liblevenshtein/raw/master/LICENSE"
+    }
+  ],
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/dylon/liblevenshtein.git"
+  },
+  "main": null,
+  "devDependencies": {
+    "nodeunit": "~0.8.6",
+    "coffee-script": "~1.7.1",
+    "wrench": "~1.5.8",
+    "docco": "~0.6.3",
+    "seed-random": "~2.2.0"
+  },
+  "engines":
+  {
+    "node": "~0.10"
+  }
+}
diff --git a/src/collection/dawg.coffee b/src/collection/dawg.coffee
@@ -0,0 +1,127 @@
+# ============================================================================
+# Taken and modified for my purposes from the following source:
+#  o http://stevehanov.ca/blog/index.php?id=115
+# ============================================================================
+#
+# This class represents a node in the directed acyclic word graph (DAWG,
+# a.k.a.  Minimal Acyclic Finite State Automaton, or MA-FSA).  It has a list
+# of edges to other nodes.  It has functions for testing whether it is
+# equivalent to another node.  Nodes are equivalent if they have identical
+# edges, and each identical edge leads to identical states.
+class DawgNode
+  # Source of node ids; every constructed node takes the current value and
+  # advances it by one.
+  @next_id = 0
+
+  # Creates a non-final node with no outgoing edges.
+  constructor: ->
+    @id = DawgNode.next_id; DawgNode.next_id += 1
+    @['is_final'] = false
+    @['edges'] = {}
+
+  # Binary search over the sorted array `edges`, restricted to the index range
+  # [lower, upper).  Returns the leftmost index at which `edge` can be inserted
+  # while keeping `edges` sorted.
+  bisect_left: (edges, edge, lower, upper) ->
+    while lower < upper
+      i = (lower + upper) >> 1
+      if edges[i] < edge
+        lower = i + 1
+      else
+        upper = i
+    return lower
+
+  # Returns the signature used to decide whether two nodes are equivalent: the
+  # final flag (0 or 1) followed by the sorted list of outgoing edges, each
+  # written as "label:child-id" and separated by commas.
+  #
+  # The delimiters matter.  Without them, digit labels make the signature
+  # ambiguous: the edges {'1' -> node 234} and {'1' -> node 2, '3' -> node 4}
+  # would both read "01234", and two different nodes would be merged.
+  'toString': ->
+    edges = []
+    for label, node of @['edges'] # insertion sort
+      edge = label + ':' + node.id.toString()
+      edges.splice(@bisect_left(edges, edge, 0, edges.length), 0, edge)
+    (+ @['is_final']) + edges.join(',')
+
+# Directed acyclic word graph built incrementally from a list of words.
+class Dawg
+  # dictionary - array-like collection of words; each one is inserted in the
+  #              order given, after which the graph is fully minimized.
+  #
+  # Throws an Error when `dictionary` is falsy or has no numeric `length`.
+  constructor: (dictionary) ->
+    array_like = dictionary and typeof dictionary.length is 'number'
+    throw new Error("Expected dictionary to be array-like") unless array_like
+
+    @previous_word = ''
+    @['root'] = new DawgNode()
+
+    # [parent, label, child] triples whose child has not yet been compared
+    # against the nodes that are already known.
+    @unchecked_nodes = []
+
+    # Node signature -> the one node kept for that signature.
+    @minimized_nodes = {}
+
+    for word in dictionary
+      @['insert'](word)
+    @finish()
+
+  # Adds `word` to the graph.
+  #
+  # NOTE(review): the algorithm this was adapted from requires words to arrive
+  # in sorted order; nothing here checks that -- confirm callers sort first.
+  'insert': (word) ->
+    last_word = @previous_word
+
+    # Length of the prefix shared with the previously inserted word.
+    limit =
+      if last_word.length > word.length
+        word.length
+      else
+        last_word.length
+    shared = 0
+    shared += 1 while shared < limit and word[shared] is last_word[shared]
+
+    # Everything past the shared prefix belongs to the previous word only, so
+    # it can be checked for duplicates and dropped from the pending list.
+    @minimize(shared)
+    pending = @unchecked_nodes
+
+    # Continue from the deepest node still pending, or from the root.
+    tail =
+      if pending.length
+        pending[pending.length - 1][2]
+      else
+        @['root']
+
+    # Append one fresh node per remaining character of the word.
+    position = shared
+    until (symbol = word[position]) is `undefined`
+      fresh = new DawgNode()
+      tail['edges'][symbol] = fresh
+      pending.push([tail, symbol, fresh])
+      tail = fresh
+      position += 1
+
+    tail['is_final'] = true
+    @previous_word = word
+    return
+
+  # Checks every node that is still pending.
+  finish: ->
+    @minimize(0)
+    return
+
+  # Pops pending triples, deepest first, until only `lower_bound` remain.  A
+  # child whose signature is already registered is replaced on its parent by
+  # the registered node; otherwise the child itself becomes the registered one.
+  minimize: (lower_bound) ->
+    registry = @minimized_nodes
+    pending = @unchecked_nodes
+
+    while pending.length > lower_bound
+      [source, symbol, target] = pending.pop()
+      signature = target.toString()
+      if signature not of registry
+        registry[signature] = target
+      else
+        source['edges'][symbol] = registry[signature]
+    return
+
+  # Follows `word` one element at a time from the root.  Returns false as soon
+  # as an edge is missing, otherwise the final flag of the node reached.
+  'accepts': (word) ->
+    state = @['root']
+    position = 0
+    total = word.length
+    while position < total
+      state = state['edges'][word[position]]
+      return false if not state
+      position += 1
+    state['is_final']
+
+# Pick the object the library is published on: `exports` when it is an
+# object, else `window` when it is an object, else the top-level `this`.
+global =
+  if typeof exports is 'object' then exports
+  else if typeof window is 'object' then window
+  else this
+
+# Create the shared `levenshtein` namespace only if it is not there yet, then
+# publish both classes on it under quoted (stable) names.
+namespace = (global['levenshtein'] ||= {})
+namespace['DawgNode'] = DawgNode
+namespace['Dawg'] = Dawg
+