Skip to content

Commit

Permalink
fix: hunspell dictionary builder now emits added word metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
elijah-potter committed Aug 2, 2024
1 parent 67a021f commit b4f7c34
Show file tree
Hide file tree
Showing 11 changed files with 149 additions and 117 deletions.
77 changes: 51 additions & 26 deletions harper-core/affixes.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"L": {
Expand All @@ -27,7 +28,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"E": {
Expand All @@ -42,7 +44,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"Y": {
Expand All @@ -57,7 +60,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"U": {
Expand All @@ -72,7 +76,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"H": {
Expand All @@ -92,7 +97,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"T": {
Expand Down Expand Up @@ -122,7 +128,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"R": {
Expand Down Expand Up @@ -152,7 +159,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"C": {
Expand All @@ -167,7 +175,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"V": {
Expand All @@ -187,7 +196,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"N": {
Expand All @@ -212,7 +222,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"A": {
Expand All @@ -227,7 +238,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"Z": {
Expand Down Expand Up @@ -257,7 +269,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"P": {
Expand All @@ -282,7 +295,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"M": {
Expand All @@ -297,7 +311,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"F": {
Expand All @@ -312,7 +327,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"B": {
Expand All @@ -337,7 +353,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"S": {
Expand Down Expand Up @@ -367,7 +384,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": true
}
},
"D": {
Expand Down Expand Up @@ -397,7 +415,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"G": {
Expand All @@ -417,7 +436,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"Q": {
Expand All @@ -432,7 +452,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"O": {
Expand All @@ -447,7 +468,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"I": {
Expand All @@ -462,7 +484,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"X": {
Expand All @@ -487,7 +510,8 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
},
"J": {
Expand All @@ -507,8 +531,9 @@
],
"adds_metadata": {
"kind": null,
"tense": null
"tense": null,
"plural": null
}
}
}
}
}
40 changes: 23 additions & 17 deletions harper-core/src/spell/full_dictionary.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use hashbrown::HashSet;
use hashbrown::{HashMap, HashSet};
use once_cell::sync::Lazy;
use smallvec::{SmallVec, ToSmallVec};

use super::dictionary::Dictionary;
use super::hunspell::{parse_default_attribute_list, parse_default_word_list};
use super::seq_to_normalized;
use crate::CharString;
use crate::{CharString, WordMetadata};

/// A full, fat dictionary.
/// All of the elements are stored in-memory.
Expand All @@ -23,25 +23,26 @@ pub struct FullDictionary {
/// that has a word whose index is that length.
word_len_starts: Vec<usize>,
/// All English words
word_set: HashSet<CharString>,
word_map: HashMap<CharString, WordMetadata>
}

/// Builds a [`FullDictionary`] from the default word list and the default
/// attribute list.
///
/// # Panics
///
/// Panics if `parse_default_word_list()` does not yield a value, since its
/// result is unwrapped.
fn uncached_inner_new() -> FullDictionary {
let word_list = parse_default_word_list().unwrap();
let attr_list = parse_default_attribute_list();

// There will be at _least_ this number of words
let mut words = Vec::with_capacity(word_list.len());
let mut word_map = HashMap::with_capacity(word_list.len());

// Expand the marked words of the word list into the output collection.
attr_list.expand_marked_words(word_list, &mut words);
attr_list.expand_marked_words(word_list, &mut word_map);

// Copy the map's keys into a `Vec` so they can be sorted and handed to
// `create_len_starts`.
let mut words: Vec<CharString> = word_map.iter().map(|(v, _)| v.clone()).collect();
words.sort();
// NOTE(review): when `words` is collected from map keys the entries are
// already unique, so this `dedup` looks redundant on that path — confirm
// before removing.
words.dedup();

FullDictionary {
word_set: HashSet::from_iter(words.iter().cloned()),
word_map,
// `create_len_starts` receives `&mut words`, so it runs before `words` is
// moved into the struct on the next field.
word_len_starts: FullDictionary::create_len_starts(&mut words),
words,
words
}
}

Expand All @@ -52,7 +53,7 @@ impl FullDictionary {
Self {
words: Vec::new(),
word_len_starts: Vec::new(),
word_set: HashSet::new(),
word_map: HashMap::new()
}
}

Expand All @@ -65,21 +66,26 @@ impl FullDictionary {
/// Appends words to the dictionary.
/// It is significantly faster to append many words with one call than many
/// distinct calls to this function.
///
/// In the tuple-taking form, each item pairs a word with the
/// [`WordMetadata`] stored for it in `word_map`, and `word_len_starts` is
/// rebuilt once per call (which is why batching is cheaper).
pub fn extend_words(&mut self, words: impl IntoIterator<Item = impl AsRef<[char]>>) {
// Remember where the newly appended words start.
let init_size = self.words.len();
self.words
.extend(words.into_iter().map(|v| v.as_ref().to_smallvec()));
self.word_set
.extend(self.words[init_size..].iter().cloned());
pub fn extend_words(
&mut self,
words: impl IntoIterator<Item = (impl AsRef<[char]>, WordMetadata)>
) {
// Materialize the input once so it can feed both `self.words` and
// `self.word_map`.
let pairs: Vec<_> = words
.into_iter()
.map(|(v, m)| (v.as_ref().to_smallvec(), m))
.collect();

// NOTE(review): words already present in `self.words` are pushed again
// here; no de-duplication is visible in this block — confirm that
// `create_len_starts` tolerates duplicates.
self.words.extend(pairs.iter().map(|(v, _)| v.clone()));
// Rebuild the length lookup table over the grown word list.
self.word_len_starts = Self::create_len_starts(&mut self.words);
// Store the metadata; for a key that already exists, the map's `extend`
// replaces the previous value.
self.word_map.extend(pairs);
}

/// Append a single word to the dictionary.
///
/// If you are appending many words, consider using [`Self::extend_words`]
/// instead.
///
/// This is a thin wrapper: it forwards a one-element iterator to
/// [`Self::extend_words`]. In the two-argument form, `metadata` is passed
/// along with the word.
pub fn append_word(&mut self, word: impl AsRef<[char]>) {
self.extend_words(std::iter::once(word.as_ref()))
pub fn append_word(&mut self, word: impl AsRef<[char]>, metadata: WordMetadata) {
self.extend_words(std::iter::once((word.as_ref(), metadata)))
}

/// Create a lookup table for finding words of a specific length in a word
Expand Down Expand Up @@ -132,7 +138,7 @@ impl Dictionary for FullDictionary {
let normalized = seq_to_normalized(word);
let lowercase: SmallVec<_> = normalized.iter().flat_map(|c| c.to_lowercase()).collect();

self.word_set.contains(normalized.as_ref()) || self.word_set.contains(&lowercase)
self.word_map.contains_key(normalized.as_ref()) || self.word_map.contains_key(&lowercase)
}
}

Expand Down
15 changes: 8 additions & 7 deletions harper-core/src/spell/hunspell/affix_replacement.rs
Original file line number Diff line number Diff line change
@@ -1,39 +1,40 @@
use serde::{Deserialize, Serialize};

use super::{matcher::Matcher, Error};
use super::matcher::Matcher;
use super::Error;

/// A single affix replacement rule in its parsed form.
///
/// NOTE(review): field meanings are inferred from their names — presumably
/// `remove` is stripped from a word, `add` is attached in its place, and
/// `condition` decides whether the rule applies. Confirm against the code
/// that applies these rules.
#[derive(Debug, Clone)]
pub struct AffixReplacement {
pub remove: Vec<char>,
pub add: Vec<char>,
pub condition: Matcher,
pub condition: Matcher
}

impl AffixReplacement {
/// Converts this rule into a [`HumanReadableAffixReplacement`].
///
/// The `remove` and `add` character vectors are collected into `String`s
/// and the condition is rendered with `to_string()`.
pub fn to_human_readable(&self) -> HumanReadableAffixReplacement {
HumanReadableAffixReplacement {
remove: self.remove.iter().collect(),
add: self.add.iter().collect(),
condition: self.condition.to_string(),
condition: self.condition.to_string()
}
}
}

/// A version of [`AffixReplacement`] that can be serialized to JSON (or whatever) and maintain the
/// nice Regex syntax of the inner [`Matcher`].
/// A version of [`AffixReplacement`] that can be serialized to JSON (or
/// whatever) and maintain the nice Regex syntax of the inner [`Matcher`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HumanReadableAffixReplacement {
/// String form of [`AffixReplacement::remove`].
pub remove: String,
/// String form of [`AffixReplacement::add`].
pub add: String,
/// Textual form of the condition; `to_normal` parses it with
/// `Matcher::parse`.
pub condition: String,
pub condition: String
}

impl HumanReadableAffixReplacement {
/// Converts back into an [`AffixReplacement`].
///
/// `remove` and `add` are split into their characters, and `condition` is
/// parsed with `Matcher::parse`.
///
/// # Errors
///
/// Returns the error from `Matcher::parse` if `self.condition` cannot be
/// parsed.
pub fn to_normal(&self) -> Result<AffixReplacement, Error> {
Ok(AffixReplacement {
remove: self.remove.chars().collect(),
add: self.add.chars().collect(),
condition: Matcher::parse(&self.condition)?,
condition: Matcher::parse(&self.condition)?
})
}
}
Loading

0 comments on commit b4f7c34

Please sign in to comment.