Skip to content

Commit

Permalink
Implement Lt 21911: Novel Root Guesser (#172)
Browse files Browse the repository at this point in the history
* Make use of Root Guesser in HermitCrab

* Display guessed roots in interlinear texts

* Handles m_acceptUnspecifiedGraphemes case

* Update to HermitCrab 3.3.0

* Update to HermitCrab 3.3.0

* Make changes requested by Jason

---------

Co-authored-by: Jake Oliver <jeoliver97@gmail.com>
  • Loading branch information
jtmaxwell3 and JakeOliver28 authored Oct 9, 2024
1 parent 41a1860 commit b0c93df
Show file tree
Hide file tree
Showing 12 changed files with 123 additions and 28 deletions.
6 changes: 3 additions & 3 deletions Build/mkall.targets
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@
<ParatextNugetVersion>9.4.0.1-beta</ParatextNugetVersion>
<LcmNugetVersion>11.0.0-beta0104</LcmNugetVersion>
<IcuNugetVersion>70.1.123</IcuNugetVersion>
<HermitCrabNugetVersion>2.5.13</HermitCrabNugetVersion>
<HermitCrabNugetVersion>3.3.0</HermitCrabNugetVersion>
<IPCFrameworkVersion>1.1.1-beta0001</IPCFrameworkVersion>
<!-- bt393 is the master branch build of ExCss for Windows development. Update when appropriate. -->
<ExCssBuildType Condition="'$(OS)'=='Windows_NT'">bt393</ExCssBuildType>
Expand Down Expand Up @@ -531,8 +531,8 @@
<SILNugetPackages Include="Autofac"><Version>4.9.4</Version><Path>lib/net45/*.*</Path><NoSymbols>true</NoSymbols></SILNugetPackages>
<SILNugetPackages Include="Vulcan.Uczniowie.HelpProvider"><Version>1.0.16</Version><Path>lib/net461/*.*</Path></SILNugetPackages>
<!-- HermitCrab and related packages -->
<SILNugetPackages Include="SIL.Machine.Morphology.HermitCrab"><Version>$(HermitCrabNugetVersion)</Version><Path>lib/net461/*.*</Path><NoSymbols>true</NoSymbols></SILNugetPackages>
<SILNugetPackages Include="SIL.Machine"><Version>$(HermitCrabNugetVersion)</Version><Path>lib/net461/*.*</Path><NoSymbols>true</NoSymbols></SILNugetPackages>
<SILNugetPackages Include="SIL.Machine.Morphology.HermitCrab"><Version>$(HermitCrabNugetVersion)</Version><Path>lib/netstandard2.0/*.*</Path><NoSymbols>true</NoSymbols></SILNugetPackages>
<SILNugetPackages Include="SIL.Machine"><Version>$(HermitCrabNugetVersion)</Version><Path>lib/netstandard2.0/*.*</Path><NoSymbols>true</NoSymbols></SILNugetPackages>
<SILNugetPackages Include="Sandwych.Quickgraph.Core"><Version>1.0.0</Version><Path>lib/net45/*.*</Path><NoSymbols>true</NoSymbols></SILNugetPackages>
</ItemGroup>
</Target>
Expand Down
4 changes: 2 additions & 2 deletions Build/nuget-common/packages.config
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@
<package id="SIL.libpalaso.l10ns" version="6.0.0" targetFramework="net461" />
<package id="SIL.Lift" version="13.0.0-beta0076" targetFramework="net461" />
<package id="SIL.Media" version="13.0.0-beta0076" targetFramework="net461" />
<package id="SIL.Machine" version="2.5.13" targetFramework="net461" />
<package id="SIL.Machine.Morphology.HermitCrab" version="2.5.13" targetFramework="net461" />
<package id="SIL.Machine" version="3.3.0" targetFramework="netstandard2.0" />
<package id="SIL.Machine.Morphology.HermitCrab" version="3.3.0" targetFramework="netstandard2.0" />
<package id="SIL.ParatextShared" version="7.4.0.1" targetFramework="net40" /> <!-- REVIEW (Hasso) 2023.05: do we still integrate with PT 7? -->
<package id="SIL.Scripture" version="13.0.0-beta0076" targetFramework="net461" />
<package id="SIL.TestUtilities" version="13.0.0-beta0076" targetFramework="net461" />
Expand Down
4 changes: 3 additions & 1 deletion Src/LexText/Interlinear/InterlinVc.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1356,9 +1356,10 @@ private void DisplayMorphBundle(IVwEnv vwenv, int hvo)
{
vwenv.AddString(m_tssMissingVernacular);
}
else if (mf == null)
else if (mf == null || SandboxBase.IsLexicalPattern(mf.Form))
{
// If no morph, use the form of the morph bundle (and the entry is of course missing)
// If mf.Form is a lexical pattern then the form of the morph bundle is the guessed root.
var ws = GetRealWsOrBestWsForContext(wmb.Hvo, spec);
vwenv.AddStringAltMember(WfiMorphBundleTags.kflidForm, ws, this);
}
Expand Down Expand Up @@ -2637,4 +2638,5 @@ protected override void SetInt(int hvo, int flid, int newValue)
}

}

}
5 changes: 4 additions & 1 deletion Src/LexText/Interlinear/SandboxBase.ComboHandlers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1145,7 +1145,10 @@ private void AddAnalysesOf(IWfiWordform wordform, bool fBaseWordIsPhrase)
IMoForm morph = mb.MorphRA;
if (morph != null)
{
ITsString tss = morph.Form.get_String(m_sandbox.RawWordformWs);
// If morph.Form is a lexical pattern then mb.Form is the guessed root.
ITsString tss = IsLexicalPattern(morph.Form)
? mb.Form.get_String(m_sandbox.RawWordformWs)
: morph.Form.get_String(m_sandbox.RawWordformWs);
var morphType = morph.MorphTypeRA;
string sPrefix = morphType.Prefix;
string sPostfix = morphType.Postfix;
Expand Down
6 changes: 6 additions & 0 deletions Src/LexText/Interlinear/SandboxBase.GetRealyAnalysisMethod.cs
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,12 @@ private IAnalysis FinishItOff()
else
{
mb.MorphRA = mfRepository.GetObject(m_analysisMorphs[imorph]);
if (mb.MorphRA != null && IsLexicalPattern(mb.MorphRA.Form))
{
// If mb.MorphRA.Form is a lexical pattern then set mb.Form to the guessed root.
int hvoSbMorph = m_sda.get_VecItem(m_hvoSbWord, ktagSbWordMorphs, imorph);
mb.Form.set_String(wsVern, m_sandbox.GetFullMorphForm(hvoSbMorph));
}
}
// Set the MSA if we have one. Note that it is (pathologically) possible that the user has done
// something in another window to destroy the MSA we remember, so don't try to set it if so.
Expand Down
38 changes: 35 additions & 3 deletions Src/LexText/Interlinear/SandboxBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1325,9 +1325,14 @@ private bool LoadRealDataIntoSec1(int hvoSbWord, bool fLookForDefaults, bool fAd
}
else
{
// Create the secondary object corresponding to the MoForm in the usual way from the form object.
hvoMorphForm = CreateSecondaryAndCopyStrings(InterlinLineChoices.kflidMorphemes, mf.Hvo,
MoFormTags.kflidForm, hvoSbWord, sdaMain, cda);
hvoMorphForm = m_caches.FindOrCreateSec(mf.Hvo, kclsidSbNamedObj, hvoSbWord, ktagSbWordDummy);
if (IsLexicalPattern(mf.Form))
// If mf.Form is a lexical pattern then mb.Form is the guessed root.
CopyStringsToSecondary(InterlinLineChoices.kflidMorphemes, sdaMain, mb.Hvo,
WfiMorphBundleTags.kflidForm, cda, hvoMorphForm, ktagSbNamedObjName);
else
CopyStringsToSecondary(InterlinLineChoices.kflidMorphemes, sdaMain, mf.Hvo,
MoFormTags.kflidForm, cda, hvoMorphForm, ktagSbNamedObjName);
// Store the prefix and postfix markers from the MoMorphType object.
int hvoMorphType = sdaMain.get_ObjectProp(mf.Hvo,
MoFormTags.kflidMorphType);
Expand Down Expand Up @@ -1467,6 +1472,22 @@ private bool LoadRealDataIntoSec1(int hvoSbWord, bool fLookForDefaults, bool fAd
return fGuessing != 0;
}

/// <summary>
/// Does multiString contain a lexical pattern (e.g. [Seg]*) in any of its alternatives?
/// </summary>
/// <param name="multiString">The multi-writing-system string to inspect; null is treated as "no pattern".</param>
/// <returns>
/// true if at least one alternative contains both "[" and "]"; otherwise false.
/// </returns>
public static bool IsLexicalPattern(IMultiUnicode multiString)
{
	// A missing form cannot hold a pattern; answer false instead of throwing.
	if (multiString == null)
		return false;
	// This assumes that "[" and "]" are not part of any phonemes.
	for (var i = 0; i < multiString.StringCount; i++)
	{
		int ws;
		var tss = multiString.GetStringFromIndex(i, out ws);
		string text = tss == null ? null : tss.Text;
		// An alternative without text (Text may be null for an empty string) has nothing
		// to match, so skip it rather than dereferencing null.
		if (string.IsNullOrEmpty(text))
			continue;
		if (text.Contains("[") && text.Contains("]"))
			return true;
	}
	return false;
}

public static bool GetHasMultipleRelevantAnalyses(IWfiWordform analysis)
{
int humanCount = analysis.HumanApprovedAnalyses.Count();
Expand Down Expand Up @@ -2007,6 +2028,17 @@ where icuCollator.Compare(mf.Form.get_String(ws).Text, form) == 0 && mf.MorphTyp
&& (mf.MorphTypeRA == mmt || mf.MorphTypeRA.IsAmbiguousWith(mmt))
select mf).ToList();

if (morphs.Count == 0)
{
// Look for morphs in matching morph bundles with lexical patterns.
// If morph is a lexical pattern then the morph bundle's Form is the guessed root.
morphs = (from mb in Cache.ServiceLocator.GetInstance<IWfiMorphBundleRepository>().AllInstances()
where IsLexicalPattern(mb.MorphRA.Form)
&& icuCollator.Compare(mb.Form.get_String(ws).Text, form) == 0
&& mb.MorphRA.MorphTypeRA != null
&& (mb.MorphRA.MorphTypeRA == mmt || mb.MorphRA.MorphTypeRA.IsAmbiguousWith(mmt))
select mb.MorphRA).ToList();
}
if (morphs.Count == 1)
return morphs.First(); // special case: we can avoid the cost of figuring ReferringObjects.
IMoForm bestMorph = null;
Expand Down
3 changes: 2 additions & 1 deletion Src/LexText/ParserCore/FwXmlTraceManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,8 @@ private XElement CreateAllomorphElement(Allomorph allomorph)
if (inflTypeID != 0 && !m_cache.ServiceLocator.GetInstance<ILexEntryInflTypeRepository>().TryGetObject(inflTypeID, out inflType))
return null;

return HCParser.CreateAllomorphElement("Allomorph", form, msa, inflType, formID2 != 0);
string guessedString = allomorph.Guessed ? allomorph.Morpheme.Gloss : null;
return HCParser.CreateAllomorphElement("Allomorph", form, msa, inflType, formID2 != 0, guessedString);
}
}
}
28 changes: 24 additions & 4 deletions Src/LexText/ParserCore/HCLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1854,7 +1854,7 @@ private void LoadAllomorphCoOccurrenceRules(IMoAlloAdhocProhib alloAdhocProhib)
{
var rule = new AllomorphCoOccurrenceRule(ConstraintType.Exclude, others, adjacency);
firstAllo.AllomorphCoOccurrenceRules.Add(rule);
m_language.AllomorphCoOccurrenceRules.Add(rule);
m_language.AllomorphCoOccurrenceRules.Add((firstAllo, rule));
}
}
}
Expand Down Expand Up @@ -1904,7 +1904,7 @@ private void LoadMorphemeCoOccurrenceRules(IMoMorphAdhocProhib morphAdhocProhib)
{
var rule = new MorphemeCoOccurrenceRule(ConstraintType.Exclude, others, adjacency);
firstMorpheme.MorphemeCoOccurrenceRules.Add(rule);
m_language.MorphemeCoOccurrenceRules.Add(rule);
m_language.MorphemeCoOccurrenceRules.Add((firstMorpheme, rule));
}
}
}
Expand Down Expand Up @@ -2198,7 +2198,7 @@ private FeatureStruct LoadFeatureStruct(IFsFeatStruc fs, FeatureSystem featSys)
private Shape Segment(string str)
{
Shape shape;
if (m_acceptUnspecifiedGraphemes)
if (m_acceptUnspecifiedGraphemes && !IsLexicalPattern(str))
{
int[] baseCharPositions = null;
do
Expand All @@ -2222,11 +2222,20 @@ private Shape Segment(string str)
}
else
{
shape = m_table.Segment(str);
shape = m_table.Segment(str, true);
}
return shape;
}

/// <summary>
/// Does form contain a lexical pattern (e.g. [Seg]*)?
/// </summary>
/// <param name="form">The form string to inspect; null or empty is treated as "no pattern".</param>
/// <returns>true if form contains both "[" and "]"; otherwise false.</returns>
public static bool IsLexicalPattern(string form)
{
	// Guard against a missing form so callers never hit a NullReferenceException here.
	if (string.IsNullOrEmpty(form))
		return false;
	// This assumes that "[" and "]" are not part of any phonemes.
	return form.IndexOf('[') >= 0 && form.IndexOf(']') >= 0;
}

private static string FormatForm(string formStr)
{
return formStr.Trim().Replace(' ', '.');
Expand Down Expand Up @@ -2383,6 +2392,17 @@ private void LoadCharacterDefinitionTable(IPhPhonemeSet phonemeSet)
m_table.AddBoundary(otherChar);
}
}
// Add natural classes to table for lexical patterns.
foreach(NaturalClass hcNaturalClass in m_language.NaturalClasses)
{
m_table.AddNaturalClass(hcNaturalClass);
}
foreach (string ncName in m_naturalClassLookup.Keys)
{
NaturalClass hcNaturalClass;
if (TryLoadNaturalClass(m_naturalClassLookup[ncName], out hcNaturalClass))
m_table.AddNaturalClass(hcNaturalClass);
}
m_language.CharacterDefinitionTables.Add(m_table);
}

Expand Down
17 changes: 8 additions & 9 deletions Src/LexText/ParserCore/HCParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ public ParseResult ParseWord(string word)
IEnumerable<Word> wordAnalyses;
try
{
wordAnalyses = m_morpher.ParseWord(word);
wordAnalyses = m_morpher.ParseWord(word, out _, true);
}
catch (Exception e)
{
Expand All @@ -103,7 +103,7 @@ public ParseResult ParseWord(string word)
if (GetMorphs(wordAnalysis, out morphs))
{
analyses.Add(new ParseAnalysis(morphs.Select(mi =>
new ParseMorph(mi.Form, mi.Msa, mi.InflType))));
new ParseMorph(mi.Form, mi.Msa, mi.InflType, mi.GuessedString))));
}
}
result = new ParseResult(analyses);
Expand Down Expand Up @@ -189,11 +189,11 @@ private XDocument ParseToXml(string form, bool tracing, IEnumerable<int> selectT
try
{
object trace;
foreach (Word wordAnalysis in m_morpher.ParseWord(form, out trace))
foreach (Word wordAnalysis in m_morpher.ParseWord(form, out trace, true))
{
List<MorphInfo> morphs;
if (GetMorphs(wordAnalysis, out morphs))
wordformElem.Add(new XElement("Analysis", morphs.Select(mi => CreateAllomorphElement("Morph", mi.Form, mi.Msa, mi.InflType, mi.IsCircumfix))));
wordformElem.Add(new XElement("Analysis", morphs.Select(mi => CreateAllomorphElement("Morph", mi.Form, mi.Msa, mi.InflType, mi.IsCircumfix, mi.GuessedString))));
}
if (tracing)
wordformElem.Add(new XElement("Trace", trace));
Expand Down Expand Up @@ -364,7 +364,6 @@ private bool GetMorphs(Word ws, out List<MorphInfo> result)
}
else
{
morphInfo.String += formStr;
continue;
}

Expand Down Expand Up @@ -394,7 +393,7 @@ private bool GetMorphs(Word ws, out List<MorphInfo> result)
morphInfo = new MorphInfo
{
Form = form,
String = formStr,
GuessedString = allomorph.Guessed ? formStr : null,
Msa = msa,
InflType = inflType,
IsCircumfix = formID2 > 0
Expand Down Expand Up @@ -466,11 +465,11 @@ private static string GetMorphTypeString(Guid typeGuid)
return "unknown";
}

internal static XElement CreateAllomorphElement(string name, IMoForm form, IMoMorphSynAnalysis msa, ILexEntryInflType inflType, bool circumfix)
internal static XElement CreateAllomorphElement(string name, IMoForm form, IMoMorphSynAnalysis msa, ILexEntryInflType inflType, bool circumfix, string guessedString)
{
Guid morphTypeGuid = circumfix ? MoMorphTypeTags.kguidMorphCircumfix : (form.MorphTypeRA == null ? Guid.Empty : form.MorphTypeRA.Guid);
var elem = new XElement(name, new XAttribute("id", form.Hvo), new XAttribute("type", GetMorphTypeString(morphTypeGuid)),
new XElement("Form", circumfix ? form.OwnerOfClass<ILexEntry>().HeadWord.Text : form.GetFormWithMarkers(form.Cache.DefaultVernWs)),
new XElement("Form", circumfix ? form.OwnerOfClass<ILexEntry>().HeadWord.Text : guessedString ?? form.GetFormWithMarkers(form.Cache.DefaultVernWs)),
new XElement("LongName", form.LongName));
elem.Add(CreateMorphemeElement(msa, inflType));
return elem;
Expand Down Expand Up @@ -567,7 +566,7 @@ private string ProcessParseException(Exception e)
class MorphInfo
{
public IMoForm Form { get; set; }
public string String { get; set; }
public string GuessedString { get; set; }
public IMoMorphSynAnalysis Msa { get; set; }
public ILexEntryInflType InflType { get; set; }
public bool IsCircumfix { get; set; }
Expand Down
7 changes: 7 additions & 0 deletions Src/LexText/ParserCore/ParseFiler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Linq;
using SIL.LCModel;
using SIL.LCModel.Application;
using SIL.LCModel.Core.Text;
using SIL.LCModel.Infrastructure;
using XCore;

Expand Down Expand Up @@ -250,6 +251,12 @@ private void ProcessAnalysis(IWfiWordform wordform, ParseAnalysis analysis)
mb.MsaRA = morph.Msa;
if (morph.InflType != null)
mb.InflTypeRA = morph.InflType;
if (morph.GuessedString != null)
{
// Override default Form with GuessedString.
int vernWS = m_cache.DefaultVernWs;
mb.Form.set_String(vernWS, TsStringUtils.MakeString(morph.GuessedString, vernWS));
}
}
matches.Add(newAnal);
}
Expand Down
31 changes: 29 additions & 2 deletions Src/LexText/ParserCore/ParseResult.cs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ public bool MatchesIWfiAnalysis(IWfiAnalysis analysis)
foreach (IWfiMorphBundle mb in analysis.MorphBundlesOS)
{
var current = this.Morphs[i++];
if (mb.MorphRA == current.Form && mb.MsaRA == current.Msa && mb.InflTypeRA == current.InflType)
if (mb.MorphRA == current.Form && mb.MsaRA == current.Msa && mb.InflTypeRA == current.InflType &&
(current.GuessedString == null || EquivalentFormString(mb.Form, current.GuessedString)))
{
// Possibly matches condition (2), above.
mbMatch = true;
Expand All @@ -131,6 +132,16 @@ public bool MatchesIWfiAnalysis(IWfiAnalysis analysis)
return false;
}

/// <summary>
/// Checks whether any writing-system alternative of <paramref name="multiString"/> has
/// text equal to <paramref name="formString"/>.
/// </summary>
/// <param name="multiString">The multi-writing-system form whose alternatives are compared.</param>
/// <param name="formString">The text to look for.</param>
/// <returns>true as soon as one alternative's text equals formString; otherwise false.</returns>
private bool EquivalentFormString(IMultiString multiString, string formString)
{
	bool matched = false;
	foreach (int wsId in multiString.AvailableWritingSystemIds)
	{
		string alternative = multiString.get_String(wsId).Text;
		if (alternative != formString)
			continue;
		// First match is enough; stop scanning the remaining writing systems.
		matched = true;
		break;
	}
	return matched;
}

public override int GetHashCode()
{
int code = 23;
Expand All @@ -145,17 +156,24 @@ public class ParseMorph : IEquatable<ParseMorph>
private readonly IMoForm m_form;
private readonly IMoMorphSynAnalysis m_msa;
private readonly ILexEntryInflType m_inflType;
private readonly string m_guessedString;

/// <summary>
/// Creates a parse morph from a form and an MSA, passing a null inflection type
/// to the three-argument constructor.
/// </summary>
public ParseMorph(IMoForm form, IMoMorphSynAnalysis msa)
	: this(form, msa, inflType: null)
{
}

/// <summary>
/// Creates a parse morph from a form, an MSA and an inflection type, passing a null
/// guessed string to the four-argument constructor.
/// </summary>
public ParseMorph(IMoForm form, IMoMorphSynAnalysis msa, ILexEntryInflType inflType)
	: this(form, msa, inflType, guessedString: null)
{
}

/// <summary>
/// Creates a parse morph, storing each argument in its backing field.
/// </summary>
/// <param name="form">The form, exposed through <see cref="Form"/>.</param>
/// <param name="msa">The morphosyntactic analysis for the morph.</param>
/// <param name="inflType">The inflection type; may be null.</param>
/// <param name="guessedString">The guessed string for the morph; may be null.</param>
public ParseMorph(IMoForm form, IMoMorphSynAnalysis msa, ILexEntryInflType inflType, string guessedString)
{
	m_guessedString = guessedString;
	m_inflType = inflType;
	m_msa = msa;
	m_form = form;
}

public IMoForm Form
Expand All @@ -173,14 +191,22 @@ public ILexEntryInflType InflType
get { return m_inflType; }
}

/// <summary>
/// Gets the guessed string supplied at construction; null if none was given.
/// </summary>
public string GuessedString => m_guessedString;

/// <summary>
/// Gets whether the form, the MSA and (when present) the inflection type all report
/// themselves as valid objects.
/// </summary>
public bool IsValid
{
	get
	{
		// Form is checked before Msa, and both before the optional inflection type.
		if (!Form.IsValidObject || !Msa.IsValidObject)
			return false;
		return m_inflType == null || m_inflType.IsValidObject;
	}
}

/// <summary>
/// Determines whether this morph and <paramref name="other"/> have the same form, MSA,
/// inflection type and guessed string.
/// </summary>
/// <param name="other">The morph to compare with; null never equals an instance.</param>
/// <returns>true if all four components are equal; otherwise false.</returns>
public bool Equals(ParseMorph other)
{
	// IEquatable<T>.Equals must answer false for null rather than throw.
	if (ReferenceEquals(other, null))
		return false;
	// The guessed string takes part in equality so that two guesses made from the same
	// form but with different strings are kept distinct (GetHashCode hashes it as well).
	return m_form == other.m_form
		&& m_msa == other.m_msa
		&& m_inflType == other.m_inflType
		&& m_guessedString == other.m_guessedString;
}

public override bool Equals(object obj)
Expand All @@ -195,6 +221,7 @@ public override int GetHashCode()
code = code * 31 + m_form.Guid.GetHashCode();
code = code * 31 + m_msa.Guid.GetHashCode();
code = code * 31 + (m_inflType == null ? 0 : m_inflType.Guid.GetHashCode());
code = code * 31 + (m_guessedString == null ? 0 : m_guessedString.GetHashCode());
return code;
}
}
Expand Down
Loading

0 comments on commit b0c93df

Please sign in to comment.