From 59cbd46ca4852a8847667d046daed7065ea0f910 Mon Sep 17 00:00:00 2001 From: "Eli C. Lowry" <83078660+Enkidu93@users.noreply.github.com> Date: Wed, 7 Feb 2024 15:33:56 -0500 Subject: [PATCH] Chapter level aspnetcore (#167) * Added chapter-level filtering; fixes https://github.com/sillsdev/serval/issues/150 --- src/SIL.Machine.AspNetCore/Models/Corpus.cs | 2 + .../SIL.Machine.AspNetCore.csproj | 2 +- .../Services/NmtPreprocessBuildJob.cs | 29 +++- .../ServalTranslationEngineServiceV1.cs | 6 + src/SIL.Machine.AspNetCore/Usings.cs | 1 + .../Services/NmtPreprocessBuildJobTests.cs | 130 ++++++++++++++++++ .../Services/data/paratext2/41MATTen.SFM | 40 ++++++ .../Services/data/paratext2/42MRKTen.SFM | 4 + .../data/paratext2/ProjectBiblicalTerms.xml | 6 + .../Services/data/paratext2/Settings.xml | 34 +++++ .../data/paratext2/TermRenderings.xml | 9 ++ .../Services/data/paratext2/custom.vrs | 31 +++++ 12 files changed, 287 insertions(+), 7 deletions(-) create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/41MATTen.SFM create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/42MRKTen.SFM create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/ProjectBiblicalTerms.xml create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/Settings.xml create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/TermRenderings.xml create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/custom.vrs diff --git a/src/SIL.Machine.AspNetCore/Models/Corpus.cs b/src/SIL.Machine.AspNetCore/Models/Corpus.cs index c33bc52c..bf741e29 100644 --- a/src/SIL.Machine.AspNetCore/Models/Corpus.cs +++ b/src/SIL.Machine.AspNetCore/Models/Corpus.cs @@ -7,6 +7,8 @@ public class Corpus public string TargetLanguage { get; set; } = default!; public bool TrainOnAll { get; set; } public bool PretranslateAll { get; set; } + public Dictionary>? TrainOnChapters { get; set; } + public Dictionary>? 
PretranslateChapters { get; set; } public HashSet TrainOnTextIds { get; set; } = default!; public HashSet PretranslateTextIds { get; set; } = default!; public List SourceFiles { get; set; } = default!; diff --git a/src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj b/src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj index a6ae4d36..c06937b5 100644 --- a/src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj +++ b/src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj @@ -35,7 +35,7 @@ - + diff --git a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs index 4c442377..987b6182 100644 --- a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs +++ b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs @@ -1,7 +1,4 @@ -using Google.Protobuf; -using MongoDB.Bson.IO; - -namespace SIL.Machine.AspNetCore.Services; +namespace SIL.Machine.AspNetCore.Services; public class NmtPreprocessBuildJob : HangfireBuildJob> { @@ -129,14 +126,34 @@ async IAsyncEnumerable ProcessRowsAsync() foreach (ParallelTextRow row in parallelCorpora.Flatten()) { - if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId)) + bool isInTrainOnChapters = false; + bool isInPretranslateChapters = false; + if (targetCorpora[CorpusType.Text] is ScriptureTextCorpus stc) + { + bool IsInChapters(Dictionary> bookChapters, object rowRef) + { + if (rowRef is not VerseRef vr) + return false; + return bookChapters.TryGetValue(vr.Book, out HashSet? 
chapters) + && (chapters.Contains(vr.ChapterNum) || chapters.Count == 0); + } + if (corpus.TrainOnChapters is not null) + isInTrainOnChapters = row.Refs.Any(r => IsInChapters(corpus.TrainOnChapters, r)); + if (corpus.PretranslateChapters is not null) + isInPretranslateChapters = row.Refs.Any(r => IsInChapters(corpus.PretranslateChapters, r)); + } + if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId) || isInTrainOnChapters) { await sourceTrainWriter.WriteAsync($"{row.SourceText}\n"); await targetTrainWriter.WriteAsync($"{row.TargetText}\n"); counts["NumTrainRows"] += 1; } if ( - (corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId)) + ( + corpus.PretranslateAll + || corpus.PretranslateTextIds.Contains(row.TextId) + || isInPretranslateChapters + ) && row.SourceSegment.Count > 0 && row.TargetSegment.Count == 0 ) diff --git a/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs b/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs index 0ea6542a..c1c7fbf5 100644 --- a/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs +++ b/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs @@ -262,6 +262,12 @@ private static Models.Corpus Map(Serval.Translation.V1.Corpus source) TargetLanguage = source.TargetLanguage, TrainOnAll = source.TrainOnAll, PretranslateAll = source.PretranslateAll, + TrainOnChapters = source + .TrainOnChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToHashSet())) + .ToDictionary(), + PretranslateChapters = source + .PretranslateChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToHashSet())) + .ToDictionary(), TrainOnTextIds = source.TrainOnTextIds.ToHashSet(), PretranslateTextIds = source.PretranslateTextIds.ToHashSet(), SourceFiles = source.SourceFiles.Select(Map).ToList(), diff --git a/src/SIL.Machine.AspNetCore/Usings.cs b/src/SIL.Machine.AspNetCore/Usings.cs index bd130ee4..3994bcdf 100644 --- a/src/SIL.Machine.AspNetCore/Usings.cs 
+++ b/src/SIL.Machine.AspNetCore/Usings.cs @@ -1,4 +1,5 @@ global using System.Collections.Concurrent; +global using System.Data; global using System.Diagnostics; global using System.Diagnostics.CodeAnalysis; global using System.IO.Compression; diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs index 3e01dccf..0868ba26 100644 --- a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs @@ -10,12 +10,17 @@ public void SetUp() Path.Combine("..", "..", "..", "Services", "data", "paratext"), Path.Combine(Path.GetTempPath(), "Project.zip") ); + ZipFile.CreateFromDirectory( + Path.Combine("..", "..", "..", "Services", "data", "paratext2"), + Path.Combine(Path.GetTempPath(), "Project2.zip") + ); } [TearDown] public void TearDown() { File.Delete(Path.Combine(Path.GetTempPath(), "Project.zip")); + File.Delete(Path.Combine(Path.GetTempPath(), "Project2.zip")); } [Test] @@ -143,6 +148,131 @@ int numEntriesWrittenToPretranslate } } + [Test] + [TestCase("MAT", "1CH", 23, 4)] + [TestCase("NT;LEV", "1CH", 25, 4)] + [TestCase("OT", "MRK", 10, 0)] + [TestCase("OT", "MLK", 0, 0, true)] + public async Task BuildJobTest_Chapterlevel( + string trainOnBiblicalRangeChapters, + string pretranslateBiblicalRangeChapters, + int numLinesWrittenToTrain, + int numEntriesWrittenToPretranslate, + bool throwsException = false + ) + { + using var env = new TestEnvironment(); + var parser = new ScriptureRangeParser(); + + Corpus corpus1 = new Corpus(); + if (throwsException) + { + Assert.Throws(() => + { + corpus1 = new Corpus + { + Id = "corpusId1", + SourceLanguage = "en", + TargetLanguage = "es", + PretranslateAll = false, + TrainOnAll = false, + PretranslateChapters = parser + .GetChapters(pretranslateBiblicalRangeChapters) + .Select(kvp => (kvp.Key, kvp.Value.ToHashSet())) + 
+ .ToDictionary(), + TrainOnChapters = parser + .GetChapters(trainOnBiblicalRangeChapters) + .Select(kvp => (kvp.Key, kvp.Value.ToHashSet())) + .ToDictionary(), + PretranslateTextIds = new HashSet(), + TrainOnTextIds = new HashSet(), + SourceFiles = new List + { + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = Path.Combine(Path.GetTempPath(), "Project.zip") + } + }, + TargetFiles = new List + { + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = Path.Combine(Path.GetTempPath(), "Project2.zip") + } + } + }; + }); + return; + } + else + { + corpus1 = new Corpus + { + Id = "corpusId1", + SourceLanguage = "en", + TargetLanguage = "es", + PretranslateAll = false, + TrainOnAll = false, + PretranslateChapters = parser + .GetChapters(pretranslateBiblicalRangeChapters) + .Select(kvp => (kvp.Key, kvp.Value.ToHashSet())) + .ToDictionary(), + TrainOnChapters = parser + .GetChapters(trainOnBiblicalRangeChapters) + .Select(kvp => (kvp.Key, kvp.Value.ToHashSet())) + .ToDictionary(), + PretranslateTextIds = new HashSet(), + TrainOnTextIds = new HashSet(), + SourceFiles = new List + { + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = Path.Combine(Path.GetTempPath(), "Project.zip") + } + }, + TargetFiles = new List + { + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = Path.Combine(Path.GetTempPath(), "Project2.zip") + } + } + }; + } + var corpora = new ReadOnlyList(new List { corpus1 }); + await env.BuildJob.RunAsync("engine1", "build1", corpora, "{\"use_key_terms\":false}", default); + using (var stream = await env.SharedFileService.OpenReadAsync("builds/build1/train.src.txt")) + { + using (var reader = new StreamReader(stream)) + { + //Split yields one more segment than there are new lines; thus, the "- 1" + string text = reader.ReadToEnd(); + Assert.That(text.Split("\n").Length - 1, Is.EqualTo(numLinesWrittenToTrain), text); + } + } 
+ using (var stream = await env.SharedFileService.OpenReadAsync("builds/build1/pretranslate.src.json")) + { + using (var reader = new StreamReader(stream)) + { + JsonArray? pretranslationJsonObject = JsonSerializer.Deserialize(reader.ReadToEnd()); + Assert.NotNull(pretranslationJsonObject); + Assert.That( + pretranslationJsonObject!.ToList().Count, + Is.EqualTo(numEntriesWrittenToPretranslate), + JsonSerializer.Serialize(pretranslationJsonObject) + ); + } + } + } + private class TestEnvironment : DisposableBase { public ISharedFileService SharedFileService { get; } diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/41MATTen.SFM b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/41MATTen.SFM new file mode 100644 index 00000000..83a1f679 --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/41MATTen.SFM @@ -0,0 +1,40 @@ +\id MAT - Test +\h Matthew +\mt Matthew +\ip An introduction to Matthew +\c 1 +\s Chapter One +\v 1 Chapter \pn one\+pro WON\+pro*\pn*, verse one.\f + \fr 1:1: \ft This is a footnote.\f* +\li1 +\v 2 \bd C\bd*hapter one, +\li2 verse\f + \fr 1:2: \ft This is a footnote.\f* two. +\v 3 Chapter one, +\li2 verse three. +\v 4 Chapter one,  +\li2 verse four, +\v 5 Chapter one, +\li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five. +\c 2 +\s1 Chapter Two +\p +\v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. +\v 2-3 Chapter two, verse \fm ∆\fm*two. +\v 3-4a Chapter two, verse \w three|lemma\w*. +\v 4b Chapter two, verse four. +\p +\v 6 Chapter two, verse \w six|strong="12345" \w*. +\v 6 Bad verse. +\v 5 Chapter two, verse five \rq (MAT 3:1)\rq*. +\v 7a Chapter two, verse seven A, +\s Section header +\p +\v 7b verse seven B. 
+\p +\v 8 This is a list: +\b +\tr \tc1 +\v 9 Chapter\tcr2 2\tc3 verse\tcr4 9 +\tr \tc1-2 +\v 10 \tc3-4 Chapter 2 verse 10 +\v 11-12 +\restore restore information diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/42MRKTen.SFM b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/42MRKTen.SFM new file mode 100644 index 00000000..46000963 --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/42MRKTen.SFM @@ -0,0 +1,4 @@ +\id MRK - Test +\h Mark +\mt Mark +\ip An introduction to Mark \ No newline at end of file diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/ProjectBiblicalTerms.xml b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/ProjectBiblicalTerms.xml new file mode 100644 index 00000000..8bdbc4d2 --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/ProjectBiblicalTerms.xml @@ -0,0 +1,6 @@ + + + PN + Abba + + \ No newline at end of file diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/Settings.xml b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/Settings.xml new file mode 100644 index 00000000..268bde64 --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test2 + 65001 + T + + NFC + Ten + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Ten.SFM + Project:Ten:ProjectBiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/TermRenderings.xml b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/TermRenderings.xml new file mode 
100644 index 00000000..debd73df --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/TermRenderings.xml @@ -0,0 +1,9 @@ + + + Abba + + + + + + diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/custom.vrs b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/custom.vrs new file mode 100644 index 00000000..9c1cd387 --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57