Skip to content

Commit

Permalink
Chapter level aspnetcore (#167)
Browse files Browse the repository at this point in the history
* Added chapter-level filtering; fixes sillsdev/serval#150
  • Loading branch information
Enkidu93 authored Feb 7, 2024
1 parent 8ea1117 commit 59cbd46
Show file tree
Hide file tree
Showing 12 changed files with 287 additions and 7 deletions.
2 changes: 2 additions & 0 deletions src/SIL.Machine.AspNetCore/Models/Corpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ public class Corpus
public string TargetLanguage { get; set; } = default!;
public bool TrainOnAll { get; set; }
public bool PretranslateAll { get; set; }
public Dictionary<string, HashSet<int>>? TrainOnChapters { get; set; }
public Dictionary<string, HashSet<int>>? PretranslateChapters { get; set; }
public HashSet<string> TrainOnTextIds { get; set; } = default!;
public HashSet<string> PretranslateTextIds { get; set; } = default!;
public List<CorpusFile> SourceFiles { get; set; } = default!;
Expand Down
2 changes: 1 addition & 1 deletion src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
<PackageReference Include="Microsoft.AspNetCore.Mvc.NewtonsoftJson" Version="6.0.16" />
<PackageReference Include="Microsoft.Extensions.Http.Polly" Version="6.0.14" />
<PackageReference Include="Python.Included" Version="3.11.4" />
<PackageReference Include="Serval.Grpc" Version="0.14.0" Condition="!Exists('..\..\..\serval\src\Serval.Grpc\Serval.Grpc.csproj')" />
<PackageReference Include="Serval.Grpc" Version="0.15.0" Condition="!Exists('..\..\..\serval\src\Serval.Grpc\Serval.Grpc.csproj')" />
<PackageReference Include="SIL.DataAccess" Version="0.5.2" Condition="!Exists('..\..\..\serval\src\SIL.DataAccess\SIL.DataAccess.csproj')" />
<PackageReference Include="SIL.WritingSystems" Version="12.0.1" />
<PackageReference Include="System.Linq.Async" Version="6.0.1" />
Expand Down
29 changes: 23 additions & 6 deletions src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
using Google.Protobuf;
using MongoDB.Bson.IO;

namespace SIL.Machine.AspNetCore.Services;
namespace SIL.Machine.AspNetCore.Services;

public class NmtPreprocessBuildJob : HangfireBuildJob<IReadOnlyList<Corpus>>
{
Expand Down Expand Up @@ -129,14 +126,34 @@ async IAsyncEnumerable<Pretranslation> ProcessRowsAsync()

foreach (ParallelTextRow row in parallelCorpora.Flatten())
{
if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId))
bool isInTrainOnChapters = false;
bool isInPretranslateChapters = false;
if (targetCorpora[CorpusType.Text] is ScriptureTextCorpus stc)
{
// Returns true when rowRef is a VerseRef whose book is a key of bookChapters and
// whose chapter is selected for that book. An empty chapter set for a book matches
// every chapter of that book. Any reference that is not a VerseRef never matches.
bool IsInChapters(Dictionary<string, HashSet<int>> bookChapters, object rowRef)
{
    if (rowRef is VerseRef verseRef && bookChapters.TryGetValue(verseRef.Book, out HashSet<int>? selected))
    {
        // Check the empty-set case first; the outcome is the same in either order.
        return selected.Count == 0 || selected.Contains(verseRef.ChapterNum);
    }
    return false;
}
if (corpus.TrainOnChapters is not null)
isInTrainOnChapters = row.Refs.Any(r => IsInChapters(corpus.TrainOnChapters, r));
if (corpus.PretranslateChapters is not null)
isInPretranslateChapters = row.Refs.Any(r => IsInChapters(corpus.PretranslateChapters, r));
}
if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId) || isInTrainOnChapters)
{
await sourceTrainWriter.WriteAsync($"{row.SourceText}\n");
await targetTrainWriter.WriteAsync($"{row.TargetText}\n");
counts["NumTrainRows"] += 1;
}
if (
(corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId))
(
corpus.PretranslateAll
|| corpus.PretranslateTextIds.Contains(row.TextId)
|| isInPretranslateChapters
)
&& row.SourceSegment.Count > 0
&& row.TargetSegment.Count == 0
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,12 @@ private static Models.Corpus Map(Serval.Translation.V1.Corpus source)
TargetLanguage = source.TargetLanguage,
TrainOnAll = source.TrainOnAll,
PretranslateAll = source.PretranslateAll,
TrainOnChapters = source
.TrainOnChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToHashSet()))
.ToDictionary(),
PretranslateChapters = source
.PretranslateChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToHashSet()))
.ToDictionary(),
TrainOnTextIds = source.TrainOnTextIds.ToHashSet(),
PretranslateTextIds = source.PretranslateTextIds.ToHashSet(),
SourceFiles = source.SourceFiles.Select(Map).ToList(),
Expand Down
1 change: 1 addition & 0 deletions src/SIL.Machine.AspNetCore/Usings.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
global using System.Collections.Concurrent;
global using System.Data;
global using System.Diagnostics;
global using System.Diagnostics.CodeAnalysis;
global using System.IO.Compression;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,17 @@ public void SetUp()
Path.Combine("..", "..", "..", "Services", "data", "paratext"),
Path.Combine(Path.GetTempPath(), "Project.zip")
);
ZipFile.CreateFromDirectory(
Path.Combine("..", "..", "..", "Services", "data", "paratext2"),
Path.Combine(Path.GetTempPath(), "Project2.zip")
);
}

/// <summary>
/// Deletes the "Project.zip" and "Project2.zip" archives from the temp directory after each test.
/// </summary>
[TearDown]
public void TearDown()
{
    string tempDirectory = Path.GetTempPath();
    foreach (string archiveName in new[] { "Project.zip", "Project2.zip" })
    {
        // File.Delete does not throw when the file is already absent.
        File.Delete(Path.Combine(tempDirectory, archiveName));
    }
}

[Test]
Expand Down Expand Up @@ -143,6 +148,131 @@ int numEntriesWrittenToPretranslate
}
}

/// <summary>
/// Runs the preprocess build job with chapter-level train/pretranslate filters and checks
/// how many lines land in the training source file and how many entries land in the
/// pretranslation file.
/// </summary>
/// <param name="trainOnBiblicalRangeChapters">Scripture range passed to the parser to build the train-on filter.</param>
/// <param name="pretranslateBiblicalRangeChapters">Scripture range passed to the parser to build the pretranslate filter.</param>
/// <param name="numLinesWrittenToTrain">Expected number of lines in "builds/build1/train.src.txt".</param>
/// <param name="numEntriesWrittenToPretranslate">Expected number of entries in "builds/build1/pretranslate.src.json".</param>
/// <param name="throwsException">
/// When true, building the corpus (i.e. parsing the ranges) is expected to throw
/// <see cref="ArgumentException"/> and the build job is not run.
/// </param>
[Test]
[TestCase("MAT", "1CH", 23, 4)]
[TestCase("NT;LEV", "1CH", 25, 4)]
[TestCase("OT", "MRK", 10, 0)]
[TestCase("OT", "MLK", 0, 0, true)]
public async Task BuildJobTest_Chapterlevel(
    string trainOnBiblicalRangeChapters,
    string pretranslateBiblicalRangeChapters,
    int numLinesWrittenToTrain,
    int numEntriesWrittenToPretranslate,
    bool throwsException = false
)
{
    using var env = new TestEnvironment();
    var parser = new ScriptureRangeParser();

    // Single definition of the corpus under test; both the success path and the
    // expected-failure path build exactly the same object.
    Corpus CreateCorpus() =>
        new Corpus
        {
            Id = "corpusId1",
            SourceLanguage = "en",
            TargetLanguage = "es",
            PretranslateAll = false,
            TrainOnAll = false,
            PretranslateChapters = parser
                .GetChapters(pretranslateBiblicalRangeChapters)
                .Select(kvp => (kvp.Key, kvp.Value.ToHashSet()))
                .ToDictionary(),
            TrainOnChapters = parser
                .GetChapters(trainOnBiblicalRangeChapters)
                .Select(kvp => (kvp.Key, kvp.Value.ToHashSet()))
                .ToDictionary(),
            PretranslateTextIds = new HashSet<string>(),
            TrainOnTextIds = new HashSet<string>(),
            SourceFiles = new List<CorpusFile>
            {
                new CorpusFile
                {
                    TextId = "textId1",
                    Format = FileFormat.Paratext,
                    Location = Path.Combine(Path.GetTempPath(), "Project.zip")
                }
            },
            TargetFiles = new List<CorpusFile>
            {
                new CorpusFile
                {
                    TextId = "textId1",
                    Format = FileFormat.Paratext,
                    Location = Path.Combine(Path.GetTempPath(), "Project2.zip")
                }
            }
        };

    if (throwsException)
    {
        // The failure is expected while the chapter filters are being built, so
        // there is nothing to run afterwards.
        Assert.Throws<ArgumentException>(() => CreateCorpus());
        return;
    }

    var corpora = new ReadOnlyList<Corpus>(new List<Corpus> { CreateCorpus() });
    await env.BuildJob.RunAsync("engine1", "build1", corpora, "{\"use_key_terms\":false}", default);

    // Each stream is scoped to its own block so it is closed before the next one is opened.
    using (var stream = await env.SharedFileService.OpenReadAsync("builds/build1/train.src.txt"))
    using (var reader = new StreamReader(stream))
    {
        // Split yields one more segment than there are newlines; thus, the "- 1"
        string text = await reader.ReadToEndAsync();
        Assert.That(text.Split("\n").Length - 1, Is.EqualTo(numLinesWrittenToTrain), text);
    }

    using (var stream = await env.SharedFileService.OpenReadAsync("builds/build1/pretranslate.src.json"))
    using (var reader = new StreamReader(stream))
    {
        JsonArray? pretranslationJsonObject = JsonSerializer.Deserialize<JsonArray>(
            await reader.ReadToEndAsync()
        );
        Assert.NotNull(pretranslationJsonObject);
        Assert.That(
            pretranslationJsonObject!.Count,
            Is.EqualTo(numEntriesWrittenToPretranslate),
            JsonSerializer.Serialize(pretranslationJsonObject)
        );
    }
}

private class TestEnvironment : DisposableBase
{
public ISharedFileService SharedFileService { get; }
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
\id MAT - Test
\h Matthew
\mt Matthew
\ip An introduction to Matthew
\c 1
\s Chapter One
\v 1 Chapter \pn one\+pro WON\+pro*\pn*, verse one.\f + \fr 1:1: \ft This is a footnote.\f*
\li1
\v 2 \bd C\bd*hapter one,
\li2 verse\f + \fr 1:2: \ft This is a footnote.\f* two.
\v 3 Chapter one,
\li2 verse three.
\v 4 Chapter one, 
\li2 verse four,
\v 5 Chapter one,
\li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five.
\c 2
\s1 Chapter Two
\p
\v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one.
\v 2-3 Chapter two, verse \fm ∆\fm*two.
\v 3-4a Chapter two, verse \w three|lemma\w*.
\v 4b Chapter two, verse four.
\p
\v 6 Chapter two, verse \w six|strong="12345" \w*.
\v 6 Bad verse.
\v 5 Chapter two, verse five \rq (MAT 3:1)\rq*.
\v 7a Chapter two, verse seven A,
\s Section header
\p
\v 7b verse seven B.
\p
\v 8 This is a list:
\b
\tr \tc1
\v 9 Chapter\tcr2 2\tc3 verse\tcr4 9
\tr \tc1-2
\v 10 \tc3-4 Chapter 2 verse 10
\v 11-12
\restore restore information
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
\id MRK - Test
\h Mark
\mt Mark
\ip An introduction to Mark
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<BiblicalTermsList>
<Term Id="Abba">
<Category>PN</Category>
<Gloss>Abba</Gloss>
</Term>
</BiblicalTermsList>
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<ScriptureText>
<StyleSheet>usfm.sty</StyleSheet>
<Versification>4</Versification>
<LanguageIsoCode>en:::</LanguageIsoCode>
<Language>English</Language>
<MinParatextVersion>8.0.100.76</MinParatextVersion>
<FullName>Test2</FullName>
<Encoding>65001</Encoding>
<Editable>T</Editable>
<Copyright />
<NormalizationForm>NFC</NormalizationForm>
<Name>Ten</Name>
<Guid>a7e0b3ce0200736062f9f810a444dbfbe64aca35</Guid>
<DefaultFont>Charis SIL</DefaultFont>
<DefaultFontSize>12</DefaultFontSize>
<FontFeatures />
<HtmlLanguage />
<AssociatedLexicalProject />
<FileNameBookNameForm>41MAT</FileNameBookNameForm>
<FileNamePrePart />
<FileNamePostPart>Ten.SFM</FileNamePostPart>
<BiblicalTermsListSetting>Project:Ten:ProjectBiblicalTerms.xml</BiblicalTermsListSetting>
<MatchBasedOnStems>F</MatchBasedOnStems>
<AllowReadAccess>F</AllowReadAccess>
<AllowSharingWithSLDR>F</AllowSharingWithSLDR>
<Visibility>Public</Visibility>
<TranslationInfo>Standard::</TranslationInfo>
<EncodingConverter />
<UsfmVersion>3</UsfmVersion>
<ParallelPassagesBooks>000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000</ParallelPassagesBooks>
<BooksPresent>000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000</BooksPresent>
<BibleModuleAssociations />
<Naming PrePart="" PostPart="Ten.SFM" BookNameForm="41MAT" />
</ScriptureText>
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<TermRenderingsList>
<TermRendering Id="Abba" Guess="false">
<Renderings>Abba</Renderings>
<Glossary />
<Changes />
<Notes />
<Denials />
</TermRendering>
</TermRenderingsList>
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# custom.vrs

LEV 14:56
ROM 14:26
REV 12:17
TOB 5:22
TOB 10:12
SIR 23:28
ESG 1:22
ESG 3:15
ESG 5:14
ESG 8:17
ESG 10:14
SIR 33:33
SIR 41:24
BAR 1:22
4MA 7:25
4MA 12:20

# deliberately missing verses
-ROM 16:26
-ROM 16:27
-3JN 1:15
-S3Y 1:49
-ESG 4:6
-ESG 9:5
-ESG 9:30

LEV 14:55 = LEV 14:55
LEV 14:55 = LEV 14:56
LEV 14:56 = LEV 14:57

0 comments on commit 59cbd46

Please sign in to comment.