Skip to content

Commit

Permalink
Fix bug with pretranslating all; begin porting tests to toolkit
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Oct 17, 2024
1 parent 293f31a commit 60d4084
Show file tree
Hide file tree
Showing 10 changed files with 220 additions and 31 deletions.
15 changes: 15 additions & 0 deletions Serval.sln
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C3A14577-A65
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit", "src\ServiceToolkit\src\SIL.ServiceToolkit\SIL.ServiceToolkit.csproj", "{0E40F959-C641-40A2-9750-B17A4F9F9E55}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{E41916A7-B9AA-45BE-BCFF-656722FEEA84}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ServiceToolkit", "ServiceToolkit", "{A4DA43D4-29BC-4164-A114-E1775B2C9573}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{5C42D20E-8DFC-4221-BA97-62D9E5742349}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit\SIL.ServiceToolkit.Tests.csproj", "{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -180,6 +188,10 @@ Global
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Debug|Any CPU.Build.0 = Debug|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.ActiveCfg = Release|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.Build.0 = Release|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -215,6 +227,9 @@ Global
{10657805-48F1-4205-B8F5-79447F6EF620} = {25CDB05B-4E24-4A6E-933E-1E0BEC97D74D}
{C3A14577-A654-4604-818C-4E683DD45A51} = {EA69B41C-49EF-4017-A687-44B9DF37FF98}
{0E40F959-C641-40A2-9750-B17A4F9F9E55} = {C3A14577-A654-4604-818C-4E683DD45A51}
{A4DA43D4-29BC-4164-A114-E1775B2C9573} = {E41916A7-B9AA-45BE-BCFF-656722FEEA84}
{5C42D20E-8DFC-4221-BA97-62D9E5742349} = {A4DA43D4-29BC-4164-A114-E1775B2C9573}
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1} = {5C42D20E-8DFC-4221-BA97-62D9E5742349}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {9F18C25E-E140-43C3-B177-D562E1628370}
Expand Down
34 changes: 14 additions & 20 deletions src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using Nito.AsyncEx.Synchronous;

namespace EchoTranslationEngine;
namespace EchoTranslationEngine;

public class TranslationEngineServiceV1(BackgroundTaskQueue taskQueue) : TranslationEngineApi.TranslationEngineApiBase
{
Expand Down Expand Up @@ -85,25 +83,21 @@ await client.BuildStartedAsync(
ParallelCorpusPreprocessor.PreprocessCorpora(
request.Corpora.Select(Map).ToList(),
row => { },
(row, corpus) =>
async (row, corpus) =>
{
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0)
{
call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = row.TextId,
Refs = { row.Refs.Select(r => r.ToString()) },
Translation = row.SourceSegment
},
cancellationToken
)
.WaitAndUnwrapException();
}
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = row.TextId,
Refs = { row.Refs.Select(r => r.ToString()) },
Translation = row.SourceSegment
},
cancellationToken
);
},
true
false
);
await call.RequestStream.CompleteAsync();
await call;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,23 @@ public async Task RunAsync_PretranslateTextIds()
Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(4));
}

[Test]
public async Task RunAsync_PretranslateTextIdsOverlapWithTrainOnTextIds()
{
    // The same text id is requested both for training and for pretranslation.
    using TestEnvironment env = new();
    ParallelCorpus corpus1 = TestEnvironment.TextFileCorpus(
        pretranslateTextIds: ["textId1"],
        trainOnTextIds: ["textId1"]
    );

    await env.RunBuildJobAsync(corpus1);

    // Await the results before entering the multiple-assert scope so that
    // Assert.Multiple receives a synchronous delegate instead of blocking on an
    // async lambda from inside an async test.
    var trainCounts = await env.GetTrainCountAsync();
    var pretranslateCount = await env.GetPretranslateCountAsync();

    Assert.Multiple(() =>
    {
        // The overlapping text is expected to be used for training only.
        Assert.That(trainCounts.Source1Count, Is.EqualTo(4));
        Assert.That(pretranslateCount, Is.EqualTo(0));
    });
}

[Test]
public async Task RunAsync_EnableKeyTerms()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,12 @@ row.Ref is not ScriptureRef sr
{
ITextCorpus textCorpus = sc.TextCorpus;
if (sc.Corpus.PretranslateTextIds is not null)
textCorpus = textCorpus.FilterTexts(sc.Corpus.PretranslateTextIds);
{
return textCorpus.FilterTexts(
sc.Corpus.PretranslateTextIds.Except(sc.Corpus.TrainOnTextIds ?? new())
);
}
return textCorpus.Where(row =>
row.Ref is not ScriptureRef sr
|| sc.Corpus.PretranslateChapters is null
Expand Down Expand Up @@ -154,15 +159,14 @@ row.Ref is not ScriptureRef sr
}
}

ITextCorpus textCorpus =
targetCorpora.Length > 0 ? targetCorpora[0].TextCorpus : new DictionaryTextCorpus();

foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, textCorpus))
foreach (
Row row in AlignPretranslateCorpus(
sourcePretranslateCorpora,
targetCorpora.Select(tc => tc.TextCorpus).ToArray()
)
)
{
if (row.SourceSegment.Length > 0)
{
pretranslate(row, corpus);
}
pretranslate(row, corpus);
}
}
}
Expand Down Expand Up @@ -295,15 +299,23 @@ IReadOnlyList<ITextCorpus> trgCorpora
}
}

private static IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus[] srcCorpora, ITextCorpus trgCorpus)
private static IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus[] srcCorpora, ITextCorpus[] trgCorpora)
{
int rowCount = 0;
StringBuilder srcSegBuffer = new();
StringBuilder trgSegBuffer = new();
List<object> refs = [];
string textId = "";
foreach (ParallelTextRow row in srcCorpora.SelectMany(sc => sc.AlignRows(trgCorpus, allSourceRows: true)))
foreach (
ParallelTextRow? row in srcCorpora
.SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allSourceRows: true)))
.ZipMany(rows =>
rows.Where(r => r.SourceSegment.Count > 0 && r.TargetSegment.Count == 0).FirstOrDefault()
)
)
{
if (row is null)
continue;
if (!row.IsTargetRangeStart && row.IsTargetInRange)
{
refs.AddRange(row.TargetRefs);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<Project Sdk="Microsoft.NET.Sdk">

<!-- Project file for SIL.ServiceToolkit.Tests (targets net8.0 with implicit usings and nullable reference types enabled). -->
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<!-- Test-time package dependencies. Entries marked PrivateAssets="all" are not exposed to projects that reference this one. -->
<ItemGroup>
<PackageReference Include="coverlet.collector" Version="6.0.0">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.8.0" />
<PackageReference Include="NSubstitute" Version="5.1.0" />
<PackageReference Include="NSubstitute.Analyzers.CSharp" Version="1.0.16">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="NUnit" Version="4.0.1" />
<PackageReference Include="NUnit3TestAdapter" Version="4.5.0" />
<PackageReference Include="NUnit.Analyzers" Version="4.0.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

<!-- The library under test, referenced by a path relative to this project's directory. -->
<ItemGroup>
<ProjectReference Include="../../src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj"/>
</ItemGroup>

</Project>
2 changes: 2 additions & 0 deletions src/ServiceToolkit/test/SIL.ServiceToolkit/Usings.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
global using NUnit.Framework;
global using SIL.ServiceToolkit.Models;
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
namespace SIL.ServiceToolkit.Utils;

[TestFixture]
public class ParallelCorpusPreprocessorTests
{
    // Folder containing the fixture text files: three directories above the test
    // binaries' base directory, then Utils/data.
    private static readonly string TestDataPath = Path.Combine(
        AppContext.BaseDirectory,
        "..",
        "..",
        "..",
        "Utils",
        "data"
    );

    // Builds an English, single-file, plain-text monolingual corpus whose only file
    // is registered under the text id "textId1".
    private static MonolingualCorpus EnglishTextCorpus(string id, string fileName) =>
        new MonolingualCorpus()
        {
            Id = id,
            Language = "en",
            Files = new List<CorpusFile>
            {
                new()
                {
                    TextId = "textId1",
                    Format = FileFormat.Text,
                    Location = Path.Combine(TestDataPath, fileName)
                }
            }
        };

    // Preprocesses one parallel corpus made of two source corpora and one target
    // corpus, counting the rows reported to the train and pretranslate callbacks.
    [Test]
    public void TestParallelCorpusPreprocessor()
    {
        ParallelCorpusPreprocessor processor = new();
        List<ParallelCorpus> corpora =
            new()
            {
                new()
                {
                    Id = "corpus1",
                    SourceCorpora = new List<MonolingualCorpus>
                    {
                        EnglishTextCorpus("source-corpus1", "source1.txt"),
                        EnglishTextCorpus("source-corpus2", "source2.txt")
                    },
                    TargetCorpora = new List<MonolingualCorpus>
                    {
                        EnglishTextCorpus("target-corpus1", "target1.txt")
                    }
                }
            };

        int rowsWithBothSides = 0;
        int pretranslatedRows = 0;

        processor.Preprocess(
            corpora,
            row =>
            {
                // Only rows that carry both a source and a target segment are counted.
                if (row.SourceSegment.Length == 0 || row.TargetSegment.Length == 0)
                    return;
                rowsWithBothSides++;
            },
            (_, _) =>
            {
                pretranslatedRows++;
            },
            false
        );

        // NOTE(review): presumably lines 1 and 4 of the fixtures supply the two
        // training rows and lines 2, 5 and 6 the three pretranslation rows; confirm
        // against the fixture files.
        Assert.Multiple(() =>
        {
            Assert.That(rowsWithBothSides, Is.EqualTo(2));
            Assert.That(pretranslatedRows, Is.EqualTo(3));
        });
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Source one, Line 1
Source one, Line 2

Source one, Line 4

Source one, Line 6

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Source two, Line 1
Source two, Line 2

Source two, Line 4
Source two, Line 5
Source two, Line 6

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Target one, Line 1


Target one, Line 4


Target one, Line 7

0 comments on commit 60d4084

Please sign in to comment.