Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move preprocess logic to toolkit #512

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions Serval.sln
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C3A14577-A65
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit", "src\ServiceToolkit\src\SIL.ServiceToolkit\SIL.ServiceToolkit.csproj", "{0E40F959-C641-40A2-9750-B17A4F9F9E55}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{E41916A7-B9AA-45BE-BCFF-656722FEEA84}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ServiceToolkit", "ServiceToolkit", "{A4DA43D4-29BC-4164-A114-E1775B2C9573}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{5C42D20E-8DFC-4221-BA97-62D9E5742349}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit\SIL.ServiceToolkit.Tests.csproj", "{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -180,6 +188,10 @@ Global
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Debug|Any CPU.Build.0 = Debug|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.ActiveCfg = Release|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.Build.0 = Release|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -215,6 +227,9 @@ Global
{10657805-48F1-4205-B8F5-79447F6EF620} = {25CDB05B-4E24-4A6E-933E-1E0BEC97D74D}
{C3A14577-A654-4604-818C-4E683DD45A51} = {EA69B41C-49EF-4017-A687-44B9DF37FF98}
{0E40F959-C641-40A2-9750-B17A4F9F9E55} = {C3A14577-A654-4604-818C-4E683DD45A51}
{A4DA43D4-29BC-4164-A114-E1775B2C9573} = {E41916A7-B9AA-45BE-BCFF-656722FEEA84}
{5C42D20E-8DFC-4221-BA97-62D9E5742349} = {A4DA43D4-29BC-4164-A114-E1775B2C9573}
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1} = {5C42D20E-8DFC-4221-BA97-62D9E5742349}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {9F18C25E-E140-43C3-B177-D562E1628370}
Expand Down
230 changes: 85 additions & 145 deletions src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,154 +80,25 @@ await client.BuildStartedAsync(
client.InsertPretranslations(cancellationToken: cancellationToken)
)
{
foreach (ParallelCorpus corpus in request.Corpora)
{
var sourceFiles = corpus
.SourceCorpora.SelectMany(sc =>
sc.Files.Where(f =>
(
sc.PretranslateAll
|| sc.PretranslateTextIds is null
|| sc.PretranslateTextIds.Contains(f.TextId)
)
&& f.Format == FileFormat.Text
)
)
.ToDictionary(f => f.TextId, f => f.Location);
var targetFiles = corpus
.TargetCorpora.SelectMany(tc =>
tc.Files.Where(f =>
(
tc.PretranslateAll
|| tc.PretranslateTextIds is null
|| tc.PretranslateTextIds.Contains(f.TextId)
)
&& f.Format == FileFormat.Text
)
)
.ToDictionary(f => f.TextId, f => f.Location);

foreach (KeyValuePair<string, string> sourceFile in sourceFiles)
ParallelCorpusPreprocessor.PreprocessCorpora(
request.Corpora.Select(Map).ToList(),
row => { },
async (row, corpus) =>
{
string[] sourceLines = await File.ReadAllLinesAsync(
sourceFile.Value,
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = row.TextId,
Refs = { row.Refs.Select(r => r.ToString()) },
Translation = row.SourceSegment
},
cancellationToken
);

if (targetFiles.TryGetValue(sourceFile.Key, out string? targetPath))
{
string[] targetLines = await File.ReadAllLinesAsync(targetPath, cancellationToken);
bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/');
if (!isTabSeparated)
{
int lineNum = 1;
foreach (
(string sourceLine, string targetLine) in sourceLines
.Select(l => l.Trim())
.Zip(targetLines.Select(l => l.Trim()))
)
{
if (sourceLine.Length > 0 && targetLine.Length == 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{lineNum}" },
Translation = sourceLine
},
cancellationToken
);
}
lineNum++;
}
}
else
{
var sourceLinesDict = sourceLines.ToDictionary(
l => l.Split('\t')[0].Trim(),
l => l.Split('\t')[1].Trim()
);
var targetLinesDict = targetLines.ToDictionary(
l => l.Split('\t')[0].Trim(),
l => l.Contains('\t') ? l.Split('\t')[1].Trim() : string.Empty
);
foreach (KeyValuePair<string, string> targetLineKVPair in targetLinesDict)
{
string? sourceLine = null;
sourceLinesDict.TryGetValue(targetLineKVPair.Key, out sourceLine);
sourceLine ??= string.Empty;
string? targetLine = targetLineKVPair.Value;
if (sourceLine.Length > 0 && targetLine.Length == 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{targetLineKVPair.Key}" },
Translation = sourceLine
},
cancellationToken
);
}
}
}
}
else
{
bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/');
if (!isTabSeparated)
{
int lineNum = 1;
foreach (string sourceLine in sourceLines.Select(l => l.Trim()))
{
if (sourceLine.Length > 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{lineNum}" },
Translation = sourceLine
},
cancellationToken
);
}
lineNum++;
}
}
else
{
foreach (string sourceLine in sourceLines.Select(l => l.Trim()))
{
if (sourceLine.Length > 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{sourceLine.Split('\t')[0]}" },
Translation = sourceLine.Contains('\t')
? sourceLine.Split('\t')[1].Trim()
: string.Empty
},
cancellationToken
);
}
}
}
}
}
}

},
false
);
await call.RequestStream.CompleteAsync();
await call;
}
Expand Down Expand Up @@ -325,4 +196,73 @@ ServerCallContext context
new GetLanguageInfoResponse { InternalCode = request.Language + "_echo", IsNative = true, }
);
}

/// <summary>
/// Converts a request <see cref="ParallelCorpus"/> into the equivalent
/// <see cref="SIL.ServiceToolkit.Models.ParallelCorpus"/>, mapping every source and
/// target monolingual corpus it contains.
/// </summary>
/// <param name="source">The parallel corpus taken from the build request.</param>
/// <returns>A toolkit parallel corpus with the same id and mapped corpora lists.</returns>
private static SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source)
{
    // Each side is materialized up front so the result does not hold lazy queries
    // over the request object.
    var mappedSources = source.SourceCorpora.Select(Map).ToList();
    var mappedTargets = source.TargetCorpora.Select(Map).ToList();

    return new SIL.ServiceToolkit.Models.ParallelCorpus
    {
        Id = source.Id,
        SourceCorpora = mappedSources,
        TargetCorpora = mappedTargets
    };
}

/// <summary>
/// Converts a request <see cref="MonolingualCorpus"/> into the equivalent
/// <see cref="SIL.ServiceToolkit.Models.MonolingualCorpus"/>.
/// </summary>
/// <remarks>
/// For training and for pretranslation, at most one of the chapter filter and the
/// text-id filter is populated on the result; the other is set to <c>null</c>.
/// When the request sets <c>PretranslateAll</c>, both pretranslate filters are left
/// <c>null</c> so that no pretranslation filter is applied.
/// </remarks>
/// <param name="source">The monolingual corpus taken from the build request.</param>
/// <returns>A toolkit monolingual corpus with mapped files and filters.</returns>
private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source)
{
    var trainOnChapters = source.TrainOnChapters.ToDictionary(
        kvp => kvp.Key,
        kvp => kvp.Value.Chapters.ToHashSet()
    );
    var trainOnTextIds = source.TrainOnTextIds.ToHashSet();
    // NOTE(review): if the request type also exposes a "train on all" flag, it should
    // probably be honored the same way PretranslateAll is below - confirm against the
    // message definition.
    FilterChoice trainingFilter = GetFilterChoice(trainOnChapters, trainOnTextIds);

    var pretranslateChapters = source.PretranslateChapters.ToDictionary(
        kvp => kvp.Key,
        kvp => kvp.Value.Chapters.ToHashSet()
    );
    var pretranslateTextIds = source.PretranslateTextIds.ToHashSet();
    // ToDictionary/ToHashSet never return null, so GetFilterChoice alone can never
    // yield FilterChoice.None here: an unfiltered corpus would be mapped to an empty
    // text-id set instead of "no filter". Honor the explicit PretranslateAll flag so
    // that such a corpus keeps both pretranslate filters null.
    FilterChoice pretranslateFilter = source.PretranslateAll
        ? FilterChoice.None
        : GetFilterChoice(pretranslateChapters, pretranslateTextIds);

    return new SIL.ServiceToolkit.Models.MonolingualCorpus
    {
        Id = source.Id,
        Language = source.Language,
        Files = source.Files.Select(Map).ToList(),
        TrainOnChapters = trainingFilter == FilterChoice.Chapters ? trainOnChapters : null,
        TrainOnTextIds = trainingFilter == FilterChoice.TextIds ? trainOnTextIds : null,
        PretranslateChapters = pretranslateFilter == FilterChoice.Chapters ? pretranslateChapters : null,
        PretranslateTextIds = pretranslateFilter == FilterChoice.TextIds ? pretranslateTextIds : null
    };
}

/// <summary>
/// Converts a request <see cref="CorpusFile"/> into the equivalent
/// <see cref="SIL.ServiceToolkit.Models.CorpusFile"/>.
/// </summary>
/// <param name="source">The corpus file taken from the build request.</param>
/// <returns>A toolkit corpus file with the same location, format and text id.</returns>
private static SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source) =>
    new SIL.ServiceToolkit.Models.CorpusFile
    {
        TextId = source.TextId,
        // Direct enum-to-enum cast: the format is carried over by its underlying value.
        Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format,
        Location = source.Location
    };

// Identifies which of the two mutually exclusive corpus filters (chapters or text ids)
// should be populated on a mapped monolingual corpus, as decided by GetFilterChoice.
private enum FilterChoice
{
// Use the chapter dictionary; chosen when it is non-null and has at least one entry.
Chapters,
// Use the text-id set; chosen when the chapter dictionary is null or empty
// (unless both inputs are null).
TextIds,
// Use neither filter; chosen when both the chapter dictionary and the text-id set are null.
None
}

/// <summary>
/// Decides which filter applies given a chapter dictionary and a text-id set.
/// </summary>
/// <param name="chapters">Chapter filter keyed by string id; may be null or empty.</param>
/// <param name="textIds">Text-id filter; may be null or empty.</param>
/// <returns>
/// <see cref="FilterChoice.Chapters"/> when <paramref name="chapters"/> has entries;
/// <see cref="FilterChoice.None"/> when both arguments are null;
/// otherwise <see cref="FilterChoice.TextIds"/>.
/// </returns>
private static FilterChoice GetFilterChoice(
    IReadOnlyDictionary<string, HashSet<int>> chapters,
    HashSet<string> textIds
)
{
    // Only one of the two filters is ever used at a time. A non-empty chapter
    // dictionary always wins.
    bool hasChapterEntries = chapters is not null && chapters.Count != 0;
    if (hasChapterEntries)
        return FilterChoice.Chapters;

    // No chapter entries from here on. The text-id set may legitimately be empty, so it
    // is preferred whenever anything at all was supplied; only "both null" means no filter.
    bool nothingSupplied = chapters is null && textIds is null;
    return nothingSupplied ? FilterChoice.None : FilterChoice.TextIds;
}
}
1 change: 1 addition & 0 deletions src/Echo/src/EchoTranslationEngine/Usings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
global using Grpc.Core;
global using Microsoft.Extensions.Diagnostics.HealthChecks;
global using Serval.Translation.V1;
global using SIL.ServiceToolkit.Utils;
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf
services.AddTransient<IFileSystem, FileSystem>();

services.AddScoped<IDistributedReaderWriterLockFactory, DistributedReaderWriterLockFactory>();
services.AddSingleton<ICorpusService, CorpusService>();
services.AddStartupTask(
(sp, cancellationToken) =>
sp.GetRequiredService<IDistributedReaderWriterLockFactory>().InitAsync(cancellationToken)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,8 @@ public class NmtPreprocessBuildJob(
ILogger<NmtPreprocessBuildJob> logger,
IBuildJobService buildJobService,
ISharedFileService sharedFileService,
ICorpusService corpusService,
ILanguageTagService languageTagService
)
: PreprocessBuildJob(
platformService,
engines,
dataAccessContext,
logger,
buildJobService,
sharedFileService,
corpusService
)
) : PreprocessBuildJob(platformService, engines, dataAccessContext, logger, buildJobService, sharedFileService)
{
private readonly ILanguageTagService _languageTagService = languageTagService;

Expand Down
Loading
Loading