Skip to content

Commit

Permalink
Added chapter-level filtering; fixes #150
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Jan 26, 2024
1 parent d78c8ed commit 20d7b6a
Show file tree
Hide file tree
Showing 11 changed files with 76 additions and 42 deletions.
12 changes: 12 additions & 0 deletions src/Serval.Client/Client.g.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5241,6 +5241,9 @@ public partial class TrainingCorpus
[Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.Collections.Generic.IList<string>? TextIds { get; set; } = default!;

[Newtonsoft.Json.JsonProperty("biblicalRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public string? BiblicalRange { get; set; } = default!;

}

[System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.0.2.0 (NJsonSchema v11.0.0.0 (Newtonsoft.Json v13.0.0.0))")]
Expand All @@ -5253,6 +5256,9 @@ public partial class PretranslateCorpus
[Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.Collections.Generic.IList<string>? TextIds { get; set; } = default!;

[Newtonsoft.Json.JsonProperty("biblicalRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public string? BiblicalRange { get; set; } = default!;

}

[System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.0.2.0 (NJsonSchema v11.0.0.0 (Newtonsoft.Json v13.0.0.0))")]
Expand Down Expand Up @@ -5303,6 +5309,9 @@ public partial class TrainingCorpusConfig
[Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.Collections.Generic.IList<string>? TextIds { get; set; } = default!;

[Newtonsoft.Json.JsonProperty("biblicalRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public string? BiblicalRange { get; set; } = default!;

}

[System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.0.2.0 (NJsonSchema v11.0.0.0 (Newtonsoft.Json v13.0.0.0))")]
Expand All @@ -5315,6 +5324,9 @@ public partial class PretranslateCorpusConfig
[Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.Collections.Generic.IList<string>? TextIds { get; set; } = default!;

[Newtonsoft.Json.JsonProperty("biblicalRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public string? BiblicalRange { get; set; } = default!;

}

[System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.0.2.0 (NJsonSchema v11.0.0.0 (Newtonsoft.Json v13.0.0.0))")]
Expand Down
10 changes: 6 additions & 4 deletions src/Serval.Grpc/Protos/serval/translation/v1/engine.proto
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,12 @@ message Corpus {
string target_language = 3;
bool train_on_all = 4;
bool pretranslate_all = 5;
repeated string train_on_text_ids = 6;
repeated string pretranslate_text_ids = 7;
repeated CorpusFile source_files = 8;
repeated CorpusFile target_files = 9;
string train_on_biblical_range = 6;
string pretranslate_biblical_range = 7;
repeated string train_on_text_ids = 8;
repeated string pretranslate_text_ids = 9;
repeated CorpusFile source_files = 10;
repeated CorpusFile target_files = 11;
}

message CorpusFile {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ public class PretranslateCorpusConfigDto
public string CorpusId { get; set; } = default!;

public IList<string>? TextIds { get; set; }

/// <summary>
/// Optional biblical range for this corpus. May be left null.
/// The build-config mapping in TranslationEnginesController throws an
/// InvalidOperationException when both this and <see cref="TextIds"/> are non-null.
/// </summary>
public string? BiblicalRange { get; set; }
}
2 changes: 2 additions & 0 deletions src/Serval.Translation/Contracts/PretranslateCorpusDto.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ public class PretranslateCorpusDto
public ResourceLinkDto Corpus { get; set; } = default!;

public IList<string>? TextIds { get; set; }

public string? BiblicalRange { get; set; }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ public class TrainingCorpusConfigDto
{
public string CorpusId { get; set; } = default!;
public IList<string>? TextIds { get; set; }
public string? BiblicalRange { get; set; }
}
2 changes: 2 additions & 0 deletions src/Serval.Translation/Contracts/TrainingCorpusDto.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ public class TrainingCorpusDto
public ResourceLinkDto Corpus { get; set; } = default!;

public IList<string>? TextIds { get; set; }

public string? BiblicalRange { get; set; }
}
39 changes: 32 additions & 7 deletions src/Serval.Translation/Controllers/TranslationEnginesController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1013,10 +1013,20 @@ private static Build Map(Engine engine, TranslationBuildConfigDto source)
foreach (PretranslateCorpusConfigDto ptcc in source.Pretranslate)
{
if (!corpusIds.Contains(ptcc.CorpusId))
throw new InvalidOperationException($"The corpus {ptcc.CorpusId} is not valid.");

throw new InvalidOperationException(
$"The corpus {ptcc.CorpusId} is not valid: This corpus does not exist for engine {engine.Id}."
);
if (ptcc.TextIds != null && ptcc.BiblicalRange != null)
throw new InvalidOperationException(
$"The corpus {ptcc.CorpusId} is not valid: Set exactly one of TextIds and BiblicalRange."
);
pretranslateCorpora.Add(
new PretranslateCorpus { CorpusRef = ptcc.CorpusId, TextIds = ptcc.TextIds?.ToList() }
new PretranslateCorpus
{
CorpusRef = ptcc.CorpusId,
TextIds = ptcc.TextIds?.ToList(),
BiblicalRange = ptcc.BiblicalRange
}
);
}
build.Pretranslate = pretranslateCorpora;
Expand All @@ -1027,8 +1037,21 @@ private static Build Map(Engine engine, TranslationBuildConfigDto source)
foreach (TrainingCorpusConfigDto tcc in source.TrainOn)
{
if (!corpusIds.Contains(tcc.CorpusId))
throw new InvalidOperationException($"The corpus {tcc.CorpusId} is not valid.");
trainOnCorpora.Add(new TrainingCorpus { CorpusRef = tcc.CorpusId, TextIds = tcc.TextIds?.ToList() });
throw new InvalidOperationException(
$"The corpus {tcc.CorpusId} is not valid: This corpus does not exist for engine {engine.Id}."
);
if (tcc.TextIds != null && tcc.BiblicalRange != null)
throw new InvalidOperationException(
$"The corpus {tcc.CorpusId} is not valid: Set exactly one of TextIds and BiblicalRange."
);
trainOnCorpora.Add(
new TrainingCorpus
{
CorpusRef = tcc.CorpusId,
TextIds = tcc.TextIds?.ToList(),
BiblicalRange = tcc.BiblicalRange
}
);
}
build.TrainOn = trainOnCorpora;
}
Expand Down Expand Up @@ -1101,7 +1124,8 @@ private PretranslateCorpusDto Map(string engineId, PretranslateCorpus source)
Id = source.CorpusRef,
Url = _urlService.GetUrl("GetTranslationCorpus", new { id = engineId, corpusId = source.CorpusRef })
},
TextIds = source.TextIds
TextIds = source.TextIds,
BiblicalRange = source.BiblicalRange
};
}

Expand All @@ -1114,7 +1138,8 @@ private TrainingCorpusDto Map(string engineId, TrainingCorpus source)
Id = source.CorpusRef,
Url = _urlService.GetUrl("GetTranslationCorpus", new { id = engineId, corpusId = source.CorpusRef })
},
TextIds = source.TextIds
TextIds = source.TextIds,
BiblicalRange = source.BiblicalRange
};
}

Expand Down
1 change: 1 addition & 0 deletions src/Serval.Translation/Models/PretranslateCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ public class PretranslateCorpus
{
public string CorpusRef { get; set; } = default!;
public List<string>? TextIds { get; set; }
public string? BiblicalRange { get; set; }
}
2 changes: 2 additions & 0 deletions src/Serval.Translation/Models/TrainingCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ public class TrainingCorpus
{
public string CorpusRef { get; set; } = default!;
public IList<string>? TextIds { get; set; }

public string? BiblicalRange { get; set; }
}
2 changes: 2 additions & 0 deletions src/Serval.Translation/Services/EngineService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -213,12 +213,14 @@ public async Task<bool> StartBuildAsync(Build build, CancellationToken cancellat
pretranslateCorpus.TextIds is null || pretranslateCorpus.TextIds.Count == 0;
if (pretranslateCorpus.TextIds is not null)
corpus.PretranslateTextIds.Add(pretranslateCorpus.TextIds);
corpus.PretranslateBiblicalRange = pretranslateCorpus.BiblicalRange;
}
if (trainOn?.TryGetValue(c.Id, out TrainingCorpus? trainingCorpus) ?? false)
{
corpus.TrainOnAll = trainingCorpus.TextIds is null || trainingCorpus.TextIds.Count == 0;
if (trainingCorpus.TextIds is not null)
corpus.TrainOnTextIds.Add(trainingCorpus.TextIds);
corpus.TrainOnBiblicalRange = trainingCorpus.BiblicalRange;
}
else if (trainOn is null)
{
Expand Down
45 changes: 14 additions & 31 deletions tests/Serval.E2ETests/ServalApiTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public async Task GetEchoPretranslate()
string engineId = await _helperClient.CreateNewEngine("Echo", "es", "es", "Echo2");
var books = new string[] { "1JN.txt", "2JN.txt" };
await _helperClient.AddTextCorpusToEngine(engineId, books, "es", "es", false);
books = new string[] { "3JN.txt" };
books = ["3JN.txt"];
var corpusId = await _helperClient.AddTextCorpusToEngine(engineId, books, "es", "es", true);
await _helperClient.BuildEngine(engineId);
var corpora = _helperClient.translationEnginesClient.GetAllCorporaAsync(engineId);
Expand Down Expand Up @@ -88,14 +88,14 @@ public async Task GetSmtMoreCorpus()
{
await _helperClient!.ClearEngines();
string engineId = await _helperClient.CreateNewEngine("SmtTransfer", "es", "en", "SMT4");
await _helperClient.AddTextCorpusToEngine(engineId, new string[] { "3JN.txt" }, "es", "en", false);
await _helperClient.AddTextCorpusToEngine(engineId, ["3JN.txt"], "es", "en", false);
await _helperClient.BuildEngine(engineId);
TranslationResult tResult = await _helperClient.translationEnginesClient.TranslateAsync(
engineId,
"verdad mundo"
);
Assert.AreEqual(tResult.Translation, "truth mundo");
await _helperClient.AddTextCorpusToEngine(engineId, new string[] { "1JN.txt", "2JN.txt" }, "es", "en", false);
await _helperClient.AddTextCorpusToEngine(engineId, ["1JN.txt", "2JN.txt"], "es", "en", false);
await _helperClient.BuildEngine(engineId);
TranslationResult tResult2 = await _helperClient.translationEnginesClient.TranslateAsync(
engineId,
Expand All @@ -113,9 +113,9 @@ public async Task NmtBatch()
var cId1 = await _helperClient.AddTextCorpusToEngine(engineId, books, "es", "en", false);
_helperClient.TranslationBuildConfig.TrainOn = new List<TrainingCorpusConfig>
{
new TrainingCorpusConfig { CorpusId = cId1, TextIds = new string[] { "1JN.txt" } }
new TrainingCorpusConfig { CorpusId = cId1, TextIds = ["1JN.txt"] }
};
var cId2 = await _helperClient.AddTextCorpusToEngine(engineId, new string[] { "3JN.txt" }, "es", "en", true);
var cId2 = await _helperClient.AddTextCorpusToEngine(engineId, ["3JN.txt"], "es", "en", true);
await _helperClient.BuildEngine(engineId);
await Task.Delay(1000);
IList<Pretranslation> lTrans = await _helperClient.translationEnginesClient.GetAllPretranslationsAsync(
Expand Down Expand Up @@ -143,7 +143,7 @@ public async Task NmtQueueMultiple()
string engineId = engineIds[i];
var books = new string[] { "MAT.txt", "1JN.txt", "2JN.txt" };
await _helperClient.AddTextCorpusToEngine(engineId, books, "es", "en", false);
await _helperClient.AddTextCorpusToEngine(engineId, new string[] { "3JN.txt" }, "es", "en", true);
await _helperClient.AddTextCorpusToEngine(engineId, ["3JN.txt"], "es", "en", true);
await _helperClient.StartBuildAsync(engineId);
//Ensure that tasks are enqueued roughly in order
await Task.Delay(1_000);
Expand Down Expand Up @@ -226,13 +226,7 @@ public async Task CircuitousRouteGetWordGraphAsync()
Assert.That(ex!.StatusCode, Is.EqualTo(409));

//Add corpus
var cId = await _helperClient.AddTextCorpusToEngine(
smtEngineId,
new string[] { "2JN.txt", "3JN.txt" },
"es",
"en",
false
);
var cId = await _helperClient.AddTextCorpusToEngine(smtEngineId, ["2JN.txt", "3JN.txt"], "es", "en", false);

//Build the new engine
await _helperClient.BuildEngine(smtEngineId);
Expand All @@ -241,13 +235,7 @@ public async Task CircuitousRouteGetWordGraphAsync()
await _helperClient.translationEnginesClient.DeleteCorpusAsync(smtEngineId, cId);

//Add corpus
await _helperClient.AddTextCorpusToEngine(
smtEngineId,
new string[] { "1JN.txt", "2JN.txt", "3JN.txt" },
"es",
"en",
false
);
await _helperClient.AddTextCorpusToEngine(smtEngineId, ["1JN.txt", "2JN.txt", "3JN.txt"], "es", "en", false);

//Build the new engine
await _helperClient.BuildEngine(smtEngineId);
Expand Down Expand Up @@ -280,7 +268,7 @@ public async Task CircuitousRouteTranslateTopNAsync()
//Add corpus
string cId = await _helperClient.AddTextCorpusToEngine(
engineId,
new string[] { "1JN.txt", "2JN.txt", "3JN.txt" },
["1JN.txt", "2JN.txt", "3JN.txt"],
"en",
"fa",
false
Expand Down Expand Up @@ -391,25 +379,20 @@ public async Task ParatextProjectNmtJobAsync()
{
SourceLanguage = "en",
TargetLanguage = "sbp",
SourceFiles = new TranslationCorpusFileConfig[]
{
new TranslationCorpusFileConfig { FileId = file1.Id }
},
TargetFiles = new TranslationCorpusFileConfig[]
{
new TranslationCorpusFileConfig { FileId = file2.Id }
}
SourceFiles = [new TranslationCorpusFileConfig { FileId = file1.Id }],
TargetFiles = [new TranslationCorpusFileConfig { FileId = file2.Id }]
}
);
_helperClient.TranslationBuildConfig.Pretranslate!.Add(
new PretranslateCorpusConfig { CorpusId = corpus.Id, TextIds = new string[] { "JHN", "REV" } }
new PretranslateCorpusConfig { CorpusId = corpus.Id, BiblicalRange = "JHN" }
);
_helperClient.TranslationBuildConfig.Options = "{\"max_steps\":10, \"use_key_terms\":true}";

await _helperClient.BuildEngine(engineId);
Assert.That(
(await _helperClient.translationEnginesClient.GetAllBuildsAsync(engineId)).First().State
== JobState.Completed
== JobState.Completed,
JsonSerializer.Serialize((await _helperClient.translationEnginesClient.GetAllBuildsAsync(engineId)).First())
);
IList<Pretranslation> lTrans = await _helperClient.translationEnginesClient.GetAllPretranslationsAsync(
engineId,
Expand Down

0 comments on commit 20d7b6a

Please sign in to comment.