Skip to content

Commit

Permalink
Chapter level specification (#289)
Browse files Browse the repository at this point in the history
* Added chapter-level filtering; fixes #150
  • Loading branch information
Enkidu93 authored Feb 7, 2024
1 parent 7231a27 commit 8e4a86b
Show file tree
Hide file tree
Showing 12 changed files with 133 additions and 14 deletions.
16 changes: 16 additions & 0 deletions src/Serval.Client/Client.g.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1604,6 +1604,8 @@ public partial interface ITranslationEnginesClient
/// <br/>Similarly, specify the corpora and textIds to train on. If no train_on field is provided, all corpora will be used.
/// <br/>Paratext projects can be filtered by book for training and pretranslating. This filtering follows the original versification.
/// <br/>To filter, use the 3 character code for the book of the Bible in the textID while building. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information.
/// <br/>Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range)
/// <br/>for more details.
/// <br/>
/// <br/>The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object.
/// <br/>See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters.
Expand Down Expand Up @@ -3748,6 +3750,8 @@ public string BaseUrl
/// <br/>Similarly, specify the corpora and textIds to train on. If no train_on field is provided, all corpora will be used.
/// <br/>Paratext projects can be filtered by book for training and pretranslating. This filtering follows the original versification.
/// <br/>To filter, use the 3 character code for the book of the Bible in the textID while building. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information.
/// <br/>Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range)
/// <br/>for more details.
/// <br/>
/// <br/>The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object.
/// <br/>See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters.
Expand Down Expand Up @@ -5812,6 +5816,9 @@ public partial class TrainingCorpus
[Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.Collections.Generic.IList<string>? TextIds { get; set; } = default!;

[Newtonsoft.Json.JsonProperty("scriptureRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public string? ScriptureRange { get; set; } = default!;

}

[System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.0.2.0 (NJsonSchema v11.0.0.0 (Newtonsoft.Json v13.0.0.0))")]
Expand All @@ -5824,6 +5831,9 @@ public partial class PretranslateCorpus
[Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.Collections.Generic.IList<string>? TextIds { get; set; } = default!;

[Newtonsoft.Json.JsonProperty("scriptureRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public string? ScriptureRange { get; set; } = default!;

}

[System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.0.2.0 (NJsonSchema v11.0.0.0 (Newtonsoft.Json v13.0.0.0))")]
Expand Down Expand Up @@ -5874,6 +5884,9 @@ public partial class TrainingCorpusConfig
[Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.Collections.Generic.IList<string>? TextIds { get; set; } = default!;

[Newtonsoft.Json.JsonProperty("scriptureRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public string? ScriptureRange { get; set; } = default!;

}

[System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.0.2.0 (NJsonSchema v11.0.0.0 (Newtonsoft.Json v13.0.0.0))")]
Expand All @@ -5886,6 +5899,9 @@ public partial class PretranslateCorpusConfig
[Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.Collections.Generic.IList<string>? TextIds { get; set; } = default!;

[Newtonsoft.Json.JsonProperty("scriptureRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public string? ScriptureRange { get; set; } = default!;

}

[System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.0.2.0 (NJsonSchema v11.0.0.0 (Newtonsoft.Json v13.0.0.0))")]
Expand Down
14 changes: 10 additions & 4 deletions src/Serval.Grpc/Protos/serval/translation/v1/engine.proto
Original file line number Diff line number Diff line change
Expand Up @@ -147,10 +147,16 @@ message Corpus {
string target_language = 3;
bool train_on_all = 4;
bool pretranslate_all = 5;
// Existing fields keep the tags they were first published with: renumbering
// a field changes its wire identity, so peers built against the earlier
// schema would misread or drop the data.
repeated string train_on_text_ids = 6;
repeated string pretranslate_text_ids = 7;
repeated CorpusFile source_files = 8;
repeated CorpusFile target_files = 9;
// Chapter-level filters, as maps of string keys to chapter lists. New fields
// take the next unused tags instead of displacing the existing ones.
map<string, ScriptureChapters> train_on_chapters = 10;
map<string, ScriptureChapters> pretranslate_chapters = 11;
}

// Wrapper around a list of chapter numbers. It is the value type of the
// Corpus chapter maps (train_on_chapters / pretranslate_chapters); a wrapper
// message is needed because a proto3 map value cannot itself be `repeated`.
message ScriptureChapters {
// Chapter numbers selected for the map key this value is stored under.
repeated int32 chapters = 1;
}

message CorpusFile {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ public class PretranslateCorpusConfigDto
/// <summary>
/// Id of the corpus to pretranslate. The build mapping rejects ids that are not among the engine's corpora.
/// </summary>
public string CorpusId { get; set; } = default!;

/// <summary>
/// Optional text ids used to filter what is pretranslated. Set at most one of
/// <see cref="TextIds"/> and <see cref="ScriptureRange"/>.
/// </summary>
public IList<string>? TextIds { get; set; }

/// <summary>
/// Optional filter expressed as ranges of biblical text. Set at most one of
/// <see cref="TextIds"/> and <see cref="ScriptureRange"/>.
/// </summary>
public string? ScriptureRange { get; set; }
}
2 changes: 2 additions & 0 deletions src/Serval.Translation/Contracts/PretranslateCorpusDto.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ public class PretranslateCorpusDto
/// <summary>
/// Link (id and URL) to the corpus this pretranslate entry refers to.
/// </summary>
public ResourceLinkDto Corpus { get; set; } = default!;

/// <summary>
/// Text ids the pretranslation was filtered by, if any.
/// </summary>
public IList<string>? TextIds { get; set; }

/// <summary>
/// Scripture range the pretranslation was filtered by, if any.
/// </summary>
public string? ScriptureRange { get; set; }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ public class TrainingCorpusConfigDto
{
/// <summary>
/// Id of the corpus to train on. The build mapping rejects ids that are not among the engine's corpora.
/// </summary>
public string CorpusId { get; set; } = default!;
/// <summary>
/// Optional text ids used to filter the training data. Set at most one of
/// <see cref="TextIds"/> and <see cref="ScriptureRange"/>.
/// </summary>
public IList<string>? TextIds { get; set; }
/// <summary>
/// Optional filter expressed as ranges of biblical text. Set at most one of
/// <see cref="TextIds"/> and <see cref="ScriptureRange"/>.
/// </summary>
public string? ScriptureRange { get; set; }
}
2 changes: 2 additions & 0 deletions src/Serval.Translation/Contracts/TrainingCorpusDto.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ public class TrainingCorpusDto
/// <summary>
/// Link (id and URL) to the corpus this training entry refers to.
/// </summary>
public ResourceLinkDto Corpus { get; set; } = default!;

/// <summary>
/// Text ids the training data was filtered by, if any.
/// </summary>
public IList<string>? TextIds { get; set; }

/// <summary>
/// Scripture range the training data was filtered by, if any.
/// </summary>
public string? ScriptureRange { get; set; }
}
41 changes: 34 additions & 7 deletions src/Serval.Translation/Controllers/TranslationEnginesController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -762,6 +762,8 @@ CancellationToken cancellationToken
/// Similarly, specify the corpora and textIds to train on. If no train_on field is provided, all corpora will be used.
/// Paratext projects can be filtered by book for training and pretranslating. This filtering follows the original versification.
/// To filter, use the 3 character code for the book of the Bible in the textID while building. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information.
/// Filters can also be supplied via the scriptureRange parameter as ranges of biblical text; for each corpus, set at most one of textIds and scriptureRange.
/// See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) for more details.
///
/// The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object.
/// See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters.
Expand Down Expand Up @@ -970,10 +972,20 @@ private static Build Map(Engine engine, TranslationBuildConfigDto source)
// Validate every requested pretranslate corpus and convert it to its model form.
foreach (PretranslateCorpusConfigDto ptcc in source.Pretranslate)
{
    // The corpus must be one of the corpora collected in corpusIds for this engine.
    if (!corpusIds.Contains(ptcc.CorpusId))
    {
        throw new InvalidOperationException(
            $"The corpus {ptcc.CorpusId} is not valid: This corpus does not exist for engine {engine.Id}."
        );
    }

    // TextIds and ScriptureRange are alternative filters; supplying both is rejected.
    if (ptcc.TextIds is not null && ptcc.ScriptureRange is not null)
    {
        throw new InvalidOperationException(
            $"The corpus {ptcc.CorpusId} is not valid: Set at most one of TextIds and ScriptureRange."
        );
    }

    pretranslateCorpora.Add(
        new PretranslateCorpus
        {
            CorpusRef = ptcc.CorpusId,
            // Copy the list so the model does not alias the DTO's collection.
            TextIds = ptcc.TextIds?.ToList(),
            ScriptureRange = ptcc.ScriptureRange
        }
    );
}
build.Pretranslate = pretranslateCorpora;
Expand All @@ -984,8 +996,21 @@ private static Build Map(Engine engine, TranslationBuildConfigDto source)
// Validate every requested training corpus and convert it to its model form.
foreach (TrainingCorpusConfigDto tcc in source.TrainOn)
{
    // The corpus must be one of the corpora collected in corpusIds for this engine.
    if (!corpusIds.Contains(tcc.CorpusId))
    {
        throw new InvalidOperationException(
            $"The corpus {tcc.CorpusId} is not valid: This corpus does not exist for engine {engine.Id}."
        );
    }

    // TextIds and ScriptureRange are alternative filters; supplying both is rejected.
    if (tcc.TextIds is not null && tcc.ScriptureRange is not null)
    {
        throw new InvalidOperationException(
            $"The corpus {tcc.CorpusId} is not valid: Set at most one of TextIds and ScriptureRange."
        );
    }

    trainOnCorpora.Add(
        new TrainingCorpus
        {
            CorpusRef = tcc.CorpusId,
            // Copy the list so the model does not alias the DTO's collection.
            TextIds = tcc.TextIds?.ToList(),
            ScriptureRange = tcc.ScriptureRange
        }
    );
}
build.TrainOn = trainOnCorpora;
}
Expand Down Expand Up @@ -1056,7 +1081,8 @@ private PretranslateCorpusDto Map(string engineId, PretranslateCorpus source)
Id = source.CorpusRef,
Url = _urlService.GetUrl("GetTranslationCorpus", new { id = engineId, corpusId = source.CorpusRef })
},
TextIds = source.TextIds
TextIds = source.TextIds,
ScriptureRange = source.ScriptureRange
};
}

Expand All @@ -1069,7 +1095,8 @@ private TrainingCorpusDto Map(string engineId, TrainingCorpus source)
Id = source.CorpusRef,
Url = _urlService.GetUrl("GetTranslationCorpus", new { id = engineId, corpusId = source.CorpusRef })
},
TextIds = source.TextIds
TextIds = source.TextIds,
ScriptureRange = source.ScriptureRange
};
}

Expand Down
1 change: 1 addition & 0 deletions src/Serval.Translation/Models/PretranslateCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ public class PretranslateCorpus
{
/// <summary>Id of the corpus this entry refers to.</summary>
public string CorpusRef { get; set; } = default!;
/// <summary>Optional text ids used to filter what is pretranslated.</summary>
public List<string>? TextIds { get; set; }
/// <summary>Optional scripture range used to filter what is pretranslated.</summary>
public string? ScriptureRange { get; set; }
}
2 changes: 2 additions & 0 deletions src/Serval.Translation/Models/TrainingCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ public class TrainingCorpus
{
/// <summary>Id of the corpus this entry refers to.</summary>
public string CorpusRef { get; set; } = default!;
/// <summary>Optional text ids used to filter the training data.</summary>
public IList<string>? TextIds { get; set; }

/// <summary>Optional scripture range used to filter the training data.</summary>
public string? ScriptureRange { get; set; }
}
61 changes: 60 additions & 1 deletion src/Serval.Translation/Services/EngineService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ public class EngineService(
GrpcClientFactory grpcClientFactory,
IOptionsMonitor<DataFileOptions> dataFileOptions,
IDataAccessContext dataAccessContext,
ILoggerFactory loggerFactory
ILoggerFactory loggerFactory,
IScriptureDataFileService scriptureDataFileService
) : EntityServiceBase<Engine>(engines), IEngineService
{
private readonly IRepository<Build> _builds = builds;
Expand All @@ -18,6 +19,7 @@ ILoggerFactory loggerFactory
private readonly IOptionsMonitor<DataFileOptions> _dataFileOptions = dataFileOptions;
private readonly IDataAccessContext _dataAccessContext = dataAccessContext;
private readonly ILogger<EngineService> _logger = loggerFactory.CreateLogger<EngineService>();
private readonly IScriptureDataFileService _scriptureDataFileService = scriptureDataFileService;

public async Task<Models.TranslationResult> TranslateAsync(
string engineId,
Expand Down Expand Up @@ -170,6 +172,24 @@ public async Task StartBuildAsync(Build build, CancellationToken cancellationTok
Dictionary<string, PretranslateCorpus>? pretranslate = build.Pretranslate?.ToDictionary(c => c.CorpusRef);
Dictionary<string, TrainingCorpus>? trainOn = build.TrainOn?.ToDictionary(c => c.CorpusRef);
var client = _grpcClientFactory.CreateClient<TranslationEngineApi.TranslationEngineApiClient>(engine.Type);
// Resolves a scripture range string into chapter lists keyed by string, using the
// versification of the Paratext project stored in the corpus' first target file.
// Throws InvalidOperationException when the corpus has no target file or when the
// parser rejects the range (the parser's ArgumentException is kept as InnerException).
// NOTE(review): the CI build reported 'ScriptureRangeParser' as unresolved at this
// call site; confirm the namespace/package that provides it is referenced.
Dictionary<string, List<int>> GetChapters(V1.Corpus corpus, string scriptureRange)
{
    // Fail with a descriptive message instead of First()'s "Sequence contains no elements".
    if (corpus.TargetFiles.Count == 0)
    {
        throw new InvalidOperationException(
            $"The scripture range {scriptureRange} is not valid: the corpus has no target files."
        );
    }

    try
    {
        return ScriptureRangeParser.GetChapters(
            scriptureRange,
            _scriptureDataFileService
                .GetParatextProjectSettings(corpus.TargetFiles.First().Location)
                .Versification
        );
    }
    catch (ArgumentException ae)
    {
        // Same exception type and message as before; chaining the cause keeps the
        // parser's stack trace available for diagnostics.
        throw new InvalidOperationException(
            $"The scripture range {scriptureRange} is not valid: {ae.Message}",
            ae
        );
    }
}
var request = new StartBuildRequest
{
EngineType = engine.Type,
Expand All @@ -180,19 +200,58 @@ public async Task StartBuildAsync(Build build, CancellationToken cancellationTok
{
engine.Corpora.Select(c =>
{
// A scripture range can only be resolved against exactly one Paratext target file.
// This is checked only when a range is actually requested, so corpora in other
// formats (or with several target files) can still be built without one.
void EnsureSupportsScriptureRange()
{
    if (
        c.TargetFiles.Count != 1
        || c.TargetFiles.First().Format != Shared.Contracts.FileFormat.Paratext
    )
    {
        throw new InvalidOperationException(
            $"The corpus {c.Id} is not compatible with using a scripture range"
        );
    }
}

// Wraps each chapter list in the ScriptureChapters message used by the gRPC map fields.
static Dictionary<string, ScriptureChapters> ToScriptureChapters(Dictionary<string, List<int>> chapters)
{
    return chapters.ToDictionary(
        kvp => kvp.Key,
        kvp =>
        {
            var scriptureChapters = new ScriptureChapters();
            scriptureChapters.Chapters.Add(kvp.Value);
            return scriptureChapters;
        }
    );
}

V1.Corpus corpus = Map(c);
if (pretranslate?.TryGetValue(c.Id, out PretranslateCorpus? pretranslateCorpus) ?? false)
{
    // "All" applies only when no filter of either kind was supplied.
    corpus.PretranslateAll =
        (pretranslateCorpus.TextIds is null || pretranslateCorpus.TextIds.Count == 0)
        && pretranslateCorpus.ScriptureRange is null;
    if (pretranslateCorpus.TextIds is not null)
    {
        corpus.PretranslateTextIds.Add(pretranslateCorpus.TextIds);
    }
    if (pretranslateCorpus.ScriptureRange is not null)
    {
        EnsureSupportsScriptureRange();
        corpus.PretranslateChapters.Add(
            ToScriptureChapters(GetChapters(corpus, pretranslateCorpus.ScriptureRange))
        );
    }
}
if (trainOn?.TryGetValue(c.Id, out TrainingCorpus? trainingCorpus) ?? false)
{
    // "All" applies only when no filter of either kind was supplied.
    corpus.TrainOnAll =
        (trainingCorpus.TextIds is null || trainingCorpus.TextIds.Count == 0)
        && trainingCorpus.ScriptureRange is null;
    if (trainingCorpus.TextIds is not null)
    {
        corpus.TrainOnTextIds.Add(trainingCorpus.TextIds);
    }
    if (trainingCorpus.ScriptureRange is not null)
    {
        EnsureSupportsScriptureRange();
        corpus.TrainOnChapters.Add(
            ToScriptureChapters(GetChapters(corpus, trainingCorpus.ScriptureRange))
        );
    }
}
else if (trainOn is null)
{
Expand Down
2 changes: 1 addition & 1 deletion tests/Serval.E2ETests/ServalApiTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ public async Task ParatextProjectNmtJobAsync()
}
);
_helperClient.TranslationBuildConfig.Pretranslate!.Add(
new PretranslateCorpusConfig { CorpusId = corpus.Id, TextIds = ["JHN", "REV"] }
new PretranslateCorpusConfig { CorpusId = corpus.Id, ScriptureRange = "JHN" }
);
_helperClient.TranslationBuildConfig.Options = "{\"max_steps\":10, \"use_key_terms\":true}";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,8 @@ public TestEnvironment()
grpcClientFactory,
dataFileOptions,
new MemoryDataAccessContext(),
new LoggerFactory()
new LoggerFactory(),
new ScriptureDataFileService(new FileSystem(), dataFileOptions)
);
}

Expand Down

0 comments on commit 8e4a86b

Please sign in to comment.