Skip to content

Commit

Permalink
Added chapter-level filtering; fixes sillsdev/serval#150
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Jan 26, 2024
1 parent 79c0832 commit 307eba7
Show file tree
Hide file tree
Showing 13 changed files with 472 additions and 3 deletions.
2 changes: 2 additions & 0 deletions src/SIL.Machine.AspNetCore/Models/Corpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ public class Corpus
public string TargetLanguage { get; set; } = default!;
public bool TrainOnAll { get; set; }
public bool PretranslateAll { get; set; }
public string? TrainOnBiblicalRange { get; set; }
public string? PretranslateBiblicalRange {get; set; }
public HashSet<string> TrainOnTextIds { get; set; } = default!;
public HashSet<string> PretranslateTextIds { get; set; } = default!;
public List<CorpusFile> SourceFiles { get; set; } = default!;
Expand Down
165 changes: 165 additions & 0 deletions src/SIL.Machine.AspNetCore/Services/BiblicalRangeStringParser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/// <summary>
/// Parses a biblical range string (for example <c>MAT-LUK;-MRK 3-5</c>) into the chapters that
/// are selected for each book.
/// </summary>
/// <remarks>
/// <para>
/// The result maps a book ID to a sorted list of chapter numbers. An empty list means that every
/// chapter of the book is selected.
/// </para>
/// <para>
/// Selections are separated by semicolons. A string that contains no semicolon must be either a
/// comma-separated list of book IDs / <c>OT</c> / <c>NT</c>, a single book range, or a single
/// chapter selection. A selection that starts with <c>-</c> is subtracted from everything that
/// was selected before it.
/// </para>
/// </remarks>
class BiblicalRangeStringParser {
    // Book numbers that bound the "OT" and "NT" shortcuts.
    private const int FirstOldTestamentBook = 1;
    private const int LastOldTestamentBook = 39;
    private const int FirstNewTestamentBook = 40;
    private const int LastNewTestamentBook = 66;

    private static readonly Regex CommaSeparatedBooks = new Regex(@"^([A-Z\d]{3}|OT|NT)(, ?([A-Z\d]{3}|OT|NT))*$", RegexOptions.Compiled);
    private static readonly Regex BookRange = new Regex(@"^-?[A-Z\d]{3}-[A-Z\d]{3}$", RegexOptions.Compiled);
    private static readonly Regex ChapterSelection = new Regex(@"^-?[A-Z\d]{3} ?(\d+|\d+-\d+)(, ?(\d+|\d+-\d+))*$", RegexOptions.Compiled);

    // Last chapter number of every known book, keyed by book ID.
    private readonly Dictionary<string, int> _bookLengths = [];

    /// <summary>
    /// Creates a parser whose chapter counts come from the given versification.
    /// </summary>
    /// <param name="versification">
    /// The versification used to look up the last chapter of each book;
    /// <c>ScrVers.Original</c> is used when this is <c>null</c>.
    /// </param>
    public BiblicalRangeStringParser(ScrVers? versification = null) {
        versification ??= ScrVers.Original;
        foreach ((string bookId, int bookNum) in Canon.AllBookIds.Zip(Canon.AllBookNumbers)) {
            _bookLengths[bookId] = versification.GetLastChapter(bookNum);
        }
    }

    /// <summary>
    /// Parses a complete range string into the selected chapters per book.
    /// </summary>
    /// <param name="chapterSelections">The range string to parse.</param>
    /// <returns>
    /// A dictionary from book ID to the sorted selected chapter numbers; an empty list means the
    /// whole book. Books whose chapters have all been subtracted are absent.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="chapterSelections"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// The string is syntactically invalid, names an unknown book or chapter, or subtracts
    /// something that has not been selected.
    /// </exception>
    public Dictionary<string, List<int>> Parse(string chapterSelections) {
        ArgumentNullException.ThrowIfNull(chapterSelections);

        Dictionary<string, List<int>> chaptersPerBook = [];
        chapterSelections = chapterSelections.Trim();

        // With a semicolon present every piece is validated individually by ParseSection.
        // Without one, the whole string has to match one of the three single-selection shapes.
        char delimiter = ';';
        if (!chapterSelections.Contains(';')) {
            if (CommaSeparatedBooks.IsMatch(chapterSelections)) {
                delimiter = ',';
            }
            else if (!BookRange.IsMatch(chapterSelections) && !ChapterSelection.IsMatch(chapterSelections)) {
                throw new ArgumentException("Invalid syntax. If you are providing multiple selections, e.g. a range of books followed by a selection of chapters from a book, separate each selection with a semicolon.");
            }
        }

        foreach (string section in chapterSelections.Split(delimiter).Select(s => s.Trim())) {
            if (section.StartsWith('-')) {
                Subtract(chaptersPerBook, ParseSection(section[1..]));
            }
            else {
                Add(chaptersPerBook, ParseSection(section));
            }
        }
        return chaptersPerBook;
    }

    /// <summary>Removes the chapters in <paramref name="sectionChapters"/> from the running selection.</summary>
    private void Subtract(Dictionary<string, List<int>> chaptersPerBook, Dictionary<string, List<int>> sectionChapters) {
        foreach ((string bookName, List<int> parsedChapters) in sectionChapters) {
            if (!chaptersPerBook.TryGetValue(bookName, out List<int>? selectedChapters)) {
                throw new ArgumentException($"{bookName} cannot be removed as it is not in the existing book selection.");
            }

            // An empty list stands for "all chapters"; expand both sides so that individual
            // chapters can be removed. Locals are used so that the dictionary being enumerated
            // is never written to.
            List<int> toRemove = parsedChapters.Count == 0 ? AllChapters(bookName) : parsedChapters;
            List<int> remaining = selectedChapters.Count == 0 ? AllChapters(bookName) : selectedChapters;

            foreach (int chapterNumber in toRemove) {
                if (!remaining.Remove(chapterNumber)) {
                    throw new ArgumentException($"{chapterNumber} cannot be removed as it is not in the existing chapter selection.");
                }
            }

            if (remaining.Count == 0) {
                chaptersPerBook.Remove(bookName);
            }
            else {
                chaptersPerBook[bookName] = remaining;
            }
        }
    }

    /// <summary>Merges the chapters in <paramref name="sectionChapters"/> into the running selection.</summary>
    private void Add(Dictionary<string, List<int>> chaptersPerBook, Dictionary<string, List<int>> sectionChapters) {
        foreach ((string bookName, List<int> addedChapters) in sectionChapters) {
            if (!chaptersPerBook.TryGetValue(bookName, out List<int>? selectedChapters)) {
                chaptersPerBook[bookName] = addedChapters;
                continue;
            }

            // If either side already means "whole book", the union is the whole book.
            if (selectedChapters.Count == 0 || addedChapters.Count == 0) {
                chaptersPerBook[bookName] = [];
                continue;
            }

            List<int> merged = selectedChapters.Union(addedChapters).ToList();
            merged.Sort();
            if (merged.Count == _bookLengths[bookName]) {
                // Every chapter is now selected: collapse to the "whole book" representation.
                merged = [];
            }
            chaptersPerBook[bookName] = merged;
        }
    }

    /// <summary>
    /// Parses a single selection (with any leading <c>-</c> already removed): <c>OT</c>,
    /// <c>NT</c>, a whole book, a range of books, or chapters from one book.
    /// </summary>
    /// <exception cref="ArgumentException">The selection is empty or otherwise invalid.</exception>
    private Dictionary<string, List<int>> ParseSection(string section) {
        section = section.Trim();
        if (section.Length == 0) {
            // Reached for input such as "GEN;" or a bare "-"; report it like any other syntax error.
            throw new ArgumentException("Invalid syntax. A selection cannot be empty.");
        }

        Dictionary<string, List<int>> chaptersPerBook = [];

        //*OT*
        if (section == "OT") {
            AddWholeBooks(chaptersPerBook, FirstOldTestamentBook, LastOldTestamentBook);
            return chaptersPerBook;
        }

        //*NT*
        if (section == "NT") {
            AddWholeBooks(chaptersPerBook, FirstNewTestamentBook, LastNewTestamentBook);
            return chaptersPerBook;
        }

        //*Whole book*
        // Checked before the chapter case: the ID pattern ([A-Z\d]{3}) allows an ID to end in a
        // digit, and such a book must not be misread as a chapter selection.
        if (_bookLengths.ContainsKey(section)) {
            chaptersPerBook[section] = [];
            return chaptersPerBook;
        }

        //*Range of books*
        // Also checked before the chapter case so that a range ending in a digit-suffixed ID works.
        string[] rangeEnds = section.Split('-');
        if (rangeEnds.Length == 2 && _bookLengths.ContainsKey(rangeEnds[0]) && _bookLengths.ContainsKey(rangeEnds[1])) {
            int firstBook = Canon.BookIdToNumber(rangeEnds[0]);
            int lastBook = Canon.BookIdToNumber(rangeEnds[1]);
            if (lastBook <= firstBook) {
                throw new ArgumentException($"{section} is an invalid book range.");
            }
            AddWholeBooks(chaptersPerBook, firstBook, lastBook);
            return chaptersPerBook;
        }

        //*Specific chapters from one book*
        if (char.IsAsciiDigit(section[^1])) {
            (string bookName, List<int> chapters) = ParseChapters(section);
            chaptersPerBook[bookName] = chapters;
            return chaptersPerBook;
        }

        if (section.Contains('-')) {
            throw new ArgumentException($"{section} is an invalid book range.");
        }
        throw new ArgumentException($"{section} is an invalid book ID.");
    }

    /// <summary>
    /// Parses a selection of the form <c>BBB c[,c|c-c]...</c> into the book ID and its sorted
    /// chapters. The returned list is empty when the selection covers every chapter of the book.
    /// </summary>
    private (string BookName, List<int> Chapters) ParseChapters(string section) {
        if (section.Length < 3) {
            throw new ArgumentException($"{section} is an invalid book ID.");
        }

        string bookName = section[..3];
        if (!_bookLengths.TryGetValue(bookName, out int lastChapter)) {
            throw new ArgumentException($"{bookName} is an invalid book ID.");
        }

        HashSet<int> chapters = [];
        foreach (string chapterRangeString in section[3..].Split(',').Select(s => s.Trim())) {
            if (chapterRangeString.Contains('-')) {
                string[] startAndEnd = chapterRangeString.Split('-');
                // Exactly one dash is allowed; "1-2-3" used to be read silently as "1-2".
                if (
                    startAndEnd.Length != 2
                    || !int.TryParse(startAndEnd[0], out int start)
                    || !int.TryParse(startAndEnd[1], out int end)
                    || start < 1
                    || end > lastChapter
                    || end <= start
                ) {
                    throw new ArgumentException($"{chapterRangeString} is an invalid chapter range.");
                }
                for (int chapterNum = start; chapterNum <= end; chapterNum++) {
                    chapters.Add(chapterNum);
                }
            }
            else {
                // Chapters are 1-based, so 0 is rejected here just as it is for range starts.
                if (!int.TryParse(chapterRangeString, out int chapterNum) || chapterNum < 1 || chapterNum > lastChapter) {
                    throw new ArgumentException($"{section} is an invalid chapter number.");
                }
                chapters.Add(chapterNum);
            }
        }

        if (chapters.Count == lastChapter) {
            // Every chapter was listed explicitly: use the "whole book" representation.
            return (bookName, []);
        }

        List<int> sortedChapters = chapters.ToList();
        sortedChapters.Sort();
        return (bookName, sortedChapters);
    }

    /// <summary>Marks every book numbered from <paramref name="firstBook"/> to <paramref name="lastBook"/> (inclusive) as wholly selected.</summary>
    private static void AddWholeBooks(Dictionary<string, List<int>> chaptersPerBook, int firstBook, int lastBook) {
        for (int bookNum = firstBook; bookNum <= lastBook; bookNum++) {
            chaptersPerBook[Canon.BookNumberToId(bookNum)] = [];
        }
    }

    /// <summary>Returns the explicit list 1..last chapter for the given book.</summary>
    private List<int> AllChapters(string bookName) {
        return Enumerable.Range(1, _bookLengths[bookName]).ToList();
    }
}
25 changes: 22 additions & 3 deletions src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using Google.Protobuf;
using System.Data;
using Google.Protobuf;
using MongoDB.Bson.IO;

namespace SIL.Machine.AspNetCore.Services;
Expand Down Expand Up @@ -131,14 +132,32 @@ async IAsyncEnumerable<Pretranslation> ProcessRowsAsync()

foreach (ParallelTextRow row in parallelCorpora.Flatten())
{
if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId))
bool isInTrainOnRange = false;
bool isInPretranslateRange = false;
if(targetCorpora[CorpusType.Text] is ScriptureTextCorpus stc && row.Refs.All(r => r is VerseRef)){
Dictionary<string, List<int>> rowChaptersPerBook = row.Refs.Cast<VerseRef>().GroupBy(vr => vr.Book).ToDictionary(g => g.Key, g => g.Select(vr => vr.ChapterNum).ToList());
var parser = new BiblicalRangeStringParser(stc.Versification);
if(corpus.TrainOnBiblicalRange != null && corpus.TrainOnBiblicalRange != ""){
Dictionary<string, List<int>> trainOnBiblicalRangeChapters = parser.Parse(corpus.TrainOnBiblicalRange); //TODO calculate once
isInTrainOnRange = rowChaptersPerBook.Join(trainOnBiblicalRangeChapters, rcpb => rcpb.Key, tobrc => tobrc.Key, (rcbp, tobrc) =>
rcbp.Value.Intersect(tobrc.Value).Count() > 0 || (rcbp.Value.Count() > 0 && tobrc.Value.Count() == 0) //Empty list means all chapters from book
).Any(b => b);
}
if(corpus.PretranslateBiblicalRange != null && corpus.PretranslateBiblicalRange != ""){
Dictionary<string, List<int>> pretranslateBiblicalRangeChapters = parser.Parse(corpus.PretranslateBiblicalRange);
isInPretranslateRange = rowChaptersPerBook.Join(pretranslateBiblicalRangeChapters, rcpb => rcpb.Key, pbrc => pbrc.Key, (rcbp, pbrc) =>
rcbp.Value.Intersect(pbrc.Value).Count() > 0 || (rcbp.Value.Count() > 0 && pbrc.Value.Count() == 0)
).Any(b => b);
}
}
if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId) || isInTrainOnRange)
{
await sourceTrainWriter.WriteAsync($"{row.SourceText}\n");
await targetTrainWriter.WriteAsync($"{row.TargetText}\n");
counts["NumTrainRows"] += 1;
}
if (
(corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId))
(corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId) || isInPretranslateRange)
&& row.SourceSegment.Count > 0
&& row.TargetSegment.Count == 0
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ private static Models.Corpus Map(Serval.Translation.V1.Corpus source)
TargetLanguage = source.TargetLanguage,
TrainOnAll = source.TrainOnAll,
PretranslateAll = source.PretranslateAll,
TrainOnBiblicalRange = source.TrainOnBiblicalRange,
PretranslateBiblicalRange = source.PretranslateBiblicalRange,
TrainOnTextIds = source.TrainOnTextIds.ToHashSet(),
PretranslateTextIds = source.PretranslateTextIds.ToHashSet(),
SourceFiles = source.SourceFiles.Select(Map).ToList(),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
namespace SIL.Machine.AspNetCore.Services;

[TestFixture]
public class BiblicalRangeStringParserTests {

    /// <summary>
    /// Parses <paramref name="rangeString"/> with a default parser and checks either the parsed
    /// selection or that an <see cref="ArgumentException"/> is thrown.
    /// </summary>
    [Test]
    [TestCaseSource(nameof(GetCases))]
    public void TestParse(string rangeString, Dictionary<string, List<int>> expectedOutput, bool throwsException){
        var parser = new BiblicalRangeStringParser();
        if (throwsException) {
            Assert.Throws<ArgumentException>(() => parser.Parse(rangeString));
            return;
        }
        Assert.That(parser.Parse(rangeString), Is.EquivalentTo(expectedOutput));
    }

    /// <summary>
    /// Supplies the cases for <see cref="TestParse"/>: first the inputs that parse successfully
    /// together with their expected selection, then the inputs that must be rejected.
    /// </summary>
    public static IEnumerable<TestCaseData> GetCases(){
        // Whole books and testaments
        yield return Parses("MAL", WholeBooks("MAL"));
        yield return Parses("GEN,EXO", WholeBooks("GEN", "EXO"));
        yield return Parses("1JN,2JN", WholeBooks("1JN", "2JN"));
        yield return Parses("OT", WholeBooks(Enumerable.Range(1, 39)));
        yield return Parses("NT", WholeBooks(Enumerable.Range(40, 27)));
        yield return Parses("NT,OT", WholeBooks(Enumerable.Range(1, 66)));
        yield return Parses("MAT;MRK", WholeBooks("MAT", "MRK"));
        yield return Parses("MAT; MRK", WholeBooks("MAT", "MRK"));

        // Chapter selections and book ranges
        yield return Parses("MAT1,2,3", new Dictionary<string, List<int>> { ["MAT"] = new List<int> { 1, 2, 3 } });
        yield return Parses("MAT1, 2, 3", new Dictionary<string, List<int>> { ["MAT"] = new List<int> { 1, 2, 3 } });
        yield return Parses("MAT-LUK", WholeBooks("MAT", "MRK", "LUK"));
        yield return Parses("MAT1,2,3;MAT-LUK", WholeBooks("MAT", "MRK", "LUK"));

        Dictionary<string, List<int>> mixedSelection = WholeBooks("GEN", "EXO", "2JN", "3JN");
        mixedSelection["EXO"] = new List<int> { 1, 3, 4, 5, 8 };
        yield return Parses("2JN-3JN;EXO1,8,3-5;GEN", mixedSelection);

        yield return Parses("1JN 1;1JN 2;1JN 3-5", WholeBooks("1JN"));

        // Subtraction
        Dictionary<string, List<int>> gospelsToRomans = WholeBooks("MAT", "MRK", "LUK", "JHN", "ACT", "ROM");
        gospelsToRomans["ACT"] = new List<int> { 1, 2, 3 };
        yield return Parses("MAT-ROM;-ACT4-28", gospelsToRomans);

        yield return Parses("2JN;-2JN 1", new Dictionary<string, List<int>>());

        yield return Parses(
            "NT;OT;-MRK;-EXO",
            WholeBooks(Enumerable.Range(1, 66).Where(bookNum => bookNum != 2 && bookNum != 41))
        );

        Dictionary<string, List<int>> trimmedNewTestament = WholeBooks(Enumerable.Range(40, 27));
        trimmedNewTestament[Canon.BookNumberToId(40)] = Enumerable.Range(1, 28).Except(new[] { 3, 4, 5, 17 }).ToList();
        trimmedNewTestament[Canon.BookNumberToId(66)] = Enumerable.Range(1, 20).ToList();
        yield return Parses("NT;-MAT3-5,17;-REV21,22", trimmedNewTestament);

        yield return Parses("MAT-JHN;-MAT-LUK", WholeBooks("JHN"));

        //*Throw exceptions
        string[] rejectedInputs = {
            "MAT3-1",
            "MRK-MAT",
            "MRK;-MRK10-3",
            "MAT0-10",
            "MAT-FLUM",
            "-MAT-FLUM",
            "",
            "ABC",
            "MAT-ABC",
            "NT;-ABC-LUK",
            "MAT 500",
            "MAT 1-500",
            "MAT;-MAT 300-500",
            "-MRK",
            "-MRK 1",
            "MRK 2-5;-MRK 1-4",
            "MRK 2-5;-MRK 6",
            "OT;-MRK-LUK",
            "NT;OT;-ABC",
            "MAT;-ABC 1",
            "NT,OT,-MRK,-EXO",
            "OT,MAT1",
            "OT,MAT-LUK",
        };
        foreach (string rejected in rejectedInputs) {
            yield return new TestCaseData(rejected, new Dictionary<string, List<int>>(), true);
        }
    }

    // Builds a case that is expected to parse to the given selection.
    private static TestCaseData Parses(string rangeString, Dictionary<string, List<int>> expected) {
        return new TestCaseData(rangeString, expected, false);
    }

    // Builds a selection in which each of the given books maps to an empty chapter list.
    private static Dictionary<string, List<int>> WholeBooks(params string[] bookIds) {
        var selection = new Dictionary<string, List<int>>();
        foreach (string bookId in bookIds) {
            selection.Add(bookId, new List<int>());
        }
        return selection;
    }

    // Same as above, but with the books identified by their canon numbers.
    private static Dictionary<string, List<int>> WholeBooks(IEnumerable<int> bookNumbers) {
        return bookNumbers.ToDictionary(bookNum => Canon.BookNumberToId(bookNum), _ => new List<int>());
    }

}
Loading

0 comments on commit 307eba7

Please sign in to comment.