Skip to content

Commit

Permalink
Add support for non-verse text segments in Scripture corpora (#179)
Browse files Browse the repository at this point in the history
- add new ScriptureRef corpus ref class
- update Scripture corpora classes to use ScriptureRef
- add ScriptureRefUsfmParserHandlerBase class to track ScriptureRef in USFM
- update UsfmTextUpdater and UsfmTextBase to use ScriptureRefUsfmParserHandlerBase
- add support for updating non-Scripture paragraphs and notes
- update NmtPreprocessBuildJob to support non-Scripture segments

Co-authored-by: John Lambert <john_lambert@sil.org>
  • Loading branch information
ddaspit and johnml1135 authored Apr 11, 2024
1 parent fa65835 commit a9058ce
Show file tree
Hide file tree
Showing 33 changed files with 1,591 additions and 570 deletions.
2 changes: 1 addition & 1 deletion src/SIL.Machine.AspNetCore/Services/CorpusService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public IEnumerable<ITextCorpus> CreateTextCorpora(IReadOnlyList<CorpusFile> file
break;

case FileFormat.Paratext:
corpora.Add(new ParatextBackupTextCorpus(file.Location));
corpora.Add(new ParatextBackupTextCorpus(file.Location, includeAllText: true));
break;
}
}
Expand Down
95 changes: 68 additions & 27 deletions src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ CancellationToken cancellationToken
continue;

int skipCount = 0;
foreach (Row?[] rows in AlignCorpora(sourceTextCorpora, targetTextCorpus))
foreach (Row?[] rows in AlignTrainCorpus(sourceTextCorpora, targetTextCorpus))
{
if (skipCount > 0)
{
Expand All @@ -153,26 +153,6 @@ CancellationToken cancellationToken
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
trainCount++;
}

Row? pretranslateRow = rows[0];
if (
pretranslateRow is not null
&& IsInPretranslate(pretranslateRow, corpus)
&& pretranslateRow.SourceSegment.Length > 0
&& pretranslateRow.TargetSegment.Length == 0
)
{
pretranslateWriter.WriteStartObject();
pretranslateWriter.WriteString("corpusId", corpus.Id);
pretranslateWriter.WriteString("textId", pretranslateRow.TextId);
pretranslateWriter.WriteStartArray("refs");
foreach (object rowRef in pretranslateRow.Refs)
pretranslateWriter.WriteStringValue(rowRef.ToString());
pretranslateWriter.WriteEndArray();
pretranslateWriter.WriteString("translation", pretranslateRow.SourceSegment);
pretranslateWriter.WriteEndObject();
pretranslateCount++;
}
}

if ((bool?)buildOptionsObject?["use_key_terms"] ?? true)
Expand All @@ -190,6 +170,23 @@ pretranslateRow is not null
}
}
}

foreach (Row row in AlignPretranslateCorpus(sourceTextCorpora[0], targetTextCorpus))
{
if (IsInPretranslate(row, corpus) && row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0)
{
pretranslateWriter.WriteStartObject();
pretranslateWriter.WriteString("corpusId", corpus.Id);
pretranslateWriter.WriteString("textId", row.TextId);
pretranslateWriter.WriteStartArray("refs");
foreach (object rowRef in row.Refs)
pretranslateWriter.WriteStringValue(rowRef.ToString());
pretranslateWriter.WriteEndArray();
pretranslateWriter.WriteString("translation", row.SourceSegment);
pretranslateWriter.WriteEndObject();
pretranslateCount++;
}
}
}
pretranslateWriter.WriteEndArray();

Expand Down Expand Up @@ -244,13 +241,13 @@ private static bool IsIncluded(

private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookChapters, object rowRef)
{
if (rowRef is not VerseRef vr)
if (rowRef is not ScriptureRef sr)
return false;
return bookChapters.TryGetValue(vr.Book, out HashSet<int>? chapters)
&& (chapters.Contains(vr.ChapterNum) || chapters.Count == 0);
return bookChapters.TryGetValue(sr.Book, out HashSet<int>? chapters)
&& (chapters.Contains(sr.ChapterNum) || chapters.Count == 0);
}

private static IEnumerable<Row?[]> AlignCorpora(IReadOnlyList<ITextCorpus> srcCorpora, ITextCorpus trgCorpus)
private static IEnumerable<Row?[]> AlignTrainCorpus(IReadOnlyList<ITextCorpus> srcCorpora, ITextCorpus trgCorpus)
{
if (trgCorpus.IsScripture())
{
Expand Down Expand Up @@ -332,7 +329,7 @@ private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookC
{
yield return new(
vrefs.First().Book,
vrefs.Order().Cast<object>().ToArray(),
vrefs.Order().Select(v => new ScriptureRef(v)).Cast<object>().ToArray(),
srcSegBuffer.ToString(),
trgSegBuffer.ToString(),
rowCount
Expand All @@ -355,7 +352,7 @@ private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookC
{
yield return new(
vrefs.First().Book,
vrefs.Order().Cast<object>().ToArray(),
vrefs.Order().Select(v => new ScriptureRef(v)).Cast<object>().ToArray(),
srcSegBuffer.ToString(),
trgSegBuffer.ToString(),
rowCount
Expand All @@ -365,6 +362,50 @@ private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookC
}
}

/// <summary>
/// Aligns a single source corpus with the target corpus and yields one <see cref="Row"/> per
/// group of aligned rows, folding continuation rows of a target range into the row that
/// started the group.
/// </summary>
/// <param name="srcCorpus">
/// The source corpus. It is aligned with <c>allSourceRows: true</c>
/// (NOTE(review): presumably so source rows without a target counterpart are still produced,
/// which is what the pretranslation filter in the caller relies on — confirm against AlignRows).
/// </param>
/// <param name="trgCorpus">The target corpus to align against.</param>
/// <returns>
/// A lazily evaluated sequence of rows. Each row carries its own snapshot of the accumulated
/// refs, so rows remain valid after the enumeration advances.
/// </returns>
private static IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus srcCorpus, ITextCorpus trgCorpus)
{
    int rowCount = 0;
    StringBuilder srcSegBuffer = new();
    StringBuilder trgSegBuffer = new();
    List<object> refs = [];
    string textId = "";
    foreach (ParallelTextRow row in srcCorpus.AlignRows(trgCorpus, allSourceRows: true))
    {
        if (!row.IsTargetRangeStart && row.IsTargetInRange)
        {
            // Continuation row (flagged as inside a target range but not its start):
            // merge its refs and source text into the pending group. Target text is
            // intentionally not appended here, matching the original behavior.
            refs.AddRange(row.Refs);
            if (row.SourceText.Length > 0)
            {
                if (srcSegBuffer.Length > 0)
                    srcSegBuffer.Append(' ');
                srcSegBuffer.Append(row.SourceText);
            }
            rowCount++;
        }
        else
        {
            if (rowCount > 0)
            {
                // Snapshot the refs: the list is cleared and reused for the next group, so
                // handing out the list itself would corrupt rows a consumer has kept.
                // The trailing literal 1 is preserved from the original
                // (NOTE(review): presumably Row's row count — confirm it should not be rowCount).
                yield return new(textId, refs.ToArray(), srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1);
                srcSegBuffer.Clear();
                trgSegBuffer.Clear();
                refs.Clear();
                rowCount = 0;
            }

            // Start a new group from this row.
            textId = row.TextId;
            refs.AddRange(row.Refs);
            srcSegBuffer.Append(row.SourceText);
            trgSegBuffer.Append(row.TargetText);
            rowCount++;
        }
    }

    // Flush the final pending group, if any.
    if (rowCount > 0)
        yield return new(textId, refs.ToArray(), srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1);
}

private record Row(
string TextId,
IReadOnlyList<object> Refs,
Expand Down
26 changes: 20 additions & 6 deletions src/SIL.Machine/Corpora/CorporaExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,11 @@ public static ITextCorpus Flatten(this IEnumerable<ITextCorpus> corpora)
bool curTrgLineRange = true;
foreach (ParallelTextRow row in parallelCorpus)
{
var vref = (VerseRef)row.Ref;
var scriptureRef = (ScriptureRef)row.Ref;
if (!scriptureRef.IsVerse)
continue;

VerseRef vref = scriptureRef.VerseRef;
if (
curRef.HasValue
&& vref.CompareTo(curRef.Value, null, compareAllVerses: true, compareSegments: false) != 0
Expand All @@ -299,14 +303,14 @@ public static ITextCorpus Flatten(this IEnumerable<ITextCorpus> corpora)
curRef = vref;
if (!curTrgRef.HasValue && row.TargetRefs.Count > 0)
{
curTrgRef = (VerseRef)row.TargetRefs[0];
curTrgRef = ((ScriptureRef)row.TargetRefs[0]).VerseRef;
}
else if (curTrgRef.HasValue && row.TargetRefs.Count > 0 && !curTrgRef.Value.Equals(row.TargetRefs[0]))
{
curTrgRef.Value.Simplify();
var trgRef = (VerseRef)row.TargetRefs[0];
VerseRef startRef,
endRef;
VerseRef trgRef = ((ScriptureRef)row.TargetRefs[0]).VerseRef;
VerseRef startRef;
VerseRef endRef;
if (curTrgRef.Value < trgRef)
{
startRef = curTrgRef.Value;
Expand Down Expand Up @@ -353,7 +357,7 @@ public static ITextCorpus Flatten(this IEnumerable<ITextCorpus> corpora)

public static bool IsScripture(this ITextCorpus textCorpus)
{
return textCorpus is ScriptureTextCorpus;
return textCorpus.Versification != null;
}

private class TransformTextCorpus : TextCorpusBase
Expand All @@ -372,6 +376,8 @@ public TransformTextCorpus(ITextCorpus corpus, Func<TextRow, TextRow> transform,

public override bool IsTokenized { get; }

public override ScrVers Versification => _corpus.Versification;

public override int Count(bool includeEmpty = true)
{
return _corpus.Count(includeEmpty);
Expand All @@ -398,6 +404,8 @@ public WhereTextCorpus(ITextCorpus corpus, Func<TextRow, int, bool> predicate)

public override bool IsTokenized => _corpus.IsTokenized;

public override ScrVers Versification => _corpus.Versification;

public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
{
return _corpus.GetRows(textIds).Where(_predicate);
Expand All @@ -419,6 +427,8 @@ public TextFilterTextCorpus(ITextCorpus corpus, Func<IText, bool> predicate)

public override bool IsTokenized => _corpus.IsTokenized;

public override ScrVers Versification => _corpus.Versification;

public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
{
return _corpus.GetRows(textIds ?? Texts.Select(t => t.Id));
Expand All @@ -440,6 +450,8 @@ public TakeTextCorpus(ITextCorpus corpus, int count)

public override bool IsTokenized => _corpus.IsTokenized;

public override ScrVers Versification => _corpus.Versification;

public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
{
return _corpus.GetRows(textIds).Take(_count);
Expand All @@ -459,6 +471,8 @@ public FlattenTextCorpus(ITextCorpus[] corpora)

public override bool IsTokenized => _corpora.All(corpus => corpus.IsTokenized);

public override ScrVers Versification => _corpora.Length > 0 ? _corpora[0].Versification : null;

public override int Count(bool includeEmpty = true)
{
return _corpora.Sum(corpus => corpus.Count(includeEmpty));
Expand Down
3 changes: 3 additions & 0 deletions src/SIL.Machine/Corpora/DictionaryTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System.Collections.Generic;
using System.Linq;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
Expand All @@ -21,6 +22,8 @@ public DictionaryTextCorpus(IEnumerable<IText> texts)

public bool IsTokenized { get; set; }

public ScrVers Versification { get; set; }

public override int Count(bool includeEmpty = true)
{
return Texts.Sum(t => t.Count(includeEmpty));
Expand Down
3 changes: 3 additions & 0 deletions src/SIL.Machine/Corpora/ITextCorpus.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Collections.Generic;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
Expand All @@ -9,5 +10,7 @@ public interface ITextCorpus : ICorpus<TextRow>
IEnumerable<TextRow> GetRows(IEnumerable<string> textIds);

bool IsTokenized { get; }

ScrVers Versification { get; }
}
}
Loading

0 comments on commit a9058ce

Please sign in to comment.