From a9058ce02408cabbf3427f13b5e5d8e65b40daae Mon Sep 17 00:00:00 2001 From: Damien Daspit Date: Thu, 11 Apr 2024 14:24:32 -0500 Subject: [PATCH] Add support for non-verse text segments in Scripture corpora (#179) - add new ScriptureRef corpus ref class - update Scripture corpora classes to use ScriptureRef - add ScriptureRefUsfmParserHandlerBase class to track ScriptureRef in USFM - update UsfmTextUpdater and UsfmTextBase to use ScriptureRefUsfmParserHandlerBase - add support for updating non-Scripture paragraphs and notes - update NmtPreprocessBuildJob to support non-Scripture segments Co-authored-by: John Lambert --- .../Services/CorpusService.cs | 2 +- .../Services/NmtPreprocessBuildJob.cs | 95 +++-- src/SIL.Machine/Corpora/CorporaExtensions.cs | 26 +- .../Corpora/DictionaryTextCorpus.cs | 3 + src/SIL.Machine/Corpora/ITextCorpus.cs | 3 + src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 124 +++--- src/SIL.Machine/Corpora/ParallelTextRow.cs | 3 + .../Corpora/ParatextBackupTextCorpus.cs | 5 +- src/SIL.Machine/Corpora/ParatextTextCorpus.cs | 11 +- src/SIL.Machine/Corpora/ScriptureElement.cs | 58 +++ src/SIL.Machine/Corpora/ScriptureRef.cs | 136 +++++++ .../ScriptureRefUsfmParserHandlerBase.cs | 249 ++++++++++++ src/SIL.Machine/Corpora/ScriptureText.cs | 72 +++- .../Corpora/ScriptureTextCorpus.cs | 7 +- src/SIL.Machine/Corpora/TextCorpusBase.cs | 2 + src/SIL.Machine/Corpora/UsfmFileText.cs | 5 +- src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs | 9 +- src/SIL.Machine/Corpora/UsfmParser.cs | 1 + src/SIL.Machine/Corpora/UsfmTextBase.cs | 175 +++++---- ...VerseTextUpdater.cs => UsfmTextUpdater.cs} | 211 ++++++++--- src/SIL.Machine/Corpora/UsfmZipText.cs | 12 +- .../Corpora/UsxFileAlignmentCollection.cs | 4 +- src/SIL.Machine/Corpora/VerseRefComparer.cs | 3 + .../Services/NmtPreprocessBuildJobTests.cs | 22 +- .../Corpora/CorporaExtensionsTests.cs | 5 +- .../Corpora/ParallelTextCorpusTests.cs | 76 ++-- .../Corpora/ScriptureRefTests.cs | 47 +++ 
.../Corpora/TestData/usfm/Tes/41MATTes.SFM | 8 +- .../Corpora/UsfmFileTextTests.cs | 181 +++++++-- .../Corpora/UsfmTextUpdaterTests.cs | 356 ++++++++++++++++++ .../Corpora/UsfmTokenizerTests.cs | 18 +- .../Corpora/UsfmVerseTextUpdaterTests.cs | 207 ---------- .../Corpora/UsxZipTextTests.cs | 25 +- 33 files changed, 1591 insertions(+), 570 deletions(-) create mode 100644 src/SIL.Machine/Corpora/ScriptureElement.cs create mode 100644 src/SIL.Machine/Corpora/ScriptureRef.cs create mode 100644 src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs rename src/SIL.Machine/Corpora/{UsfmVerseTextUpdater.cs => UsfmTextUpdater.cs} (56%) create mode 100644 tests/SIL.Machine.Tests/Corpora/ScriptureRefTests.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs delete mode 100644 tests/SIL.Machine.Tests/Corpora/UsfmVerseTextUpdaterTests.cs diff --git a/src/SIL.Machine.AspNetCore/Services/CorpusService.cs b/src/SIL.Machine.AspNetCore/Services/CorpusService.cs index b48e9724..635bbff5 100644 --- a/src/SIL.Machine.AspNetCore/Services/CorpusService.cs +++ b/src/SIL.Machine.AspNetCore/Services/CorpusService.cs @@ -26,7 +26,7 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file break; case FileFormat.Paratext: - corpora.Add(new ParatextBackupTextCorpus(file.Location)); + corpora.Add(new ParatextBackupTextCorpus(file.Location, includeAllText: true)); break; } } diff --git a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs index fd9f6217..8372c28e 100644 --- a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs +++ b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs @@ -128,7 +128,7 @@ CancellationToken cancellationToken continue; int skipCount = 0; - foreach (Row?[] rows in AlignCorpora(sourceTextCorpora, targetTextCorpus)) + foreach (Row?[] rows in AlignTrainCorpus(sourceTextCorpora, targetTextCorpus)) { if (skipCount > 0) { @@ -153,26 +153,6 @@ 
CancellationToken cancellationToken if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) trainCount++; } - - Row? pretranslateRow = rows[0]; - if ( - pretranslateRow is not null - && IsInPretranslate(pretranslateRow, corpus) - && pretranslateRow.SourceSegment.Length > 0 - && pretranslateRow.TargetSegment.Length == 0 - ) - { - pretranslateWriter.WriteStartObject(); - pretranslateWriter.WriteString("corpusId", corpus.Id); - pretranslateWriter.WriteString("textId", pretranslateRow.TextId); - pretranslateWriter.WriteStartArray("refs"); - foreach (object rowRef in pretranslateRow.Refs) - pretranslateWriter.WriteStringValue(rowRef.ToString()); - pretranslateWriter.WriteEndArray(); - pretranslateWriter.WriteString("translation", pretranslateRow.SourceSegment); - pretranslateWriter.WriteEndObject(); - pretranslateCount++; - } } if ((bool?)buildOptionsObject?["use_key_terms"] ?? true) @@ -190,6 +170,23 @@ pretranslateRow is not null } } } + + foreach (Row row in AlignPretranslateCorpus(sourceTextCorpora[0], targetTextCorpus)) + { + if (IsInPretranslate(row, corpus) && row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0) + { + pretranslateWriter.WriteStartObject(); + pretranslateWriter.WriteString("corpusId", corpus.Id); + pretranslateWriter.WriteString("textId", row.TextId); + pretranslateWriter.WriteStartArray("refs"); + foreach (object rowRef in row.Refs) + pretranslateWriter.WriteStringValue(rowRef.ToString()); + pretranslateWriter.WriteEndArray(); + pretranslateWriter.WriteString("translation", row.SourceSegment); + pretranslateWriter.WriteEndObject(); + pretranslateCount++; + } + } } pretranslateWriter.WriteEndArray(); @@ -244,13 +241,13 @@ private static bool IsIncluded( private static bool IsInChapters(IReadOnlyDictionary> bookChapters, object rowRef) { - if (rowRef is not VerseRef vr) + if (rowRef is not ScriptureRef sr) return false; - return bookChapters.TryGetValue(vr.Book, out HashSet? 
chapters) - && (chapters.Contains(vr.ChapterNum) || chapters.Count == 0); + return bookChapters.TryGetValue(sr.Book, out HashSet? chapters) + && (chapters.Contains(sr.ChapterNum) || chapters.Count == 0); } - private static IEnumerable AlignCorpora(IReadOnlyList srcCorpora, ITextCorpus trgCorpus) + private static IEnumerable AlignTrainCorpus(IReadOnlyList srcCorpora, ITextCorpus trgCorpus) { if (trgCorpus.IsScripture()) { @@ -332,7 +329,7 @@ private static bool IsInChapters(IReadOnlyDictionary> bookC { yield return new( vrefs.First().Book, - vrefs.Order().Cast().ToArray(), + vrefs.Order().Select(v => new ScriptureRef(v)).Cast().ToArray(), srcSegBuffer.ToString(), trgSegBuffer.ToString(), rowCount @@ -355,7 +352,7 @@ private static bool IsInChapters(IReadOnlyDictionary> bookC { yield return new( vrefs.First().Book, - vrefs.Order().Cast().ToArray(), + vrefs.Order().Select(v => new ScriptureRef(v)).Cast().ToArray(), srcSegBuffer.ToString(), trgSegBuffer.ToString(), rowCount @@ -365,6 +362,50 @@ private static bool IsInChapters(IReadOnlyDictionary> bookC } } + private static IEnumerable AlignPretranslateCorpus(ITextCorpus srcCorpus, ITextCorpus trgCorpus) + { + int rowCount = 0; + StringBuilder srcSegBuffer = new(); + StringBuilder trgSegBuffer = new(); + List refs = []; + string textId = ""; + foreach (ParallelTextRow row in srcCorpus.AlignRows(trgCorpus, allSourceRows: true)) + { + if (!row.IsTargetRangeStart && row.IsTargetInRange) + { + refs.AddRange(row.Refs); + if (row.SourceText.Length > 0) + { + if (srcSegBuffer.Length > 0) + srcSegBuffer.Append(' '); + srcSegBuffer.Append(row.SourceText); + } + rowCount++; + } + else + { + if (rowCount > 0) + { + yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + textId = ""; + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + refs.Clear(); + rowCount = 0; + } + + textId = row.TextId; + refs.AddRange(row.Refs); + srcSegBuffer.Append(row.SourceText); + trgSegBuffer.Append(row.TargetText); + 
rowCount++; + } + } + + if (rowCount > 0) + yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + } + private record Row( string TextId, IReadOnlyList Refs, diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index 718e2c61..64e87690 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -284,7 +284,11 @@ public static ITextCorpus Flatten(this IEnumerable corpora) bool curTrgLineRange = true; foreach (ParallelTextRow row in parallelCorpus) { - var vref = (VerseRef)row.Ref; + var scriptureRef = (ScriptureRef)row.Ref; + if (!scriptureRef.IsVerse) + continue; + + VerseRef vref = scriptureRef.VerseRef; if ( curRef.HasValue && vref.CompareTo(curRef.Value, null, compareAllVerses: true, compareSegments: false) != 0 @@ -299,14 +303,14 @@ public static ITextCorpus Flatten(this IEnumerable corpora) curRef = vref; if (!curTrgRef.HasValue && row.TargetRefs.Count > 0) { - curTrgRef = (VerseRef)row.TargetRefs[0]; + curTrgRef = ((ScriptureRef)row.TargetRefs[0]).VerseRef; } else if (curTrgRef.HasValue && row.TargetRefs.Count > 0 && !curTrgRef.Value.Equals(row.TargetRefs[0])) { curTrgRef.Value.Simplify(); - var trgRef = (VerseRef)row.TargetRefs[0]; - VerseRef startRef, - endRef; + VerseRef trgRef = ((ScriptureRef)row.TargetRefs[0]).VerseRef; + VerseRef startRef; + VerseRef endRef; if (curTrgRef.Value < trgRef) { startRef = curTrgRef.Value; @@ -353,7 +357,7 @@ public static ITextCorpus Flatten(this IEnumerable corpora) public static bool IsScripture(this ITextCorpus textCorpus) { - return textCorpus is ScriptureTextCorpus; + return textCorpus.Versification != null; } private class TransformTextCorpus : TextCorpusBase @@ -372,6 +376,8 @@ public TransformTextCorpus(ITextCorpus corpus, Func transform, public override bool IsTokenized { get; } + public override ScrVers Versification => _corpus.Versification; + public override int Count(bool 
includeEmpty = true) { return _corpus.Count(includeEmpty); @@ -398,6 +404,8 @@ public WhereTextCorpus(ITextCorpus corpus, Func predicate) public override bool IsTokenized => _corpus.IsTokenized; + public override ScrVers Versification => _corpus.Versification; + public override IEnumerable GetRows(IEnumerable textIds) { return _corpus.GetRows(textIds).Where(_predicate); @@ -419,6 +427,8 @@ public TextFilterTextCorpus(ITextCorpus corpus, Func predicate) public override bool IsTokenized => _corpus.IsTokenized; + public override ScrVers Versification => _corpus.Versification; + public override IEnumerable GetRows(IEnumerable textIds) { return _corpus.GetRows(textIds ?? Texts.Select(t => t.Id)); @@ -440,6 +450,8 @@ public TakeTextCorpus(ITextCorpus corpus, int count) public override bool IsTokenized => _corpus.IsTokenized; + public override ScrVers Versification => _corpus.Versification; + public override IEnumerable GetRows(IEnumerable textIds) { return _corpus.GetRows(textIds).Take(_count); @@ -459,6 +471,8 @@ public FlattenTextCorpus(ITextCorpus[] corpora) public override bool IsTokenized => _corpora.All(corpus => corpus.IsTokenized); + public override ScrVers Versification => _corpora.Length > 0 ? 
_corpora[0].Versification : null; + public override int Count(bool includeEmpty = true) { return _corpora.Sum(corpus => corpus.Count(includeEmpty)); diff --git a/src/SIL.Machine/Corpora/DictionaryTextCorpus.cs b/src/SIL.Machine/Corpora/DictionaryTextCorpus.cs index a11f1605..364d7888 100644 --- a/src/SIL.Machine/Corpora/DictionaryTextCorpus.cs +++ b/src/SIL.Machine/Corpora/DictionaryTextCorpus.cs @@ -1,5 +1,6 @@ using System.Collections.Generic; using System.Linq; +using SIL.Scripture; namespace SIL.Machine.Corpora { @@ -21,6 +22,8 @@ public DictionaryTextCorpus(IEnumerable texts) public bool IsTokenized { get; set; } + public ScrVers Versification { get; set; } + public override int Count(bool includeEmpty = true) { return Texts.Sum(t => t.Count(includeEmpty)); diff --git a/src/SIL.Machine/Corpora/ITextCorpus.cs b/src/SIL.Machine/Corpora/ITextCorpus.cs index 43bf1907..ba0950ef 100644 --- a/src/SIL.Machine/Corpora/ITextCorpus.cs +++ b/src/SIL.Machine/Corpora/ITextCorpus.cs @@ -1,4 +1,5 @@ using System.Collections.Generic; +using SIL.Scripture; namespace SIL.Machine.Corpora { @@ -9,5 +10,7 @@ public interface ITextCorpus : ICorpus IEnumerable GetRows(IEnumerable textIds); bool IsTokenized { get; } + + ScrVers Versification { get; } } } diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index 0e9323ff..9a2415fa 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -64,20 +64,20 @@ public IEnumerable GetRows() textIds = targetTextIds; using (IEnumerator srcEnumerator = SourceCorpus.GetRows(textIds).GetEnumerator()) - using (var trgEnumerator = new TargetCorpusEnumerator(TargetCorpus.GetRows(textIds).GetEnumerator())) + using ( + var trgEnumerator = new TargetCorpusEnumerator( + TargetCorpus.GetRows(textIds).GetEnumerator(), + SourceCorpus.Versification, + TargetCorpus.Versification + ) + ) using (IEnumerator alignmentEnumerator = 
AlignmentCorpus.GetRows(textIds).GetEnumerator()) { - var rangeInfo = new RangeInfo(); - rangeInfo.Versification = - TargetCorpus is ScriptureTextCorpus tc && SourceCorpus is ScriptureTextCorpus - ? tc.Versification - : null; + var rangeInfo = new RangeInfo { TargetVersification = TargetCorpus.Versification }; var sourceSameRefRows = new List(); var targetSameRefRows = new List(); bool srcCompleted = !srcEnumerator.MoveNext(); - if (!srcCompleted && srcEnumerator.Current.Ref is VerseRef verseRef) - trgEnumerator.SourceVersification = verseRef.Versification; bool trgCompleted = !trgEnumerator.MoveNext(); while (!srcCompleted && !trgCompleted) { @@ -105,6 +105,7 @@ TargetCorpus is ScriptureTextCorpus tc && SourceCorpus is ScriptureTextCorpus { yield return rangeInfo.CreateRow(); } + rangeInfo.TextId = srcEnumerator.Current.TextId; rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); targetSameRefRows.Clear(); if (rangeInfo.IsSourceEmpty) @@ -143,6 +144,7 @@ ParallelTextRow row in CreateSourceRows( { yield return rangeInfo.CreateRow(); } + rangeInfo.TextId = trgEnumerator.Current.TextId; rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); sourceSameRefRows.Clear(); if (rangeInfo.IsTargetEmpty) @@ -359,18 +361,13 @@ private IEnumerable CreateRows( else throw new ArgumentNullException("Either a source or target must be specified."); - var sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty(); - var targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty(); + object[] sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty(); + object[] targetRefs = trgRow != null ? 
new object[] { trgRow.Ref } : Array.Empty(); if (targetRefs.Length == 0 && TargetCorpus is ScriptureTextCorpus stc) { targetRefs = sourceRefs - .Cast() - .Select(r => - { - var t = r.Clone(); - t.ChangeVersification(stc.Versification); - return t; - }) + .Cast() + .Select(r => r.ChangeVersification(stc.Versification)) .Cast() .ToArray(); } @@ -486,22 +483,17 @@ private class RangeInfo public bool IsSourceEmpty => SourceSegment.Count == 0; public bool IsTargetEmpty => TargetSegment.Count == 0; - public ScrVers Versification { get; set; } = null; + public ScrVers TargetVersification { get; set; } = null; public ParallelTextRow CreateRow() { object[] trgRefs = TargetRefs.ToArray(); - if (TargetRefs.Count == 0 && Versification != null) + if (TargetRefs.Count == 0 && TargetVersification != null) { trgRefs = SourceRefs .ToArray() - .Cast() - .Select(r => - { - VerseRef t = r.Clone(); - t.ChangeVersification(Versification); - return t; - }) + .Cast() + .Select(r => r.ChangeVersification(TargetVersification)) .Cast() .ToArray(); } @@ -525,14 +517,11 @@ public ParallelTextRow CreateRow() private class DefaultRowRefComparer : IComparer { - private static readonly VerseRefComparer VerseRefComparer = new VerseRefComparer(compareSegments: false); - public int Compare(object x, object y) { - // Do not use the default comparer for VerseRef, since we want to compare all verses in a range or - // sequence - if (x is VerseRef vx && y is VerseRef vy) - return VerseRefComparer.Compare(vx, vy); + // Do not use the default comparer for ScriptureRef, since we want to ignore segments + if (x is ScriptureRef sx && y is ScriptureRef sy) + return sx.CompareTo(sy, compareSegments: false); return Comparer.Default.Compare(x, y); } @@ -541,48 +530,40 @@ public int Compare(object x, object y) private class TargetCorpusEnumerator : DisposableBase, IEnumerator { private readonly IEnumerator _enumerator; - private bool _isScripture = false; - private bool _isEnumerating = false; + private readonly 
bool _isScripture = false; private readonly Queue _verseRows; + private readonly ScrVers _sourceVersification; private TextRow _current; + private bool _isEnumerating = false; - public TargetCorpusEnumerator(IEnumerator enumerator) + public TargetCorpusEnumerator( + IEnumerator enumerator, + ScrVers sourceVersification, + ScrVers targetVersification + ) { _enumerator = enumerator; + _sourceVersification = sourceVersification; + _isScripture = + sourceVersification != null + && targetVersification != null + && sourceVersification != targetVersification; _verseRows = new Queue(); } - public ScrVers SourceVersification { get; set; } - public TextRow Current => _current; object IEnumerator.Current => Current; public bool MoveNext() { - bool result; - if (!_isEnumerating) + if (_isScripture) { - _isEnumerating = true; - result = _enumerator.MoveNext(); - if ( - result - && _enumerator.Current.Ref is VerseRef verseRef - && SourceVersification != null - && SourceVersification != verseRef.Versification - ) - { - _isScripture = true; - } - else + if (!_isEnumerating) { - _current = _enumerator.Current; - return result; + _enumerator.MoveNext(); + _isEnumerating = true; } - } - - if (_isScripture) - { if (_verseRows.Count == 0 && _enumerator.Current != null) CollectVerses(); if (_verseRows.Count > 0) @@ -594,7 +575,7 @@ public bool MoveNext() return false; } - result = _enumerator.MoveNext(); + bool result = _enumerator.MoveNext(); _current = _enumerator.Current; return result; } @@ -603,7 +584,6 @@ public void Reset() { _enumerator.Reset(); _isEnumerating = false; - _isScripture = false; } protected override void DisposeManagedResources() @@ -613,23 +593,25 @@ protected override void DisposeManagedResources() private void CollectVerses() { - var rowList = new List<(VerseRef Ref, TextRow Row)>(); + var rowList = new List<(ScriptureRef Ref, TextRow Row)>(); bool outOfOrder = false; - var prevVerseRef = new VerseRef(); + ScriptureRef prevScrRef = ScriptureRef.Empty; int 
rangeStartOffset = -1; do { TextRow row = _enumerator.Current; - var verseRef = (VerseRef)row.Ref; - if (!prevVerseRef.IsDefault && verseRef.BookNum != prevVerseRef.BookNum) + var scrRef = (ScriptureRef)row.Ref; + if (!prevScrRef.IsEmpty && scrRef.BookNum != prevScrRef.BookNum) break; - verseRef.ChangeVersification(SourceVersification); + scrRef = scrRef.ChangeVersification(_sourceVersification); // convert one-to-many versification mapping to a verse range - if (verseRef.Equals(prevVerseRef)) + if (scrRef.Equals(prevScrRef)) { - var (rangeStartVerseRef, rangeStartRow) = rowList[rowList.Count + rangeStartOffset]; - var flags = TextRowFlags.InRange; + (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ + rowList.Count + rangeStartOffset + ]; + TextRowFlags flags = TextRowFlags.InRange; if (rangeStartRow.IsSentenceStart) flags |= TextRowFlags.SentenceStart; if (rangeStartOffset == -1 && (!rangeStartRow.IsInRange || rangeStartRow.IsRangeStart)) @@ -649,16 +631,16 @@ private void CollectVerses() { rangeStartOffset = -1; } - rowList.Add((verseRef, row)); - if (!outOfOrder && verseRef.CompareTo(prevVerseRef) < 0) + rowList.Add((scrRef, row)); + if (!outOfOrder && scrRef.CompareTo(prevScrRef) < 0) outOfOrder = true; - prevVerseRef = verseRef; + prevScrRef = scrRef; } while (_enumerator.MoveNext()); if (outOfOrder) rowList.Sort((x, y) => x.Ref.CompareTo(y.Ref)); - foreach (var (_, row) in rowList) + foreach ((ScriptureRef _, TextRow row) in rowList) _verseRows.Enqueue(row); } } diff --git a/src/SIL.Machine/Corpora/ParallelTextRow.cs b/src/SIL.Machine/Corpora/ParallelTextRow.cs index afa44118..9b8618e4 100644 --- a/src/SIL.Machine/Corpora/ParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/ParallelTextRow.cs @@ -8,6 +8,9 @@ public class ParallelTextRow : IRow { public ParallelTextRow(string textId, IReadOnlyList sourceRefs, IReadOnlyList targetRefs) { + if (string.IsNullOrEmpty(textId)) + throw new ArgumentNullException(nameof(textId)); + if (sourceRefs.Count 
== 0 && targetRefs.Count == 0) throw new ArgumentNullException("Either a source or target ref must be provided."); diff --git a/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs b/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs index 2279c542..3a88cd27 100644 --- a/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParatextBackupTextCorpus.cs @@ -6,7 +6,7 @@ namespace SIL.Machine.Corpora { public class ParatextBackupTextCorpus : ScriptureTextCorpus { - public ParatextBackupTextCorpus(string fileName, bool includeMarkers = false) + public ParatextBackupTextCorpus(string fileName, bool includeMarkers = false, bool includeAllText = false) { using (ZipArchive archive = ZipFile.OpenRead(fileName)) { @@ -28,7 +28,8 @@ public ParatextBackupTextCorpus(string fileName, bool includeMarkers = false) fileName, sfmEntry.FullName, Versification, - includeMarkers + includeMarkers, + includeAllText ) ); } diff --git a/src/SIL.Machine/Corpora/ParatextTextCorpus.cs b/src/SIL.Machine/Corpora/ParatextTextCorpus.cs index aec48df7..2b9357e9 100644 --- a/src/SIL.Machine/Corpora/ParatextTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParatextTextCorpus.cs @@ -4,7 +4,7 @@ namespace SIL.Machine.Corpora { public class ParatextTextCorpus : ScriptureTextCorpus { - public ParatextTextCorpus(string projectDir, bool includeMarkers = false) + public ParatextTextCorpus(string projectDir, bool includeMarkers = false, bool includeAllText = false) { var parser = new FileParatextProjectSettingsParser(projectDir); ParatextProjectSettings settings = parser.Parse(); @@ -19,7 +19,14 @@ string sfmFileName in Directory.EnumerateFiles( ) { AddText( - new UsfmFileText(settings.Stylesheet, settings.Encoding, sfmFileName, Versification, includeMarkers) + new UsfmFileText( + settings.Stylesheet, + settings.Encoding, + sfmFileName, + Versification, + includeMarkers, + includeAllText + ) ); } } diff --git a/src/SIL.Machine/Corpora/ScriptureElement.cs 
b/src/SIL.Machine/Corpora/ScriptureElement.cs new file mode 100644 index 00000000..9512ea79 --- /dev/null +++ b/src/SIL.Machine/Corpora/ScriptureElement.cs @@ -0,0 +1,58 @@ +using System; + +namespace SIL.Machine.Corpora +{ + public class ScriptureElement : IEquatable, IComparable + { + public ScriptureElement(int position, string name) + { + Position = position; + Name = name; + } + + public int Position { get; } + public string Name { get; } + + int IComparable.CompareTo(ScriptureElement other) + { + return CompareTo(other, strict: true); + } + + public int CompareTo(ScriptureElement other, bool strict = true) + { + if (strict) + { + int res = Position.CompareTo(other.Position); + if (res != 0) + return res; + } + + return Name.CompareTo(other.Name); + } + + public bool Equals(ScriptureElement other) + { + return Position == other.Position && Name == other.Name; + } + + public override bool Equals(object obj) + { + return obj is ScriptureElement se && Equals(se); + } + + public override int GetHashCode() + { + int hashCode = 23; + hashCode = hashCode * 31 + Position.GetHashCode(); + hashCode = hashCode * 31 + Name.GetHashCode(); + return hashCode; + } + + public override string ToString() + { + if (Position == 0) + return Name; + return $"{Position}:{Name}"; + } + } +} diff --git a/src/SIL.Machine/Corpora/ScriptureRef.cs b/src/SIL.Machine/Corpora/ScriptureRef.cs new file mode 100644 index 00000000..bb19f84e --- /dev/null +++ b/src/SIL.Machine/Corpora/ScriptureRef.cs @@ -0,0 +1,136 @@ +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text; +using SIL.Extensions; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + /** + * A reference to a specific text segment in a scripture text. A verse reference is the primary anchor point for + * each text segment. If the text segment is not in a verse, then a path is used to specify the location of the + * segment within the verse. 
A path element consists of the position within the parent and a name. The position is + * 1-based. The position 0 is used when a position is not specified or unknown. The reference is serialized in the + * following format: "[verse reference]/[path element 1]/[path element 2]/...". A path element is serialized as + * "[position]:[name]". For example, the reference for the section header that occurs directly after MAT 1:1 would + * be represented as "MAT 1:1/1:s". Introductory material that occurs at the beginning of a book before the first + * verse is referenced by the "1:0" verse reference. Some non-verse text segments can be nested in another element. + * For example, a table cell might be represented as "MAT 1:1/1:tr/1:tc1". + */ + public class ScriptureRef : IEquatable, IComparable, IComparable + { + public static ScriptureRef Empty { get; } = new ScriptureRef(); + + public static ScriptureRef Parse(string str, ScrVers versification = null) + { + string[] parts = str.Split('/'); + if (parts.Length == 1) + return new ScriptureRef(new VerseRef(parts[0], versification ?? ScrVers.English)); + + string vref = parts[0]; + var path = new List(); + foreach (string part in parts.Skip(1)) + { + string[] elem = part.Split(':'); + if (elem.Length == 1) + path.Add(new ScriptureElement(0, elem[0])); + else + path.Add(new ScriptureElement(int.Parse(elem[0], CultureInfo.InvariantCulture), elem[1])); + } + + return new ScriptureRef(new VerseRef(vref, versification ?? ScrVers.English), path); + } + + public ScriptureRef(VerseRef verseRef = default, IEnumerable path = null) + { + VerseRef = verseRef; + Path = path?.ToArray() ?? 
Array.Empty(); + } + + public VerseRef VerseRef { get; } + public IReadOnlyList Path { get; } + public int BookNum => VerseRef.BookNum; + public int ChapterNum => VerseRef.ChapterNum; + public int VerseNum => VerseRef.VerseNum; + public string Book => VerseRef.Book; + public string Chapter => VerseRef.Chapter; + public string Verse => VerseRef.Verse; + public ScrVers Versification => VerseRef.Versification; + public bool IsEmpty => VerseRef.IsDefault; + public bool IsVerse => VerseRef.VerseNum != 0 && Path.Count == 0; + + public ScriptureRef ChangeVersification(ScrVers versification) + { + VerseRef vr = VerseRef.Clone(); + vr.ChangeVersification(versification); + return new ScriptureRef(vr, Path); + } + + public bool Overlaps(ScriptureRef other) + { + if (!VerseRef.AreOverlappingVersesRanges(VerseRef, other.VerseRef)) + return false; + + return Path.SequenceEqual(other.Path); + } + + int IComparable.CompareTo(ScriptureRef other) + { + return CompareTo(other, compareSegments: true); + } + + public int CompareTo(ScriptureRef other, bool compareSegments = true, bool strict = true) + { + IComparer comparer = compareSegments ? 
VerseRefComparer.Default : VerseRefComparer.IgnoreSegments; + int res = comparer.Compare(VerseRef, other.VerseRef); + if (res != 0) + return res; + + foreach ((ScriptureElement se1, ScriptureElement se2) in Path.Zip(other.Path)) + { + res = se1.CompareTo(se2, strict); + if (res != 0) + return res; + } + + return Path.Count - other.Path.Count; + } + + public int CompareTo(object obj) + { + if (!(obj is ScriptureRef sr)) + throw new ArgumentException("obj is not a ScriptureRef."); + + return CompareTo(sr); + } + + public bool Equals(ScriptureRef other) + { + return VerseRef.Equals(other.VerseRef) && Path.SequenceEqual(other.Path); + } + + public override bool Equals(object obj) + { + return obj is ScriptureRef sr && Equals(sr); + } + + public override int GetHashCode() + { + int hashCode = 23; + hashCode = hashCode * 31 + VerseRef.GetHashCode(); + hashCode = hashCode * 31 + Path.GetSequenceHashCode(); + return hashCode; + } + + public override string ToString() + { + var sb = new StringBuilder(); + sb.Append(VerseRef); + foreach (ScriptureElement se in Path) + sb.Append($"/{se}"); + return sb.ToString(); + } + } +} diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs new file mode 100644 index 00000000..2f0ffc86 --- /dev/null +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -0,0 +1,249 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public enum ScriptureTextType + { + NonVerse, + Verse, + Note + } + + public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase + { + private VerseRef _curVerseRef; + private readonly Stack _curElements; + private readonly Stack _curTextType; + private bool _duplicateVerse = false; + + protected ScriptureRefUsfmParserHandlerBase() + { + _curElements = new Stack(); + _curTextType = new Stack(); + } + + protected ScriptureTextType CurrentTextType 
=> + _curTextType.Count == 0 ? ScriptureTextType.NonVerse : _curTextType.Peek(); + + public override void EndUsfm(UsfmParserState state) + { + EndVerseText(state); + } + + public override void Chapter( + UsfmParserState state, + string number, + string marker, + string altNumber, + string pubNumber + ) + { + EndVerseText(state); + UpdateVerseRef(state.VerseRef, marker); + } + + public override void Verse( + UsfmParserState state, + string number, + string marker, + string altNumber, + string pubNumber + ) + { + if (state.VerseRef.Equals(_curVerseRef)) + { + EndVerseText(state); + // ignore duplicate verses + _duplicateVerse = true; + } + else if (VerseRef.AreOverlappingVersesRanges(number, _curVerseRef.Verse)) + { + // merge overlapping verse ranges in to one range + VerseRef verseRef = _curVerseRef.Clone(); + verseRef.Verse = CorporaUtils.MergeVerseRanges(number, _curVerseRef.Verse); + UpdateVerseRef(verseRef, marker); + } + else + { + EndVerseText(state); + UpdateVerseRef(state.VerseRef, marker); + StartVerseText(state); + } + } + + public override void StartPara( + UsfmParserState state, + string marker, + bool unknown, + IReadOnlyList attributes + ) + { + if (_curVerseRef.IsDefault) + UpdateVerseRef(state.VerseRef, marker); + + if (!state.IsVerseText) + { + StartParentElement(marker); + StartNonVerseText(state); + } + } + + public override void EndPara(UsfmParserState state, string marker) + { + if (CurrentTextType == ScriptureTextType.NonVerse) + { + EndParentElement(); + EndNonVerseText(state); + } + } + + public override void StartRow(UsfmParserState state, string marker) + { + if (CurrentTextType == ScriptureTextType.NonVerse) + StartParentElement(marker); + } + + public override void EndRow(UsfmParserState state, string marker) + { + if (CurrentTextType == ScriptureTextType.NonVerse) + EndParentElement(); + } + + public override void StartCell(UsfmParserState state, string marker, string align, int colspan) + { + if (CurrentTextType == 
ScriptureTextType.NonVerse) + { + StartParentElement(marker); + StartNonVerseText(state); + } + } + + public override void EndCell(UsfmParserState state, string marker) + { + if (CurrentTextType == ScriptureTextType.NonVerse) + { + EndParentElement(); + EndNonVerseText(state); + } + } + + public override void StartSidebar(UsfmParserState state, string marker, string category) + { + StartParentElement(marker); + } + + public override void EndSidebar(UsfmParserState state, string marker, bool closed) + { + EndParentElement(); + } + + public override void StartNote(UsfmParserState state, string marker, string caller, string category) + { + NextElement(marker); + StartNoteText(state); + } + + public override void EndNote(UsfmParserState state, string marker, bool closed) + { + EndNoteText(state); + } + + public override void Ref(UsfmParserState state, string marker, string display, string target) { } + + protected virtual void StartVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { } + + protected virtual void EndVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { } + + protected virtual void StartNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) { } + + protected virtual void EndNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) { } + + protected virtual void StartNoteText(UsfmParserState state, ScriptureRef scriptureRef) { } + + protected virtual void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef) { } + + private void StartVerseText(UsfmParserState state) + { + _duplicateVerse = false; + _curTextType.Push(ScriptureTextType.Verse); + StartVerseText(state, CreateVerseRefs()); + } + + private void EndVerseText(UsfmParserState state) + { + if (!_duplicateVerse && _curVerseRef.VerseNum != 0) + { + EndVerseText(state, CreateVerseRefs()); + _curTextType.Pop(); + } + } + + private void StartNonVerseText(UsfmParserState state) + { + _curTextType.Push(ScriptureTextType.NonVerse); + StartNonVerseText(state, 
CreateNonVerseRef()); + } + + private void EndNonVerseText(UsfmParserState state) + { + EndNonVerseText(state, CreateNonVerseRef()); + _curTextType.Pop(); + } + + private void StartNoteText(UsfmParserState state) + { + _curTextType.Push(ScriptureTextType.Note); + StartNoteText(state, CreateNonVerseRef()); + } + + private void EndNoteText(UsfmParserState state) + { + EndNoteText(state, CreateNonVerseRef()); + _curTextType.Pop(); + } + + private void UpdateVerseRef(VerseRef verseRef, string marker) + { + if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef)) + { + _curElements.Clear(); + _curElements.Push(new ScriptureElement(0, marker)); + } + _curVerseRef = verseRef; + } + + private void NextElement(string marker) + { + ScriptureElement prevElem = _curElements.Pop(); + _curElements.Push(new ScriptureElement(prevElem.Position + 1, marker)); + } + + private void StartParentElement(string marker) + { + NextElement(marker); + _curElements.Push(new ScriptureElement(0, marker)); + } + + private void EndParentElement() + { + _curElements.Pop(); + } + + private IReadOnlyList CreateVerseRefs() + { + return _curVerseRef.HasMultiple + ? _curVerseRef.AllVerses().Select(v => new ScriptureRef(v)).ToArray() + : new[] { new ScriptureRef(_curVerseRef) }; + } + + private ScriptureRef CreateNonVerseRef() + { + return new ScriptureRef( + _curVerseRef.HasMultiple ? 
_curVerseRef.AllVerses().Last() : _curVerseRef, + _curElements.Where(e => e.Position > 0).Reverse() + ); + } + } +} diff --git a/src/SIL.Machine/Corpora/ScriptureText.cs b/src/SIL.Machine/Corpora/ScriptureText.cs index 5fee2f23..b55b328d 100644 --- a/src/SIL.Machine/Corpora/ScriptureText.cs +++ b/src/SIL.Machine/Corpora/ScriptureText.cs @@ -17,24 +17,59 @@ public override IEnumerable GetRows() { var rowList = new List(); bool outOfOrder = false; - var prevVerseRef = new VerseRef(); + var prevScrRef = new ScriptureRef(); foreach (TextRow r in GetVersesInDocOrder()) { TextRow row = r; - var verseRef = (VerseRef)row.Ref; + var scrRef = (ScriptureRef)row.Ref; rowList.Add(row); - if (!outOfOrder && verseRef.CompareTo(prevVerseRef) < 0) + if (!outOfOrder && scrRef.CompareTo(prevScrRef) < 0) outOfOrder = true; - prevVerseRef = verseRef; + prevScrRef = scrRef; } if (outOfOrder) - rowList.Sort((x, y) => ((VerseRef)x.Ref).CompareTo(y.Ref)); + rowList.Sort((x, y) => ((ScriptureRef)x.Ref).CompareTo(y.Ref)); return rowList; } protected abstract IEnumerable GetVersesInDocOrder(); + protected IEnumerable CreateRows( + IReadOnlyList scriptureRefs, + string text = "", + bool isSentenceStart = true + ) + { + if (scriptureRefs.Count > 1) + { + bool firstVerse = true; + foreach (ScriptureRef scriptureRef in scriptureRefs) + { + if (firstVerse) + { + TextRowFlags flags = TextRowFlags.InRange | TextRowFlags.RangeStart; + if (isSentenceStart) + flags |= TextRowFlags.SentenceStart; + yield return CreateRow(text, scriptureRef, flags); + firstVerse = false; + } + else + { + yield return CreateEmptyRow(scriptureRef, TextRowFlags.InRange); + } + } + } + else + { + yield return CreateRow( + text, + scriptureRefs[0], + isSentenceStart ? 
TextRowFlags.SentenceStart : TextRowFlags.None + ); + } + } + protected IEnumerable CreateRows(VerseRef verseRef, string text = "", bool isSentenceStart = true) { if (verseRef.HasMultiple) @@ -44,15 +79,15 @@ protected IEnumerable CreateRows(VerseRef verseRef, string text = "", b { if (firstVerse) { - var flags = TextRowFlags.InRange | TextRowFlags.RangeStart; + TextRowFlags flags = TextRowFlags.InRange | TextRowFlags.RangeStart; if (isSentenceStart) flags |= TextRowFlags.SentenceStart; - yield return CreateRow(text, vref, flags); + yield return CreateRow(text, new ScriptureRef(vref), flags); firstVerse = false; } else { - yield return CreateEmptyRow(vref, TextRowFlags.InRange); + yield return CreateEmptyRow(new ScriptureRef(vref), TextRowFlags.InRange); } } } @@ -60,12 +95,31 @@ protected IEnumerable CreateRows(VerseRef verseRef, string text = "", b { yield return CreateRow( text, - verseRef, + new ScriptureRef(verseRef), isSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None ); } } + protected TextRow CreateRow( + VerseRef verseRef, + IEnumerable elements, + string text = "", + bool isSentenceStart = true + ) + { + return CreateRow( + text, + new ScriptureRef(verseRef, elements), + isSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None + ); + } + + protected TextRow CreateRow(ScriptureRef scriptureRef, string text = "", bool isSentenceStart = true) + { + return CreateRow(text, scriptureRef, isSentenceStart ? 
TextRowFlags.SentenceStart : TextRowFlags.None); + } + protected VerseRef CreateVerseRef(string chapter, string verse) { return new VerseRef(Id, chapter, verse, Versification); diff --git a/src/SIL.Machine/Corpora/ScriptureTextCorpus.cs b/src/SIL.Machine/Corpora/ScriptureTextCorpus.cs index 9d210703..83762caf 100644 --- a/src/SIL.Machine/Corpora/ScriptureTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ScriptureTextCorpus.cs @@ -39,9 +39,10 @@ public ScriptureTextCorpus(ScrVers versification, IEnumerable texts) Versification = versification; } - protected ScriptureTextCorpus() { } - - public ScrVers Versification { get; protected set; } = ScrVers.English; + protected ScriptureTextCorpus() + { + Versification = ScrVers.English; + } private class VersificationRefCorpusText : ScriptureText { diff --git a/src/SIL.Machine/Corpora/TextCorpusBase.cs b/src/SIL.Machine/Corpora/TextCorpusBase.cs index 1e22e53a..db690654 100644 --- a/src/SIL.Machine/Corpora/TextCorpusBase.cs +++ b/src/SIL.Machine/Corpora/TextCorpusBase.cs @@ -1,4 +1,5 @@ using System.Collections.Generic; +using SIL.Scripture; namespace SIL.Machine.Corpora { @@ -6,6 +7,7 @@ public abstract class TextCorpusBase : CorpusBase, ITextCorpus { public abstract IEnumerable Texts { get; } public abstract bool IsTokenized { get; } + public abstract ScrVers Versification { get; } public override IEnumerable GetRows() { diff --git a/src/SIL.Machine/Corpora/UsfmFileText.cs b/src/SIL.Machine/Corpora/UsfmFileText.cs index e658259e..6f1c34f8 100644 --- a/src/SIL.Machine/Corpora/UsfmFileText.cs +++ b/src/SIL.Machine/Corpora/UsfmFileText.cs @@ -14,9 +14,10 @@ public UsfmFileText( Encoding encoding, string fileName, ScrVers versification = null, - bool includeMarkers = false + bool includeMarkers = false, + bool includeAllText = false ) - : base(GetId(fileName, encoding), stylesheet, encoding, versification, includeMarkers) + : base(GetId(fileName, encoding), stylesheet, encoding, versification, includeMarkers, includeAllText) { 
_fileName = fileName; } diff --git a/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs b/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs index c21c39b0..e3e452ed 100644 --- a/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs +++ b/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs @@ -12,13 +12,18 @@ public UsfmFileTextCorpus( string projectPath, ScrVers versification = null, bool includeMarkers = false, - string filePattern = "*.SFM" + string filePattern = "*.SFM", + bool includeAllText = false ) { Versification = versification ?? ScrVers.English; var stylesheet = new UsfmStylesheet(stylesheetFileName); foreach (string sfmFileName in Directory.EnumerateFiles(projectPath, filePattern)) - AddText(new UsfmFileText(stylesheet, encoding, sfmFileName, Versification, includeMarkers)); + { + AddText( + new UsfmFileText(stylesheet, encoding, sfmFileName, Versification, includeMarkers, includeAllText) + ); + } } } } diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index 4cde32b4..ce504762 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -141,6 +141,7 @@ public bool ProcessToken() // If past end if (State.Index >= State.Tokens.Count - 1) { + CloseAll(); Handler?.EndUsfm(State); return false; } diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs index 956fbd2a..3af05c30 100644 --- a/src/SIL.Machine/Corpora/UsfmTextBase.cs +++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs @@ -1,6 +1,5 @@ using System.Collections.Generic; using System.IO; -using System.Linq; using System.Text; using SIL.Machine.Utils; using SIL.Scripture; @@ -12,19 +11,22 @@ public abstract class UsfmTextBase : ScriptureText private readonly UsfmStylesheet _stylesheet; private readonly Encoding _encoding; private readonly bool _includeMarkers; + private readonly bool _includeAllText; protected UsfmTextBase( string id, UsfmStylesheet stylesheet, Encoding encoding, ScrVers versification, - bool 
includeMarkers + bool includeMarkers, + bool includeAllText ) : base(id, versification) { _stylesheet = stylesheet; _encoding = encoding; _includeMarkers = includeMarkers; + _includeAllText = includeAllText; } protected override IEnumerable GetVersesInDocOrder() @@ -46,12 +48,11 @@ private string ReadUsfm() } } - private class TextRowCollector : UsfmParserHandlerBase + private class TextRowCollector : ScriptureRefUsfmParserHandlerBase { private readonly UsfmTextBase _text; private readonly List _rows; - private VerseRef _verseRef; - private readonly StringBuilder _verseText; + private readonly Stack _rowTexts; private bool _sentenceStart; private readonly List _nextParaTokens; private bool _nextParaTextStarted = false; @@ -60,24 +61,12 @@ public TextRowCollector(UsfmTextBase text) { _text = text; _rows = new List(); - _verseText = new StringBuilder(); + _rowTexts = new Stack(); _nextParaTokens = new List(); } public IEnumerable Rows => _rows; - public override void Chapter( - UsfmParserState state, - string number, - string marker, - string altNumber, - string pubNumber - ) - { - VerseCompleted(nextSentenceStart: true); - _verseRef = default; - } - public override void Verse( UsfmParserState state, string number, @@ -86,27 +75,8 @@ public override void Verse( string pubNumber ) { - if (_verseRef.IsDefault) - { - _verseRef = state.VerseRef; - } - else if (state.VerseRef.Equals(_verseRef)) - { - VerseCompleted(); + base.Verse(state, number, marker, altNumber, pubNumber); - // ignore duplicate verse - _verseRef = default; - } - else if (VerseRef.AreOverlappingVersesRanges(number, _verseRef.Verse)) - { - // merge overlapping verse ranges in to one range - _verseRef.Verse = CorporaUtils.MergeVerseRanges(number, _verseRef.Verse); - } - else - { - VerseCompleted(); - _verseRef = state.VerseRef; - } _nextParaTextStarted = true; _nextParaTokens.Clear(); } @@ -118,32 +88,38 @@ public override void StartPara( IReadOnlyList attributes ) { + base.StartPara(state, marker, 
unknown, attributes); + HandlePara(state); } public override void StartRow(UsfmParserState state, string marker) { + base.StartRow(state, marker); + HandlePara(state); } public override void StartCell(UsfmParserState state, string marker, string align, int colspan) { - if (_verseRef.IsDefault) - return; + base.StartCell(state, marker, align, colspan); if (_text._includeMarkers) { OutputMarker(state); } - else + else if (CurrentTextType == ScriptureTextType.Verse) { - if (_verseText.Length > 0 && !char.IsWhiteSpace(_verseText[_verseText.Length - 1])) - _verseText.Append(" "); + StringBuilder verseText = _rowTexts.Peek(); + if (verseText.Length > 0 && !char.IsWhiteSpace(verseText[verseText.Length - 1])) + verseText.Append(" "); } } public override void Ref(UsfmParserState state, string marker, string display, string target) { + base.Ref(state, marker, display, target); + OutputMarker(state); } @@ -154,6 +130,8 @@ public override void StartChar( IReadOnlyList attributes ) { + base.StartChar(state, markerWithoutPlus, unknown, attributes); + OutputMarker(state); } @@ -164,108 +142,155 @@ public override void EndChar( bool closed ) { + base.EndChar(state, marker, attributes, closed); + if (_text._includeMarkers && attributes != null && state.PrevToken?.Type == UsfmTokenType.Attribute) - _verseText.Append(state.PrevToken); + _rowTexts.Peek().Append(state.PrevToken); if (closed) OutputMarker(state); + if (!_text._includeMarkers && marker == "rq") - _verseText.TrimEnd(); + _rowTexts.Peek().TrimEnd(); } public override void StartNote(UsfmParserState state, string marker, string caller, string category) { + base.StartNote(state, marker, caller, category); + OutputMarker(state); } public override void EndNote(UsfmParserState state, string marker, bool closed) { + base.EndNote(state, marker, closed); + if (closed) OutputMarker(state); } public override void OptBreak(UsfmParserState state) { - if (!_text._includeMarkers) - _verseText.TrimEnd(); + base.OptBreak(state); + + if 
(_text._includeMarkers) + { + _rowTexts.Peek().Append("//"); + } + else if (CurrentTextType != ScriptureTextType.Verse || state.IsVerseText) + { + _rowTexts.Peek().TrimEnd(); + } } public override void Text(UsfmParserState state, string text) { - if (_verseRef.IsDefault || !state.IsVersePara) + base.Text(state, text); + + if (_rowTexts.Count == 0) return; + StringBuilder rowText = _rowTexts.Peek(); if (_text._includeMarkers) { text = text.TrimEnd('\r', '\n'); - if (text.Length > 0 && !state.Stack.Any(e => e.Type == UsfmElementType.Sidebar)) + if (text.Length > 0) { if (!text.IsWhiteSpace()) { foreach (UsfmToken token in _nextParaTokens) - _verseText.Append(token); + rowText.Append(token); _nextParaTokens.Clear(); _nextParaTextStarted = true; } - if (_verseText.Length == 0 || char.IsWhiteSpace(_verseText[_verseText.Length - 1])) - { + if (rowText.Length == 0 || char.IsWhiteSpace(rowText[rowText.Length - 1])) text = text.TrimStart(); - } - _verseText.Append(text); + rowText.Append(text); } } - else if (state.IsVerseText && text.Length > 0) + else if (text.Length > 0 && (CurrentTextType != ScriptureTextType.Verse || state.IsVerseText)) { if ( state.PrevToken?.Type == UsfmTokenType.End - && (_verseText.Length == 0 || char.IsWhiteSpace(_verseText[_verseText.Length - 1])) + && (rowText.Length == 0 || char.IsWhiteSpace(rowText[rowText.Length - 1])) ) { text = text.TrimStart(); } - _verseText.Append(text); + rowText.Append(text); } } - public override void EndUsfm(UsfmParserState state) + protected override void StartVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { - VerseCompleted(); + _rowTexts.Push(new StringBuilder()); } - private void OutputMarker(UsfmParserState state) + protected override void EndVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) + { + string text = _rowTexts.Pop().ToString(); + _rows.AddRange(_text.CreateRows(scriptureRefs, text, _sentenceStart)); + _sentenceStart = state.Token.Marker == "c" || text.HasSentenceEnding(); + 
} + + protected override void StartNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) + { + _rowTexts.Push(new StringBuilder()); + } + + protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) { - if (_verseRef.IsDefault || !_text._includeMarkers) + string text = _rowTexts.Pop().ToString(); + if (_text._includeAllText) + _rows.Add(_text.CreateRow(scriptureRef, text, _sentenceStart)); + } + + protected override void StartNoteText(UsfmParserState state, ScriptureRef scriptureRef) + { + if (_text._includeMarkers) return; - if (_nextParaTextStarted) - _verseText.Append(state.Token); - else - _nextParaTokens.Add(state.Token); + _rowTexts.Push(new StringBuilder()); } - private void VerseCompleted(bool? nextSentenceStart = null) + protected override void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef) { - if (_verseRef.IsDefault) + if (_text._includeMarkers) return; - string text = _verseText.ToString(); - _rows.AddRange(_text.CreateRows(_verseRef, text, _sentenceStart)); - _sentenceStart = nextSentenceStart ?? 
text.HasSentenceEnding(); - _verseText.Clear(); + string text = _rowTexts.Pop().ToString(); + if (_text._includeAllText) + _rows.Add(_text.CreateRow(scriptureRef, text, _sentenceStart)); + } + + private void OutputMarker(UsfmParserState state) + { + if (!_text._includeMarkers || _rowTexts.Count == 0) + return; + + if (_nextParaTextStarted) + _rowTexts.Peek().Append(state.Token); + else + _nextParaTokens.Add(state.Token); } private void HandlePara(UsfmParserState state) { - if (_verseRef.IsDefault) + if (_rowTexts.Count == 0) return; - if (state.IsVersePara) + foreach (StringBuilder rowText in _rowTexts) + { + if (rowText.Length > 0 && !char.IsWhiteSpace(rowText[rowText.Length - 1])) + rowText.Append(" "); + } + if (CurrentTextType == ScriptureTextType.Verse) { - if (_verseText.Length > 0 && !char.IsWhiteSpace(_verseText[_verseText.Length - 1])) - _verseText.Append(" "); _nextParaTokens.Add(state.Token); _nextParaTextStarted = false; } + if (!state.IsVersePara) + _sentenceStart = true; } } } diff --git a/src/SIL.Machine/Corpora/UsfmVerseTextUpdater.cs b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs similarity index 56% rename from src/SIL.Machine/Corpora/UsfmVerseTextUpdater.cs rename to src/SIL.Machine/Corpora/UsfmTextUpdater.cs index b50b2cb9..8b44ea44 100644 --- a/src/SIL.Machine/Corpora/UsfmVerseTextUpdater.cs +++ b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; using System.Linq; -using SIL.Scripture; namespace SIL.Machine.Corpora { @@ -9,43 +8,51 @@ namespace SIL.Machine.Corpora * This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified * text. 
*/ - public class UsfmVerseTextUpdater : UsfmParserHandlerBase + public class UsfmTextUpdater : ScriptureRefUsfmParserHandlerBase { - private readonly IReadOnlyList<(IReadOnlyList, string)> _rows; + private readonly IReadOnlyList<(IReadOnlyList, string)> _rows; private readonly List _tokens; private readonly string _idText; private readonly bool _stripAllText; + private readonly bool _strictComparison; + private readonly Stack _replace; private int _rowIndex; private int _tokenIndex; - private bool _replaceText; - public UsfmVerseTextUpdater( - IReadOnlyList<(IReadOnlyList, string)> rows = null, + public UsfmTextUpdater( + IReadOnlyList<(IReadOnlyList, string)> rows = null, string idText = null, - bool stripAllText = false + bool stripAllText = false, + bool strictComparison = true ) { - _rows = rows ?? Array.Empty<(IReadOnlyList, string)>(); + _rows = rows ?? Array.Empty<(IReadOnlyList, string)>(); _tokens = new List(); _idText = idText; _stripAllText = stripAllText; + _strictComparison = strictComparison; + _replace = new Stack(); } public IReadOnlyList Tokens => _tokens; + private bool ReplaceText => _stripAllText || (_replace.Count > 0 && _replace.Peek()); + public override void StartBook(UsfmParserState state, string marker, string code) { CollectTokens(state); if (_idText != null) - { _tokens.Add(new UsfmToken(_idText + " ")); - _replaceText = true; - } + _replace.Push(_idText != null); + + base.StartBook(state, marker, code); } public override void EndBook(UsfmParserState state, string marker) { - _replaceText = false; + _replace.Pop(); + + base.EndBook(state, marker); } public override void StartPara( @@ -55,37 +62,45 @@ public override void StartPara( IReadOnlyList attributes ) { - if (!state.IsVersePara) - _replaceText = false; CollectTokens(state); + + base.StartPara(state, marker, unknown, attributes); } public override void StartRow(UsfmParserState state, string marker) { CollectTokens(state); + + base.StartRow(state, marker); } public override void 
StartCell(UsfmParserState state, string marker, string align, int colspan) { CollectTokens(state); + + base.StartCell(state, marker, align, colspan); } public override void EndCell(UsfmParserState state, string marker) { CollectTokens(state); + + base.EndCell(state, marker); } public override void StartSidebar(UsfmParserState state, string marker, string category) { - _replaceText = false; CollectTokens(state); + + base.StartSidebar(state, marker, category); } public override void EndSidebar(UsfmParserState state, string marker, bool closed) { - _replaceText = false; if (closed) CollectTokens(state); + + base.EndSidebar(state, marker, closed); } public override void Chapter( @@ -96,8 +111,9 @@ public override void Chapter( string pubNumber ) { - _replaceText = false; CollectTokens(state); + + base.Chapter(state, number, marker, altNumber, pubNumber); } public override void Milestone( @@ -108,6 +124,8 @@ IReadOnlyList attributes ) { CollectTokens(state); + + base.Milestone(state, marker, startMilestone, attributes); } public override void Verse( @@ -118,43 +136,9 @@ public override void Verse( string pubNumber ) { - _replaceText = false; CollectTokens(state); - while (_rowIndex < _rows.Count) - { - var (verseRefs, text) = _rows[_rowIndex]; - bool stop = false; - foreach (VerseRef verseRef in verseRefs) - { - int compare = verseRef.CompareTo(state.VerseRef, compareAllVerses: true); - if (compare == 0) - { - _tokens.Add(new UsfmToken(text + " ")); - _replaceText = true; - break; - } - else - { - if (state.VerseRef.AllVerses().Any(v => v.Equals(verseRef))) - { - _tokens.Add(new UsfmToken(text + " ")); - _replaceText = true; - break; - } - if (compare > 0) - { - stop = true; - break; - } - } - } - - if (stop) - break; - else - _rowIndex++; - } + base.Verse(state, number, marker, altNumber, pubNumber); } public override void StartChar( @@ -165,10 +149,12 @@ IReadOnlyList attributes ) { // strip out char-style markers in verses that are being replaced - if (_stripAllText 
|| (_replaceText && state.IsVersePara)) + if (ReplaceText) SkipTokens(state); else CollectTokens(state); + + base.StartChar(state, markerWithoutPlus, unknown, attributes); } public override void EndChar( @@ -179,60 +165,126 @@ bool closed ) { // strip out char-style markers in verses that are being replaced - if (closed && (_stripAllText || (_replaceText && state.IsVersePara))) + if (closed && ReplaceText) SkipTokens(state); + + base.EndChar(state, marker, attributes, closed); } public override void StartNote(UsfmParserState state, string marker, string caller, string category) { // strip out notes in verses that are being replaced - if (_stripAllText || (_replaceText && state.IsVersePara)) + if (ReplaceText) SkipTokens(state); else CollectTokens(state); + + base.StartNote(state, marker, caller, category); } public override void EndNote(UsfmParserState state, string marker, bool closed) { // strip out notes in verses that are being replaced - if (closed && (_stripAllText || (_replaceText && state.IsVersePara))) + if (closed && ReplaceText) SkipTokens(state); + + base.EndNote(state, marker, closed); } public override void Ref(UsfmParserState state, string marker, string display, string target) { // strip out ref in verses that are being replaced - if (_stripAllText || (_replaceText && state.IsVersePara)) + if (ReplaceText) SkipTokens(state); else CollectTokens(state); + + base.Ref(state, marker, display, target); } public override void Text(UsfmParserState state, string text) { // strip out text in verses that are being replaced - if (_stripAllText || (_replaceText && (state.IsVersePara || state.ParaTag.Marker == "id"))) + if (ReplaceText) SkipTokens(state); else CollectTokens(state); + + base.Text(state, text); } public override void OptBreak(UsfmParserState state) { // strip out optbreaks in verses that are being replaced - if (_stripAllText || (_replaceText && state.IsVersePara)) + if (ReplaceText) SkipTokens(state); else CollectTokens(state); + + 
base.OptBreak(state); } public override void Unmatched(UsfmParserState state, string marker) { // strip out unmatched end markers in verses that are being replaced - if (_stripAllText || (_replaceText && state.IsVersePara)) + if (ReplaceText) SkipTokens(state); else CollectTokens(state); + + base.Unmatched(state, marker); + } + + protected override void StartVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) + { + IReadOnlyList rowTexts = AdvanceRows(scriptureRefs); + _tokens.AddRange(rowTexts.Select(t => new UsfmToken(t + " "))); + _replace.Push(rowTexts.Count > 0); + } + + protected override void EndVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) + { + _replace.Pop(); + } + + protected override void StartNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) + { + IReadOnlyList rowTexts = AdvanceRows(new[] { scriptureRef }); + _tokens.AddRange(rowTexts.Select(t => new UsfmToken(t + " "))); + _replace.Push(rowTexts.Count > 0); + } + + protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) + { + _replace.Pop(); + } + + protected override void StartNoteText(UsfmParserState state, ScriptureRef scriptureRef) + { + IReadOnlyList rowTexts = AdvanceRows(new[] { scriptureRef }); + if (rowTexts.Count > 0) + { + _tokens.Add(state.Token); + _tokens.Add(new UsfmToken(UsfmTokenType.Character, "ft", null, "ft*")); + for (int i = 0; i < rowTexts.Count; i++) + { + string text = rowTexts[i]; + if (i < rowTexts.Count - 1) + text += " "; + _tokens.Add(new UsfmToken(text)); + } + _tokens.Add(new UsfmToken(UsfmTokenType.End, state.Token.EndMarker, null, null)); + _replace.Push(true); + } + else + { + _replace.Push(_replace.Peek()); + } + } + + protected override void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef) + { + _replace.Pop(); } public string GetUsfm(string stylesheetFileName = "usfm.sty") @@ -246,6 +298,45 @@ public string GetUsfm(UsfmStylesheet stylesheet) return tokenizer.Detokenize(_tokens); 
} + private IReadOnlyList AdvanceRows(IReadOnlyList segScrRefs) + { + var rowTexts = new List(); + int i = 0; + while (_rowIndex < _rows.Count && i < segScrRefs.Count) + { + (IReadOnlyList rowScrRefs, string text) = _rows[_rowIndex]; + bool stop = false; + foreach (ScriptureRef rowScrRef in rowScrRefs) + { + bool found = false; + for (; i < segScrRefs.Count; i++) + { + int compare = rowScrRef.CompareTo(segScrRefs[i], compareSegments: false, _strictComparison); + if (compare == 0) + { + rowTexts.Add(text); + i++; + found = true; + break; + } + else if (compare > 0) + { + stop = true; + break; + } + } + if (stop || found) + break; + } + + if (stop) + break; + else + _rowIndex++; + } + return rowTexts; + } + private void CollectTokens(UsfmParserState state) { while (_tokenIndex <= state.Index + state.SpecialTokenCount) diff --git a/src/SIL.Machine/Corpora/UsfmZipText.cs b/src/SIL.Machine/Corpora/UsfmZipText.cs index 19bf3159..c1ce4864 100644 --- a/src/SIL.Machine/Corpora/UsfmZipText.cs +++ b/src/SIL.Machine/Corpora/UsfmZipText.cs @@ -17,9 +17,17 @@ public UsfmZipText( string archiveFileName, string path, ScrVers versification = null, - bool includeMarkers = false + bool includeMarkers = false, + bool includeAllText = false ) - : base(GetId(archiveFileName, path, encoding), stylesheet, encoding, versification, includeMarkers) + : base( + GetId(archiveFileName, path, encoding), + stylesheet, + encoding, + versification, + includeMarkers, + includeAllText + ) { _archiveFileName = archiveFileName; _path = path; diff --git a/src/SIL.Machine/Corpora/UsxFileAlignmentCollection.cs b/src/SIL.Machine/Corpora/UsxFileAlignmentCollection.cs index 1aa88d4c..1af7062f 100644 --- a/src/SIL.Machine/Corpora/UsxFileAlignmentCollection.cs +++ b/src/SIL.Machine/Corpora/UsxFileAlignmentCollection.cs @@ -13,8 +13,6 @@ namespace SIL.Machine.Corpora { public class UsxFileAlignmentCollection : IAlignmentCollection { - private static readonly VerseRefComparer VerseRefComparer = new 
VerseRefComparer(); - private readonly IRangeTokenizer _srcWordTokenizer; private readonly IRangeTokenizer _trgWordTokenizer; private readonly string _srcFileName; @@ -77,7 +75,7 @@ public IEnumerable GetRows() var srcVerseRef = new VerseRef(Id, srcVerse.Chapter, srcVerse.Verse, _srcVersification); var trgVerseRef = new VerseRef(Id, trgVerse.Chapter, trgVerse.Verse, _trgVersification); - int compare = VerseRefComparer.Compare(srcVerseRef, trgVerseRef); + int compare = VerseRefComparer.Default.Compare(srcVerseRef, trgVerseRef); if (compare < 0) { srcCompleted = !srcEnumerator.MoveNext(); diff --git a/src/SIL.Machine/Corpora/VerseRefComparer.cs b/src/SIL.Machine/Corpora/VerseRefComparer.cs index c284844b..1d49f422 100644 --- a/src/SIL.Machine/Corpora/VerseRefComparer.cs +++ b/src/SIL.Machine/Corpora/VerseRefComparer.cs @@ -8,6 +8,9 @@ namespace SIL.Machine.Corpora { public class VerseRefComparer : IComparer { + public static IComparer Default { get; } = new VerseRefComparer(compareSegments: true); + public static IComparer IgnoreSegments { get; } = new VerseRefComparer(compareSegments: false); + private readonly bool _compareSegments; public VerseRefComparer(bool compareSegments = true) diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs index 9416d25e..5fafaeec 100644 --- a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs @@ -43,7 +43,7 @@ public async Task RunAsync_TrainOnAll() public async Task RunAsync_TrainOnTextIds() { using TestEnvironment env = new(); - Corpus corpus1 = env.DefaultTextFileCorpus with { TrainOnTextIds = new HashSet { "textId1" } }; + Corpus corpus1 = env.DefaultTextFileCorpus with { TrainOnTextIds = ["textId1"] }; await env.RunBuildJobAsync(corpus1); @@ -72,7 +72,7 @@ public async Task RunAsync_PretranslateAll() public async Task 
RunAsync_PretranslateTextIds() { using TestEnvironment env = new(); - Corpus corpus1 = env.DefaultTextFileCorpus with { PretranslateTextIds = new HashSet { "textId1" } }; + Corpus corpus1 = env.DefaultTextFileCorpus with { PretranslateTextIds = ["textId1"] }; await env.RunBuildJobAsync(corpus1); @@ -178,7 +178,7 @@ public async Task RunAsync_MixedSource_Paratext() Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(8)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(12)); } [Test] @@ -247,8 +247,8 @@ public TestEnvironment() TargetLanguage = "en", PretranslateAll = false, TrainOnAll = false, - PretranslateTextIds = new HashSet(), - TrainOnTextIds = new HashSet(), + PretranslateTextIds = [], + TrainOnTextIds = [], SourceFiles = [TextFile("source1")], TargetFiles = [TextFile("target1")] }; @@ -260,8 +260,8 @@ public TestEnvironment() TargetLanguage = "en", PretranslateAll = false, TrainOnAll = false, - PretranslateTextIds = new HashSet(), - TrainOnTextIds = new HashSet(), + PretranslateTextIds = [], + TrainOnTextIds = [], SourceFiles = [TextFile("source1"), TextFile("source2")], TargetFiles = [TextFile("target1")] }; @@ -273,8 +273,8 @@ public TestEnvironment() TargetLanguage = "en", PretranslateAll = false, TrainOnAll = false, - PretranslateTextIds = new HashSet(), - TrainOnTextIds = new HashSet(), + PretranslateTextIds = [], + TrainOnTextIds = [], SourceFiles = [ParatextFile("pt-source1")], TargetFiles = [ParatextFile("pt-target1")] }; @@ -286,8 +286,8 @@ public TestEnvironment() TargetLanguage = "en", PretranslateAll = false, TrainOnAll = false, - PretranslateTextIds = new HashSet(), - TrainOnTextIds = new HashSet(), + PretranslateTextIds = [], + TrainOnTextIds = [], SourceFiles = [ParatextFile("pt-source1"), ParatextFile("pt-source2")], TargetFiles = [ParatextFile("pt-target1")] }; diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs 
b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index cf4c16b9..29b645b9 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -9,13 +9,12 @@ public class CorporaExtensionsTests [Test] public void ExtractScripture() { - var corpus = new ParatextTextCorpus(CorporaTestHelpers.UsfmTestProjectPath); + var corpus = new ParatextTextCorpus(CorporaTestHelpers.UsfmTestProjectPath, includeAllText: true); var lines = corpus.ExtractScripture().ToList(); - Assert.That(lines.Count, Is.EqualTo(41899)); + Assert.That(lines, Has.Count.EqualTo(41899)); (string text, VerseRef origRef, VerseRef? corpusRef) = lines[0]; - Assert.That(text, Is.EqualTo("")); Assert.That(origRef, Is.EqualTo(new VerseRef("GEN 1:1", ScrVers.Original))); Assert.That(corpusRef, Is.EqualTo(new VerseRef("GEN 1:1", corpus.Versification))); diff --git a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs index 551dda3c..d40529c6 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs @@ -892,37 +892,45 @@ public void GetGetRows_SameVerseRefOneToMany() "MAT", new[] { - TextRow("MAT", new VerseRef("MAT 1:1", ScrVers.Original), "source chapter one, verse one ."), - TextRow("MAT", new VerseRef("MAT 1:2", ScrVers.Original), "source chapter one, verse two ."), - TextRow("MAT", new VerseRef("MAT 1:3", ScrVers.Original), "source chapter one, verse three .") + TextRow("MAT", ScriptureRef.Parse("MAT 1:1", ScrVers.Original), "source chapter one, verse one ."), + TextRow("MAT", ScriptureRef.Parse("MAT 1:2", ScrVers.Original), "source chapter one, verse two ."), + TextRow("MAT", ScriptureRef.Parse("MAT 1:3", ScrVers.Original), "source chapter one, verse three .") } ) - ); + ) + { + Versification = ScrVers.Original + }; var targetCorpus = new DictionaryTextCorpus( new MemoryText( "MAT", 
new[] { - TextRow("MAT", new VerseRef("MAT 1:1", versification), "target chapter one, verse one ."), + TextRow("MAT", ScriptureRef.Parse("MAT 1:1", versification), "target chapter one, verse one ."), TextRow( "MAT", - new VerseRef("MAT 1:2", versification), + ScriptureRef.Parse("MAT 1:2", versification), "target chapter one, verse two . target chapter one, verse three .", TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart ), - TextRow("MAT", new VerseRef("MAT 1:3", versification), flags: TextRowFlags.InRange), - TextRow("MAT", new VerseRef("MAT 1:4", versification), "target chapter one, verse four .") + TextRow("MAT", ScriptureRef.Parse("MAT 1:3", versification), flags: TextRowFlags.InRange), + TextRow("MAT", ScriptureRef.Parse("MAT 1:4", versification), "target chapter one, verse four .") } ) - ); + ) + { + Versification = versification + }; var parallelCorpus = new ParallelTextCorpus(sourceCorpus, targetCorpus); ParallelTextRow[] rows = parallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(3)); - Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { new VerseRef("MAT 1:2", ScrVers.Original) })); + Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { ScriptureRef.Parse("MAT 1:2", ScrVers.Original) })); Assert.That( rows[1].TargetRefs, - Is.EqualTo(new[] { new VerseRef("MAT 1:2", versification), new VerseRef("MAT 1:3", versification) }) + Is.EqualTo( + new[] { ScriptureRef.Parse("MAT 1:2", versification), ScriptureRef.Parse("MAT 1:3", versification) } + ) ); Assert.That(rows[1].SourceSegment, Is.EqualTo("source chapter one, verse two .".Split())); Assert.That( @@ -947,45 +955,57 @@ public void GetGetRows_VerseRefOutOfOrder() "MAT", new[] { - TextRow("MAT", new VerseRef("MAT 1:1", ScrVers.Original), "source chapter one, verse one ."), - TextRow("MAT", new VerseRef("MAT 1:2", ScrVers.Original), "source chapter one, verse two ."), - TextRow("MAT", new VerseRef("MAT 1:3", ScrVers.Original), "source chapter one, verse three ."), - 
TextRow("MAT", new VerseRef("MAT 1:4", ScrVers.Original), "source chapter one, verse four .") + TextRow("MAT", ScriptureRef.Parse("MAT 1:1", ScrVers.Original), "source chapter one, verse one ."), + TextRow("MAT", ScriptureRef.Parse("MAT 1:2", ScrVers.Original), "source chapter one, verse two ."), + TextRow( + "MAT", + ScriptureRef.Parse("MAT 1:3", ScrVers.Original), + "source chapter one, verse three ." + ), + TextRow("MAT", ScriptureRef.Parse("MAT 1:4", ScrVers.Original), "source chapter one, verse four .") } ) - ); + ) + { + Versification = ScrVers.Original + }; var targetCorpus = new DictionaryTextCorpus( new MemoryText( "MAT", new[] { - TextRow("MAT", new VerseRef("MAT 1:1", versification), "target chapter one, verse one ."), - TextRow("MAT", new VerseRef("MAT 1:2", versification), "target chapter one, verse two ."), - TextRow("MAT", new VerseRef("MAT 1:3", versification), "target chapter one, verse three ."), - TextRow("MAT", new VerseRef("MAT 1:4", versification), "target chapter one, verse four ."), - TextRow("MAT", new VerseRef("MAT 1:5", versification), "target chapter one, verse five .") + TextRow("MAT", ScriptureRef.Parse("MAT 1:1", versification), "target chapter one, verse one ."), + TextRow("MAT", ScriptureRef.Parse("MAT 1:2", versification), "target chapter one, verse two ."), + TextRow("MAT", ScriptureRef.Parse("MAT 1:3", versification), "target chapter one, verse three ."), + TextRow("MAT", ScriptureRef.Parse("MAT 1:4", versification), "target chapter one, verse four ."), + TextRow("MAT", ScriptureRef.Parse("MAT 1:5", versification), "target chapter one, verse five .") } ) - ); + ) + { + Versification = versification + }; var parallelCorpus = new ParallelTextCorpus(sourceCorpus, targetCorpus); ParallelTextRow[] rows = parallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(4)); - Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { new VerseRef("MAT 1:2", ScrVers.Original) })); - Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { new 
VerseRef("MAT 1:3", versification) })); + Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { ScriptureRef.Parse("MAT 1:2", ScrVers.Original) })); + Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { ScriptureRef.Parse("MAT 1:3", versification) })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source chapter one, verse two .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target chapter one, verse three .".Split())); - Assert.That(rows[2].SourceRefs, Is.EqualTo(new[] { new VerseRef("MAT 1:3", ScrVers.Original) })); - Assert.That(rows[2].TargetRefs, Is.EqualTo(new[] { new VerseRef("MAT 1:2", versification) })); + Assert.That(rows[2].SourceRefs, Is.EqualTo(new[] { ScriptureRef.Parse("MAT 1:3", ScrVers.Original) })); + Assert.That(rows[2].TargetRefs, Is.EqualTo(new[] { ScriptureRef.Parse("MAT 1:2", versification) })); Assert.That(rows[2].SourceSegment, Is.EqualTo("source chapter one, verse three .".Split())); Assert.That(rows[2].TargetSegment, Is.EqualTo("target chapter one, verse two .".Split())); - Assert.That(rows[3].SourceRefs, Is.EqualTo(new[] { new VerseRef("MAT 1:4", ScrVers.Original) })); + Assert.That(rows[3].SourceRefs, Is.EqualTo(new[] { ScriptureRef.Parse("MAT 1:4", ScrVers.Original) })); Assert.That( rows[3].TargetRefs, - Is.EqualTo(new[] { new VerseRef("MAT 1:4", versification), new VerseRef("MAT 1:5", versification) }) + Is.EqualTo( + new[] { ScriptureRef.Parse("MAT 1:4", versification), ScriptureRef.Parse("MAT 1:5", versification) } + ) ); Assert.That(rows[3].SourceSegment, Is.EqualTo("source chapter one, verse four .".Split())); Assert.That( diff --git a/tests/SIL.Machine.Tests/Corpora/ScriptureRefTests.cs b/tests/SIL.Machine.Tests/Corpora/ScriptureRefTests.cs new file mode 100644 index 00000000..5102e3cb --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/ScriptureRefTests.cs @@ -0,0 +1,47 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class ScriptureRefTests +{ + [TestCase("MAT 1:1", "MAT 1:2", 
ExpectedResult = -1, Description = "VerseLessThan")] + [TestCase("MAT 1:1", "MAT 1:1", ExpectedResult = 0, Description = "VerseEqualTo")] + [TestCase("MAT 1:2", "MAT 1:1", ExpectedResult = 1, Description = "VerseGreaterThan")] + [TestCase("MAT 1:0/1:p", "MAT 1:0/2:p", ExpectedResult = -1, Description = "NonVerseLessThan")] + [TestCase("MAT 1:0/1:p", "MAT 1:0/1:p", ExpectedResult = 0, Description = "NonVerseEqualTo")] + [TestCase("MAT 1:0/2:p", "MAT 1:0/1:p", ExpectedResult = 1, Description = "NonVerseGreaterThan")] + [TestCase("MAT 1:0/1:esb", "MAT 1:0/1:esb/1:p", ExpectedResult = -1, Description = "NonVerseParentChild")] + public int CompareTo_Strict(string ref1Str, string ref2Str) + { + var ref1 = ScriptureRef.Parse(ref1Str); + var ref2 = ScriptureRef.Parse(ref2Str); + + int result = ref1.CompareTo(ref2); + + if (result < 0) + result = -1; + else if (result > 0) + result = 1; + return result; + } + + [TestCase("MAT 1:1", "MAT 1:2", ExpectedResult = -1, Description = "VerseLessThan")] + [TestCase("MAT 1:1", "MAT 1:1", ExpectedResult = 0, Description = "VerseEqualTo")] + [TestCase("MAT 1:2", "MAT 1:1", ExpectedResult = 1, Description = "VerseGreaterThan")] + [TestCase("MAT 1:0/1:p", "MAT 1:0/2:p", ExpectedResult = 0, Description = "NonVerseSameMarkerDifferentPosition")] + [TestCase("MAT 1:0/2:esb", "MAT 1:0/1:esb/1:p", ExpectedResult = -1, Description = "NonVerseParentChild")] + public int CompareTo_Relaxed(string ref1Str, string ref2Str) + { + var ref1 = ScriptureRef.Parse(ref1Str); + var ref2 = ScriptureRef.Parse(ref2Str); + + int result = ref1.CompareTo(ref2, strict: false); + + if (result < 0) + result = -1; + else if (result > 0) + result = 1; + return result; + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM index 2c77542e..af634baf 100644 --- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM +++ 
b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM @@ -1,7 +1,7 @@ \id MAT - Test \h Matthew \mt Matthew -\ip An introduction to Matthew +\ip An introduction to Matthew\fe + \ft This is an endnote.\fe* \c 1 \s Chapter One \v 1 Chapter \pn one\+pro WON\+pro*\pn*, verse one.\f + \fr 1:1: \ft This is a footnote.\f* @@ -15,13 +15,15 @@ \v 5 Chapter one, \li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five. \c 2 -\s1 Chapter Two +\tr \tc1 Row one, column one. \tc2 Row one, column two. +\tr \tc1 Row two, column one. \tc2 Row two, column two. +\s1 Chapter \it Two \it* \p \v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. \v 2-3 Chapter two, // verse \fm ∆\fm*two. \esb \ms This is a sidebar -\p Here is some sidebar content. +\p Here is some sidebar // content. \esbe \v 3-4a Chapter two, verse \w three|lemma\w*. \v 4b Chapter two, verse four. diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs index 3dca83c8..e20b2731 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs @@ -1,6 +1,5 @@ using System.Text; using NUnit.Framework; -using SIL.Scripture; namespace SIL.Machine.Corpora; @@ -14,49 +13,118 @@ public void GetRows_NonEmptyText() IText text = corpus["MAT"]; TextRow[] rows = text.GetRows().ToArray(); - Assert.That(rows.Length, Is.EqualTo(19)); + Assert.That(rows, Has.Length.EqualTo(19)); - Assert.That(rows[0].Ref, Is.EqualTo(new VerseRef("MAT 1:1", corpus.Versification))); + Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1", corpus.Versification))); Assert.That(rows[0].Text, Is.EqualTo("Chapter one, verse one.")); - Assert.That(rows[1].Ref, Is.EqualTo(new VerseRef("MAT 1:2", corpus.Versification))); + Assert.That(rows[1].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2", corpus.Versification))); Assert.That(rows[1].Text, Is.EqualTo("Chapter one, verse two.")); - 
Assert.That(rows[4].Ref, Is.EqualTo(new VerseRef("MAT 1:5", corpus.Versification))); + Assert.That(rows[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:5", corpus.Versification))); Assert.That(rows[4].Text, Is.EqualTo("Chapter one, verse five.")); - Assert.That(rows[5].Ref, Is.EqualTo(new VerseRef("MAT 2:1", corpus.Versification))); + Assert.That(rows[5].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1", corpus.Versification))); Assert.That(rows[5].Text, Is.EqualTo("Chapter two, verse one.")); - Assert.That(rows[6].Ref, Is.EqualTo(new VerseRef("MAT 2:2", corpus.Versification))); + Assert.That(rows[6].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:2", corpus.Versification))); Assert.That(rows[6].Text, Is.EqualTo("Chapter two, verse two. Chapter two, verse three.")); Assert.That(rows[6].IsInRange, Is.True); Assert.That(rows[6].IsRangeStart, Is.True); - Assert.That(rows[7].Ref, Is.EqualTo(new VerseRef("MAT 2:3", corpus.Versification))); + Assert.That(rows[7].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3", corpus.Versification))); Assert.That(rows[7].Text, Is.Empty); Assert.That(rows[7].IsInRange, Is.True); Assert.That(rows[7].IsRangeStart, Is.False); - Assert.That(rows[8].Ref, Is.EqualTo(new VerseRef("MAT 2:4a", corpus.Versification))); + Assert.That(rows[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:4a", corpus.Versification))); Assert.That(rows[8].Text, Is.Empty); Assert.That(rows[8].IsInRange, Is.True); Assert.That(rows[7].IsRangeStart, Is.False); - Assert.That(rows[9].Ref, Is.EqualTo(new VerseRef("MAT 2:4b", corpus.Versification))); + Assert.That(rows[9].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:4b", corpus.Versification))); Assert.That(rows[9].Text, Is.EqualTo("Chapter two, verse four.")); - Assert.That(rows[10].Ref, Is.EqualTo(new VerseRef("MAT 2:5", corpus.Versification))); + Assert.That(rows[10].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:5", corpus.Versification))); Assert.That(rows[10].Text, Is.EqualTo("Chapter two, verse five.")); - Assert.That(rows[11].Ref, Is.EqualTo(new 
VerseRef("MAT 2:6", corpus.Versification))); + Assert.That(rows[11].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:6", corpus.Versification))); Assert.That(rows[11].Text, Is.EqualTo("Chapter two, verse six.")); - Assert.That(rows[15].Ref, Is.EqualTo(new VerseRef("MAT 2:9", corpus.Versification))); + Assert.That(rows[15].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:9", corpus.Versification))); Assert.That(rows[15].Text, Is.EqualTo("Chapter 2 verse 9")); - Assert.That(rows[16].Ref, Is.EqualTo(new VerseRef("MAT 2:10", corpus.Versification))); + Assert.That(rows[16].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:10", corpus.Versification))); Assert.That(rows[16].Text, Is.EqualTo("Chapter 2 verse 10")); + + Assert.That(rows[17].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:11", corpus.Versification))); + Assert.That(rows[17].Text, Is.Empty); + } + + [Test] + public void GetRows_NonEmptyText_AllText() + { + var corpus = new UsfmFileTextCorpus( + "usfm.sty", + Encoding.UTF8, + CorporaTestHelpers.UsfmTestProjectPath, + includeAllText: true + ); + + IText text = corpus["MAT"]; + TextRow[] rows = text.GetRows().ToArray(); + Assert.That(rows, Has.Length.EqualTo(36)); + + Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/1:h", corpus.Versification))); + Assert.That(rows[0].Text, Is.EqualTo("Matthew")); + + Assert.That(rows[1].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/2:mt", corpus.Versification))); + Assert.That(rows[1].Text, Is.EqualTo("Matthew")); + + Assert.That(rows[2].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/3:ip", corpus.Versification))); + Assert.That(rows[2].Text, Is.EqualTo("An introduction to Matthew")); + + Assert.That(rows[3].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/3:ip/1:fe", corpus.Versification))); + Assert.That(rows[3].Text, Is.EqualTo("This is an endnote.")); + + Assert.That(rows[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/4:s", corpus.Versification))); + Assert.That(rows[4].Text, Is.EqualTo("Chapter One")); + + Assert.That(rows[6].Ref, 
Is.EqualTo(ScriptureRef.Parse("MAT 1:1/1:f", corpus.Versification))); + Assert.That(rows[6].Text, Is.EqualTo("1:1: This is a footnote.")); + + Assert.That(rows[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2/1:f", corpus.Versification))); + Assert.That(rows[8].Text, Is.EqualTo("1:2: This is a footnote.")); + + Assert.That(rows[12].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/1:tc1", corpus.Versification))); + Assert.That(rows[12].Text, Is.EqualTo("Row one, column one.")); + + Assert.That(rows[13].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/2:tc2", corpus.Versification))); + Assert.That(rows[13].Text, Is.EqualTo("Row one, column two.")); + + Assert.That(rows[14].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/2:tr/1:tc1", corpus.Versification))); + Assert.That(rows[14].Text, Is.EqualTo("Row two, column one.")); + + Assert.That(rows[15].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/2:tr/2:tc2", corpus.Versification))); + Assert.That(rows[15].Text, Is.EqualTo("Row two, column two.")); + + Assert.That(rows[16].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/3:s1", corpus.Versification))); + Assert.That(rows[16].Text, Is.EqualTo("Chapter Two")); + + Assert.That(rows[18].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1/1:f", corpus.Versification))); + Assert.That(rows[18].Text, Is.EqualTo("2:1: This is a footnote.")); + + Assert.That(rows[21].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/1:ms", corpus.Versification))); + Assert.That(rows[21].Text, Is.EqualTo("This is a sidebar")); + + Assert.That(rows[22].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/2:p", corpus.Versification))); + Assert.That(rows[22].Text, Is.EqualTo("Here is some sidebar content.")); + + Assert.That(rows[28].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:7a/1:s", corpus.Versification))); + Assert.That(rows[28].Text, Is.EqualTo("Section header")); + + Assert.That(rows[35].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:12/1:restore", corpus.Versification))); + Assert.That(rows[35].Text, Is.EqualTo("restore 
information")); } [Test] @@ -66,13 +134,13 @@ public void GetRows_SentenceStart() IText text = corpus["MAT"]; TextRow[] rows = text.GetRows().ToArray(); - Assert.That(rows.Length, Is.EqualTo(19)); + Assert.That(rows, Has.Length.EqualTo(19)); - Assert.That(rows[3].Ref, Is.EqualTo(new VerseRef("MAT 1:4", corpus.Versification))); + Assert.That(rows[3].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:4", corpus.Versification))); Assert.That(rows[3].Text, Is.EqualTo("Chapter one, verse four,")); Assert.That(rows[3].IsSentenceStart, Is.True); - Assert.That(rows[4].Ref, Is.EqualTo(new VerseRef("MAT 1:5", corpus.Versification))); + Assert.That(rows[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:5", corpus.Versification))); Assert.That(rows[4].Text, Is.EqualTo("Chapter one, verse five.")); Assert.That(rows[4].IsSentenceStart, Is.False); } @@ -99,21 +167,21 @@ public void GetRows_IncludeMarkers() IText text = corpus["MAT"]; TextRow[] rows = text.GetRows().ToArray(); - Assert.That(rows.Length, Is.EqualTo(19)); + Assert.That(rows, Has.Length.EqualTo(19)); - Assert.That(rows[0].Ref, Is.EqualTo(new VerseRef("MAT 1:1", corpus.Versification))); + Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1", corpus.Versification))); Assert.That( rows[0].Text, Is.EqualTo("Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote.\\f*") ); - Assert.That(rows[1].Ref, Is.EqualTo(new VerseRef("MAT 1:2", corpus.Versification))); + Assert.That(rows[1].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2", corpus.Versification))); Assert.That( rows[1].Text, Is.EqualTo("\\bd C\\bd*hapter one, \\li2 verse\\f + \\fr 1:2: \\ft This is a footnote.\\f* two.") ); - Assert.That(rows[4].Ref, Is.EqualTo(new VerseRef("MAT 1:5", corpus.Versification))); + Assert.That(rows[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:5", corpus.Versification))); Assert.That( rows[4].Text, Is.EqualTo( @@ -121,43 +189,94 @@ public void GetRows_IncludeMarkers() ) ); - Assert.That(rows[5].Ref, 
Is.EqualTo(new VerseRef("MAT 2:1", corpus.Versification))); + Assert.That(rows[5].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1", corpus.Versification))); Assert.That( rows[5].Text, Is.EqualTo("Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one.") ); - Assert.That(rows[6].Ref, Is.EqualTo(new VerseRef("MAT 2:2", corpus.Versification))); + Assert.That(rows[6].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:2", corpus.Versification))); Assert.That( rows[6].Text, - Is.EqualTo("Chapter two, verse \\fm ∆\\fm*two. Chapter two, verse \\w three|lemma\\w*.") + Is.EqualTo("Chapter two, // verse \\fm ∆\\fm*two. Chapter two, verse \\w three|lemma\\w*.") ); Assert.That(rows[6].IsInRange, Is.True); Assert.That(rows[6].IsRangeStart, Is.True); - Assert.That(rows[7].Ref, Is.EqualTo(new VerseRef("MAT 2:3", corpus.Versification))); + Assert.That(rows[7].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3", corpus.Versification))); Assert.That(rows[7].Text, Is.Empty); Assert.That(rows[7].IsInRange, Is.True); Assert.That(rows[7].IsRangeStart, Is.False); - Assert.That(rows[8].Ref, Is.EqualTo(new VerseRef("MAT 2:4a", corpus.Versification))); + Assert.That(rows[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:4a", corpus.Versification))); Assert.That(rows[8].Text, Is.Empty); Assert.That(rows[8].IsInRange, Is.True); Assert.That(rows[8].IsRangeStart, Is.False); - Assert.That(rows[9].Ref, Is.EqualTo(new VerseRef("MAT 2:4b", corpus.Versification))); + Assert.That(rows[9].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:4b", corpus.Versification))); Assert.That(rows[9].Text, Is.EqualTo("Chapter two, verse four.")); - Assert.That(rows[10].Ref, Is.EqualTo(new VerseRef("MAT 2:5", corpus.Versification))); + Assert.That(rows[10].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:5", corpus.Versification))); Assert.That(rows[10].Text, Is.EqualTo("Chapter two, verse five \\rq (MAT 3:1)\\rq*.")); - Assert.That(rows[11].Ref, Is.EqualTo(new VerseRef("MAT 2:6", corpus.Versification))); + Assert.That(rows[11].Ref, 
Is.EqualTo(ScriptureRef.Parse("MAT 2:6", corpus.Versification))); Assert.That(rows[11].Text, Is.EqualTo("Chapter two, verse \\w six|strong=\"12345\" \\w*.")); - Assert.That(rows[15].Ref, Is.EqualTo(new VerseRef("MAT 2:9", corpus.Versification))); + Assert.That(rows[15].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:9", corpus.Versification))); Assert.That(rows[15].Text, Is.EqualTo("Chapter\\tcr2 2\\tc3 verse\\tcr4 9")); - Assert.That(rows[16].Ref, Is.EqualTo(new VerseRef("MAT 2:10", corpus.Versification))); + Assert.That(rows[16].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:10", corpus.Versification))); Assert.That(rows[16].Text, Is.EqualTo("\\tc3-4 Chapter 2 verse 10")); } + + [Test] + public void GetRows_IncludeMarkers_AllText() + { + var corpus = new UsfmFileTextCorpus( + "usfm.sty", + Encoding.UTF8, + CorporaTestHelpers.UsfmTestProjectPath, + includeMarkers: true, + includeAllText: true + ); + + IText text = corpus["MAT"]; + TextRow[] rows = text.GetRows().ToArray(); + Assert.That(rows, Has.Length.EqualTo(32)); + + Assert.That(rows[2].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/3:ip", corpus.Versification))); + Assert.That(rows[2].Text, Is.EqualTo("An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*")); + + Assert.That(rows[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1", corpus.Versification))); + Assert.That( + rows[4].Text, + Is.EqualTo("Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote.\\f*") + ); + + Assert.That(rows[5].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2", corpus.Versification))); + Assert.That( + rows[5].Text, + Is.EqualTo("\\bd C\\bd*hapter one, \\li2 verse\\f + \\fr 1:2: \\ft This is a footnote.\\f* two.") + ); + + Assert.That(rows[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:5", corpus.Versification))); + Assert.That( + rows[8].Text, + Is.EqualTo( + "Chapter one, \\li2 verse \\fig Figure 1|src=\"image1.png\" size=\"col\" ref=\"1:5\"\\fig* five." 
+ ) + ); + + Assert.That(rows[13].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/3:s1", corpus.Versification))); + Assert.That(rows[13].Text, Is.EqualTo("Chapter \\it Two \\it*")); + + Assert.That(rows[14].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1", corpus.Versification))); + Assert.That( + rows[14].Text, + Is.EqualTo("Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one.") + ); + + Assert.That(rows[18].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/2:p", corpus.Versification))); + Assert.That(rows[18].Text, Is.EqualTo("Here is some sidebar // content.")); + } } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs new file mode 100644 index 00000000..730ef3a9 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs @@ -0,0 +1,356 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class UsfmTextUpdaterTests +{ + [Test] + public void GetUsfm_Verse_CharStyle() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 1:1"), "First verse of the first chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\id MAT - Test\r\n")); + Assert.That(target, Contains.Substring("\\v 1 First verse of the first chapter.\r\n")); + } + + [Test] + public void GetUsfm_IdText() + { + string target = UpdateUsfm(idText: "- Updated"); + Assert.That(target, Contains.Substring("\\id MAT - Updated\r\n")); + } + + [Test] + public void GetUsfm_StripAllText() + { + string target = UpdateUsfm(stripAllText: true); + Assert.That(target, Contains.Substring("\\id MAT\r\n")); + Assert.That(target, Contains.Substring("\\v 1\r\n")); + Assert.That(target, Contains.Substring("\\s\r\n")); + Assert.That(target, Contains.Substring("\\ms\r\n")); + } + + [Test] + public void GetUsfm_Verse_SkipNote() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:1"), "First verse of the second 
chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 1 First verse of the second chapter.\r\n")); + } + + [Test] + public void GetUsfm_Verse_ReplaceNote() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:1"), "First verse of the second chapter."), + (ScrRef("MAT 2:1/1:f"), "This is a new footnote.") + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring("\\v 1 First verse of the second chapter. \\f + \\ft This is a new footnote.\\f*\r\n") + ); + } + + [Test] + public void GetUsfm_Verse_RowVerseSegment() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:1a"), "First verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 1 First verse of the second chapter.\r\n")); + } + + [Test] + public void GetUsfm_Verse_UsfmVerseSegment() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:7"), "Seventh verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 7a Seventh verse of the second chapter.\r\n")); + } + + [Test] + public void GetUsfm_Verse_MultipleParas() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 1:2"), "Second verse of the first chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 2 Second verse of the first chapter.\r\n\\li2\r\n")); + } + + [Test] + public void GetUsfm_Verse_Table() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:9"), "Ninth verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 9 Ninth verse of the second chapter. 
\\tcr2 \\tc3 \\tcr4\r\n")); + } + + [Test] + public void GetUsfm_Verse_RangeSingleRowMultipleVerses() + { + var rows = new List<(IReadOnlyList, string)> + { + ( + ScrRef("MAT 2:11", "MAT 2:12"), + "Eleventh verse of the second chapter. Twelfth verse of the second chapter." + ) + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring( + "\\v 11-12 Eleventh verse of the second chapter. Twelfth verse of the second chapter.\r\n" + ) + ); + } + + [Test] + public void GetUsfm_Verse_RangeSingleRowSingleVerse() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:11"), "Eleventh verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 11-12 Eleventh verse of the second chapter.\r\n")); + } + + [Test] + public void GetUsfm_Verse_RangeMultipleRowsSingleVerse() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:11"), "Eleventh verse of the second chapter."), + (ScrRef("MAT 2:12"), "Twelfth verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring( + "\\v 11-12 Eleventh verse of the second chapter. Twelfth verse of the second chapter.\r\n" + ) + ); + } + + [Test] + public void GetUsfm_Verse_OptBreak() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:2"), "Second verse of the second chapter."), + (ScrRef("MAT 2:3"), "Third verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring("\\v 2-3 Second verse of the second chapter. Third verse of the second chapter.\r\n") + ); + } + + [Test] + public void GetUsfm_Verse_Milestone() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:10"), "Tenth verse of the second chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring("\\v 10 Tenth verse of the second chapter. 
\\tc3-4 \\qt-s |Jesus\\*\\qt-e\\*\r\n") + ); + } + + [Test] + public void GetUsfm_Verse_Unmatched() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 1:3"), "Third verse of the first chapter.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\v 3 Third verse of the first chapter.\r\n")); + } + + [Test] + public void GetUsfm_NonVerse_CharStyle() + { + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 2:0/3:s1"), "The second chapter.") }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\s1 The second chapter.\r\n")); + } + + [Test] + public void GetUsfm_NonVerse_Paragraph() + { + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:0/4:s"), "The first chapter.") }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\s The first chapter.\r\n")); + } + + [Test] + public void GetUsfm_NonVerse_Relaxed() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 1:0/s"), "The first chapter."), + (ScrRef("MAT 1:1"), "First verse of the first chapter."), + (ScrRef("MAT 2:0/tr/tc1"), "The first cell of the table."), + (ScrRef("MAT 2:0/tr/tc2"), "The second cell of the table."), + (ScrRef("MAT 2:0/tr/tc1"), "The third cell of the table.") + }; + + string target = UpdateUsfm(rows, strictComparison: false); + Assert.That(target, Contains.Substring("\\s The first chapter.\r\n")); + Assert.That(target, Contains.Substring("\\v 1 First verse of the first chapter.\r\n")); + Assert.That( + target, + Contains.Substring("\\tr \\tc1 The first cell of the table. \\tc2 The second cell of the table.\r\n") + ); + Assert.That( + target, + Contains.Substring("\\tr \\tc1 The third cell of the table. 
\\tc2 Row two, column two.\r\n") + ); + } + + [Test] + public void GetUsfm_NonVerse_Sidebar() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:3/1:esb/1:ms"), "The first paragraph of the sidebar.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\ms The first paragraph of the sidebar.\r\n")); + } + + [Test] + public void GetUsfm_NonVerse_Table() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:0/1:tr/1:tc1"), "The first cell of the table."), + (ScrRef("MAT 2:0/2:tr/1:tc1"), "The third cell of the table.") + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring("\\tr \\tc1 The first cell of the table. \\tc2 Row one, column two.\r\n") + ); + Assert.That( + target, + Contains.Substring("\\tr \\tc1 The third cell of the table. \\tc2 Row two, column two.\r\n") + ); + } + + [Test] + public void GetUsfm_NonVerse_OptBreak() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:3/1:esb/2:p"), "The second paragraph of the sidebar.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\p The second paragraph of the sidebar.\r\n")); + } + + [Test] + public void GetUsfm_NonVerse_Milestone() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 2:7a/1:s"), "A new section header.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\s A new section header. 
\\ts-s\\*\r\n")); + } + + [Test] + public void GetUsfm_NonVerse_SkipNote() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") + }; + + string target = UpdateUsfm(rows); + Assert.That(target, Contains.Substring("\\ip The introductory paragraph.\r\n")); + } + + [Test] + public void GetUsfm_NonVerse_ReplaceNote() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 1:0/3:ip"), "The introductory paragraph."), + (ScrRef("MAT 1:0/3:ip/1:fe"), "This is a new endnote.") + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring("\\ip The introductory paragraph. \\fe + \\ft This is a new endnote.\\fe*\r\n") + ); + } + + private static ScriptureRef[] ScrRef(params string[] refs) + { + return refs.Select(r => ScriptureRef.Parse(r)).ToArray(); + } + + private static string UpdateUsfm( + IReadOnlyList<(IReadOnlyList, string)>? rows = null, + string? idText = null, + bool stripAllText = false, + bool strictComparison = true + ) + { + string source = ReadUsfm(); + var updater = new UsfmTextUpdater(rows, idText, stripAllText, strictComparison); + UsfmParser.Parse(source, updater); + return updater.GetUsfm(); + } + + private static string ReadUsfm() + { + return File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM")); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs index b7cc5fd4..a45869cd 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs @@ -11,22 +11,22 @@ public void Tokenize() string usfm = ReadUsfm(); var tokenizer = new UsfmTokenizer(); IReadOnlyList tokens = tokenizer.Tokenize(usfm); - Assert.That(tokens, Has.Count.EqualTo(151)); + Assert.That(tokens, Has.Count.EqualTo(170)); Assert.That(tokens[0].Type, Is.EqualTo(UsfmTokenType.Book)); Assert.That(tokens[0].Marker, 
Is.EqualTo("id")); Assert.That(tokens[0].Data, Is.EqualTo("MAT")); - Assert.That(tokens[10].Type, Is.EqualTo(UsfmTokenType.Text)); - Assert.That(tokens[10].Text, Is.EqualTo("Chapter One ")); + Assert.That(tokens[15].Type, Is.EqualTo(UsfmTokenType.Text)); + Assert.That(tokens[15].Text, Is.EqualTo("Chapter One ")); - Assert.That(tokens[11].Type, Is.EqualTo(UsfmTokenType.Verse)); - Assert.That(tokens[11].Marker, Is.EqualTo("v")); - Assert.That(tokens[11].Data, Is.EqualTo("1")); + Assert.That(tokens[16].Type, Is.EqualTo(UsfmTokenType.Verse)); + Assert.That(tokens[16].Marker, Is.EqualTo("v")); + Assert.That(tokens[16].Data, Is.EqualTo("1")); - Assert.That(tokens[20].Type, Is.EqualTo(UsfmTokenType.Note)); - Assert.That(tokens[20].Marker, Is.EqualTo("f")); - Assert.That(tokens[20].Data, Is.EqualTo("+")); + Assert.That(tokens[25].Type, Is.EqualTo(UsfmTokenType.Note)); + Assert.That(tokens[25].Marker, Is.EqualTo("f")); + Assert.That(tokens[25].Data, Is.EqualTo("+")); } [Test] diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmVerseTextUpdaterTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmVerseTextUpdaterTests.cs deleted file mode 100644 index 10d4e2c1..00000000 --- a/tests/SIL.Machine.Tests/Corpora/UsfmVerseTextUpdaterTests.cs +++ /dev/null @@ -1,207 +0,0 @@ -using NUnit.Framework; -using SIL.Scripture; - -namespace SIL.Machine.Corpora; - -[TestFixture] -public class UsfmVerseTextUpdaterTests -{ - [Test] - public void GetUsfm_CharStyle() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 1:1", ScrVers.English) }, "First verse of the first chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That(target, Contains.Substring("\\id MAT - Test\r\n")); - Assert.That(target, Contains.Substring("\\v 1 First verse of the first chapter.\r\n")); - } - - [Test] - public void GetUsfm_IdText() - { - string target = UpdateUsfm(idText: "- Updated"); - Assert.That(target, Contains.Substring("\\id MAT - Updated\r\n")); - } - - [Test] - public void 
GetUsfm_StripAllText() - { - string target = UpdateUsfm(stripAllText: true); - Assert.That(target, Contains.Substring("\\id MAT\r\n")); - Assert.That(target, Contains.Substring("\\v 1\r\n")); - Assert.That(target, Contains.Substring("\\s\r\n")); - } - - [Test] - public void GetUsfm_Notes() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 2:1", ScrVers.English) }, "First verse of the second chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That(target, Contains.Substring("\\v 1 First verse of the second chapter.\r\n")); - } - - [Test] - public void GetUsfm_RowVerseSegment() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 2:1a", ScrVers.English) }, "First verse of the second chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That(target, Contains.Substring("\\v 1 First verse of the second chapter.\r\n")); - } - - [Test] - public void GetUsfm_UsfmVerseSegment() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 2:7", ScrVers.English) }, "Seventh verse of the second chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That(target, Contains.Substring("\\v 7a Seventh verse of the second chapter.\r\n")); - } - - [Test] - public void GetUsfm_MultipleParas() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 1:2", ScrVers.English) }, "Second verse of the first chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That(target, Contains.Substring("\\v 2 Second verse of the first chapter.\r\n\\li2\r\n")); - } - - [Test] - public void GetUsfm_Table() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 2:9", ScrVers.English) }, "Ninth verse of the second chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That(target, Contains.Substring("\\v 9 Ninth verse of the second chapter. 
\\tcr2 \\tc3 \\tcr4\r\n")); - } - - [Test] - public void GetUsfm_RangeSingleRowMultipleVerses() - { - var rows = new List<(IReadOnlyList, string)> - { - ( - new[] { new VerseRef("MAT 2:11", ScrVers.English), new VerseRef("MAT 2:12", ScrVers.English) }, - "Eleventh verse of the second chapter. Twelfth verse of the second chapter." - ) - }; - - string target = UpdateUsfm(rows); - Assert.That( - target, - Contains.Substring( - "\\v 11-12 Eleventh verse of the second chapter. Twelfth verse of the second chapter.\r\n" - ) - ); - } - - [Test] - public void GetUsfm_RangeSingleRowSingleVerse() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 2:11", ScrVers.English) }, "Eleventh verse of the second chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That(target, Contains.Substring("\\v 11-12 Eleventh verse of the second chapter.\r\n")); - } - - [Test] - public void GetUsfm_RangeMultipleRowsSingleVerse() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 2:11", ScrVers.English) }, "Eleventh verse of the second chapter."), - (new[] { new VerseRef("MAT 2:12", ScrVers.English) }, "Twelfth verse of the second chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That( - target, - Contains.Substring( - "\\v 11-12 Eleventh verse of the second chapter. Twelfth verse of the second chapter.\r\n" - ) - ); - } - - [Test] - public void GetUsfm_OptBreak() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 2:2", ScrVers.English) }, "Second verse of the second chapter."), - (new[] { new VerseRef("MAT 2:3", ScrVers.English) }, "Third verse of the second chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That( - target, - Contains.Substring("\\v 2-3 Second verse of the second chapter. 
Third verse of the second chapter.\r\n") - ); - } - - [Test] - public void GetUsfm_Milestone() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 2:10", ScrVers.English) }, "Tenth verse of the second chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That( - target, - Contains.Substring("\\v 10 Tenth verse of the second chapter. \\tc3-4 \\qt-s |Jesus\\*\\qt-e\\*\r\n") - ); - } - - [Test] - public void GetUsfm_Unmatched() - { - var rows = new List<(IReadOnlyList, string)> - { - (new[] { new VerseRef("MAT 1:3", ScrVers.English) }, "Third verse of the first chapter.") - }; - - string target = UpdateUsfm(rows); - Assert.That(target, Contains.Substring("\\v 3 Third verse of the first chapter.\r\n")); - } - - private static string UpdateUsfm( - IReadOnlyList<(IReadOnlyList, string)>? rows = null, - string? idText = null, - bool stripAllText = false - ) - { - string source = ReadUsfm(); - var updater = new UsfmVerseTextUpdater(rows, idText, stripAllText); - UsfmParser.Parse(source, updater); - return updater.GetUsfm(); - } - - private static string ReadUsfm() - { - return File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM")); - } -} diff --git a/tests/SIL.Machine.Tests/Corpora/UsxZipTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsxZipTextTests.cs index f240620c..db3583ef 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsxZipTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsxZipTextTests.cs @@ -1,5 +1,4 @@ using NUnit.Framework; -using SIL.Scripture; namespace SIL.Machine.Corpora; @@ -15,37 +14,37 @@ public void GetRows_NonEmptyText() TextRow[] segments = text.GetRows().ToArray(); Assert.That(segments.Length, Is.EqualTo(14)); - Assert.That(segments[0].Ref, Is.EqualTo(new VerseRef("MAT 1:1", env.Corpus.Versification))); + Assert.That(segments[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1", env.Corpus.Versification))); Assert.That(segments[0].Text, Is.EqualTo("Chapter one, verse one.")); - 
Assert.That(segments[1].Ref, Is.EqualTo(new VerseRef("MAT 1:2", env.Corpus.Versification))); + Assert.That(segments[1].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2", env.Corpus.Versification))); Assert.That(segments[1].Text, Is.EqualTo("Chapter one, verse two.")); - Assert.That(segments[4].Ref, Is.EqualTo(new VerseRef("MAT 1:5", env.Corpus.Versification))); + Assert.That(segments[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:5", env.Corpus.Versification))); Assert.That(segments[4].Text, Is.EqualTo("Chapter one, verse five.")); - Assert.That(segments[5].Ref, Is.EqualTo(new VerseRef("MAT 2:1", env.Corpus.Versification))); + Assert.That(segments[5].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1", env.Corpus.Versification))); Assert.That(segments[5].Text, Is.EqualTo("Chapter two, verse one.")); - Assert.That(segments[6].Ref, Is.EqualTo(new VerseRef("MAT 2:2", env.Corpus.Versification))); + Assert.That(segments[6].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:2", env.Corpus.Versification))); Assert.That(segments[6].Text, Is.EqualTo("Chapter two, verse two. 
Chapter two, verse three.")); Assert.That(segments[6].IsInRange, Is.True); - Assert.That(segments[7].Ref, Is.EqualTo(new VerseRef("MAT 2:3", env.Corpus.Versification))); + Assert.That(segments[7].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3", env.Corpus.Versification))); Assert.That(segments[7].Text, Is.Empty); Assert.That(segments[7].IsInRange, Is.True); - Assert.That(segments[8].Ref, Is.EqualTo(new VerseRef("MAT 2:4a", env.Corpus.Versification))); + Assert.That(segments[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:4a", env.Corpus.Versification))); Assert.That(segments[8].Text, Is.Empty); Assert.That(segments[8].IsInRange, Is.True); - Assert.That(segments[9].Ref, Is.EqualTo(new VerseRef("MAT 2:4b", env.Corpus.Versification))); + Assert.That(segments[9].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:4b", env.Corpus.Versification))); Assert.That(segments[9].Text, Is.EqualTo("Chapter two, verse four.")); - Assert.That(segments[10].Ref, Is.EqualTo(new VerseRef("MAT 2:5", env.Corpus.Versification))); + Assert.That(segments[10].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:5", env.Corpus.Versification))); Assert.That(segments[10].Text, Is.EqualTo("Chapter two, verse five.")); - Assert.That(segments[11].Ref, Is.EqualTo(new VerseRef("MAT 2:6", env.Corpus.Versification))); + Assert.That(segments[11].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:6", env.Corpus.Versification))); Assert.That(segments[11].Text, Is.EqualTo("Chapter two, verse six.")); } } @@ -59,11 +58,11 @@ public void GetRows_SentenceStart() TextRow[] segments = text.GetRows().ToArray(); Assert.That(segments.Length, Is.EqualTo(14)); - Assert.That(segments[3].Ref, Is.EqualTo(new VerseRef("MAT 1:4", env.Corpus.Versification))); + Assert.That(segments[3].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:4", env.Corpus.Versification))); Assert.That(segments[3].Text, Is.EqualTo("Chapter one, verse four,")); Assert.That(segments[3].IsSentenceStart, Is.True); - Assert.That(segments[4].Ref, Is.EqualTo(new VerseRef("MAT 1:5", 
env.Corpus.Versification))); + Assert.That(segments[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:5", env.Corpus.Versification))); Assert.That(segments[4].Text, Is.EqualTo("Chapter one, verse five.")); Assert.That(segments[4].IsSentenceStart, Is.False); }