Skip to content

Commit

Permalink
Fix USFM parsing bugs (#229)
Browse files Browse the repository at this point in the history
* *Add custom exception for parsing
*Fix off-by-one error
*Handle triplicate, quadruplicate, n-plicate verses
*Add test to cover triplicate verse

* Update test; add better error messages to tests

* Move try-catch to ProcessTokens

* Change error messages for ref asserts

* Updates from review

* Remove commented out code

* Add ParatextProjectTextUpdaterBase class

* Draft: Fill in zip classes

* Use new class in manual tests

* Fix typo in file name

* Add direct-from-settings constructor

* Remove constructor

* Add constructor

* Remove zip base class - unnecessary

* Move error-handling to updater base; use updater in manual test

---------

Co-authored-by: John Lambert <john_lambert@sil.org>
Co-authored-by: Damien Daspit <damien_daspit@sil.org>
  • Loading branch information
3 people authored Aug 6, 2024
1 parent 136a3ba commit cd90626
Show file tree
Hide file tree
Showing 9 changed files with 236 additions and 54 deletions.
25 changes: 25 additions & 0 deletions src/SIL.Machine/Corpora/FileParatextProjectTextUpdater.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
using System.IO;

namespace SIL.Machine.Corpora
{
public class FileParatextProjectTextUpdater : ParatextProjectTextUpdaterBase
{
private readonly string _projectDir;

public FileParatextProjectTextUpdater(string projectDir)
: base(new FileParatextProjectSettingsParser(projectDir))
{
_projectDir = projectDir;
}

protected override bool Exists(string fileName)
{
return File.Exists(Path.Combine(_projectDir, fileName));
}

protected override Stream Open(string fileName)
{
return File.OpenRead(Path.Combine(_projectDir, fileName));
}
}
}
65 changes: 65 additions & 0 deletions src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace SIL.Machine.Corpora
{
public abstract class ParatextProjectTextUpdaterBase
{
private readonly ParatextProjectSettings _settings;

protected ParatextProjectTextUpdaterBase(ParatextProjectSettings settings)
{
_settings = settings;
}

protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase settingsParser)
{
_settings = settingsParser.Parse();
}

public string UpdateUsfm(
string bookId,
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows,
string fullName = null,
bool stripAllText = false,
bool preferExistingText = true
)
{
string fileName = _settings.GetBookFileName(bookId);
if (!Exists(fileName))
return null;

string usfm;
using (var reader = new StreamReader(Open(fileName)))
{
usfm = reader.ReadToEnd();
}

var handler = new UpdateUsfmParserHandler(
rows,
fullName is null ? null : $"- {fullName}",
stripAllText,
preferExistingText: preferExistingText
);
try
{
UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification);
return handler.GetUsfm(_settings.Stylesheet);
}
catch (Exception ex)
{
var sb = new StringBuilder();
sb.Append($"An error occurred while parsing the usfm for '{bookId}`");
if (!string.IsNullOrEmpty(_settings.Name))
sb.Append($" in project '{_settings.Name}'");
sb.Append($". Error: '{ex.Message}'");
throw new InvalidOperationException(sb.ToString(), ex);
}
}

protected abstract bool Exists(string fileName);
protected abstract Stream Open(string fileName);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public override void Verse(
string pubNumber
)
{
if (state.VerseRef.Equals(_curVerseRef))
if (state.VerseRef.Equals(_curVerseRef) && !_duplicateVerse)
{
EndVerseText(state, CreateVerseRefs());
// ignore duplicate verses
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace SIL.Machine.Corpora
* This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified
* text.
*/
public class UsfmTextUpdater : ScriptureRefUsfmParserHandlerBase
public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
{
private readonly IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> _rows;
private readonly List<UsfmToken> _tokens;
Expand All @@ -20,7 +20,7 @@ public class UsfmTextUpdater : ScriptureRefUsfmParserHandlerBase
private int _rowIndex;
private int _tokenIndex;

public UsfmTextUpdater(
public UpdateUsfmParserHandler(
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows = null,
string idText = null,
bool stripAllText = false,
Expand Down Expand Up @@ -361,7 +361,7 @@ private void SkipTokens(UsfmParserState state)
private bool ReplaceWithNewTokens(UsfmParserState state)
{
bool newText = _replace.Count > 0 && _replace.Peek();
int tokenEnd = state.Index + state.SpecialTokenCount + 1;
int tokenEnd = state.Index + state.SpecialTokenCount;
bool existingText = false;
for (int index = _tokenIndex; index <= tokenEnd; index++)
{
Expand Down
1 change: 1 addition & 0 deletions src/SIL.Machine/Corpora/UsfmParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public static void Parse(
versification,
preserveWhitespace
);

parser.ProcessTokens();
}

Expand Down
29 changes: 29 additions & 0 deletions src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
using System.IO;
using System.IO.Compression;

namespace SIL.Machine.Corpora
{
public class ZipParatextProjectTextUpdater : ParatextProjectTextUpdaterBase
{
private readonly ZipArchive _archive;

public ZipParatextProjectTextUpdater(ZipArchive archive)
: base(new ZipParatextProjectSettingsParser(archive))
{
_archive = archive;
}

protected override bool Exists(string fileName)
{
return _archive.GetEntry(fileName) != null;
}

protected override Stream Open(string fileName)
{
ZipArchiveEntry entry = _archive.GetEntry(fileName);
if (entry == null)
return null;
return entry.Open();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
namespace SIL.Machine.Corpora;

[TestFixture]
public class UsfmTextUpdaterTests
public class UpdateUsfmParserHandlerTests
{
[Test]
public void GetUsfm_Verse_CharStyle()
Expand All @@ -21,7 +21,7 @@ public void GetUsfm_Verse_CharStyle()
[Test]
public void GetUsfm_IdText()
{
string target = UpdateUsfm(idText: "- Updated");
string target = UpdateUsfm(idText: "Updated");
Assert.That(target, Contains.Substring("\\id MAT - Updated\r\n"));
}

Expand Down Expand Up @@ -443,21 +443,27 @@ private static string UpdateUsfm(
)
{
if (source is null)
source = ReadUsfm();
{
var updater = new FileParatextProjectTextUpdater(CorporaTestHelpers.UsfmTestProjectPath);
return updater.UpdateUsfm(
"MAT",
rows,
fullName: idText,
stripAllText: stripAllText,
preferExistingText: preferExistingText
);
}
else
{
source = source.Trim().ReplaceLineEndings("\r\n") + "\r\n";
var updater = new UsfmTextUpdater(
rows,
idText,
stripAllText: stripAllText,
preferExistingText: preferExistingText
);
UsfmParser.Parse(source, updater);
return updater.GetUsfm();
}

private static string ReadUsfm()
{
return File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM"));
var updater = new UpdateUsfmParserHandler(
rows,
idText,
stripAllText: stripAllText,
preferExistingText: preferExistingText
);
UsfmParser.Parse(source, updater);
return updater.GetUsfm();
}
}
}
71 changes: 43 additions & 28 deletions tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public class UsfmManualTests
{
[Test]
[Ignore("This is for manual testing only. Remove this tag to run the test.")]
public async Task ParseParallelCorpusAsync()
public void ParseParallelCorpusAsync()
{
ParatextTextCorpus tCorpus =
new(projectDir: CorporaTestHelpers.UsfmTargetProjectPath, includeAllText: true, includeMarkers: true);
Expand All @@ -36,18 +36,20 @@ public async Task ParseParallelCorpusAsync()
ParatextProjectSettings targetSettings = new FileParatextProjectSettingsParser(
CorporaTestHelpers.UsfmTargetProjectPath
).Parse();

var updater = new FileParatextProjectTextUpdater(CorporaTestHelpers.UsfmTargetProjectPath);
foreach (
string sfmFileName in Directory.EnumerateFiles(
CorporaTestHelpers.UsfmTargetProjectPath,
$"{targetSettings.FileNamePrefix}*{targetSettings.FileNameSuffix}"
)
string sfmFileName in Directory
.EnumerateFiles(
CorporaTestHelpers.UsfmTargetProjectPath,
$"{targetSettings.FileNamePrefix}*{targetSettings.FileNameSuffix}"
)
.Select(path => new DirectoryInfo(path).Name)
)
{
var updater = new UsfmTextUpdater(pretranslations, stripAllText: true, preferExistingText: false);
string usfm = await File.ReadAllTextAsync(sfmFileName);
UsfmParser.Parse(usfm, updater, targetSettings.Stylesheet, targetSettings.Versification);
string newUsfm = updater.GetUsfm(targetSettings.Stylesheet);
string bookId;
if (!targetSettings.IsBookFileName(sfmFileName, out bookId))
continue;
string newUsfm = updater.UpdateUsfm(bookId, pretranslations, stripAllText: true, preferExistingText: false);
Assert.That(newUsfm, Is.Not.Null);
}
}
Expand Down Expand Up @@ -105,39 +107,52 @@ async Task GetUsfmAsync(string projectPath)
)
)
.ToArrayAsync();
List<string> sfmTexts = [];
List<string> bookIds = [];
ParatextProjectTextUpdaterBase updater;
if (projectArchive == null)
{
sfmTexts = (
await Task.WhenAll(
Directory
.EnumerateFiles(projectPath, $"{settings.FileNamePrefix}*{settings.FileNameSuffix}")
.Select(async sfmFileName => await File.ReadAllTextAsync(sfmFileName))
)
bookIds = (
Directory
.EnumerateFiles(projectPath, $"{settings.FileNamePrefix}*{settings.FileNameSuffix}")
.Select(path => new DirectoryInfo(path).Name)
.Select(filename =>
{
string bookId;
if (settings.IsBookFileName(filename, out bookId))
return bookId;
else
return "";
})
.Where(id => id != "")
).ToList();
updater = new FileParatextProjectTextUpdater(projectPath);
}
else
{
sfmTexts = projectArchive
bookIds = projectArchive
.Entries.Where(e =>
e.Name.StartsWith(settings.FileNamePrefix) && e.Name.EndsWith(settings.FileNameSuffix)
)
.Select(e =>
{
string contents;
using (var sr = new StreamReader(e.Open()))
{
contents = sr.ReadToEnd();
}
return contents;
string bookId;
if (settings.IsBookFileName(e.Name, out bookId))
return bookId;
else
return "";
})
.Where(id => id != "")
.ToList();
updater = new ZipParatextProjectTextUpdater(projectArchive);
}
foreach (string usfm in sfmTexts)
foreach (string bookId in bookIds)
{
var updater = new UsfmTextUpdater(pretranslations, stripAllText: true, preferExistingText: true);
UsfmParser.Parse(usfm, updater, settings.Stylesheet, settings.Versification);
string newUsfm = updater.GetUsfm(settings.Stylesheet);
string newUsfm = updater.UpdateUsfm(
bookId,
pretranslations,
stripAllText: true,
preferExistingText: true
);
Assert.That(newUsfm, Is.Not.Null);
}
}
Expand Down
Loading

0 comments on commit cd90626

Please sign in to comment.