Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix usfm parsing bugs #447

Merged
merged 16 commits into from
Aug 7, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
/// <summary>
/// Operations on Paratext project files that are addressed by file name.
/// </summary>
// NOTE(review): this page renders a diff without +/- markers; ReadParatextProjectBookAsync appears to be
// the member being replaced by GetZipParatextProjectTextUpdater — confirm against the merged file.
public interface IScriptureDataFileService
{
/// <summary>
/// Returns the <see cref="ParatextProjectSettings"/> parsed for the project in <paramref name="filename"/>.
/// </summary>
ParatextProjectSettings GetParatextProjectSettings(string filename);
/// <summary>
/// Reads the contents of the file for <paramref name="book"/> from the project in
/// <paramref name="filename"/>. The implementation shown alongside this interface returns
/// <c>null</c> when the project has no entry for that book.
/// </summary>
Task<string?> ReadParatextProjectBookAsync(string filename, string book);
/// <summary>
/// Returns a <see cref="ZipParatextProjectTextUpdater"/> built on the zip container opened for
/// <paramref name="filename"/>.
/// </summary>
// NOTE(review): the implementation shown alongside this interface opens the container with a
// `using` declaration and then returns the updater, so the updater may be holding an already
// disposed container by the time callers use it — confirm and drop the `using` or transfer ownership.
ZipParatextProjectTextUpdater GetZipParatextProjectTextUpdater(string filename);
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,10 @@ public ParatextProjectSettings GetParatextProjectSettings(string filename)
return ParseProjectSettings(container);
}

public async Task<string?> ReadParatextProjectBookAsync(string filename, string book)
public ZipParatextProjectTextUpdater GetZipParatextProjectTextUpdater(string filename)
{
using IZipContainer container = _fileSystem.OpenZipFile(GetFilePath(filename));
ParatextProjectSettings settings = ParseProjectSettings(container);
string entryName = settings.GetBookFileName(book);
if (!container.EntryExists(entryName))
return null;
using StreamReader reader = new(container.OpenEntry(entryName));
return await reader.ReadToEndAsync();
return new ZipParatextProjectTextUpdater(container);
}

private string GetFilePath(string filename)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
namespace Serval.Shared.Services;

public class ZipParatextProjectTextUpdater : ParatextProjectTextUpdaterBase

Check failure on line 3 in src/Serval/src/Serval.Shared/Services/ZipParatextProjectTextUpdater.cs

View workflow job for this annotation

GitHub Actions / Build

The type or namespace name 'ParatextProjectTextUpdaterBase' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 3 in src/Serval/src/Serval.Shared/Services/ZipParatextProjectTextUpdater.cs

View workflow job for this annotation

GitHub Actions / Build

The type or namespace name 'ParatextProjectTextUpdaterBase' could not be found (are you missing a using directive or an assembly reference?)
{
/// <summary>
/// Creates an updater that reads from <paramref name="container"/>. The base class receives a
/// <see cref="ZipParatextProjectSettingsParser"/> constructed over the same container.
/// </summary>
/// <param name="container">Zip container that backs <c>Exists</c> and <c>Open</c>.</param>
public ZipParatextProjectTextUpdater(IZipContainer container)
    : base(new ZipParatextProjectSettingsParser(container)) => _projectContainer = container;

/// <summary>
/// Creates an updater that reads from <paramref name="container"/> and passes the supplied
/// <paramref name="settings"/> straight to the base class.
/// </summary>
/// <param name="container">Zip container that backs <c>Exists</c> and <c>Open</c>.</param>
/// <param name="settings">Project settings forwarded to the base constructor.</param>
public ZipParatextProjectTextUpdater(IZipContainer container, ParatextProjectSettings settings)
    : base(settings) => _projectContainer = container;

private readonly IZipContainer _projectContainer;

/// <summary>
/// Reports whether the project container has an entry named <paramref name="fileName"/>.
/// </summary>
protected override bool Exists(string fileName) => _projectContainer.EntryExists(fileName);

/// <summary>
/// Opens the project container entry named <paramref name="fileName"/> and returns its stream.
/// </summary>
protected override Stream Open(string fileName) => _projectContainer.OpenEntry(fileName);
}
160 changes: 70 additions & 90 deletions src/Serval/src/Serval.Translation/Services/PretranslationService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,107 +71,87 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken)
// Update the target book if it exists
if (template is PretranslationUsfmTemplate.Auto or PretranslationUsfmTemplate.Target)
{
string? usfm = await _scriptureDataFileService.ReadParatextProjectBookAsync(targetFile.Filename, textId);
if (usfm is not null)
// the pretranslations are generated from the source book and inserted into the target book
// use relaxed references since the USFM structure may not be the same
pretranslations = pretranslations.Select(p =>
((IReadOnlyList<ScriptureRef>)p.Refs.Select(r => r.ToRelaxed()).ToArray(), p.Translation)
);
Shared.Services.ZipParatextProjectTextUpdater updater =
_scriptureDataFileService.GetZipParatextProjectTextUpdater(targetFile.Filename);
string usfm = "";
switch (textOrigin)
{
// the pretranslations are generated from the source book and inserted into the target book
// use relaxed references since the USFM structure may not be the same
pretranslations = pretranslations.Select(p =>
((IReadOnlyList<ScriptureRef>)p.Refs.Select(r => r.ToRelaxed()).ToArray(), p.Translation)
);
switch (textOrigin)
{
case PretranslationUsfmTextOrigin.PreferExisting:
return UpdateUsfm(
targetSettings,
usfm,
pretranslations,
fullName: targetSettings.FullName,
stripAllText: false,
preferExistingText: true
);
case PretranslationUsfmTextOrigin.PreferPretranslated:
return UpdateUsfm(
targetSettings,
usfm,
pretranslations,
fullName: targetSettings.FullName,
stripAllText: false,
preferExistingText: false
);
case PretranslationUsfmTextOrigin.OnlyExisting:
return UpdateUsfm(
targetSettings,
usfm,
pretranslations: [], // don't put any pretranslations, we only want the existing text.
fullName: targetSettings.FullName,
stripAllText: false,
preferExistingText: false
);
case PretranslationUsfmTextOrigin.OnlyPretranslated:
return UpdateUsfm(
targetSettings,
usfm,
pretranslations,
fullName: targetSettings.FullName,
stripAllText: true,
preferExistingText: false
);
}
case PretranslationUsfmTextOrigin.PreferExisting:
usfm = updater.UpdateUsfm(
textId,
pretranslations.ToList(),
fullName: targetSettings.FullName,
stripAllText: false,
preferExistingText: true
);
break;
case PretranslationUsfmTextOrigin.PreferPretranslated:
usfm = updater.UpdateUsfm(
textId,
pretranslations.ToList(),
fullName: targetSettings.FullName,
stripAllText: false,
preferExistingText: false
);
break;
case PretranslationUsfmTextOrigin.OnlyExisting:
usfm = updater.UpdateUsfm(
textId,
[], // don't put any pretranslations, we only want the existing text.
fullName: targetSettings.FullName,
stripAllText: false,
preferExistingText: false
);
break;
case PretranslationUsfmTextOrigin.OnlyPretranslated:
usfm = updater.UpdateUsfm(
textId,
pretranslations.ToList(),
fullName: targetSettings.FullName,
stripAllText: true,
preferExistingText: false
);
break;
}
// In order to support PretranslationUsfmTemplate.Auto
if (usfm != null)
return usfm;
}

if (template is PretranslationUsfmTemplate.Auto or PretranslationUsfmTemplate.Source)
{
Shared.Services.ZipParatextProjectTextUpdater updater =
_scriptureDataFileService.GetZipParatextProjectTextUpdater(sourceFile.Filename);

// Copy and update the source book if it exists
string? usfm = await _scriptureDataFileService.ReadParatextProjectBookAsync(sourceFile.Filename, textId);
if (usfm is not null)
switch (textOrigin)
{
switch (textOrigin)
{
case PretranslationUsfmTextOrigin.PreferExisting:
case PretranslationUsfmTextOrigin.PreferPretranslated:
case PretranslationUsfmTextOrigin.OnlyPretranslated:
return UpdateUsfm(
sourceSettings,
usfm,
pretranslations,
fullName: targetSettings.FullName,
stripAllText: true,
preferExistingText: true
);
case PretranslationUsfmTextOrigin.OnlyExisting:
return UpdateUsfm(
sourceSettings,
usfm,
pretranslations: [], // don't pass the pretranslations, we only want the existing text.
fullName: targetSettings.FullName,
stripAllText: true,
preferExistingText: true
);
}
case PretranslationUsfmTextOrigin.PreferExisting:
case PretranslationUsfmTextOrigin.PreferPretranslated:
case PretranslationUsfmTextOrigin.OnlyPretranslated:
return updater.UpdateUsfm(
textId,
pretranslations.ToList(),
fullName: targetSettings.FullName,
stripAllText: true,
preferExistingText: true
);
case PretranslationUsfmTextOrigin.OnlyExisting:
return updater.UpdateUsfm(
textId,
[], // don't pass the pretranslations, we only want the existing text.
fullName: targetSettings.FullName,
stripAllText: true,
preferExistingText: true
);
}
}

return "";
}

/// <summary>
/// Parses <paramref name="usfm"/> with the stylesheet and versification from
/// <paramref name="settings"/>, using a <c>UsfmTextUpdater</c> as the parse handler, and returns
/// the USFM that the updater produces.
/// </summary>
/// <param name="settings">Supplies the stylesheet and versification used for parsing and output.</param>
/// <param name="usfm">USFM text to parse.</param>
/// <param name="pretranslations">Reference/translation pairs handed to the updater as an array.</param>
/// <param name="fullName">When not null, passed to the updater as <c>"- {fullName}"</c>; otherwise null is passed.</param>
/// <param name="stripAllText">Forwarded unchanged to the updater.</param>
/// <param name="preferExistingText">Forwarded unchanged to the updater.</param>
/// <returns>The result of the updater's <c>GetUsfm</c> call.</returns>
private static string UpdateUsfm(
    ParatextProjectSettings settings,
    string usfm,
    IEnumerable<(IReadOnlyList<ScriptureRef>, string)> pretranslations,
    string? fullName = null,
    bool stripAllText = false,
    bool preferExistingText = true
)
{
    // Materialize the rows first, matching the argument evaluation order of the original call.
    (IReadOnlyList<ScriptureRef>, string)[] rows = pretranslations.ToArray();
    string? formattedName = fullName is null ? null : $"- {fullName}";

    UsfmTextUpdater handler = new(rows, formattedName, stripAllText, preferExistingText: preferExistingText);
    UsfmParser.Parse(usfm, handler, settings.Stylesheet, settings.Versification);

    return handler.GetUsfm(settings.Stylesheet);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,15 @@ public void GetParatextProjectSettings()
}

[Test]
public async Task ReadParatextProjectBookAsync_Exists()
public void GetZipParatextProjectTextUpdater()
{
TestEnvironment env = new();
string? usfm = await env.Service.ReadParatextProjectBookAsync("file1.zip", "MAT");
Assert.That(usfm, Is.Not.Null);
ZipParatextProjectTextUpdater updater = env.Service.GetZipParatextProjectTextUpdater("file1.zip");
Assert.That(
usfm.Replace("\r\n", "\n"),
updater.UpdateUsfm("MAT", [], preferExistingText: true),
Is.EqualTo(
@"\id MAT - PROJ
\h Matthew
$@"\id MAT - PROJ
\h {Canon.BookIdToEnglishName("MAT")}
\c 1
\p
\v 1 Chapter one, verse one.
Expand All @@ -30,19 +29,11 @@ public async Task ReadParatextProjectBookAsync_Exists()
\p
\v 1 Chapter two, verse one.
\v 2 Chapter two, verse two.
".Replace("\r\n", "\n")
".Replace("\n", "\r\n")
)
);
}

[Test]
public async Task ReadParatextProjectBookAsync_DoesNotExist()
{
    var environment = new TestEnvironment();

    // Requesting MRK from file1.zip is expected to yield null rather than book text.
    Assert.That(await environment.Service.ReadParatextProjectBookAsync("file1.zip", "MRK"), Is.Null);
}

private class TestEnvironment
{
public TestEnvironment()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,42 +340,53 @@ public TestEnvironment()
ScriptureDataFileService = Substitute.For<IScriptureDataFileService>();
ScriptureDataFileService.GetParatextProjectSettings("file1.zip").Returns(CreateProjectSettings("SRC"));
ScriptureDataFileService.GetParatextProjectSettings("file2.zip").Returns(CreateProjectSettings("TRG"));
ScriptureDataFileService
.ReadParatextProjectBookAsync("file1.zip", "MAT")
.Returns(Task.FromResult<string?>(SourceUsfm));
ScriptureDataFileService
.ReadParatextProjectBookAsync("file2.zip", "MAT")
.Returns(Task.FromResult<string?>(null));
var zipSubstituteSource = Substitute.For<IZipContainer>();
var zipSubstituteTarget = Substitute.For<IZipContainer>();
zipSubstituteSource.OpenEntry("MATSRC.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes(SourceUsfm)));
zipSubstituteTarget.OpenEntry("MATTRG.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes("")));
zipSubstituteSource.EntryExists(Arg.Any<string>()).Returns(false);
zipSubstituteTarget.EntryExists(Arg.Any<string>()).Returns(false);
zipSubstituteSource.EntryExists("MATSRC.SFM").Returns(true);
zipSubstituteTarget.EntryExists("MATTRG.SFM").Returns(true);
TargetZipContainer = zipSubstituteTarget;
var textUpdaterSource = new Shared.Services.ZipParatextProjectTextUpdater(
zipSubstituteSource,
CreateProjectSettings("SRC")
);
var textUpdaterTarget = new Shared.Services.ZipParatextProjectTextUpdater(
zipSubstituteTarget,
CreateProjectSettings("TRG")
);
ScriptureDataFileService.GetZipParatextProjectTextUpdater("file1.zip").Returns(textUpdaterSource);
ScriptureDataFileService.GetZipParatextProjectTextUpdater("file2.zip").Returns(textUpdaterTarget);
Service = new PretranslationService(Pretranslations, Engines, ScriptureDataFileService);
}

public PretranslationService Service { get; }
public MemoryRepository<Pretranslation> Pretranslations { get; }
public MemoryRepository<Engine> Engines { get; }
public IScriptureDataFileService ScriptureDataFileService { get; }
public IZipContainer TargetZipContainer { get; }

public async Task<string> GetUsfmAsync(
PretranslationUsfmTextOrigin textOrigin,
PretranslationUsfmTemplate template
)
{
return (
await Service.GetUsfmAsync(
engineId: "engine1",
modelRevision: 1,
corpusId: "corpus1",
textId: "MAT",
textOrigin: textOrigin,
template: template
)
).Replace("\r\n", "\n");
string usfm = await Service.GetUsfmAsync(
engineId: "engine1",
modelRevision: 1,
corpusId: "corpus1",
textId: "MAT",
textOrigin: textOrigin,
template: template
);
return usfm.Replace("\r\n", "\n");
}

public void AddMatthewToTarget()
{
ScriptureDataFileService
.ReadParatextProjectBookAsync("file2.zip", "MAT")
.Returns(Task.FromResult<string?>(TargetUsfm));
TargetZipContainer.OpenEntry("MATTRG.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes(TargetUsfm)));
}

private static ParatextProjectSettings CreateProjectSettings(string name)
Expand Down
Loading