Skip to content

Commit

Permalink
Address UTF-8 Detection In Get-Content -Tail (PowerShell#11899)
Browse files Browse the repository at this point in the history
- Addresses a comparison failure that causes UTF-8 detection to fail, which in turn causes Get-Content -Tail to fall back to forward lookups because the encoding type cannot be detected. The misdetection is possibly due to the incoming encoding object being of type System.Text.UTF8Encoding, whereas the comparison uses the object Encoding.UTF8, which is an instance of the derived type System.Text.UTF8Encoding+UTF8EncodingSealed.
- See PowerShell#11830

- Added 'OEM', 'UTF8BOM', and 'UTF8NoBOM' as explicit encodings for existing Get-Content -Tail tests.

* Add Multi-Byte Unicode Tail Character Tests

- Modified the -Tail encoding test to use three different test sets: utf-8, utf-16, and utf-32. The test verifies that the content returned by -Tail matches the corresponding line returned by a regular Get-Content, using both an explicit and an implicit encoding.

* Remove BigEndianUnicode Reference In Comment
  • Loading branch information
NoMoreFood committed Mar 14, 2020
1 parent 47645e0 commit 07962a9
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1434,7 +1434,7 @@ private int RefillByteBuff()
int toRead = lengthLeft > BuffSize ? BuffSize : (int)lengthLeft;
_stream.Seek(-toRead, SeekOrigin.Current);

if (_currentEncoding.Equals(Encoding.UTF8))
if (_currentEncoding is UTF8Encoding)
{
// It's UTF-8, we need to detect the starting byte of a character
do
Expand All @@ -1460,14 +1460,12 @@ private int RefillByteBuff()
_byteCount += _stream.Read(_byteBuff, _byteCount, (int)(lengthLeft - _stream.Position));
_stream.Position = _currentPosition;
}
else if (_currentEncoding.Equals(Encoding.Unicode) ||
_currentEncoding.Equals(Encoding.BigEndianUnicode) ||
_currentEncoding.Equals(Encoding.UTF32) ||
_currentEncoding.Equals(Encoding.ASCII) ||
else if (_currentEncoding is UnicodeEncoding ||
_currentEncoding is UTF32Encoding ||
_currentEncoding is ASCIIEncoding ||
IsSingleByteCharacterSet())
{
// Unicode -- two bytes per character
// BigEndianUnicode -- two types per character
// UTF-32 -- four bytes per character
// ASCII -- one byte per character
// The BufferSize will be a multiple of 4, so we can just read toRead number of bytes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,35 +93,44 @@ Describe "Get-Content" -Tags "CI" {
{ Get-Content -Path Variable:\PSHOME -Tail 1 -TotalCount 5 -ErrorAction Stop} | Should -Throw -ErrorId 'TailAndHeadCannotCoexist,Microsoft.PowerShell.Commands.GetContentCommand'
}

It 'Verifies -Tail with content that uses an explicit encoding' -TestCases @(
It 'Verifies -Tail with content that uses an explicit/implicit encoding' -TestCases @(
@{EncodingName = 'String'},
@{EncodingName = 'OEM'},
@{EncodingName = 'Unicode'},
@{EncodingName = 'BigEndianUnicode'},
@{EncodingName = 'UTF8'},
@{EncodingName = 'UTF8BOM'},
@{EncodingName = 'UTF8NoBOM'},
@{EncodingName = 'UTF7'},
@{EncodingName = 'UTF32'},
@{EncodingName = 'Ascii'}
){
param($EncodingName)

$content = @"
one
two
foo
bar
baz
"@
$expected = 'foo'
$tailCount = 3

$testPath = Join-Path -Path $TestDrive -ChildPath 'TailWithEncoding.txt'
$content | Set-Content -Path $testPath -Encoding $encodingName
$expected = 'foo'

$actual = Get-Content -Path $testPath -Tail $tailCount -Encoding $encodingName
$actual | Should -BeOfType string
$actual.Length | Should -Be $tailCount
$actual[0] | Should -BeExactly $expected
$contentSets =
@(@('a1','aa2','aaa3','aaaa4','aaaaa5'), # utf-8
@('€1','€€2','€€€3','€€€€4','€€€€€5'), # utf-16
@('𐍈1','𐍈𐍈2','𐍈𐍈𐍈3','𐍈𐍈𐍈𐍈4','𐍈𐍈𐍈𐍈𐍈5')) # utf-32
ForEach ($content in $contentSets)
{
$tailCount = 3
$testPath = Join-Path -Path $TestDrive -ChildPath 'TailWithEncoding.txt'
$content | Set-Content -Path $testPath -Encoding $EncodingName

# read and verify using explicit encoding
$expected = (Get-Content -Path $testPath -Encoding $EncodingName)[-$tailCount]
$actual = Get-Content -Path $testPath -Tail $tailCount -Encoding $EncodingName
$actual | Should -BeOfType string
$actual.Length | Should -Be $tailCount
$actual[0] | Should -BeExactly $expected

# read and verify using implicit encoding
$expected = (Get-Content -Path $testPath)[-$tailCount]
$actual = Get-Content -Path $testPath -Tail $tailCount
$actual | Should -BeOfType string
$actual.Length | Should -Be $tailCount
$actual[0] | Should -BeExactly $expected
}
}

It "should Get-Content with a variety of -Tail and -ReadCount: <test>" -TestCases @(
Expand Down

0 comments on commit 07962a9

Please sign in to comment.