Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add keep_empty_rows kwarg #220

Merged
merged 4 commits into from
Mar 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions src/read.jl
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,8 @@ end
[header],
[infer_eltypes],
[stop_in_empty_row],
[stop_in_row_function]
[stop_in_row_function],
[keep_empty_rows]
) -> DataTable

Returns tabular data from a spreadsheet as a struct `XLSX.DataTable`.
Expand Down Expand Up @@ -585,7 +586,9 @@ function stop_function(r)
end
```

Rows where all column values are equal to `missing` are dropped.
`keep_empty_rows` determines whether rows where all column values are equal to `missing` are kept (`true`) or dropped (`false`) from the resulting table.
`keep_empty_rows` never affects the *bounds* of the table; the number of rows read from a sheet is only affected by `first_row`, `stop_in_empty_row` and `stop_in_row_function` (if specified).
`keep_empty_rows` is only checked once the first and last row of the table have been determined, to see whether to keep or drop empty rows between the first and the last row.

# Example

Expand All @@ -597,16 +600,16 @@ julia> df = DataFrame(XLSX.readtable("myfile.xlsx", "mysheet"))

See also: [`XLSX.gettable`](@ref).
"""
function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=false)
function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=false, keep_empty_rows::Bool=false)
c = openxlsx(source, enable_cache=enable_cache) do xf
gettable(getsheet(xf, sheet); first_row=first_row, column_labels=column_labels, header=header, infer_eltypes=infer_eltypes, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function)
gettable(getsheet(xf, sheet); first_row=first_row, column_labels=column_labels, header=header, infer_eltypes=infer_eltypes, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows=keep_empty_rows)
end
return c
end

function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}, columns::Union{ColumnRange, AbstractString}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=false)
function readtable(source::Union{AbstractString, IO}, sheet::Union{AbstractString, Int}, columns::Union{ColumnRange, AbstractString}; first_row::Union{Nothing, Int} = nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, enable_cache::Bool=false, keep_empty_rows::Bool=false)
c = openxlsx(source, enable_cache=enable_cache) do xf
gettable(getsheet(xf, sheet), columns; first_row=first_row, column_labels=column_labels, header=header, infer_eltypes=infer_eltypes, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function)
gettable(getsheet(xf, sheet), columns; first_row=first_row, column_labels=column_labels, header=header, infer_eltypes=infer_eltypes, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows=keep_empty_rows)
end
return c
end
103 changes: 56 additions & 47 deletions src/table.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ function push_unique!(vect::Vector{Symbol}, sheet::Worksheet, cell::AbstractCell
end

"""
eachtablerow(sheet, [columns]; [first_row], [column_labels], [header], [stop_in_empty_row], [stop_in_row_function])
eachtablerow(sheet, [columns]; [first_row], [column_labels], [header], [stop_in_empty_row], [stop_in_row_function], [keep_empty_rows])

Constructs an iterator of table rows. Each element of the iterator is of type `TableRow`.

Expand Down Expand Up @@ -102,6 +102,11 @@ function stop_function(r)
end
```

`keep_empty_rows` determines whether rows where all column values are equal to `missing` are kept (`true`) or skipped (`false`) by the row iterator.
`keep_empty_rows` never affects the *bounds* of the iterator; the number of rows read from a sheet is only affected by `first_row`, `stop_in_empty_row` and `stop_in_row_function` (if specified).
`keep_empty_rows` is only checked once the first and last row of the table have been determined, to see whether to keep or drop empty rows between the first and the last row.


Example code:
```
for r in XLSX.eachtablerow(sheet)
Expand All @@ -121,7 +126,8 @@ function eachtablerow(
column_labels=nothing,
header::Bool=true,
stop_in_empty_row::Bool=true,
stop_in_row_function::Union{Nothing, Function}=nothing
stop_in_row_function::Union{Nothing, Function}=nothing,
keep_empty_rows::Bool=false,
) :: TableRowIterator

if first_row == nothing
Expand Down Expand Up @@ -152,14 +158,14 @@ function eachtablerow(
end

first_data_row = header ? first_row + 1 : first_row
return TableRowIterator(sheet, Index(column_range, column_labels), first_data_row, stop_in_empty_row, stop_in_row_function)
return TableRowIterator(sheet, Index(column_range, column_labels), first_data_row, stop_in_empty_row, stop_in_row_function, keep_empty_rows)
end

function TableRowIterator(sheet::Worksheet, index::Index, first_data_row::Int, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing)
return TableRowIterator(eachrow(sheet), index, first_data_row, stop_in_empty_row, stop_in_row_function)
function TableRowIterator(sheet::Worksheet, index::Index, first_data_row::Int, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Nothing, Function}=nothing, keep_empty_rows::Bool=false)
return TableRowIterator(eachrow(sheet), index, first_data_row, stop_in_empty_row, stop_in_row_function, keep_empty_rows)
end

function eachtablerow(sheet::Worksheet; first_row::Union{Nothing, Int}=nothing, column_labels=nothing, header::Bool=true, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Function, Nothing}=nothing) :: TableRowIterator
function eachtablerow(sheet::Worksheet; first_row::Union{Nothing, Int}=nothing, column_labels=nothing, header::Bool=true, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Function, Nothing}=nothing, keep_empty_rows::Bool=false) :: TableRowIterator

if first_row == nothing
# if no columns were given,
Expand All @@ -170,43 +176,41 @@ function eachtablerow(sheet::Worksheet; first_row::Union{Nothing, Int}=nothing,

for r in eachrow(sheet)

# skip rows until we reach first_row
if row_number(r) < first_row
# skip rows until we reach first_row, and if !keep_empty_rows then skip empty rows
if row_number(r) < first_row || isempty(r) && !keep_empty_rows
continue
end

if !isempty(r)
columns_ordered = sort(collect(keys(r.rowcells)))

for (ci, cn) in enumerate(columns_ordered)
if !ismissing(getdata(r, cn))
# found a row with data. Will get ColumnRange from non-empty consecutive cells
first_row = row_number(r)
column_start = cn
column_stop = cn

if length(columns_ordered) == 1
# there's only one column
column_range = ColumnRange(column_start, column_stop)
return eachtablerow(sheet, column_range; first_row=first_row, column_labels=column_labels, header=header, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function)
else
# will figure out the column range
for ci_stop in (ci+1):length(columns_ordered)
cn_stop = columns_ordered[ci_stop]

# Will stop if finds an empty cell or a skipped column
if ismissing(getdata(r, cn_stop)) || (cn_stop - 1 != column_stop)
column_range = ColumnRange(column_start, column_stop)
return eachtablerow(sheet, column_range; first_row=first_row, column_labels=column_labels, header=header, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function)
end
column_stop = cn_stop
end
end
columns_ordered = sort(collect(keys(r.rowcells)))

for (ci, cn) in enumerate(columns_ordered)
if !ismissing(getdata(r, cn))
# found a row with data. Will get ColumnRange from non-empty consecutive cells
first_row = row_number(r)
column_start = cn
column_stop = cn

# if got here, it's because all columns are non-empty
if length(columns_ordered) == 1
# there's only one column
column_range = ColumnRange(column_start, column_stop)
return eachtablerow(sheet, column_range; first_row=first_row, column_labels=column_labels, header=header, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function)
return eachtablerow(sheet, column_range; first_row=first_row, column_labels=column_labels, header=header, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows)
else
# will figure out the column range
for ci_stop in (ci+1):length(columns_ordered)
cn_stop = columns_ordered[ci_stop]

# Will stop if finds an empty cell or a skipped column
if ismissing(getdata(r, cn_stop)) || (cn_stop - 1 != column_stop)
column_range = ColumnRange(column_start, column_stop)
return eachtablerow(sheet, column_range; first_row=first_row, column_labels=column_labels, header=header, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows)
end
column_stop = cn_stop
end
end

# if got here, it's because all columns are non-empty
column_range = ColumnRange(column_start, column_stop)
return eachtablerow(sheet, column_range; first_row=first_row, column_labels=column_labels, header=header, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows)
end
end
end
Expand Down Expand Up @@ -354,7 +358,7 @@ function Base.iterate(itr::TableRowIterator, state::TableRowIteratorState)
if itr.stop_in_empty_row
# user asked to stop fetching table rows if we find an empty row
return nothing
else
elseif !itr.keep_empty_rows
# keep looking for a non-empty row
next = iterate(itr.itr, sheet_row_iterator_state)
while next != nothing
Expand All @@ -371,8 +375,9 @@ function Base.iterate(itr::TableRowIterator, state::TableRowIteratorState)
end
end
end

@assert !is_empty_table_row(sheet_row) # if the `is_empty_table_row` check above was successful, we can't get empty sheet_row here

# unless `keep_empty_rows` is set, the `is_empty_table_row` handling above guarantees that sheet_row is not empty here
@assert !is_empty_table_row(sheet_row) || itr.keep_empty_rows
table_row = TableRow(table_row_index, itr.index, sheet_row)

# user asked to stop
Expand Down Expand Up @@ -459,7 +464,7 @@ function gettable(itr::TableRowIterator; infer_eltypes::Bool=false) :: DataTable
end

# undo insert row in case of empty rows
if is_empty_row
if is_empty_row && !itr.keep_empty_rows
for c in 1:columns_count
pop!(data[c])
end
Expand Down Expand Up @@ -491,7 +496,8 @@ end
[header],
[infer_eltypes],
[stop_in_empty_row],
[stop_in_row_function]
[stop_in_row_function],
[keep_empty_rows]
) -> DataTable

Returns tabular data from a spreadsheet as a struct `XLSX.DataTable`.
Expand Down Expand Up @@ -533,7 +539,10 @@ function stop_function(r)
end
```

Rows where all column values are equal to `missing` are dropped.
`keep_empty_rows` determines whether rows where all column values are equal to `missing` are kept (`true`) or dropped (`false`) from the resulting table.
`keep_empty_rows` never affects the *bounds* of the table; the number of rows read from a sheet is only affected by `first_row`, `stop_in_empty_row` and `stop_in_row_function` (if specified).
`keep_empty_rows` is only checked once the first and last row of the table have been determined, to see whether to keep or drop empty rows between the first and the last row.


# Example

Expand All @@ -547,12 +556,12 @@ julia> df = XLSX.openxlsx("myfile.xlsx") do xf

See also: [`XLSX.readtable`](@ref).
"""
function gettable(sheet::Worksheet, cols::Union{ColumnRange, AbstractString}; first_row::Union{Nothing, Int}=nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Function, Nothing}=nothing)
itr = eachtablerow(sheet, cols; first_row=first_row, column_labels=column_labels, header=header, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function)
function gettable(sheet::Worksheet, cols::Union{ColumnRange, AbstractString}; first_row::Union{Nothing, Int}=nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Function, Nothing}=nothing, keep_empty_rows::Bool=false)
itr = eachtablerow(sheet, cols; first_row=first_row, column_labels=column_labels, header=header, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows)
return gettable(itr; infer_eltypes=infer_eltypes)
end

function gettable(sheet::Worksheet; first_row::Union{Nothing, Int}=nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Function, Nothing}=nothing)
itr = eachtablerow(sheet; first_row=first_row, column_labels=column_labels, header=header, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function)
function gettable(sheet::Worksheet; first_row::Union{Nothing, Int}=nothing, column_labels=nothing, header::Bool=true, infer_eltypes::Bool=false, stop_in_empty_row::Bool=true, stop_in_row_function::Union{Function, Nothing}=nothing, keep_empty_rows::Bool=false)
itr = eachtablerow(sheet; first_row=first_row, column_labels=column_labels, header=header, stop_in_empty_row=stop_in_empty_row, stop_in_row_function=stop_in_row_function, keep_empty_rows)
return gettable(itr; infer_eltypes=infer_eltypes)
end
1 change: 1 addition & 0 deletions src/types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ struct TableRowIterator{I<:SheetRowIterator}
first_data_row::Int
stop_in_empty_row::Bool
stop_in_row_function::Union{Nothing, Function}
keep_empty_rows::Bool
end

struct TableRow
Expand Down
12 changes: 12 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,18 @@ end
data, col_names = dtable.data, dtable.column_labels
@test col_names == [ Symbol("Column B"), Symbol("Column C"), Symbol("Column D"), Symbol("Column E"), Symbol("Column F"), Symbol("Column G")]

# test keep_empty_rows
for (stop_in_empty_row, keep_empty_rows, n_rows) in [
(false, false, 9),
(false, true, 10),
(true, false, 8),
(true, true, 8)
]
dtable = XLSX.gettable(s; stop_in_empty_row, keep_empty_rows)
@test all(col_name -> length(Tables.getcolumn(dtable, col_name)) == n_rows, Tables.columnnames(dtable))
end


test_data = Vector{Any}(undef, 6)
test_data[1] = [1, 2, 3, 4, 5, 6, 7, 8, "trash" ]
test_data[2] = [ "Str1", missing, "Str1", "Str1", "Str2", "Str2", "Str2", "Str2", missing ]
Expand Down