Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/arraytypes/arraytypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ end

Base.size(p::ValidityBitmap) = (p.ℓ,)
nullcount(x::ValidityBitmap) = x.nc
Base.all(x::ValidityBitmap) = x.nc == 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't love overloading `Base.all` here; can we just call this `allvalid`? I don't think we need to overload anything for this.


function ValidityBitmap(x)
T = eltype(x)
Expand Down
77 changes: 46 additions & 31 deletions src/eltypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,26 @@ arrowtype(b, col::AbstractVector{T}) where {T} = arrowtype(b, maybemissing(T))
arrowtype(b, col::DictEncoded) = arrowtype(b, col.encoding.data)
arrowtype(b, col::Compressed) = arrowtype(b, col.data)

function juliaeltype(f::Meta.Field, ::Nothing, convert::Bool)
T = juliaeltype(f, convert)
function juliaeltype(f::Meta.Field, ::Nothing, convert::Bool, allvalid)
T = juliaeltype(f, convert, allvalid)
return convert ? finaljuliatype(T) : T
end

function juliaeltype(f::Meta.Field, meta::AbstractDict{String,String}, convert::Bool)
TT = juliaeltype(f, convert)
function juliaeltype(
f::Meta.Field,
meta::AbstractDict{String,String},
convert::Bool,
allvalid::Bool=false,
)
TT = juliaeltype(f, convert, allvalid)
!convert && return TT
T = finaljuliatype(TT)
if haskey(meta, "ARROW:extension:name")
typename = meta["ARROW:extension:name"]
metadata = get(meta, "ARROW:extension:metadata", "")
JT = ArrowTypes.JuliaType(Val(Symbol(typename)), maybemissing(TT), metadata)
if JT !== nothing
return f.nullable ? Union{JT,Missing} : JT
return f.nullable && !allvalid ? Union{JT,Missing} : JT
else
@warn "unsupported ARROW:extension:name type: \"$typename\", arrow type = $TT" maxlog =
1 _id = hash((:juliaeltype, typename, TT))
Expand All @@ -57,19 +62,19 @@ function juliaeltype(f::Meta.Field, meta::AbstractDict{String,String}, convert::
return something(TT, T)
end

function juliaeltype(f::Meta.Field, convert::Bool)
T = juliaeltype(f, f.type, convert)
return f.nullable ? Union{T,Missing} : T
function juliaeltype(f::Meta.Field, convert::Bool, allvalid)
T = juliaeltype(f, f.type, convert, allvalid)
return f.nullable && !allvalid ? Union{T,Missing} : T
end

juliaeltype(f::Meta.Field, ::Meta.Null, convert) = Missing
juliaeltype(f::Meta.Field, ::Meta.Null, convert, allvalid) = Missing

function arrowtype(b, ::Type{Missing})
Meta.nullStart(b)
return Meta.Null, Meta.nullEnd(b), nothing
end

function juliaeltype(f::Meta.Field, int::Meta.Int, convert)
function juliaeltype(f::Meta.Field, int::Meta.Int, convert, allvalid)
if int.is_signed
if int.bitWidth == 8
Int8
Expand Down Expand Up @@ -109,7 +114,7 @@ function arrowtype(b, ::Type{T}) where {T<:Integer}
end

# primitive types
function juliaeltype(f::Meta.Field, fp::Meta.FloatingPoint, convert)
function juliaeltype(f::Meta.Field, fp::Meta.FloatingPoint, convert, allvalid)
if fp.precision == Meta.Precision.HALF
Float16
elseif fp.precision == Meta.Precision.SINGLE
Expand All @@ -129,20 +134,21 @@ function arrowtype(b, ::Type{T}) where {T<:AbstractFloat}
return Meta.FloatingPoint, Meta.floatingPointEnd(b), nothing
end

juliaeltype(f::Meta.Field, b::Union{Meta.Utf8,Meta.LargeUtf8}, convert) = String
juliaeltype(f::Meta.Field, b::Union{Meta.Utf8,Meta.LargeUtf8}, convert, allvalid) = String

datasizeof(x) = sizeof(x)
datasizeof(x::AbstractVector) = sum(datasizeof, x)

juliaeltype(f::Meta.Field, b::Union{Meta.Binary,Meta.LargeBinary}, convert) = Base.CodeUnits
juliaeltype(f::Meta.Field, b::Union{Meta.Binary,Meta.LargeBinary}, convert, allvalid) =
Base.CodeUnits

juliaeltype(f::Meta.Field, x::Meta.FixedSizeBinary, convert) =
juliaeltype(f::Meta.Field, x::Meta.FixedSizeBinary, convert, allvalid) =
NTuple{Int(x.byteWidth),UInt8}

# arggh!
Base.write(io::IO, x::NTuple{N,T}) where {N,T} = sum(y -> Base.write(io, y), x)

juliaeltype(f::Meta.Field, x::Meta.Bool, convert) = Bool
juliaeltype(f::Meta.Field, x::Meta.Bool, convert, allvalid) = Bool

function arrowtype(b, ::Type{Bool})
Meta.boolStart(b)
Expand All @@ -157,7 +163,7 @@ Base.zero(::Type{Decimal{P,S,T}}) where {P,S,T} = Decimal{P,S,T}(T(0))
==(a::Decimal{P,S,T}, b::Decimal{P,S,T}) where {P,S,T} = ==(a.value, b.value)
Base.isequal(a::Decimal{P,S,T}, b::Decimal{P,S,T}) where {P,S,T} = isequal(a.value, b.value)

function juliaeltype(f::Meta.Field, x::Meta.Decimal, convert)
function juliaeltype(f::Meta.Field, x::Meta.Decimal, convert, allvalid)
return Decimal{x.precision,x.scale,x.bitWidth == 256 ? Int256 : Int128}
end

Expand Down Expand Up @@ -188,7 +194,7 @@ bitwidth(x::Meta.DateUnit.T) = x == Meta.DateUnit.DAY ? Int32 : Int64
Date{Meta.DateUnit.DAY}(days) = DATE(Int32(days))
Date{Meta.DateUnit.MILLISECOND}(ms) = Date{Meta.DateUnit.MILLISECOND,Int64}(Int64(ms))

juliaeltype(f::Meta.Field, x::Meta.Date, convert) = Date{x.unit,bitwidth(x.unit)}
juliaeltype(f::Meta.Field, x::Meta.Date, convert, allvalid) = Date{x.unit,bitwidth(x.unit)}
finaljuliatype(::Type{DATE}) = Dates.Date
Base.convert(::Type{Dates.Date}, x::DATE) =
Dates.Date(Dates.UTD(Int64(x.x + UNIX_EPOCH_DATE)))
Expand Down Expand Up @@ -228,7 +234,7 @@ bitwidth(x::Meta.TimeUnit.T) =
x == Meta.TimeUnit.SECOND || x == Meta.TimeUnit.MILLISECOND ? Int32 : Int64
Time{U}(x) where {U<:Meta.TimeUnit.T} = Time{U,bitwidth(U)}(bitwidth(U)(x))
storagetype(::Type{Time{U,T}}) where {U,T} = T
juliaeltype(f::Meta.Field, x::Meta.Time, convert) = Time{x.unit,bitwidth(x.unit)}
juliaeltype(f::Meta.Field, x::Meta.Time, convert, allvalid) = Time{x.unit,bitwidth(x.unit)}
finaljuliatype(::Type{<:Time}) = Dates.Time
periodtype(U::Meta.TimeUnit.T) =
U === Meta.TimeUnit.SECOND ? Dates.Second :
Expand Down Expand Up @@ -260,7 +266,7 @@ end

Base.zero(::Type{Timestamp{U,T}}) where {U,T} = Timestamp{U,T}(Int64(0))

function juliaeltype(f::Meta.Field, x::Meta.Timestamp, convert)
function juliaeltype(f::Meta.Field, x::Meta.Timestamp, convert, allvalid)
return Timestamp{x.unit,x.timezone === nothing ? nothing : Symbol(x.timezone)}
end

Expand Down Expand Up @@ -381,7 +387,7 @@ Interval{Meta.IntervalUnit.YEAR_MONTH}(x) =
Interval{Meta.IntervalUnit.DAY_TIME}(x) =
Interval{Meta.IntervalUnit.DAY_TIME,Int64}(Int64(x))

function juliaeltype(f::Meta.Field, x::Meta.Interval, convert)
function juliaeltype(f::Meta.Field, x::Meta.Interval, convert, allvalid)
return Interval{x.unit,bitwidth(x.unit)}
end

Expand All @@ -397,7 +403,7 @@ end

Base.zero(::Type{Duration{U}}) where {U} = Duration{U}(Int64(0))

function juliaeltype(f::Meta.Field, x::Meta.Duration, convert)
function juliaeltype(f::Meta.Field, x::Meta.Duration, convert, allvalid)
return Duration{x.unit}
end

Expand Down Expand Up @@ -428,8 +434,15 @@ ArrowTypes.JuliaType(::Val{PERIOD_SYMBOL}, ::Type{Duration{U}}) where {U} = peri
ArrowTypes.fromarrow(::Type{P}, x::Duration{U}) where {P<:Dates.Period,U} = convert(P, x)

# nested types; call juliaeltype recursively on nested children
function juliaeltype(f::Meta.Field, list::Union{Meta.List,Meta.LargeList}, convert)
return Vector{juliaeltype(f.children[1], buildmetadata(f.children[1]), convert)}
function juliaeltype(
f::Meta.Field,
list::Union{Meta.List,Meta.LargeList},
convert,
allvalid=false,
)
return Vector{
juliaeltype(f.children[1], buildmetadata(f.children[1]), convert, allvalid),
}
end

# arrowtype will call fieldoffset recursively for children
Expand Down Expand Up @@ -464,8 +477,8 @@ function arrowtype(b, x::List{T,O,A}) where {T,O,A}
end
end

function juliaeltype(f::Meta.Field, list::Meta.FixedSizeList, convert)
type = juliaeltype(f.children[1], buildmetadata(f.children[1]), convert)
function juliaeltype(f::Meta.Field, list::Meta.FixedSizeList, convert, allvalid=false)
type = juliaeltype(f.children[1], buildmetadata(f.children[1]), convert, allvalid)
return NTuple{Int(list.listSize),type}
end

Expand All @@ -485,16 +498,18 @@ function arrowtype(b, x::FixedSizeList{T,A}) where {T,A}
end
end

function juliaeltype(f::Meta.Field, map::Meta.Map, convert)
function juliaeltype(f::Meta.Field, map::Meta.Map, convert, allvalid)
K = juliaeltype(
f.children[1].children[1],
buildmetadata(f.children[1].children[1]),
convert,
allvalid,
)
V = juliaeltype(
f.children[1].children[2],
buildmetadata(f.children[1].children[2]),
convert,
allvalid,
)
return Dict{K,V}
end
Expand All @@ -521,9 +536,9 @@ function arrowtype(b, ::Type{KeyValue{K,V}}) where {K,V}
return Meta.Struct, Meta.structEnd(b), children
end

function juliaeltype(f::Meta.Field, list::Meta.Struct, convert)
function juliaeltype(f::Meta.Field, list::Meta.Struct, convert, allvalid)
names = Tuple(Symbol(x.name) for x in f.children)
types = Tuple(juliaeltype(x, buildmetadata(x), convert) for x in f.children)
types = Tuple(juliaeltype(x, buildmetadata(x), convert, allvalid) for x in f.children)
return NamedTuple{names,Tuple{types...}}
end

Expand All @@ -540,13 +555,13 @@ function UnionT(f::Meta.Field, convert)
UT = UnionT{
f.type.mode,
typeids,
Tuple{(juliaeltype(x, buildmetadata(x), convert) for x in f.children)...},
Tuple{(juliaeltype(x, buildmetadata(x), convert, false) for x in f.children)...},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you help me understand why we hard-code `false` for the `allvalid` argument in a number of places, while in other places we pass `all(validity)`?

}
return UT
end

juliaeltype(f::Meta.Field, u::Meta.Union, convert) =
Union{(juliaeltype(x, buildmetadata(x), convert) for x in f.children)...}
juliaeltype(f::Meta.Field, u::Meta.Union, convert, allvalid) =
Union{(juliaeltype(x, buildmetadata(x), convert, allvalid) for x in f.children)...}

function arrowtype(
b,
Expand Down
41 changes: 25 additions & 16 deletions src/table.jl
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,12 @@ function Base.iterate(x::Stream, (pos, id)=(1, 0))
push!(x.names, Symbol(field.name))
push!(
x.types,
juliaeltype(field, buildmetadata(field.custom_metadata), x.convert),
juliaeltype(
field,
buildmetadata(field.custom_metadata),
x.convert,
false,
),
)
# recursively find any dictionaries for any fields
getdictionaries!(x.dictencoded, field)
Expand Down Expand Up @@ -242,7 +247,7 @@ function Base.iterate(x::Stream, (pos, id)=(1, 0))
A = ChainedVector([values])
S =
field.dictionary.indexType === nothing ? Int32 :
juliaeltype(field, field.dictionary.indexType, false)
juliaeltype(field, field.dictionary.indexType, false, false)
x.dictencodings[id] = DictEncoding{eltype(A),S,typeof(A)}(
id,
A,
Expand Down Expand Up @@ -486,7 +491,7 @@ function Table(blobs::Vector{ArrowBlob}; convert::Bool=true)
A = ChainedVector([dictencoding.data, values])
S =
field.dictionary.indexType === nothing ? Int32 :
juliaeltype(field, field.dictionary.indexType, false)
juliaeltype(field, field.dictionary.indexType, false, false)
dictencodings[id] = DictEncoding{eltype(A),S,typeof(A)}(
id,
A,
Expand All @@ -511,7 +516,7 @@ function Table(blobs::Vector{ArrowBlob}; convert::Bool=true)
A = values
S =
field.dictionary.indexType === nothing ? Int32 :
juliaeltype(field, field.dictionary.indexType, false)
juliaeltype(field, field.dictionary.indexType, false, false)
dictencodings[id] = DictEncoding{eltype(A),S,typeof(A)}(
id,
A,
Expand Down Expand Up @@ -539,7 +544,7 @@ function Table(blobs::Vector{ArrowBlob}; convert::Bool=true)
# 158; some implementations may send 0 record batches
if !anyrecordbatches && !isnothing(sch)
for field in sch.fields
T = juliaeltype(field, buildmetadata(field), convert)
T = juliaeltype(field, buildmetadata(field), convert, false)
push!(columns(t), T[])
end
end
Expand Down Expand Up @@ -652,7 +657,10 @@ function build(field::Meta.Field, batch, rb, de, nodeidx, bufferidx, convert)
validity = buildbitmap(batch, rb, nodeidx, bufferidx)
bufferidx += 1
buffer = rb.buffers[bufferidx]
S = d.indexType === nothing ? Int32 : juliaeltype(field, d.indexType, false)
S =
d.indexType === nothing ? Int32 :
juliaeltype(field, d.indexType, false, all(validity))

bytes, indices = reinterp(S, batch, buffer, rb.compression)
encoding = de[d.id]
A = DictEncoded(
Expand Down Expand Up @@ -757,7 +765,7 @@ function build(f::Meta.Field, L::ListTypes, batch, rb, de, nodeidx, bufferidx, c
len = rb.nodes[nodeidx].length
nodeidx += 1
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
if L isa Meta.Utf8 ||
L isa Meta.LargeUtf8 ||
L isa Meta.Binary ||
Expand Down Expand Up @@ -804,7 +812,7 @@ function build(
build(f.children[1], batch, rb, de, nodeidx, bufferidx, convert)
end
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
return FixedSizeList{T,typeof(A)}(bytes, validity, A, len, meta), nodeidx, bufferidx
end

Expand All @@ -822,7 +830,7 @@ function build(f::Meta.Field, L::Meta.Map, batch, rb, de, nodeidx, bufferidx, co
nodeidx += 1
A, nodeidx, bufferidx = build(f.children[1], batch, rb, de, nodeidx, bufferidx, convert)
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
return Map{T,OT,typeof(A)}(validity, offsets, A, len, meta), nodeidx, bufferidx
end

Expand All @@ -839,12 +847,13 @@ function build(f::Meta.Field, L::Meta.Struct, batch, rb, de, nodeidx, bufferidx,
end
data = Tuple(vecs)
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
return Struct{T,typeof(data)}(validity, data, len, meta), nodeidx, bufferidx
end

function build(f::Meta.Field, L::Meta.Union, batch, rb, de, nodeidx, bufferidx, convert)
@debugv 2 "building array: L = $L"
validity = buildbitmap(batch, rb, nodeidx, bufferidx)
buffer = rb.buffers[bufferidx]
bytes, typeIds = reinterp(UInt8, batch, buffer, rb.compression)
bufferidx += 1
Expand All @@ -861,7 +870,7 @@ function build(f::Meta.Field, L::Meta.Union, batch, rb, de, nodeidx, bufferidx,
end
data = Tuple(vecs)
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
UT = UnionT(f, convert)
if L.mode == Meta.UnionMode.Dense
B = DenseUnion{T,UT,typeof(data)}(bytes, bytes2, typeIds, offsets, data, meta)
Expand All @@ -874,7 +883,7 @@ end
function build(f::Meta.Field, L::Meta.Null, batch, rb, de, nodeidx, bufferidx, convert)
@debugv 2 "building array: L = $L"
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, false)
return NullVector{maybemissing(T)}(MissingVector(rb.nodes[nodeidx].length), meta),
nodeidx + 1,
bufferidx
Expand All @@ -888,11 +897,11 @@ function build(f::Meta.Field, ::L, batch, rb, de, nodeidx, bufferidx, convert) w
buffer = rb.buffers[bufferidx]
meta = buildmetadata(f.custom_metadata)
# get storage type (non-converted)
T = juliaeltype(f, nothing, false)
T = juliaeltype(f, nothing, false, all(validity))
@debugv 2 "storage type for primitive: T = $T"
bytes, A = reinterp(Base.nonmissingtype(T), batch, buffer, rb.compression)
len = rb.nodes[nodeidx].length
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
@debugv 2 "final julia type for primitive: T = $T"
return Primitive(T, bytes, validity, A, len, meta), nodeidx + 1, bufferidx + 1
end
Expand All @@ -904,7 +913,7 @@ function build(f::Meta.Field, L::Meta.Bool, batch, rb, de, nodeidx, bufferidx, c
buffer = rb.buffers[bufferidx]
meta = buildmetadata(f.custom_metadata)
# get storage type (non-converted)
T = juliaeltype(f, nothing, false)
T = juliaeltype(f, nothing, false, all(validity))
@debugv 2 "storage type for primitive: T = $T"
buffer = rb.buffers[bufferidx]
voff = batch.pos + buffer.offset
Expand All @@ -921,6 +930,6 @@ function build(f::Meta.Field, L::Meta.Bool, batch, rb, de, nodeidx, bufferidx, c
# return ValidityBitmap(decodedbytes, 1, node.length, node.null_count)
end
len = rb.nodes[nodeidx].length
T = juliaeltype(f, meta, convert)
T = juliaeltype(f, meta, convert, all(validity))
return BoolVector{T}(decodedbytes, pos, validity, len, meta), nodeidx + 1, bufferidx + 1
end