Skip to content

Commit f0e2597

Browse files
authored
Start support for ListView, BinaryView, Utf8View, etc. (#512)
1 parent 9041cbf commit f0e2597

File tree

8 files changed

+234
-42
lines changed

8 files changed

+234
-42
lines changed

Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
3131
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
3232
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
3333
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
34+
StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
3435
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
3536
TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53"
3637
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"

src/Arrow.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ using DataAPI,
5252
CodecZstd,
5353
TimeZones,
5454
BitIntegers,
55-
ConcurrentUtilities
55+
ConcurrentUtilities,
56+
StringViews
5657

5758
export ArrowTypes
5859

src/arraytypes/arraytypes.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,3 +271,4 @@ include("map.jl")
271271
include("struct.jl")
272272
include("unions.jl")
273273
include("dictencoding.jl")
274+
include("views.jl")

src/arraytypes/views.jl

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
struct ViewElement
18+
length::Int32
19+
prefix::Int32
20+
bufindex::Int32
21+
offset::Int32
22+
end
23+
24+
"""
25+
Arrow.View
26+
27+
An `ArrowVector` where each element is a variable-sized list of some kind, like an `AbstractVector` or `AbstractString`.
28+
"""
29+
struct View{T} <: ArrowVector{T}
30+
arrow::Vector{UInt8} # need to hold a reference to arrow memory blob
31+
validity::ValidityBitmap
32+
data::Vector{ViewElement}
33+
inline::Vector{UInt8} # `data` field reinterpreted as a byte array
34+
buffers::Vector{Vector{UInt8}} # holds non-inlined data
35+
ℓ::Int
36+
metadata::Union{Nothing,Base.ImmutableDict{String,String}}
37+
end
38+
39+
Base.size(l::View) = (l.ℓ,)
40+
41+
@propagate_inbounds function Base.getindex(l::View{T}, i::Integer) where {T}
42+
@boundscheck checkbounds(l, i)
43+
@inbounds v = l.data[i]
44+
S = Base.nonmissingtype(T)
45+
if S <: Base.CodeUnits
46+
# BinaryView
47+
return !l.validity[i] ? missing :
48+
v.length < 13 ?
49+
Base.CodeUnits(StringView(@view l.inline[(((i - 1) * 16) + 5):(((i - 1) * 16) + 5 + v.length - 1)])) :
50+
Base.CodeUnits(StringView(@view l.buffers[v.bufindex + 1][(v.offset + 1):(v.offset + v.length)]))
51+
else
52+
# Utf8View
53+
return !l.validity[i] ? missing :
54+
v.length < 13 ?
55+
ArrowTypes.fromarrow(T, StringView(@view l.inline[(((i - 1) * 16) + 5):(((i - 1) * 16) + 5 + v.length - 1)])) :
56+
ArrowTypes.fromarrow(T, StringView(@view l.buffers[v.bufindex + 1][(v.offset + 1):(v.offset + v.length)]))
57+
end
58+
end
59+
60+
# @propagate_inbounds function Base.setindex!(l::View{T}, v, i::Integer) where {T}
61+
62+
# end

src/eltypes.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,12 +129,12 @@ function arrowtype(b, ::Type{T}) where {T<:AbstractFloat}
129129
return Meta.FloatingPoint, Meta.floatingPointEnd(b), nothing
130130
end
131131

132-
juliaeltype(f::Meta.Field, b::Union{Meta.Utf8,Meta.LargeUtf8}, convert) = String
132+
juliaeltype(f::Meta.Field, b::Union{Meta.Utf8,Meta.LargeUtf8,Meta.Utf8View}, convert) = String
133133

134134
datasizeof(x) = sizeof(x)
135135
datasizeof(x::AbstractVector) = sum(datasizeof, x)
136136

137-
juliaeltype(f::Meta.Field, b::Union{Meta.Binary,Meta.LargeBinary}, convert) = Base.CodeUnits
137+
juliaeltype(f::Meta.Field, b::Union{Meta.Binary,Meta.LargeBinary,Meta.BinaryView}, convert) = Base.CodeUnits
138138

139139
juliaeltype(f::Meta.Field, x::Meta.FixedSizeBinary, convert) =
140140
NTuple{Int(x.byteWidth),UInt8}
@@ -428,7 +428,7 @@ ArrowTypes.JuliaType(::Val{PERIOD_SYMBOL}, ::Type{Duration{U}}) where {U} = peri
428428
ArrowTypes.fromarrow(::Type{P}, x::Duration{U}) where {P<:Dates.Period,U} = convert(P, x)
429429

430430
# nested types; call juliaeltype recursively on nested children
431-
function juliaeltype(f::Meta.Field, list::Union{Meta.List,Meta.LargeList}, convert)
431+
function juliaeltype(f::Meta.Field, list::Union{Meta.List,Meta.LargeList,Meta.ListView,Meta.LargeListView}, convert)
432432
return Vector{juliaeltype(f.children[1], buildmetadata(f.children[1]), convert)}
433433
end
434434

src/metadata/Message.jl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ struct RecordBatch <: FlatBuffers.Table
7575
pos::Base.Int
7676
end
7777

78-
Base.propertynames(x::RecordBatch) = (:length, :nodes, :buffers, :compression)
78+
Base.propertynames(x::RecordBatch) = (:length, :nodes, :buffers, :compression, :variadicBufferCounts)
7979

8080
function Base.getproperty(x::RecordBatch, field::Symbol)
8181
if field === :length
@@ -97,6 +97,11 @@ function Base.getproperty(x::RecordBatch, field::Symbol)
9797
y = FlatBuffers.indirect(x, o + FlatBuffers.pos(x))
9898
return FlatBuffers.init(BodyCompression, FlatBuffers.bytes(x), y)
9999
end
100+
elseif field === :variadicBufferCounts
101+
o = FlatBuffers.offset(x, 12)
102+
if o != 0
103+
return FlatBuffers.Array{Int32}(x, o)
104+
end
100105
end
101106
return nothing
102107
end

src/metadata/Schema.jl

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,91 @@ durationAddUnit(b::FlatBuffers.Builder, unit::TimeUnit.T) =
401401
FlatBuffers.prependslot!(b, 0, unit, 1)
402402
durationEnd(b::FlatBuffers.Builder) = FlatBuffers.endobject!(b)
403403

404+
# /// Contains two child arrays, run_ends and values.
405+
# /// The run_ends child array must be a 16/32/64-bit integer array
406+
# /// which encodes the indices at which the run with the value in
407+
# /// each corresponding index in the values child array ends.
408+
# /// Like list/struct types, the value array can be of any type.
409+
# table RunEndEncoded {
410+
# }
411+
struct RunEndEncoded <: FlatBuffers.Table
412+
bytes::Vector{UInt8}
413+
pos::Base.Int
414+
end
415+
416+
Base.propertynames(x::RunEndEncoded) = ()
417+
418+
runEndEncodedStart(b::FlatBuffers.Builder) = FlatBuffers.startobject!(b, 0)
419+
runEndEncodedEnd(b::FlatBuffers.Builder) = FlatBuffers.endobject!(b)
420+
421+
# /// Logically the same as Binary, but the internal representation uses a view
422+
# /// struct that contains the string length and either the string's entire data
423+
# /// inline (for small strings) or an inlined prefix, an index of another buffer,
424+
# /// and an offset pointing to a slice in that buffer (for non-small strings).
425+
# ///
426+
# /// Since it uses a variable number of data buffers, each Field with this type
427+
# /// must have a corresponding entry in `variadicBufferCounts`.
428+
# table BinaryView {
429+
# }
430+
struct BinaryView <: FlatBuffers.Table
431+
bytes::Vector{UInt8}
432+
pos::Base.Int
433+
end
434+
435+
Base.propertynames(x::BinaryView) = ()
436+
437+
binaryViewStart(b::FlatBuffers.Builder) = FlatBuffers.startobject!(b, 0)
438+
binaryViewEnd(b::FlatBuffers.Builder) = FlatBuffers.endobject!(b)
439+
440+
# /// Logically the same as Utf8, but the internal representation uses a view
441+
# /// struct that contains the string length and either the string's entire data
442+
# /// inline (for small strings) or an inlined prefix, an index of another buffer,
443+
# /// and an offset pointing to a slice in that buffer (for non-small strings).
444+
# ///
445+
# /// Since it uses a variable number of data buffers, each Field with this type
446+
# /// must have a corresponding entry in `variadicBufferCounts`.
447+
# table Utf8View {
448+
# }
449+
struct Utf8View <: FlatBuffers.Table
450+
bytes::Vector{UInt8}
451+
pos::Base.Int
452+
end
453+
454+
Base.propertynames(x::Utf8View) = ()
455+
456+
utf8ViewStart(b::FlatBuffers.Builder) = FlatBuffers.startobject!(b, 0)
457+
utf8ViewEnd(b::FlatBuffers.Builder) = FlatBuffers.endobject!(b)
458+
459+
# /// Represents the same logical types that List can, but contains offsets and
460+
# /// sizes allowing for writes in any order and sharing of child values among
461+
# /// list values.
462+
# table ListView {
463+
# }
464+
struct ListView <: FlatBuffers.Table
465+
bytes::Vector{UInt8}
466+
pos::Base.Int
467+
end
468+
469+
Base.propertynames(x::ListView) = ()
470+
471+
listViewStart(b::FlatBuffers.Builder) = FlatBuffers.startobject!(b, 0)
472+
listViewEnd(b::FlatBuffers.Builder) = FlatBuffers.endobject!(b)
473+
474+
# /// Represents the same logical types that LargeList can, but contains offsets
475+
# /// and sizes allowing for writes in any order and sharing of child values among
476+
# /// list values.
477+
# table LargeListView {
478+
# }
479+
struct LargeListView <: FlatBuffers.Table
480+
bytes::Vector{UInt8}
481+
pos::Base.Int
482+
end
483+
484+
Base.propertynames(x::LargeListView) = ()
485+
486+
largeListViewStart(b::FlatBuffers.Builder) = FlatBuffers.startobject!(b, 0)
487+
largeListViewEnd(b::FlatBuffers.Builder) = FlatBuffers.endobject!(b)
488+
404489
function Type(b::UInt8)
405490
b == 1 && return Null
406491
b == 2 && return Int
@@ -423,6 +508,11 @@ function Type(b::UInt8)
423508
b == 19 && return LargeBinary
424509
b == 20 && return LargeUtf8
425510
b == 21 && return LargeList
511+
b == 22 && return RunEndEncoded
512+
b == 23 && return BinaryView
513+
b == 24 && return Utf8View
514+
b == 25 && return ListView
515+
b == 26 && return LargeListView
426516
return nothing
427517
end
428518

0 commit comments

Comments
 (0)