Skip to content

Commit

Permalink
Support new view types when reading
Browse files Browse the repository at this point in the history
  • Loading branch information
quinnj committed Dec 1, 2024
1 parent f1a91bf commit 74140b0
Show file tree
Hide file tree
Showing 8 changed files with 226 additions and 38 deletions.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
Expand Down
3 changes: 2 additions & 1 deletion src/Arrow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ using DataAPI,
CodecZstd,
TimeZones,
BitIntegers,
ConcurrentUtilities
ConcurrentUtilities,
StringViews

export ArrowTypes

Expand Down
1 change: 1 addition & 0 deletions src/arraytypes/arraytypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -271,3 +271,4 @@ include("map.jl")
include("struct.jl")
include("unions.jl")
include("dictencoding.jl")
include("views.jl")
62 changes: 62 additions & 0 deletions src/arraytypes/views.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
    ViewElement

One 16-byte entry of a view column, made of four `Int32` fields.

`length` is the number of bytes in the value. When the value is too long to be stored
inline, `bufindex` (zero-based) selects the data buffer holding it and `offset`
(zero-based) is where its bytes start inside that buffer. For short values the bytes
following `length` hold the value itself, so `prefix`, `bufindex` and `offset` are not
meaningful as separate fields in that case.
"""
struct ViewElement
    length::Int32
    prefix::Int32
    bufindex::Int32
    offset::Int32
end

"""
Arrow.View
An `ArrowVector` where each element is a variable sized list of some kind, like an `AbstractVector` or `AbstractString`.
"""
struct View{T} <: ArrowVector{T}
    arrow::Vector{UInt8} # need to hold a reference to arrow memory blob
    validity::ValidityBitmap
    data::Vector{ViewElement} # one fixed-size entry per element
    inline::Vector{UInt8} # `data` field reinterpreted as a byte array
    buffers::Vector{Vector{UInt8}} # holds non-inlined data
    ℓ::Int # number of elements; this is what `Base.size` reports
    metadata::Union{Nothing,Base.ImmutableDict{String,String}}
end

# A `View` is one-dimensional: its size is the stored element count `ℓ`.
function Base.size(l::View)
    return (l.ℓ,)
end

# Return element `i` of the view column, or `missing` when its validity bit is unset.
#
# The bytes of the element are located in one of two places:
#   * values shorter than 13 bytes live inside the element's own 16-byte slot of
#     `l.inline`, directly after the 4-byte length;
#   * longer values live in `l.buffers[bufindex + 1]`, starting at `offset + 1`
#     (both stored fields are zero-based).
#
# When the non-missing element type is `Base.CodeUnits` (binary view) the raw bytes are
# returned as code units; otherwise (utf8 view) they are handed to
# `ArrowTypes.fromarrow(T, ...)`.
@propagate_inbounds function Base.getindex(l::View{T}, i::Integer) where {T}
    @boundscheck checkbounds(l, i)
    @inbounds v = l.data[i]
    l.validity[i] || return missing
    # Widen to `Int` before doing arithmetic: `offset + length` on two `Int32`s wraps
    # around silently once the slice ends past `typemax(Int32)`.
    len = Int(v.length)
    bytes = if len < 13
        start = (i - 1) * 16 + 5
        view(l.inline, start:(start + len - 1))
    else
        offset = Int(v.offset)
        view(l.buffers[v.bufindex + 1], (offset + 1):(offset + len))
    end
    str = StringView(bytes)
    return Base.nonmissingtype(T) <: Base.CodeUnits ? Base.CodeUnits(str) :
           ArrowTypes.fromarrow(T, str)
end

# @propagate_inbounds function Base.setindex!(l::List{T}, v, i::Integer) where {T}

# end
6 changes: 3 additions & 3 deletions src/eltypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,12 @@ function arrowtype(b, ::Type{T}) where {T<:AbstractFloat}
return Meta.FloatingPoint, Meta.floatingPointEnd(b), nothing
end

# Every UTF-8 string layout, including the `Utf8View` layout, is read as a Julia `String`.
# A separate method for just `Utf8`/`LargeUtf8` would be fully covered by this Union.
juliaeltype(f::Meta.Field, b::Union{Meta.Utf8,Meta.LargeUtf8,Meta.Utf8View}, convert) =
    String

# Size in bytes of the data held by `x`: `sizeof(x)` for a single value, and the
# recursive total over the elements for a vector.
datasizeof(x) = sizeof(x)
# `init=0` makes an empty vector report 0 bytes; without it `sum(f, itr)` throws on an
# empty collection because it cannot infer a neutral element for an arbitrary `f`.
datasizeof(x::AbstractVector) = sum(datasizeof, x; init=0)

# Every binary layout, including the `BinaryView` layout, is read as `Base.CodeUnits`.
# A separate method for just `Binary`/`LargeBinary` would be fully covered by this Union.
juliaeltype(
    f::Meta.Field,
    b::Union{Meta.Binary,Meta.LargeBinary,Meta.BinaryView},
    convert,
) = Base.CodeUnits

# Fixed-size binary values are read as a tuple of `byteWidth` bytes.
function juliaeltype(f::Meta.Field, x::Meta.FixedSizeBinary, convert)
    width = Int(x.byteWidth)
    return NTuple{width,UInt8}
end
Expand Down Expand Up @@ -428,7 +428,7 @@ ArrowTypes.JuliaType(::Val{PERIOD_SYMBOL}, ::Type{Duration{U}}) where {U} = peri
# Turn an arrow `Duration` into the requested `Dates.Period` subtype via `convert`.
function ArrowTypes.fromarrow(::Type{P}, x::Duration{U}) where {P<:Dates.Period,U}
    return convert(P, x)
end

# nested types; call juliaeltype recursively on nested children
# List-like layouts (`List`, `LargeList`, `ListView`, `LargeListView`) are read as a
# `Vector` whose element type is resolved recursively from the single child field.
function juliaeltype(
    f::Meta.Field,
    list::Union{Meta.List,Meta.LargeList,Meta.ListView,Meta.LargeListView},
    convert,
)
    child = f.children[1]
    return Vector{juliaeltype(child, buildmetadata(child), convert)}
end

Expand Down
7 changes: 6 additions & 1 deletion src/metadata/Message.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ struct RecordBatch <: FlatBuffers.Table
pos::Base.Int
end

# Names that `getproperty` understands for a `RecordBatch`. Defined once: a second
# definition with the same signature would only overwrite the first.
Base.propertynames(x::RecordBatch) =
    (:length, :nodes, :buffers, :compression, :variadicBufferCounts)

function Base.getproperty(x::RecordBatch, field::Symbol)
if field === :length
Expand All @@ -97,6 +97,11 @@ function Base.getproperty(x::RecordBatch, field::Symbol)
y = FlatBuffers.indirect(x, o + FlatBuffers.pos(x))
return FlatBuffers.init(BodyCompression, FlatBuffers.bytes(x), y)
end
elseif field === :variadicBufferCounts
o = FlatBuffers.offset(x, 12)
if o != 0
return FlatBuffers.Array{Int32}(x, o)
end
end
return nothing
end
Expand Down
90 changes: 90 additions & 0 deletions src/metadata/Schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,91 @@ durationAddUnit(b::FlatBuffers.Builder, unit::TimeUnit.T) =
FlatBuffers.prependslot!(b, 0, unit, 1)
# Close the object currently being built on `b` and return what `endobject!` returns.
function durationEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

# /// Contains two child arrays, run_ends and values.
# /// The run_ends child array must be a 16/32/64-bit integer array
# /// which encodes the indices at which the run with the value in
# /// each corresponding index in the values child array ends.
# /// Like list/struct types, the value array can be of any type.
# table RunEndEncoded {
# }
struct RunEndEncoded <: FlatBuffers.Table
    bytes::Vector{UInt8}
    pos::Base.Int
end

# The schema table above declares no fields, so there is nothing to expose.
function Base.propertynames(x::RunEndEncoded)
    return ()
end

# Builder helpers: start an object (the `0` matches the empty table; presumably the
# number of field slots) and finish it.
function runEndEncodedStart(b::FlatBuffers.Builder)
    return FlatBuffers.startobject!(b, 0)
end

function runEndEncodedEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

# /// Logically the same as Binary, but the internal representation uses a view
# /// struct that contains the string length and either the string's entire data
# /// inline (for small strings) or an inlined prefix, an index of another buffer,
# /// and an offset pointing to a slice in that buffer (for non-small strings).
# ///
# /// Since it uses a variable number of data buffers, each Field with this type
# /// must have a corresponding entry in `variadicBufferCounts`.
# table BinaryView {
# }
struct BinaryView <: FlatBuffers.Table
    bytes::Vector{UInt8}
    pos::Base.Int
end

# The schema table above declares no fields, so there is nothing to expose.
function Base.propertynames(x::BinaryView)
    return ()
end

# Builder helpers: start an object (the `0` matches the empty table; presumably the
# number of field slots) and finish it.
function binaryViewStart(b::FlatBuffers.Builder)
    return FlatBuffers.startobject!(b, 0)
end

function binaryViewEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

# /// Logically the same as Utf8, but the internal representation uses a view
# /// struct that contains the string length and either the string's entire data
# /// inline (for small strings) or an inlined prefix, an index of another buffer,
# /// and an offset pointing to a slice in that buffer (for non-small strings).
# ///
# /// Since it uses a variable number of data buffers, each Field with this type
# /// must have a corresponding entry in `variadicBufferCounts`.
# table Utf8View {
# }
struct Utf8View <: FlatBuffers.Table
    bytes::Vector{UInt8}
    pos::Base.Int
end

# The schema table above declares no fields, so there is nothing to expose.
function Base.propertynames(x::Utf8View)
    return ()
end

# Builder helpers: start an object (the `0` matches the empty table; presumably the
# number of field slots) and finish it.
function utf8ViewStart(b::FlatBuffers.Builder)
    return FlatBuffers.startobject!(b, 0)
end

function utf8ViewEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

# /// Represents the same logical types that List can, but contains offsets and
# /// sizes allowing for writes in any order and sharing of child values among
# /// list values.
# table ListView {
# }
struct ListView <: FlatBuffers.Table
    bytes::Vector{UInt8}
    pos::Base.Int
end

# The schema table above declares no fields, so there is nothing to expose.
function Base.propertynames(x::ListView)
    return ()
end

# Builder helpers: start an object (the `0` matches the empty table; presumably the
# number of field slots) and finish it.
function listViewStart(b::FlatBuffers.Builder)
    return FlatBuffers.startobject!(b, 0)
end

function listViewEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

# /// Represents the same logical types that LargeList can, but contains offsets
# /// and sizes allowing for writes in any order and sharing of child values among
# /// list values.
# table LargeListView {
# }
struct LargeListView <: FlatBuffers.Table
    bytes::Vector{UInt8}
    pos::Base.Int
end

# The schema table above declares no fields, so there is nothing to expose.
function Base.propertynames(x::LargeListView)
    return ()
end

# Builder helpers: start an object (the `0` matches the empty table; presumably the
# number of field slots) and finish it.
function largeListViewStart(b::FlatBuffers.Builder)
    return FlatBuffers.startobject!(b, 0)
end

function largeListViewEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

function Type(b::UInt8)
b == 1 && return Null
b == 2 && return Int
Expand All @@ -423,6 +508,11 @@ function Type(b::UInt8)
b == 19 && return LargeBinary
b == 20 && return LargeUtf8
b == 21 && return LargeList
b == 22 && return RunEndEncoded
b == 23 && return BinaryView
b == 24 && return Utf8View
b == 25 && return ListView
b == 26 && return LargeListView
return nothing
end

Expand Down
Loading

0 comments on commit 74140b0

Please sign in to comment.