From 899bddb8dc4492e342a0568f2eb8659ec1335cbe Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Mon, 14 Aug 2023 14:16:46 -0500 Subject: [PATCH] move timezone support to extension --- Project.toml | 11 +++- ext/ArrowTimeZonesExt.jl | 121 +++++++++++++++++++++++++++++++++++++++ src/Arrow.jl | 7 ++- src/eltypes.jl | 103 --------------------------------- 4 files changed, 137 insertions(+), 105 deletions(-) create mode 100644 ext/ArrowTimeZonesExt.jl diff --git a/Project.toml b/Project.toml index 167cb9aa..40fe4476 100644 --- a/Project.toml +++ b/Project.toml @@ -17,7 +17,7 @@ name = "Arrow" uuid = "69666777-d1a9-59fb-9406-91d4454c9d45" authors = ["quinnj "] -version = "2.6.2" +version = "3.0.0" [deps] ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd" @@ -52,3 +52,12 @@ Tables = "1.1" TimeZones = "1" TranscodingStreams = "0.9.12" julia = "1.6" + +[weakdeps] +TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53" + +[extensions] +# name of extension to the left +# extension dependencies required to load the extension to the right +# use a list for multiple extension dependencies +ArrowTimeZonesExt = ["TimeZones"] diff --git a/ext/ArrowTimeZonesExt.jl b/ext/ArrowTimeZonesExt.jl new file mode 100644 index 00000000..e64cff9b --- /dev/null +++ b/ext/ArrowTimeZonesExt.jl @@ -0,0 +1,121 @@ +module ArrowTimeZonesExt + +using Arrow +using Dates +using TimeZones + +using Arrow: ArrowTypes, DATETIME, FlatBuffers, Meta, periodtype, Timestamp, UNIX_EPOCH_DATETIME + +Arrow.finaljuliatype(::Type{Timestamp{U,TZ}}) where {U,TZ} = ZonedDateTime +Arrow.finaljuliatype(::Type{Timestamp{U,nothing}}) where {U} = DateTime + +@noinline warntimestamp(U, T) = + @warn "automatically converting Arrow.Timestamp with precision = $U to `$T` which only supports millisecond precision; conversion may be lossy; to avoid converting, pass `Arrow.Table(source; convert=false)`" maxlog = + 1 _id = hash((:warntimestamp, U, T)) + +function Base.convert(::Type{ZonedDateTime}, x::Timestamp{U,TZ}) where {U,TZ} + (U 
=== Meta.TimeUnit.MICROSECOND || U == Meta.TimeUnit.NANOSECOND) && + warntimestamp(U, ZonedDateTime) + return ZonedDateTime( + Dates.DateTime( + Dates.UTM(Int64(Dates.toms(periodtype(U)(x.x)) + UNIX_EPOCH_DATETIME)), + ), + TimeZone(String(TZ)); + from_utc=true, + ) +end + +function Base.convert(::Type{DateTime}, x::Timestamp{U,nothing}) where {U} + (U === Meta.TimeUnit.MICROSECOND || U == Meta.TimeUnit.NANOSECOND) && + warntimestamp(U, DateTime) + return Dates.DateTime( + Dates.UTM(Int64(Dates.toms(periodtype(U)(x.x)) + UNIX_EPOCH_DATETIME)), + ) +end + +Base.convert(::Type{Timestamp{Meta.TimeUnit.MILLISECOND,TZ}}, x::ZonedDateTime) where {TZ} = + Timestamp{Meta.TimeUnit.MILLISECOND,TZ}( + Int64(Dates.value(DateTime(x, UTC)) - UNIX_EPOCH_DATETIME), + ) +Base.convert(::Type{Timestamp{Meta.TimeUnit.MILLISECOND,nothing}}, x::DateTime) = + Timestamp{Meta.TimeUnit.MILLISECOND,nothing}( + Int64(Dates.value(x) - UNIX_EPOCH_DATETIME), + ) + +function Arrow.arrowtype(b, ::Type{Timestamp{U,TZ}}) where {U,TZ} + tz = TZ !== nothing ? 
FlatBuffers.createstring!(b, String(TZ)) : FlatBuffers.UOffsetT(0) + Meta.timestampStart(b) + Meta.timestampAddUnit(b, U) + Meta.timestampAddTimezone(b, tz) + return Meta.Timestamp, Meta.timestampEnd(b), nothing +end + +ArrowTypes.ArrowType(::Type{Dates.DateTime}) = DATETIME +ArrowTypes.toarrow(x::Dates.DateTime) = convert(DATETIME, x) +const DATETIME_SYMBOL = Symbol("JuliaLang.DateTime") +ArrowTypes.arrowname(::Type{Dates.DateTime}) = DATETIME_SYMBOL +ArrowTypes.JuliaType(::Val{DATETIME_SYMBOL}, S) = Dates.DateTime +ArrowTypes.fromarrow(::Type{Dates.DateTime}, x::Timestamp) = convert(Dates.DateTime, x) +ArrowTypes.fromarrow(::Type{Dates.DateTime}, x::Arrow.Date{Meta.DateUnit.MILLISECOND,Int64}) = + convert(Dates.DateTime, x) +ArrowTypes.default(::Type{Dates.DateTime}) = Dates.DateTime(1, 1, 1, 1, 1, 1) + +ArrowTypes.ArrowType(::Type{ZonedDateTime}) = Timestamp +ArrowTypes.toarrow(x::ZonedDateTime) = + convert(Timestamp{Meta.TimeUnit.MILLISECOND,Symbol(x.timezone)}, x) +const ZONEDDATETIME_SYMBOL = Symbol("JuliaLang.ZonedDateTime-UTC") +ArrowTypes.arrowname(::Type{ZonedDateTime}) = ZONEDDATETIME_SYMBOL +ArrowTypes.JuliaType(::Val{ZONEDDATETIME_SYMBOL}, S) = ZonedDateTime +ArrowTypes.fromarrow(::Type{ZonedDateTime}, x::Timestamp) = convert(ZonedDateTime, x) +ArrowTypes.default(::Type{TimeZones.ZonedDateTime}) = + TimeZones.ZonedDateTime(1, 1, 1, 1, 1, 1, TimeZones.tz"UTC") + +# Backwards compatibility: older versions of Arrow saved ZonedDateTime's with this metadata: +const OLD_ZONEDDATETIME_SYMBOL = Symbol("JuliaLang.ZonedDateTime") +# and stored the local time instead of the UTC time. 
+struct LocalZonedDateTime end +ArrowTypes.JuliaType(::Val{OLD_ZONEDDATETIME_SYMBOL}, S) = LocalZonedDateTime +function ArrowTypes.fromarrow(::Type{LocalZonedDateTime}, x::Timestamp{U,TZ}) where {U,TZ} + (U === Meta.TimeUnit.MICROSECOND || U == Meta.TimeUnit.NANOSECOND) && + warntimestamp(U, ZonedDateTime) + return ZonedDateTime( + Dates.DateTime( + Dates.UTM(Int64(Dates.toms(periodtype(U)(x.x)) + UNIX_EPOCH_DATETIME)), + ), + TimeZone(String(TZ)), + ) +end + +""" + Arrow.ToTimestamp(x::AbstractVector{ZonedDateTime}) + +Wrapper array that provides a more efficient encoding of `ZonedDateTime` elements to the arrow format. In the arrow format, +timestamp columns with timezone information are encoded as the arrow equivalent of a Julia type parameter, meaning an entire column +_should_ have elements all with the same timezone. If a `ZonedDateTime` column is passed to `Arrow.write`, for correctness, it must +scan each element to check each timezone. `Arrow.ToTimestamp` provides a "bypass" of this process by encoding the timezone of the +first element of the `AbstractVector{ZonedDateTime}`, which in turn allows `Arrow.write` to avoid costly checking/conversion and +can encode the `ZonedDateTime` as `Arrow.Timestamp` directly. The timezone is taken from `x[1]` only (so `x` must be non-empty) and the remaining elements are not checked against it. 
+""" +struct ToTimestamp{A,TZ} <: AbstractVector{Timestamp{Meta.TimeUnit.MILLISECOND,TZ}} + data::A # AbstractVector{ZonedDateTime} +end + +ToTimestamp(x::A) where {A<:AbstractVector{ZonedDateTime}} = + ToTimestamp{A,Symbol(x[1].timezone)}(x) +Base.IndexStyle(::Type{<:ToTimestamp}) = Base.IndexLinear() +Base.size(x::ToTimestamp) = (length(x.data),) +Base.eltype(::Type{ToTimestamp{A,TZ}}) where {A,TZ} = + Timestamp{Meta.TimeUnit.MILLISECOND,TZ} +Base.getindex(x::ToTimestamp{A,TZ}, i::Integer) where {A,TZ} = + convert(Timestamp{Meta.TimeUnit.MILLISECOND,TZ}, getindex(x.data, i)) + + +function __init__() + # bind the ToTimestamp type defined in this module to the name `ToTimestamp` inside the top-level Arrow module so it stays reachable as `Arrow.ToTimestamp`; only done on Julia 1.9 or newer (see the `@static` check below) + @static if VERSION >= v"1.9" + setglobal!(Arrow, :ToTimestamp, ToTimestamp) + end +end + + +end # module diff --git a/src/Arrow.jl b/src/Arrow.jl index efef8266..6fc96ea1 100644 --- a/src/Arrow.jl +++ b/src/Arrow.jl @@ -51,7 +51,6 @@ using DataAPI, PooledArrays, CodecLz4, CodecZstd, - TimeZones, BitIntegers, ConcurrentUtilities @@ -141,4 +140,10 @@ function __init__() return end +if !isdefined(Base, :get_extension) + include("../ext/ArrowTimeZonesExt.jl") + using .ArrowTimeZonesExt + const ToTimestamp = ArrowTimeZonesExt.ToTimestamp +end + end # module Arrow diff --git a/src/eltypes.jl b/src/eltypes.jl index ffc53c03..3a505824 100644 --- a/src/eltypes.jl +++ b/src/eltypes.jl @@ -266,109 +266,6 @@ end const DATETIME = Timestamp{Meta.TimeUnit.MILLISECOND,nothing} -finaljuliatype(::Type{Timestamp{U,TZ}}) where {U,TZ} = ZonedDateTime -finaljuliatype(::Type{Timestamp{U,nothing}}) where {U} = DateTime - -@noinline warntimestamp(U, T) = - @warn "automatically converting Arrow.Timestamp with precision = $U to `$T` which only supports millisecond precision; conversion may be lossy; to avoid converting, pass `Arrow.Table(source; convert=false)" maxlog = - 1 _id = hash((:warntimestamp, U, T)) - -function Base.convert(::Type{ZonedDateTime}, x::Timestamp{U,TZ}) where {U,TZ} - (U === Meta.TimeUnit.MICROSECOND || U == 
Meta.TimeUnit.NANOSECOND) && - warntimestamp(U, ZonedDateTime) - return ZonedDateTime( - Dates.DateTime( - Dates.UTM(Int64(Dates.toms(periodtype(U)(x.x)) + UNIX_EPOCH_DATETIME)), - ), - TimeZone(String(TZ)); - from_utc=true, - ) -end - -function Base.convert(::Type{DateTime}, x::Timestamp{U,nothing}) where {U} - (U === Meta.TimeUnit.MICROSECOND || U == Meta.TimeUnit.NANOSECOND) && - warntimestamp(U, DateTime) - return Dates.DateTime( - Dates.UTM(Int64(Dates.toms(periodtype(U)(x.x)) + UNIX_EPOCH_DATETIME)), - ) -end - -Base.convert(::Type{Timestamp{Meta.TimeUnit.MILLISECOND,TZ}}, x::ZonedDateTime) where {TZ} = - Timestamp{Meta.TimeUnit.MILLISECOND,TZ}( - Int64(Dates.value(DateTime(x, UTC)) - UNIX_EPOCH_DATETIME), - ) -Base.convert(::Type{Timestamp{Meta.TimeUnit.MILLISECOND,nothing}}, x::DateTime) = - Timestamp{Meta.TimeUnit.MILLISECOND,nothing}( - Int64(Dates.value(x) - UNIX_EPOCH_DATETIME), - ) - -function arrowtype(b, ::Type{Timestamp{U,TZ}}) where {U,TZ} - tz = TZ !== nothing ? FlatBuffers.createstring!(b, String(TZ)) : FlatBuffers.UOffsetT(0) - Meta.timestampStart(b) - Meta.timestampAddUnit(b, U) - Meta.timestampAddTimezone(b, tz) - return Meta.Timestamp, Meta.timestampEnd(b), nothing -end - -ArrowTypes.ArrowType(::Type{Dates.DateTime}) = DATETIME -ArrowTypes.toarrow(x::Dates.DateTime) = convert(DATETIME, x) -const DATETIME_SYMBOL = Symbol("JuliaLang.DateTime") -ArrowTypes.arrowname(::Type{Dates.DateTime}) = DATETIME_SYMBOL -ArrowTypes.JuliaType(::Val{DATETIME_SYMBOL}, S) = Dates.DateTime -ArrowTypes.fromarrow(::Type{Dates.DateTime}, x::Timestamp) = convert(Dates.DateTime, x) -ArrowTypes.fromarrow(::Type{Dates.DateTime}, x::Date{Meta.DateUnit.MILLISECOND,Int64}) = - convert(Dates.DateTime, x) -ArrowTypes.default(::Type{Dates.DateTime}) = Dates.DateTime(1, 1, 1, 1, 1, 1) - -ArrowTypes.ArrowType(::Type{ZonedDateTime}) = Timestamp -ArrowTypes.toarrow(x::ZonedDateTime) = - convert(Timestamp{Meta.TimeUnit.MILLISECOND,Symbol(x.timezone)}, x) -const 
ZONEDDATETIME_SYMBOL = Symbol("JuliaLang.ZonedDateTime-UTC") -ArrowTypes.arrowname(::Type{ZonedDateTime}) = ZONEDDATETIME_SYMBOL -ArrowTypes.JuliaType(::Val{ZONEDDATETIME_SYMBOL}, S) = ZonedDateTime -ArrowTypes.fromarrow(::Type{ZonedDateTime}, x::Timestamp) = convert(ZonedDateTime, x) -ArrowTypes.default(::Type{TimeZones.ZonedDateTime}) = - TimeZones.ZonedDateTime(1, 1, 1, 1, 1, 1, TimeZones.tz"UTC") - -# Backwards compatibility: older versions of Arrow saved ZonedDateTime's with this metdata: -const OLD_ZONEDDATETIME_SYMBOL = Symbol("JuliaLang.ZonedDateTime") -# and stored the local time instead of the UTC time. -struct LocalZonedDateTime end -ArrowTypes.JuliaType(::Val{OLD_ZONEDDATETIME_SYMBOL}, S) = LocalZonedDateTime -function ArrowTypes.fromarrow(::Type{LocalZonedDateTime}, x::Timestamp{U,TZ}) where {U,TZ} - (U === Meta.TimeUnit.MICROSECOND || U == Meta.TimeUnit.NANOSECOND) && - warntimestamp(U, ZonedDateTime) - return ZonedDateTime( - Dates.DateTime( - Dates.UTM(Int64(Dates.toms(periodtype(U)(x.x)) + UNIX_EPOCH_DATETIME)), - ), - TimeZone(String(TZ)), - ) -end - -""" - Arrow.ToTimestamp(x::AbstractVector{ZonedDateTime}) - -Wrapper array that provides a more efficient encoding of `ZonedDateTime` elements to the arrow format. In the arrow format, -timestamp columns with timezone information are encoded as the arrow equivalent of a Julia type parameter, meaning an entire column -_should_ have elements all with the same timezone. If a `ZonedDateTime` column is passed to `Arrow.write`, for correctness, it must -scan each element to check each timezone. `Arrow.ToTimestamp` provides a "bypass" of this process by encoding the timezone of the -first element of the `AbstractVector{ZonedDateTime}`, which in turn allows `Arrow.write` to avoid costly checking/conversion and -can encode the `ZonedDateTime` as `Arrow.Timestamp` directly. 
-""" -struct ToTimestamp{A,TZ} <: AbstractVector{Timestamp{Meta.TimeUnit.MILLISECOND,TZ}} - data::A # AbstractVector{ZonedDateTime} -end - -ToTimestamp(x::A) where {A<:AbstractVector{ZonedDateTime}} = - ToTimestamp{A,Symbol(x[1].timezone)}(x) -Base.IndexStyle(::Type{<:ToTimestamp}) = Base.IndexLinear() -Base.size(x::ToTimestamp) = (length(x.data),) -Base.eltype(::Type{ToTimestamp{A,TZ}}) where {A,TZ} = - Timestamp{Meta.TimeUnit.MILLISECOND,TZ} -Base.getindex(x::ToTimestamp{A,TZ}, i::Integer) where {A,TZ} = - convert(Timestamp{Meta.TimeUnit.MILLISECOND,TZ}, getindex(x.data, i)) - struct Interval{U,T} <: ArrowTimeType x::T end