@@ -117,21 +117,31 @@ inttype(::Type{Float128}) = Int128
117117
118118# conversion
119119Float128 (x:: Float128 ) = x
120+ function Float128 (x:: T ) where T <: Base.IEEEFloat
121+ if x=== zero (x)
122+ return reinterpret (Float128, zero (UInt128))
123+ elseif x=== - zero (x)
124+ return reinterpret (Float128, sign_mask (Float128))
125+ else
126+ s = UInt128 (signbit (x))<< 127
127+ if ! isfinite (x)
128+ xf = reinterpret (Unsigned, x) & significand_mask (T)
129+ d = exponent_mask (Float128)
130+ else
131+ f, e = frexp (x)
132+ xf = reinterpret (Unsigned, f) & significand_mask (T)
133+ d = ((e+ exponent_bias (Float128)- 1 ) % UInt128) << significand_bits (Float128)
134+ end
135+ xu = UInt128 (xf)<< (significand_bits (Float128)- significand_bits (T))
136+ return reinterpret (Float128, s| d| xu)
137+ end
138+ end
120139
121- # Float64
122- @assume_effects :foldable Float128 (x:: Float64 ) =
123- Float128 (@quad_ccall (quadoplib. __extenddftf2 (x:: Cdouble ):: Cfloat128 ))
140+ # truncation
124141@assume_effects :foldable Float64 (x:: Float128 ) =
125142 @quad_ccall (quadoplib. __trunctfdf2 (x:: Cfloat128 ):: Cdouble )
126-
127- # Float32
128- @assume_effects :foldable Float128 (x:: Float32 ) =
129- Float128 (@quad_ccall (quadoplib. __extendsftf2 (x:: Cfloat ):: Cfloat128 ))
130143@assume_effects :foldable Float32 (x:: Float128 ) =
131144 @quad_ccall (quadoplib. __trunctfsf2 (x:: Cfloat128 ):: Cfloat )
132-
133- # Float16
134- Float128 (x:: Float16 ) = Float128 (Float32 (x))
135145Float16 (x:: Float128 ) = Float16 (Float64 (x)) # TODO : avoid double rounding
136146
137147# TwicePrecision
@@ -192,75 +202,81 @@ Float128(x::Bool) = x ? Float128(1) : Float128(0)
192202 Float128 (@quad_ccall (quadoplib. __negtf2 (x:: Cfloat128 ):: Cfloat128 ))
193203
194204# Float128 -> Integer
195- @assume_effects :foldable unsafe_trunc (:: Type{Int32} , x:: Float128 ) =
196- @quad_ccall (quadoplib. __fixtfsi (x:: Cfloat128 ):: Int32 )
197-
198- @assume_effects :foldable unsafe_trunc (:: Type{Int64} , x:: Float128 ) =
199- @quad_ccall (quadoplib. __fixtfdi (x:: Cfloat128 ):: Int64 )
200-
201- @assume_effects :foldable unsafe_trunc (:: Type{UInt32} , x:: Float128 ) =
202- @quad_ccall (quadoplib. __fixunstfsi (x:: Cfloat128 ):: UInt32 )
203-
204- @assume_effects :foldable unsafe_trunc (:: Type{UInt64} , x:: Float128 ) =
205- @quad_ccall (quadoplib. __fixunstfdi (x:: Cfloat128 ):: UInt64 )
206-
207- function unsafe_trunc (:: Type{UInt128} , x:: Float128 )
205+ function unsafe_trunc (:: Type{T} , x:: Float128 ) where T<: Base.BitUnsigned
208206 xu = reinterpret (UInt128,x)
209207 k = (Int64 (xu >> 112 ) & 0x07fff ) - 16382 - 113
210208 xu = (xu & significand_mask (Float128)) | 0x0001_0000_0000_0000_0000_0000_0000_0000
211209 if k <= 0
212- UInt128 (xu >> - k)
210+ (xu >> - k) % T
213211 else
214- UInt128 (xu) << k
212+ (xu % T ) << k
215213 end
216214end
217- function unsafe_trunc (:: Type{Int128 } , x:: Float128 )
218- copysign (unsafe_trunc (UInt128 ,x) % Int128 , x)
215+ function unsafe_trunc (:: Type{T } , x:: Float128 ) where T <: Base.BitSigned
216+ copysign (unsafe_trunc (unsigned (T) ,x) % T , x)
219217end
220218trunc (:: Type{Signed} , x:: Float128 ) = trunc (Int,x)
221219trunc (:: Type{Unsigned} , x:: Float128 ) = trunc (Int,x)
222220trunc (:: Type{Integer} , x:: Float128 ) = trunc (Int,x)
223221
224- for Ti in (Int32, Int64, Int128, UInt32, UInt64, UInt128)
225- if Ti <: Unsigned || sizeof (Ti) < sizeof (Float128)
226- # Here `Float128(typemin(Ti))-1` is exact, so we can compare the lower-bound
227- # directly. `Float128(typemax(Ti))+1` is either always exactly representable, or
228- # rounded to `Inf` (e.g. when `Ti==UInt128 && Float128==Float32`).
229- @eval begin
230- function trunc (:: Type{$Ti} ,x:: Float128 )
231- if Float128 (typemin ($ Ti)) - one (Float128) < x < Float128 (typemax ($ Ti)) + one (Float128)
232- return unsafe_trunc ($ Ti,x)
233- else
234- throw (InexactError (:trunc , $ Ti, x))
235- end
222+ for Ti in (UInt8, UInt16, UInt32, UInt64, UInt128)
223+ # Here `Float128(typemin(Ti))` is 0, so we can check the sign of the float
224+ # `Float128(typemax(Ti))+1` is exactly representable
225+ @eval begin
226+ function trunc (:: Type{$Ti} ,x:: Float128 )
227+ if ! signbit (x) && x < Float128 (typemax ($ Ti)) + one (Float128)
228+ return unsafe_trunc ($ Ti,x)
229+ else
230+ throw (InexactError (:trunc , $ Ti, x))
236231 end
237- function ( :: Type{$Ti} )(x :: Float128 )
238- if ( Float128 ( typemin ( $ Ti)) <= x <= Float128 ( typemax ( $ Ti))) && ( round (x, RoundToZero) == x )
239- return unsafe_trunc ( $ Ti, x)
240- else
241- throw ( InexactError ( $ ( Expr ( :quote ,Ti . name . name)), $ Ti, x))
242- end
232+ end
233+ function ( :: Type{ $Ti} )(x :: Float128 )
234+ if ( ! signbit (x) && x <= Float128 ( typemax ( $ Ti))) && ( round (x, RoundToZero) == x)
235+ return unsafe_trunc ( $ Ti,x)
236+ else
237+ throw ( InexactError ( $ ( Expr ( :quote ,Ti . name . name)), $ Ti, x))
243238 end
244239 end
245- else
246- # Here `eps(Float128(typemin(Ti))) > 1`, so the only value which can be truncated to
247- # `Float128(typemin(Ti)` is itself. Similarly, `Float128(typemax(Ti))` is inexact and will
248- # be rounded up. This assumes that `Float128(typemin(Ti)) > -Inf`, which is true for
249- # these types, but not for `Float16` or larger integer types.
250- @eval begin
251- function trunc (:: Type{$Ti} ,x:: Float128 )
252- if Float128 (typemin ($ Ti)) <= x < Float128 (typemax ($ Ti))
253- return unsafe_trunc ($ Ti,x)
254- else
255- throw (InexactError (:trunc , $ Ti, x))
256- end
240+ end
241+ end
242+ for Ti in (Int8, Int16, Int32, Int64)
243+ # Here `Float128(typemin(Ti))-1` is exact, so we can compare the lower-bound
244+ # directly. `Float128(typemax(Ti))+1` is always exactly representable
245+ @eval begin
246+ function trunc (:: Type{$Ti} ,x:: Float128 )
247+ if Float128 (typemin ($ Ti)) - one (Float128) < x < Float128 (typemax ($ Ti)) + one (Float128)
248+ return unsafe_trunc ($ Ti,x)
249+ else
250+ throw (InexactError (:trunc , $ Ti, x))
257251 end
258- function (:: Type{$Ti} )(x:: Float128 )
259- if (Float128 (typemin ($ Ti)) <= x < Float128 (typemax ($ Ti))) && (round (x, RoundToZero) == x)
260- return unsafe_trunc ($ Ti,x)
261- else
262- throw (InexactError ($ (Expr (:quote ,Ti. name. name)), $ Ti, x))
263- end
252+ end
253+ function (:: Type{$Ti} )(x:: Float128 )
254+ if (Float128 (typemin ($ Ti)) <= x <= Float128 (typemax ($ Ti))) && (round (x, RoundToZero) == x)
255+ return unsafe_trunc ($ Ti,x)
256+ else
257+ throw (InexactError ($ (Expr (:quote ,Ti. name. name)), $ Ti, x))
258+ end
259+ end
260+ end
261+ end
262+ for Ti in (Int128,)
263+ # Here `eps(Float128(typemin(Ti))) > 1`, so the only value which can be truncated to
264+ # `Float128(typemin(Ti)` is itself. Similarly, `Float128(typemax(Ti))` is inexact and will
265+ # be rounded up. This assumes that `Float128(typemin(Ti)) > -Inf`, which is true for
266+ # Int128, but possibly not for larger integer types.
267+ @eval begin
268+ function trunc (:: Type{$Ti} ,x:: Float128 )
269+ if Float128 (typemin ($ Ti)) <= x < Float128 (typemax ($ Ti))
270+ return unsafe_trunc ($ Ti,x)
271+ else
272+ throw (InexactError (:trunc , $ Ti, x))
273+ end
274+ end
275+ function (:: Type{$Ti} )(x:: Float128 )
276+ if (Float128 (typemin ($ Ti)) <= x < Float128 (typemax ($ Ti))) && (round (x, RoundToZero) == x)
277+ return unsafe_trunc ($ Ti,x)
278+ else
279+ throw (InexactError ($ (Expr (:quote ,Ti. name. name)), $ Ti, x))
264280 end
265281 end
266282 end
@@ -278,7 +294,10 @@ for f in (:acos, :acosh, :asin, :asinh, :atan, :atanh, :cosh, :cos,
278294 end
279295end
280296
281- @assume_effects :foldable abs (x:: Float128 ) = Float128 (@quad_ccall (libquadmath. fabsq (x:: Cfloat128 ):: Cfloat128 ))
297+ function abs (x:: Float128 )
298+ # mask out sign
299+ reinterpret (Float128, reinterpret (UInt128, x)& (~ (UInt128 (1 )<< 127 )))
300+ end
282301@assume_effects :foldable round (x:: Float128 ) = Float128 (@quad_ccall (libquadmath. rintq (x:: Cfloat128 ):: Cfloat128 ))
283302round (x:: Float128 , r:: RoundingMode{:Down} ) = floor (x)
284303round (x:: Float128 , r:: RoundingMode{:Up} ) = ceil (x)
@@ -317,9 +336,16 @@ sincos(x::Float128) = (sin(x), cos(x))
317336 Float128 (@quad_ccall (libquadmath. fmaq (x:: Cfloat128 , y:: Cfloat128 , z:: Cfloat128 ):: Cfloat128 ))
318337end
319338
320- @assume_effects :foldable isnan (x:: Float128 ) = 0 != @quad_ccall (libquadmath. isnanq (x:: Cfloat128 ):: Cint )
321- @assume_effects :foldable isinf (x:: Float128 ) = 0 != @quad_ccall (libquadmath. isinfq (x:: Cfloat128 ):: Cint )
322- @assume_effects :foldable isfinite (x:: Float128 ) = 0 != @quad_ccall (libquadmath. finiteq (x:: Cfloat128 ):: Cint )
339+ function isinf (x:: Float128 )
340+ xu = reinterpret (UInt128, x)
341+ # xu must be either 0x7fff_0000... or 0xffff_0000...
342+ return xu in (exponent_mask (Float128), exponent_mask (Float128)&~ (UInt128 (1 )<< 127 ))
343+ end
344+ function isfinite (x:: Float128 )
345+ xu = reinterpret (UInt128, x)
346+ return xu>> significand_bits (Float128)& 0x7fff != 0x7fff
347+ end
348+ isnan (x:: Float128 ) = ! (isfinite (x) || isinf (x))
323349
324350isinteger (x:: Float128 ) = isfinite (x) && x === trunc (x)
325351
0 commit comments