Skip to content

Commit 1aedb16

Browse files
authored
Merge pull request #24 from sl-solution/issorted
define issorted/!
2 parents d8ee8b3 + 9ceaa26 commit 1aedb16

File tree

4 files changed

+273
-0
lines changed

4 files changed

+273
-0
lines changed

docs/src/man/sorting.md

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,3 +308,58 @@ julia> unsort!(ds)
308308
```
309309

310310
## `issorted`/`issorted!`
311+
312+
The `issorted` function checks whether a data set is sorted by the given column(s). Its syntax is `issorted(ds, cols)`; by default the `mapformats` keyword argument is `true` and the `rev` keyword argument is `false`. The `issorted!` function performs the same check and, additionally, when the check returns `true` it marks the input data set as sorted, i.e. it attaches the corresponding meta information to the data set.
313+
314+
### Examples
315+
316+
```jldoctest
317+
julia> ds = Dataset(x1 = [1, 4, 7], x2 = [3.0, 1.1, -10.0], x3 = ["one", "two", "three"])
318+
3×3 Dataset
319+
Row │ x1 x2 x3
320+
│ identity identity identity
321+
│ Int64? Float64? String?
322+
─────┼──────────────────────────────
323+
1 │ 1 3.0 one
324+
2 │ 4 1.1 two
325+
3 │ 7 -10.0 three
326+
327+
julia> issorted(ds, 1)
328+
true
329+
330+
julia> issorted(ds, 2)
331+
false
332+
333+
julia> issorted(ds, 2, rev = true)
334+
true
335+
336+
julia> fmt(x) = x == "one" ? 1 : x=="two" ? 2 : 3
337+
fmt (generic function with 1 method)
338+
339+
julia> setformat!(ds, :x3=>fmt)
340+
3×3 Dataset
341+
Row │ x1 x2 x3
342+
│ identity identity fmt
343+
│ Int64? Float64? String?
344+
─────┼─────────────────────────────
345+
1 │ 1 3.0 1
346+
2 │ 4 1.1 2
347+
3 │ 7 -10.0 3
348+
349+
julia> issorted(ds, 3)
350+
true
351+
352+
julia> issorted!(ds, 1:3, rev = [false, true, false])
353+
true
354+
355+
julia> ds
356+
3×3 Sorted Dataset
357+
Sorted by: x1, x2, x3
358+
Row │ x1 x2 x3
359+
│ identity identity fmt
360+
│ Int64? Float64? String?
361+
─────┼─────────────────────────────
362+
1 │ 1 3.0 1
363+
2 │ 4 1.1 2
364+
3 │ 7 -10.0 3
365+
```

src/InMemoryDatasets.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ export
7070
groupby,
7171
gatherby,
7272
describe,
73+
issorted!,
7374
unsort!,
7475
ungroup!,
7576
modify,

src/sort/sort.jl

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,84 @@ function unsort!(ds::Dataset)
133133
ds
134134
end
135135
end
136+
137+
"""
    issorted(ds::AbstractDataset, cols; rev = false, mapformats = true)

Return `true` if `ds` is sorted by the columns selected by `cols`, `false` otherwise.

The check is delegated to `_issorted`; only the first element of its result (the
boolean outcome) is returned.

# Keywords
- `rev`: forwarded to `_issorted`; a single value or a vector with one entry per
  selected column.
- `mapformats`: forwarded to `_issorted`; when `true` the column formats are used
  for the comparison.
"""
function Base.issorted(ds::AbstractDataset, cols::MultiColumnIndex; rev = false, mapformats = true)
    # Use 32-bit group-start positions when the row count allows it.
    inttype = nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64)
    outcome = _issorted(ds, cols, inttype; rev = rev, mapformats = mapformats)
    return first(outcome)
end
140+
# Single-column convenience method: wraps `col` in a one-element vector and defers
# to the multi-column method.
function Base.issorted(ds::AbstractDataset, col::ColumnIndex; rev = false, mapformats = true)
    return issorted(ds, [col]; rev = rev, mapformats = mapformats)
end
141+
142+
"""
    issorted!(ds::Dataset, cols; rev = false, mapformats = true)

Check whether `ds` is sorted by the columns selected by `cols` and return the
outcome as a `Bool`.

When the check succeeds, the grouping information of `ds` is reset and the index of
`ds` is updated with the checked columns, their `rev` flags, the identity
permutation `1:nrow(ds)`, the group start positions, the number of groups and the
`mapformats` flag. When the check fails, `ds` is left untouched.
"""
function issorted!(ds::Dataset, cols::MultiColumnIndex; rev = false, mapformats = true)
    # Use 32-bit group-start positions when the row count allows it.
    inttype = nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64)
    sorted, starts, ngroups, colsidx, revs, usefmt =
        _issorted(ds, cols, inttype; rev = rev, mapformats = mapformats)

    # Nothing is recorded unless the data set is actually sorted.
    sorted || return sorted

    _reset_grouping_info!(ds)
    idx = index(ds)
    append!(idx.sortedcols, collect(colsidx))
    append!(idx.rev, revs)
    # The rows are already in order, so the stored permutation is the identity.
    append!(idx.perm, collect(1:nrow(ds)))
    append!(idx.starts, starts)
    idx.ngroups[] = ngroups
    idx.fmt[] = usefmt
    return sorted
end
155+
156+
# Single-column convenience method: wraps `col` in a one-element vector and defers
# to the multi-column method.
function issorted!(ds::Dataset, col::ColumnIndex; rev = false, mapformats = true)
    return issorted!(ds, [col]; rev = rev, mapformats = mapformats)
end
157+
158+
"""
    _issorted(ds, cols, ::Val{T}; rev = false, mapformats = true)

Check whether `ds` is sorted by the columns selected by `cols`, comparing values
with `isless`.

The columns are processed in order. For each column, every current group of rows
(initially the whole data set) must be sorted on its own; afterwards the group
boundaries are refined with that column before moving on to the next one.

# Arguments
- `cols`: column selector, resolved through `index(ds)`.
- `Val{T}`: integer type used for the vector of group start positions.

# Keywords
- `rev`: a single value applied to every column, or a vector with one entry per
  selected column.
- `mapformats`: when `true` each column is compared through its format (obtained
  with `getformat`), otherwise through `identity`.

# Returns
The tuple `(res, starts, lastvalid, colsidx, revs, mapformats)`, where `res` is the
outcome of the check, `starts[1:lastvalid]` are the group start positions found so
far, `colsidx` are the resolved column indices and `revs` the per-column `rev`
values. For a data set without rows the result is `true` with `lastvalid == 0`.
"""
function _issorted(ds, cols::MultiColumnIndex, ::Val{T}; rev = false, mapformats = true) where T
    colsidx = index(ds)[cols]
    if rev isa AbstractVector
        @assert length(rev) == length(colsidx) "length of rev and the number of selected columns must match"
        revs = rev
    else
        revs = fill(rev, length(colsidx))
    end

    # NOTE(review): formats are looked up on `parent(ds)` with indices resolved
    # against `index(ds)`, while the values are read from `_columns(ds)`. For a view
    # that selects or reorders columns these may not refer to the same column —
    # confirm against the view implementation.
    by = Function[mapformats ? getformat(parent(ds), c) : identity for c in colsidx]

    nrows = nrow(ds)
    starts = Vector{T}(undef, nrows)
    # A data set without rows is trivially sorted and has no groups. Returning here
    # also avoids indexing the empty `starts`/`inbits` vectors below.
    nrows == 0 && return (true, starts, 0, colsidx, revs, mapformats)

    # Start with a single group that spans all rows.
    starts[1] = 1
    lastvalid = 1
    inbits = zeros(Bool, nrows)
    inbits[1] = true
    for j in 1:length(colsidx)
        v = _columns(ds)[colsidx[j]]
        ordering = Base.Order.ord(isless, by[j], revs[j])
        # Column j only has to be ordered inside each group formed by the
        # previously checked columns.
        for rng in 1:lastvalid
            lo = starts[rng]
            hi = rng == lastvalid ? nrows : starts[rng + 1] - 1
            if !_issorted_barrier(v, ordering, lo, hi)
                return (false, starts, lastvalid, colsidx, revs, mapformats)
            end
        end
        # Presumably flags in `inbits` the rows where a new group starts once
        # column j is taken into account; `starts` is then rebuilt from those flags.
        _find_starts_of_groups!(v, 1:nrows, by[j], inbits)
        lastvalid = _fill_starts_from_inbits!(starts, inbits)
        # Every row is its own group: the remaining columns cannot break the order.
        lastvalid == nrows && return (true, starts, lastvalid, colsidx, revs, mapformats)
    end
    return (true, starts, lastvalid, colsidx, revs, mapformats)
end
198+
199+
"""
    _fill_starts_from_inbits!(starts, inbits)

Write the positions of the `true` entries of `inbits`, in increasing order, into the
leading entries of `starts` and return how many positions were written.

`starts` must be long enough to hold one entry per `true` flag; the writes are not
bounds-checked.
"""
function _fill_starts_from_inbits!(starts, inbits)
    nfound = 0
    @inbounds for (pos, flag) in enumerate(inbits)
        flag || continue
        nfound += 1
        starts[nfound] = pos
    end
    return nfound
end
209+
210+
"""
    _issorted_barrier(v, _ord, lo, hi)

Return `true` if `v[lo:hi]` is sorted with respect to the ordering `_ord`, i.e. no
element in that range is strictly less (per `Base.Order.lt`) than its predecessor.
A range with fewer than two elements is always considered sorted.
"""
function _issorted_barrier(v, _ord, lo, hi)
    # Walk over each adjacent pair (prev, prev + 1); the range is empty when lo >= hi.
    for prev in lo:(hi - 1)
        if Base.Order.lt(_ord, v[prev + 1], v[prev])
            return false
        end
    end
    return true
end

test/sort.jl

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,3 +371,139 @@ end
371371
@test sort(ds, :) == sort(ds[!, 1:2], :)
372372
end
373373
end
374+
375+
# Tests for `issorted` (non-mutating check) and `issorted!` (check that also marks
# the data set as sorted).
@testset "issorted/issorted!" begin
    # Columns with missing values and an ordered categorical column.
    dv1 = [9, 1, 8, missing, 3, 3, 7, missing]
    dv2 = [9, 1, 8, missing, 3, 3, 7, missing]
    dv3 = Vector{Union{Int, Missing}}(1:8)
    cv1 = CategoricalArray(dv1, ordered=true)

    d = Dataset(dv1=dv1, dv2=dv2, dv3=dv3, cv1=cv1)

    @test !issorted(d, :cv1)
    @test issorted(d, :dv3)
    @test !issorted(d, :dv1)

    dv1 = [1,3,3,7,8,9, missing, missing]
    dv2 = [9, 1, 8, missing, 3, 3, 7, missing]
    dv3 = Vector{Union{Int, Missing}}(1:8)
    cv1 = CategoricalArray(dv1, ordered=true)

    d = Dataset(dv1=dv1, dv2=dv2, dv3=dv3, cv1=cv1)
    @test issorted(d, :cv1)
    @test issorted(d, :dv1)
    @test !issorted(d, :dv2)

    # Large unsigned values, row-permuted copies and views, with and without a format.
    ds = Dataset(x = [0xfffffffffffffff3, 0xfffffffffffffff2, 0xfffffffffffffff4, 0xfffffffffffffff1], y = [1,1,2,2])
    @test issorted(ds[[4,2,1,3],:],1)
    @test issorted(view(ds, [4,2,1,3], :), 1)
    @test issorted(ds[[3,1,2,4],:],1, rev = true)
    setformat!(ds, 1=>isodd)
    @test issorted(ds[[2,3,1,4],:],1)
    @test issorted(view(ds, [2,3,1,4], :), 1)
    @test issorted(ds[[1,4,2,3],:],1, rev=true)
    @test issorted(ds[[2,3,1,4], :], 1:2)
    @test issorted(view(ds, [2,3,1,4], :), 1:2)
    @test issorted(ds[[3,2,4,1], :], 1:2, rev = [false, true])
    @test issorted(view(ds, [3,2,4,1], :), 1:2, rev = [false, true])


    x = rand(Int128, 1000)
    y = rand(1:100, 1000)
    ds = Dataset(x = x, y = y)
    @test issorted(sort(ds, 1),1)
    @test issorted(sort(ds, 1, rev = true), 1, rev=true)
    setformat!(ds, 1=>isodd)
    @test issorted(sort(ds, 1),1)
    @test issorted(sort(ds, 1, rev = true), 1, rev = true)

    # BigInt column, including a missing value.
    ds = Dataset(x = big.([1,4,-1,1,100]), x2 = [45,3,98,100,10])
    @test !issorted(ds, 1)
    @test issorted(ds[[3,1,4,2,5], 1:1], 1)
    @test issorted(view(ds, [5,2,1,4,3], [2,1]), 2, rev = true)
    @test issorted(ds[[3, 1, 4, 2, 5], :], 1:2)
    @test issorted(ds[[3,4,1,2,5],:], 1:2, rev = [false, true])
    ds[2,1]=missing
    @test !issorted(ds, 1)
    @test issorted(ds[[3,1,4,5,2], :], 1)
    @test issorted(view(ds, [2,5,1,4,3], :), 1, rev = true)

    # Randomized round trips: the output of `sort` must be reported as sorted.
    for i in 1:100
        ds = Dataset(rand(1:10, 1000, 10), :auto)
        for j in 1:10
            @test issorted(sort(ds, 1:j), 1:j)
            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
            setformat!(ds, 1:10=>isodd)
            @test issorted(sort(ds, 1:j), 1:j)
            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
        end
        ds = Dataset(rand(1:10., 1000, 10), :auto)
        map!(ds, x->rand()<.1 ? missing : x, :)
        for j in 1:10
            @test issorted(sort(ds, 1:j), 1:j)
            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
            setformat!(ds, 1:10=>sign)
            @test issorted(sort(ds, 1:j), 1:j)
            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
        end
        ds = Dataset(rand(1:10., 1000, 10), :auto)
        map!(ds, x->rand()<.1 ? missing : x, :)
        for j in 1:10
            ds[!, j] = PooledArray(ds[!, j])
        end
        for j in 1:10
            @test issorted(sort(ds, 1:j), 1:j)
            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
            setformat!(ds, 1:10=>sign)
            @test issorted(sort(ds, 1:j), 1:j)
            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
        end
    end
    # `issorted!` is called right after `sort!` on the same columns, which presumably
    # has already recorded the sort metadata; the return value of `issorted!` is
    # therefore asserted as well, so a silently failing check is not masked.
    for i in 1:100
        ds = Dataset(rand(1:10, 1000, 10), :auto)
        for j in 1:10
            sort!(ds, 1:j)
            @test issorted!(ds, 1:j)
            @test IMD._sortedcols(ds) == 1:j
            @test issorted(ds, 1:j)

            setformat!(ds, 1:10=>isodd)
            sort!(ds, 1:j, rev = true)
            @test issorted!(ds, 1:j, rev = true)
            @test IMD._sortedcols(ds) == 1:j
            @test issorted(ds, 1:j, rev = true)
        end
        ds = Dataset(rand(1:10., 1000, 10), :auto)
        map!(ds, x->rand()<.1 ? missing : x, :)
        for j in 1:10
            sort!(ds, 1:2:j)
            @test issorted!(ds, 1:2:j)
            @test IMD._sortedcols(ds) == collect(1:2:j)
            @test issorted(ds, 1:2:j)

            setformat!(ds, 1:10=>sign)
            sort!(ds, 1:2:j, rev = true)
            @test issorted!(ds, 1:2:j, rev = true)
            @test IMD._sortedcols(ds) == collect(1:2:j)
            @test issorted(ds, 1:2:j, rev = true)
        end
        ds = Dataset(rand(1:10., 1000, 10), :auto)
        map!(ds, x->rand()<.1 ? missing : x, :)
        for j in 1:10
            ds[!, j] = PooledArray(ds[!, j])
        end
        for j in 1:10
            sort!(ds, 1:2:j)
            @test issorted!(ds, 1:2:j)
            @test IMD._sortedcols(ds) == collect(1:2:j)
            @test issorted(ds, 1:2:j)

            setformat!(ds, 1:10=>sign)
            sort!(ds, 1:2:j, rev = true)
            @test issorted!(ds, 1:2:j, rev = true)
            @test IMD._sortedcols(ds) == collect(1:2:j)
            @test issorted(ds, 1:2:j, rev = true)
        end
    end

end

0 commit comments

Comments
 (0)