Skip to content

Commit 08b26f1

Browse files
committed
Characters only supports showing ASCII
1 parent 4a6d1fc commit 08b26f1

File tree

4 files changed

+48
-101
lines changed

4 files changed

+48
-101
lines changed

src/characters/characters.jl

Lines changed: 36 additions & 87 deletions
Original file line number | Diff line number | Diff line change
@@ -2,18 +2,18 @@
22

33
import Base: iterate, lastindex, getindex, sizeof, length, ncodeunits, codeunit, isvalid, read, write, setindex!, string, convert
44

5-
struct Characters{N, M} <: AbstractString
6-
data::NTuple{N, M}
7-
function Characters{N, M}(v::Vector{UInt8}) where N where M
8-
new(NTuple{N, M}(v))
5+
struct Characters{N} <: AbstractString
6+
data::NTuple{N, UInt8}
7+
function Characters{N}(v::Vector{UInt8}) where N
8+
new(NTuple{N, UInt8}(v))
99
end
10-
function Characters{N, M}(itr) where {N} where {M}
10+
function Characters{N}(itr) where {N}
1111
isempty(itr) && return missing
12-
new(NTuple{N, M}(rpad(itr, N)))
12+
new(NTuple{N, UInt8}(rpad(itr, N)))
1313
end
1414
end
1515

16-
function Characters{N, M}(v::Vector{UInt8}, v2) where N where M
16+
function Characters{N}(v::Vector{UInt8}, v2) where N
1717

1818
@simd for i in 1:min(N, length(v))
1919
@inbounds v2[i] = v[i]
@@ -22,122 +22,74 @@ function Characters{N, M}(v::Vector{UInt8}, v2) where N where M
2222
@inbounds v2[i] = 0x20
2323
end
2424

25-
Characters{N, M}(v2)
25+
Characters{N}(v2)
2626
end
27-
function Characters{N, M}(v::Vector{UInt8}, v2, st, en) where N where M
27+
function Characters{N}(v::Vector{UInt8}, v2, st, en) where N
2828
o1 = min(N, en-st+1)
2929
copyto!(v2, 1, v, st, o1)
30-
# for i in 1:o1
31-
# @inbounds v2[i] = v[st+i-1]
32-
# end
3330
@simd for i in o1+1:N
3431
@inbounds v2[i] = 0x20
3532
end
3633

37-
Characters{N, M}(v2)
38-
end
39-
40-
function Characters{N}(itr) where {N}
41-
Characters{N, UInt8}(itr)
34+
Characters{N}(v2)
4235
end
4336

4437
Characters(s::Characters) = s
4538

4639
function Characters(s::AbstractString)
4740
isempty(s) && return missing
48-
sl = cld(sizeof(s), length(s))
49-
if sl == 1
50-
Characters{length(s), UInt8}(s)
51-
else
52-
Characters{length(s), UInt16}(s)
53-
end
54-
# else
55-
# throw(ArgumentError("Characters only support UInt8 and UInt16"))
56-
# end
41+
Characters{ncodeunits(s)}(collect(codeunits(s)))
42+
5743
end
5844

5945
macro c_str(str)
6046
Characters(str)
6147
end
6248

63-
function Base.print(io::IO, s::Characters)
64-
# s_end = length(s)
65-
# @inbounds for i in length(s):-1:1
66-
# s.data[i] == 0x20 ? s_end -= 1 : break
67-
# end
68-
print(io, String(view(s, 1:length(s))))
49+
function Base.String(s::T) where T <: Characters
50+
len = ncodeunits(s)
51+
out = Base._string_n(len)
52+
ref = Ref{T}(s)
53+
GC.@preserve ref out begin
54+
ptr = convert(Ptr{UInt8}, Base.unsafe_convert(Ptr{T}, ref))
55+
unsafe_copyto!(pointer(out), ptr, len)
56+
end
57+
return out
58+
end
59+
60+
61+
function Base.print(io::IO, s::T) where T<:Characters
62+
print(io, String(s))
6963
end
7064
Base.string(s::Characters) = String(s)
7165

7266
function Base.:(==)(s1::Characters, s2::Characters)
73-
# s1end = length(s1)
74-
# s2end = length(s2)
75-
# @inbounds for i in length(s1):-1:1
76-
# s1.data[i] == 0x20 ? s1end -= 1 : break
77-
# end
78-
# @inbounds for i in length(s2):-1:1
79-
# s2.data[i] == 0x20 ? s2end -= 1 : break
80-
# end
81-
# s1end != s2end && return false
82-
# @inbounds for i in 1:s1end
83-
# s1.data[i] != s2.data[i] && return false
84-
# end
85-
# return true
86-
# return view(s1, 1:length(s1)) == view(s2, 1:length(s2))
8767
cmp(s1,s2) == 0
8868
end
8969

9070
function Base.:(==)(s1::Characters, s2::AbstractString)
91-
# M = max(N, length(s2))
92-
if codeunit(s1) == UInt8
93-
return view(codeunits(s1), 1:length(s1)) == codeunits(s2)
94-
else
95-
s1 == Characters(s2)
96-
end
71+
return view(codeunits(s1), 1:length(s1)) == codeunits(s2)
72+
9773
end
9874
Base.:(==)(s1::AbstractString, s2::Characters) = s2 == s1
9975

10076
Base.isequal(s1::Characters, s2::Characters) = cmp(s1, s2) == 0#s1 == s2
10177
function Base.isequal(s1::Characters, s2::AbstractString)
102-
# M = max(N, length(s2))
103-
if codeunit(s1) == UInt8
104-
return isequal(view(codeunits(s1), 1:length(s1)), codeunits(s2))
105-
else
106-
isequal(s1, Characters(s2))
107-
end
78+
return isequal(view(codeunits(s1), 1:length(s1)), codeunits(s2))
10879

10980
end
11081
Base.isequal(s1::AbstractString, s2::Characters) = isequal(s2, s1)
11182

11283
function Base.isless(s1::Characters, s2::Characters)
113-
# s1end = length(s1)
114-
# s2end = length(s2)
115-
# @inbounds for i in length(s1):-1:1
116-
# s1.data[i] == 0x20 ? s1end -= 1 : break
117-
# end
118-
# @inbounds for i in length(s2):-1:1
119-
# s2.data[i] == 0x20 ? s2end -= 1 : break
120-
# end
121-
# isless(view(s1, 1:length(s1)), view(s2, 1:length(s2)))
12284
cmp(s1,s2)<0
12385
end
12486

12587

12688
function Base.isless(s1::Characters, s2::AbstractString)
127-
# M = max(N, length(s2))
128-
if codeunit(s1) == UInt8
129-
return isless(view(codeunits(s1), 1:length(s1)), codeunits(s2))
130-
else
131-
isless(s1, Characters(s2))
132-
end
89+
return isless(view(codeunits(s1), 1:length(s1)), codeunits(s2))
13390
end
13491
function Base.isless(s1::AbstractString, s2::Characters)
135-
# M = max(N, length(s1))
136-
if codeunit(s2) == UInt8
137-
return isless(codeunits(s1), view(codeunits(s2), 1:length(s2)))
138-
else
139-
isless(Characters(s1), s2)
140-
end
92+
return isless(codeunits(s1), view(codeunits(s2), 1:length(s2)))
14193
end
14294

14395
function iterate(s::Characters{N}, i::Int = 1) where N
@@ -161,11 +113,10 @@ function length(s::Characters)
161113
end
162114

163115

164-
ncodeunits(s::Characters) = length(s.data)
116+
ncodeunits(s::Characters) = length(s)
165117

166-
codeunit(::Type{Characters{N, M}}) where N where M = M
167-
codeunit(::Characters{N, M}) where N where M = M
168118
codeunit(::Type{Characters{N}}) where N = UInt8
119+
codeunit(::Characters) = UInt8
169120
codeunit(s::Characters, i::Integer) = s.data[i]
170121

171122
isvalid(s::Characters, i::Int) = checkbounds(Bool, s, i)
@@ -174,18 +125,16 @@ Characters(s::Symbol) = Characters(string(s))
174125

175126
Characters(::Missing) = missing
176127
Characters{N}(::Missing) where N = missing
177-
Characters{N, M}(::Missing) where N where M = missing
178128

179-
function read(io::IO, T::Type{Characters{N, M}}) where N where M
129+
function read(io::IO, T::Type{Characters{N}}) where N
180130
return read!(io, Ref{T}())[]::T
181131
end
182132

183-
function write(io::IO, s::Characters{N, M}) where N where M
133+
function write(io::IO, s::Characters{N}) where N
184134
return write(io, Ref(s))
185135
end
186136

187-
# TODO I don't know how I should do this for UInt16
188-
function Base.hash(s::Characters{N, UInt8}, h::UInt) where N
137+
function Base.hash(s::Characters{N}, h::UInt) where N
189138
h += Base.memhash_seed
190139
ref = Ref(s.data)
191140
ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), ref, length(s), h % UInt32) + h

test/characters.jl

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,14 +2,12 @@ using Test
22

33

44
@testset "characters construction" begin
5-
@test Characters{3, UInt16}("abα") == "abα"
65
@test Characters{2}("abc") == "ab"
76
@test length(Characters{12}("12 ")) == 2
87
@test length(Characters{3}("helloworld")) == 3
98
@test String(Characters{12}(" abc ")) == " abc"
109
@test isequal(Characters(""), missing)
11-
@test isequal(Characters{3, UInt16}.(["a", "b", "", missing]), ["a","b", missing, missing])
12-
@test isequal(Characters{3, UInt8}.(["a", "b", "", missing]), ["a","b", missing, missing])
10+
@test isequal(Characters{3}.(["a", "b", "", missing]), ["a","b", missing, missing])
1311
end
1412

1513
@testset "characters comparison" begin

test/join.jl

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -792,8 +792,8 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016-
792792
@test close10_v == close10_t
793793
@test close11_v == close11_t
794794

795-
dsl = Dataset([[Characters{1, UInt8}(randstring(1)) for _ in 1:10^5] for _ in 1:3], :auto)
796-
dsr = Dataset([[Characters{1, UInt8}(randstring(1)) for _ in 1:10^5] for _ in 1:3], :auto)
795+
dsl = Dataset([[Characters{1}(randstring(1)) for _ in 1:10^5] for _ in 1:3], :auto)
796+
dsr = Dataset([[Characters{1}(randstring(1)) for _ in 1:10^5] for _ in 1:3], :auto)
797797
left1 = leftjoin(dsl, dsr, on = [:x1, :x2], makeunique = true, accelerate = true, stable =true, check = false)
798798
left2 = leftjoin(dsl, dsr, on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false)
799799

@@ -808,8 +808,8 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016-
808808
@test left1 == left2
809809
@test unique(select!(left1, [:x1, :x2, :x3]), [:x1, :x2]) == unique(dsl, [:x1, :x2])
810810

811-
dsl = Dataset([[Characters{1, UInt8}(randstring(1)) for _ in 1:10^5] for _ in 1:3], :auto)
812-
dsr = Dataset([[Characters{1, UInt8}(randstring(1)) for _ in 1:10^5] for _ in 1:3], :auto)
811+
dsl = Dataset([[Characters{1}(randstring(1)) for _ in 1:10^5] for _ in 1:3], :auto)
812+
dsr = Dataset([[Characters{1}(randstring(1)) for _ in 1:10^5] for _ in 1:3], :auto)
813813
for i in 1:3
814814
dsl[!, i] = PooledArray(dsl[!, i])
815815
dsr[!, i] = PooledArray(dsr[!, i])
@@ -838,7 +838,7 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016-
838838
x2 = rand(1:100, 5000)
839839
y = rand(5000)
840840
y2 = rand(5000)
841-
dsl = Dataset(x1 = Characters{6, UInt8}.(c"id" .* string.(x1)), x2 = Characters{5, UInt8}.(c"id" .* string.(x2)), y = y)
841+
dsl = Dataset(x1 = Characters{6}.(c"id" .* string.(x1)), x2 = Characters{5}.(c"id" .* string.(x2)), y = y)
842842
dsr = Dataset(x1 = x1, x2 = x2, y2 = y2)
843843
fmtfun(x) = @views parse(Int, x[3:end])
844844
setformat!(dsl, 1:2=>fmtfun)
@@ -875,7 +875,7 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016-
875875
@test left1 == leftjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, method = :hash, threads = false)
876876

877877
@test inn1 == out1 == left1
878-
fmtfun2(x) = c"id" * Characters{4, UInt8}(x)
878+
fmtfun2(x) = c"id" * Characters{4}(x)
879879
setformat!(dsr, 1:2=>fmtfun2)
880880
semi1 = semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [false, true])
881881
@test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [false, true], method = :hash)
@@ -906,7 +906,7 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016-
906906
x2 = rand(1:100, 5000)
907907
y = rand(5000)
908908
y2 = rand(5000)
909-
dsl = Dataset(x1 = Characters{6, UInt8}.(c"id" .* string.(x1)), x2 = Characters{5, UInt8}.(c"id" .* string.(x2)), y = y)
909+
dsl = Dataset(x1 = Characters{6}.(c"id" .* string.(x1)), x2 = Characters{5}.(c"id" .* string.(x2)), y = y)
910910
dsr = Dataset(x1 = x1, x2 = x2, y2 = y2)
911911
for i in 1:2
912912
dsl[!, i] = PooledArray(dsl[!, i])
@@ -965,7 +965,7 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016-
965965
x2 = -rand(1:100, 5000)
966966
y = rand(5000)
967967
y2 = rand(5000)
968-
dsl = Dataset(x1 = Characters{6, UInt8}.(c"id" .* string.(-x1)), x2 = Characters{5, UInt8}.(c"id" .* string.(-x2)), y = y)
968+
dsl = Dataset(x1 = Characters{6}.(c"id" .* string.(-x1)), x2 = Characters{5}.(c"id" .* string.(-x2)), y = y)
969969
dsr = Dataset(x1 = x1, x2 = x2, y2 = y2)
970970
for i in 1:2
971971
dsl[!, i] = PooledArray(dsl[!, i])

test/sort.jl

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ using InMemoryDatasets, PooledArrays, Random, Test, CategoricalArrays
131131

132132

133133

134-
x = CategoricalArray{Union{Characters{6, UInt8}, String}}(["Old", "Young", "Middle", "Young"])
134+
x = CategoricalArray{Union{Characters{6}, String}}(["Old", "Young", "Middle", "Young"])
135135
levels!(x, ["Young", "Middle", "Old"])
136136
ds = Dataset(x = x)
137137
ds_s = sort(ds, :x)
@@ -166,7 +166,7 @@ using InMemoryDatasets, PooledArrays, Random, Test, CategoricalArrays
166166
end
167167
x1 = -rand(1:1000, 5000)
168168
x2 = -rand(1:100, 5000)
169-
dsl = Dataset(x1 = Characters{6, UInt8}.(c"id" .* string.(-x1)), x2 = Characters{5, UInt8}.(c"id" .* string.(-x2)))
169+
dsl = Dataset(x1 = Characters{6}.(c"id" .* string.(-x1)), x2 = Characters{5}.(c"id" .* string.(-x2)))
170170
dsr = Dataset(x1 = x1, x2 = x2)
171171
for i in 1:2
172172
dsl[!, i] = PooledArray(dsl[!, i])
@@ -366,7 +366,7 @@ end
366366
c1 = PooledArray(["string", "string", 1.1, 1.1, 1.1, 20000.0, 123.0])
367367
c2 = PooledArray(["string", missing, 1.1, 1.1, 'a', 'a', 'b'])
368368
c3 = PooledArray([missing, missing, missing, 1.1, 1.1, 20000.0, 123.0])
369-
c4 = CategoricalArray{Union{Characters{6, UInt8}, String}}(["Old", "Young", "Young", "Young", "Old", "Young", "Middle"])
369+
c4 = CategoricalArray{Union{Characters{6}, String}}(["Old", "Young", "Young", "Young", "Old", "Young", "Middle"])
370370
levels!(c4, ["Young", "Middle", "Old"])
371371
ds = Dataset(x1 = c1, x2 = c2, x3 = c3, x4 = c4)
372372
colsidx = [1, 2]

0 commit comments

Comments
 (0)