Skip to content

Commit d16497b

Browse files
committed
Bring back String.normalize and fix infinite loop
1 parent 7ef6dfc commit d16497b

File tree

3 files changed

+114
-12
lines changed

3 files changed

+114
-12
lines changed

lib/elixir/lib/string.ex

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -455,13 +455,13 @@ defmodule String do
455455
For example, take the grapheme "é" which is made of the characters
456456
"e" and the acute accent. The following will split the string into two parts:
457457
458-
iex> String.split(:unicode.characters_to_nfd_binary("é"), "e")
458+
iex> String.split(String.normalize("é", :nfd), "e")
459459
["", "́"]
460460
461461
However, if "é" is represented by the single character "e with acute"
462462
accent, then it will split the string into just one part:
463463
464-
iex> String.split(:unicode.characters_to_nfc_binary("é"), "e")
464+
iex> String.split(String.normalize("é", :nfc), "e")
465465
["é"]
466466
467467
"""
@@ -650,9 +650,9 @@ defmodule String do
650650
651651
String.normalize(string1, :nfd) == String.normalize(string2, :nfd)
652652
653-
Therefore, if you plan to compare multiple strings, multiple times
654-
in a row, you may normalize them upfront and compare them directly
655-
to avoid multiple normalization passes.
653+
If you plan to compare multiple strings, multiple times in a row, you
654+
may normalize them upfront and compare them directly to avoid multiple
655+
normalization passes.
656656
657657
## Examples
658658
@@ -674,21 +674,49 @@ defmodule String do
674674
normalize(string1, :nfd) == normalize(string2, :nfd)
675675
end
676676

677-
@doc false
678-
@deprecated "Use :unicode.characters_to_nfc_binary/1 or :unicode.characters_to_nfd_binary/1 instead"
677+
@doc """
678+
Converts all characters in `string` to Unicode normalization
679+
form identified by `form`.
680+
681+
Invalid Unicode codepoints are skipped and the remainder of
682+
the string is converted. If you want the algorithm to stop
683+
and return on an invalid codepoint, use `:unicode.characters_to_nfd_binary/1`
684+
or `:unicode.characters_to_nfc_binary/1` instead.
685+
686+
## Forms
687+
688+
The supported forms are:
689+
690+
* `:nfd` - Normalization Form Canonical Decomposition.
691+
Characters are decomposed by canonical equivalence, and
692+
multiple combining characters are arranged in a specific
693+
order.
694+
695+
* `:nfc` - Normalization Form Canonical Composition.
696+
Characters are decomposed and then recomposed by canonical equivalence.
697+
698+
## Examples
699+
700+
iex> String.normalize("yêṩ", :nfd)
701+
"yêṩ"
702+
703+
iex> String.normalize("leña", :nfc)
704+
"leña"
705+
706+
"""
679707
def normalize(string, form)
680708

681709
def normalize(string, :nfd) do
682710
case :unicode.characters_to_nfd_binary(string) do
683711
string when is_binary(string) -> string
684-
{:error, bad, rest} -> bad <> normalize(rest, :nfd)
712+
{:error, good, <<head, rest::binary>>} -> good <> <<head>> <> normalize(rest, :nfd)
685713
end
686714
end
687715

688716
def normalize(string, :nfc) do
689717
case :unicode.characters_to_nfc_binary(string) do
690718
string when is_binary(string) -> string
691-
{:error, bad, rest} -> bad <> normalize(rest, :nfc)
719+
{:error, good, <<head, rest::binary>>} -> good <> <<head>> <> normalize(rest, :nfc)
692720
end
693721
end
694722

@@ -2174,13 +2202,13 @@ defmodule String do
21742202
For example, take the grapheme "é" which is made of the characters
21752203
"e" and the acute accent. The following returns `true`:
21762204
2177-
iex> String.contains?(:unicode.characters_to_nfd_binary("é"), "e")
2205+
iex> String.contains?(String.normalize("é", :nfd), "e")
21782206
true
21792207
21802208
However, if "é" is represented by the single character "e with acute"
21812209
accent, then it will return `false`:
21822210
2183-
iex> String.contains?(:unicode.characters_to_nfc_binary("é"), "e")
2211+
iex> String.contains?(String.normalize("é", :nfc), "e")
21842212
false
21852213
21862214
"""

lib/elixir/pages/Compatibility and Deprecations.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,6 @@ Version | Deprecated feature | Replaced by (ava
8181
[v1.10] | Passing non-chardata to `Logger.log/2` | Explicitly convert to string with `to_string/1` (v1.0)
8282
[v1.10] | `:compile_time_purge_level` in `Logger` app environment | `:compile_time_purge_matching` in `Logger` app environment (v1.7)
8383
[v1.10] | `Supervisor.Spec.supervise/2` | The new child specs outlined in `Supervisor` (v1.5)
84-
[v1.10] | `String.normalize/2` | `:unicode.characters_to_nfc_binary/1` or `:unicode.characters_to_nfd_binary/1` (Erlang/OTP 20)
8584
[v1.10] | `:simple_one_for_one` strategy in `Supervisor` | `DynamicSupervisor` (v1.6)
8685
[v1.10] | `:restart` and `:shutdown` in `Task.Supervisor.start_link/1` | `:restart` and `:shutdown` in `Task.Supervisor.start_child/3` (v1.6)
8786
[v1.9] | Enumerable keys in `Map.drop/2`, `Map.split/2`, and `Map.take/2` | Call `Enum.to_list/1` on the second argument beforehand (v1.0)

lib/elixir/test/elixir/string_test.exs

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -749,4 +749,79 @@ defmodule StringTest do
749749
assert String.myers_difference("abc", "aйbc") == [eq: "a", ins: "й", eq: "bc"]
750750
assert String.myers_difference("aйbc", "abc") == [eq: "a", del: "й", eq: "bc"]
751751
end
752+
753+
test "normalize/2" do
754+
assert String.normalize("ŝ", :nfd) == "ŝ"
755+
assert String.normalize("ḇravô", :nfd) == "ḇravô"
756+
assert String.normalize("ṩierra", :nfd) == "ṩierra"
757+
assert String.normalize("뢴", :nfd) == "뢴"
758+
assert String.normalize("êchǭ", :nfc) == "êchǭ"
759+
assert String.normalize("거̄", :nfc) == "거̄"
760+
assert String.normalize("뢴", :nfc) == "뢴"
761+
762+
## Error cases
763+
assert String.normalize(<<15, 216>>, :nfc) == <<15, 216>>
764+
assert String.normalize(<<15, 216>>, :nfd) == <<15, 216>>
765+
assert String.normalize(<<216, 15>>, :nfc) == <<216, 15>>
766+
assert String.normalize(<<216, 15>>, :nfd) == <<216, 15>>
767+
768+
## Cases from NormalizationTest.txt
769+
770+
# 05B8 05B9 05B1 0591 05C3 05B0 05AC 059F
771+
# 05B1 05B8 05B9 0591 05C3 05B0 05AC 059F
772+
# HEBREW POINT QAMATS, HEBREW POINT HOLAM, HEBREW POINT HATAF SEGOL,
773+
# HEBREW ACCENT ETNAHTA, HEBREW PUNCTUATION SOF PASUQ, HEBREW POINT SHEVA,
774+
# HEBREW ACCENT ILUY, HEBREW ACCENT QARNEY PARA
775+
assert String.normalize("ֱָֹ֑׃ְ֬֟", :nfc) == "ֱָֹ֑׃ְ֬֟"
776+
777+
# 095D (exclusion list)
778+
# 0922 093C
779+
# DEVANAGARI LETTER RHA
780+
assert String.normalize("ढ़", :nfc) == "ढ़"
781+
782+
# 0061 0315 0300 05AE 0340 0062
783+
# 00E0 05AE 0300 0315 0062
784+
# LATIN SMALL LETTER A, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT,
785+
# HEBREW ACCENT ZINOR, COMBINING GRAVE TONE MARK, LATIN SMALL LETTER B
786+
assert String.normalize("à֮̀̕b", :nfc) == "à֮̀̕b"
787+
788+
# 0344
789+
# 0308 0301
790+
# COMBINING GREEK DIALYTIKA TONOS
791+
assert String.normalize("\u0344", :nfc) == "\u0308\u0301"
792+
793+
# 115B9 0334 115AF
794+
# 115B9 0334 115AF
795+
# SIDDHAM VOWEL SIGN AI, COMBINING TILDE OVERLAY, SIDDHAM VOWEL SIGN AA
796+
assert String.normalize("𑖹̴𑖯", :nfc) == "𑖹̴𑖯"
797+
# HEBREW ACCENT ETNAHTA, HEBREW PUNCTUATION SOF PASUQ, HEBREW POINT SHEVA,
798+
# HEBREW ACCENT ILUY, HEBREW ACCENT QARNEY PARA
799+
assert String.normalize("ֱָֹ֑׃ְ֬֟", :nfc) == "ֱָֹ֑׃ְ֬֟"
800+
801+
# 095D (exclusion list)
802+
# HEBREW ACCENT ETNAHTA, HEBREW PUNCTUATION SOF PASUQ, HEBREW POINT SHEVA,
803+
# HEBREW ACCENT ILUY, HEBREW ACCENT QARNEY PARA
804+
assert String.normalize("ֱָֹ֑׃ְ֬֟", :nfc) == "ֱָֹ֑׃ְ֬֟"
805+
806+
# 095D (exclusion list)
807+
# 0922 093C
808+
# DEVANAGARI LETTER RHA
809+
assert String.normalize("ढ़", :nfc) == "ढ़"
810+
811+
# 0061 0315 0300 05AE 0340 0062
812+
# 00E0 05AE 0300 0315 0062
813+
# LATIN SMALL LETTER A, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT,
814+
# HEBREW ACCENT ZINOR, COMBINING GRAVE TONE MARK, LATIN SMALL LETTER B
815+
assert String.normalize("à֮̀̕b", :nfc) == "à֮̀̕b"
816+
817+
# 0344
818+
# 0308 0301
819+
# COMBINING GREEK DIALYTIKA TONOS
820+
assert String.normalize("\u0344", :nfc) == "\u0308\u0301"
821+
822+
# 115B9 0334 115AF
823+
# 115B9 0334 115AF
824+
# SIDDHAM VOWEL SIGN AI, COMBINING TILDE OVERLAY, SIDDHAM VOWEL SIGN AA
825+
assert String.normalize("𑖹̴𑖯", :nfc) == "𑖹̴𑖯"
826+
end
752827
end

0 commit comments

Comments
 (0)