Bring back String.normalize and fix infinite loop

josevalim · josevalim · commit d16497b966d4 · 2020-04-23T13:35:59.000+02:00
diff --git a/lib/elixir/lib/string.ex b/lib/elixir/lib/string.ex
@@ -455,13 +455,13 @@ defmodule String do
   For example, take the grapheme "é" which is made of the characters
   "e" and the acute accent. The following will split the string into two parts:
 
-      iex> String.split(:unicode.characters_to_nfd_binary("é"), "e")
+      iex> String.split(String.normalize("é", :nfd), "e")
       ["", "́"]
 
   However, if "é" is represented by the single character "e with acute"
   accent, then it will split the string into just one part:
 
-      iex> String.split(:unicode.characters_to_nfc_binary("é"), "e")
+      iex> String.split(String.normalize("é", :nfc), "e")
       ["é"]
 
   """
@@ -650,9 +650,9 @@ defmodule String do
 
       String.normalize(string1, :nfd) == String.normalize(string2, :nfd)
 
-  Therefore, if you plan to compare multiple strings, multiple times
-  in a row, you may normalize them upfront and compare them directly
-  to avoid multiple normalization passes.
+  If you plan to compare multiple strings, multiple times in a row, you
+  may normalize them upfront and compare them directly to avoid multiple
+  normalization passes.
 
   ## Examples
 
@@ -674,21 +674,49 @@ defmodule String do
     normalize(string1, :nfd) == normalize(string2, :nfd)
   end
 
-  @doc false
-  @deprecated "Use :unicode.characters_to_nfc_binary/1 or :unicode.characters_to_nfd_binary/1 instead"
+  @doc """
+  Converts all characters in `string` to Unicode normalization
+  form identified by `form`.
+
+  Invalid Unicode codepoints are skipped (left unchanged) and the
+  remainder of the string is converted. If you want the algorithm to stop
+  and return on an invalid codepoint, use `:unicode.characters_to_nfd_binary/1`
+  or `:unicode.characters_to_nfc_binary/1` instead.
+
+  ## Forms
+
+  The supported forms are:
+
+    * `:nfd` - Normalization Form Canonical Decomposition.
+      Characters are decomposed by canonical equivalence, and
+      multiple combining characters are arranged in a specific
+      order.
+
+    * `:nfc` - Normalization Form Canonical Composition.
+      Characters are decomposed and then recomposed by canonical equivalence.
+
+  ## Examples
+
+      iex> String.normalize("yêṩ", :nfd)
+      "yêṩ"
+
+      iex> String.normalize("leña", :nfc)
+      "leña"
+
+  """
   def normalize(string, form)
 
   def normalize(string, :nfd) do
     case :unicode.characters_to_nfd_binary(string) do
       string when is_binary(string) -> string
-      {:error, bad, rest} -> bad <> normalize(rest, :nfd)
+      {:error, good, <<head, rest::binary>>} -> good <> <<head>> <> normalize(rest, :nfd)
     end
   end
 
   def normalize(string, :nfc) do
     case :unicode.characters_to_nfc_binary(string) do
       string when is_binary(string) -> string
-      {:error, bad, rest} -> bad <> normalize(rest, :nfc)
+      {:error, good, <<head, rest::binary>>} -> good <> <<head>> <> normalize(rest, :nfc)
     end
   end
 
@@ -2174,13 +2202,13 @@ defmodule String do
   For example, take the grapheme "é" which is made of the characters
   "e" and the acute accent. The following returns `true`:
 
-      iex> String.contains?(:unicode.characters_to_nfd_binary("é"), "e")
+      iex> String.contains?(String.normalize("é", :nfd), "e")
       true
 
   However, if "é" is represented by the single character "e with acute"
   accent, then it will return `false`:
 
-      iex> String.contains?(:unicode.characters_to_nfc_binary("é"), "e")
+      iex> String.contains?(String.normalize("é", :nfc), "e")
       false
 
   """
diff --git a/lib/elixir/pages/Compatibility and Deprecations.md b/lib/elixir/pages/Compatibility and Deprecations.md
@@ -81,7 +81,6 @@ Version | Deprecated feature                                  | Replaced by (ava
 [v1.10] | Passing non-chardata to `Logger.log/2`              | Explicitly convert to string with `to_string/1` (v1.0)
 [v1.10] | `:compile_time_purge_level` in `Logger` app environment | `:compile_time_purge_matching` in `Logger` app environment (v1.7)
 [v1.10] | `Supervisor.Spec.supervise/2`                       | The new child specs outlined in `Supervisor` (v1.5)
-[v1.10] | `String.normalize/2`                                | `:unicode.characters_to_nfc_binary/1` or `:unicode.characters_to_nfd_binary/1` (Erlang/OTP 20)
 [v1.10] | `:simple_one_for_one` strategy in `Supervisor`      | `DynamicSupervisor` (v1.6)
 [v1.10] | `:restart` and `:shutdown` in `Task.Supervisor.start_link/1` | `:restart` and `:shutdown` in `Task.Supervisor.start_child/3` (v1.6)
 [v1.9]  | Enumerable keys in `Map.drop/2`, `Map.split/2`, and `Map.take/2` | Call `Enum.to_list/1` on the second argument before hand (v1.0)
diff --git a/lib/elixir/test/elixir/string_test.exs b/lib/elixir/test/elixir/string_test.exs
@@ -749,4 +749,79 @@ defmodule StringTest do
     assert String.myers_difference("abc", "aйbc") == [eq: "a", ins: "й", eq: "bc"]
     assert String.myers_difference("aйbc", "abc") == [eq: "a", del: "й", eq: "bc"]
   end
+
+  test "normalize/2" do
+    assert String.normalize("ŝ", :nfd) == "ŝ"
+    assert String.normalize("ḇravô", :nfd) == "ḇravô"
+    assert String.normalize("ṩierra", :nfd) == "ṩierra"
+    assert String.normalize("뢴", :nfd) == "뢴"
+    assert String.normalize("êchǭ", :nfc) == "êchǭ"
+    assert String.normalize("거̄", :nfc) == "거̄"
+    assert String.normalize("뢴", :nfc) == "뢴"
+
+    ## Error cases
+    assert String.normalize(<<15, 216>>, :nfc) == <<15, 216>>
+    assert String.normalize(<<15, 216>>, :nfd) == <<15, 216>>
+    assert String.normalize(<<216, 15>>, :nfc) == <<216, 15>>
+    assert String.normalize(<<216, 15>>, :nfd) == <<216, 15>>
+
+    ## Cases from NormalizationTest.txt
+
+    # 05B8 05B9 05B1 0591 05C3 05B0 05AC 059F
+    # 05B1 05B8 05B9 0591 05C3 05B0 05AC 059F
+    # HEBREW POINT QAMATS, HEBREW POINT HOLAM, HEBREW POINT HATAF SEGOL,
+    # HEBREW ACCENT ETNAHTA, HEBREW PUNCTUATION SOF PASUQ, HEBREW POINT SHEVA,
+    # HEBREW ACCENT ILUY, HEBREW ACCENT QARNEY PARA
+    assert String.normalize("ֱָֹ֑׃ְ֬֟", :nfc) == "ֱָֹ֑׃ְ֬֟"
+
+    # 095D (exclusion list)
+    # 0922 093C
+    # DEVANAGARI LETTER RHA
+    assert String.normalize("ढ़", :nfc) == "ढ़"
+
+    # 0061 0315 0300 05AE 0340 0062
+    # 00E0 05AE 0300 0315 0062
+    # LATIN SMALL LETTER A, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT,
+    # HEBREW ACCENT ZINOR, COMBINING GRAVE TONE MARK, LATIN SMALL LETTER B
+    assert String.normalize("à֮̀̕b", :nfc) == "à֮̀̕b"
+
+    # 0344
+    # 0308 0301
+    # COMBINING GREEK DIALYTIKA TONOS
+    assert String.normalize("\u0344", :nfc) == "\u0308\u0301"
+
+    # 115B9 0334 115AF
+    # 115B9 0334 115AF
+    # SIDDHAM VOWEL SIGN AI, COMBINING TILDE OVERLAY, SIDDHAM VOWEL SIGN AA
+    assert String.normalize("𑖹̴𑖯", :nfc) == "𑖹̴𑖯"
+    # HEBREW ACCENT ETNAHTA, HEBREW PUNCTUATION SOF PASUQ, HEBREW POINT SHEVA,
+    # HEBREW ACCENT ILUY, HEBREW ACCENT QARNEY PARA
+    assert String.normalize("ֱָֹ֑׃ְ֬֟", :nfc) == "ֱָֹ֑׃ְ֬֟"
+
+    # Hebrew points and accents (repeats the first NormalizationTest.txt case above)
+    # HEBREW ACCENT ETNAHTA, HEBREW PUNCTUATION SOF PASUQ, HEBREW POINT SHEVA,
+    # HEBREW ACCENT ILUY, HEBREW ACCENT QARNEY PARA
+    assert String.normalize("ֱָֹ֑׃ְ֬֟", :nfc) == "ֱָֹ֑׃ְ֬֟"
+
+    # 095D (exclusion list)
+    # 0922 093C
+    # DEVANAGARI LETTER RHA
+    assert String.normalize("ढ़", :nfc) == "ढ़"
+
+    # 0061 0315 0300 05AE 0340 0062
+    # 00E0 05AE 0300 0315 0062
+    # LATIN SMALL LETTER A, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT,
+    # HEBREW ACCENT ZINOR, COMBINING GRAVE TONE MARK, LATIN SMALL LETTER B
+    assert String.normalize("à֮̀̕b", :nfc) == "à֮̀̕b"
+
+    # 0344
+    # 0308 0301
+    # COMBINING GREEK DIALYTIKA TONOS
+    assert String.normalize("\u0344", :nfc) == "\u0308\u0301"
+
+    # 115B9 0334 115AF
+    # 115B9 0334 115AF
+    # SIDDHAM VOWEL SIGN AI, COMBINING TILDE OVERLAY, SIDDHAM VOWEL SIGN AA
+    assert String.normalize("𑖹̴𑖯", :nfc) == "𑖹̴𑖯"
+  end
 end