This repository was archived by the owner on Nov 1, 2024. It is now read-only.

Commit 58a9e12

vancexu authored and wenleix committed

Add scale_to_z_score (#432)

Summary:
Pull Request resolved: #432

scale_to_z_score is a common transform during preprocessing. The implementation is similar to the tft reference here: https://www.tensorflow.org/tfx/transform/api_docs/python/tft/scale_to_z_score

Reviewed By: Tianshu-Bao
Differential Revision: D37771097
fbshipit-source-id: b0fbe28af7b768ac857ac24f15727b981ee17262

1 parent 9250038 commit 58a9e12
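
For reference, the tft analogue linked above is normally applied inside a preprocessing_fn. The sketch below only illustrates that reference API and is not part of this commit; the feature name "score" is made up for the example.

import tensorflow_transform as tft

def preprocessing_fn(inputs):
    # tft.scale_to_z_score standardizes a feature to mean 0 / variance 1,
    # the same behavior this commit adds to torcharrow.functional.
    return {"score_z": tft.scale_to_z_score(inputs["score"])}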

File tree

2 files changed: +73 -0 lines changed


torcharrow/functional.py

Lines changed: 19 additions & 0 deletions
@@ -9,6 +9,8 @@
 from types import ModuleType
 from typing import Dict, List, Optional, Set, Union
 
+import torcharrow.dtypes as dt
+
 from torcharrow.icolumn import Column
 from torcharrow.ilist_column import ListColumn
 from torcharrow.inumerical_column import NumericalColumn
@@ -508,3 +510,20 @@ def scale_to_0_1(col: NumericalColumn) -> NumericalColumn:
     else:
         # TODO: we should add explicit stub to sigmoid
         return sys.modules["torcharrow.functional"].sigmoid(col)
+
+
+def scale_to_z_score(col: NumericalColumn) -> NumericalColumn:
+    """
+    Return the column data scaled to mean 0 and variance 1 (standard deviation 1).
+    Scaling to z-score subtracts out the mean and divides by standard deviation.
+    Note that the standard deviation computed here is based on the biased variance (0 delta degrees of freedom).
+    If input col contains a single distinct value, then the input is returned without scaling.
+    If input col is integral, the output is cast to float32.
+    """
+    assert isinstance(col, NumericalColumn)
+    std = col.std()
+    if std == 0:
+        if dt.is_integer(col.dtype):
+            return col.cast(dt.Float32(col.dtype.nullable))
+        return col
+    return (col - col.mean()) / std
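
A minimal usage sketch of the new function (not part of the diff), assuming torcharrow.functional is reached the same way as in the tests below:

import torcharrow as ta
from torcharrow import functional

c = ta.column([1, 2, 3, 4, 5])        # int64 column
z = functional.scale_to_z_score(c)    # integral input, so the result is cast to float32
print(list(z), z.dtype)

For a constant column the data comes back unscaled (cast to float32 when the input is integral), matching the std == 0 branch above.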

torcharrow/test/test_functional_cpu.py

Lines changed: 54 additions & 0 deletions
@@ -6,6 +6,8 @@
 
 import unittest
 
+import numpy as np
+
 import torcharrow as ta
 import torcharrow._torcharrow
 import torcharrow.dtypes as dt
@@ -96,6 +98,58 @@ def test_scale_to_0_1(self):
         with self.assertRaises(AssertionError):
             functional.scale_to_0_1(c)
 
+    def test_scale_to_z_score(self):
+        # norm same int
+        c = ta.column([1, 1], device=self.device)
+        self.assertEqual(c.dtype, dt.int64)
+        result = functional.scale_to_z_score(c)
+        self.assertEqual(
+            list(result),
+            [1, 1],
+        )
+        self.assertEqual(result.dtype, dt.float32)
+
+        # norm same double
+        c = ta.column([np.float64(1), np.float64(1)], device=self.device)
+        self.assertEqual(c.dtype, dt.float64)
+        result = functional.scale_to_z_score(c)
+        self.assertEqual(
+            list(result),
+            [1, 1],
+        )
+        self.assertEqual(result.dtype, dt.float64)
+
+        # norm float
+        c = ta.column([1.0, 1.0, 2.0, 2.0], device=self.device)
+        self.assertEqual(
+            list(functional.scale_to_z_score(c)),
+            [
+                -0.866025447845459,
+                -0.866025447845459,
+                0.866025447845459,
+                0.866025447845459,
+            ],
+        )
+
+        # norm int with None
+        c = ta.column([1, 2, 3, None, 4, 5], device=self.device)
+        self.assertEqual(
+            list(functional.scale_to_z_score(c)),
+            [
+                -1.2649110555648804,
+                -0.6324555277824402,
+                0.0,
+                None,
+                0.6324555277824402,
+                1.2649110555648804,
+            ],
+        )
+
+        # test assert
+        c = ta.column(["foo", "bar"])
+        with self.assertRaises(AssertionError):
+            functional.scale_to_z_score(c)
+
 
 if __name__ == "__main__":
     unittest.main()
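
The expected constants in the float and int-with-None cases can be reproduced with plain numpy. This is only a sanity-check sketch, and it assumes col.std() behaves like numpy's sample standard deviation (ddof=1), which is what the asserted values imply.

import numpy as np

# float case from the test: [1.0, 1.0, 2.0, 2.0]
x = np.array([1.0, 1.0, 2.0, 2.0])
z = (x - x.mean()) / x.std(ddof=1)
# approx. +/-0.8660254; rounded to float32 and widened back to double,
# this is the 0.866025447845459 asserted above
print(z.astype(np.float32))

# non-null values of the int-with-None case: [1, 2, 3, 4, 5]
y = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
zy = (y - y.mean()) / y.std(ddof=1)
# approx. [-1.2649111, -0.6324555, 0.0, 0.6324555, 1.2649111]
print(zy.astype(np.float32))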
