Merge pull request #351 from gnes-ai/feat-standardscaler

mergify[bot] · web-flow · commit 5de329dbb5b5 · 2019-10-23T04:18:13.000Z
feat(standarder): add standard scaler
diff --git a/gnes/encoder/numeric/pca.py b/gnes/encoder/numeric/pca.py
@@ -23,16 +23,19 @@
 class PCAEncoder(BaseNumericEncoder):
     batch_size = 2048
 
-    def __init__(self, output_dim: int, *args, **kwargs):
+    def __init__(self, output_dim: int, whiten: bool=False, *args, **kwargs):  # NOTE(review): whiten precedes *args, so a second positional argument now binds to whiten - confirm callers
         super().__init__(*args, **kwargs)
         self.output_dim = output_dim
+        self.whiten = whiten  # when True, encode() divides the projection by sqrt(self.explained_variance)
         self.pca_components = None
         self.mean = None
 
+    # Create the IncrementalPCA estimator sized to self.output_dim; sklearn is imported inside the method.
     def post_init(self):
         from sklearn.decomposition import IncrementalPCA
         self.pca = IncrementalPCA(n_components=self.output_dim)
 
+
     @batching
     def train(self, vecs: np.ndarray, *args, **kwargs) -> None:
         num_samples, num_dim = vecs.shape
@@ -49,11 +52,16 @@ def train(self, vecs: np.ndarray, *args, **kwargs) -> None:
 
         self.pca_components = np.transpose(self.pca.components_)
         self.mean = self.pca.mean_.astype('float32')
+        self.explained_variance = self.pca.explained_variance_.astype('float32')
+
 
     @train_required
     @batching
     def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray:
-        return np.matmul(vecs - self.mean, self.pca_components)
+        X_transformed = np.matmul(vecs - self.mean, self.pca_components)  # center on the fitted mean, then project onto the stored components
+        if self.whiten:  # optional whitening step, off by default
+            X_transformed /= np.sqrt(self.explained_variance)  # in-place on the matmul result; NOTE(review): a zero variance entry would divide by zero - confirm
+        return X_transformed
 
 
 class PCALocalEncoder(BaseNumericEncoder):
diff --git a/gnes/encoder/numeric/standarder.py b/gnes/encoder/numeric/standarder.py
@@ -0,0 +1,45 @@
+#  Tencent is pleased to support the open source community by making GNES available.
+#
+#  Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+import numpy as np
+
+from ..base import BaseNumericEncoder
+from ...helper import batching, train_required
+
+
+class StandarderEncoder(BaseNumericEncoder):  # numeric encoder that maps vecs to (vecs - mean) / scale using statistics fitted by sklearn's StandardScaler
+    batch_size = 2048  # presumably the chunk size consumed by @batching - TODO confirm against helper.batching
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.mean = None  # set by train(); None until then
+        self.scale = None  # set by train(); None until then
+
+    def post_init(self):
+        from sklearn.preprocessing import StandardScaler  # imported inside the method rather than at module level
+        self.standarder = StandardScaler()
+
+    @batching
+    def train(self, vecs: np.ndarray, *args, **kwargs) -> None:
+        self.standarder.partial_fit(vecs)  # incremental fit: folds this call's vecs into the scaler's running statistics
+
+        self.mean = self.standarder.mean_.astype('float32')  # float32 copy of the fitted mean_, refreshed on every call
+        self.scale = self.standarder.scale_.astype('float32')  # float32 copy of the fitted scale_, refreshed on every call
+
+    @train_required
+    @batching
+    def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray:
+        return (vecs - self.mean) / self.scale  # applies the cached arrays directly instead of calling self.standarder.transform