added experiments

Tomasz Latkowski · Tomasz Latkowski · commit a782db7d449c · 2018-02-25T17:38:43.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -101,4 +101,5 @@ ENV/
 .mypy_cache/
 
 logs
-.idea/
+.idea/
+.pytest_cache/
diff --git a/experiments/__init__.py b/experiments/__init__.py
diff --git a/experiments/classifier.py b/experiments/classifier.py
@@ -0,0 +1,43 @@
+import tensorflow as tf
+
+
+def ff_neural_network(inputs, units):
+    """Build a two-layer dense network: one tanh hidden layer and a single linear output unit.
+
+    :param inputs: tensor fed into the hidden layer.
+    :param units: number of units in the hidden layer.
+    :return: output of the final one-unit dense layer (logits; no activation is applied).
+    """
+    layer = tf.layers.dense(inputs, units=units, activation=tf.nn.tanh)
+    output = tf.layers.dense(layer, units=1)
+    return output
+
+
+class NeuralNetworkClassifier:
+    """Binary classifier graph: placeholders, sigmoid cross-entropy loss, Adam step and summaries.
+
+    :param num_features: width of the ``inputs`` placeholder.
+    :param units: number of hidden units passed to ``ff_neural_network``.
+    """
+
+    def __init__(self, num_features, units):
+        self.x = tf.placeholder(dtype=tf.float64, shape=[None, num_features], name='inputs')
+        self.y = tf.placeholder(dtype=tf.float64, shape=[None, 1], name='labels')
+
+        output = ff_neural_network(self.x, units=units)
+
+        with tf.name_scope('loss'):
+            self.loss = tf.losses.sigmoid_cross_entropy(self.y, output)
+            self.opt = tf.train.AdamOptimizer(learning_rate=0.01).minimize(self.loss)
+
+        with tf.name_scope('metrics'):
+            self.prediction = tf.nn.sigmoid(output)
+
+            # Round the probabilities to 0/1 before comparing: a raw sigmoid output is
+            # practically never exactly equal to a label, which pinned accuracy near zero.
+            predicted_labels = tf.round(self.prediction)
+            self.correct_predictions = tf.equal(predicted_labels, self.y)
+            self.accuracy = tf.reduce_mean(tf.to_float(self.correct_predictions))
+            tf.summary.scalar("accuracy", self.accuracy)
+            tf.summary.scalar("loss", self.loss)
+            self.summary_op = tf.summary.merge_all()
diff --git a/experiments/dataset.py b/experiments/dataset.py
@@ -0,0 +1,4 @@
+class Dataset:
+
+    def __init__(self, data, num_classes):
+        pass  # placeholder: both arguments are accepted and discarded
diff --git a/experiments/experiment.py b/experiments/experiment.py
@@ -0,0 +1,30 @@
+import tensorflow as tf
+
+from experiments.classifier import NeuralNetworkClassifier
+from methods.selection_wrapper import SelectionWrapper
+
+
+class ExperimentModel:
+    """Graph combining a feature-selection stage with a neural-network classifier.
+
+    :param selection_method: callable forwarded to ``SelectionWrapper``.
+    :param num_features: number of features requested from the selection stage; also the
+        input width of the classifier.
+    :param num_instances: forwarded to ``SelectionWrapper`` unchanged.
+    :param classifier: currently unused; a ``NeuralNetworkClassifier`` is always built.
+    :param dataset: data handed to ``SelectionWrapper``.
+    :param hidden_units: size of the classifier's hidden layer (defaults to the former
+        hard-coded value of 20).
+    """
+
+    def __init__(self, selection_method, num_features, num_instances, classifier, dataset,
+                 hidden_units=20):
+
+        with tf.name_scope('selection'):
+            self.selection_wrapper = SelectionWrapper(dataset,
+                                                     num_instances=num_instances,
+                                                     selection_method=selection_method,
+                                                     num_features=num_features)
+
+        with tf.name_scope('classifier'):
+            self.clf = NeuralNetworkClassifier(num_features, hidden_units)
diff --git a/methods/selection.py b/methods/selection.py
@@ -13,7 +13,8 @@ def selection_wrapper(data, num_instances, selection_method=None, num_features=N
         num_features = data.get_shape().as_list()[-1]
 
     values, indices = selection_method(data, num_instances, num_features)
-    return values, tf.gather(data, indices, axis=1)
+    selected_features = tf.gather(data, indices, axis=1)
+    return values, selected_features
 
 
 def fisher(data, num_instances: list, top_k_features=2):
@@ -27,15 +28,20 @@ def fisher(data, num_instances: list, top_k_features=2):
     :return: the list of most significant features.
     """
     assert len(num_instances) == 2, "Fisher selection method can be performed for two-class problems."
+
     data = tf.convert_to_tensor(data)
     _, num_features = data.get_shape().as_list()
     if top_k_features > num_features:
         top_k_features = num_features
     class1, class2 = tf.split(data, num_instances)
-    mean1, std1 = tf.nn.moments(class1, axes=0)
-    mean2, std2 = tf.nn.moments(class2, axes=0)
-    fisher_coeffs = tf.abs(mean1 - mean2) / (std1 + std2)
-    return tf.nn.top_k(fisher_coeffs, k=top_k_features)
+
+    with tf.name_scope('fisher_selection'):
+        mean1, std1 = tf.nn.moments(class1, axes=0)
+        mean2, std2 = tf.nn.moments(class2, axes=0)
+        fisher_coeffs = tf.abs(mean1 - mean2) / (std1 + std2)
+        selected_features = tf.nn.top_k(fisher_coeffs, k=top_k_features)
+
+    return selected_features
 
 
 def feature_correlation_with_class(data, num_instances: list, top_k_features=10):
@@ -49,11 +55,15 @@ def feature_correlation_with_class(data, num_instances: list, top_k_features=10)
     if top_k_features > num_features:
         top_k_features = num_features
     class1, class2 = tf.split(data, num_instances)
-    mean1, std1 = tf.nn.moments(class1, axes=0)
-    mean2, std2 = tf.nn.moments(class2, axes=0)
-    mean, std = tf.nn.moments(data, axes=0)
-    corr_coeffs = (tf.square(mean1 - mean) + tf.square(mean2 - mean)) / 2*tf.square(std)
-    return tf.nn.top_k(corr_coeffs, k=top_k_features)
+
+    with tf.name_scope('corr_selection'):
+        mean1, std1 = tf.nn.moments(class1, axes=0)
+        mean2, std2 = tf.nn.moments(class2, axes=0)
+        mean, std = tf.nn.moments(data, axes=0)  # NOTE(review): moments returns variance, not std - confirm the square below
+        corr_coeffs = (tf.square(mean1 - mean) + tf.square(mean2 - mean)) / (2 * tf.square(std))
+        selected_features = tf.nn.top_k(corr_coeffs, k=top_k_features)
+
+    return selected_features
 
 
 def t_test(data, num_instances: list, top_k_features=10):
@@ -67,7 +77,29 @@ def t_test(data, num_instances: list, top_k_features=10):
     if top_k_features > num_features:
         top_k_features = num_features
     class1, class2 = tf.split(data, num_instances)
-    mean1, std1 = tf.nn.moments(class1, axes=0)
-    mean2, std2 = tf.nn.moments(class2, axes=0)
-    t_test_coeffs = tf.abs(mean1 - mean2) / tf.sqrt(tf.square(std1)/num_instances[0] + tf.square(std2) / num_instances[1])
-    return tf.nn.top_k(t_test_coeffs, k=top_k_features)
+
+    with tf.name_scope('t_test_selection'):
+        mean1, std1 = tf.nn.moments(class1, axes=0)
+        mean2, std2 = tf.nn.moments(class2, axes=0)
+        t_test_coeffs = tf.abs(mean1 - mean2) / tf.sqrt(
+            tf.square(std1) / num_instances[0] + tf.square(std2) / num_instances[1])
+        selected_features = tf.nn.top_k(t_test_coeffs, k=top_k_features)
+
+    return selected_features
+
+
+def random(data, num_instances: list, top_k_features=10):
+    """Select ``top_k_features`` feature columns uniformly at random (baseline selector).
+
+    :param data: 2-D array-like whose last axis enumerates the features.
+    :param num_instances: unused; kept for a signature parallel to the other selectors.
+    :param top_k_features: number of features to draw, capped at the number available.
+    :return: ``tf.nn.top_k`` result (values, indices) over random per-feature scores.
+    """
+    data = tf.convert_to_tensor(data)
+    _, num_features = data.get_shape().as_list()
+    top_k_features = min(top_k_features, num_features)
+    with tf.name_scope('random_selection'):
+        # top-k of i.i.d. uniform scores yields a uniformly random subset of indices
+        random_scores = tf.random_uniform([num_features])
+        return tf.nn.top_k(random_scores, k=top_k_features)
diff --git a/methods/selection_wrapper.py b/methods/selection_wrapper.py
@@ -0,0 +1,28 @@
+import tensorflow as tf
+
+
+class SelectionWrapper:
+    """Run a feature-selection method on ``data`` and gather the chosen feature columns.
+
+    :param data: input data; converted to a tensor only when ``num_features`` is omitted.
+    :param num_instances: passed through to ``selection_method`` unchanged.
+    :param selection_method: callable invoked as ``selection_method(data, num_instances, num_features)``
+        that returns a ``(values, indices)`` pair.
+    :param num_features: number of features to request; defaults to the size of the last axis of ``data``.
+    :raises ValueError: if ``data`` or ``selection_method`` is ``None``.
+    """
+
+    def __init__(self, data, num_instances, selection_method=None, num_features=None):
+        required = ((data, 'Provide data to make selection.'),
+                    (selection_method, 'Provide selection method.'))
+        for argument, message in required:
+            if argument is None:
+                raise ValueError(message)
+
+        if num_features is None:
+            data = tf.convert_to_tensor(data)
+            num_features = data.get_shape().as_list()[-1]
+
+        scores, feature_indices = selection_method(data, num_instances, num_features)
+        self.values = scores
+        self.selected_features = tf.gather(data, feature_indices, axis=1)
diff --git a/run.py b/run.py
@@ -0,0 +1,50 @@
+import numpy as np
+import tensorflow as tf
+from utils.log_saver import LogSaver
+from experiments.experiment import ExperimentModel
+from methods.selection import fisher
+from tqdm import tqdm
+from sklearn.model_selection import StratifiedKFold
+from utils.data_reader import read
+
+
+data_fn = 'data/autism.tsv'
+data = read(data_fn)
+
+num_features = 100
+num_epochs = 1000
+
+# NOTE(review): assumes the rows of the file are ordered 82 label-1 samples, then 64 label-0 - confirm.
+labels = np.concatenate([np.ones(82, dtype=np.float64), np.zeros(64, dtype=np.float64)])
+labels = np.reshape(labels, (-1, 1))
+
+
+skf = StratifiedKFold(n_splits=10)
+
+for fold_id, (train_idxs, test_idxs) in enumerate(skf.split(data, labels.ravel())):
+
+    data_fold = data[train_idxs, :]
+    labels_fold = labels[train_idxs]
+    # The selection methods split the rows positionally (tf.split), so the counts must be
+    # listed in row order: label-1 rows come first, label-0 rows second.
+    num_instances = [int(np.sum(labels_fold == 1)), int(np.sum(labels_fold == 0))]
+
+    with tf.Graph().as_default() as graph:
+
+        model = ExperimentModel(fisher, num_features, num_instances, None, data_fold)
+
+        with tf.Session() as session:
+
+            session.run(tf.global_variables_initializer())
+
+            log_saver = LogSaver('logs', 'fisher_fold{}'.format(fold_id), session.graph)
+
+            selected_data = session.run(model.selection_wrapper.selected_features)
+
+            tqdm_iter = tqdm(range(num_epochs), desc='Epochs')
+
+            for epoch in tqdm_iter:
+                feed_dict = {model.clf.x: selected_data, model.clf.y: labels_fold}
+                loss, _, summary = session.run([model.clf.loss, model.clf.opt, model.clf.summary_op], feed_dict=feed_dict)
+                log_saver.log_train(summary, epoch)
+                tqdm_iter.set_postfix(loss='{:.2f}'.format(float(loss)), epoch=epoch)
diff --git a/tests/corr.py b/tests/corr.py
@@ -8,4 +8,4 @@ def testCorrelationWithClassCorrectScore(self):
 
 
 if __name__ == '__main__':
-    tf.test.main()
+    tf.test.main()
diff --git a/tests/test_pearson.py b/tests/test_pearson.py
@@ -1,6 +1,7 @@
+import numpy as np
 import tensorflow as tf
+
 from utils.statistics import pearson_correlation
-import numpy as np
 
 
 class TestPearson(tf.test.TestCase):
@@ -37,4 +38,4 @@ def testPositivePearsonCoefficientValueForTwoVectors(self):
 
 
 if __name__ == '__main__':
-    tf.test.main()
+    tf.test.main()
diff --git a/tests/ttest.py b/tests/ttest.py
@@ -8,4 +8,4 @@ def testTtestCorrectScore(self):
 
 
 if __name__ == '__main__':
-    tf.test.main()
+    tf.test.main()
diff --git a/utils/data_reader.py b/utils/data_reader.py
@@ -0,0 +1,15 @@
+import pandas as pd
+
+
+def read(file_name):
+    """Load a tab-separated table and return it transposed as a NumPy array.
+
+    The file is read without a header row; its first column is used as the
+    index and is therefore not part of the returned values.
+
+    :param file_name: path of the TSV file to read.
+    :return: 2-D ``numpy.ndarray`` holding the transposed table values.
+    """
+    data = pd.read_csv(file_name, sep='\t', header=None, index_col=0).T
+    # ``DataFrame.as_matrix()`` is deprecated and was removed in pandas 1.0; ``.values`` is equivalent.
+    return data.values
diff --git a/utils/log_saver.py b/utils/log_saver.py
@@ -0,0 +1,26 @@
+import os
+
+import tensorflow as tf
+
+
+class LogSaver:
+    """Write train and test summaries to separate ``tf.summary.FileWriter`` directories.
+
+    :param logs_path: root directory for logs; created if it does not exist.
+    :param model_name: sub-directory name under ``logs_path`` for this run.
+    :param graph: graph handed to both file writers.
+    """
+
+    def __init__(self, logs_path, model_name, graph: tf.Graph):
+        # exist_ok avoids the check-then-create race of a separate isdir() test
+        os.makedirs(logs_path, exist_ok=True)
+        self.test_summary_writer = tf.summary.FileWriter('{}/{}/test/'.format(logs_path, model_name), graph=graph)
+        self.train_summary_writer = tf.summary.FileWriter('{}/{}/train/'.format(logs_path, model_name), graph=graph)
+
+    def log_test(self, summary, global_step):
+        """Add ``summary`` for ``global_step`` to the test writer."""
+        self.test_summary_writer.add_summary(summary, global_step)
+
+    def log_train(self, summary, global_step):
+        """Add ``summary`` for ``global_step`` to the train writer."""
+        self.train_summary_writer.add_summary(summary, global_step)
diff --git a/utils/statistics.py b/utils/statistics.py
@@ -9,7 +9,7 @@ def pearson_correlation(x1, x2):
     l = tf.reduce_sum((x1 - m1) * (x2 - m2))
     i = tf.reduce_sum((x1 - m1) ** 2) * tf.reduce_sum((x2 - m2) ** 2)
     p = tf.sqrt(i)
-    return l/p
+    return l / p
 
 
 def f_test():

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +class Dataset:
++
 +    def __init__(self, data, num_classes):
 +        pass
Original file line number	Diff line number	Diff line change
`@@ -8,4 +8,4 @@ def testCorrelationWithClassCorrectScore(self):`
`8`	`8`
`9`	`9`
`10`	`10`	`if __name__ == '__main__':`
`11`		`- tf.test.main()`
	`11`	`+ tf.test.main()`