From b85ee9d64a536937912544c7bbd5b98b635b7e8d Mon Sep 17 00:00:00 2001
From: Christian C
Date: Mon, 11 Nov 2024 12:29:32 -0800
Subject: Initial commit

---
 code/sunlab/transform_data.py | 799 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 799 insertions(+)
 create mode 100644 code/sunlab/transform_data.py

(limited to 'code/sunlab/transform_data.py')

diff --git a/code/sunlab/transform_data.py b/code/sunlab/transform_data.py
new file mode 100644
index 0000000..d6e3813
--- /dev/null
+++ b/code/sunlab/transform_data.py
@@ -0,0 +1,799 @@
+from sklearn import preprocessing
+import numpy as np
+
+# import features
+
+
+def import_train_set(train_file_name="AllResults.txt"):
+    featurelist = []
+
+    with open(train_file_name, "r") as infile:
+        for line in infile:
+            featurelist.append(line.strip())
+
+    # featurelist[0] holds the header in the form
+    # 'Area, MajorAxisLength, ... Class'
+    FeatureNames = [x.strip() for x in featurelist[0].split(",")]
+    # FeatureNames has the form ['Area', 'MajorAxisLength', ..., 'Class'],
+    # which is what we want.
+
+    AllData = [
+        [float(x.strip()) for x in featurelist[i].split(",")]
+        for i in range(1, len(featurelist))
+    ]
+
+    # AllData has the form [[1, 2, 3, ..., 0.0], [3, 3, 1, ..., 0.0], ...];
+    # the last entry of each row is the class label.
+
+    classes = [int(i[-1]) for i in AllData]
+
+    # classes contains the class number each sample belongs to.
+
+    # Drop the target column from AllData.
+
+    X = [i[0:-1] for i in AllData]
+
+    # X has a form similar to AllData. After reshaping, the output should be
+    # X = array([[0, 1, 2, ...],
+    #            [1, 2, 3, ...]])
+
+    Data = np.asarray(X, order="F")
+
+    # This has the right form; it uses Fortran (column-major) memory layout rather than
+    # row-major C-style. The notation is scientific, whereas the iris data set looks like
+    # a plain float. CHECKED: both are of type numpy.float64 and support the same
+    # indexing calls, so we are in business.
+
+    # Looks exactly correct, or at least like the iris data set target.
+    Target = np.asarray(classes)
+    return (Data, Target)
+
+
+########################################################################
+# For training purposes, the number of samples in Data must be divisible by 256.
+
+
+def Trim_Train_Data(Data, Target, max_length=None, balance=False):
+    ####
+    # Inputs: Data is a numpy array with N samples (rows) and M measures (cols).
+    #         Target is a length-N array with the ground-truth class labels.
+    #         max_length is the maximum length of the training data; it is rounded down
+    #         to a multiple of 256 if needed.
+    #         balance is a boolean; set it to keep the same number of samples per class.
+    print("Class lengths are = ", [sum(Target == i) for i in set(Target)])
+    if not balance:
+        if np.shape(Data)[0] / 256 != np.round(np.shape(Data)[0] / 256) or (
+            max_length is not None and max_length < np.shape(Data)[0]
+        ):
+            print("Trimming data for training purposes...")
+            if not max_length:
+                max_length = int(256 * np.floor(np.shape(Data)[0] / 256))
+            else:
+                if max_length / 256 != np.round(max_length / 256):
+                    # must make it divisible by 256
+                    max_length = int(np.floor(max_length / 256) * 256)
+                    print(
+                        "Your given max_length was not divisible by 256. New max length is = %d"
+                        % max_length
+                    )
+            # determine the percentage of samples in each class.
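+            # Illustrative example (hypothetical numbers, not from the data set):
+            # with 1000 samples split 500/300/200 across three classes and
+            # max_length = 512, ps is about [0.5, 0.3, 0.2], so the slice below takes
+            # roughly 256, 153, and 102 rows per class; the while-loop that follows
+            # then tops this up one row at a time until exactly 512 rows (a multiple
+            # of 256) have been selected.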
+            cs = np.unique(Target)
+            ps = np.zeros(shape=(1, len(cs)))
+            ps = ps[0]
+            rows_to_take = np.array([])
+            for i in range(len(cs)):
+                ps[i] = np.sum(Target == cs[i]) / len(Target)
+                goodrows = np.where(Target == cs[i])[0]
+                rows_to_take = np.append(
+                    rows_to_take, goodrows[0 : int(np.floor(ps[i] * max_length))]
+                )
+
+            ad_row = 0
+            class_ind = 0
+            while len(rows_to_take) != max_length:
+                # need to supplement.
+                goodrows = np.where(Target == cs[class_ind])[0]
+                rows_to_take = np.append(
+                    rows_to_take,
+                    goodrows[int(np.floor(ps[class_ind] * max_length)) + 1 + ad_row],
+                )
+                class_ind = class_ind + 1
+                if class_ind >= len(cs):
+                    class_ind = 0
+                    ad_row = ad_row + 1
+            rows_to_take = rows_to_take.astype(int)
+            X_train_scaled = Data[rows_to_take, :]
+            Y_train = Target[rows_to_take]
+            print("Complete")
+        else:
+            X_train_scaled = Data
+            Y_train = Target
+        print("Final training length = %d" % X_train_scaled.shape[0])
+        print(
+            "Class lengths after trimming are = ",
+            [sum(Y_train == i) for i in set(Y_train)],
+        )
+        return (X_train_scaled, Y_train)
+    else:
+        # determine which class has the minimum number of cases.
+        cs = np.unique(Target)
+        lens = np.zeros((len(cs)))
+        for i in range(len(cs)):
+            lens[i] = sum(Target == cs[i])
+
+        # randomly sample that number of samples from each class.
+        min_len = int(min(lens))
+        rows_to_take = np.array([])
+        for i in range(len(cs)):
+            possiblerows = np.where(Target == cs[i])[0]
+            # now sample without replacement.
+            rows_to_take = np.append(
+                rows_to_take, np.random.choice(possiblerows, min_len, replace=False)
+            )
+        if len(rows_to_take) / 256 != np.round(len(rows_to_take) / 256) or (
+            max_length is not None and max_length < len(rows_to_take)
+        ):
+            # trim until correct size.
+            if not max_length:
+                # floor the pooled (balanced) length to a multiple of 256.
+                max_length = int(256 * np.floor(len(rows_to_take) / 256))
+            else:
+                if max_length / 256 != np.round(max_length / 256):
+                    # must make it divisible by 256
+                    max_length = int(np.floor(max_length / 256) * 256)
+                    print(
+                        "Your given max_length was not divisible by 256. New max length is = %d"
+                        % max_length
+                    )
+            # use min_len now to delete entries.
+            timearound = 0
+            pheno = len(cs)  # start at the end
+            while len(rows_to_take) > max_length:
+                # the entry to delete is ((min_len - timearound) * pheno) - 1
+                # print("%d entry delete" % (((min_len - timearound) * pheno) - 1))
+                rows_to_take = np.delete(
+                    rows_to_take, ((min_len - timearound) * pheno) - 1
+                )
+                pheno = pheno - 1
+                if pheno < 1:
+                    pheno = len(cs)
+                    timearound = timearound + 1
+        rows_to_take = rows_to_take.astype(int)
+        X_train_scaled = Data[rows_to_take, :]
+        Y_train = Target[rows_to_take]
+        print("Final training length = %d" % X_train_scaled.shape[0])
+        print(
+            "Class lengths after trimming are = ",
+            [sum(Y_train == i) for i in set(Y_train)],
+        )
+        return (X_train_scaled, Y_train)
+
+
+#############################REMOVE OUTLIER DATA########################
+# How? Do this after scaling the data, then compute a z-score. We'll check the data after that.
+
+
+def Remove_Outliers(Data, Target):
+    # For each class, detect outliers.
+    # We begin with z-scoring, which assumes the data are described by a Gaussian;
+    # that is why it is vital to do this AFTER scaling the data.
+    # I plotted the data; it is absolutely not Gaussian.
+    # I tried the DBSCAN machine-learning algorithm, but it is really not helpful.
+    # However, the data perhaps IS Gaussian after embedding. We can clean the signal
+    # afterwards by sending in the embedded data in 1, 2, or 3 dimensions and removing
+    # points that are beyond a standard-deviation threshold.
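+    # For example (hypothetical numbers): if one embedding dimension of a class has
+    # mean 0.0 and standard deviation 1.2, a point at 3.3 gets z = (3.3 - 0.0) / 1.2
+    # = 2.75, which exceeds the 2.5 threshold used below, so that row is removed.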
+    # Data is t-SNE embedded.
+    zscores = np.zeros(np.shape(Data))
+    for pheno in np.unique(Target):
+        # find the rows belonging to this phenotype.
+        prows = np.where(Target == pheno)[0]
+        for dim in range(np.shape(Data)[1]):
+            # calculate the per-class mean.
+            m = np.mean(Data[prows, dim])
+            # calculate the per-class standard deviation.
+            s = np.std(Data[prows, dim])
+            for example in range(len(prows)):
+                zscores[prows[example], dim] = (Data[prows[example], dim] - m) / s
+
+    # Now that the z-scores are computed for every element, apply a threshold.
+    # Good rule-of-thumb thresholds are 2.5, 3, 3.5, or more.
+    zthresh = 2.5
+
+    zscores = zscores > zthresh
+
+    badrows = [i for i in range(np.shape(zscores)[0]) if zscores[i].any()]
+
+    Data = np.delete(Data, badrows, axis=0)
+    Target = np.delete(Target, badrows, axis=0)
+
+    return (Data, Target)
+
+
+##############################POST AUGMENTATION#########################
+def Augment_Size(Data, Target, max_copies=0, s=0.2, balance=False, augment_class=None):
+    max_copies = int(max_copies)
+    # Augment only the copies made, by scaling the unit-based measures.
+    # Measures should go: Area, MjrAxis, MnrAxis, Ecc, ConA, EqD, Sol, Ext, Per, conPer, fiber_length, InscribeR, bleb_len
+
+    # First, determine whether class balance is desired.
+    if balance:
+        # determine which class has the maximum number of samples.
+        cs = np.unique(Target)
+        vals = [sum(Target == cs[i]) for i in range(len(cs))]
+        print(
+            "Class %d has max number of samples, increasing other classes via size augmentation"
+            % cs[np.argmax(vals)]
+        )
+        for i in range(len(cs)):
+            if i != np.argmax(vals):
+                # determine how many samples need to be made.
+                to_make = int(vals[np.argmax(vals)] - vals[i])
+                # randomly sample rows from Data with the correct phenotype cs[i].
+                possible_rows = np.where(Target == cs[i])[0]
+                # sample to_make row indices from possible_rows.
+                sampled_rows = np.random.choice(possible_rows, to_make, replace=True)
+                newrows = Data[sampled_rows, :]
+                size_vary = s * np.random.rand(1, to_make)[0]
+                # vary size.
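+                # In the loop below each resampled row is grown or shrunk (50/50
+                # chance) by a random factor f = size_vary[v] drawn from [0, s):
+                # columns 1, 2, 5, 7, 8, 9, 10 and 11 are scaled by (1 +/- f), the
+                # area-like columns 0 and 4 by (1 +/- f**2), and columns 3, 6 and 12
+                # (Ecc, Sol, bleb_len) are left unchanged.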
+ for v in range(to_make): + if np.random.rand() < 0.5: + newrows[v, 0] = ( + newrows[v, 0] + newrows[v, 0] * size_vary[v] * size_vary[v] + ) + newrows[v, 1] = newrows[v, 1] + newrows[v, 1] * size_vary[v] + newrows[v, 2] = newrows[v, 2] + newrows[v, 2] * size_vary[v] + newrows[v, 4] = ( + newrows[v, 4] + newrows[v, 4] * size_vary[v] * size_vary[v] + ) + newrows[v, 5] = newrows[v, 5] + newrows[v, 5] * size_vary[v] + newrows[v, 7] = newrows[v, 7] + newrows[v, 7] * size_vary[v] + newrows[v, 8] = newrows[v, 8] + newrows[v, 8] * size_vary[v] + newrows[v, 9] = newrows[v, 9] + newrows[v, 9] * size_vary[v] + newrows[v, 10] = newrows[v, 10] + newrows[v, 10] * size_vary[v] + newrows[v, 11] = newrows[v, 11] + newrows[v, 11] * size_vary[v] + else: + newrows[v, 0] = ( + newrows[v, 0] - newrows[v, 0] * size_vary[v] * size_vary[v] + ) + newrows[v, 1] = newrows[v, 1] - newrows[v, 1] * size_vary[v] + newrows[v, 2] = newrows[v, 2] - newrows[v, 2] * size_vary[v] + newrows[v, 4] = ( + newrows[v, 4] - newrows[v, 4] * size_vary[v] * size_vary[v] + ) + newrows[v, 5] = newrows[v, 5] - newrows[v, 5] * size_vary[v] + newrows[v, 7] = newrows[v, 7] - newrows[v, 7] * size_vary[v] + newrows[v, 8] = newrows[v, 8] - newrows[v, 8] * size_vary[v] + newrows[v, 9] = newrows[v, 9] - newrows[v, 9] * size_vary[v] + newrows[v, 10] = newrows[v, 10] - newrows[v, 10] * size_vary[v] + newrows[v, 11] = newrows[v, 11] - newrows[v, 11] * size_vary[v] + Data = np.concatenate((Data, newrows), axis=0) + yadd = np.ones(to_make) * cs[i] + Target = np.concatenate((Target, yadd.astype(int)), axis=0) + + Data = Data[np.argsort(Target), :] + Target = Target[np.argsort(Target)] + + if augment_class is None: + if max_copies > 0: + print( + "Augmenting each class with additional %d samples via size augmentation" + % max_copies + ) + cs = np.unique(Target) + for i in range(len(cs)): + # generate n = max_copies of Data. + possible_rows = np.where(Target == cs[i])[0] + # sample to_make numbers from possible_rows. + sampled_rows = np.random.choice(possible_rows, max_copies, replace=True) + newrows = Data[sampled_rows, :] + size_vary = s * np.random.rand(1, max_copies)[0] + # vary size. 
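+                # (Same size-perturbation scheme as in the class-balancing branch above.)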
+ for v in range(max_copies): + if np.random.rand() < 0.5: + newrows[v, 0] = ( + newrows[v, 0] + newrows[v, 0] * size_vary[v] * size_vary[v] + ) + newrows[v, 1] = newrows[v, 1] + newrows[v, 1] * size_vary[v] + newrows[v, 2] = newrows[v, 2] + newrows[v, 2] * size_vary[v] + newrows[v, 4] = ( + newrows[v, 4] + newrows[v, 4] * size_vary[v] * size_vary[v] + ) + newrows[v, 5] = newrows[v, 5] + newrows[v, 5] * size_vary[v] + newrows[v, 7] = newrows[v, 7] + newrows[v, 7] * size_vary[v] + newrows[v, 8] = newrows[v, 8] + newrows[v, 8] * size_vary[v] + newrows[v, 9] = newrows[v, 9] + newrows[v, 9] * size_vary[v] + newrows[v, 10] = newrows[v, 10] + newrows[v, 10] * size_vary[v] + newrows[v, 11] = newrows[v, 11] + newrows[v, 11] * size_vary[v] + else: + newrows[v, 0] = ( + newrows[v, 0] - newrows[v, 0] * size_vary[v] * size_vary[v] + ) + newrows[v, 1] = newrows[v, 1] - newrows[v, 1] * size_vary[v] + newrows[v, 2] = newrows[v, 2] - newrows[v, 2] * size_vary[v] + newrows[v, 4] = ( + newrows[v, 4] - newrows[v, 4] * size_vary[v] * size_vary[v] + ) + newrows[v, 5] = newrows[v, 5] - newrows[v, 5] * size_vary[v] + newrows[v, 7] = newrows[v, 7] - newrows[v, 7] * size_vary[v] + newrows[v, 8] = newrows[v, 8] - newrows[v, 8] * size_vary[v] + newrows[v, 9] = newrows[v, 9] - newrows[v, 9] * size_vary[v] + newrows[v, 10] = newrows[v, 10] - newrows[v, 10] * size_vary[v] + newrows[v, 11] = newrows[v, 11] - newrows[v, 11] * size_vary[v] + Data = np.concatenate((Data, newrows), axis=0) + yadd = np.ones(max_copies) * cs[i] + Target = np.concatenate((Target, yadd.astype(int)), axis=0) + + Data = Data[np.argsort(Target), :] + Target = Target[np.argsort(Target)] + + else: + augment_class = int(augment_class) + if max_copies > 0: + print( + "Augmenting Class = %d with additional %d samples via size augmentation" + % (augment_class, max_copies) + ) + # generate n = max_copies of Data. + possible_rows = np.where(Target == augment_class)[0] + # sample to_make numbers from possible_rows. + sampled_rows = np.random.choice(possible_rows, max_copies, replace=True) + newrows = Data[sampled_rows, :] + size_vary = s * np.random.rand(1, max_copies)[0] + # vary size. 
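+            # (Same size-perturbation scheme as above, applied only to rows of augment_class.)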
+ for v in range(max_copies): + if np.random.rand() < 0.5: + newrows[v, 0] = ( + newrows[v, 0] + newrows[v, 0] * size_vary[v] * size_vary[v] + ) + newrows[v, 1] = newrows[v, 1] + newrows[v, 1] * size_vary[v] + newrows[v, 2] = newrows[v, 2] + newrows[v, 2] * size_vary[v] + newrows[v, 4] = ( + newrows[v, 4] + newrows[v, 4] * size_vary[v] * size_vary[v] + ) + newrows[v, 5] = newrows[v, 5] + newrows[v, 5] * size_vary[v] + newrows[v, 7] = newrows[v, 7] + newrows[v, 7] * size_vary[v] + newrows[v, 8] = newrows[v, 8] + newrows[v, 8] * size_vary[v] + newrows[v, 9] = newrows[v, 9] + newrows[v, 9] * size_vary[v] + newrows[v, 10] = newrows[v, 10] + newrows[v, 10] * size_vary[v] + newrows[v, 11] = newrows[v, 11] + newrows[v, 11] * size_vary[v] + else: + newrows[v, 0] = ( + newrows[v, 0] - newrows[v, 0] * size_vary[v] * size_vary[v] + ) + newrows[v, 1] = newrows[v, 1] - newrows[v, 1] * size_vary[v] + newrows[v, 2] = newrows[v, 2] - newrows[v, 2] * size_vary[v] + newrows[v, 4] = ( + newrows[v, 4] - newrows[v, 4] * size_vary[v] * size_vary[v] + ) + newrows[v, 5] = newrows[v, 5] - newrows[v, 5] * size_vary[v] + newrows[v, 7] = newrows[v, 7] - newrows[v, 7] * size_vary[v] + newrows[v, 8] = newrows[v, 8] - newrows[v, 8] * size_vary[v] + newrows[v, 9] = newrows[v, 9] - newrows[v, 9] * size_vary[v] + newrows[v, 10] = newrows[v, 10] - newrows[v, 10] * size_vary[v] + newrows[v, 11] = newrows[v, 11] - newrows[v, 11] * size_vary[v] + Data = np.concatenate((Data, newrows), axis=0) + yadd = np.ones(max_copies) * augment_class + Target = np.concatenate((Target, yadd.astype(int)), axis=0) + + Data = Data[np.argsort(Target), :] + Target = Target[np.argsort(Target)] + + return (Data, Target) + + +######################################################################## +######################################################################## +####### IMPORT THE DEV SET ##### +######################################################################## +######################################################################## +def import_dev_set(dev_file_name="DevResults.txt"): + print("Importing the dev set...") + + # import features + featurelist = [] + + with open(dev_file_name, "r") as infile: + for line in infile: + featurelist.append(line.strip()) + + # so now, featurelist[1] has names of things in form 'Area, MajorAxisLength, ... Class' + FeatureNames = [x.strip() for x in featurelist[0].split(",")] + # FeatureNames has form ['Area','MajorAxisLength',....'Class'] which is what I wanted + + DevData = [ + [float(x.strip()) for x in featurelist[i].split(",")] + for i in range(1, len(featurelist)) + ] + + # Data is in form [[1,2,3,....0.0],[3,3,1,...0.0],...[5,3,1,...0.0]], the last input is the class. + + Devclasses = [int(i[-1]) for i in DevData] + + # classes contains the class number from which the data is from + + # want to delete target from AllData. + + DevX = [i[0:-1] for i in DevData] + + # X has form similar to Data. So when we reshape, we want the output to be + # X = array([[0,1,2,...] + # [1,2,3,...]]) + + X_dev = np.asarray(DevX, order="F") + + # add aspect ratio as last column of data + AR = [] + for i in range(len(X_dev)): + AR.append(X_dev[i, 1] / X_dev[i, 2]) + + AR = np.asarray(AR) + + AR = AR.reshape((len(AR), 1)) + + X_dev = np.append(X_dev, AR, 1) # concatenates arrays appropriately. 
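+    # AR (aspect ratio) = MajorAxisLength / MinorAxisLength; e.g. an ellipse-like
+    # region with axes 10 and 5 (hypothetical values) gets AR = 2.0.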
+
+    # add form factor as last column of data
+    # P^2/Area
+    FF = []
+    for i in range(len(X_dev)):
+        FF.append(X_dev[i, 8] * X_dev[i, 8] / X_dev[i, 0])
+    FF = np.asarray(FF)
+    FF = FF.reshape((len(FF), 1))
+    X_dev = np.append(X_dev, FF, 1)
+
+    # This has the right form; it uses Fortran (column-major) memory layout rather than
+    # row-major C-style. The notation is scientific, whereas the iris data set looks like
+    # a plain float. CHECKED: both are of type numpy.float64 and support the same
+    # indexing calls, so we are in business.
+
+    # Looks exactly correct, or at least like the iris data set target.
+    y_dev = np.asarray(Devclasses)
+
+    return (X_dev, y_dev, FeatureNames)
+
+
+########################################################################
+#########DATA IS IN THE SAME FORM AS IS FOUND IN IRIS DATASET###########
+########################################################################
+# Target = Target classes (0-4) for training and validation (type, numpy.int64, array)
+# Data = Data for training and validation to be split. (type, numpy.float64, array)
+# FeatureNames = Feature names for each column of data. (type, 'str', python list)
+########################################################################
+# print "Data is now in the same form as that found in Iris Dataset"
+# print "Splitting the training dataset into train/val"
+
+
+def apply_normalization(X_train, max_norm=False, l1_norm=False, l2_norm=False):
+    ########################################################
+    if max_norm:
+        print("Normalizing data using max_norm")
+        X_train = X_train / np.max(np.abs(X_train), 0)[None, :]
+    if l1_norm:
+        print("Normalizing data using l1_norm")
+        X_train = X_train / np.sum(X_train, 0)[None, :]
+    if l2_norm:
+        print("Normalizing data using l2_norm")
+        X_train = X_train / np.sqrt(np.sum(X_train * X_train, 0))[None, :]
+
+    return X_train
+
+
+########################################################################
+
+
+def preprocess_train_data(X_train, d=2):
+
+    ############### SPLITTING THE DATASET ##################
+    # First split the dataset so it is as if we only had a training set and then an eval set.
+    # X_train, X_test, y_train, y_test = train_test_split(Data, Target, test_size = .3)#.25)#, random_state =
+    # default has shuffle = True. test_size sets the proportion of the data set to include in the test.
+    ########################################################
+    if d > 1:
+        print("Increasing dimensionality of dataset using cross terms")
+        #################INCREASING FEATURES####################
+        poly = preprocessing.PolynomialFeatures(degree=d, interaction_only=True)
+        # IN SOME MODELS with 2 polynomial features, we are getting 90% exactly. In some polynomial 3 models,
+        # we are getting 90.83%, which is exactly even with deep learning models.
+
+        X_train = poly.fit_transform(X_train)
+        # target_feature_names = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(FeatureNames,p) for p in poly.powers_]]
+        # poly=preprocessing.PolynomialFeatures(degree = 2, interaction_only = True)
+        # X_test = poly.fit_transform(X_test)
+        # poly=preprocessing.PolynomialFeatures(degree = 2, interaction_only = True)
+        # X_dev = poly.fit_transform(X_dev)
+
+    ########################################################
+
+    print("Scaling the data")
+    ################# SCALE THE DATA #######################
+    # Scale the data. Each attribute in the dataset must be independently scaled, that is,
+    # 0 mean and unit variance. Doing this returns the z-scores of the data:
+    # Z = (x - mu) / sigma
+
+    # , QuantileTransformer(output_distribution='normal')
+    scaler = preprocessing.RobustScaler().fit(X_train)
+    # preprocessing.StandardScaler().fit(X_train)  # IMPORTANT NOTE: We are scaling based only on training data!!!!
+
+    X_train_scaled = scaler.transform(X_train)
+
+    # X_test_scaled = scaler.transform(X_test)  # will be used later to evaluate the performance.
+
+    # X_dev_scaled = scaler.transform(X_dev)
+
+    ##########################################################
+
+    return (X_train_scaled, scaler)  # , target_feature_names)
+
+
+def preprocess_test_data(X_dev, scaler, d=2):
+    ############### SPLITTING THE DATASET ##################
+    # First split the dataset so it is as if we only had a training set and then an eval set.
+    # X_train, X_test, y_train, y_test = train_test_split(Data, Target, test_size = .3)#.25)#, random_state =
+    # default has shuffle = True. test_size sets the proportion of the data set to include in the test.
+    ########################################################
+
+    print("Increasing dimensionality of dataset using cross terms")
+    #################INCREASING FEATURES####################
+    poly = preprocessing.PolynomialFeatures(degree=d, interaction_only=True)
+    # IN SOME MODELS with 2 polynomial features, we are getting 90% exactly. In some polynomial 3 models,
+    # we are getting 90.83%, which is exactly even with deep learning models.
+
+    # X_train = poly.fit_transform(X_train)
+    # target_feature_names = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(FeatureNames,p) for p in poly.powers_]]
+    # poly=preprocessing.PolynomialFeatures(degree = 2, interaction_only = True)
+    # X_test = poly.fit_transform(X_test)
+    # poly=preprocessing.PolynomialFeatures(degree = 2, interaction_only = True)
+    X_dev = poly.fit_transform(X_dev)
+
+    ########################################################
+
+    print("Scaling the data")
+    ################# SCALE THE DATA #######################
+    # Scale the data. Each attribute in the dataset must be independently scaled, that is,
+    # 0 mean and unit variance. Doing this returns the z-scores of the data:
+    # Z = (x - mu) / sigma
+
+    # scaler = preprocessing.StandardScaler().fit(X_train)  # IMPORTANT NOTE: We are scaling based only on training data!!!!
+
+    # X_train_scaled = scaler.transform(X_train)
+
+    # X_test_scaled = scaler.transform(X_test)  # will be used later to evaluate the performance.
+
+    X_dev_scaled = scaler.transform(X_dev)
+
+    ##########################################################
+
+    return X_dev_scaled
+
+
+def Add_Measures(
+    Data,
+    FeatureNames=None,
+    add_AR=True,
+    add_FF=True,
+    add_convexity=True,
+    add_curl_old=True,
+    add_curl=True,
+    add_sphericity=True,
+    add_InscribedArea=True,
+    add_BlebRel=True,
+):
+    ############### EXPANDING THE DATASET ##################
+    # Add measures of Aspect Ratio, Form Factor, Convexity, Curl, and Sphericity.
+    # Input: Data must be an np array with N (row) examples x M (cols) measures.
+    # Measures should go: Area, MjrAxis, MnrAxis, Ecc, ConA, EqD, Sol, Ext, Per, conPer, fiber_length, InscribeR, bleb_len
+    ########################################################
+    if add_AR:
+        AR = []
+        for i in range(len(Data)):
+            AR.append(Data[i, 1] / Data[i, 2])
+
+        AR = np.asarray(AR)
+
+        AR = AR.reshape((len(AR), 1))
+
+        Data = np.append(Data, AR, 1)  # concatenates arrays appropriately.
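+        # Note: column order matters; Augment_Size and Exclude_Measures address the
+        # original 13 measures by column index, so new measures are always appended
+        # after them.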
+ if FeatureNames is not None: + FeatureNames.extend(["AR"]) + + if add_FF: + # this measure is really compactness, if you multiply each by 4 pi + # note this is different from roundness, which would use convex perimeter + FF = [] + for i in range(len(Data)): + FF.append(Data[i, 0] / (Data[i, 8] * Data[i, 8])) + # FF.append(Data[i,8]*Data[i,8] / Data[i,0]) + + FF = np.asarray(FF) + FF = FF.reshape((len(FF), 1)) + Data = np.append(Data, FF, 1) + if FeatureNames is not None: + FeatureNames.extend(["FF"]) + + if add_convexity: + CC = [] + for i in range(len(Data)): + CC.append(Data[i, 8] / Data[i, 9]) + + CC = np.asarray(CC) + CC = CC.reshape((len(CC), 1)) + Data = np.append(Data, CC, 1) + if FeatureNames is not None: + FeatureNames.extend(["Convexity"]) + + if add_curl_old: + # tells how curled the object is. might help for lamellipodia. + # curl is length / fiber length. (I assume length here can be major axis length) + # fiber length definition is (perimeter - sqrt(perimeter^2 - 16*Area)) / 4 + + # this definition does not work for a circle. Note that the result will be imaginary. + # I changed the 16 to a 4Pi. This should be fine. + cc = [] + for i in range(len(Data)): + if (4 * np.pi * Data[i, 0]) <= (Data[i, 8] * Data[i, 8]): + fiber_length = ( + Data[i, 8] + - np.sqrt((Data[i, 8] * Data[i, 8]) - (4 * np.pi * Data[i, 0])) + ) / np.pi # 4 + cc.append(Data[i, 1] / fiber_length) + else: + fiber_length = Data[i, 8] / np.pi # 4 + cc.append(Data[i, 1] / fiber_length) + + cc = np.asarray(cc) + cc = cc.reshape((len(cc), 1)) + Data = np.append(Data, cc, 1) + if FeatureNames is not None: + FeatureNames.extend(["Curl_old"]) + + if add_curl: + cc = [] + for i in range(len(Data)): + cc.append(Data[i, 1] / Data[i, 10]) + + cc = np.asarray(cc) + cc = cc.reshape((len(cc), 1)) + Data = np.append(Data, cc, 1) + # bound between 0 and 1 if major axis length could be replaced by feret diameter. + if FeatureNames is not None: + FeatureNames.extend(["Curl"]) + + if add_sphericity: + ss = [] + for i in range(len(Data)): + ss.append(Data[i, 11] * 2 / Data[i, 1]) + + ss = np.asarray(ss) + ss = ss.reshape((len(ss), 1)) + Data = np.append(Data, ss, 1) + # bound between 0 and 1 where 1 is a circle, perfectly spherical, and 0 is not at all. + # would be better if we had feret diameter instead of major axis. 
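+        # e.g. (hypothetical values) a max inscribed radius of 5 with a major axis
+        # length of 10 gives sphericity = 2 * 5 / 10 = 1.0 (circle-like), while
+        # 2 * 2 / 10 = 0.4 indicates an elongated shape.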
+ if FeatureNames is not None: + FeatureNames.extend(["Sphericity"]) + + if add_InscribedArea: + aa = [] + for i in range(len(Data)): + aa.append(Data[i, 1] * Data[i, 1] * np.pi / Data[i, 11]) + + aa = np.asarray(aa) + aa = aa.reshape((len(aa), 1)) + Data = np.append(Data, aa, 1) + if FeatureNames is not None: + FeatureNames.extend(["InArea"]) + + if add_BlebRel: + bb = [] + for i in range(len(Data)): + bb.append(Data[i, 12] / Data[i, 11]) + + bb = np.asarray(bb) + bb = bb.reshape((len(bb), 1)) + Data = np.append(Data, bb, 1) + if FeatureNames is not None: + FeatureNames.extend(["Bleb_Rel"]) + + if FeatureNames is not None: + return (Data, FeatureNames) + else: + return Data + + +def Exclude_Measures( + Data, + FeatureNames=None, + ex_Area=False, + ex_MjrAxis=False, + ex_MnrAxis=False, + ex_Ecc=False, + ex_ConA=False, + ex_EqD=False, + ex_Sol=False, + ex_Ext=False, + ex_Per=False, + ex_conPer=False, + ex_FL=False, + ex_InR=False, + ex_bleb=False, +): + # Area,MjrAxis,MnrAxis,Ecc,ConA,EqD,Sol,Ext,Per,conPer,FL,InR + + del_cols = [] + if ex_Area: + del_cols.append(0) + if ex_MjrAxis: + del_cols.append(1) + if ex_MnrAxis: + del_cols.append(2) + if ex_Ecc: + del_cols.append(3) + if ex_ConA: + del_cols.append(4) + if ex_EqD: + del_cols.append(5) + if ex_Sol: + del_cols.append(6) + if ex_Ext: + del_cols.append(7) + if ex_Per: + del_cols.append(8) + if ex_conPer: + del_cols.append(9) + if ex_FL: + del_cols.append(10) + if ex_InR: + del_cols.append(11) + if ex_bleb: + del_cols.append(12) + + Data = np.delete(Data, del_cols, 1) + if FeatureNames is not None: + FeatureNames = [i for j, i in enumerate(FeatureNames) if j not in del_cols] + return (Data, FeatureNames) + else: + return Data + + +def open_and_save_test_data(fpath, csvfilename, txtfilename, ratio): + # fpath = '/volumes/chris stuff/chemsensing/chemsensing/Y27632_120518/Results/' + # /Rho_Act_120118/Results_after/' + # filename = 'FinalResults_after' + # option to delete certain measures if done so in training. + # order should go like + # %frame number%correctedNum%area%centroidx%centroidy%major%minor%eccentricity + # %orientation%convex area%filledarea%equivDiameter%solidity%extent%perimeter + # %perimeter old%convex perimeter%fiber length%%max in radii%bleb length%centersx%centersy + + data = np.genfromtxt( + fpath + csvfilename + ".csv", + delimiter=",", + usecols=[2, 5, 6, 7, 9, 11, 12, 13, 14, 16, 17, 18, 19], + skip_header=1, + ) + # was cols 3,6,7,8,10,12,13,14,15 + frames_cell = np.genfromtxt( + fpath + csvfilename + ".csv", delimiter=",", usecols=[0, 1], skip_header=1 + ) + # add aspect ratio as last column of data + + data[:, 0] = data[:, 0] * ratio * ratio # area + data[:, 1] = data[:, 1] * ratio # mjr + data[:, 2] = data[:, 2] * ratio # MnrAxis + # ecc unitless + data[:, 4] = data[:, 4] * ratio * ratio # ConvexArea + data[:, 5] = data[:, 5] * ratio # EquivDiameter + # Solidity + # Extent + data[:, 8] = data[:, 8] * ratio # Perimeter + data[:, 9] = data[:, 9] * ratio # conPerim + data[:, 10] = data[:, 10] * ratio # FibLen + data[:, 11] = data[:, 11] * ratio # max inscribed r + data[:, 12] = data[:, 12] * ratio # bleblen + + preds = np.genfromtxt( + fpath + "/" + txtfilename + ".txt", + delimiter=" ", + usecols=[4, 5, 6, 7], + skip_header=1, + ) + y_target = np.where(np.max(preds, 1) > 0.7, np.argmax(preds, 1), 4) + # y_target = np.reshape(y_target,(len(y_target),1)) + + return (data, y_target, frames_cell) -- cgit v1.2.1