author | Christian C <cc@localhost> | 2024-11-11 12:29:32 -0800
---|---|---
committer | Christian C <cc@localhost> | 2024-11-11 12:29:32 -0800
commit | b85ee9d64a536937912544c7bbd5b98b635b7e8d (patch) |
tree | cef7bc17d7b29f40fc6b1867d0ce0a742d5583d0 /code/sunlab/transform_data.py |
Initial commit
Diffstat (limited to 'code/sunlab/transform_data.py')
-rw-r--r-- | code/sunlab/transform_data.py | 799 |
1 file changed, 799 insertions, 0 deletions
diff --git a/code/sunlab/transform_data.py b/code/sunlab/transform_data.py
new file mode 100644
index 0000000..d6e3813
--- /dev/null
+++ b/code/sunlab/transform_data.py
@@ -0,0 +1,799 @@
+from sklearn import preprocessing
+import numpy as np
+
+# import features
+
+
+def import_train_set(train_file_name="AllResults.txt"):
+    featurelist = []
+
+    with open(train_file_name, "r") as infile:
+        for line in infile:
+            featurelist.append(line.strip())
+
+    # so now, featurelist[0] has the names of things in the form
+    # 'Area, MajorAxisLength, ... Class'
+    FeatureNames = [x.strip() for x in featurelist[0].split(",")]
+    # FeatureNames has form ['Area','MajorAxisLength',....'Class']
+    # which is what I wanted
+
+    AllData = [
+        [float(x.strip()) for x in featurelist[i].split(",")]
+        for i in range(1, len(featurelist))
+    ]
+
+    # Data is in form [[1,2,3,....0.0],[3,3,1,...0.0],...[5,3,1,...0.0]],
+    # the last entry of each row is the class.
+
+    classes = [int(i[-1]) for i in AllData]
+
+    # classes contains the class number from which the data came
+
+    # want to delete the target from AllData.
+
+    X = [i[0:-1] for i in AllData]
+
+    # X has form similar to Data. So when we reshape, we want the output to be
+    # X = array([[0,1,2,...]
+    #            [1,2,3,...]])
+
+    Data = np.asarray(X, order="F")
+
+    # this has the right form; it uses Fortran column-major memory layout vs. row-major C-style.
+    # the notation is scientific, where the iris data set looks like a float. CHECKED: both are type numpy.float64,
+    # both have the same indexing calls, so I think we're in business.
+
+    # looks exactly correct, or at least like the iris data set target.
+    Target = np.asarray(classes)
+    return (Data, Target)
+
+
+########################################################################
+# for training purposes, the number of samples in data must be divisible by 256
+
+
+def Trim_Train_Data(Data, Target, max_length=None, balance=False):
+    ####
+    # Inputs: Data is a numpy array with N samples (rows) and M measures (cols)
+    #         Target is 1xN samples with ground truth
+    #         max_length defines the maximum length of the training data. Should be divisible by 256.
+    #         balance is a boolean; set True if you wish to have the same number of samples in each class.
+    print("Class lengths are = ", [sum(Target == i) for i in set(Target)])
+    if not balance:
+        if np.shape(Data)[0] / 256 != np.round(np.shape(Data)[0] / 256) or (
+            max_length is not None and max_length < np.shape(Data)[0]
+        ):
+            print("Trimming data for training purposes...")
+            if not max_length:
+                max_length = int(256 * np.floor(np.shape(Data)[0] / 256))
+            else:
+                if max_length / 256 != np.round(max_length / 256):
+                    # must make it divisible by 256
+                    max_length = int(np.floor(max_length / 256) * 256)
+                    print(
+                        "Your given max_length was not divisible by 256. New max length is = %d"
+                        % max_length
+                    )
+            # determine percentages of each class.
+            cs = np.unique(Target)
+            ps = np.zeros(len(cs))
+            rows_to_take = np.array([])
+            for i in range(len(cs)):
+                ps[i] = np.sum(Target == cs[i]) / len(Target)
+                goodrows = np.where(Target == cs[i])[0]
+                rows_to_take = np.append(
+                    rows_to_take, goodrows[0 : int(np.floor(ps[i] * max_length))]
+                )
+
+            ad_row = 0
+            class_ind = 0
+            while len(rows_to_take) != max_length:
+                # need to supplement.
+                goodrows = np.where(Target == cs[class_ind])[0]
+                rows_to_take = np.append(
+                    rows_to_take,
+                    goodrows[int(np.floor(ps[class_ind] * max_length)) + 1 + ad_row],
+                )
+                class_ind = class_ind + 1
+                if class_ind >= len(cs):
+                    class_ind = 0
+                    ad_row = ad_row + 1
+            rows_to_take = rows_to_take.astype(int)
+            X_train_scaled = Data[rows_to_take, :]
+            Y_train = Target[rows_to_take]
+            print("Complete")
+        else:
+            X_train_scaled = Data
+            Y_train = Target
+        print("Final training length = %d" % X_train_scaled.shape[0])
+        print(
+            "Class lengths after trimming are = ",
+            [sum(Y_train == i) for i in set(Y_train)],
+        )
+        return (X_train_scaled, Y_train)
+    else:
+        # determine which class has the minimum number of cases.
+        cs = np.unique(Target)
+        lens = np.zeros((len(cs)))
+        for i in range(len(cs)):
+            lens[i] = sum(Target == cs[i])
+
+        # randomly sample that number of samples from each class.
+        min_len = int(min(lens))
+        rows_to_take = np.array([])
+        for i in range(len(cs)):
+            possiblerows = np.where(Target == cs[i])[0]
+            # now sample without replacement.
+            rows_to_take = np.append(
+                rows_to_take, np.random.choice(possiblerows, min_len, replace=False)
+            )
+        if len(rows_to_take) / 256 != np.round(len(rows_to_take) / 256) or (
+            max_length is not None and max_length < len(rows_to_take)
+        ):
+            # trim until correct size.
+            if not max_length:
+                max_length = int(256 * np.floor(np.shape(Data)[0] / 256))
+            else:
+                if max_length / 256 != np.round(max_length / 256):
+                    # must make it divisible by 256
+                    max_length = int(np.floor(max_length / 256) * 256)
+                    print(
+                        "Your given max_length was not divisible by 256. New max length is = %d"
+                        % max_length
+                    )
+            # use min_len now to delete entries.
+            timearound = 0
+            pheno = len(cs)  # start at the end
+            while len(rows_to_take) > max_length:
+                # entry to delete is
+                # first (min_len-round)*range(1,len(np.unique(Target))+1) - 1
+                # print("%d entry delete" % (((min_len-timearound)*pheno) - 1))
+                rows_to_take = np.delete(
+                    rows_to_take, ((min_len - timearound) * pheno) - 1
+                )
+                pheno = pheno - 1
+                if pheno < 1:
+                    pheno = len(cs)
+                    timearound = timearound + 1
+        rows_to_take = rows_to_take.astype(int)
+        X_train_scaled = Data[rows_to_take, :]
+        Y_train = Target[rows_to_take]
+        print("Final training length = %d" % X_train_scaled.shape[0])
+        print(
+            "Class lengths after trimming are = ",
+            [sum(Y_train == i) for i in set(Y_train)],
+        )
+        return (X_train_scaled, Y_train)
+
+
+#############################REMOVE OUTLIER DATA########################
+# How? Do this after scaling the data, then compute a z-score. We'll check the data after that.
+
+
+def Remove_Outliers(Data, Target):
+    # for each class, detect outliers.
+    # we'll begin by using z-scoring. This assumes the data is described by a Gaussian,
+    # which is why it is vital to do this AFTER scaling the data.
+    # I plotted the data, it is absolutely not Gaussian.
+    # I tried the DBSCAN machine learning algorithm but it is really not helpful.
+    # However, the data IS perhaps Gaussian after embedding. We can clean the signal AFTER by sending in
+    # the embedded data in 1, 2, or 3 dimensions and removing points that are beyond a standard deviation.
+    # Data is TSNE embedded.
+    zscores = np.zeros(np.shape(Data))
+    for pheno in np.unique(Target):
+        # find rows where the phenotype is correct.
+        prows = np.where(Target == pheno)[0]
+        for dim in range(np.shape(Data)[1]):
+            # calculate the mean.
+            m = np.mean(Data[prows, dim])
+            # calculate the std.
+            s = np.std(Data[prows, dim])
+            for example in range(len(prows)):
+                zscores[prows[example], dim] = (Data[prows[example], dim] - m) / s
+
+    # now you have calculated the z-scores for each element. Apply a threshold.
+    # good "rule-of-thumb" thresholds can be: 2.5, 3, 3.5, or more.
+    zthresh = 2.5
+
+    zscores = np.abs(zscores) > zthresh
+
+    badrows = [i for i in range(np.shape(zscores)[0]) if zscores[i].any()]
+
+    Data = np.delete(Data, badrows, axis=0)
+    Target = np.delete(Target, badrows, axis=0)
+
+    return (Data, Target)
+
+
+##############################POST AUGMENTATION#########################
+def Augment_Size(Data, Target, max_copies=0, s=0.2, balance=False, augment_class=None):
+    max_copies = int(max_copies)
+    # augment only the copies made, by scaling the unit-based measures.
+    # Measures should go: Area, MjrAxis, MnrAxis, Ecc,ConA,EqD,Sol,Ext,Per,conPer,fiber_length,InscribeR,bleb_len
+
+    # first, determine if we desire class balance.
+    if balance:
+        # determine which class has the maximum number of samples.
+        cs = np.unique(Target)
+        vals = [sum(Target == cs[i]) for i in range(len(cs))]
+        print(
+            "Class %d has max number of samples, increasing other classes via size augmentation"
+            % np.argmax(vals)
+        )
+        for i in range(len(cs)):
+            if i != np.argmax(vals):
+                # determine how many samples need to be made.
+                to_make = int(vals[np.argmax(vals)] - vals[i])
+                # randomly sample rows from Data with the correct phenotype cs[i]
+                possible_rows = np.where(Target == cs[i])[0]
+                # sample to_make row indices from possible_rows.
+                sampled_rows = np.random.choice(possible_rows, to_make, replace=True)
+                newrows = Data[sampled_rows, :]
+                size_vary = s * np.random.rand(1, to_make)[0]
+                # vary size.
+                for v in range(to_make):
+                    if np.random.rand() < 0.5:
+                        newrows[v, 0] = (
+                            newrows[v, 0] + newrows[v, 0] * size_vary[v] * size_vary[v]
+                        )
+                        newrows[v, 1] = newrows[v, 1] + newrows[v, 1] * size_vary[v]
+                        newrows[v, 2] = newrows[v, 2] + newrows[v, 2] * size_vary[v]
+                        newrows[v, 4] = (
+                            newrows[v, 4] + newrows[v, 4] * size_vary[v] * size_vary[v]
+                        )
+                        newrows[v, 5] = newrows[v, 5] + newrows[v, 5] * size_vary[v]
+                        newrows[v, 7] = newrows[v, 7] + newrows[v, 7] * size_vary[v]
+                        newrows[v, 8] = newrows[v, 8] + newrows[v, 8] * size_vary[v]
+                        newrows[v, 9] = newrows[v, 9] + newrows[v, 9] * size_vary[v]
+                        newrows[v, 10] = newrows[v, 10] + newrows[v, 10] * size_vary[v]
+                        newrows[v, 11] = newrows[v, 11] + newrows[v, 11] * size_vary[v]
+                    else:
+                        newrows[v, 0] = (
+                            newrows[v, 0] - newrows[v, 0] * size_vary[v] * size_vary[v]
+                        )
+                        newrows[v, 1] = newrows[v, 1] - newrows[v, 1] * size_vary[v]
+                        newrows[v, 2] = newrows[v, 2] - newrows[v, 2] * size_vary[v]
+                        newrows[v, 4] = (
+                            newrows[v, 4] - newrows[v, 4] * size_vary[v] * size_vary[v]
+                        )
+                        newrows[v, 5] = newrows[v, 5] - newrows[v, 5] * size_vary[v]
+                        newrows[v, 7] = newrows[v, 7] - newrows[v, 7] * size_vary[v]
+                        newrows[v, 8] = newrows[v, 8] - newrows[v, 8] * size_vary[v]
+                        newrows[v, 9] = newrows[v, 9] - newrows[v, 9] * size_vary[v]
+                        newrows[v, 10] = newrows[v, 10] - newrows[v, 10] * size_vary[v]
+                        newrows[v, 11] = newrows[v, 11] - newrows[v, 11] * size_vary[v]
+                Data = np.concatenate((Data, newrows), axis=0)
+                yadd = np.ones(to_make) * cs[i]
+                Target = np.concatenate((Target, yadd.astype(int)), axis=0)
+
+        Data = Data[np.argsort(Target), :]
+        Target = Target[np.argsort(Target)]
+
+    if augment_class is None:
+        if max_copies > 0:
+            print(
+                "Augmenting each class with additional %d samples via size augmentation"
+                % max_copies
+            )
+            cs = np.unique(Target)
+            for i in range(len(cs)):
+                # generate n = max_copies new rows of Data for class cs[i].
+                possible_rows = np.where(Target == cs[i])[0]
+                # sample max_copies row indices from possible_rows.
+                sampled_rows = np.random.choice(possible_rows, max_copies, replace=True)
+                newrows = Data[sampled_rows, :]
+                size_vary = s * np.random.rand(1, max_copies)[0]
+                # vary size.
+                for v in range(max_copies):
+                    if np.random.rand() < 0.5:
+                        newrows[v, 0] = (
+                            newrows[v, 0] + newrows[v, 0] * size_vary[v] * size_vary[v]
+                        )
+                        newrows[v, 1] = newrows[v, 1] + newrows[v, 1] * size_vary[v]
+                        newrows[v, 2] = newrows[v, 2] + newrows[v, 2] * size_vary[v]
+                        newrows[v, 4] = (
+                            newrows[v, 4] + newrows[v, 4] * size_vary[v] * size_vary[v]
+                        )
+                        newrows[v, 5] = newrows[v, 5] + newrows[v, 5] * size_vary[v]
+                        newrows[v, 7] = newrows[v, 7] + newrows[v, 7] * size_vary[v]
+                        newrows[v, 8] = newrows[v, 8] + newrows[v, 8] * size_vary[v]
+                        newrows[v, 9] = newrows[v, 9] + newrows[v, 9] * size_vary[v]
+                        newrows[v, 10] = newrows[v, 10] + newrows[v, 10] * size_vary[v]
+                        newrows[v, 11] = newrows[v, 11] + newrows[v, 11] * size_vary[v]
+                    else:
+                        newrows[v, 0] = (
+                            newrows[v, 0] - newrows[v, 0] * size_vary[v] * size_vary[v]
+                        )
+                        newrows[v, 1] = newrows[v, 1] - newrows[v, 1] * size_vary[v]
+                        newrows[v, 2] = newrows[v, 2] - newrows[v, 2] * size_vary[v]
+                        newrows[v, 4] = (
+                            newrows[v, 4] - newrows[v, 4] * size_vary[v] * size_vary[v]
+                        )
+                        newrows[v, 5] = newrows[v, 5] - newrows[v, 5] * size_vary[v]
+                        newrows[v, 7] = newrows[v, 7] - newrows[v, 7] * size_vary[v]
+                        newrows[v, 8] = newrows[v, 8] - newrows[v, 8] * size_vary[v]
+                        newrows[v, 9] = newrows[v, 9] - newrows[v, 9] * size_vary[v]
+                        newrows[v, 10] = newrows[v, 10] - newrows[v, 10] * size_vary[v]
+                        newrows[v, 11] = newrows[v, 11] - newrows[v, 11] * size_vary[v]
+                Data = np.concatenate((Data, newrows), axis=0)
+                yadd = np.ones(max_copies) * cs[i]
+                Target = np.concatenate((Target, yadd.astype(int)), axis=0)
+
+            Data = Data[np.argsort(Target), :]
+            Target = Target[np.argsort(Target)]
+
+    else:
+        augment_class = int(augment_class)
+        if max_copies > 0:
+            print(
+                "Augmenting Class = %d with additional %d samples via size augmentation"
+                % (augment_class, max_copies)
+            )
+            # generate n = max_copies new rows of Data.
+            possible_rows = np.where(Target == augment_class)[0]
+            # sample max_copies row indices from possible_rows.
+            sampled_rows = np.random.choice(possible_rows, max_copies, replace=True)
+            newrows = Data[sampled_rows, :]
+            size_vary = s * np.random.rand(1, max_copies)[0]
+            # vary size.
+            for v in range(max_copies):
+                if np.random.rand() < 0.5:
+                    newrows[v, 0] = (
+                        newrows[v, 0] + newrows[v, 0] * size_vary[v] * size_vary[v]
+                    )
+                    newrows[v, 1] = newrows[v, 1] + newrows[v, 1] * size_vary[v]
+                    newrows[v, 2] = newrows[v, 2] + newrows[v, 2] * size_vary[v]
+                    newrows[v, 4] = (
+                        newrows[v, 4] + newrows[v, 4] * size_vary[v] * size_vary[v]
+                    )
+                    newrows[v, 5] = newrows[v, 5] + newrows[v, 5] * size_vary[v]
+                    newrows[v, 7] = newrows[v, 7] + newrows[v, 7] * size_vary[v]
+                    newrows[v, 8] = newrows[v, 8] + newrows[v, 8] * size_vary[v]
+                    newrows[v, 9] = newrows[v, 9] + newrows[v, 9] * size_vary[v]
+                    newrows[v, 10] = newrows[v, 10] + newrows[v, 10] * size_vary[v]
+                    newrows[v, 11] = newrows[v, 11] + newrows[v, 11] * size_vary[v]
+                else:
+                    newrows[v, 0] = (
+                        newrows[v, 0] - newrows[v, 0] * size_vary[v] * size_vary[v]
+                    )
+                    newrows[v, 1] = newrows[v, 1] - newrows[v, 1] * size_vary[v]
+                    newrows[v, 2] = newrows[v, 2] - newrows[v, 2] * size_vary[v]
+                    newrows[v, 4] = (
+                        newrows[v, 4] - newrows[v, 4] * size_vary[v] * size_vary[v]
+                    )
+                    newrows[v, 5] = newrows[v, 5] - newrows[v, 5] * size_vary[v]
+                    newrows[v, 7] = newrows[v, 7] - newrows[v, 7] * size_vary[v]
+                    newrows[v, 8] = newrows[v, 8] - newrows[v, 8] * size_vary[v]
+                    newrows[v, 9] = newrows[v, 9] - newrows[v, 9] * size_vary[v]
+                    newrows[v, 10] = newrows[v, 10] - newrows[v, 10] * size_vary[v]
+                    newrows[v, 11] = newrows[v, 11] - newrows[v, 11] * size_vary[v]
+            Data = np.concatenate((Data, newrows), axis=0)
+            yadd = np.ones(max_copies) * augment_class
+            Target = np.concatenate((Target, yadd.astype(int)), axis=0)
+
+            Data = Data[np.argsort(Target), :]
+            Target = Target[np.argsort(Target)]
+
+    return (Data, Target)
+
+
+########################################################################
+########################################################################
+#######                  IMPORT THE DEV SET                        #####
+########################################################################
+########################################################################
+def import_dev_set(dev_file_name="DevResults.txt"):
+    print("Importing the dev set...")
+
+    # import features
+    featurelist = []
+
+    with open(dev_file_name, "r") as infile:
+        for line in infile:
+            featurelist.append(line.strip())
+
+    # so now, featurelist[0] has the names of things in the form 'Area, MajorAxisLength, ... Class'
+    FeatureNames = [x.strip() for x in featurelist[0].split(",")]
+    # FeatureNames has form ['Area','MajorAxisLength',....'Class'] which is what I wanted
+
+    DevData = [
+        [float(x.strip()) for x in featurelist[i].split(",")]
+        for i in range(1, len(featurelist))
+    ]
+
+    # Data is in form [[1,2,3,....0.0],[3,3,1,...0.0],...[5,3,1,...0.0]], the last entry of each row is the class.
+
+    Devclasses = [int(i[-1]) for i in DevData]
+
+    # classes contains the class number from which the data came
+
+    # want to delete the target from DevData.
+
+    DevX = [i[0:-1] for i in DevData]
+
+    # X has form similar to Data. So when we reshape, we want the output to be
+    # X = array([[0,1,2,...]
+    #            [1,2,3,...]])
+
+    X_dev = np.asarray(DevX, order="F")
+
+    # add aspect ratio as last column of data
+    AR = []
+    for i in range(len(X_dev)):
+        AR.append(X_dev[i, 1] / X_dev[i, 2])
+
+    AR = np.asarray(AR)
+
+    AR = AR.reshape((len(AR), 1))
+
+    X_dev = np.append(X_dev, AR, 1)  # concatenates arrays appropriately.
+
+    # add form factor as last column of data
+    # P^2/Area
+    FF = []
+    for i in range(len(X_dev)):
+        FF.append(X_dev[i, 8] * X_dev[i, 8] / X_dev[i, 0])
+    FF = np.asarray(FF)
+    FF = FF.reshape((len(FF), 1))
+    X_dev = np.append(X_dev, FF, 1)
+
+    # this has the right form; it uses Fortran column-major memory layout vs. row-major C-style.
+    # the notation is scientific, where the iris data set looks like a float. CHECKED: both are type numpy.float64,
+    # both have the same indexing calls, so I think we're in business.
+
+    # looks exactly correct, or at least like the iris data set target.
+    y_dev = np.asarray(Devclasses)
+
+    return (X_dev, y_dev, FeatureNames)
+
+
+########################################################################
+#########DATA IS IN THE SAME FORM AS IS FOUND IN IRIS DATASET###########
+########################################################################
+# Target = Target classes (0-4) for training and validation (numpy.int64 array)
+# Data = Data for training and validation to be split (numpy.float64 array)
+# FeatureNames = Feature names for each column of data (python list of str)
+########################################################################
+# print("Data is now in the same form as that found in the Iris Dataset")
+# print("Splitting the training dataset into train/val")
+
+
+def apply_normalization(X_train, max_norm=False, l1_norm=False, l2_norm=False):
+    ########################################################
+    if max_norm:
+        print("Normalizing data using max_norm")
+        X_train = X_train / np.max(np.abs(X_train), 0)[None, :]
+    if l1_norm:
+        print("Normalizing data using l1_norm")
+        X_train = X_train / np.sum(X_train, 0)[None, :]
+    if l2_norm:
+        print("Normalizing data using l2_norm")
+        X_train = X_train / np.sqrt(np.sum(X_train * X_train, 0))[None, :]
+
+    return X_train
+
+
+########################################################################
+
+
+def preprocess_train_data(X_train, d=2):
+
+    ############### SPLITTING THE DATASET ##################
+    # First split the dataset so it is as if we only had a training set, then an eval set.
+    # X_train, X_test, y_train, y_test = train_test_split(Data, Target, test_size=.3)  # .25  # , random_state=
+    # default has shuffle=True. test_size sets the proportion of the data set to include in the test split.
+    ########################################################
+    if d > 1:
+        print("Increasing dimensionality of dataset using cross terms")
+        #################INCREASING FEATURES####################
+        poly = preprocessing.PolynomialFeatures(degree=d, interaction_only=True)
+        # IN SOME MODELS with 2 polynomial features, we are getting 90% exactly. In some polynomial 3 models,
+        # we are getting 90.83%, which is exactly even with deep learning models.
+
+        X_train = poly.fit_transform(X_train)
+        # target_feature_names = ['x'.join(['{}^{}'.format(pair[0], pair[1]) for pair in tup if pair[1] != 0])
+        #                         for tup in [zip(FeatureNames, p) for p in poly.powers_]]
+        # poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)
+        # X_test = poly.fit_transform(X_test)
+        # poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)
+        # X_dev = poly.fit_transform(X_dev)
+
+    ########################################################
+
+    print("Scaling the data")
+    ################# SCALE THE DATA #######################
+    # Scale the data. Each attribute in the dataset must be independently scaled, that is,
+    # 0 mean and unit variance. Doing this returns the z-scores of the data:
+    #     Z = (x - mu) / sigma
+
+    # alternative: preprocessing.QuantileTransformer(output_distribution='normal')
+    scaler = preprocessing.RobustScaler().fit(X_train)
+    # scaler = preprocessing.StandardScaler().fit(X_train)  # IMPORTANT NOTE: we are scaling based only on training data!
+
+    X_train_scaled = scaler.transform(X_train)
+
+    # X_test_scaled = scaler.transform(X_test)  # will be used later to evaluate the performance.
+
+    # X_dev_scaled = scaler.transform(X_dev)
+
+    ##########################################################
+
+    return (X_train_scaled, scaler)  # , target_feature_names)
+
+
+def preprocess_test_data(X_dev, scaler, d=2):
+    ############### SPLITTING THE DATASET ##################
+    # First split the dataset so it is as if we only had a training set, then an eval set.
+    # X_train, X_test, y_train, y_test = train_test_split(Data, Target, test_size=.3)  # .25  # , random_state=
+    # default has shuffle=True. test_size sets the proportion of the data set to include in the test split.
+    ########################################################
+
+    print("Increasing dimensionality of dataset using cross terms")
+    #################INCREASING FEATURES####################
+    poly = preprocessing.PolynomialFeatures(degree=d, interaction_only=True)
+    # IN SOME MODELS with 2 polynomial features, we are getting 90% exactly. In some polynomial 3 models,
+    # we are getting 90.83%, which is exactly even with deep learning models.
+
+    # X_train = poly.fit_transform(X_train)
+    # target_feature_names = ['x'.join(['{}^{}'.format(pair[0], pair[1]) for pair in tup if pair[1] != 0])
+    #                         for tup in [zip(FeatureNames, p) for p in poly.powers_]]
+    # poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)
+    # X_test = poly.fit_transform(X_test)
+    # poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)
+    X_dev = poly.fit_transform(X_dev)
+
+    ########################################################
+
+    print("Scaling the data")
+    ################# SCALE THE DATA #######################
+    # Scale the data. Each attribute in the dataset must be independently scaled, that is,
+    # 0 mean and unit variance. Doing this returns the z-scores of the data:
+    #     Z = (x - mu) / sigma
+
+    # scaler = preprocessing.StandardScaler().fit(X_train)  # IMPORTANT NOTE: we are scaling based only on training data!
+
+    # X_train_scaled = scaler.transform(X_train)
+
+    # X_test_scaled = scaler.transform(X_test)  # will be used later to evaluate the performance.
+
+    X_dev_scaled = scaler.transform(X_dev)
+
+    ##########################################################
+
+    return X_dev_scaled
+
+
+def Add_Measures(
+    Data,
+    FeatureNames=None,
+    add_AR=True,
+    add_FF=True,
+    add_convexity=True,
+    add_curl_old=True,
+    add_curl=True,
+    add_sphericity=True,
+    add_InscribedArea=True,
+    add_BlebRel=True,
+):
+    ############### EXPANDING THE DATASET ##################
+    # Add measures of Aspect Ratio, Form Factor, Convexity, Curl, and Sphericity.
+    # Input: Data must be an np array with N (rows) examples x M (cols) measures.
+    # Measures should go: Area, MjrAxis, MnrAxis, Ecc,ConA,EqD,Sol,Ext,Per,conPer,fiber_length,InscribeR,bleb_len
+    ########################################################
+    if add_AR:
+        AR = []
+        for i in range(len(Data)):
+            AR.append(Data[i, 1] / Data[i, 2])
+
+        AR = np.asarray(AR)
+
+        AR = AR.reshape((len(AR), 1))
+
+        Data = np.append(Data, AR, 1)  # concatenates arrays appropriately.
+        if FeatureNames is not None:
+            FeatureNames.extend(["AR"])
+
+    if add_FF:
+        # this measure is really compactness, if you multiply each by 4 pi
+        # note this is different from roundness, which would use convex perimeter
+        FF = []
+        for i in range(len(Data)):
+            FF.append(Data[i, 0] / (Data[i, 8] * Data[i, 8]))
+            # FF.append(Data[i,8]*Data[i,8] / Data[i,0])
+
+        FF = np.asarray(FF)
+        FF = FF.reshape((len(FF), 1))
+        Data = np.append(Data, FF, 1)
+        if FeatureNames is not None:
+            FeatureNames.extend(["FF"])
+
+    if add_convexity:
+        CC = []
+        for i in range(len(Data)):
+            CC.append(Data[i, 8] / Data[i, 9])
+
+        CC = np.asarray(CC)
+        CC = CC.reshape((len(CC), 1))
+        Data = np.append(Data, CC, 1)
+        if FeatureNames is not None:
+            FeatureNames.extend(["Convexity"])
+
+    if add_curl_old:
+        # tells how curled the object is. might help for lamellipodia.
+        # curl is length / fiber length. (I assume length here can be major axis length)
+        # fiber length definition is (perimeter - sqrt(perimeter^2 - 16*Area)) / 4
+
+        # this definition does not work for a circle. Note that the result will be imaginary.
+        # I changed the 16 to a 4*pi. This should be fine.
+        cc = []
+        for i in range(len(Data)):
+            if (4 * np.pi * Data[i, 0]) <= (Data[i, 8] * Data[i, 8]):
+                fiber_length = (
+                    Data[i, 8]
+                    - np.sqrt((Data[i, 8] * Data[i, 8]) - (4 * np.pi * Data[i, 0]))
+                ) / np.pi  # 4
+                cc.append(Data[i, 1] / fiber_length)
+            else:
+                fiber_length = Data[i, 8] / np.pi  # 4
+                cc.append(Data[i, 1] / fiber_length)
+
+        cc = np.asarray(cc)
+        cc = cc.reshape((len(cc), 1))
+        Data = np.append(Data, cc, 1)
+        if FeatureNames is not None:
+            FeatureNames.extend(["Curl_old"])
+
+    if add_curl:
+        cc = []
+        for i in range(len(Data)):
+            cc.append(Data[i, 1] / Data[i, 10])
+
+        cc = np.asarray(cc)
+        cc = cc.reshape((len(cc), 1))
+        Data = np.append(Data, cc, 1)
+        # bound between 0 and 1 if major axis length could be replaced by feret diameter.
+        if FeatureNames is not None:
+            FeatureNames.extend(["Curl"])
+
+    if add_sphericity:
+        ss = []
+        for i in range(len(Data)):
+            ss.append(Data[i, 11] * 2 / Data[i, 1])
+
+        ss = np.asarray(ss)
+        ss = ss.reshape((len(ss), 1))
+        Data = np.append(Data, ss, 1)
+        # bound between 0 and 1, where 1 is a circle (perfectly spherical) and 0 is not at all.
+        # would be better if we had feret diameter instead of major axis.
+        if FeatureNames is not None:
+            FeatureNames.extend(["Sphericity"])
+
+    if add_InscribedArea:
+        aa = []
+        for i in range(len(Data)):
+            aa.append(Data[i, 1] * Data[i, 1] * np.pi / Data[i, 11])
+
+        aa = np.asarray(aa)
+        aa = aa.reshape((len(aa), 1))
+        Data = np.append(Data, aa, 1)
+        if FeatureNames is not None:
+            FeatureNames.extend(["InArea"])
+
+    if add_BlebRel:
+        bb = []
+        for i in range(len(Data)):
+            bb.append(Data[i, 12] / Data[i, 11])
+
+        bb = np.asarray(bb)
+        bb = bb.reshape((len(bb), 1))
+        Data = np.append(Data, bb, 1)
+        if FeatureNames is not None:
+            FeatureNames.extend(["Bleb_Rel"])
+
+    if FeatureNames is not None:
+        return (Data, FeatureNames)
+    else:
+        return Data
+
+
+def Exclude_Measures(
+    Data,
+    FeatureNames=None,
+    ex_Area=False,
+    ex_MjrAxis=False,
+    ex_MnrAxis=False,
+    ex_Ecc=False,
+    ex_ConA=False,
+    ex_EqD=False,
+    ex_Sol=False,
+    ex_Ext=False,
+    ex_Per=False,
+    ex_conPer=False,
+    ex_FL=False,
+    ex_InR=False,
+    ex_bleb=False,
+):
+    # Area,MjrAxis,MnrAxis,Ecc,ConA,EqD,Sol,Ext,Per,conPer,FL,InR
+
+    del_cols = []
+    if ex_Area:
+        del_cols.append(0)
+    if ex_MjrAxis:
+        del_cols.append(1)
+    if ex_MnrAxis:
+        del_cols.append(2)
+    if ex_Ecc:
+        del_cols.append(3)
+    if ex_ConA:
+        del_cols.append(4)
+    if ex_EqD:
+        del_cols.append(5)
+    if ex_Sol:
+        del_cols.append(6)
+    if ex_Ext:
+        del_cols.append(7)
+    if ex_Per:
+        del_cols.append(8)
+    if ex_conPer:
+        del_cols.append(9)
+    if ex_FL:
+        del_cols.append(10)
+    if ex_InR:
+        del_cols.append(11)
+    if ex_bleb:
+        del_cols.append(12)
+
+    Data = np.delete(Data, del_cols, 1)
+    if FeatureNames is not None:
+        FeatureNames = [i for j, i in enumerate(FeatureNames) if j not in del_cols]
+        return (Data, FeatureNames)
+    else:
+        return Data
+
+
+def open_and_save_test_data(fpath, csvfilename, txtfilename, ratio):
+    # fpath = '/volumes/chris stuff/chemsensing/chemsensing/Y27632_120518/Results/'
+    # /Rho_Act_120118/Results_after/'
+    # filename = 'FinalResults_after'
+    # option to delete certain measures if done so in training.
+    # order should go like
+    # %frame number%correctedNum%area%centroidx%centroidy%major%minor%eccentricity
+    # %orientation%convex area%filledarea%equivDiameter%solidity%extent%perimeter
+    # %perimeter old%convex perimeter%fiber length%%max in radii%bleb length%centersx%centersy
+
+    data = np.genfromtxt(
+        fpath + csvfilename + ".csv",
+        delimiter=",",
+        usecols=[2, 5, 6, 7, 9, 11, 12, 13, 14, 16, 17, 18, 19],
+        skip_header=1,
+    )
+    # was cols 3,6,7,8,10,12,13,14,15
+    frames_cell = np.genfromtxt(
+        fpath + csvfilename + ".csv", delimiter=",", usecols=[0, 1], skip_header=1
+    )
+
+    # convert pixel measures to physical units via the pixel ratio
+    data[:, 0] = data[:, 0] * ratio * ratio  # Area
+    data[:, 1] = data[:, 1] * ratio  # MjrAxis
+    data[:, 2] = data[:, 2] * ratio  # MnrAxis
+    # Ecc unitless
+    data[:, 4] = data[:, 4] * ratio * ratio  # ConvexArea
+    data[:, 5] = data[:, 5] * ratio  # EquivDiameter
+    # Solidity
+    # Extent
+    data[:, 8] = data[:, 8] * ratio  # Perimeter
+    data[:, 9] = data[:, 9] * ratio  # conPerim
+    data[:, 10] = data[:, 10] * ratio  # FibLen
+    data[:, 11] = data[:, 11] * ratio  # max inscribed r
+    data[:, 12] = data[:, 12] * ratio  # bleblen
+
+    preds = np.genfromtxt(
+        fpath + "/" + txtfilename + ".txt",
+        delimiter=" ",
+        usecols=[4, 5, 6, 7],
+        skip_header=1,
+    )
+    y_target = np.where(np.max(preds, 1) > 0.7, np.argmax(preds, 1), 4)
+    # y_target = np.reshape(y_target,(len(y_target),1))
+
+    return (data, y_target, frames_cell)
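
For orientation, the sketch below shows one way the functions in this commit might be chained into a training / evaluation pipeline. Only the function names come from `transform_data.py`; the file names, path, `ratio`, and `max_length` values are illustrative assumptions, and the module itself does not ship such a driver script.

```python
# Hypothetical driver; assumes transform_data.py is importable and that the
# placeholder files below exist with the 13-column measure layout described above.
from transform_data import (
    import_train_set,
    Add_Measures,
    Trim_Train_Data,
    preprocess_train_data,
    open_and_save_test_data,
    preprocess_test_data,
)

# Training side: load measures, append the derived shape features, balance and
# trim to a multiple of 256, then expand with interaction terms and fit the scaler.
Data, Target = import_train_set("AllResults.txt")
Data = Add_Measures(Data)
Data, Target = Trim_Train_Data(Data, Target, max_length=2048, balance=True)
X_train_scaled, scaler = preprocess_train_data(Data, d=2)

# Test side: load a results CSV plus network predictions (paths and ratio are
# placeholders), apply the same derived measures, then reuse the fitted scaler.
test_data, y_test, frames = open_and_save_test_data(
    "Results/", "FinalResults_after", "predictions", ratio=0.5
)
test_data = Add_Measures(test_data)
X_test_scaled = preprocess_test_data(test_data, scaler, d=2)
```

The point the module's own comments stress is that `preprocess_train_data` fits the `RobustScaler` on training data only, while `preprocess_test_data` reuses that fitted scaler (and the same polynomial degree) rather than refitting on evaluation data.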