isebenius · filbe1 · Apr 17, 2024
diff --git a/MIND.py b/MIND.py
@@ -1,11 +1,8 @@
-import sys
-import os
-import numpy as np
-import pandas as pd
-from MIND_helpers import calculate_mind_network, is_outlier
+from MIND_helpers import calculate_mind_network
 from get_vertex_df import get_vertex_df
 
-def compute_MIND(surf_dir, features, parcellation, filter_vertices=False, resample=False, n_samples = 4000):
+
+def compute_MIND(surf_dir, features, parcellation, filter_vertices=False, resample=False, n_samples=4000):
 
 	vertex_data, regions, features_used = get_vertex_df(surf_dir, features, parcellation)
 
@@ -18,8 +15,9 @@ def compute_MIND(surf_dir, features, parcellation, filter_vertices=False, resamp
 	columns = ['Label'] + features_used
 
 	feature_conv_dict = dict(zip(list(features), list(features_used)))
-	#The filter_vertices parameter determines you want to filter out all the non-biologically feasible vertices (i.e. any of volume, surface area or cortical thickness equalling zero)	
-	if filter_vertices == True:
+	# The filter_vertices parameter determines you want to filter out all the non-biologically feasible vertices
+	# (i.e. any of volume, surface area or cortical thickness equalling zero)
+	if filter_vertices:
 
 		if 'CT' in features:
 			vertex_data = vertex_data.drop(vertex_data[vertex_data[feature_conv_dict['CT']] == 0].index)
@@ -32,17 +30,18 @@ def compute_MIND(surf_dir, features, parcellation, filter_vertices=False, resamp
 
 	vertex_data = vertex_data[columns]
 
-	#standardize across the brain for each feature to get each dimension to roughly the same scale.
+	# standardize across the brain for each feature to get each dimension to roughly the same scale.
 	for x in features_used:
 		vertex_data[x] = (vertex_data[x] - vertex_data[x].mean())/vertex_data[x].std()
 
-	#Drop outliers. This drops vertices with an MAD score in ANY of the used features. Can be customized.
-	# z_score_threshhold = 7
-	# outliers_per_features = np.array([is_outlier(vertex_data[x].values, z_score_threshhold) for x in features_used]).T
+	# Drop outliers. This drops vertices with an MAD score in ANY of the used features. Can be customized.
+	# import numpy as np
+	# z_score_threshold = 7
+	# outliers_per_features = np.array([is_outlier(vertex_data[x].values, z_score_threshold) for x in features_used]).T
 	# vertex_data = vertex_data.loc[np.sum(outliers_per_features, axis = 1) == 0]
 
 	print('Computing MIND...')
-	#calculate MIND network
+	# calculate MIND network
 	MIND = calculate_mind_network(vertex_data, features_used, regions, resample=resample, n_samples = n_samples)
 
 	print('Done!')

diff --git a/MIND_helpers.py b/MIND_helpers.py
@@ -4,8 +4,8 @@
 from scipy import stats
 from collections import defaultdict
 
-def is_outlier(points, thresh=7): #taken from https://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data
 
+def is_outlier(points, thresh=7):
     """
     Returns a boolean array with True if points are outliers and False 
     otherwise.
@@ -28,7 +28,7 @@ def is_outlier(points, thresh=7): #taken from https://stackoverflow.com/question
         Statistical Techniques, Edward F. Mykytka, Ph.D., Editor. 
     """
     if len(points.shape) == 1:
-        points = points[:,None]
+        points = points[:, None]
     median = np.median(points, axis=0)
     diff = np.sum((points - median)**2, axis=-1)
     diff = np.sqrt(diff)
@@ -38,7 +38,8 @@ def is_outlier(points, thresh=7): #taken from https://stackoverflow.com/question
 
     return modified_z_score > thresh
 
-def get_KDTree(x): #Inspired by https://gist.github.com/atabakd/ed0f7581f8510c8587bc2f41a094b518
+
+def get_KDTree(x):  # Inspired by https://gist.github.com/atabakd/ed0f7581f8510c8587bc2f41a094b518
 
     # Check the dimensions are consistent
     x = np.atleast_2d(x)
@@ -48,32 +49,32 @@ def get_KDTree(x): #Inspired by https://gist.github.com/atabakd/ed0f7581f8510c85
 
     return xtree
 
-def get_KL(x, y, xtree, ytree): #Inspired by https://gist.github.com/atabakd/ed0f7581f8510c8587bc2f41a094b518
 
-    
+def get_KL(x, y, xtree, ytree):  # Inspired by https://gist.github.com/atabakd/ed0f7581f8510c8587bc2f41a094b518
     x = np.atleast_2d(x)
     y = np.atleast_2d(y)
 
     x = np.atleast_2d(x)
     y = np.atleast_2d(y)
 
-    n,d = x.shape
-    m,dy = y.shape
+    n, d = x.shape
+    m, dy = y.shape
 
-    #Check dimensions
-    assert(d == dy)
+    # Check dimensions
+    assert (d == dy)
 
     # Get the first two nearest neighbours for x, since the closest one is the
     # sample itself.
-    r = xtree.query(x, k=2, eps=.01, p=2)[0][:,1]
+    r = xtree.query(x, k=2, eps=.01, p=2)[0][:, 1]
     s = ytree.query(x, k=1, eps=.01, p=2)[0]
 
     rs_ratio = r/s
 
-    #Remove points with zero, nan, or infinity. This happens when two regions have a vertex with the exact same value – an occurence that basically onnly happens for the single feature MSNs
-    #and has to do with FreeSurfer occasionally outputting the exact same value for different vertices.
+    # Remove points with zero, nan, or infinity. This happens when two regions have a vertex with the exact same value
+    # an occurrence that basically only happens for the single feature MSNs
+    # and has to do with FreeSurfer occasionally outputting the exact same value for different vertices.
     rs_ratio = rs_ratio[np.isfinite(rs_ratio)]
-    rs_ratio = rs_ratio[rs_ratio!=0.0]
+    rs_ratio = rs_ratio[rs_ratio != 0.0]
 
     # There is a mistake in the paper. In Eq. 14, the right side misses a negative sign
     # on the first term of the right hand side.
@@ -84,33 +85,36 @@ def get_KL(x, y, xtree, ytree): #Inspired by https://gist.github.com/atabakd/ed0
     return kl
 
 
-def calculate_mind_network(data_df, feature_cols, region_list, resample=False, n_samples = 4000):
-
-    MIND = pd.DataFrame(np.zeros((len(region_list), len(region_list))), \
-                        index = region_list, columns = region_list)
+def calculate_mind_network(data_df, feature_cols, region_list, resample=False, n_samples=4000):
+    MIND = pd.DataFrame(np.zeros((len(region_list), len(region_list))),
+                        index=region_list, columns=region_list)
 
-    #Get only desired regions
+    # Get only desired regions
     data_df = data_df.loc[data_df['Label'].isin(region_list)]
 
-    #Resample dataset if resample has been set to True and if it is UNIVARIATE ONLY. This should only be done if you are using a single feature which contains repeated values.
-    if (len(feature_cols) == 1) and resample==True:   
+    # Resample dataset if resample has been set to True and if it is UNIVARIATE ONLY. This should only be done if you
+    # are using a single feature which contains repeated values.
+    if (len(feature_cols) == 1) and resample:
         n_samples = n_samples
-        resampled_dataset = pd.DataFrame(np.zeros((n_samples, len(region_list))), columns = region_list)
+        resampled_dataset = pd.DataFrame(np.zeros((n_samples, len(region_list))), columns=region_list)
 
         for name, data in data_df.groupby('Label'):
             resampled_dataset[name] = stats.gaussian_kde(data[feature_cols[0]]).resample(n_samples)[0]
 
-        resampled_dataset = resampled_dataset.melt(var_name = 'Label', value_name = feature_cols[0])
+        resampled_dataset = resampled_dataset.melt(var_name='Label', value_name=feature_cols[0])
         data_df = resampled_dataset
 
-    if (len(feature_cols) > 1) and resample==True:   
-        raise Exception("Resampling the data is only supported if you are using a single feature -- this is because higher order density estimation can be unreliable and very computationally expensive.")
+    if (len(feature_cols) > 1) and resample:
+        raise Exception("Resampling the data is only supported if you are using a single feature -- this is because "
+                        "higher order density estimation can be unreliable and very computationally expensive.")
 
-    #Check that there aren't many repeated values
+    # Check that there aren't many repeated values
     percent_unique_vals = len(data_df[feature_cols].drop_duplicates())/len(data_df[feature_cols])
 
     if percent_unique_vals < 0.8:
-        raise Exception("There are many repeated values in the data, which compromises the validity of MIND calculation. Please minimize the number of repeated values in the data and try again. If you are using only one feature, try rerunning with resample=True.")
+        raise Exception("There are many repeated values in the data, which compromises the validity of MIND "
+                        "calculation. Please minimize the number of repeated values in the data and try again. "
+                        "If you are using only one feature, try rerunning with resample=True.")
 
     grouped_data = data_df.groupby('Label')
 
@@ -128,7 +132,7 @@ def calculate_mind_network(data_df, feature_cols, region_list, resample=False, n
             if name_x == name_y:
                 continue
 
-            if set([name_x,name_y]) in used_pairs:
+            if {name_x, name_y} in used_pairs:
                 continue
 
             dat_x = dat_x[feature_cols]
@@ -139,11 +143,11 @@ def calculate_mind_network(data_df, feature_cols, region_list, resample=False, n
 
             kl = KLa + KLb
 
-            MIND.at[name_x,name_y] = 1/(1+kl)
-            MIND.at[name_y,name_x] = 1/(1+kl)
+            MIND.at[name_x, name_y] = 1/(1+kl)
+            MIND.at[name_y, name_x] = 1/(1+kl)
 
-            used_pairs.append(set([name_x,name_y]))
+            used_pairs.append({name_x, name_y})
 
     MIND = MIND[region_list].T[region_list].T
 
-    return MIND
+    return MIND
diff --git a/get_vertex_df.py b/get_vertex_df.py
@@ -1,16 +1,13 @@
-import sys
 import numpy as np
-import os
 import pandas as pd
 from os.path import exists
 from nibabel.freesurfer.io import read_morph_data, read_annot
 from nibabel.freesurfer.mghformat import load
 from collections import defaultdict
-from MIND_helpers import calculate_mind_network, is_outlier
 
-def get_vertex_df(surf_dir, features, parcellation):
 
-    '''
+def get_vertex_df(surf_dir, features, parcellation):
+    """
     INPUT SPECIFICATIONS:
     • surf_dir (str) : This is a string the location containing all relevant directories output by FreeSurfer (i.e. label, mri, surf).
 
@@ -41,29 +38,30 @@ def get_vertex_df(surf_dir, features, parcellation):
         A valid list of feature values combining these different input types would therefore be: ['CT','SD',(path/to/lh_feature1, path/to/rh_feature1), (path/to/lh_feature2, path/to/rh_feature2)] 
 
     • parcellation (str): This is a string the location containing parcellation scheme to be used. The files 'lh.' + parcellation + '.annot' and 'rh.' + parcellation + '.annot' must exist inside the surf_dir/label directory.
-    '''
+    """
 
-    #specify data locations
+    # specify data locations
     surfer_location = surf_dir + '/'
 
-    #Check inputs!
-    if (exists(surfer_location + '/label/lh.' + parcellation + '.annot') == False) or (exists(surfer_location + '/label/rh.' + parcellation + '.annot') == False):
+    # Check inputs!
+    if (not (exists(surfer_location + '/label/lh.' + parcellation + '.annot')) or
+            (not exists(surfer_location + '/label/rh.' + parcellation + '.annot'))):
         raise Exception('Parcellation files not found.')
 
-    all_shorthand_features = ['CT','Vol','SA','MC','SD']
-    all_shorthand_features_dict = dict(zip(all_shorthand_features, ['thickness','volume','area','curv','sulc']))
-    
+    all_shorthand_features = ['CT', 'Vol', 'SA', 'MC', 'SD']
+    all_shorthand_features_dict = dict(zip(all_shorthand_features, ['thickness', 'volume', 'area', 'curv', 'sulc']))
+
     lh_feature_locs = []
     rh_feature_locs = []
 
-    #Check feature inputs, store location of files
+    # Check feature inputs, store location of files
     for feature in features:
         if feature in all_shorthand_features:
             lh_loc = surfer_location + 'surf/lh.' + all_shorthand_features_dict[feature]
             rh_loc = surfer_location + 'surf/rh.' + all_shorthand_features_dict[feature]
-            
-            if (exists(lh_loc) == False) or (exists(rh_loc) == False):
-                raise Exception('Feature for input "' + feature +'" not found.')
+
+            if (not exists(lh_loc)) or (not exists(rh_loc)):
+                raise Exception('Feature for input "' + feature + '" not found.')
             else:
                 lh_feature_locs.append(lh_loc)
                 rh_feature_locs.append(rh_loc)
@@ -73,9 +71,9 @@ def get_vertex_df(surf_dir, features, parcellation):
             if len(feature.split('/')) == 1:
                 lh_loc = surfer_location + 'surf/lh.' + feature
                 rh_loc = surfer_location + 'surf/rh.' + feature
-                
-                if (exists(lh_loc) == False) or (exists(rh_loc) == False):
-                    raise Exception('Feature for input "' + feature +'" not found.')
+
+                if (not exists(lh_loc)) or (not exists(rh_loc)):
+                    raise Exception('Feature for input "' + feature + '" not found.')
 
                 else:
                     lh_feature_locs.append(lh_loc)
@@ -89,9 +87,9 @@ def get_vertex_df(surf_dir, features, parcellation):
                 rh_loc = feature.split('/')
                 rh_loc[-1] = 'r' + rh_loc[-1][1:]
                 rh_loc = '/'.join(rh_loc)
-                
-                if (exists(lh_loc) == False) or (exists(rh_loc) == False):
-                    raise Exception('Feature for input "' + feature +'" not found.')
+
+                if (not exists(lh_loc)) or (not exists(rh_loc)):
+                    raise Exception('Feature for input "' + feature + '" not found.')
 
                 else:
                     lh_feature_locs.append(lh_loc)
@@ -101,9 +99,9 @@ def get_vertex_df(surf_dir, features, parcellation):
 
             lh_loc = feature[0]
             rh_loc = feature[1]
-            
-            if (exists(lh_loc) == False) or (exists(rh_loc) == False):
-                raise Exception('Feature for input "' + feature[0] +'" or "' + feature[1] +'" not found.')
+
+            if (not exists(lh_loc)) or (not exists(rh_loc)):
+                raise Exception('Feature for input "' + feature[0] + '" or "' + feature[1] + '" not found.')
 
             else:
                 lh_feature_locs.append(lh_loc)
@@ -112,11 +110,11 @@ def get_vertex_df(surf_dir, features, parcellation):
         else:
             raise Exception('Unrecognized format for feature input: ', feature)
 
-    #Get annotation files
-    lh_annot = read_annot(surfer_location + '/label/lh.' + parcellation + '.annot', orig_ids = True)
-    rh_annot = read_annot(surfer_location + '/label/rh.' + parcellation + '.annot', orig_ids = True)
+    # Get annotation files
+    lh_annot = read_annot(surfer_location + '/label/lh.' + parcellation + '.annot', orig_ids=True)
+    rh_annot = read_annot(surfer_location + '/label/rh.' + parcellation + '.annot', orig_ids=True)
 
-    annot_dict = {'lh':lh_annot, 'rh':rh_annot}
+    annot_dict = {'lh': lh_annot, 'rh': rh_annot}
 
     '''
     The regions in the lh and rh need to be renamed and distinct. 
@@ -129,30 +127,30 @@ def get_vertex_df(surf_dir, features, parcellation):
     lh_region_names = ['lh_' + str(x).split("'")[1] for x in lh_annot[2]]
     rh_region_names = ['rh_' + str(x).split("'")[1] for x in rh_annot[2]]
 
-    lh_convert_dict = dict(zip(lh_annot[1][:,-1], lh_region_names))
-    rh_convert_dict = dict(zip(rh_annot[1][:,-1], rh_region_names))
+    lh_convert_dict = dict(zip(lh_annot[1][:, -1], lh_region_names))
+    rh_convert_dict = dict(zip(rh_annot[1][:, -1], rh_region_names))
 
-    convert_dicts = {'lh': lh_convert_dict,\
-                    'rh': rh_convert_dict}
+    convert_dicts = {'lh': lh_convert_dict,
+                     'rh': rh_convert_dict}
 
     used_labels_l = np.intersect1d(np.unique(lh_annot[0]), list(lh_convert_dict.keys()))
     used_labels_r = np.intersect1d(np.unique(rh_annot[0]), list(rh_convert_dict.keys()))
 
-    used_labels = {'lh': used_labels_l,\
-                    'rh': used_labels_r}
-
+    used_labels = {'lh': used_labels_l,
+                   'rh': used_labels_r}
 
     used_regions_l = np.array([value for key, value in lh_convert_dict.items() if key in used_labels_l])
     used_regions_r = np.array([value for key, value in rh_convert_dict.items() if key in used_labels_r])
 
     combined_regions = np.hstack((used_regions_l, used_regions_r))
-    unknown_regions = [x for x in combined_regions if (('?' in x) | ('unknown' in x) | ('Unknown' in x) | ('Medial_Wall' in x) | (len(x) == 3))]
+    unknown_regions = [x for x in combined_regions if
+                       (('?' in x) | ('unknown' in x) | ('Unknown' in x) | ('Medial_Wall' in x) | (len(x) == 3))]
     combined_regions = np.array([x for x in combined_regions if x not in unknown_regions])
 
     vertex_data_dict = defaultdict()
 
-    #Now load up all the vertex-level data!
-    for hemi in ['lh','rh']:
+    # Now load up all the vertex-level data!
+    for hemi in ['lh', 'rh']:
         print(hemi)
         hemi_data_dict = defaultdict()
 
@@ -161,7 +159,7 @@ def get_vertex_df(surf_dir, features, parcellation):
 
             for i, lh_feature_loc in enumerate(lh_feature_locs):
                 print(lh_feature_loc)
-                #check for mgh/mgz format vs regular curv files
+                # check for mgh/mgz format vs regular curv files
                 if lh_feature_loc.endswith('mgh') or lh_feature_loc.endswith('mgz'):
                     hemi_data_dict['Feature_' + str(i)] = load(lh_feature_loc).get_fdata().flatten()
                 else:
@@ -185,19 +183,19 @@ def get_vertex_df(surf_dir, features, parcellation):
         for i, feature in enumerate(used_features):
             print(i, feature)
             hemi_data[i + 1] = hemi_data_dict[feature]
-        
+
         col_names = ['Label'] + used_features
-        hemi_data = pd.DataFrame(hemi_data.T, columns = col_names)
-        
-        #Select only the vertices that map to regions.
+        hemi_data = pd.DataFrame(hemi_data.T, columns=col_names)
+
+        # Select only the vertices that map to regions.
         hemi_data = hemi_data.loc[hemi_data['Label'].isin(used_labels[hemi])]
-        
+
         hemi_data["Label"] = hemi_data["Label"].map(convert_dicts[hemi])
         vertex_data_dict[hemi] = hemi_data
 
-    vertex_data = pd.concat([vertex_data_dict['lh'], vertex_data_dict['rh']], ignore_index = True)
+    vertex_data = pd.concat([vertex_data_dict['lh'], vertex_data_dict['rh']], ignore_index=True)
 
-    #Output data
+    # Output data
     print("features used: ")
     print(used_features)
     return vertex_data, combined_regions, used_features