Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 12 additions & 13 deletions MIND.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
import sys
import os
import numpy as np
import pandas as pd
from MIND_helpers import calculate_mind_network, is_outlier
from MIND_helpers import calculate_mind_network
from get_vertex_df import get_vertex_df

def compute_MIND(surf_dir, features, parcellation, filter_vertices=False, resample=False, n_samples = 4000):

def compute_MIND(surf_dir, features, parcellation, filter_vertices=False, resample=False, n_samples=4000):

vertex_data, regions, features_used = get_vertex_df(surf_dir, features, parcellation)

Expand All @@ -18,8 +15,9 @@ def compute_MIND(surf_dir, features, parcellation, filter_vertices=False, resamp
columns = ['Label'] + features_used

feature_conv_dict = dict(zip(list(features), list(features_used)))
#The filter_vertices parameter determines you want to filter out all the non-biologically feasible vertices (i.e. any of volume, surface area or cortical thickness equalling zero)
if filter_vertices == True:
# The filter_vertices parameter determines you want to filter out all the non-biologically feasible vertices
# (i.e. any of volume, surface area or cortical thickness equalling zero)
if filter_vertices:

if 'CT' in features:
vertex_data = vertex_data.drop(vertex_data[vertex_data[feature_conv_dict['CT']] == 0].index)
Expand All @@ -32,17 +30,18 @@ def compute_MIND(surf_dir, features, parcellation, filter_vertices=False, resamp

vertex_data = vertex_data[columns]

#standardize across the brain for each feature to get each dimension to roughly the same scale.
# standardize across the brain for each feature to get each dimension to roughly the same scale.
for x in features_used:
vertex_data[x] = (vertex_data[x] - vertex_data[x].mean())/vertex_data[x].std()

#Drop outliers. This drops vertices with an MAD score in ANY of the used features. Can be customized.
# z_score_threshhold = 7
# outliers_per_features = np.array([is_outlier(vertex_data[x].values, z_score_threshhold) for x in features_used]).T
# Drop outliers. This drops vertices with an MAD score in ANY of the used features. Can be customized.
# import numpy as np
# z_score_threshold = 7
# outliers_per_features = np.array([is_outlier(vertex_data[x].values, z_score_threshold) for x in features_used]).T
# vertex_data = vertex_data.loc[np.sum(outliers_per_features, axis = 1) == 0]

print('Computing MIND...')
#calculate MIND network
# calculate MIND network
MIND = calculate_mind_network(vertex_data, features_used, regions, resample=resample, n_samples = n_samples)

print('Done!')
Expand Down
66 changes: 35 additions & 31 deletions MIND_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from scipy import stats
from collections import defaultdict

def is_outlier(points, thresh=7): #taken from https://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data

def is_outlier(points, thresh=7):
"""
Returns a boolean array with True if points are outliers and False
otherwise.
Expand All @@ -28,7 +28,7 @@ def is_outlier(points, thresh=7): #taken from https://stackoverflow.com/question
Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
"""
if len(points.shape) == 1:
points = points[:,None]
points = points[:, None]
median = np.median(points, axis=0)
diff = np.sum((points - median)**2, axis=-1)
diff = np.sqrt(diff)
Expand All @@ -38,7 +38,8 @@ def is_outlier(points, thresh=7): #taken from https://stackoverflow.com/question

return modified_z_score > thresh

def get_KDTree(x): #Inspired by https://gist.github.com/atabakd/ed0f7581f8510c8587bc2f41a094b518

def get_KDTree(x): # Inspired by https://gist.github.com/atabakd/ed0f7581f8510c8587bc2f41a094b518

# Check the dimensions are consistent
x = np.atleast_2d(x)
Expand All @@ -48,32 +49,32 @@ def get_KDTree(x): #Inspired by https://gist.github.com/atabakd/ed0f7581f8510c85

return xtree

def get_KL(x, y, xtree, ytree): #Inspired by https://gist.github.com/atabakd/ed0f7581f8510c8587bc2f41a094b518

def get_KL(x, y, xtree, ytree): # Inspired by https://gist.github.com/atabakd/ed0f7581f8510c8587bc2f41a094b518
x = np.atleast_2d(x)
y = np.atleast_2d(y)

x = np.atleast_2d(x)
y = np.atleast_2d(y)

n,d = x.shape
m,dy = y.shape
n, d = x.shape
m, dy = y.shape

#Check dimensions
assert(d == dy)
# Check dimensions
assert (d == dy)

# Get the first two nearest neighbours for x, since the closest one is the
# sample itself.
r = xtree.query(x, k=2, eps=.01, p=2)[0][:,1]
r = xtree.query(x, k=2, eps=.01, p=2)[0][:, 1]
s = ytree.query(x, k=1, eps=.01, p=2)[0]

rs_ratio = r/s

#Remove points with zero, nan, or infinity. This happens when two regions have a vertex with the exact same value – an occurence that basically onnly happens for the single feature MSNs
#and has to do with FreeSurfer occasionally outputting the exact same value for different vertices.
# Remove points with zero, nan, or infinity. This happens when two regions have a vertex with the exact same value
# an occurrence that basically only happens for the single feature MSNs
# and has to do with FreeSurfer occasionally outputting the exact same value for different vertices.
rs_ratio = rs_ratio[np.isfinite(rs_ratio)]
rs_ratio = rs_ratio[rs_ratio!=0.0]
rs_ratio = rs_ratio[rs_ratio != 0.0]

# There is a mistake in the paper. In Eq. 14, the right side misses a negative sign
# on the first term of the right hand side.
Expand All @@ -84,33 +85,36 @@ def get_KL(x, y, xtree, ytree): #Inspired by https://gist.github.com/atabakd/ed0
return kl


def calculate_mind_network(data_df, feature_cols, region_list, resample=False, n_samples = 4000):

MIND = pd.DataFrame(np.zeros((len(region_list), len(region_list))), \
index = region_list, columns = region_list)
def calculate_mind_network(data_df, feature_cols, region_list, resample=False, n_samples=4000):
MIND = pd.DataFrame(np.zeros((len(region_list), len(region_list))),
index=region_list, columns=region_list)

#Get only desired regions
# Get only desired regions
data_df = data_df.loc[data_df['Label'].isin(region_list)]

#Resample dataset if resample has been set to True and if it is UNIVARIATE ONLY. This should only be done if you are using a single feature which contains repeated values.
if (len(feature_cols) == 1) and resample==True:
# Resample dataset if resample has been set to True and if it is UNIVARIATE ONLY. This should only be done if you
# are using a single feature which contains repeated values.
if (len(feature_cols) == 1) and resample:
n_samples = n_samples
resampled_dataset = pd.DataFrame(np.zeros((n_samples, len(region_list))), columns = region_list)
resampled_dataset = pd.DataFrame(np.zeros((n_samples, len(region_list))), columns=region_list)

for name, data in data_df.groupby('Label'):
resampled_dataset[name] = stats.gaussian_kde(data[feature_cols[0]]).resample(n_samples)[0]

resampled_dataset = resampled_dataset.melt(var_name = 'Label', value_name = feature_cols[0])
resampled_dataset = resampled_dataset.melt(var_name='Label', value_name=feature_cols[0])
data_df = resampled_dataset

if (len(feature_cols) > 1) and resample==True:
raise Exception("Resampling the data is only supported if you are using a single feature -- this is because higher order density estimation can be unreliable and very computationally expensive.")
if (len(feature_cols) > 1) and resample:
raise Exception("Resampling the data is only supported if you are using a single feature -- this is because "
"higher order density estimation can be unreliable and very computationally expensive.")

#Check that there aren't many repeated values
# Check that there aren't many repeated values
percent_unique_vals = len(data_df[feature_cols].drop_duplicates())/len(data_df[feature_cols])

if percent_unique_vals < 0.8:
raise Exception("There are many repeated values in the data, which compromises the validity of MIND calculation. Please minimize the number of repeated values in the data and try again. If you are using only one feature, try rerunning with resample=True.")
raise Exception("There are many repeated values in the data, which compromises the validity of MIND "
"calculation. Please minimize the number of repeated values in the data and try again. "
"If you are using only one feature, try rerunning with resample=True.")

grouped_data = data_df.groupby('Label')

Expand All @@ -128,7 +132,7 @@ def calculate_mind_network(data_df, feature_cols, region_list, resample=False, n
if name_x == name_y:
continue

if set([name_x,name_y]) in used_pairs:
if {name_x, name_y} in used_pairs:
continue

dat_x = dat_x[feature_cols]
Expand All @@ -139,11 +143,11 @@ def calculate_mind_network(data_df, feature_cols, region_list, resample=False, n

kl = KLa + KLb

MIND.at[name_x,name_y] = 1/(1+kl)
MIND.at[name_y,name_x] = 1/(1+kl)
MIND.at[name_x, name_y] = 1/(1+kl)
MIND.at[name_y, name_x] = 1/(1+kl)

used_pairs.append(set([name_x,name_y]))
used_pairs.append({name_x, name_y})

MIND = MIND[region_list].T[region_list].T

return MIND
return MIND
92 changes: 45 additions & 47 deletions get_vertex_df.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
import sys
import numpy as np
import os
import pandas as pd
from os.path import exists
from nibabel.freesurfer.io import read_morph_data, read_annot
from nibabel.freesurfer.mghformat import load
from collections import defaultdict
from MIND_helpers import calculate_mind_network, is_outlier

def get_vertex_df(surf_dir, features, parcellation):

'''
def get_vertex_df(surf_dir, features, parcellation):
"""
INPUT SPECIFICATIONS:
• surf_dir (str) : This is a string the location containing all relevant directories output by FreeSurfer (i.e. label, mri, surf).

Expand Down Expand Up @@ -41,29 +38,30 @@ def get_vertex_df(surf_dir, features, parcellation):
A valid list of feature values combining these different input types would therefore be: ['CT','SD',(path/to/lh_feature1, path/to/rh_feature1), (path/to/lh_feature2, path/to/rh_feature2)]

• parcellation (str): This is a string the location containing parcellation scheme to be used. The files 'lh.' + parcellation + '.annot' and 'rh.' + parcellation + '.annot' must exist inside the surf_dir/label directory.
'''
"""

#specify data locations
# specify data locations
surfer_location = surf_dir + '/'

#Check inputs!
if (exists(surfer_location + '/label/lh.' + parcellation + '.annot') == False) or (exists(surfer_location + '/label/rh.' + parcellation + '.annot') == False):
# Check inputs!
if (not (exists(surfer_location + '/label/lh.' + parcellation + '.annot')) or
(not exists(surfer_location + '/label/rh.' + parcellation + '.annot'))):
raise Exception('Parcellation files not found.')

all_shorthand_features = ['CT','Vol','SA','MC','SD']
all_shorthand_features_dict = dict(zip(all_shorthand_features, ['thickness','volume','area','curv','sulc']))
all_shorthand_features = ['CT', 'Vol', 'SA', 'MC', 'SD']
all_shorthand_features_dict = dict(zip(all_shorthand_features, ['thickness', 'volume', 'area', 'curv', 'sulc']))

lh_feature_locs = []
rh_feature_locs = []

#Check feature inputs, store location of files
# Check feature inputs, store location of files
for feature in features:
if feature in all_shorthand_features:
lh_loc = surfer_location + 'surf/lh.' + all_shorthand_features_dict[feature]
rh_loc = surfer_location + 'surf/rh.' + all_shorthand_features_dict[feature]
if (exists(lh_loc) == False) or (exists(rh_loc) == False):
raise Exception('Feature for input "' + feature +'" not found.')

if (not exists(lh_loc)) or (not exists(rh_loc)):
raise Exception('Feature for input "' + feature + '" not found.')
else:
lh_feature_locs.append(lh_loc)
rh_feature_locs.append(rh_loc)
Expand All @@ -73,9 +71,9 @@ def get_vertex_df(surf_dir, features, parcellation):
if len(feature.split('/')) == 1:
lh_loc = surfer_location + 'surf/lh.' + feature
rh_loc = surfer_location + 'surf/rh.' + feature
if (exists(lh_loc) == False) or (exists(rh_loc) == False):
raise Exception('Feature for input "' + feature +'" not found.')

if (not exists(lh_loc)) or (not exists(rh_loc)):
raise Exception('Feature for input "' + feature + '" not found.')

else:
lh_feature_locs.append(lh_loc)
Expand All @@ -89,9 +87,9 @@ def get_vertex_df(surf_dir, features, parcellation):
rh_loc = feature.split('/')
rh_loc[-1] = 'r' + rh_loc[-1][1:]
rh_loc = '/'.join(rh_loc)
if (exists(lh_loc) == False) or (exists(rh_loc) == False):
raise Exception('Feature for input "' + feature +'" not found.')

if (not exists(lh_loc)) or (not exists(rh_loc)):
raise Exception('Feature for input "' + feature + '" not found.')

else:
lh_feature_locs.append(lh_loc)
Expand All @@ -101,9 +99,9 @@ def get_vertex_df(surf_dir, features, parcellation):

lh_loc = feature[0]
rh_loc = feature[1]
if (exists(lh_loc) == False) or (exists(rh_loc) == False):
raise Exception('Feature for input "' + feature[0] +'" or "' + feature[1] +'" not found.')

if (not exists(lh_loc)) or (not exists(rh_loc)):
raise Exception('Feature for input "' + feature[0] + '" or "' + feature[1] + '" not found.')

else:
lh_feature_locs.append(lh_loc)
Expand All @@ -112,11 +110,11 @@ def get_vertex_df(surf_dir, features, parcellation):
else:
raise Exception('Unrecognized format for feature input: ', feature)

#Get annotation files
lh_annot = read_annot(surfer_location + '/label/lh.' + parcellation + '.annot', orig_ids = True)
rh_annot = read_annot(surfer_location + '/label/rh.' + parcellation + '.annot', orig_ids = True)
# Get annotation files
lh_annot = read_annot(surfer_location + '/label/lh.' + parcellation + '.annot', orig_ids=True)
rh_annot = read_annot(surfer_location + '/label/rh.' + parcellation + '.annot', orig_ids=True)

annot_dict = {'lh':lh_annot, 'rh':rh_annot}
annot_dict = {'lh': lh_annot, 'rh': rh_annot}

'''
The regions in the lh and rh need to be renamed and distinct.
Expand All @@ -129,30 +127,30 @@ def get_vertex_df(surf_dir, features, parcellation):
lh_region_names = ['lh_' + str(x).split("'")[1] for x in lh_annot[2]]
rh_region_names = ['rh_' + str(x).split("'")[1] for x in rh_annot[2]]

lh_convert_dict = dict(zip(lh_annot[1][:,-1], lh_region_names))
rh_convert_dict = dict(zip(rh_annot[1][:,-1], rh_region_names))
lh_convert_dict = dict(zip(lh_annot[1][:, -1], lh_region_names))
rh_convert_dict = dict(zip(rh_annot[1][:, -1], rh_region_names))

convert_dicts = {'lh': lh_convert_dict,\
'rh': rh_convert_dict}
convert_dicts = {'lh': lh_convert_dict,
'rh': rh_convert_dict}

used_labels_l = np.intersect1d(np.unique(lh_annot[0]), list(lh_convert_dict.keys()))
used_labels_r = np.intersect1d(np.unique(rh_annot[0]), list(rh_convert_dict.keys()))

used_labels = {'lh': used_labels_l,\
'rh': used_labels_r}

used_labels = {'lh': used_labels_l,
'rh': used_labels_r}

used_regions_l = np.array([value for key, value in lh_convert_dict.items() if key in used_labels_l])
used_regions_r = np.array([value for key, value in rh_convert_dict.items() if key in used_labels_r])

combined_regions = np.hstack((used_regions_l, used_regions_r))
unknown_regions = [x for x in combined_regions if (('?' in x) | ('unknown' in x) | ('Unknown' in x) | ('Medial_Wall' in x) | (len(x) == 3))]
unknown_regions = [x for x in combined_regions if
(('?' in x) | ('unknown' in x) | ('Unknown' in x) | ('Medial_Wall' in x) | (len(x) == 3))]
combined_regions = np.array([x for x in combined_regions if x not in unknown_regions])

vertex_data_dict = defaultdict()

#Now load up all the vertex-level data!
for hemi in ['lh','rh']:
# Now load up all the vertex-level data!
for hemi in ['lh', 'rh']:
print(hemi)
hemi_data_dict = defaultdict()

Expand All @@ -161,7 +159,7 @@ def get_vertex_df(surf_dir, features, parcellation):

for i, lh_feature_loc in enumerate(lh_feature_locs):
print(lh_feature_loc)
#check for mgh/mgz format vs regular curv files
# check for mgh/mgz format vs regular curv files
if lh_feature_loc.endswith('mgh') or lh_feature_loc.endswith('mgz'):
hemi_data_dict['Feature_' + str(i)] = load(lh_feature_loc).get_fdata().flatten()
else:
Expand All @@ -185,19 +183,19 @@ def get_vertex_df(surf_dir, features, parcellation):
for i, feature in enumerate(used_features):
print(i, feature)
hemi_data[i + 1] = hemi_data_dict[feature]

col_names = ['Label'] + used_features
hemi_data = pd.DataFrame(hemi_data.T, columns = col_names)
#Select only the vertices that map to regions.
hemi_data = pd.DataFrame(hemi_data.T, columns=col_names)

# Select only the vertices that map to regions.
hemi_data = hemi_data.loc[hemi_data['Label'].isin(used_labels[hemi])]

hemi_data["Label"] = hemi_data["Label"].map(convert_dicts[hemi])
vertex_data_dict[hemi] = hemi_data

vertex_data = pd.concat([vertex_data_dict['lh'], vertex_data_dict['rh']], ignore_index = True)
vertex_data = pd.concat([vertex_data_dict['lh'], vertex_data_dict['rh']], ignore_index=True)

#Output data
# Output data
print("features used: ")
print(used_features)
return vertex_data, combined_regions, used_features
Loading