Source code for glhmm.preproc

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Preprocessing functions - General/Gaussian Linear Hidden Markov Model
@author: Diego Vidaurre 2023
"""

import math
import numpy as np
import warnings
from sklearn.decomposition import PCA
from scipy import signal
import scipy.io

from . import auxiliary
# import auxiliary

def apply_pca(X,d,whitening=False,exact=True):
    """Applies PCA to the input data X.

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_parcels)
        The input data to be transformed.
    d : int or float or array-like of shape (n_parcels, n_components)
        If int, the number of components to keep.
        If float (between 0 and 1), the proportion of explained variance to keep.
        If array-like, a precomputed transformation matrix.
    whitening : bool, default=False
        Whether to whiten the transformed data.
    exact : bool, default=True
        Whether to use the full SVD solver for PCA.

    Returns:
    --------
    X_transformed : array-like of shape (n_samples, n_components)
        The transformed data after applying PCA.
    """

    if type(d) is np.ndarray: # d is a precomputed projection matrix
        X -= np.mean(X,axis=0)
        X = X @ d
        if whitening: X /= np.std(X,axis=0)
        return X

    svd_solver = 'full' if exact else 'auto'
    if d >= 1: # number of components
        pcamodel = PCA(n_components=d,whiten=whitening,svd_solver=svd_solver)
        pcamodel.fit(X)
        X = pcamodel.transform(X)
    else: # proportion of explained variance
        pcamodel = PCA(whiten=whitening,svd_solver=svd_solver)
        pcamodel.fit(X)
        ncomp = np.where(np.cumsum(pcamodel.explained_variance_ratio_)>=d)[0][0] + 1
        X = pcamodel.transform(X)
        X = X[:,0:ncomp]
        d = ncomp

    # sign convention equal to Matlab's: flip each component so that the
    # element with the largest absolute value is positive
    for j in range(d):
        jj = np.argmax(np.abs(X[:,j]))
        if X[jj,j] < 0: X[:,j] *= -1

    return X
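
A minimal usage sketch (hypothetical, randomly generated data; the shapes and component counts are arbitrary):

import numpy as np
from glhmm import preproc

X = np.random.randn(1000, 50)            # 1000 samples, 50 parcels
X_10 = preproc.apply_pca(X.copy(), 10)   # keep 10 components
X_90 = preproc.apply_pca(X.copy(), 0.9)  # keep enough components for 90% variance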
def preprocess_data(data,indices,
        fs = 1,             # frequency of the data
        standardise=True,   # True / False
        filter=None,        # tuple with low and high cutoff frequencies, or None
        detrend=False,      # True / False
        onpower=False,      # True / False
        pca=None,           # number of components, proportion of explained variance, or None
        whitening=False,    # True / False
        exact_pca=True,
        downsample=None     # new frequency, or None
        ):
    """Preprocess the input data.

    Parameters:
    -----------
    data : array-like of shape (n_samples, n_parcels)
        The input data to be preprocessed.
    indices : array-like of shape (n_sessions, 2)
        The start and end indices of each trial/session in the input data.
    fs : int or float, default=1
        The frequency of the input data.
    standardise : bool, default=True
        Whether to standardise the input data.
    filter : tuple of length 2 or None, default=None
        The cutoff frequencies of a band-pass filter, as (low, high).
        Set the first element to 0 for a pure low-pass filter, or the
        second element to None for a pure high-pass filter.
        If None, no filtering will be applied.
    detrend : bool, default=False
        Whether to detrend the input data.
    onpower : bool, default=False
        Whether to calculate the power of the input data using the Hilbert transform.
    pca : int or float or None, default=None
        If int, the number of components to keep after applying PCA.
        If float, the proportion of explained variance to keep after applying PCA.
        If None, no PCA will be applied.
    whitening : bool, default=False
        Whether to whiten the input data after applying PCA.
    exact_pca : bool, default=True
        Whether to use the full SVD solver for PCA.
    downsample : int or float or None, default=None
        The new frequency of the input data after downsampling.
        If None, no downsampling will be applied.

    Returns:
    --------
    data_processed : array-like of shape (n_samples_processed, n_parcels)
        The preprocessed input data.
    indices_processed : array-like of shape (n_sessions_processed, 2)
        The start and end indices of each trial/session in the preprocessed data.
    """

    p = data.shape[1]
    N = indices.shape[0]

    data = np.copy(data)

    if standardise:
        for j in range(N):
            t = np.arange(indices[j,0],indices[j,1])
            data[t,:] -= np.mean(data[t,:],axis=0)
            data[t,:] /= np.std(data[t,:],axis=0)

    if filter is not None:
        filterorder = 6
        if filter[0] == 0: # low-pass
            sos = signal.butter(filterorder, filter[1], 'lowpass', output='sos', fs = fs)
        elif filter[1] is None: # high-pass
            sos = signal.butter(filterorder, filter[0], 'highpass', output='sos', fs = fs)
        else: # band-pass
            sos = signal.butter(filterorder, filter, 'bandpass', output='sos', fs = fs)
        for j in range(N):
            t = np.arange(indices[j,0],indices[j,1])
            data[t,:] = signal.sosfilt(sos, data[t,:], axis=0)

    if detrend:
        for j in range(N):
            t = np.arange(indices[j,0],indices[j,1])
            data[t,:] = signal.detrend(data[t,:], axis=0)

    if onpower:
        for j in range(N):
            t = np.arange(indices[j,0],indices[j,1])
            data[t,:] = np.abs(signal.hilbert(data[t,:], axis=0))

    if pca is not None:
        data = apply_pca(data,pca,whitening,exact_pca)
        p = data.shape[1]

    if downsample is not None:
        factor = downsample / fs
        Tnew = np.ceil(factor * (indices[:,1]-indices[:,0])).astype(int)
        indices_new = auxiliary.make_indices_from_T(Tnew)
        data_new = np.zeros((np.sum(Tnew),p))
        gcd = math.gcd(int(downsample),int(fs)) # math.gcd requires integers
        for j in range(N):
            t = np.arange(indices[j,0],indices[j,1])
            tnew = np.arange(indices_new[j,0],indices_new[j,1])
            data_new[tnew,:] = signal.resample_poly(data[t,:], int(downsample)//gcd, int(fs)//gcd)
            # Tjnew = tnew.shape[0]
            # data_new[tnew,:] = signal.resample(data[t,:], Tjnew)
        data = data_new
    else:
        indices_new = indices

    return data,indices_new
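
As an illustration, a hedged sketch of a typical call on made-up data: two sessions of 500 samples at 100 Hz, band-pass filtered and downsampled to 50 Hz (all values arbitrary):

import numpy as np
from glhmm import preproc

data = np.random.randn(1000, 20)             # 2 sessions x 500 samples, 20 parcels
indices = np.array([[0, 500], [500, 1000]])
data_proc, indices_proc = preproc.preprocess_data(
    data, indices, fs=100,
    filter=(1, 40),   # band-pass between 1 and 40 Hz
    downsample=50)    # resample from 100 Hz to 50 Hz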
def build_data_autoregressive(data,indices,autoregressive_order=1,
        connectivity=None,center_data=True):
    """Builds X and Y for the autoregressive model,
    as well as an adapted indices array and, if provided, the
    connectivity matrix in the right format. X and Y are centered by default.

    Parameters:
    -----------
    data : array-like of shape (n_samples, n_parcels)
        The data timeseries.
    indices : array-like of shape (n_sessions, 2)
        The start and end indices of each trial/session in the input data.
    autoregressive_order : int, optional, default=1
        The number of lags to include in the autoregressive model.
    connectivity : array-like of shape (n_parcels, n_parcels), optional, default=None
        The matrix indicating which regressors should be used for each variable.
    center_data : bool, optional, default=True
        If True, the data will be centered.

    Returns:
    --------
    X : array-like of shape (n_samples - n_sessions*autoregressive_order, n_parcels*autoregressive_order)
        The timeseries of set of variables 1 (i.e., the regressors).
    Y : array-like of shape (n_samples - n_sessions*autoregressive_order, n_parcels)
        The timeseries of set of variables 2 (i.e., variables to predict, targets).
    indices_new : array-like of shape (n_sessions, 2)
        The new array of start and end indices for each trial/session.
    connectivity_new : array-like of shape (n_parcels*autoregressive_order, n_parcels)
        The new connectivity matrix indicating which regressors should be used for each variable.
    """

    T,p = data.shape
    N = indices.shape[0]

    if autoregressive_order == 0:
        warnings.warn("autoregressive_order is 0 so nothing to be done")
        return np.empty(0),data,indices,connectivity

    X = np.zeros((T - N*autoregressive_order,p*autoregressive_order))
    Y = np.zeros((T - N*autoregressive_order,p))
    indices_new = np.zeros((N,2))

    for j in range(N):
        ind_1 = np.arange(indices[j,0]+autoregressive_order,indices[j,1],dtype=np.int64)
        ind_2 = np.arange(indices[j,0],indices[j,1]-autoregressive_order,dtype=np.int64) \
            - j * autoregressive_order
        Y[ind_2,:] = data[ind_1,:]
        for i in range(autoregressive_order):
            ind_3 = np.arange(indices[j,0]+autoregressive_order-(i+1),indices[j,1]-(i+1),dtype=np.int64)
            ind_ch = np.arange(p) + i * p
            X[ind_2,ind_ch[:,np.newaxis]] = data[ind_3,:].T
        indices_new[j,0] = ind_2[0]
        indices_new[j,1] = ind_2[-1] + 1

    # center
    if center_data:
        Y -= np.mean(Y,axis=0)
        X -= np.mean(X,axis=0)

    if connectivity is not None:
        # connectivity_new : (regressors by regressed)
        connectivity_new = np.zeros((autoregressive_order*p,p))
        for i in range(autoregressive_order):
            ind_ch = np.arange(p) + i * p
            connectivity_new[ind_ch,:] = connectivity
        # regress out when asked
        for j in range(p):
            jj = np.where(connectivity_new[:,j]==0)[0]
            if len(jj)==0: continue
            b = np.linalg.inv(X[:,jj].T @ X[:,jj] + 0.001 * np.eye(len(jj))) \
                @ (X[:,jj].T @ Y[:,j])
            Y[:,j] -= X[:,jj] @ b
        # remove unused variables
        active_X = np.zeros(p,dtype=bool)
        active_Y = np.zeros(p,dtype=bool)
        for j in range(p):
            active_X[j] = np.sum(connectivity[j,:]==1) > 0
            active_Y[j] = np.sum(connectivity[:,j]==1) > 0
        active_X = np.tile(active_X,autoregressive_order)
        active_X = np.where(active_X)[0]
        active_Y = np.where(active_Y)[0]
        Y = Y[:,active_Y]
        X = X[:,active_X]
        connectivity_new = connectivity_new[active_X,active_Y[:,np.newaxis]].T
    else:
        connectivity_new = None

    return X,Y,indices_new,connectivity_new
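
A short sketch of the input/output shapes on hypothetical data; with autoregressive_order=3, each session loses its first 3 samples, and X stacks 3 lagged copies of the parcels:

import numpy as np
from glhmm import preproc

data = np.random.randn(1000, 20)
indices = np.array([[0, 500], [500, 1000]])
X, Y, indices_new, _ = preproc.build_data_autoregressive(data, indices, autoregressive_order=3)
# X: (994, 60) lagged regressors; Y: (994, 20) values to predict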
def build_data_partial_connectivity(X,Y,connectivity=None,center_data=True):
    """Builds X and Y for the partial connectivity model,
    essentially regressing out variables when indicated in connectivity,
    and getting rid of regressors / regressed variables that are not used;
    it returns the connectivity matrix with the right dimensions as well.

    Parameters:
    -----------
    X : np.ndarray of shape (n_samples, n_parcels)
        The timeseries of set of variables 1 (i.e., the regressors).
    Y : np.ndarray of shape (n_samples, n_parcels)
        The timeseries of set of variables 2 (i.e., variables to predict, targets).
    connectivity : np.ndarray of shape (n_parcels, n_parcels), optional, default=None
        A binary matrix indicating which regressors affect which targets
        (i.e., variables to predict).
    center_data : bool, default=True
        Center data to zero mean.

    Returns:
    --------
    X_new : np.ndarray of shape (n_samples, n_active_parcels)
        The timeseries of set of variables 1 (i.e., the regressors) after
        removing unused predictors and regressing out the effects indicated
        in connectivity.
    Y_new : np.ndarray of shape (n_samples, n_active_parcels)
        The timeseries of set of variables 2 (i.e., variables to predict,
        targets) after removing unused targets and regressing out the
        effects indicated in connectivity.
    connectivity_new : np.ndarray of shape (n_active_parcels, n_active_parcels)
        A binary matrix indicating which regressors affect which targets,
        with the same structure as `connectivity` after removing unused
        predictors and targets.
    """

    X_new = np.copy(X)
    Y_new = np.copy(Y)

    if connectivity is not None:
        p = X.shape[1]
        q = Y.shape[1]
        # regress out when asked
        for j in range(q):
            jj = np.where(connectivity[:,j]==0)[0]
            if len(jj)==0: continue
            b = np.linalg.inv(X[:,jj].T @ X[:,jj] + 0.001 * np.eye(len(jj))) \
                @ (X[:,jj].T @ Y[:,j])
            Y_new[:,j] -= X[:,jj] @ b
        # remove unused variables
        active_X = np.zeros(p,dtype=bool)
        for j in range(p):
            active_X[j] = np.sum(connectivity[j,:]==1) > 0
        active_Y = np.zeros(q,dtype=bool)
        for j in range(q):
            active_Y[j] = np.sum(connectivity[:,j]==1) > 0
        active_X = np.where(active_X)[0]
        active_Y = np.where(active_Y)[0]
        # subset the returned arrays (not the inputs) so that the outputs
        # actually drop the unused predictors and targets
        Y_new = Y_new[:,active_Y]
        X_new = X_new[:,active_X]
        # copy of connectivity with the reduced dimensions
        connectivity_new = np.copy(connectivity)
        connectivity_new = connectivity_new[active_X,active_Y[:,np.newaxis]].T
    else:
        connectivity_new = None

    # center
    if center_data:
        Y_new = Y_new - np.mean(Y_new,axis=0)
        X_new = X_new - np.mean(X_new,axis=0)

    return X_new,Y_new,connectivity_new
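
A hedged sketch of the connectivity convention with a made-up binary matrix: entry (i, j) = 1 keeps regressor i for target j, while 0 means regressor i is regressed out of target j:

import numpy as np
from glhmm import preproc

X = np.random.randn(1000, 3)    # regressors
Y = np.random.randn(1000, 2)    # targets
connectivity = np.array([[1, 0],
                         [0, 1],
                         [1, 1]])   # regressor 3 is kept for both targets
X_new, Y_new, connectivity_new = preproc.build_data_partial_connectivity(X, Y, connectivity)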
def build_data_tde(data,indices,lags,pca=None,standardise_pc=True):
    """Builds X for the temporal delay embedded HMM,
    as well as an adapted indices array.

    Parameters:
    -----------
    data : numpy array of shape (n_samples, n_parcels)
        The data matrix.
    indices : array-like of shape (n_sessions, 2)
        The start and end indices of each trial/session in the input data.
    lags : list or array-like
        The lags to use for the embedding.
    pca : None or int or float or numpy array, default=None
        The number of components for PCA, the proportion of explained
        variance for PCA, a precomputed PCA projection matrix, or None
        to skip PCA.
    standardise_pc : bool, default=True
        Whether or not to standardise the principal components before returning.

    Returns:
    --------
    X : numpy array of shape (n_samples - n_sessions*rwindow, n_parcels*n_lags)
        The delay-embedded timeseries data, where rwindow = max(lags) - min(lags).
    indices_new : numpy array of shape (n_sessions, 2)
        The adapted indices for each segment of delay-embedded data.

    PCA can be run optionally: if pca >= 1, that is the number of components;
    if pca < 1, that is the explained variance; if pca is a numpy array, then
    it is a precomputed PCA projection matrix; if pca is None, then no PCA is run.
    """

    T,p = data.shape
    N = indices.shape[0]
    L = len(lags)
    minlag = np.min(lags)
    maxlag = np.max(lags)
    rwindow = maxlag-minlag

    X = np.zeros((T - N*rwindow,p*L))
    indices_new = np.zeros((N,2)).astype(int)

    # embedding
    for j in range(N):
        ind_1 = np.arange(indices[j,0],indices[j,1],dtype=np.int64)
        ind_2 = np.arange(indices[j,0],indices[j,1]-rwindow,dtype=np.int64) - j * rwindow
        for i in range(L):
            l = lags[i]
            X_l = np.roll(data[ind_1,:],l,axis=0)
            X_l = X_l[-minlag:-maxlag,:]
            ind_ch = np.arange(i,L*p,L)
            X[ind_2,ind_ch[:,np.newaxis]] = X_l.T
        indices_new[j,0] = ind_2[0]
        indices_new[j,1] = ind_2[-1] + 1

    # standardise (in Matlab's HMM-MAR we only centered pre-embedding);
    # note that this is done for the entire data set and not per session
    X -= np.mean(X,axis=0)
    X /= np.std(X,axis=0)

    # PCA and whitening
    if pca is not None:
        X = apply_pca(X,pca,standardise_pc)

    return X,indices_new
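
An illustrative call with a symmetric lag window on hypothetical data; 15 lags of a 20-parcel signal give 300 embedded channels before PCA, and each session shrinks by maxlag - minlag = 14 samples:

import numpy as np
from glhmm import preproc

data = np.random.randn(1000, 20)
indices = np.array([[0, 500], [500, 1000]])
lags = np.arange(-7, 8)   # 15 lags, from -7 to 7
X_tde, indices_tde = preproc.build_data_tde(data, indices, lags, pca=30)
# X_tde: (972, 30) after PCA; without pca it would be (972, 300)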
def load_files(files,I=None,do_only_indices=False):
    """Loads and concatenates data from a list of .mat or .npz files.

    Each file is expected to contain an array 'Y' (and optionally 'X'),
    or only 'X' if there is a single set of variables, plus either
    'indices' or the session lengths 'T'; if neither is present, the
    whole file is treated as one session.
    """

    X = []
    Y = []
    indices = []
    sum_T = 0

    if I is None:
        I = np.arange(len(files))
    elif type(I) is int:
        I = np.array([I])

    for ij in range(I.shape[0]):
        j = I[ij]

        # if type(files[j]) is tuple:
        #     if len(files[j][0]) > 0: # X
        #         if files[j][0][-4:] == '.npy':
        #             X.append(np.load(files[j][0]))
        #         elif files[j][0][-4:] == '.txt':

        # only .mat and .npz files are supported
        if files[j][-4:] == '.mat':
            dat = scipy.io.loadmat(files[j])
        elif files[j][-4:] == '.npz':
            dat = np.load(files[j])

        if not do_only_indices:
            if ('X' in dat) and (not 'Y' in dat):
                Y.append(dat["X"])
            else:
                if 'X' in dat: X.append(dat["X"])
                Y.append(dat["Y"])

        if 'indices' in dat:
            indices.append(dat['indices'])
        elif 'T' in dat:
            indices.append(auxiliary.make_indices_from_T(dat['T']) + sum_T)
        else:
            ind = np.zeros((1,2)).astype(int)
            ind[0,0] = 0
            ind[0,1] = Y[-1].shape[0]
            indices.append(ind + sum_T)

        # 'X'-only files store the session data under 'X'
        sum_T += dat["Y"].shape[0] if 'Y' in dat else dat["X"].shape[0]

    if not do_only_indices:
        if len(X) > 0: X = np.concatenate(X)
        Y = np.concatenate(Y)
    indices = np.concatenate(indices)
    if len(indices.shape) == 1:
        indices = np.expand_dims(indices,axis=0)
    if len(X) == 0: X = None

    return X,Y,indices
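
A minimal sketch of the expected file layout, assuming hypothetical .npz files that each store a data array 'Y' and the session lengths 'T' (file names and contents are illustrative):

import numpy as np
from glhmm import preproc

np.savez('sess1.npz', Y=np.random.randn(500, 20), T=np.array([500]))
np.savez('sess2.npz', Y=np.random.randn(400, 20), T=np.array([400]))
X, Y, indices = preproc.load_files(['sess1.npz', 'sess2.npz'])
# X is None (no 'X' stored); Y: (900, 20); indices: [[0, 500], [500, 900]]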