Source code for TSFEDL.data

import os
from typing import Optional, Tuple
import numpy as np
import wfdb
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split


def get_mit_bih_segments(data: wfdb.Record,
                         annotations: wfdb.Annotation,
                         labels: np.ndarray,
                         left_offset: int = 99,
                         right_offset: int = 160,
                         fixed_length: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
    """
    Extract one signal segment per run of consecutive, identically annotated beats.

    Consecutive annotations that share the same symbol are grouped into a single run.
    For every run whose symbol is in ``labels``, the signal between
    ``left_offset`` samples before the first annotation of the run and ``right_offset``
    samples after its last annotation is extracted (clamped to the signal boundaries).

    Parameters
    ----------

    data : wfdb.Record
        The arrhythmia signal. Only ``data.p_signal[:, 1]`` (column index 1) is read.
    annotations : wfdb.Annotation
        The annotations of the record. ``annotations.symbol`` and ``annotations.sample``
        are read and are expected to be aligned element by element.
    labels : array-like
        The set of valid labels for the different segments. Runs with any other
        symbol are discarded.
    left_offset : int
        The number of samples taken to the left of the first annotation of the run. Defaults to 99
    right_offset : int
        The number of samples taken to the right of the last annotation of the run. Defaults to 160
    fixed_length : int, optional
        If not None, every segment is truncated to ``fixed_length`` samples or padded with
        zeros on the right up to ``fixed_length`` samples. If None, segments keep their
        natural length, and stacking them fails when they do not all have the same length.
        Defaults to None.

    Returns
    -------
        A tuple ``(segments, classes)``. ``segments`` is a float32 array of shape (N, T, V)
        where N is the number of segments, T is the number of timesteps of each segment and
        V is the number of variables (always 1). ``classes`` holds the numeric code of each
        segment, as produced by a ``LabelEncoder`` fitted on ``labels``. When no run matches
        ``labels``, both arrays are empty (N = 0, and T is ``fixed_length`` or 0).
    """
    symbols = annotations.symbol
    n_annotations = len(symbols)

    # Group the annotations into runs (first, last, symbol): `first` and `last` are the
    # indices of the first and last consecutive occurrences of `symbol`.
    runs = []
    first = 0
    while first < n_annotations:
        symbol = symbols[first]
        last = first
        while last + 1 < n_annotations and symbols[last + 1] == symbol:
            last += 1
        runs.append((first, last, symbol))
        first = last + 1

    valid_labels = set(np.asarray(labels).ravel().tolist())
    n_samples = len(data.p_signal)

    segments = []
    classes = []
    for first, last, symbol in runs:
        if symbol not in valid_labels:
            continue

        start = max(annotations.sample[first] - left_offset, 0)
        stop = annotations.sample[last] + right_offset
        # NOTE: the upper bound is clamped to n_samples - 1 and used as an exclusive
        # bound, so the very last sample of the record is never included. This is the
        # historical behaviour and is kept so previously extracted data stays identical.
        if stop >= n_samples:
            stop = n_samples - 1
        # An inverted window must yield an empty segment, never a wrapped-around slice.
        stop = max(stop, start)

        # p_signal is a 2D array (samples x channels); only column index 1 is used.
        segment = np.array(data.p_signal[start:stop, 1], dtype='float32')

        if fixed_length is not None:
            # Truncate, then zero-pad on the right whatever is still missing. This also
            # covers empty segments, which would otherwise break the stacking below.
            segment = segment[:fixed_length]
            missing = fixed_length - len(segment)
            if missing > 0:
                segment = np.pad(segment, (0, missing), mode='constant', constant_values=0)

        segments.append(segment)
        classes.append(symbol)

    if not segments:
        # No run matched `labels`: return empty, correctly shaped arrays instead of
        # letting np.stack fail on an empty list.
        n_timesteps = fixed_length if fixed_length is not None else 0
        return (np.empty((0, n_timesteps, 1), dtype='float32'),
                np.empty((0,), dtype='int64'))

    result = np.stack(segments, axis=0)
    # (N, T) -> (N, T, 1): N segments, T timesteps, a single variable.
    result = np.reshape(result, (result.shape[0], result.shape[1], 1))
    classes = np.array(classes, dtype=str)

    # Encode labels: from string to numeric.
    label_encoder = LabelEncoder()
    label_encoder.fit(labels)
    classes = label_encoder.transform(classes)

    return (result, classes)


def read_mit_bih(path: str,
                 labels: np.ndarray = np.array(['N', 'L', 'R', 'A', 'V']),
                 left_offset: int = 99,
                 right_offset: int = 160,
                 fixed_length: Optional[int] = 1000) -> Tuple[np.ndarray, np.ndarray]:
    """
    It reads the MIT-BIH Arrhythmia data with the specified default configuration of the work presented at:
    Oh, Shu Lih, et al. "Automated diagnosis of arrhythmia using combination of CNN and LSTM techniques with
    variable length heart beats." Computers in biology and medicine 102 (2018): 278-287.

    Every record that has a ``.dat`` file in ``path`` is read together with its ``.atr``
    annotations and segmented with ``get_mit_bih_segments``. Records are processed in
    sorted name order so the result does not depend on the directory listing order.

    Parameters
    ----------
    path : str
        The path of the directory where the data files are stored. Note: The data and annotations
        files must have the same name, but different extension (annotations must have .atr extension).
        A trailing path separator is optional.
    labels : array-like
        The labels of the different types of arrhythmia to be employed
    left_offset : int
        The number of instances at the left of the first R peak of the segment. Defaults to 99
    right_offset : int
        The number of instances at the right of the last R peak of the segment. Defaults to 160
    fixed_length : int, optional
        If different to None, the segment will have the specified number of instances: longer
        segments are truncated and shorter ones are padded with zeros. Defaults to 1000.

    Returns
    -------
        A tuple that contains the data and the associated labels as an ndarray. Data has a shape of (N, T, V)
        where N is the number of segments (or instances), V is the number of variables (1 in this case)
        and T is the number of timesteps of each segment.  Labels are numerically encoded according to the
        value passed in the ``labels`` parameter.

    Raises
    ------
    ValueError
        If ``path`` contains no ``.dat`` file.
    """
    print("reading data...")

    record_names = sorted(os.path.splitext(file_name)[0]
                          for file_name in os.listdir(path)
                          if file_name.endswith('.dat'))
    if not record_names:
        raise ValueError("No .dat record files found in directory: {!r}".format(path))

    segments = []
    classes = []
    for record_name in record_names:
        # Join instead of concatenating so `path` works with or without a trailing separator.
        record_path = os.path.join(path, record_name)
        data = wfdb.rdrecord(record_path)
        annotation = wfdb.rdann(record_path, 'atr')

        record_segments, record_classes = get_mit_bih_segments(data=data,
                                                               annotations=annotation,
                                                               labels=labels,
                                                               left_offset=left_offset,
                                                               right_offset=right_offset,
                                                               fixed_length=fixed_length)

        segments.append(record_segments)
        classes.append(record_classes)

    segments = np.vstack(segments)
    classes = np.concatenate(classes)
    print("done.")

    return (segments, classes)


class MIT_BIH(Dataset):
    """
        Reads the MIT-BIH dataset and exposes it as a Dataset whose items have shape (C, L), where C is the
        number of channels (1 in this dataset) and L is the `length` of the time series (1000 by default).
        The whole data is stored with shape (N, C, L), N being the number of segments.

        Parameters
        ----------
        path : str
            The path of the directory where the data files are stored. Note: The data and annotations
            files must have the same name, but different extension (annotations must have .atr extension)
        labels : array-like
            The labels of the different types of arrhythmia to be employed
        length : int
            The fixed number of timesteps of every segment (passed as ``fixed_length`` to
            ``read_mit_bih``). Defaults to 1000
        left_offset : int
            The number of instances at the left of the first R peak of the segment. Defaults to 99
        right_offset : int
            The number of instances at the right of the last R peak of the segment. Defaults to 160
        return_hot_coded : bool
            Whether to return one-hot encoded labels (True) or the raw numeric labels (False).
            One-hot vectors have one column per distinct value in ``labels``, regardless of
            which classes actually appear in the data.

        Attributes
        ----------
        x : ndarray
            The segments, with shape (N, C, L).
        y : ndarray
            The labels: shape (N,) when raw, (N, number of distinct labels) when one-hot encoded.
    """
    def __init__(self, path,
                 labels=np.array(['N', 'L', 'R', 'A', 'V']),
                 length=1000,
                 left_offset=99,
                 right_offset=160,
                 return_hot_coded=False):
        X, y = read_mit_bih(path, labels, left_offset=left_offset, right_offset=right_offset, fixed_length=length)

        # (N, T, V) -> (N, V, T). A plain reshape is a valid axis swap here only because
        # the data has a single variable (V == 1).
        self.x = X.reshape((X.shape[0], X.shape[2], X.shape[1]))
        if return_hot_coded:
            # Size the encoding from the requested labels, not from the largest code seen
            # in the data, so the width is stable even when some class never occurs.
            self.y = self._one_hot(y, len(np.unique(labels)))
        else:
            self.y = y

    @staticmethod
    def _one_hot(y, n_classes):
        """Return the (len(y), n_classes) int64 one-hot encoding of the integer codes in y."""
        encoded = np.zeros((y.size, n_classes), dtype='int64')
        encoded[np.arange(y.size), y] = 1
        return encoded

    def __len__(self):
        """Return the number of segments."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the (segment, label) pair stored at `index`."""
        return self.x[index], self.y[index]
Source code for TSFEDL.data

TSFEDL

Navigation

Related Topics