Source code for msAI.msData


"""msAI module for importing mass spectrometry data into dataframes.

Features
    * Extraction of data from MS files (mzML, TBD...)
    * Creation of in-memory data structures for spectra / peaks values
    * Building a set of MS data files

Todo
    * Change MSfile to dataclass
    * Change properties to attributes
    * Modify public / private
    * Create types for peaks and spectra dataframes

"""


import msAI.miscUtils as miscUtils
from msAI.errors import MSfileSetInitError
from msAI.miscDecos import log_timer
from msAI.types import DF

import os
import logging
from typing import ClassVar, List

import pandas as pd
import pymzml


# Logger for this module; it is obtained under the module's own ``__name__``.
logger = logging.getLogger(__name__)
"""Module logger."""


class MSfile:
    """Common read-only interface to the data of one MS file, whatever its file type.

    Each supported file type is handled by a subclass, which overrides
    ``__init__`` and fills in the private attributes declared below.
    The tabular data is exposed through the `peaks` and `spectra` properties
    as dataframes; the remaining properties expose run metadata and totals.
    """

    # Run metadata
    _run_id: str
    _run_date: str
    _ms_file_version: str

    # Totals
    _spectrum_count: int
    _peak_count: int
    _tic_sum: float

    # Tabular data
    _peaks: DF
    _spectra: DF

    def __init__(self):
        """Initializes an instance of MSfile class with placeholder values.

        Subclasses assign every attribute set here themselves,
        so they do not need to call this superclass initializer.
        """

        # Metadata and totals start out unset
        self._run_id = self._run_date = self._ms_file_version = None
        self._spectrum_count = self._peak_count = self._tic_sum = None

        # Two distinct, empty dataframes
        self._peaks, self._spectra = pd.DataFrame(), pd.DataFrame()

    @property
    def run_id(self):
        """Get the run ID of the sample, as given in its MS data file."""

        return self._run_id

    @property
    def run_date(self):
        """Get the date on which the sample was run, as given in its MS data file."""

        return self._run_date

    @property
    def ms_file_version(self):
        """Get the version of the data format the sample was originally saved in, as given in its MS file.

        Note: Currently, this is equivalent to mzML version number.
        """

        return self._ms_file_version

    @property
    def spectrum_count(self):
        """Get the number of MS spectra in the sample run.

        The value is the number of spectra actually imported; it is not read from MS file metadata.
        """

        return self._spectrum_count

    @property
    def peak_count(self):
        """Get the total number of MS peaks over all MS spectra in the sample run."""

        return self._peak_count

    @property
    def tic_sum(self):
        """Get the sum of the total ion current over all spectra in the sample run."""

        return self._tic_sum

    @property
    def peaks(self):
        """Get the dataframe holding every peak in the MS file.

        Dataframe structure
            | **First Index Level:**  spec_id
            | **Second Index Level:**  peak_number
            | **Columns:**  rt,  mz,  i
        """

        return self._peaks

    @property
    def spectra(self):
        """Get the dataframe holding every spectrum in the MS file.

        Dataframe structure
            | **Index:**  spec_id
            | **Columns:**  rt,  peak_count,  tic,  ms_lvl,  filters
        """

        return self._spectra


class MZMLfile(MSfile):
    """Class to access MS data stored in an mzML file.

    The whole file is read once during initialization: every spectrum is
    converted into rows of the `peaks` and `spectra` dataframes, run metadata
    is copied from the reader, and the reader is then discarded.
    """

    def __init__(self,
                 mzml_file_path: str):
        """Initializes an instance of MZMLfile class.

        Args:
            mzml_file_path: A string representation of the path to the mzML data file.
                Path can be relative or absolute.
        """

        self._mzml_file_path = mzml_file_path
        self._run = pymzml.run.Reader(self._mzml_file_path)

        # Sets self._peaks and self._spectra
        self._create_dfs()

        # Metadata as reported by the reader
        self._run_id = self._run.info['run_id']
        self._run_date = self._run.info['start_time']
        self._ms_file_version = self._run.info['mzml_version']

        # Totals are computed from the imported dataframes rather than taken from the reader's info
        self._spectrum_count = self.spectra.index.size
        self._tic_sum = self.spectra['tic'].sum()       # Total ion current sum of all spectra
        self._peak_count = self.peaks.index.size        # Total number of MS peaks from all spectra

        # The reader is only needed while importing
        del self._run

    def _create_spectrum_peaks_df(self, spectrum):
        """Creates a dataframe of all the peaks for a single spectrum in an mzML file.

        Args:
            spectrum: A spectrum yielded by iterating the mzML reader.

        Returns:
            A dataframe with one row per peak, indexed by (spec_id, peak_number),
            with columns rt, mz (rounded to 5 decimals), and i.
        """

        mz_values = spectrum.mz.round(5)
        i_values = spectrum.i
        rt = spectrum.scan_time_in_minutes()
        peak_count = len(mz_values)
        spectrum_id = [spectrum.ID]
        peak_list = list(range(peak_count))

        # Every peak of this spectrum shares the same spec_id and rt
        peak_index = pd.MultiIndex.from_product([spectrum_id, peak_list], names=['spec_id', 'peak_number'])
        peaks = {'rt': rt,
                 'mz': mz_values,
                 'i': i_values}
        spectrum_peaks = pd.DataFrame(peaks, index=peak_index)

        return spectrum_peaks

    def _create_spectrum_df(self, spectrum):
        """Creates a single-row dataframe describing one spectrum in an mzML file.

        Args:
            spectrum: A spectrum yielded by iterating the mzML reader.

        Returns:
            A dataframe with one row, indexed by the spectrum ID,
            with columns rt, peak_count, tic, ms_lvl, and filters.
        """

        rt = spectrum.scan_time_in_minutes()
        peak_count = len(spectrum.mz)

        try:
            tic = spectrum.TIC
        except AttributeError:
            logger.warning("missing TIC value in mzML file")
            tic = None

        ms_lvl = spectrum.ms_level
        filters = spectrum.get('filter string')
        spectrum_id = [spectrum.ID]

        spec = {'rt': rt,
                'peak_count': peak_count,
                'tic': tic,
                'ms_lvl': ms_lvl,
                'filters': filters}
        spectrum_df = pd.DataFrame(spec, index=spectrum_id)

        return spectrum_df

    def _create_dfs(self):
        """Creates spectra and peaks dataframes for an mzML file.

        This method sets the following properties:
            * self._peaks
            * self._spectra

        If the reader yields no spectra, both are set to empty dataframes
        that still carry the expected columns (and index levels for peaks).
        """

        peaks_df_list = []
        spectra_df_list = []

        for spectrum in self._run:
            peaks_df_list.append(self._create_spectrum_peaks_df(spectrum))
            spectra_df_list.append(self._create_spectrum_df(spectrum))

        if not spectra_df_list:
            # pd.concat raises ValueError when given nothing to concatenate,
            # so build the empty structures explicitly instead.
            logger.warning("no spectra found in mzML file: %s", self._mzml_file_path)
            empty_peak_index = pd.MultiIndex.from_arrays([[], []], names=['spec_id', 'peak_number'])
            self._peaks = pd.DataFrame(columns=['rt', 'mz', 'i'], index=empty_peak_index)
            self._spectra = pd.DataFrame(columns=['rt', 'peak_count', 'tic', 'ms_lvl', 'filters'])
            return

        self._peaks = pd.concat(peaks_df_list)
        self._spectra = pd.concat(spectra_df_list)


class MSfileSet:
    """Class to create a set of MS files from a data directory.

    Creating a set enables a large number of datafiles to be viewed / manipulated as a dataframe,
    without loading their entire contents into memory.

    By default, contents of sub directories will be recursively included.
    However, an error is raised if included filenames are duplicated.
    A Set can include any MSfile type (mzML, msAIr, or a mix).
    By default, any datafile matching these extensions will be included.
    An exclusive type may alternatively be specified.
    """

    mzML_exts: ClassVar[List[str]] = ['mzML', 'mzml', 'MZML']
    """File extensions considered to be mzML files."""

    msAIr_exts: ClassVar[List[str]] = ['msAIr', 'msair', 'MSAIR']
    """File extensions considered to be msAIr files."""

    @log_timer
    def __init__(self,
                 dir_path: str,
                 data_type: str = 'all',
                 recursive: bool = True):
        """Initializes an instance of MSfileSet class.

        Args:
            dir_path: A string representation of the path to the data directory.
                Path can be relative or absolute.
            data_type: (`all`, `mzML`, `msAIr`) The type of MS files to include in the set.
                By default, all types are included.
            recursive: A boolean indicating if files in subdirectories are included in the set.
                Defaults to ``True``.

        Raises:
            MSfileSetInitError: For an invalid `data_type` value, or for duplicated filenames.
        """

        self._dir_path = dir_path

        if data_type == 'all':
            ext_list = self.mzML_exts + self.msAIr_exts
        elif data_type == 'mzML':
            ext_list = self.mzML_exts
        elif data_type == 'msAIr':
            ext_list = self.msAIr_exts
        else:
            raise MSfileSetInitError(f"Invalid data_type: {data_type}")

        # The recursive case relies on the file grabber's default behaviour;
        # only the non-recursive case passes the keyword explicitly.
        if recursive:
            self._file_iter = miscUtils.FileGrabber.multi_extensions(self._dir_path, *ext_list)
        else:
            self._file_iter = miscUtils.FileGrabber.multi_extensions(self._dir_path, *ext_list, recursive=False)

        def file_gen():
            """Yields one (filename, file_type, file_size, path) row per datafile found."""

            for datafile in self._file_iter:
                file_size = miscUtils.Sizer.file_mb(datafile)

                filename, file_ext = os.path.splitext(os.path.basename(datafile))
                bare_ext = file_ext.replace(".", "")

                # Fix mixed cases extensions to have same file_type value
                normalized_ext = bare_ext.casefold()
                if normalized_ext == 'mzml':
                    file_type = 'mzML'
                elif normalized_ext == 'msair':
                    file_type = 'msAIr'
                else:
                    # Unrecognized extension: report it as-is rather than leaving
                    # file_type unbound or reusing the previous file's value.
                    file_type = bare_ext

                yield (filename, file_type, file_size, str(datafile))

        # Initial import into a dataframe with integer index
        self._hf = pd.DataFrame(file_gen(), columns=['filename', 'file_type', 'file_size', 'path'])

        # Test if any filenames are duplicated (all occurrences are reported)
        duplicates = self._hf[self._hf.duplicated('filename', keep=False)]
        if not duplicates.empty:
            raise MSfileSetInitError(f"Duplicated filenames:\n {duplicates.to_string()}")

        self._df = self._hf.set_index('filename', verify_integrity=True)

    def __repr__(self) -> str:
        """Returns the string rendering of the set's dataframe."""

        return self._df.to_string()

    @property
    def df(self) -> pd.DataFrame:
        """Get a dataframe of MS files.

        Dataframe structure
            | **Index:**  filename (file name without its extension)
            | **Columns:**  file_type,  file_size,  path
        """
        return self._df