"""msAI module for importing mass spectrometry data into dataframes.
Features:
* Extraction of data from MS files (currently mzML; other formats TBD)
* Creation of in-memory data structures for spectra / peaks values
* Building a set of MS data files

Todo:
* Change MSfile to dataclass
* Change properties to attributes
* Modify public / private
* Create types for peaks and spectra dataframes
"""
import msAI.miscUtils as miscUtils
from msAI.errors import MSfileSetInitError
from msAI.miscDecos import log_timer
from msAI.types import DF
import os
import logging
from typing import ClassVar, List
import pandas as pd
import pymzml
# Logger named after this module (via __name__), shared by every class below.
logger = logging.getLogger(__name__)
"""Module logger."""
class MSfile:
    """Common read-only interface to the data of a single MS file.

    Each supported file type is handled by a subclass, which replaces
    ``__init__`` and fills in every value exposed here.
    Per-peak and per-spectrum data are exposed as dataframes through the
    `peaks` and `spectra` properties; the remaining properties expose
    run-level metadata and summary values.
    """

    _run_id: str
    _run_date: str
    _ms_file_version: str
    _spectrum_count: int
    _peak_count: int
    _tic_sum: float
    _peaks: DF
    _spectra: DF

    def __init__(self):
        """Initializes an instance of MSfile class with placeholder values.

        Metadata and summary values start as ``None`` and both dataframes
        start empty. Subclasses assign all of these attributes themselves,
        so they are not required to call this initializer.
        """

        # Run-level metadata placeholders.
        self._run_id = self._run_date = self._ms_file_version = None

        # Summary-value placeholders.
        self._spectrum_count = self._peak_count = self._tic_sum = None

        # Data containers start out as empty frames rather than None.
        self._peaks = pd.DataFrame()
        self._spectra = pd.DataFrame()

    @property
    def run_id(self):
        """Get the run ID of the sample, as given in its MS data file."""

        return self._run_id

    @property
    def run_date(self):
        """Get the date on which the sample was run, as given in its MS data file."""

        return self._run_date

    @property
    def ms_file_version(self):
        """Get the data format version the sample was originally saved in, as given in its MS file.

        Note: Currently, this is equivalent to mzML version number.
        """

        return self._ms_file_version

    @property
    def spectrum_count(self):
        """Get the number of MS spectra in the sample run.

        The value is derived from the spectra that were actually imported,
        not read from the metadata of the MS file.
        """

        return self._spectrum_count

    @property
    def peak_count(self):
        """Get the number of MS peaks summed over all spectra in the sample run."""

        return self._peak_count

    @property
    def tic_sum(self):
        """Get the total ion current summed over all spectra in the sample run."""

        return self._tic_sum

    @property
    def peaks(self):
        """Get a dataframe holding every peak in the MS file.

        Dataframe structure
            | **First Index Level:** spec_id
            | **Second Index Level:** peak_number
            | **Columns:** rt, mz, i
        """

        return self._peaks

    @property
    def spectra(self):
        """Get a dataframe holding every spectrum in the MS file.

        Dataframe structure
            | **Index:** spec_id
            | **Columns:** rt, peak_count, tic, ms_lvl, filters
        """

        return self._spectra
class MZMLfile(MSfile):
    """Class to access MS data stored in an mzML file.

    The file is read once with ``pymzml`` during initialization; all values
    are then served from the in-memory `peaks` and `spectra` dataframes.
    """

    # Column layouts, used to build correctly shaped empty frames
    # when a file contains no spectra.
    _PEAKS_COLUMNS = ['rt', 'mz', 'i']
    _SPECTRA_COLUMNS = ['rt', 'peak_count', 'tic', 'ms_lvl', 'filters']

    def __init__(self,
                 mzml_file_path: str):
        """Initializes an instance of MZMLfile class.

        The superclass initializer is intentionally not called: every
        attribute it would set is assigned here.

        Args:
            mzml_file_path: A string representation of the path to the mzML data file.
                Path can be relative or absolute.
        """

        self._mzml_file_path = mzml_file_path
        self._run = pymzml.run.Reader(self._mzml_file_path)

        # Populates self._peaks and self._spectra by iterating over the run.
        self._create_dfs()

        # Run-level metadata taken from the reader's info mapping.
        self._run_id = self._run.info['run_id']
        self._run_date = self._run.info['start_time']
        self._ms_file_version = self._run.info['mzml_version']

        # Summary values are derived from the imported data,
        # not from the file's own metadata.
        self._spectrum_count = self.spectra.index.size
        self._tic_sum = self.spectra['tic'].sum()  # Total ion current sum of all spectra
        self._peak_count = self.peaks.index.size  # Total number of MS peaks from all spectra

        # The reader is only needed during import; drop the reference to it.
        del self._run

    def _create_spectrum_peaks_df(self, spectrum):
        """Creates a dataframe of all the peaks for a single spectrum in an mzML file.

        Args:
            spectrum: A spectrum object yielded by iterating the pymzml reader.

        Returns:
            A dataframe indexed by (spec_id, peak_number) with columns rt, mz, i.
            The spectrum's scan time is repeated on every peak row.
        """

        mz_values = spectrum.mz.round(5)
        i_values = spectrum.i
        rt = spectrum.scan_time_in_minutes()

        # One index entry per peak: (spectrum ID, 0..peak_count-1).
        peak_index = pd.MultiIndex.from_product(
            [[spectrum.ID], range(len(mz_values))],
            names=['spec_id', 'peak_number'],
        )

        peaks = {'rt': rt,
                 'mz': mz_values,
                 'i': i_values}

        return pd.DataFrame(peaks, index=peak_index)

    def _create_spectrum_df(self, spectrum):
        """Creates a single-row dataframe describing one spectrum in an mzML file.

        Args:
            spectrum: A spectrum object yielded by iterating the pymzml reader.

        Returns:
            A dataframe with one row, indexed by the spectrum ID,
            with columns rt, peak_count, tic, ms_lvl, filters.
        """

        try:
            tic = spectrum.TIC
        except AttributeError:
            # A missing TIC is tolerated: it is logged and stored as None.
            logger.warning("missing TIC value in mzML file")
            tic = None

        spec = {'rt': spectrum.scan_time_in_minutes(),
                'peak_count': len(spectrum.mz),
                'tic': tic,
                'ms_lvl': spectrum.ms_level,
                'filters': spectrum.get('filter string')}

        return pd.DataFrame(spec, index=[spectrum.ID])

    def _create_dfs(self):
        """Creates spectra and peaks dataframes for an mzML file.

        This method sets the following properties:
            * self._peaks
            * self._spectra

        If the file yields no spectra, both are set to empty dataframes that
        still carry the expected columns and index levels, so that values
        derived from them (counts, TIC sum) can be computed.
        """

        peaks_df_list = []
        spectra_df_list = []

        for spectrum in self._run:
            peaks_df_list.append(self._create_spectrum_peaks_df(spectrum))
            spectra_df_list.append(self._create_spectrum_df(spectrum))

        if spectra_df_list:
            self._peaks = pd.concat(peaks_df_list)
            self._spectra = pd.concat(spectra_df_list)
            return

        # pd.concat raises ValueError on an empty list, so build the empty
        # frames explicitly instead.
        logger.warning("no spectra found in mzML file: %s", self._mzml_file_path)
        empty_peak_index = pd.MultiIndex.from_arrays(
            [[], []],
            names=['spec_id', 'peak_number'],
        )
        self._peaks = pd.DataFrame(columns=self._PEAKS_COLUMNS, index=empty_peak_index)
        self._spectra = pd.DataFrame(columns=self._SPECTRA_COLUMNS)
class MSfileSet:
    """Class to create a set of MS files from a data directory.

    Creating a set enables a large number of datafiles to be viewed / manipulated as a dataframe,
    without loading their entire contents into memory.

    By default, contents of sub directories will be recursively included.
    However, an error is raised if included filenames are duplicated.

    A Set can include any MSfile type (mzML, msAIr, or a mix).
    By default, any datafile matching these extensions will be included.
    An exclusive type may alternatively be specified.
    """

    mzML_exts: ClassVar[List[str]] = ['mzML', 'mzml', 'MZML']
    """File extensions considered to be mzML files."""

    msAIr_exts: ClassVar[List[str]] = ['msAIr', 'msair', 'MSAIR']
    """File extensions considered to be msAIr files."""

    @log_timer
    def __init__(self,
                 dir_path: str,
                 data_type: str = 'all',
                 recursive: bool = True):
        """Initializes an instance of MSfileSet class.

        Args:
            dir_path: A string representation of the path to the data directory.
                Path can be relative or absolute.
            data_type: (`all`, `mzML`, `msAIr`) The type of MS files to include in the set.
                By default, all types are included.
            recursive: A boolean indicating if files in subdirectories are included in the set.
                Defaults to ``True``.

        Raises:
            MSfileSetInitError: For an invalid `data_type`, for a file whose extension
                is not a recognized MS file type, or for duplicated filenames.
        """

        self._dir_path = dir_path

        if data_type == 'all':
            ext_list = self.mzML_exts + self.msAIr_exts
        elif data_type == 'mzML':
            ext_list = self.mzML_exts
        elif data_type == 'msAIr':
            ext_list = self.msAIr_exts
        else:
            raise MSfileSetInitError(f"Invalid data_type: {data_type}")

        # The recursive keyword is only passed when disabling recursion,
        # leaving the file grabber's own default in effect otherwise.
        if recursive:
            self._file_iter = miscUtils.FileGrabber.multi_extensions(self._dir_path, *ext_list)
        else:
            self._file_iter = miscUtils.FileGrabber.multi_extensions(self._dir_path, *ext_list, recursive=False)

        # Initial import into a dataframe with integer index
        self._hf = pd.DataFrame(self._file_records(),
                                columns=['filename', 'file_type', 'file_size', 'path'])

        # Filenames become the index, so they must be unique across the set.
        # keep=False reports every member of a duplicate group, not just the repeats.
        duplicates = self._hf[self._hf.duplicated('filename', keep=False)]
        if not duplicates.empty:
            raise MSfileSetInitError(f"Duplicated filenames:\n {duplicates.to_string()}")

        self._df = self._hf.set_index('filename', verify_integrity=True)

    @staticmethod
    def _file_type_from_ext(file_ext: str):
        """Maps a file extension of any letter case to its canonical file type, or ``None`` if unknown."""

        normalized = file_ext.replace(".", "").casefold()
        if normalized == 'mzml':
            return 'mzML'
        if normalized == 'msair':
            return 'msAIr'
        return None

    def _file_records(self):
        """Yields one (filename, file_type, file_size, path) tuple per file in ``self._file_iter``."""

        for datafile in self._file_iter:
            file_size = miscUtils.Sizer.file_mb(datafile)
            filename, file_ext = os.path.splitext(os.path.basename(datafile))

            # Mixed-case extensions are normalized to a single file_type value.
            file_type = self._file_type_from_ext(file_ext)
            if file_type is None:
                # Fail clearly instead of hitting an unbound local or silently
                # reusing the type of the previously processed file.
                raise MSfileSetInitError(f"Unrecognized MS file extension: {datafile}")

            yield (filename, file_type, file_size, str(datafile))

    def __repr__(self):
        """Returns the string rendering of the set's dataframe."""

        return self._df.to_string()

    @property
    def df(self):
        """Get a dataframe of MS files.

        Dataframe structure
            | **Index:** filename (file name without its extension)
            | **Columns:** file_type, file_size, path

        ``file_size`` holds the value returned by ``miscUtils.Sizer.file_mb``
        (presumably megabytes - confirm against that helper).
        """

        return self._df