Source code for msAI.metadata
"""msAI module for importing sample metadata into dataframes.
Features
* Extraction of metadata from various file types
* Importing metadata into a dataframe
* Verification of metadata usability
* Auto indexing of metadata
Todo
* Move .msAIm saving to this module
* Refactor auto indexing
* Add anomaly detection
* Add additional file types: TBD...
"""
from msAI.errors import MetadataVerifyError, MetadataIndexError, MetadataInitError
from msAI.types import Series, DF, MetaDF
from msAI.miscUtils import Saver
from msAI.miscDecos import log_timer
import os
import logging
import pandas as pd
logger: logging.Logger = logging.getLogger(__name__)
"""Module logger."""
[docs]class SampleMetadata:
"""Imports sample metadata from a supported file type into a dataframe and assigns an index.
Supported file types: *.csv*, *.msAIm*, TBD...
(A *.msAIm* file can be created from a previous `.SampleSet`).
Content from the metadata file is initially imported into a dataframe with a default numerical index.
By default, metadata labels and values are analyzed and if possible, a new index is assigned from an existing column.
This index is used by `.SampleSet` to match this metadata with corresponding MS data in `.MSfileSet`.
Requirements to auto index metadata imported into a dataframe:
* Dataframe has 1 or more rows
* Dataframe has 2 or more columns
* For one and only one column:
* All column values are unique
* All entries/rows have a value for this column
"""
file_path: str
"""A string representation of the path to the metadata file."""
_hf: DF
"""High fidelity copy of imported data.
Leave this original data untouched for future reference if needed.
"""
df: MetaDF
"""The metadata dataframe."""
[docs] @log_timer
def __init__(self,
file_path: str,
auto_index: bool = True):
"""Initializes an instance of SampleMetadata class.
Args:
file_path: A string representation of the path to the metadata file.
Path can be relative or absolute.
auto_index: A boolean indicating if the metadata should be automatically indexed.
Default is True.
Raises:
MetadataInitError: For an invalid file type/extension.
"""
self.file_path = file_path
name, ext = os.path.splitext(self.file_path)
# CSV import
if ext.casefold() == ".csv":
self._hf = pd.read_csv(self.file_path)
self.df = self._hf.copy()
# Verify imported metadata is usable
self._verify_import()
if auto_index:
# Assign an index, if possible
self._auto_index()
# msAIm import
elif ext.casefold() == ".msaim":
metadata, hash_result = Saver.load_obj(self.file_path)
self.df = metadata
else:
raise MetadataInitError(f"Invalid file type/extension: {self.file_path}")
[docs] def __repr__(self):
"""Returns a string representation of the metadata dataframe."""
return self.df.to_string()
[docs] def _verify_import(self):
"""Verifies the imported metadata is usable.
Ensures at least one metadata entry/row and
at least two metadata labels/columns exist.
Raises:
MetadataVerifyError: If No metadata entries or not enough metadata labels are found
"""
def verify_entries_count():
# Ensure at least one metadata entry/row exists
row_count = self.df.shape[0]
if row_count < 1:
raise MetadataVerifyError("No metadata entries found")
def verify_label_count():
# Ensure at least two metadata labels/columns exist
column_count = self.df.columns.size
if column_count < 2:
raise MetadataVerifyError(f"Not enough metadata labels: {column_count} labels found")
verify_entries_count()
verify_label_count()
[docs] def _auto_index(self):
"""Attempts to identify and set the dataframe index from a metadata label/column.
This index is used to match metadata to `.SampleRun`.
"""
def most_unique_label() -> Series:
"""Gets the label(s)/column(s) with the most unique values - a possible index.
More than one label will be returned if there are ties.
Returns:
A series of count value index by label name
"""
return self.df.nunique().nlargest(1, keep='all')
def verify_index(possible_index):
"""Ensures contents of imported metadata is suitable for auto indexing."""
def verify_unique_label_values():
"""Ensures a label/column has a unique value for all entries/rows."""
row_count = self.df.shape[0]
unique_col_value_count = possible_index[0]
if row_count != unique_col_value_count:
raise MetadataIndexError(f"Count of unique metadata labels (n={unique_col_value_count}) not equal to entry count (n={row_count})")
def verify_single_unique_label():
"""Ensures a only a single label/column has a unique value for all entries/rows.
Otherwise, more than one label is suitable for use as index- the user must decide.
"""
possible_index_count = possible_index.shape[0]
if possible_index_count > 1:
raise MetadataIndexError(f"{possible_index_count} labels possible for use as index")
verify_unique_label_values()
verify_single_unique_label()
possible_index_label = most_unique_label()
try:
verify_index(possible_index_label)
except MetadataIndexError as err:
logger.error(f"Can not auto index metadata: {err}")
else:
index_name = possible_index_label.index[0]
self.df.set_index(index_name, inplace=True, verify_integrity=True)
[docs] def describe(self):
"""Prints a summary of metadata contents."""
print(self.df.describe().to_string())
[docs] def set_index(self,
new_index: str):
"""Manually sets the metadata dataframe index to an existing label/column.
This index is used to match metadata to `.SampleRun`.
Args:
new_index: The name of the metadata label/column to use as the index.
"""
self.df.set_index(new_index, inplace=True, verify_integrity=True)