from typing import List, Any, Tuple, Union
import numpy as np
import pandas as pd
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.stats import entropy, kstest, gaussian_kde
from metrics.feature.feature import Feature
class ContinuousFeature(Feature):
    """
    Representation of a continuous data feature.

    Attributes
    ----------
    name :
        The name of the feature.
    affinity : {"NUMERIC", "INTEGER", "REAL", "TEXT", "BLOB"}
        The SQLite3 type affinity of the feature.
    is_null :
        True if feature data can be null, False otherwise.
    """

    feature_type = "continuous"
    """string: Type of the feature."""

    def __str__(self) -> str:
        """Return the parent's string form prefixed with ``"CONTINUOUS "``."""
        return "CONTINUOUS " + super().__str__()

    def compare_feature_stat(
        self, sample_data: pd.DataFrame, simulation_data: pd.DataFrame
    ) -> float:
        """
        Uses statistical tests to compare continuous features.

        Uses the Kolmogorov-Smirnov test to compare the feature's sample
        values against the empirical distribution function (ECDF) built from
        the simulation values. The Kolmogorov-Smirnov statistic quantifies a
        distance between the empirical distribution functions.

        Parameters
        ----------
        sample_data :
            Loaded sample data.
        simulation_data :
            Loaded simulation data (used as the reference distribution).

        Returns
        -------
        :
            p-value of the Kolmogorov-Smirnov test, or NaN if the feature
            column is missing or entirely null in either dataframe.
        """
        if not self.is_valid_feature_name(simulation_data, sample_data):
            return float("nan")

        sample = list(sample_data[self.name])
        reference = list(simulation_data[self.name])

        # The simulation ECDF acts as the callable reference CDF for kstest.
        reference_cdf = ECDF(reference)
        result = kstest(sample, reference_cdf)
        return result.pvalue

    def compare_feature_info(
        self, sample_data: pd.DataFrame, simulation_data: pd.DataFrame
    ) -> float:
        """
        Uses Kullback-Leibler divergence (KL divergence) to compare continuous features.

        The divergence is computed from the sample density to the simulation
        (reference) density, both estimated by ``get_pdfs``.

        Parameters
        ----------
        sample_data :
            Loaded sample data.
        simulation_data :
            Loaded simulation data (used as the reference distribution).

        Returns
        -------
        :
            KL divergence between the sample and simulation densities, or NaN
            if the feature column is missing or entirely null in either
            dataframe, or if the densities could not be estimated.
        """
        if not self.is_valid_feature_name(simulation_data, sample_data):
            return float("nan")

        sample: list = list(sample_data[self.name])
        reference: list = list(simulation_data[self.name])
        sample_prob, reference_prob = self.get_pdfs(sample, reference)

        # get_pdfs signals a failed density estimate with scalar NaNs; report
        # NaN explicitly instead of passing scalars on to entropy().
        if np.ndim(sample_prob) == 0 or np.ndim(reference_prob) == 0:
            return float("nan")

        # entropy(pk, qk) normalizes both inputs and returns D_KL(pk || qk).
        return entropy(sample_prob, reference_prob)

    def write_feature_data(
        self,
        data_list: list,
        sample_data: pd.DataFrame,
        simulation_data: pd.DataFrame,
    ) -> List[Any]:
        """
        Writes feature data into the list of data.

        Parameters
        ----------
        data_list:
            List of data in analysis table.
        sample_data :
            Loaded sample data.
        simulation_data :
            Loaded simulation data.

        Returns
        -------
        :
            Single-element list holding one row: ``data_list`` followed by
            ``None``, the statistical test result and the KL divergence.
        """
        stats_data = self.compare_feature_stat(sample_data, simulation_data)
        info_data = self.compare_feature_info(sample_data, simulation_data)

        # NOTE(review): the leading None is a placeholder column whose meaning
        # is not visible in this file - confirm against the analysis table.
        return [data_list + [None, stats_data, info_data]]

    def is_valid_feature_name(
        self, simulation_data: pd.DataFrame, sample_data: pd.DataFrame
    ) -> bool:
        """
        Checks that the feature can be compared between the two dataframes.

        Parameters
        ----------
        simulation_data :
            Loaded simulation data.
        sample_data :
            Loaded sample data.

        Returns
        -------
        :
            True if feature name valid and if dataframe contains data, False otherwise.
        """
        # The feature must exist as a column in both dataframes.
        if self.name not in simulation_data.columns or self.name not in sample_data.columns:
            return False

        # A column made up only of null values carries no data to compare.
        if simulation_data[self.name].isna().all() or sample_data[self.name].isna().all():
            return False

        return True

    def get_pdfs(
        self,
        sample_data: list[float],
        reference_data: list[float],
        num_bins: int = 1000,
    ) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[float, float]]:
        """
        Estimates discretized probability density functions of the feature.

        Both densities are estimated with a Gaussian kernel density estimate
        and evaluated on one shared, evenly spaced grid spanning the combined
        range of the two datasets.

        Parameters
        ----------
        sample_data :
            Distribution of the feature in the sample data.
        reference_data :
            Distribution of the feature in the simulation data.
        num_bins :
            Number of evenly spaced grid points used to discretize the densities.

        Returns
        -------
        :
            Tuple of (sample density, reference density) evaluated on the
            shared grid, or (NaN, NaN) if a density could not be estimated.
        """
        try:
            # Estimate the probability density function (PDF) of the sample and simulation data
            reference_kde = gaussian_kde(reference_data)
            sample_kde = gaussian_kde(sample_data)
        except (ValueError, np.linalg.LinAlgError):
            # If the sample or simulation contain only a single unique value, the KDE will fail
            return float("nan"), float("nan")

        # Discretize continuous distributions into discrete approximations.
        # The grid must cover the joint support of both datasets: lowest of
        # the two minima up to the highest of the two maxima.
        lower = min(min(reference_data), min(sample_data))
        upper = max(max(reference_data), max(sample_data))
        x = np.linspace(lower, upper, num_bins)

        reference_prob = reference_kde.pdf(x)
        sample_prob = sample_kde.pdf(x)
        return sample_prob, reference_prob