# Source code for metrics.script.script

from typing import List, Optional

import numpy as np
import pandas as pd

from metrics.analysis.database import Database


class Script:
    """
    Script for visualizing the data.

    Groups stateless helpers (all static methods) that open the per-level
    databases, query their ``stats`` table into one dataframe, and list the
    unique values of a column.
    """

    @staticmethod
    def create_databases(
        population: str,
        context: str,
        heterogeneity_levels: Optional[list] = None,
    ) -> List["Database"]:
        """
        Creates database files for each context.

        One ``Database`` is constructed per heterogeneity level, pointing at
        ``./data/databases/{population}_{context}_{heterogeneity}.db``.

        Parameters
        ----------
        population : str
            The population.
        context : str
            The context.
        heterogeneity_levels : Optional[list]
            The heterogeneity levels. When omitted (or None), the levels
            ``["00", "10", "20", "30", "40", "50"]`` are used.

        Returns
        -------
        : List[Database]
            List of database objects, in the order of the given levels.
        """
        # A None sentinel replaces the former list default so that no mutable
        # object is shared between calls.
        if heterogeneity_levels is None:
            heterogeneity_levels = ["00", "10", "20", "30", "40", "50"]

        return [
            Database(f"./data/databases/{population}_{context}_{heterogeneity}.db")
            for heterogeneity in heterogeneity_levels
        ]

    @staticmethod
    def select_data(
        database_list: List["Database"],
        context: str,
        reference_time: Optional[str] = None,
        observation_time: Optional[str] = None,
        feature: Optional[str] = None,
        comparison: Optional[dict] = None,
    ) -> pd.DataFrame:
        """
        Selects data from the database.

        Runs the same ``SELECT * FROM stats`` query against every database,
        restricted by whichever filters are not None, and stacks the results.
        For each result the ``key`` column is split on ``"_"``; field 4 becomes
        the integer ``heterogeneity`` column, field 3 becomes ``cell_line``,
        and ``key`` itself is dropped. A constant ``context`` column is added
        and ``"NULL"`` entries in ``divergence`` are converted to NaN.

        Parameters
        ----------
        database_list : List[Database]
            The database objects to query.
        context : str
            The context, written into the ``context`` column of every row.
        reference_time : Optional[str]
            The reference time.
        observation_time : Optional[str]
            The observation time.
        feature : Optional[str]
            The feature.
        comparison: Optional[dict]
            The comparison group. Its values are joined with ``" | "``.

        Returns
        -------
        combined_table : pd.DataFrame
            Combined selected dataframe.

        Raises
        ------
        ValueError
            If ``database_list`` is empty (nothing to concatenate).
        """
        # NOTE(review): filter values are interpolated straight into the SQL
        # text; only pass trusted values until execute_query is confirmed to
        # support bound parameters.
        query_list: List[str] = []
        if reference_time is not None:
            query_list.append(f"reference_time = {reference_time}")
        if observation_time is not None:
            # NOTE(review): the column is spelled "obervation_time" here; kept
            # as-is because the table schema is not visible - confirm it.
            query_list.append(f"obervation_time = {observation_time}")
        if feature is not None:
            query_list.append(f"feature = '{feature}'")
        if comparison is not None:
            query_list.append(f"comparison_group = '{' | '.join(comparison.values())}'")

        # Only emit a WHERE clause when there is at least one condition; a bare
        # trailing "WHERE" is not valid SQL.
        query = "SELECT * FROM stats"
        if query_list:
            query += f" WHERE {' AND '.join(query_list)}"

        combined_table: List[pd.DataFrame] = []
        for database in database_list:
            data = database.execute_query(query)

            # Add context column and change key column
            data["context"] = context
            key_split = data["key"].str.split("_", expand=True)
            # rename returns a new frame, so no labels are assigned on a slice.
            selected_columns = key_split[[4, 3]].rename(
                columns={4: "heterogeneity", 3: "cell_line"}
            )
            data = pd.concat([data, selected_columns], axis=1)
            data = data.drop("key", axis=1)

            data["heterogeneity"] = data["heterogeneity"].astype(int)
            data["divergence"] = data["divergence"].replace("NULL", np.nan).astype(float)

            combined_table.append(data)

        return pd.concat(combined_table, axis=0)

    @staticmethod
    def find_unique_variables(
        data: pd.DataFrame, variable: Optional[str], column_name: Optional[str] = "category"
    ) -> np.ndarray:
        """
        Find unique variables.

        Parameters
        ----------
        data : pd.DataFrame
            The data.
        variable : Optional[str]
            The variable. When it equals ``"population"``, the value ``"4"``
            is removed from the result.
        column_name : Optional[str]
            The column name whose unique values are returned.

        Returns
        -------
        unique_variables : np.ndarray
            Unique variables, in order of first appearance.
        """
        unique_variables = data[column_name].unique()

        if variable == "population":
            # Boolean mask keeps every entry except "4".
            unique_variables = unique_variables[unique_variables != "4"]

        return unique_variables