import importlib
import inspect
import os
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple, Union

from sklearn.base import BaseEstimator, TransformerMixin

from ..utils.config import config
from ..utils.logging_config import setup_logger
logger = setup_logger(__name__)
[docs]
class Numerical(ABC):
    """Marker base class used to tag a step as a numerical step."""
[docs]
class Categorical(ABC):
    """Marker base class used to tag a step as a categorical step."""
[docs]
class NumericalCategorical(ABC):
    """Marker base class used to tag a step as both categorical and numerical."""
[docs]
class Classifier(ABC):
    """Marker base class used to tag a classification problem."""
[docs]
class Regressor(ABC):
    """Marker base class used to tag a regression problem."""
[docs]
class Step(ABC, BaseEstimator, TransformerMixin):
    """
    Base class to subclass when implementing a custom preprocessing step.

    A parametrizable step should define "param_grid" with all possible
    values for each of its parameters.
    """

    # NOTE(review): scikit-learn's developer guide recommends listing mixins
    # before BaseEstimator in the bases; the order here is kept as-is because
    # changing it alters the MRO seen by subclasses - confirm before touching.

    @abstractmethod
    def to_tex(self) -> dict:
        """
        Describe the step as a short dictionary.

        Expected keys:
            name - transformer name,
            desc - short description,
            params - class parameters (``{}`` when there are none).
        """
        ...
[docs]
class RequiredStep(Step):
    """Step that is required, i.e. always considered in preprocessing."""
[docs]
class NonRequiredStep(Step):
    """Step that is not required, i.e. only optionally considered for preprocessing."""
[docs]
class ModulesHandler(ABC):
    """
    Loads step classes from modules, groups them by the interfaces they
    extend and combines them with existing pipelines (lists of steps).
    """

    # Interfaces a loaded class may be grouped by.
    supported_interfaces: List[type] = [
        Numerical,
        Categorical,
        NumericalCategorical,
        RequiredStep,
        NonRequiredStep,
    ]
    # Pairs of (group name, interfaces a class must extend to join the group).
    # Group names containing "NonRequired" are treated as non-required groups.
    supported_combinations: List[Tuple[str, Tuple[type, ...]]] = [
        ("NumericalRequired", (Numerical, RequiredStep)),
        ("NumericalNonRequired", (Numerical, NonRequiredStep)),
        ("CategoricalRequired", (Categorical, RequiredStep)),
        ("CategoricalNonRequired", (Categorical, NonRequiredStep)),
        ("NumericalCategoricalRequired", (NumericalCategorical, RequiredStep)),
        ("NumericalCategoricalNonRequired", (NumericalCategorical, NonRequiredStep)),
    ]

    def __init__(self):
        """
        Performs checks.

        Raises:
            AssertionError - if any member of :obj:`ModulesHandler.supported_combinations`
                is not in :obj:`ModulesHandler.supported_interfaces`
        """
        for name, members in ModulesHandler.supported_combinations:
            for member in members:
                # Raised explicitly (instead of ``assert``) so the check is
                # not stripped when Python runs with -O.
                if member not in ModulesHandler.supported_interfaces:
                    raise AssertionError(
                        f"Unsupported member in group {name} - {member}"
                    )

    @staticmethod
    def get_subpackage(__file__):
        """
        Returns the directory containing the given file as a dot-separated
        path relative to ``config.root_project_dir``.

        Args:
            __file__ (str): The absolute or relative path to the current file.

        Returns:
            str: Dot-separated path of the file's directory, relative to
                ``config.root_project_dir`` (empty string if the file lives
                directly in that directory).

        Raises:
            ValueError - if the file's directory is not located under
                ``config.root_project_dir``.
        """
        current_file = os.path.abspath(__file__)
        abs_dir = os.path.dirname(current_file)
        root = config.root_project_dir

        # The root must be a real path prefix of abs_dir: a substring match in
        # the middle of the path, or a sibling directory that merely shares
        # the prefix (e.g. "/proj" vs "/proj_other"), would make the slice
        # below produce a meaningless package name.
        remainder = abs_dir[len(root):] if abs_dir.startswith(root) else None
        under_root = remainder is not None and (
            not remainder
            or root.endswith(os.path.sep)
            or remainder.startswith(os.path.sep)
        )
        if not under_root:
            logger.error(f"Tried to import module from {abs_dir}.")
            raise ValueError("Unknown relative module")

        rel_dir = remainder.lstrip(os.path.sep)
        # Convert the path to a module-style dot-separated format
        return rel_dir.replace(os.path.sep, ".")

    @staticmethod
    def construct_pipelines_steps_helper(
        step_name: str,
        package_name: str,
        called_from: str,
        pipelines: List[List[Step]],
        required_only_: bool = False,
    ) -> List[List[Step]]:
        """
        Wraps :meth:`ModulesHandler.construct_pipelines_steps` in a logger
        operation named after ``step_name``.

        Args:
            step_name (str): The name of the step, used for logging purposes.
            package_name (str): The name of the module containing the classes
                to be dynamically added to the pipelines.
            called_from (str) - python file from which this method is called. Required
                for relative imports.
            pipelines (List[List[Step]]): A list of existing pipelines to which
                new steps will be added.
            required_only_ (bool, optional): If `True`, only the required steps
                will be added. If `False`, both required and non-required
                steps will be included. Defaults to `False`.

        Returns:
            List[List[Step]]: The updated list of pipelines steps after incorporating
                the classes from the specified module.
        """
        logger.start_operation(step_name)
        pipelines = ModulesHandler.construct_pipelines_steps(
            step_name,
            package_name,
            called_from,
            pipelines=pipelines,
            required_only_=required_only_,
        )
        logger.end_operation()
        return pipelines

    @staticmethod
    def construct_pipelines_steps(
        step_name: str,
        module_name: str,
        called_from: str,
        pipelines: Optional[List[List[Step]]] = None,
        required_only_: bool = False,
    ) -> List[List[Step]]:
        """
        Constructs new pipelines (lists of steps) by adding steps from the
        provided module.

        Classes are loaded from the module and grouped. Existing pipelines are
        first exploded with the required steps; unless ``required_only_`` is
        set, every resulting pipeline is additionally exploded with the
        non-required steps, and both variants (with and without a
        non-required step) are kept.

        Args:
            step_name (str): The name of the step, used for logging purposes.
            module_name (str): The name of the module from which to load and group classes.
            called_from (str) - python file from which this method is called. Required
                for relative imports.
            pipelines (List[List[Step]], optional): Existing pipelines to be
                extended. ``None`` (the default) means no existing pipelines.
                The passed list is not modified.
            required_only_ (bool, optional): If `True`, only required steps are added to the
                pipelines. If `False`, both required and non-required steps are added.
                Defaults to `False`.

        Returns:
            List[List[Step]]: A list of new pipelines steps created by adding the corresponding
                required and non-required steps to the original pipelines.
        """
        # Avoid a shared mutable default argument.
        pipelines = [] if pipelines is None else pipelines

        logger.start_operation("Constructing new pipelines.")
        package = ModulesHandler.get_subpackage(called_from)
        new_steps, _ = ModulesHandler._load_and_group_classes(
            module_name, package=package
        )
        logger.debug(f"New steps: {new_steps}")
        logger.debug(f"Starting with {len(pipelines)} pipelines")

        new_pipelines, num_required = ModulesHandler._explode_pipelines_steps(
            steps=ModulesHandler._get_required_steps(),
            new_steps=new_steps,
            pipelines=pipelines,
        )
        logger.debug(f"After required steps - {len(new_pipelines)}")

        num_non_required = 0
        if not required_only_:
            non_required, num_non_required = ModulesHandler._explode_pipelines_steps(
                steps=ModulesHandler._get_non_required_steps(),
                new_steps=new_steps,
                pipelines=new_pipelines,
            )
            # keep those from required only and those extended with non-required steps
            if num_non_required > 0:  # there were some non-required steps
                new_pipelines.extend(non_required)
            logger.debug(f"After non-required steps - {len(new_pipelines)}")

        logger.info(
            f"Extracted {num_required + num_non_required} steps for {step_name} ({num_required} required, {num_non_required} non required)"
        )
        logger.end_operation()
        return new_pipelines

    @staticmethod
    def _explode_pipelines_steps(
        steps: List[str],
        new_steps: Dict[str, List[object]],
        pipelines: List[List[Step]],
    ) -> Tuple[List[List[Step]], int]:
        """
        Explodes the given pipelines by appending every matching new step to
        every existing pipeline.

        Args:
            steps (List[str]): Group names to look up in `new_steps`.
            new_steps (Dict[str, List[object]]): Maps a group name to the list
                of classes (steps) belonging to that group.
            pipelines (List[List[Step]]): Existing pipelines to be extended.

        Returns:
            List[List[Step]]: The new pipelines. If `pipelines` is empty, each
                matching class becomes a one-element pipeline. If no class
                matches, a shallow copy of `pipelines` is returned, so callers
                may extend the result without modifying their input.
            int: number of classes that were matched.
        """
        matched = []
        for step in steps:
            matched.extend(new_steps.get(step, []))

        if not matched:
            # Copy so that the caller's list is never aliased by the result.
            return list(pipelines), 0

        if not pipelines:
            return [[cls] for cls in matched], len(matched)

        exploded = []
        for cls in matched:
            for pipeline in pipelines:
                exploded.append([*pipeline, cls])
        return exploded, len(matched)

    @staticmethod
    def _get_required_steps() -> List[str]:
        """Returns list of names of required steps combinations"""
        return [
            name
            for name, _ in ModulesHandler.supported_combinations
            if "NonRequired" not in name
        ]

    @staticmethod
    def _get_non_required_steps() -> List[str]:
        """Returns list of names of non-required steps combinations"""
        return [
            name
            for name, _ in ModulesHandler.supported_combinations
            if "NonRequired" in name
        ]

    @staticmethod
    def _load_and_group_classes(
        module_name: str, package: str
    ) -> Tuple[Dict[str, List[object]], int]:
        """
        Imports all classes from module_name and groups those that extend
        every interface of a group defined in
        :obj:`ModulesHandler.supported_combinations`.

        Args:
            module_name (str) - module to import.
            package (str) - python package from which this method is called. Required
                for relative imports.

        Returns:
            Dict[str, List[object]] - classes grouped by group name; groups
                without any class are absent.
            int - number of unique classes that were grouped.

        Raises:
            ValueError - if any of imported classes fits into more than one group.
        """
        classes = ModulesHandler.load_classes(module_name, package)
        combinations: Dict[str, List[object]] = {}
        grouped = set()
        for cls in classes:
            for name, members in ModulesHandler.supported_combinations:
                if not all(issubclass(cls, member) for member in members):
                    continue
                # A class already placed in an earlier group is ambiguous.
                if cls in grouped:
                    raise ValueError(f"{cls} fits more than one group")
                grouped.add(cls)
                combinations.setdefault(name, []).append(cls)
        logger.debug(f"Retrieved follwing combinations - {combinations}")
        return combinations, len(grouped)

    @staticmethod
    def load_classes(module_name: str, package: str) -> List[object]:
        """
        Imports ``module_name`` and returns the classes found in it.

        Only classes whose ``__module__`` ends with ``module_name`` are
        returned, which filters out classes the module merely imported from
        modules with a different name.

        Args:
            module_name (str) - module to import (passed to
                :func:`importlib.import_module`).
            package (str) - anchor package used to resolve a relative
                ``module_name``.

        Returns:
            List[object] - the classes found, in the order given by
                :func:`inspect.getmembers`.
        """
        logger.debug(f"Importing classes from {module_name}")
        module = importlib.import_module(module_name, package=package)
        classes = [
            cls
            for _, cls in inspect.getmembers(module, inspect.isclass)
            if cls.__module__.endswith(module_name)
        ]
        logger.debug(f"Found following classes: {classes}")
        return classes