Source code for srlearn.base

# Copyright © 2017, 2018, 2019 Alexander L. Hayes

"""
Base class for Boosted Relational Models
"""

from collections import Counter
import inspect
import json
import logging
import subprocess
import warnings

from sklearn.utils.validation import check_is_fitted

from .background import Background
from .system_manager import FileSystem
from .utils._parse_trees import parse_tree
from ._meta import __version__


warnings.simplefilter("default")


class BaseBoostedRelationalModel:
    """Base class for deriving boosted relational models

    This class follows the conventions of :class:`sklearn.base.BaseEstimator`
    and :class:`sklearn.base.ClassifierMixin` while providing several
    utilities for instantiating a model and performing learning/inference
    with the BoostSRL jar files.

    .. note:: This is not a complete treatment of *how to derive estimators*.
        Contributions would be appreciated.

    Examples
    --------

    The actual :class:`srlearn.rdn.BoostedRDNClassifier` is derived from this
    class, so this example is similar to the implementation (but the actual
    implementation passes model parameters instead of leaving them with the
    defaults).

    This example derives a new class ``BoostedRDNClassifier``, which inherits
    the default values of the superclass while also setting a
    'special_parameter' which may be unique to this model.

    All that remains is to implement the specific cases of ``fit()``,
    ``predict()``, and ``predict_proba()``. A minimal, illustrative sketch of
    such a derived class appears at the end of this module.
    """
    def __init__(
        self,
        *,
        background=None,
        target="None",
        n_estimators=10,
        node_size=2,
        max_tree_depth=3,
        neg_pos_ratio=2,
        solver=None,
    ):
        """Initialize a BaseEstimator"""
        self.background = background
        self.target = target
        self.n_estimators = n_estimators
        self.neg_pos_ratio = neg_pos_ratio

        if solver is None:
            warnings.warn(
                "solver='BoostSRL' will default to solver='SRLBoost' in 0.6.0"
                ", pass one or the other as an argument to suppress this warning.",
                FutureWarning,
            )
            self.solver = "BoostSRL"
        else:
            if solver not in ("BoostSRL", "SRLBoost"):
                raise ValueError("`solver` must be 'SRLBoost' or 'BoostSRL'")
            self.solver = solver

        if isinstance(background, Background):
            self.node_size = node_size
            self.max_tree_depth = max_tree_depth
    @property
    def node_size(self):
        return self.background.node_size

    @node_size.setter
    def node_size(self, value):
        self.background.node_size = value

    @property
    def max_tree_depth(self):
        return self.background.max_tree_depth

    @max_tree_depth.setter
    def max_tree_depth(self, value):
        self.background.max_tree_depth = value

    @classmethod
    def _get_param_names(cls):
        # Based on `scikit-learn.base.BaseEstimator._get_param_names`
        # Copyright Gael Varoquaux, BSD 3 clause
        signature = inspect.signature(cls)
        parameters = [
            p
            for p in signature.parameters.values()
            if p.name != "self" and p.kind != p.VAR_KEYWORD
        ]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError("Oh no.")
        return sorted([p.name for p in parameters])

    def _check_params(self):
        """Check validity of parameters. Raise ValueError if errors are detected.

        If all parameters are valid, allocate ``self.file_system`` as a
        :class:`srlearn.system_manager.FileSystem`.
        """
        checks = (
            (
                self.target,
                (str,),
                (lambda x: x != "None",),
                "'target' must be a string and cannot be 'None'",
            ),
            (
                self.background,
                (Background,),
                (),
                "'background' must be a Background instance",
            ),
            (
                self.n_estimators,
                (int,),
                (
                    lambda x: not isinstance(x, bool),
                    lambda x: x >= 1,
                ),
                "'n_estimators' must be an 'int' >= 1",
            ),
            (
                self.neg_pos_ratio,
                (int, float),
                (lambda x: not isinstance(x, bool), lambda x: x >= 1.0),
                "'neg_pos_ratio' must be an 'int' or 'float' >= 1.0",
            ),
        )

        for param, types, constraints, message in checks:
            if not any([isinstance(param, t) for t in types]):
                raise ValueError(message)
            for c in constraints:
                if not c(param):
                    raise ValueError(message)

        # If all params are valid, allocate a FileSystem:
        self.file_system = FileSystem()

    def to_json(self, file_name) -> None:
        """Serialize a learned model to json.

        Parameters
        ----------
        file_name : str (or pathlike)
            Path the json file will be written to.

        Notes / Warnings
        ----------------

        Intended for locally saving/loading.

        .. warning::

            There could be major changes between releases, causing old model
            files to break.
        """
        check_is_fitted(self, "estimators_")

        with open(
            self.file_system.files.BRDNS_DIR.joinpath(
                "{0}.model".format(self.target)
            ),
            "r",
        ) as _fh:
            _model = _fh.read().splitlines()

        model_params = {
            "background": dict(self.background.__dict__.items()),
            "target": self.target,
            "n_estimators": self.n_estimators,
            "node_size": self.node_size,
            "max_tree_depth": self.max_tree_depth,
            "neg_pos_ratio": self.neg_pos_ratio,
            "solver": self.solver,
        }

        with open(file_name, "w") as _fh:
            _fh.write(
                json.dumps(
                    [
                        __version__,
                        _model,
                        self.estimators_,
                        model_params,
                        self._dotfiles,
                    ]
                )
            )

    def from_json(self, file_name):
        """Load a learned model from json.

        Parameters
        ----------
        file_name : str (or pathlike)
            Path to a saved json file.

        Notes / Warnings
        ----------------

        Intended for locally saving/loading.

        .. warning::

            There could be major changes between releases, causing old model
            files to break. There are also *no checks* to ensure you are
            loading the correct object type.
        """
        with open(file_name, "r") as _fh:
            params = json.loads(_fh.read())

        if params[0] != __version__:
            logging.warning(
                "Version of loaded model ({0}) does not match srlearn version ({1}).".format(
                    params[0], __version__
                )
            )

        _model = params[1]
        _estimators = params[2]
        _model_parameters = params[3]
        try:
            self._dotfiles = params[4]
        except IndexError:
            self._dotfiles = None
            logging.warning(
                "Did not find dotfiles during load, srlearn.plotting may not work."
            )

        _bkg = Background()
        _bkg.__dict__ = _model_parameters["background"]

        # 1. Loop over all class attributes of `BaseBoostedRelationalModel`
        #    except `background`, `node_size`, and `max_tree_depth`, which are
        #    handled by `Background` objects.
        # 2. Update an `_attributes` dictionary mapping attributes from JSON.
        # 3. *If a key was not present in the JSON*: set it to the default value.
        # 4. Initialize self by unpacking the dictionary into arguments.
        _attributes = {
            "background": _bkg,
            "node_size": _model_parameters["node_size"],
            "max_tree_depth": _model_parameters["max_tree_depth"],
        }
        for key in set(BaseBoostedRelationalModel()._get_param_names()) - {
            "background",
            "node_size",
            "max_tree_depth",
        }:
            _attributes[key] = _model_parameters.get(
                key,
                BaseBoostedRelationalModel().__dict__[key],
            )
        self.__init__(**_attributes)
        self.estimators_ = _estimators

        # Currently allocates the File System.
        self._check_params()

        self.file_system.files.TREES_DIR.mkdir(parents=True)

        with open(
            self.file_system.files.BRDNS_DIR.joinpath(
                "{0}.model".format(self.target)
            ),
            "w",
        ) as _fh:
            _fh.write("\n".join(_model))

        for i, _tree in enumerate(_estimators):
            with open(
                self.file_system.files.TREES_DIR.joinpath(
                    "{0}Tree{1}.tree".format(self.target, i)
                ),
                "w",
            ) as _fh:
                _fh.write(_tree)

        return self

    @property
    def feature_importances_(self):
        """Count how often each feature occurs across the learned trees.

        Returns
        -------
        features : collections.Counter
            Mapping from each feature parsed out of the learned trees to the
            number of times it occurs.
        """
        check_is_fitted(self, "estimators_")

        features = []

        for tree_number in range(self.n_estimators):
            _rules_string = self.estimators_[tree_number]
            features += parse_tree(
                _rules_string, (not self.background.use_std_logic_variables)
            )

        return Counter(features)

    def _get_dotfiles(self):
        dotfiles = []
        for i in range(self.n_estimators):
            with open(
                self.file_system.files.DOT_DIR.joinpath(
                    "WILLTreeFor_" + self.target + str(i) + ".dot"
                )
            ) as _fh:
                dotfiles.append(_fh.read())
        self._dotfiles = dotfiles

    def _check_initialized(self):
        """Check for the estimator(s), raise an error if not found."""
        check_is_fitted(self, "estimators_")

    @staticmethod
    def _call_shell_command(shell_command):
        """Start a new process to execute a shell command.

        This is intended for use in calling jar files. It opens a new process
        and waits for it to return 0.

        Parameters
        ----------
        shell_command : str
            A string representing a shell command.

        Returns
        -------
        None
        """
        _pid = subprocess.Popen(shell_command, shell=True)
        _status = _pid.wait()

        if _status != 0:
            raise RuntimeError(
                "Error when running shell command: {0}".format(shell_command)
            )

    def fit(self, database):
        raise NotImplementedError

    def predict(self, database):
        raise NotImplementedError

    def predict_proba(self, database):
        raise NotImplementedError

    def __repr__(self):
        params = self._get_param_names()
        params.remove("max_tree_depth")
        params.remove("node_size")
        params = ", ".join(
            [str(param) + "=" + repr(self.__dict__[param]) for param in params]
        )
        return self.__class__.__name__ + f"({params})"
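

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the srlearn API): the class docstring above
# describes deriving a new estimator from ``BaseBoostedRelationalModel``. The
# name ``_ExampleBoostedEstimator`` and its ``special_parameter`` argument are
# hypothetical and exist only to show the pattern; a real subclass would also
# implement ``fit()``, ``predict()``, and ``predict_proba()``.


class _ExampleBoostedEstimator(BaseBoostedRelationalModel):
    """Minimal derived estimator demonstrating parameter handling."""

    def __init__(self, *, background=None, target="None", special_parameter=5):
        # Shared parameters (n_estimators, node_size, solver, ...) keep the
        # defaults provided by the superclass.
        super().__init__(background=background, target=target)
        self.special_parameter = special_parameter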
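

# Illustrative ``to_json`` / ``from_json`` round trip (a sketch, not part of
# this module). Assumptions: ``srlearn.rdn.BoostedRDNClassifier`` is used as
# the concrete estimator, Java and the bundled jar files are available, and
# the toy examples, modes, and the file name "cancer_model.json" are made up
# for this demonstration.
if __name__ == "__main__":
    from srlearn import Database
    from srlearn.rdn import BoostedRDNClassifier

    train = Database()
    train.pos = ["cancer(alice)."]
    train.neg = ["cancer(bob)."]
    train.facts = ["friends(alice, bob).", "smokes(alice)."]

    bkg = Background(
        modes=["cancer(+person).", "friends(+person, -person).", "smokes(+person)."]
    )

    clf = BoostedRDNClassifier(
        background=bkg, target="cancer", n_estimators=5, solver="SRLBoost"
    )
    clf.fit(train)
    clf.to_json("cancer_model.json")  # serialize version, trees, and parameters

    restored = BoostedRDNClassifier()
    restored.from_json("cancer_model.json")  # rebuilds estimators_ and a FileSystem
    print(restored.feature_importances_)  # Counter of features parsed from the trees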