# Source code for srlearn.rdn

# Copyright © 2017, 2018, 2019 Alexander L. Hayes

"""
Relational Dependency Networks
"""

import re
import warnings
import numpy as np

from .base import BaseBoostedRelationalModel
from .database import Database

# Install the "default" action for every warning category, process-wide, so
# warnings that Python normally hides (notably ``DeprecationWarning``, which
# this module emits) are printed once per triggering location.
# NOTE(review): this runs at import time and changes the warning filters of
# the importing application too — confirm that is intended.
warnings.simplefilter("default")


class BoostedRDNClassifier(BaseBoostedRelationalModel):
    """Relational Dependency Networks Estimator

    Wrappers around BoostSRL for learning and inference with Relational Dependency
    Networks written with a scikit-learn-style interface derived from
    :class:`sklearn.base.BaseEstimator`

    Similar to :class:`sklearn.ensemble.GradientBoostingClassifier`, this builds
    a model by fitting a series of regression trees.

    Examples
    --------

    >>> from srlearn.rdn import BoostedRDNClassifier
    >>> from srlearn import Background
    >>> from srlearn.datasets import load_toy_cancer
    >>> train, test = load_toy_cancer()
    >>> bk = Background(modes=train.modes)
    >>> dn = BoostedRDNClassifier(background=bk, target="cancer")
    >>> dn.fit(train)
    BoostedRDNClassifier(background=setParam: numOfClauses=100.
    setParam: numOfCycles=100.
    usePrologVariables: true.
    setParam: nodeSize=2.
    setParam: maxTreeDepth=3.
    mode: friends(+Person,-Person).
    mode: friends(-Person,+Person).
    mode: smokes(+Person).
    mode: cancer(+Person).
    , n_estimators=10, neg_pos_ratio=2, solver='BoostSRL', target='cancer')
    >>> dn.predict(test)
    array([ True,  True,  True, False, False])

    """

    # pylint: disable=too-many-instance-attributes

    # Captures the number following the first "% Threshold = " marker in the
    # inference log. Accepts plain decimals ("0.53", "1", ".5") as well as
    # signed and exponent forms ("1.0E-4") so the value is never truncated.
    _THRESHOLD_PATTERN = re.compile(
        r"% Threshold = ([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
    )

    def __init__(
        self,
        background=None,
        target="None",
        n_estimators=10,
        node_size=2,
        max_tree_depth=3,
        neg_pos_ratio=2,
        solver=None,
    ):
        """Initialize a BoostedRDNClassifier

        All arguments are forwarded unchanged to the base class initializer.

        Parameters
        ----------
        background : :class:`srlearn.background.Background` (default: None)
            Background knowledge with respect to the database
        target : str (default: "None")
            Target predicate to learn
        n_estimators : int, optional (default: 10)
            Number of trees to fit
        node_size : int, optional (default: 2)
            Maximum number of literals in each node.
        max_tree_depth : int, optional (default: 3)
            Maximum number of nodes from root to leaf (height) in the tree.
        neg_pos_ratio : int or float, optional (default: 2)
            Ratio of negative to positive examples used during learning.
        solver : str, optional (default: None)
            Backend selector. In this class, ``"BoostSRL"`` selects the BoostSRL
            jar and any other value selects the SRLBoost jar.
            NOTE(review): how ``None`` is treated depends on the base class,
            which is not visible here — confirm.

        Attributes
        ----------
        estimators_ : array, shape (n_estimators)
            Return the boosted regression trees
        feature_importances_ : array, shape (n_features)
            Return the feature importances (based on how often each feature appears)
        threshold_ : float
            Decision threshold parsed from the inference log; set by
            :meth:`predict` and :meth:`predict_proba`.
        classes_ : ndarray
            ``0.0`` for each negated ("!"-prefixed) example and ``1.0`` for every
            other example in the most recent results file, in file order.
        """

        super().__init__(
            background=background,
            target=target,
            n_estimators=n_estimators,
            node_size=node_size,
            max_tree_depth=max_tree_depth,
            neg_pos_ratio=neg_pos_ratio,
            solver=solver,
        )

    @staticmethod
    def _as_database(database):
        """Return ``database`` as a :class:`Database`, converting tuple inputs.

        Tuples are expected to expose ``pos``, ``neg`` and ``facts`` attributes
        (i.e. a namedtuple-like object); anything else is returned untouched.
        """
        if not isinstance(database, tuple):
            return database

        converted = Database()
        converted.pos = database.pos
        converted.neg = database.neg
        converted.facts = database.facts
        return converted

    def _backend_jar(self):
        """Return the path (as ``str``) of the jar matching ``self.solver``."""
        if self.solver == "BoostSRL":
            return str(self.file_system.files.BOOSTSRL_BACKEND)
        return str(self.file_system.files.SRLBOOST_BACKEND)

    def fit(self, database):
        """Learn structure and parameters.

        Fit the structure and parameters of a Relational Dependency Network using a
        database of positive examples, negative examples, facts, and any relevant
        background knowledge.

        Parameters
        ----------
        database : :class:`srlearn.database.Database`
            Database containing examples and facts.

        Returns
        -------
        self : object
            Returns self.

        Notes
        -----

        The underlying algorithm is based on the "Relational Functional Gradient
        Boosting" as described in [1]_.

        This fit function is based on subprocess calling the BoostSRL jar files.
        This will require a Java runtime to also be available. See [2]_.

        .. [1] Sriraam Natarajan, Tushar Khot, Kristian Kersting, and Jude Shavlik,
           "*Boosted Statistical Relational Learners: From Benchmarks to Data-Driven
           Medicine*". SpringerBriefs in Computer Science, ISBN: 978-3-319-13643-1,
           2015
        .. [2] https://starling.utdallas.edu/software/boostsrl/
        """

        self._check_params()

        train_dir = self.file_system.files.TRAIN_DIR

        # Write the background, then the data, into the training directory.
        self.background.write(filename="train", location=train_dir)
        database = self._as_database(database)
        database.write(filename="train", location=train_dir)

        # NOTE(review): the command is a single shell string (it relies on ">"
        # redirection) and its pieces are not quoted, so paths or targets that
        # contain spaces or shell metacharacters will be misinterpreted.
        command = " ".join(
            [
                "java",
                "-jar",
                self._backend_jar(),
                "-l",
                "-train",
                str(train_dir),
                "-target",
                self.target,
                "-trees",
                str(self.n_estimators),
                "-negPosRatio",
                str(self.neg_pos_ratio),
                ">",
                str(self.file_system.files.TRAIN_LOG),
            ]
        )
        self._call_shell_command(command)

        # Read the learned trees back, one file per boosting iteration.
        trees_dir = self.file_system.files.TREES_DIR
        estimators = []
        for tree_number in range(self.n_estimators):
            tree_file = trees_dir.joinpath(
                "{0}Tree{1}.tree".format(self.target, tree_number)
            )
            with open(tree_file) as _fh:
                estimators.append(_fh.read())

        self._get_dotfiles()
        self.estimators_ = estimators

        return self

    def _run_inference(self, database) -> None:
        """Run inference mode on the BoostSRL Jar files.

        This is a helper method for ``self.predict`` and ``self.predict_proba``.
        It writes the background and data into the test directory, calls the
        backend jar, and stores the threshold reported in the test log as
        ``self.threshold_``.

        Raises
        ------
        RuntimeError
            If the test log contains no ``% Threshold = <number>`` entry.
        """

        self._check_initialized()

        database = self._as_database(database)
        test_dir = self.file_system.files.TEST_DIR

        # Write the background, then the data, into the test directory.
        self.background.write(filename="test", location=test_dir)
        database.write(filename="test", location=test_dir)

        # NOTE(review): same unquoted shell string caveat as in ``fit``.
        command = " ".join(
            [
                "java",
                "-jar",
                self._backend_jar(),
                "-i",
                "-test",
                str(test_dir),
                "-model",
                str(self.file_system.files.MODELS_DIR),
                "-target",
                self.target,
                "-trees",
                str(self.n_estimators),
                "-aucJarPath",
                str(self.file_system.files.AUC_JAR),
                ">",
                str(self.file_system.files.TEST_LOG),
            ]
        )
        self._call_shell_command(command)

        # Read the threshold (first occurrence in the log).
        with open(self.file_system.files.TEST_LOG, "r") as _fh:
            found = self._THRESHOLD_PATTERN.search(_fh.read())
        if found is None:
            raise RuntimeError(
                "No '% Threshold = <number>' entry found in {0}; "
                "inference may have failed.".format(self.file_system.files.TEST_LOG)
            )
        self.threshold_ = float(found.group(1))

    def _read_results(self):
        """Parse ``results_<target>.db`` from the test directory.

        Each non-blank line is split at its last ``") "``: the text before it is
        the example literal and the text after it is parsed as a float.

        The file is parsed by hand instead of ``np.loadtxt(delimiter=") ")``
        because NumPy >= 1.23 only accepts single-character delimiters, and
        because the old ``s[0] == 33`` converter only recognised the "!" marker
        when NumPy handed it ``bytes`` rather than ``str``.

        Returns
        -------
        classes : ndarray of float
            ``0.0`` where the literal starts with "!" (negated), else ``1.0``.
        values : ndarray of float
            The number found on each line, in file order.
        """
        results_db = self.file_system.files.TEST_DIR.joinpath(
            "results_" + self.target + ".db"
        )

        classes = []
        values = []
        with open(results_db) as _fh:
            for line in _fh:
                line = line.strip()
                if not line:
                    continue
                literal, _, value = line.rpartition(") ")
                classes.append(0.0 if literal.startswith("!") else 1.0)
                values.append(float(value))

        return np.asarray(classes, dtype=float), np.asarray(values, dtype=float)

    def _collect_probabilities(self):
        """Read the results file and return positive-class probabilities.

        Sets ``self.classes_`` as a side effect. The returned array lists the
        values of all non-negated examples first, followed by ``1 - value`` for
        all negated examples (so the order differs from the file when the two
        kinds are interleaved).
        """
        classes, values = self._read_results()
        self.classes_ = classes

        negatives = values[classes == 0]
        positives = values[classes == 1]
        return np.concatenate((positives, 1 - negatives), axis=0)

    def predict(self, database):
        """Use the learned model to predict on new data.

        Parameters
        ----------
        database : :class:`srlearn.Database`
            Database containing examples and facts.

        Returns
        -------
        results : ndarray
            Positive or negative class: ``True`` where the positive-class
            probability is strictly greater than ``self.threshold_``.
        """

        self._run_inference(database)
        return np.greater(self._collect_probabilities(), self.threshold_)

    def predict_proba(self, database):
        """Return class probabilities.

        Parameters
        ----------
        database : :class:`srlearn.Database`
            Database containing examples and facts.

        Returns
        -------
        results : ndarray
            Probability of belonging to the positive class
        """

        self._run_inference(database)
        return self._collect_probabilities()


class BoostedRDN(BoostedRDNClassifier):
    """Deprecated alias of :class:`BoostedRDNClassifier`.

    Behaves exactly like :class:`BoostedRDNClassifier` but emits a
    ``DeprecationWarning`` every time it is instantiated.
    """

    def __init__(self, **kwargs):
        """Warn about the deprecated name, then initialize the classifier.

        Parameters
        ----------
        **kwargs
            Keyword arguments forwarded unchanged to
            :class:`BoostedRDNClassifier`.
        """
        warnings.warn(
            "Use 'BoostedRDNClassifier' instead of 'BoostedRDN'",
            DeprecationWarning,
            # Attribute the warning to the code that instantiated the class,
            # not to this constructor.
            stacklevel=2,
        )
        super().__init__(**kwargs)


class BoostedRDNRegressor(BaseBoostedRelationalModel):
    """Relational Dependency Networks Regressor

    Wrappers around BoostSRL for learning and inference of RDNs for regression task.

    Similar to :class:`sklearn.ensemble.GradientBoostingRegressor`, this builds
    a model by fitting a series of regression trees.

    Examples
    --------

    >>> from srlearn.rdn import BoostedRDNRegressor
    >>> from srlearn import Background
    >>> from srlearn import Database
    >>> train = Database.from_files(
    ...     pos="../datasets/Boston/train/pos.pl",
    ...     neg="../datasets/Boston/train/neg.pl",
    ...     facts="../datasets/Boston/train/facts.pl",
    ...     lazy_load=False,
    ... )
    >>> test = Database.from_files(
    ...     pos="../datasets/Boston/test/pos.pl",
    ...     neg="../datasets/Boston/test/neg.pl",
    ...     facts="../datasets/Boston/test/facts.pl",
    ...     lazy_load=False,
    ... )
    >>> train.modes = ["crim(+id,#varsrim).",
    ...     "zn(+id,#varzn).",
    ...     "indus(+id,#varindus).",
    ...     "chas(+id,#varchas).",
    ...     "nox(+id,#varnox).",
    ...     "rm(+id,#varrm).",
    ...     "age(+id,#varage).",
    ...     "dis(+id,#vardis).",
    ...     "rad(+id,#varrad).",
    ...     "tax(+id,#vartax).",
    ...     "ptratio(+id,#varptrat).",
    ...     "b(+id,#varb).",
    ...     "lstat(+id,#varlstat).",
    ...     "medv(+id)."]
    >>> bk = Background(modes=train.modes)
    >>> reg = BoostedRDNRegressor(background=bk, target="medv", n_estimators=5)
    >>> reg.fit(train)
    BoostedRDNRegressor(background=setParam: numOfClauses=100.
    setParam: numOfCycles=100.
    usePrologVariables: true.
    setParam: nodeSize=2.
    setParam: maxTreeDepth=3.
    mode: crim(+id,#varsrim).
    mode: zn(+id,#varzn).
    mode: indus(+id,#varindus).
    mode: chas(+id,#varchas).
    mode: nox(+id,#varnox).
    mode: rm(+id,#varrm).
    mode: age(+id,#varage).
    mode: dis(+id,#vardis).
    mode: rad(+id,#varrad).
    mode: tax(+id,#vartax).
    mode: ptratio(+id,#varptrat).
    mode: b(+id,#varb).
    mode: lstat(+id,#varlstat).
    mode: medv(+id).
    , n_estimators=5, neg_pos_ratio=2, solver='BoostSRL', target='medv')
    >>> reg.predict(test)   # doctest: +SKIP
    array([10.04313307 13.55804603 20.549378   18.14681934 23.9393469  10.01292162
         29.83298024 20.34668817 27.81642572 32.04067867  9.41342835 20.975001
         19.21966845])

    """

    # pylint: disable=too-many-instance-attributes

    def __init__(
        self,
        background=None,
        target="None",
        n_estimators=10,
        node_size=2,
        max_tree_depth=3,
        neg_pos_ratio=2,
        solver="BoostSRL",
    ):
        """Initialize a BoostedRDNRegressor

        All arguments are forwarded unchanged to the base class initializer.

        Parameters
        ----------
        background : :class:`srlearn.background.Background` (default: None)
            Background knowledge with respect to the database
        target : str (default: "None")
            Target predicate to learn
        n_estimators : int, optional (default: 10)
            Number of trees to fit
        node_size : int, optional (default: 2)
            Maximum number of literals in each node.
        max_tree_depth : int, optional (default: 3)
            Maximum number of nodes from root to leaf (height) in the tree.
        neg_pos_ratio : int or float, optional (default: 2)
            Ratio of negative to positive examples used during learning.
        solver : str, optional (default: "BoostSRL")
            Backend selector. In this class, ``"BoostSRL"`` selects the BoostSRL
            jar and any other value selects the SRLBoost jar.

        Attributes
        ----------
        estimators_ : array, shape (n_estimators)
            Return the boosted regression trees
        feature_importances_ : array, shape (n_features)
            Return the feature importances (based on how often each feature appears)
        pred_ : ndarray
            Values read from column 1 of the results file by the most recent
            :meth:`predict` call (also the return value of that call).
        true_ : ndarray
            Values read from column 2 of the results file by the most recent
            :meth:`predict` call.
        """

        super().__init__(
            background=background,
            target=target,
            n_estimators=n_estimators,
            node_size=node_size,
            max_tree_depth=max_tree_depth,
            neg_pos_ratio=neg_pos_ratio,
            solver=solver,
        )

    @staticmethod
    def _as_database(database):
        """Return ``database`` as a :class:`Database`, converting tuple inputs.

        Tuples are expected to expose ``pos``, ``neg`` and ``facts`` attributes
        (i.e. a namedtuple-like object); anything else is returned untouched.
        """
        if not isinstance(database, tuple):
            return database

        converted = Database()
        converted.pos = database.pos
        converted.neg = database.neg
        converted.facts = database.facts
        return converted

    def _backend_jar(self):
        """Return the path (as ``str``) of the jar matching ``self.solver``."""
        if self.solver == "BoostSRL":
            return str(self.file_system.files.BOOSTSRL_BACKEND)
        return str(self.file_system.files.SRLBOOST_BACKEND)

    def fit(self, database):
        """Learn structure and parameters.

        Fit the structure and parameters of a Relational Dependency Network using a
        database of positive examples, negative examples, facts, and any relevant
        background knowledge.

        Parameters
        ----------
        database : :class:`srlearn.database.Database`
            Database containing examples and facts.

        Returns
        -------
        self : object
            Returns self.

        Notes
        -----

        The underlying algorithm is based on the "Relational Functional Gradient
        Boosting" as described in [1]_ and [2]_.

        This fit function is based on subprocess calling the BoostSRL jar files.
        This will require a Java runtime to also be available. See [3]_.

        .. [1] Sriraam Natarajan, Tushar Khot, Kristian Kersting, and Jude Shavlik,
           "*Boosted Statistical Relational Learners: From Benchmarks to Data-Driven
           Medicine*". SpringerBriefs in Computer Science, ISBN: 978-3-319-13643-1,
           2015
        .. [2] https://starling.utdallas.edu/software/boostsrl/wiki/regression/
        .. [3] https://starling.utdallas.edu/software/boostsrl/
        """

        self._check_params()

        train_dir = self.file_system.files.TRAIN_DIR

        # Write the background, then the data, into the training directory.
        self.background.write(filename="train", location=train_dir)
        database = self._as_database(database)
        database.write(filename="train", location=train_dir)

        # "-reg" switches the backend to regression mode.
        # NOTE(review): the command is a single shell string (it relies on ">"
        # redirection) and its pieces are not quoted, so paths or targets that
        # contain spaces or shell metacharacters will be misinterpreted.
        command = " ".join(
            [
                "java",
                "-jar",
                self._backend_jar(),
                "-reg",
                "-l",
                "-train",
                str(train_dir),
                "-target",
                self.target,
                "-trees",
                str(self.n_estimators),
                "-negPosRatio",
                str(self.neg_pos_ratio),
                ">",
                str(self.file_system.files.TRAIN_LOG),
            ]
        )
        self._call_shell_command(command)

        # Read the learned trees back, one file per boosting iteration.
        trees_dir = self.file_system.files.TREES_DIR
        estimators = []
        for tree_number in range(self.n_estimators):
            tree_file = trees_dir.joinpath(
                "{0}Tree{1}.tree".format(self.target, tree_number)
            )
            with open(tree_file) as _fh:
                estimators.append(_fh.read())

        self._get_dotfiles()
        self.estimators_ = estimators

        return self

    def _run_inference(self, database) -> None:
        """Run inference mode on the BoostSRL Jar files.

        This is a helper method for ``self.predict``. It writes the background
        and data into the test directory and calls the backend jar in
        regression mode, redirecting its output to the test log.
        """

        self._check_initialized()

        test_dir = self.file_system.files.TEST_DIR

        # Write the background, then the data, into the test directory.
        self.background.write(filename="test", location=test_dir)
        database = self._as_database(database)
        database.write(filename="test", location=test_dir)

        # NOTE(review): same unquoted shell string caveat as in ``fit``.
        command = " ".join(
            [
                "java",
                "-jar",
                self._backend_jar(),
                "-reg",
                "-i",
                "-test",
                str(test_dir),
                "-model",
                str(self.file_system.files.MODELS_DIR),
                "-target",
                self.target,
                "-trees",
                str(self.n_estimators),
                "-aucJarPath",
                str(self.file_system.files.AUC_JAR),
                ">",
                str(self.file_system.files.TEST_LOG),
            ]
        )
        self._call_shell_command(command)

    def predict(self, database):
        """Use the learned model to predict values on new data.

        Parameters
        ----------
        database : :class:`srlearn.Database`
            Database containing examples and facts.

        Returns
        -------
        pred : ndarray
            regression value predicted for each example.
        """

        self._run_inference(database)

        # Collect the regression output: a tab-separated file from which
        # column 1 is stored as the prediction and column 2 as the true value.
        results_db = self.file_system.files.TEST_DIR.joinpath(
            "results_" + self.target + ".db"
        )
        predicted, actual = np.loadtxt(
            results_db,
            delimiter="\t",
            usecols=(1, 2),
            unpack=True,
        )

        self.true_ = actual
        self.pred_ = predicted

        return predicted