# Copyright © 2017, 2018, 2019 Alexander L. Hayes
"""
Relational Dependency Networks
"""
import re
import warnings
import numpy as np
from .base import BaseBoostedRelationalModel
from .database import Database
warnings.simplefilter("default")
[docs]class BoostedRDNClassifier(BaseBoostedRelationalModel):
"""Relational Dependency Networks Estimator
Wrappers around BoostSRL for learning and inference with Relational Dependency
Networks written with a scikit-learn-style interface derived from
:class:`sklearn.base.BaseEstimator`
Similar to :class:`sklearn.ensemble.GradientBoostingClassifier`, this builds
a model by fitting a series of regression trees.
Examples
--------
>>> from srlearn.rdn import BoostedRDNClassifier
>>> from srlearn import Background
>>> from srlearn.datasets import load_toy_cancer
>>> train, test = load_toy_cancer()
>>> bk = Background(modes=train.modes)
>>> dn = BoostedRDNClassifier(background=bk, target="cancer")
>>> dn.fit(train)
BoostedRDNClassifier(background=setParam: numOfClauses=100.
setParam: numOfCycles=100.
usePrologVariables: true.
setParam: nodeSize=2.
setParam: maxTreeDepth=3.
mode: friends(+Person,-Person).
mode: friends(-Person,+Person).
mode: smokes(+Person).
mode: cancer(+Person).
, n_estimators=10, neg_pos_ratio=2, solver='BoostSRL', target='cancer')
>>> dn.predict(test)
array([ True, True, True, False, False])
"""
# pylint: disable=too-many-instance-attributes
[docs] def __init__(
self,
background=None,
target="None",
n_estimators=10,
node_size=2,
max_tree_depth=3,
neg_pos_ratio=2,
solver=None,
):
"""Initialize a BoostedRDN
Parameters
----------
background : :class:`srlearn.background.Background` (default: None)
Background knowledge with respect to the database
target : str (default: "None")
Target predicate to learn
n_estimators : int, optional (default: 10)
Number of trees to fit
node_size : int, optional (default: 2)
Maximum number of literals in each node.
max_tree_depth : int, optional (default: 3)
Maximum number of nodes from root to leaf (height) in the tree.
neg_pos_ratio : int or float, optional (default: 2)
Ratio of negative to positive examples used during learning.
Attributes
----------
estimators_ : array, shape (n_estimators)
Return the boosted regression trees
feature_importances_ : array, shape (n_features)
Return the feature importances (based on how often each feature appears)
"""
super().__init__(
background=background,
target=target,
n_estimators=n_estimators,
node_size=node_size,
max_tree_depth=max_tree_depth,
neg_pos_ratio=neg_pos_ratio,
solver=solver,
)
[docs] def fit(self, database):
"""Learn structure and parameters.
Fit the structure and parameters of a Relational Dependency Network using a
database of positive examples, negative examples, facts, and any relevant
background knowledge.
Parameters
----------
database : :class:`srlearn.database.Database`
Database containing examples and facts.
Returns
-------
self : object
Returns self.
Notes
-----
The underlying algorithm is based on the "Relational Functional Gradient
Boosting" as described in [1]_.
This fit function is based on subprocess calling the BoostSRL jar files.
This will require a Java runtime to also be available. See [2]_.
.. [1] Sriraam Natarajan, Tushar Khot, Kristian Kersting, and Jude Shavlik,
"*Boosted Statistical Relational Learners: From Benchmarks to Data-Driven
Medicine*". SpringerBriefs in Computer Science, ISBN: 978-3-319-13643-1,
2015
.. [2] https://starling.utdallas.edu/software/boostsrl/
"""
self._check_params()
# Write the background to file.
self.background.write(
filename="train", location=self.file_system.files.TRAIN_DIR
)
if isinstance(database, tuple):
_db = Database()
_db.pos = database.pos
_db.neg = database.neg
_db.facts = database.facts
database = _db
# Write the data to files.
database.write(
filename="train", location=self.file_system.files.TRAIN_DIR
)
if self.solver == "BoostSRL":
_jar = str(self.file_system.files.BOOSTSRL_BACKEND)
else:
_jar = str(self.file_system.files.SRLBOOST_BACKEND)
_CALL = (
"java -jar "
+ _jar
+ " -l -train "
+ str(self.file_system.files.TRAIN_DIR)
+ " -target "
+ self.target
+ " -trees "
+ str(self.n_estimators)
+ " -negPosRatio "
+ str(self.neg_pos_ratio)
+ " > "
+ str(self.file_system.files.TRAIN_LOG)
)
# Call the constructed command.
self._call_shell_command(_CALL)
# Read the trees from files.
_estimators = []
for _tree_number in range(self.n_estimators):
with open(
self.file_system.files.TREES_DIR.joinpath(
"{0}Tree{1}.tree".format(self.target, _tree_number)
)
) as _fh:
_estimators.append(_fh.read())
self._get_dotfiles()
self.estimators_ = _estimators
return self
def _run_inference(self, database) -> None:
"""Run inference mode on the BoostSRL Jar files.
This is a helper method for ``self.predict`` and ``self.predict_proba``
"""
self._check_initialized()
if isinstance(database, tuple):
_db = Database()
_db.pos = database.pos
_db.neg = database.neg
_db.facts = database.facts
database = _db
# Write the background to file.
self.background.write(
filename="test", location=self.file_system.files.TEST_DIR
)
# Write the data to files.
database.write(filename="test", location=self.file_system.files.TEST_DIR)
if self.solver == "BoostSRL":
_jar = str(self.file_system.files.BOOSTSRL_BACKEND)
else:
_jar = str(self.file_system.files.SRLBOOST_BACKEND)
_CALL = (
"java -jar "
+ _jar
+ " -i -test "
+ str(self.file_system.files.TEST_DIR)
+ " -model "
+ str(self.file_system.files.MODELS_DIR)
+ " -target "
+ self.target
+ " -trees "
+ str(self.n_estimators)
+ " -aucJarPath "
+ str(self.file_system.files.AUC_JAR)
+ " > "
+ str(self.file_system.files.TEST_LOG)
)
self._call_shell_command(_CALL)
# Read the threshold
with open(self.file_system.files.TEST_LOG, "r") as _fh:
_threshold = re.findall("% Threshold = \\d*.\\d*", _fh.read())
self.threshold_ = float(_threshold[0].split(" = ")[1])
[docs] def predict(self, database):
"""Use the learned model to predict on new data.
Parameters
----------
database : :class:`srlearn.Database`
Database containing examples and facts.
Returns
-------
results : ndarray
Positive or negative class.
"""
self._run_inference(database)
# Collect the classifications.
_results_db = self.file_system.files.TEST_DIR.joinpath(
"results_" + self.target + ".db"
)
_classes, _results = np.loadtxt(
_results_db,
delimiter=") ",
usecols=(0, 1),
converters={0: lambda s: 0 if s[0] == 33 else 1},
unpack=True,
)
self.classes_ = _classes
_neg = _results[_classes == 0]
_pos = _results[_classes == 1]
_results2 = np.greater(
np.concatenate((_pos, 1 - _neg), axis=0), self.threshold_
)
return _results2
[docs] def predict_proba(self, database):
"""Return class probabilities.
Parameters
----------
database : :class:`srlearn.Database`
Database containing examples and facts.
Returns
-------
results : ndarray
Probability of belonging to the positive class
"""
self._run_inference(database)
_results_db = self.file_system.files.TEST_DIR.joinpath(
"results_" + self.target + ".db"
)
_classes, _results = np.loadtxt(
_results_db,
delimiter=") ",
usecols=(0, 1),
converters={0: lambda s: 0 if s[0] == 33 else 1},
unpack=True,
)
_neg = _results[_classes == 0]
_pos = _results[_classes == 1]
_results2 = np.concatenate((_pos, 1 - _neg), axis=0)
self.classes_ = _classes
return _results2
class BoostedRDN(BoostedRDNClassifier):
def __init__(self, **args):
warnings.warn(
"Use 'BoostedRDNClassifier' instead of 'BoostedRDN'",
DeprecationWarning,
)
super().__init__(**args)
[docs]class BoostedRDNRegressor(BaseBoostedRelationalModel):
"""Relational Dependency Networks Regressor
Wrappers around BoostSRL for learning and inference of RDNs for regression task.
Similar to :class:`sklearn.ensemble.GradientBoostingRegressor`, this builds
a model by fitting a series of regression trees.
Examples
--------
>>> from srlearn.rdn import BoostedRDNRegressor
>>> from srlearn import Background
>>> from srlearn import Database
>>> train = Database.from_files(
... pos="../datasets/Boston/train/pos.pl",
... neg="../datasets/Boston/train/neg.pl",
... facts="../datasets/Boston/train/facts.pl",
... lazy_load=False,
... )
>>> test = Database.from_files(
... pos="../datasets/Boston/test/pos.pl",
... neg="../datasets/Boston/test/neg.pl",
... facts="../datasets/Boston/test/facts.pl",
... lazy_load=False,
... )
>>> train.modes = ["crim(+id,#varsrim).",
... "zn(+id,#varzn).",
... "indus(+id,#varindus).",
... "chas(+id,#varchas).",
... "nox(+id,#varnox).",
... "rm(+id,#varrm).",
... "age(+id,#varage).",
... "dis(+id,#vardis).",
... "rad(+id,#varrad).",
... "tax(+id,#vartax).",
... "ptratio(+id,#varptrat).",
... "b(+id,#varb).",
... "lstat(+id,#varlstat).",
... "medv(+id)."]
>>> bk = Background(modes=train.modes)
>>> reg = BoostedRDNRegressor(background=bk, target="medv", n_estimators=5)
>>> reg.fit(train)
BoostedRDNRegressor(background=setParam: numOfClauses=100.
setParam: numOfCycles=100.
usePrologVariables: true.
setParam: nodeSize=2.
setParam: maxTreeDepth=3.
mode: crim(+id,#varsrim).
mode: zn(+id,#varzn).
mode: indus(+id,#varindus).
mode: chas(+id,#varchas).
mode: nox(+id,#varnox).
mode: rm(+id,#varrm).
mode: age(+id,#varage).
mode: dis(+id,#vardis).
mode: rad(+id,#varrad).
mode: tax(+id,#vartax).
mode: ptratio(+id,#varptrat).
mode: b(+id,#varb).
mode: lstat(+id,#varlstat).
mode: medv(+id).
, n_estimators=5, neg_pos_ratio=2, solver='BoostSRL', target='medv')
>>> reg.predict(test) # doctest: +SKIP
array([10.04313307 13.55804603 20.549378 18.14681934 23.9393469 10.01292162
29.83298024 20.34668817 27.81642572 32.04067867 9.41342835 20.975001
19.21966845])
"""
# pylint: disable=too-many-instance-attributes
[docs] def __init__(
self,
background=None,
target="None",
n_estimators=10,
node_size=2,
max_tree_depth=3,
neg_pos_ratio=2,
solver="BoostSRL",
):
"""Initialize a BoostedRDN
Parameters
----------
background : :class:`srlearn.background.Background` (default: None)
Background knowledge with respect to the database
target : str (default: "None")
Target predicate to learn
n_estimators : int, optional (default: 10)
Number of trees to fit
node_size : int, optional (default: 2)
Maximum number of literals in each node.
max_tree_depth : int, optional (default: 3)
Maximum number of nodes from root to leaf (height) in the tree.
neg_pos_ratio : int or float, optional (default: 2)
Ratio of negative to positive examples used during learning.
Attributes
----------
estimators_ : array, shape (n_estimators)
Return the boosted regression trees
feature_importances_ : array, shape (n_features)
Return the feature importances (based on how often each feature appears)
"""
super().__init__(
background=background,
target=target,
n_estimators=n_estimators,
node_size=node_size,
max_tree_depth=max_tree_depth,
neg_pos_ratio=neg_pos_ratio,
solver=solver,
)
[docs] def fit(self, database):
"""Learn structure and parameters.
Fit the structure and parameters of a Relational Dependency Network using a
database of positive examples, negative examples, facts, and any relevant
background knowledge.
Parameters
----------
database : :class:`srlearn.database.Database`
Database containing examples and facts.
Returns
-------
self : object
Returns self.
Notes
-----
The underlying algorithm is based on the "Relational Functional Gradient
Boosting" as described in [1]_ and [2]_.
This fit function is based on subprocess calling the BoostSRL jar files.
This will require a Java runtime to also be available. See [3]_.
.. [1] Sriraam Natarajan, Tushar Khot, Kristian Kersting, and Jude Shavlik,
"*Boosted Statistical Relational Learners: From Benchmarks to Data-Driven
Medicine*". SpringerBriefs in Computer Science, ISBN: 978-3-319-13643-1,
2015
-- [2] https://starling.utdallas.edu/software/boostsrl/wiki/regression/
.. [3] https://starling.utdallas.edu/software/boostsrl/
"""
self._check_params()
# Write the background to file.
self.background.write(
filename="train", location=self.file_system.files.TRAIN_DIR
)
if isinstance(database, tuple):
_db = Database()
_db.pos = database.pos
_db.neg = database.neg
_db.facts = database.facts
database = _db
# Write the data to files.
database.write(
filename="train", location=self.file_system.files.TRAIN_DIR
)
if self.solver == "BoostSRL":
_jar = str(self.file_system.files.BOOSTSRL_BACKEND)
else:
_jar = str(self.file_system.files.SRLBOOST_BACKEND)
_CALL = (
"java -jar "
+ _jar
+ " -reg -l -train "
+ str(self.file_system.files.TRAIN_DIR)
+ " -target "
+ self.target
+ " -trees "
+ str(self.n_estimators)
+ " -negPosRatio "
+ str(self.neg_pos_ratio)
+ " > "
+ str(self.file_system.files.TRAIN_LOG)
)
# Call the constructed command.
self._call_shell_command(_CALL)
# Read the trees from files.
_estimators = []
for _tree_number in range(self.n_estimators):
with open(
self.file_system.files.TREES_DIR.joinpath(
"{0}Tree{1}.tree".format(self.target, _tree_number)
)
) as _fh:
_estimators.append(_fh.read())
self._get_dotfiles()
self.estimators_ = _estimators
return self
def _run_inference(self, database) -> None:
"""Run inference mode on the BoostSRL Jar files.
This is a helper method for ``self.predict`` and ``self.predict_proba``
"""
self._check_initialized()
# Write the background to file.
self.background.write(
filename="test", location=self.file_system.files.TEST_DIR
)
if isinstance(database, tuple):
_db = Database()
_db.pos = database.pos
_db.neg = database.neg
_db.facts = database.facts
database = _db
# Write the data to files.
database.write(filename="test", location=self.file_system.files.TEST_DIR)
if self.solver == "BoostSRL":
_jar = str(self.file_system.files.BOOSTSRL_BACKEND)
else:
_jar = str(self.file_system.files.SRLBOOST_BACKEND)
_CALL = (
"java -jar "
+ _jar
+ " -reg -i -test "
+ str(self.file_system.files.TEST_DIR)
+ " -model "
+ str(self.file_system.files.MODELS_DIR)
+ " -target "
+ self.target
+ " -trees "
+ str(self.n_estimators)
+ " -aucJarPath "
+ str(self.file_system.files.AUC_JAR)
+ " > "
+ str(self.file_system.files.TEST_LOG)
)
self._call_shell_command(_CALL)
[docs] def predict(self, database):
"""Use the learned model to predict values on new data.
Parameters
----------
database : :class:`srlearn.Database`
Database containing examples and facts.
Returns
-------
pred : ndarray
regression value predicted for each example.
"""
self._run_inference(database)
# Collect the classifications.
_results_db = self.file_system.files.TEST_DIR.joinpath(
"results_" + self.target + ".db"
)
_pred, _true = np.loadtxt(
_results_db,
delimiter="\t",
usecols=(1, 2),
unpack=True,
)
self.true_ = _true
self.pred_ = _pred
return _pred