# Source code for srlearn.boostsrl

# Copyright 2017, 2018, 2019 Alexander L. Hayes

"""
(Deprecated) boostsrl class for training and testing.

.. warning:: This module is deprecated, pending removal in 0.6.0.
    See :class:`srlearn.rdn` instead.
"""

import os
import re
import subprocess
import warnings


# Importing this module emits the deprecation notice below. No category is
# passed to warnings.warn, so it is issued with the default category
# (UserWarning).
warnings.warn(
    "Deprecation Warning: "
    "'srlearn.boostsrl' is deprecated and will be removed in 0.6.0. "
    "'srlearn.rdn' will replace this functionality."
)

# Mode definitions and examples can be verified with regular expressions.
#
# mode_re: an alphanumeric predicate name, then a parenthesised,
# comma-separated argument list in which every argument is prefixed by one of
# "+", "-" or "#", then a closing ")." -- e.g. "friends(+person, -person).".
# Only [a-zA-Z0-9] is accepted in names, so underscores do not match.
mode_re = re.compile(
    r"[a-zA-Z0-9]*\(((\+|\-|\#)[a-zA-Z0-9]*,( )*)*(\+|\-|\#)[a-zA-Z0-9]*\)\."
)
# exam_re: same shape as mode_re, but the arguments are bare alphanumerics
# without a "+", "-" or "#" prefix -- e.g. "friends(harry, ron).".
exam_re = re.compile(r"[a-zA-Z0-9]*\(([a-zA-Z0-9]*,( )*)*[a-zA-Z0-9]*\)\.")


def example_data(example):
    """
    .. deprecated:: 0.5.0
       Use :class:`srlearn.example_data` instead.

    For demo purposes, include some sample data.

    .. code-block:: python

        from srlearn.boostsrl import example_data
        train_pos = example_data('train_pos')
        train_neg = example_data('train_neg')
        train_facts = example_data('train_facts')

    :param example: Name of the sample data set. One of ``'train_pos'``,
        ``'train_neg'``, ``'train_facts'``, ``'test_pos'``, ``'test_neg'``,
        ``'test_facts'`` or ``'background'``.
    :returns: The matching attribute of the package's ``example_data`` module.
    :raises ValueError: If ``example`` is not one of the names listed above.
    """

    print(
        "Deprecation Warning: "
        "'srlearn.boostsrl.example_data' will be removed in 0.6.0. "
        "'srlearn.example_data' will replace this functionality."
    )

    # Public name -> (object in the example_data module, attribute on it).
    # Note that "background" is read from the ``test`` object.
    sample_locations = {
        "train_pos": ("train", "pos"),
        "train_neg": ("train", "neg"),
        "train_facts": ("train", "facts"),
        "test_pos": ("test", "pos"),
        "test_neg": ("test", "neg"),
        "test_facts": ("test", "facts"),
        "background": ("test", "background"),
    }

    # Reject unknown names before touching the package so the caller gets a
    # precise error. TypeError covers unhashable inputs such as lists.
    try:
        group_name, attribute_name = sample_locations[example]
    except (KeyError, TypeError):
        raise ValueError(
            "Attempted to use sample data that does not exist. Got: "
            + repr(example)
        ) from None

    from . import example_data as ex_data

    # Only the requested attribute is resolved, as in the original chain.
    return getattr(getattr(ex_data, group_name), attribute_name)


def call_process(call):
    """
    .. deprecated:: 0.5.0
       Not intended as a public method.

    Run ``call`` through the shell and block until the process has finished.

    The exit status of the command is not inspected; only a failure to start
    or wait for the process is reported.

    :param call: Shell command line to execute.
    :raises Exception: If the process could not be started or waited for.
        The original error is attached as ``__cause__``.
    """
    try:
        # shell=True is needed because ``call`` is a complete shell command
        # line (it may contain ``cd`` and redirections), not an argv list.
        process = subprocess.Popen(call, shell=True)
        # Popen.wait() reaps the child on every platform; os.waitpid() wants
        # a process *handle* rather than a pid on Windows.
        process.wait()
    except Exception as err:
        # ``except Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        raise Exception(
            "Encountered problems while running process: ", call
        ) from err


def inspect_mode_syntax(example):
    """
    .. deprecated:: 0.5.0
       Not intended as a public method.

    Check one mode declaration against the module-level ``mode_re`` pattern.

    :param example: A single mode string, for instance
        ``"friends(+person, -person)."``.
    :raises Exception: If ``mode_re`` does not match anywhere in ``example``.
    """
    # Guard clause: a match anywhere in the string is accepted.
    if mode_re.search(example):
        return

    message_parts = [
        "Error when checking background knowledge; incorrect syntax: ",
        example,
        "\nBackground knowledge should only contain letters and numbers, ",
        "of the form: predicate(+var1, -var2).",
    ]
    raise Exception("".join(message_parts))


def inspect_example_syntax(example):
    """
    .. deprecated:: 0.5.0
       Not intended as a public method.

    Check one example or fact against the module-level ``exam_re`` pattern.

    :param example: A single example string, for instance
        ``"friends(harry, ron)."``.
    :raises Exception: If ``exam_re`` does not match anywhere in ``example``.
    """
    # Guard clause: a match anywhere in the string is accepted.
    if exam_re.search(example):
        return

    prefix = "Error when checking example; incorrect syntax: "
    raise Exception(prefix + example)


def write_to_file(content, path):
    """
    .. deprecated:: 0.5.0
       Not intended as a public method.

    Write every string in ``content`` to ``path``, one per line.

    The file is created or truncated. An empty ``content`` produces an empty
    file.

    :param content: Iterable of strings; a newline is appended to each.
    :param path: Location of the file to write.
    """
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(path, "w") as handle:
        handle.writelines(line + "\n" for line in content)


class modes(object):
    """
    (Deprecated, pending removal in 0.6.0) Background knowledge for BoostSRL.

    Constructing an instance validates the argument types, assembles the
    background-knowledge lines (parameters, modes, bridgers and precomputes)
    and writes them to ``background.txt`` in the current working directory.

    Attributes set on the instance: every keyword parameter (except
    ``background``), ``relevant`` (the ``[name, value]`` pairs that are
    neither ``False`` nor ``None``) and ``background_knowledge`` (the list of
    lines that was written).
    """

    def __init__(
        self,
        background,
        target,
        bridgers=None,
        precomputes=None,
        loadAllLibraries=False,
        useStdLogicVariables=False,
        usePrologVariables=False,
        recursion=False,
        lineSearch=False,
        resampleNegs=False,
        treeDepth=None,
        maxTreeDepth=None,
        nodeSize=None,
        numOfClauses=None,
        numOfCycles=None,
        minLCTrees=None,
        incrLCTrees=None,
    ):
        """
        background: a list of mode strings, each checked by
            ``inspect_mode_syntax`` and written as ``mode: <string>``.
        target: a list of predicate heads that learning/inference will be
            performed on.
        bridgers: optional list of strings written as ``bridger: <string>``.
        precomputes: optional dictionary; each value is written verbatim,
            followed by ``mode: <key>``.
        remaining parameters: booleans / ints that, when set, are written as
            ``setParam: <name>=<value>.`` (``useStdLogicVariables`` and
            ``usePrologVariables`` are written as ``<name>: true.``).

        Raises TypeError when an argument has the wrong type.
        """
        print(
            "Deprecation Warning: "
            "'srlearn.boostsrl' is deprecated and will be removed in 0.6.0. "
            "'srlearn.rdn' will replace this functionality."
        )

        def optional(value, kind):
            # None means "not set" and is always acceptable.
            return value is None or isinstance(value, kind)

        # Force type checking for input validation Issue #5
        # Checked in order; the first failure is reported.
        type_checks = (
            ("background should be a list.", isinstance(background, list)),
            ("target should be a list.", isinstance(target, list)),
            ("bridgers should be a list.", optional(bridgers, list)),
            ("precomputes should be a dictionary.", optional(precomputes, dict)),
            (
                "loadAllLibraries should be boolean.",
                isinstance(loadAllLibraries, bool),
            ),
            (
                "useStdLogicVariables should be boolean.",
                isinstance(useStdLogicVariables, bool),
            ),
            (
                "usePrologVariables should be boolean.",
                isinstance(usePrologVariables, bool),
            ),
            ("recursion should be boolean.", isinstance(recursion, bool)),
            ("lineSearch should be boolean.", isinstance(lineSearch, bool)),
            ("resampleNegs should be boolean.", isinstance(resampleNegs, bool)),
            ("treeDepth should be an int.", optional(treeDepth, int)),
            ("maxTreeDepth should be an int.", optional(maxTreeDepth, int)),
            ("nodeSize should be an int.", optional(nodeSize, int)),
            ("numOfClauses should be an int.", optional(numOfClauses, int)),
            ("numOfCycles should be an int.", optional(numOfCycles, int)),
            ("minLCTrees should be an int.", optional(minLCTrees, int)),
            ("incrLCTrees should be an int.", optional(incrLCTrees, int)),
        )
        for description, passed in type_checks:
            if not passed:
                raise TypeError("Error when checking type: " + description)

        # The assignment order below is significant: ``relevant`` iterates
        # ``self.__dict__``, which fixes the order of the lines written out.
        self.target = target

        self.bridgers = bridgers
        self.precomputes = precomputes

        self.loadAllLibraries = loadAllLibraries
        self.useStdLogicVariables = useStdLogicVariables
        self.usePrologVariables = usePrologVariables
        self.treeDepth = treeDepth
        self.maxTreeDepth = maxTreeDepth
        self.nodeSize = nodeSize
        self.numOfClauses = numOfClauses
        self.numOfCycles = numOfCycles
        self.minLCTrees = minLCTrees
        self.incrLCTrees = incrLCTrees
        self.recursion = recursion
        self.lineSearch = lineSearch
        self.resampleNegs = resampleNegs

        # Many of the arguments in the modes object are optional; keep only
        # the ones that are neither False nor None.
        self.relevant = [
            [attr, value]
            for attr, value in self.__dict__.items()
            if (value is not False) and (value is not None)
        ]

        logic_switches = ("useStdLogicVariables", "usePrologVariables")
        written_separately = ("target", "bridgers", "precomputes")

        background_knowledge = []
        for attr, value in self.relevant:
            if attr in written_separately:
                continue
            # After validation every remaining value is True or an int, so
            # lower-casing only affects booleans ("True" -> "true").
            rendered = str(value).lower()
            if attr in logic_switches:
                background_knowledge.append(attr + ": " + rendered + ".")
            else:
                background_knowledge.append(
                    "setParam: " + attr + "=" + rendered + "."
                )

        for pred in background:
            inspect_mode_syntax(pred)
            background_knowledge.append("mode: " + pred)

        if self.bridgers is not None:
            for bridger in self.bridgers:
                background_knowledge.append("bridger: " + bridger)

        if self.precomputes is not None:
            for precompute in self.precomputes:
                background_knowledge.append(self.precomputes[precompute])
                background_knowledge.append("mode: " + precompute)

        # Write the newly created background_knowledge to a file: background.txt
        self.background_knowledge = background_knowledge
        write_to_file(background_knowledge, "background.txt")


class train(object):
    """
    .. deprecated:: 0.5.0
       Use :class:`srlearn.rdn` instead.

    Constructing an instance checks the examples, writes them under
    ``srlearn/train/`` and runs ``java -jar v1-0.jar`` from inside the
    ``srlearn`` directory, redirecting its output to
    ``srlearn/train_output.txt``.
    """

    # (unit word in the log line, seconds per unit); milliseconds are
    # handled separately because they are divided rather than multiplied.
    _UNIT_SECONDS = (
        ("seconds", 1),
        ("minutes", 60),
        ("hours", 3600),
        ("days", 86400),
    )

    def __init__(
        self,
        background,
        train_pos,
        train_neg,
        train_facts,
        save=False,
        advice=False,
        softm=False,
        alpha=0.5,
        beta=-2,
        trees=10,
    ):
        """
        background: object whose ``target`` attribute lists the targets.
        train_pos, train_neg, train_facts: lists of example strings; each is
            checked with ``inspect_example_syntax`` and written to a file.
        save: accepted but not used here.
        advice, softm, alpha, beta: stored on the instance; they are not
            passed on the command line built here.
        trees: number of trees, passed as ``-trees``.
        """
        print(
            "Deprecation Warning: "
            "'srlearn.boostsrl' is deprecated and will be removed in 0.6.0. "
            "'srlearn.rdn' will replace this functionality."
        )

        self.target = background.target
        self.train_pos = train_pos
        self.train_neg = train_neg
        self.train_facts = train_facts
        self.advice = advice
        self.softm = softm
        self.alpha = alpha
        self.beta = beta
        self.trees = trees

        # Syntax checking for examples in each set (all sets are checked
        # before anything is written).
        for example_set in (self.train_pos, self.train_neg, self.train_facts):
            for example in example_set:
                inspect_example_syntax(example)

        write_to_file(self.train_pos, "srlearn/train/train_pos.txt")
        write_to_file(self.train_neg, "srlearn/train/train_neg.txt")
        write_to_file(self.train_facts, "srlearn/train/train_facts.txt")

        CALL = (
            "(cd srlearn; java -jar v1-0.jar -l -train train/ -target "
            + ",".join(self.target)
            + " -trees "
            + str(self.trees)
            + " > train_output.txt 2>&1)"
        )
        call_process(CALL)

    def tree(self, treenumber, target, image=False):
        """
        Return one of the learned trees.

        treenumber: zero-based index, ``0 <= treenumber < self.trees``.
        target: name of the target the tree was learned for.
        image: when True, return a ``graphviz.Source`` built from the tree's
            ``.dot`` file; otherwise return the text of its ``.tree`` file.

        Raises Exception when ``treenumber`` is out of range.
        """
        # Tree number is between 0 and self.trees - 1. Negative numbers are
        # rejected here instead of failing later on a missing file.
        if treenumber < 0 or treenumber > (self.trees - 1):
            raise Exception("Tried to find a tree that does not exist.")

        if image:
            # Writing this with Jupyter notebooks in mind.
            from graphviz import Source

            tree_file = (
                "srlearn/train/models/bRDNs/dotFiles/WILLTreeFor_"
                + target
                + str(treenumber)
                + ".dot"
            )
            with open(tree_file, "r") as f:
                # The dot source is collapsed onto a single line.
                tree_output = "".join(f.read().splitlines())
            return Source(tree_output)

        tree_file = (
            "srlearn/train/models/bRDNs/Trees/"
            + target
            + "Tree"
            + str(treenumber)
            + ".tree"
        )
        with open(tree_file, "r") as f:
            return f.read()

    def _get_training_time(self):
        """
        Return the "Total learning time" line of ``srlearn/train_output.txt``
        as a list of whitespace-separated tokens (last character removed).

        Raises IndexError when the log contains no such line.
        """
        with open("srlearn/train_output.txt", "r") as f:
            text = f.read()
        matches = re.findall(r"% Total learning time \(\d* trees\):.*", text)
        if not matches:
            # Same exception type as before, now with an explanation.
            raise IndexError(
                "No 'Total learning time' line found in srlearn/train_output.txt"
            )
        # Remove the last character "." from the line and split it on spaces.
        return matches[0][:-1].split()

    def _training_time_to_float(self, splitline):
        """
        Convert the tokens representing training time into a float
        representing total seconds.

        Each unit word present in ``splitline`` contributes the number in the
        token directly before it. Returns 0 when no unit word is present.
        """
        seconds = []
        if "milliseconds" in splitline:
            seconds.append(
                float(splitline[splitline.index("milliseconds") - 1]) / 1000
            )
        for unit, factor in self._UNIT_SECONDS:
            if unit in splitline:
                amount = float(splitline[splitline.index(unit) - 1])
                seconds.append(amount * factor)
        return sum(seconds)

    def traintime(self) -> float:
        """
        Returns a float representing seconds.
        """
        return self._training_time_to_float(self._get_training_time())


class test(object):
    """
    .. deprecated:: 0.5.0
       Use :class:`srlearn.rdn` instead.

    Constructing an instance writes the test examples under
    ``srlearn/test/`` and runs ``java -jar v1-0.jar`` from inside the
    ``srlearn`` directory, redirecting its output to
    ``srlearn/test_output.txt``.
    """

    # Possibly a partial fix to Issue #3: checking for the .aucTemp.txt.lock
    # (this runs once, when the class body is executed).
    if os.path.isfile("srlearn/test/AUC/.aucTemp.txt.lock"):
        print("Found lock file srlearn/test/AUC/.aucTemp.txt.lock, removing it:")
        os.remove("srlearn/test/AUC/.aucTemp.txt.lock")

    def __init__(self, model, test_pos, test_neg, test_facts, trees=10):
        """
        model: object whose ``target`` attribute lists the targets.
        test_pos, test_neg, test_facts: lists of strings written to
            ``srlearn/test/``.
        trees: number of trees, passed as ``-trees``.
        """
        write_to_file(test_pos, "srlearn/test/test_pos.txt")
        write_to_file(test_neg, "srlearn/test/test_neg.txt")
        write_to_file(test_facts, "srlearn/test/test_facts.txt")

        print(
            "Deprecation Warning: "
            "'srlearn.boostsrl' is deprecated and will be removed in 0.6.0. "
            "'srlearn.rdn' will replace this functionality."
        )

        self.target = model.target

        CALL = (
            "(cd srlearn; java -jar v1-0.jar -i -model train/models/ "
            + "-test test/ -target "
            + ",".join(self.target)
            + " -trees "
            + str(trees)
            + " -aucJarPath . > test_output.txt 2>&1)"
        )
        call_process(CALL)

    def summarize_results(self):
        """
        Read the metrics from ``srlearn/test_output.txt``.

        Returns a dictionary with the keys "AUC ROC", "AUC PR", "CLL",
        "Precision", "Recall" and "F1". Values are the strings found after
        the first "=" on each metric line, with spaces, tabs and "%" removed
        and "atthreshold=" replaced by ",".

        The first six matching lines are taken in order of appearance;
        IndexError is raised when fewer than six are present.
        """
        with open("srlearn/test_output.txt", "r") as f:
            text = f.read()
        metric_lines = re.findall(
            r"%   AUC ROC.*|%   AUC PR.*|%   CLL.*|%   Precision.*|%   Recall.*|%   F1.*",
            text,
        )
        cleaned = [
            entry.replace(" ", "")
            .replace("\t", "")
            .replace("%", "")
            .replace("atthreshold=", ",")
            for entry in metric_lines
        ]

        metric_names = ("AUC ROC", "AUC PR", "CLL", "Precision", "Recall", "F1")
        results = {}
        for position, name in enumerate(metric_names):
            entry = cleaned[position]
            results[name] = entry[entry.index("=") + 1 :]
        return results

    def _float_split(self, line):
        """Returns a list where the first item is a string and the second is a float.

        Used when returning inference results.

        Examples
        --------

        >>> test._float_split('target(pred1, pred2, pred3). 0.85691')
        ['target(pred1, pred2, pred3).', 0.85691]"""
        # Split once, from the right, on whitespace: predicate text | number.
        predicate, number = line.rsplit(None, 1)
        return [predicate, float(number)]

    def inference_results(self, target):
        """Converts BoostSRL results into a Python dictionary.

        Reads ``srlearn/test/results_<target>.db`` and maps the predicate
        text of each line to the float that ends the line. Blank lines are
        skipped.
        """
        results_file = "srlearn/test/results_" + target + ".db"
        inference_dict = {}

        with open(results_file, "r") as f:
            for line in f.read().splitlines():
                if not line.strip():
                    # Nothing to parse; previously this raised an error.
                    continue
                key_predicate, value_regression = self._float_split(line)
                inference_dict[key_predicate] = value_regression
        return inference_dict