Source code for rllm.llm.llm_module.featllm.feat_llm

import copy
from typing import Union, List, Any

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch import Tensor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
class LC:
    from langchain_core.language_models import BaseLLM
    from langchain.chat_models.base import BaseChatModel


from .feat_engineer import FeatLLMEngineer


class simple_model(nn.Module):

    """
    A simple neural network model that learns weights for each feature set.

    Attributes:
        weights (nn.ParameterList): A list of trainable parameters, where each parameter corresponds to the weights of a feature set.
    """
    def __init__(self, X):
        """
        Initializes the simple_model with trainable weights for each feature set.

        Args:
            X (List[Tensor]): A list of feature tensors, where each tensor represents a feature set.
        """
        super(simple_model, self).__init__()
        self.weights = nn.ParameterList(
            [
                nn.Parameter(torch.ones(x_each.shape[1], 1) / x_each.shape[1])
                for x_each in X
            ]
        )

    def forward(self, x):
        """
        Forward pass of the model.

        Args:
            x (List[Tensor]): A list of input feature tensors.

        Returns:
            Tensor: Concatenated scores for all feature sets.
        """
        x_total_score = []
        for idx, x_each in enumerate(x):
            x_score = x_each @ torch.clamp(self.weights[idx], min=0)
            x_total_score.append(x_score)
        x_total_score = torch.cat(x_total_score, dim=-1)
        return x_total_score



[docs]
class FeatLLM:
    """
    A class that integrates feature engineering and LLM-based tasks.

    Attributes:
        feat_engineer (FeatLLMEngineer): An instance of the feature engineering class.
        kwargs (dict): Additional arguments for customization.
    """
    def __init__(
        self,
        file_path: str,
        metadata_path: str,
        task_info_path: str,
        llm: Union[LC.BaseChatModel, LC.BaseLLM] = None,
        **kwargs,
    ) -> None:
        """
        Initializes the FeatLLM class with paths and an optional LLM.

        Args:
            file_path (str): Path to the input data file.
            metadata_path (str): Path to the metadata file.
            task_info_path (str): Path to the task information file.
            llm (Union[LC.BaseChatModel, LC.BaseLLM], optional): An optional LLM instance.
            **kwargs: Additional arguments for customization.
        """
        self.kwargs = kwargs
        self.feat_engineer = FeatLLMEngineer(
            file_path=file_path,
            metadata_path=metadata_path,
            task_info_path=task_info_path,
            llm=llm,
            **kwargs,
        )


[docs]
    def invoke(self):
        """
        Executes the feature engineering and model training pipeline.

        This method performs the following steps:
        1. Generates features using the feature engineer.
        2. Trains a model for each executable feature set.
        3. Evaluates the model and computes AUC scores.
        4. Ensembles the results and computes the final AUC score.
        """
        (
            executable_list,
            label_list,
            X_train_all_dict,
            X_test_all_dict,
            y_train,
            y_test,
        ) = self.feat_engineer()

        test_outputs_all = []
        multiclass = True if len(label_list) > 2 else False
        for i in executable_list:
            X_train_now = list(X_train_all_dict[i].values())
            X_test_now = list(X_test_all_dict[i].values())

            # Train
            trained_model = self._train(
                X_train_now, label_list, self.kwargs.get("shots", 4), y_train
            )

            # Evaluate
            test_outputs = trained_model(X_test_now).detach().cpu()
            test_outputs = F.softmax(test_outputs, dim=1).detach()
            result_auc = self._evaluate(
                test_outputs.numpy(), y_test.numpy(), multiclass=multiclass
            )
            print("AUC:", result_auc)
            test_outputs_all.append(test_outputs)
        test_outputs_all = np.stack(test_outputs_all, axis=0)
        ensembled_probs = test_outputs_all.mean(0)
        result_auc = self._evaluate(
            ensembled_probs, y_test.numpy(), multiclass=multiclass
        )
        print("Ensembled AUC:", result_auc)
        pass


    def _train(
        self,
        X_train_now: List[Tensor],
        label_list: List,
        shot: int,
        y_train: Tensor,
    ):
        """
        Trains the model using the provided training data.

        Args:
            X_train_now (List[Tensor]): A list of training feature tensors.
            label_list (List): A list of class labels.
            shot (int): The number of training examples per class.
            y_train (Tensor): The ground truth labels for the training data.

        Returns:
            simple_model: The trained model.
        """
        criterion = nn.CrossEntropyLoss()
        if shot // len(label_list) == 1:
            model = simple_model(X_train_now)
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
            for _ in range(200):
                optimizer.zero_grad()
                outputs = model(X_train_now)
                preds = outputs.argmax(dim=1)
                acc = (y_train == preds).sum() / len(preds)
                if acc == 1:
                    break
                loss = criterion(outputs, y_train)
                loss.backward()
                optimizer.step()
        else:
            # K-fold cross-validation.
            if shot // len(label_list) <= 2:
                n_splits = 2
            else:
                n_splits = 4

            kfold = StratifiedKFold(n_splits=n_splits, shuffle=True)
            model_list = []
            for _, (train_ids, valid_ids) in enumerate(
                kfold.split(X_train_now[0], y_train)
            ):
                model = simple_model(X_train_now)
                optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
                X_train_now_fold = [
                    x_train_now[train_ids] for x_train_now in X_train_now
                ]
                X_valid_now_fold = [
                    x_train_now[valid_ids] for x_train_now in X_train_now
                ]
                y_train_fold = y_train[train_ids]
                y_valid_fold = y_train[valid_ids]

                max_acc = -1
                for _ in range(200):
                    optimizer.zero_grad()
                    outputs = model(X_train_now_fold)
                    loss = criterion(outputs, y_train_fold)
                    loss.backward()
                    optimizer.step()

                    valid_outputs = model(X_valid_now_fold)
                    preds = valid_outputs.argmax(dim=1)
                    acc = (y_valid_fold == preds).sum() / len(preds)
                    if max_acc < acc:
                        max_acc = acc
                        final_model = copy.deepcopy(model)
                        if max_acc >= 1:
                            break
                model_list.append(final_model)

            sdict = model_list[0].state_dict()
            for key in sdict:
                sdict[key] = torch.stack(
                    [model.state_dict()[key] for model in model_list], dim=0
                ).mean(dim=0)

            model = simple_model(X_train_now)
            model.load_state_dict(sdict)
        return model

    def _evaluate(self, pred_probs, answers, multiclass=False):
        """
        Evaluates the model using AUC score.

        Args:
            pred_probs (ndarray): Predicted probabilities for each class.
            answers (ndarray): Ground truth labels.
            multiclass (bool, optional): Whether the task is multiclass. Defaults to False.

        Returns:
            float: The computed AUC score.
        """
        if multiclass == False:
            result_auc = roc_auc_score(answers, pred_probs[:, 1])
        else:
            result_auc = roc_auc_score(
                answers, pred_probs, multi_class="ovr", average="macro"
            )
        return result_auc
    
    def __call__(self, *args: Any, **kwargs: Any):
        """
        Allows the FeatLLM instance to be called like a function.

        Args:
            *args (Any): Positional arguments.
            **kwargs (Any): Keyword arguments.
        """
        self.invoke()



if __name__ == "__main__":
    """
    Example usage of the FeatLLM class with a specified LLM and file paths.
    """
    API_KEY = "<Your API KEY>"
    API_URL = "<Your API URL>"
    # Example usage
    from langchain_openai import ChatOpenAI
    llm = ChatOpenAI(
        model_name="Your Model Name",
        openai_api_base=API_URL,
        openai_api_key=API_KEY
    )
    featllm = FeatLLM(
        file_path="Your File Path",
        metadata_path="Your Metadata Path",
        task_info_path="Your Task Info Path",
        llm=llm,
        # For example:
        # file_path="./adult.csv",
        # metadata_path="./adult-metadata.json",
        # task_info_path="./adult-task.txt",
        query_num=1,
    )
    featllm.invoke()