from typing import Any, List, Literal, Optional
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from rllm.llm.prompt.default_prompt import DEFAULT_SCENARIO_EXPLANATION_TMPL
from rllm.llm.prompt.utils import (
generate_sample_description,
get_template_vars
)
from rllm.llm.llm_module.general_llm import LLM
from rllm.llm.prompt.base import BasePromptTemplate
[docs]
class Enhancer:
r"""Enhancer for relational data. Data should be
organized into a :class:`pandas.dataframe` format.
If attribute `type` is 'explanation|embedding', enhancer will explain
them firstly and embedding them into vectors.
Args:
prompt (Optional[:class:`rllm.llm.prompt.base.BasePromptTemplate`]):
The prompt to instruct llm make enhancement.
llm (:class:`rllm.llm.llm_module.general_llm.LLM`):
The llm used for explanation, it is recommended
to be initialized with LangChain. Only useful in explanation step.
llm_embed (:class:`rllm.llm.llm_module.general_llm.LLM`):
The llm used for embedding, it is recommended
to be initialized with LangChain. Only useful in embedding step.
type (Optional[
Literal['explanation|embedding', 'explanation', 'embedding']
]):
Task type, default type is 'explanation|embedding'.
Explanation|Embedding:
.. code-block:: python
import pandas as pd
from langchain_openai import OpenAI, OpenAIEmbeddings
from rllm.llm import LangChainLLM, Enhancer
data = pd.read_csv('data.csv')
scenario = 'Your_task_description'
llm = LangChainLLM(OpenAI(openai_api_key="YOUR_API_KEY"))
llm_embed = LangChainLLM(
OpenAIEmbeddings(openai_api_key="YOUR_API_KEY")
)
enhancer = Enhancer(
llm=llm, llm_embed=llm_embed, type='explanation|embedding'
)
outputs = enhancer(data.head(10), scenario=scenario)
Explanation:
.. code-block:: python
import pandas as pd
from langchain_openai import OpenAI
from rllm.llm import LangChainLLM, Enhancer
data = pd.read_csv('data.csv')
scenario = 'Your_task_description'
llm = LangChainLLM(OpenAI(openai_api_key="YOUR_API_KEY"))
enhancer = Enhancer(llm=llm, type='explanation')
outputs = enhancer(data.head(10), scenario=scenario)
Embedding:
.. code-block:: python
import pandas as pd
from langchain.embeddings import OpenAIEmbeddings
from rllm.llm import LangChainLLM, Enhancer
data = pd.read_csv('data.csv')
scenario = 'Your_task_description'
llm = LangChainLLM(OpenAIEmbeddings(openai_api_key="YOUR_API_KEY"))
enhancer = Enhancer(llm_embed=llm, type='embedding')
# Embedding columns 'text' and 'explanation'
outputs = enhancer(data, cols=['text', 'explanation'])
"""
def __init__(
self,
prompt: Optional['BasePromptTemplate'] = None,
llm: LLM = None,
llm_embed: LLM = None,
type: Optional[
Literal['explanation|embedding', 'explanation', 'embedding']
] = 'explanation|embedding',
) -> None:
# NOTE: Only support `PromptTemplate` so far!
# NOTE: Only support `explanation` so far!
if 'explanation' in type:
self._llm = llm
if 'embedding' in type:
self._llm_embed = llm_embed
assert type in ['explanation|embedding', 'explanation', 'embedding'], \
"type error!"
self.type = type
if 'explanation' in self.type:
from rllm.llm.prompt.base import PromptTemplate
if prompt is None:
function_mapping = {
'sample_description': generate_sample_description
}
self.prompt = PromptTemplate(
DEFAULT_SCENARIO_EXPLANATION_TMPL,
function_mappings=function_mapping
)
else:
self.prompt = prompt
def invoke(
self,
df: pd.DataFrame,
**kwargs,
) -> List[str]:
if 'explanation' in self.type:
# Check if all variables in the prompt are provided.
input_variables = {
**kwargs,
**self.prompt.function_mappings
}.keys()
required_variables = get_template_vars(self.prompt.template)
for var in required_variables:
assert var in input_variables, \
f"Variable '{var}' not found in input variables."
# Make explanation, remember `row` is a default argument.
outputs = []
for index, row in tqdm(df.iterrows(), total=len(df)):
for i in range(3):
try:
result = self._llm.predict(self.prompt, row=row, **kwargs)
break
except Exception as exc:
if i == 2:
raise type(exc)(
f"Failed to generate explanation for row index {index}: {exc}"
) from exc
time.sleep(1.5 * (i + 1))
outputs.append(result)
time.sleep(0.5)
if 'embedding' in self.type:
if 'explanation' in self.type:
inputs = [outputs]
else:
# default target column is 'text'.
cols = kwargs['cols'] if 'cols' in kwargs else ['text']
inputs = [
col.values.tolist() for col_name, col in df[cols].items()
]
outputs = []
for input in inputs:
for i in range(3):
try:
embed = self._llm_embed.embedding(input)
break
except Exception:
if i == 2:
raise
time.sleep(1.5 * (i + 1))
outputs.append(np.array(embed))
time.sleep(0.5)
outputs = outputs[0] if len(outputs) == 1 else outputs
return outputs
def __call__(
self,
df: pd.DataFrame,
**kwargs,
) -> Any:
return self.invoke(df, **kwargs)