Design of LLM Methods
Large language models (LLMs) excel at zero-shot tasks involving text. In this tutorial, we demonstrate how to use an LLM as a predictor to label a subset of entities in relational tables, and then train the BRIDGE model on those labels for relational machine learning.
We begin by loading the original dataset, building a graph and selecting nodes by degree for annotation.
import os.path as osp
import random
import torch
import networkx as nx
import dashscope
from langchain_community.llms import Tongyi
from examples.bridge.bridge import build_bridge_model, train_bridge_model
from examples.bridge.utils import data_prepare
from rllm.datasets import TLF2KDataset
from rllm.llm import Predictor
from rllm.llm.llm_module.langchain_llm import LangChainLLM
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load TLF2K from the "data" directory two levels above this script.
path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data")
dataset = TLF2KDataset(cached_dir=path, force_reload=True)
target_table, non_table_embeddings, adj, emb_size = data_prepare(dataset, "tlf2k", device)

# Bring the split masks to the CPU. The dataset's own training mask is read
# but not used: a new one is built from the highest-degree nodes below.
_ = target_table.train_mask.cpu()
val_mask = target_table.val_mask.cpu()
test_mask = target_table.test_mask.cpu()

# Class names are the distinct target values rendered as strings, sorted.
label_names = sorted({str(value) for value in target_table.df[target_table.target_col].tolist()})

# Nodes outside the validation and test splits are eligible for annotation.
select_mask = ~test_mask & ~val_mask

# Annotate at most 100 training nodes and 100 validation nodes.
train_num = min(100, int(select_mask.sum()))
val_num = min(100, int(val_mask.sum()))

# Build an undirected graph from the adjacency entries, then add every row of
# the target table as a node so rows without edges still report degree 0.
G = nx.Graph(adj.coalesce().indices().t().tolist())
target_nodes = list(range(len(target_table.df)))
G.add_nodes_from(target_nodes)

# Keep the eligible nodes with the highest degree. The sort is stable, so
# nodes of equal degree stay in ascending index order.
candidates = [node for node in target_nodes if select_mask[node]]
top_nodes = sorted(candidates, key=G.degree, reverse=True)[:train_num]

train_mask = torch.zeros(len(target_table.df), dtype=torch.bool)
train_mask[torch.tensor(top_nodes, dtype=torch.long)] = True
target_table.train_mask = train_mask

# Shrink the validation split to a random sample of val_num of its nodes.
val_candidates = val_mask.nonzero().flatten()
shuffled = torch.randperm(len(val_candidates))
val_indices = val_candidates[shuffled[:val_num]]
val_mask.fill_(False)
val_mask[val_indices] = True
target_table.val_mask = val_mask

# Rows that will be sent to the LLM for labelling.
mask = train_mask | val_mask
Next, we query the LLM for label predictions.
# One pseudo-label slot per row of the target table; -1 marks rows that are
# not sent to the LLM.
pseudo_labels = -1 * torch.ones(target_table.df.shape[0], dtype=torch.long)

# Rows selected for annotation, with the ground-truth column removed so the
# LLM never sees the answer.
df = target_table.df.loc[mask.cpu().numpy()].drop(columns=[target_table.target_col])

scenario = "Classify the artists into one of the given labels."
labels = ", ".join(label_names)

# Placeholder: replace with your own DashScope key before running.
DASHSCOPE_API_KEY = "your-api-key"
llm = Tongyi(
    dashscope_api_key=DASHSCOPE_API_KEY,
    model_kwargs={"api_key": DASHSCOPE_API_KEY, "model": "qwen-max-2025-01-25"},
    client=dashscope.Generation,
)
predictor = Predictor(llm=LangChainLLM(llm), type="classification")
outputs = predictor(df, scenario=scenario, labels=labels)

# Prepared once so the loop below does not re-lowercase every label or scan
# label_names linearly for each response.
label_to_index = {label: index for index, label in enumerate(label_names)}
lowered_labels = [(label, label.lower()) for label in label_names]

select_pred = []
for output in outputs:
    response = output.lower()
    # Case-insensitive substring match of every label against the response.
    matches = [label for label, lowered in lowered_labels if lowered in response]
    if matches:
        # Prefer the longest match so a label that contains a shorter label
        # is not mistaken for the shorter one.
        matched = max(matches, key=len)
    else:
        # The response names no known label: fall back to a random class.
        matched = random.choice(label_names)
    select_pred.append(label_to_index[matched])

# Pin the dtype: an empty prediction list would otherwise become a float32
# tensor, which does not match the int64 dtype of pseudo_labels.
select_pred = torch.tensor(select_pred, dtype=torch.long)
pseudo_labels[mask] = select_pred
Finally, we use the obtained pseudo-labels to train a BRIDGE model.
# Encode the ground-truth column as class indices. A value missing from
# label_names gets a random class index instead.
label_to_index = {name: position for position, name in enumerate(label_names)}
true_names = target_table.df[target_table.target_col].astype(str).tolist()
real_labels = torch.tensor(
    [
        label_to_index[name] if name in label_to_index else random.randrange(len(label_names))
        for name in true_names
    ]
)

# Start from the true labels, then overwrite the annotated training and
# validation rows with the LLM's pseudo-labels.
y = real_labels.long().to(device)
labeled_mask = (train_mask | val_mask).to(device)
device_pseudo_labels = pseudo_labels.long().to(device)
y[labeled_mask] = device_pseudo_labels[labeled_mask]
target_table.y = y

model = build_bridge_model(target_table.num_classes, target_table.metadata, emb_size)
model = model.to(device)
# NOTE(review): the trailing positional arguments are presumably epochs,
# learning rate and weight decay -- confirm against train_bridge_model.
train_bridge_model(model, target_table, non_table_embeddings, adj, 100, 0.001, 1e-4)