Source code for rllm.datasets.bank_marketing

import os
import os.path as osp
from typing import Optional

import pandas as pd

from rllm.types import ColType
from rllm.data.table_data import TableData
from rllm.datasets.dataset import Dataset
from rllm.utils.download import download_url
from rllm.utils.extract import extract_zip


[docs] class BankMarketing(Dataset): r"""The `Bank Marketing dataset <https://archive.ics.uci.edu/dataset/ 222/bank+marketing>`__ is related to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required in order to assess if the product (bank term deposit) would be subscribed. The classification goal is to predict if the client will subscribe to a term deposit. The dataset encompasses a variety of features pertaining to clients and their banking information. The primary objective is to predict whether a client will subscribe to a term deposit. .. Age: Age of the client. .. Job: Type of job (admin, blue-collar, entrepreneur, housemaid, .. management, retired, self-employed, services, student, technician, .. unemployed, unknown). .. Marital: Marital status of the client (divorced, married, single). .. Education: The highest level of education achieved (unknown, secondary, .. primary, tertiary). .. Default: Has credit in default? .. Balance: Average yearly balance, in euros. .. Housing: Has housing loan? .. Loan: Has personal loan? .. Contact: Contact communication type (unknown, telephone, cellular). .. Day: Last contact day of the month. .. Month: Last contact month of the year (jan, feb, mar, apr, may, jun, jul, .. aug, sep, oct, nov, dec). .. Duration: Last contact duration, in seconds. .. Campaign: Number of contacts performed during this campaign and for this .. client. .. Pdays: Number of days that passed by after the client was last contacted .. from a previous campaign. .. Previous: Number of contacts performed before this campaign and for this .. client. .. Poutcome: Outcome of the previous marketing campaign (unknown, other, .. failure, success). .. Target: Has the client subscribed a term deposit? Args: cached_dir (str): Root directory where dataset should be saved. forced_reload (bool): If set to `True`, this dataset will be re-processed again. .. parsed-literal:: Statics: Name Clients Features Size 45211 16 """ url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip" def __init__(self, cached_dir: str, forced_reload: Optional[bool] = False) -> None: self.name = "bank_marketing" root = os.path.join(cached_dir, self.name) super().__init__(root, force_reload=forced_reload) self.data_list = [TableData.load(self.processed_paths[0])] @property def raw_filenames(self): return ["bank-full.csv"] @property def processed_filenames(self): return ["data.pt"]
[docs] def process(self): r""" process data and save to './cached_dir/{dataset}/processed/'. """ os.makedirs(self.processed_dir, exist_ok=True) path = osp.join(self.raw_dir, self.raw_filenames[0]) df = pd.read_csv(path, sep=";", quotechar='"') # Note: the order of column in col_types must # correspond to the order of column in files, # except target column. col_types = { "age": ColType.NUMERICAL, "job": ColType.CATEGORICAL, "marital": ColType.CATEGORICAL, "education": ColType.CATEGORICAL, "default": ColType.CATEGORICAL, "balance": ColType.NUMERICAL, "housing": ColType.CATEGORICAL, "loan": ColType.CATEGORICAL, "contact": ColType.CATEGORICAL, "day": ColType.NUMERICAL, "month": ColType.CATEGORICAL, "duration": ColType.NUMERICAL, "campaign": ColType.NUMERICAL, "pdays": ColType.NUMERICAL, "previous": ColType.NUMERICAL, "poutcome": ColType.CATEGORICAL, "y": ColType.CATEGORICAL, } data = TableData( df=df, col_types=col_types, target_col="y", ) data.save(self.processed_paths[0])
[docs] def download(self): os.makedirs(self.raw_dir, exist_ok=True) download_url(self.url, self.raw_dir, "bank+marketing.zip") extract_zip(osp.join(self.raw_dir, "bank+marketing.zip"), self.raw_dir) extract_zip(osp.join(self.raw_dir, "bank.zip"), self.raw_dir)
def __len__(self): return 1 def __getitem__(self, index: int): if index != 0: raise IndexError return self.data_list[index]