{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bafac90c-a1fd-4a6e-ba0f-fc5c9b2f0483",
   "metadata": {},
   "outputs": [],
   "source": [
    "from PTLF.lab import lab_setup\n",
    "lab_setup(settings_path='path/to/your/project/dir/project_name.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3597210-b1aa-4b8a-8056-c3cfc29678c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from PTLF.utils import *"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fee6bae-8c2e-4090-a057-84414c4376f8",
   "metadata": {},
   "source": [
    "## Simple"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30ea8e01-4da4-4009-8ceb-6d919c663ed0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import torch\n",
    "\n",
    "\n",
    "class DS01(DataSet):\n",
    "    \"\"\"Tabular dataset backed by a CSV file whose last column is the label.\n",
    "\n",
    "    Each item is ``([features], [label])``: the row is converted to one\n",
    "    float32 tensor, its final element becomes the label and the remaining\n",
    "    elements the features.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        # Argument template; `data_src` is the CSV path consumed by `_setup`.\n",
    "        self.args = {\"data_src\": None}\n",
    "\n",
    "    def _setup(self, args):\n",
    "        \"\"\"Read the CSV at ``args['data_src']`` into ``self.df``.\"\"\"\n",
    "        self.df = pd.read_csv(args['data_src'])\n",
    "\n",
    "    def __len__(self):\n",
    "        \"\"\"Return the number of rows in the loaded frame.\"\"\"\n",
    "        return len(self.df)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        \"\"\"Return row ``idx`` as ``([features], [label])`` float32 tensors.\"\"\"\n",
    "        row = self.df.iloc[idx, :].values\n",
    "        row = torch.tensor(row, dtype=torch.float32)  # whole row as float32\n",
    "        label = row[-1]   # last column is the target\n",
    "        data = row[:-1]   # everything before it is the feature vector\n",
    "        return [data], [label]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "7e1bc229-db7a-48b4-af0f-0cec13799bfd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.utils.data import Dataset,DataLoader\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "51b0a928-bd7c-4b45-a269-219080747b27",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\stdML\\GitDesk\\PTLF2\\src\\PTLF\\utils.py:94: UserWarning: DS01 component is not saved. Make sure to save it in an appropriate location beforeinitiating an experiment, test, or report.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "dsargs = {\n",
    "        'loc':'DS01',\n",
    "        'args':{\n",
    "            \"data_src\": r\"D:\\stdML\\Py310\\Adult\\Prepared\\raw2\\test.csv\"\n",
    "        }\n",
    "}\n",
    "ds = load_component(**dsargs, setup=True)\n",
    "Dloader = DataLoader(ds,batch_size=32)\n",
    "dT = iter(Dloader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "bcb2168b-3c5a-45ae-a64d-68d0d128e682",
   "metadata": {},
   "outputs": [],
   "source": [
    "dt = next(dT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "24ff7ca7-a248-4115-8ed0-e4c26b4c5dba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[tensor([0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,\n",
       "         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dt[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "cdb66a9d-d4db-436a-ad3a-0d18c24b02d9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([32, 14])"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dt[0][0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "afbd10f7-77da-4703-a112-48d89f21557a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7beec04b-05c3-430f-bea1-49115357d8f2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\stdML\\GitDesk\\PTLF2\\src\\PTLF\\utils.py:94: UserWarning: SimpleNN component is not saved. Make sure to save it in an appropriate location beforeinitiating an experiment, test, or report.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "==========================================================================================\n",
       "Layer (type:depth-idx)                   Output Shape              Param #\n",
       "==========================================================================================\n",
       "SimpleNN                                 [32, 1]                   --\n",
       "├─Sequential: 1-1                        [32, 200]                 --\n",
       "│    └─Linear: 2-1                       [32, 120]                 1,800\n",
       "│    └─ReLU: 2-2                         [32, 120]                 --\n",
       "│    └─Linear: 2-3                       [32, 100]                 12,100\n",
       "│    └─ReLU: 2-4                         [32, 100]                 --\n",
       "│    └─Linear: 2-5                       [32, 200]                 20,200\n",
       "│    └─ReLU: 2-6                         [32, 200]                 --\n",
       "│    └─Linear: 2-7                       [32, 200]                 40,200\n",
       "│    └─ReLU: 2-8                         [32, 200]                 --\n",
       "├─Dropout: 1-2                           [32, 200]                 --\n",
       "├─Linear: 1-3                            [32, 1]                   201\n",
       "==========================================================================================\n",
       "Total params: 74,501\n",
       "Trainable params: 74,501\n",
       "Non-trainable params: 0\n",
       "Total mult-adds (M): 2.38\n",
       "==========================================================================================\n",
       "Input size (MB): 0.00\n",
       "Forward/backward pass size (MB): 0.16\n",
       "Params size (MB): 0.30\n",
       "Estimated Total Size (MB): 0.46\n",
       "=========================================================================================="
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch.nn as nn\n",
    "from torch.nn import functional as F\n",
    "from torchinfo import summary\n",
    "\n",
    "class SimpleNN(Model):\n",
    "    \"\"\"Fully connected network over 14 input features with a single-logit head.\n",
    "\n",
    "    Four Linear+ReLU stages are followed by dropout and a final Linear layer.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        # Argument template: two hidden widths and the dropout probability.\n",
    "        self.args = dict.fromkeys((\"h1_dim\", \"h2_dim\", \"drop\"))\n",
    "\n",
    "    def _setup(self, args):\n",
    "        \"\"\"Build the layers from ``args['h1_dim']``, ``args['h2_dim']`` and ``args['drop']``.\"\"\"\n",
    "        first = args['h1_dim']\n",
    "        second = args['h2_dim']\n",
    "        p_drop = args['drop']\n",
    "        wide = second * 2\n",
    "\n",
    "        # Consecutive width pairs give the in/out sizes of each Linear stage.\n",
    "        widths = [14, first, second, wide, wide]\n",
    "        stages = []\n",
    "        for fan_in, fan_out in zip(widths[:-1], widths[1:]):\n",
    "            stages.append(nn.Linear(fan_in, fan_out))\n",
    "            stages.append(nn.ReLU())\n",
    "        self.seq = nn.Sequential(*stages)\n",
    "\n",
    "        self.dropout = nn.Dropout(p=p_drop)\n",
    "        self.final = nn.Linear(wide, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Run ``x`` through the hidden stack, dropout and the output layer.\"\"\"\n",
    "        hidden = self.seq(x)\n",
    "        return self.final(self.dropout(hidden))\n",
    "\n",
    "model = load_component(loc='SimpleNN', args={\"h1_dim\":120, \"h2_dim\":100, 'drop':0.3}, setup=True)\n",
    "# model.to('cuda')\n",
    "summary(model=model, input_data=dt[0])\n",
    "# model(*dt[0]).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "fc17ea54-bbbb-484c-835c-fc674747baa0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[ -220.3309],\n",
       "        [-3426.8130],\n",
       "        [-1660.9607],\n",
       "        [ -948.7296],\n",
       "        [-1603.6975],\n",
       "        [ -275.6187],\n",
       "        [-1397.0565],\n",
       "        [-1364.8336],\n",
       "        [-1171.5076],\n",
       "        [-1769.9922],\n",
       "        [ -481.8759],\n",
       "        [-1443.5280],\n",
       "        [ -222.4868],\n",
       "        [ -874.3320],\n",
       "        [ -927.0721],\n",
       "        [ -677.9394],\n",
       "        [ -546.9390],\n",
       "        [ -435.9732],\n",
       "        [ -694.5280],\n",
       "        [ -785.2214],\n",
       "        [-1834.9946],\n",
       "        [ -287.0668],\n",
       "        [-2317.0957],\n",
       "        [-1651.9097],\n",
       "        [ -170.4194],\n",
       "        [ -214.1748],\n",
       "        [-5181.8262],\n",
       "        [ -711.2175],\n",
       "        [-1923.5010],\n",
       "        [ -833.2474],\n",
       "        [ -293.7393],\n",
       "        [ -859.3155]], grad_fn=<AddmmBackward0>)"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model(*dt[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cac7858a-efda-48f1-a2e9-85e187409564",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Optimizer\n",
    "import torch.optim as optim\n",
    "\n",
    "class OptAdam(Optimizer):\n",
    "    \"\"\"Optimizer component that delegates to ``torch.optim.Adam``.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "\n",
    "    def _setup(self, args):\n",
    "        \"\"\"Create the wrapped Adam instance.\n",
    "\n",
    "        Reads ``args['learning_rate']`` (optional, defaults to 0.001) and\n",
    "        ``args['model_parameters']`` (required).\n",
    "        \"\"\"\n",
    "        lr = args.get('learning_rate', 0.001)\n",
    "        params = args['model_parameters']\n",
    "        self.optimizer = optim.Adam(params, lr=lr)\n",
    "\n",
    "    def step(self, **kwargs):\n",
    "        \"\"\"Apply one update; keyword arguments are accepted but not used.\"\"\"\n",
    "        self.optimizer.step()\n",
    "\n",
    "    def zero_grad(self):\n",
    "        \"\"\"Reset the gradients tracked by the wrapped optimizer.\"\"\"\n",
    "        self.optimizer.zero_grad()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b35faed-5da0-450b-a87f-8572b233a712",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Metrics\n",
    "import torch\n",
    "from PyTorchLabFlow.utils import Metric\n",
    "from torchmetrics.classification import BinaryAccuracy\n",
    "\n",
    "class BinAcc(Metric):\n",
    "    \"\"\"Binary accuracy metric backed by ``torchmetrics`` ``BinaryAccuracy``.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        self.accuracy = BinaryAccuracy()\n",
    "\n",
    "    def setup(self, args):\n",
    "        \"\"\"Re-create the metric when ``check_args(args)`` passes.\n",
    "\n",
    "        Returns ``self`` on success and ``None`` otherwise.\n",
    "        \"\"\"\n",
    "        if not self.check_args(args):\n",
    "            return None\n",
    "        self.accuracy = BinaryAccuracy()\n",
    "        return self\n",
    "\n",
    "    def forward(self, y_pred, y_true):\n",
    "        \"\"\"Return the accuracy of ``y_pred`` against ``y_true[0]`` as a Python float.\"\"\"\n",
    "        labels = y_true[0]  # targets arrive wrapped in a sequence\n",
    "        preds = y_pred.view_as(labels)\n",
    "        return self.accuracy(preds, labels).item()\n",
    "\n",
    "\n",
    "import torch.nn as nn\n",
    "from sklearn.metrics import roc_auc_score\n",
    "class AUROC(Metric):\n",
    "    \"\"\"Area under the ROC curve computed with scikit-learn from raw model outputs.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "\n",
    "    def setup(self, args):\n",
    "        \"\"\"Return ``self`` when ``check_args(args)`` passes, otherwise ``None``.\"\"\"\n",
    "        if not self.check_args(args):\n",
    "            return None\n",
    "        # NOTE(review): this attribute is not read by `forward` in this class.\n",
    "        self.accuracy = BinaryAccuracy()\n",
    "        return self\n",
    "\n",
    "    def forward(self, outputs, targets):\n",
    "        \"\"\"Score ``outputs`` (raw logits) against ``targets[0]``.\n",
    "\n",
    "        A single output column is treated as a binary logit (sigmoid); several\n",
    "        columns are treated as class logits (softmax, macro one-vs-rest).\n",
    "        \"\"\"\n",
    "        labels = targets[0]  # targets arrive wrapped in a sequence\n",
    "        if outputs.size(1) != 1:\n",
    "            # Multi-class: class probabilities scored one-vs-rest.\n",
    "            scores = torch.softmax(outputs, dim=1).detach().cpu().numpy()\n",
    "            return roc_auc_score(\n",
    "                labels.detach().cpu().numpy(), scores, average='macro', multi_class='ovr'\n",
    "            )\n",
    "        # Binary: one logit per sample.\n",
    "        scores = torch.sigmoid(outputs).detach().cpu().numpy()\n",
    "        return roc_auc_score(labels.detach().cpu().numpy(), scores)\n",
    "\n",
    "from sklearn.metrics import average_precision_score\n",
    "\n",
    "class AUPRC(Metric):\n",
    "    \"\"\"Area under the precision-recall curve (average precision) via scikit-learn.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "\n",
    "    def setup(self, args):\n",
    "        \"\"\"Return ``self`` when ``check_args(args)`` passes, otherwise ``None``.\"\"\"\n",
    "        if self.check_args(args):\n",
    "            # NOTE(review): this attribute is not read by `forward` in this class.\n",
    "            self.accuracy = BinaryAccuracy()\n",
    "            return self\n",
    "\n",
    "    def forward(self, outputs, targets):\n",
    "        \"\"\"Score ``outputs`` (raw logits) against ``targets[0]``.\n",
    "\n",
    "        A single output column is treated as a binary logit (sigmoid); several\n",
    "        columns are treated as class logits (softmax) and macro-averaged over\n",
    "        classes.\n",
    "        \"\"\"\n",
    "        targets = targets[0]  # targets arrive wrapped in a sequence\n",
    "        if outputs.size(1) == 1:\n",
    "            # Binary classification: one logit per sample.\n",
    "            probabilities = torch.sigmoid(outputs).detach().cpu().numpy()\n",
    "            targets = targets.detach().cpu().numpy()\n",
    "            auprc = average_precision_score(targets, probabilities)\n",
    "        else:\n",
    "            # Multi-class classification: one probability column per class.\n",
    "            probabilities = torch.softmax(outputs, dim=1).detach().cpu().numpy()\n",
    "            # average_precision_score takes no `multi_class` argument (passing it\n",
    "            # raises TypeError). Supplying one-hot targets makes it score every\n",
    "            # class column separately and macro-average the results.\n",
    "            one_hot = torch.nn.functional.one_hot(\n",
    "                targets.detach().reshape(-1).long(), num_classes=outputs.size(1)\n",
    "            ).cpu().numpy()\n",
    "            auprc = average_precision_score(one_hot, probabilities, average='macro')\n",
    "\n",
    "        return auprc\n",
    "\n",
    "from sklearn.metrics import f1_score\n",
    "class F1Score(Metric):\n",
    "    \"\"\"F1 score computed with scikit-learn from raw model outputs.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "\n",
    "    def setup(self, args):\n",
    "        \"\"\"Return ``self`` when ``check_args(args)`` passes, otherwise ``None``.\"\"\"\n",
    "        if not self.check_args(args):\n",
    "            return None\n",
    "        # NOTE(review): this attribute is not read by `forward` in this class.\n",
    "        self.accuracy = BinaryAccuracy()\n",
    "        return self\n",
    "\n",
    "    def forward(self, outputs, targets):\n",
    "        \"\"\"Turn ``outputs`` (raw logits) into hard predictions and score them.\n",
    "\n",
    "        A single output column is thresholded at probability 0.5 after a\n",
    "        sigmoid; several columns use the arg-max class and a macro average.\n",
    "        \"\"\"\n",
    "        labels = targets[0]  # targets arrive wrapped in a sequence\n",
    "        if outputs.size(1) != 1:\n",
    "            # Multi-class: pick the most probable class per sample.\n",
    "            scores = torch.softmax(outputs, dim=1).detach().cpu().numpy()\n",
    "            predicted = scores.argmax(axis=1)\n",
    "            return f1_score(labels.detach().cpu().numpy(), predicted, average='macro')\n",
    "        # Binary: positive when the sigmoid probability exceeds 0.5.\n",
    "        scores = torch.sigmoid(outputs).detach().cpu().numpy()\n",
    "        predicted = (scores > 0.5).astype(int)\n",
    "        return f1_score(labels.detach().cpu().numpy(), predicted)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3ba6b90-69ee-4f0c-8757-feb135ef5d48",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Loss\n",
    "from torch import nn\n",
    "from PyTorchLabFlow.utils import Loss\n",
    "\n",
    "class BCElogit(Loss):\n",
    "    \"\"\"Binary cross-entropy on raw logits, backed by ``nn.BCEWithLogitsLoss``.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        self.args = {}  # this component takes no configurable arguments\n",
    "\n",
    "    def _setup(self, args):\n",
    "        \"\"\"Create the criterion; ``args`` is not read.\"\"\"\n",
    "        self.criterion = nn.BCEWithLogitsLoss()\n",
    "\n",
    "    def forward(self, logits, y_true):\n",
    "        \"\"\"Return the loss of ``logits`` against ``y_true[0]``.\n",
    "\n",
    "        ``y_true`` is a sequence whose first element is the target tensor;\n",
    "        ``logits`` is reshaped to that tensor's shape before the loss is taken.\n",
    "        \"\"\"\n",
    "        # The former debug print read `y_true.shape`, which fails because\n",
    "        # `y_true` is a sequence here, not a tensor.\n",
    "        y_true = y_true[0]\n",
    "        logits = logits.view_as(y_true)\n",
    "        loss = self.criterion(logits, y_true.float())\n",
    "        return loss\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b643386-831e-4336-8b48-b552a3dd941f",
   "metadata": {},
   "outputs": [],
   "source": [
    "expargs = {\n",
    "    'dataset':{\n",
    "        'loc':'DS01',\n",
    "        'args':{ }\n",
    "    },\n",
    "    'model':{\n",
    "        'loc': 'SimpleNN',\n",
    "        'args': {\"h1_dim\":120, \"h2_dim\":1000, 'drop':0.3}\n",
    "    },\n",
    "    \"loss\":{\n",
    "        'loc':\"BCElogit\",\n",
    "        'args':{},\n",
    "    },\n",
    "    'optimizer':{\n",
    "        'loc':'OptAdam',\n",
    "        'args':{}\n",
    "    },\n",
    "    \"metrics\":{\n",
    "        \"accuracy\":{\n",
    "            'loc':\"BinAcc\",\n",
    "            'args':{}\n",
    "        } ,\n",
    "        \"auroc\":{\n",
    "            'loc':\"AUROC\",\n",
    "            'args':{}\n",
    "        } ,\n",
    "        \"f1score\":{\n",
    "            'loc':\"AUPRC\",\n",
    "            'args':{}\n",
    "        } ,\n",
    "        \"auprc\":{\n",
    "            'loc':\"F1Score\",\n",
    "            'args':{}\n",
    "        }\n",
    "    },\n",
    "    \"train_data_src\": r\"D:\\stdML\\Py310\\Adult\\Prepared\\raw2\\train.csv\",\n",
    "    \"val_data_src\": r\"D:\\stdML\\Py310\\Adult\\Prepared\\raw2\\valid.csv\",\n",
    "    \"train_batch_size\":36,\n",
    "    \"val_batch_size\":36\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b64ae29-6d16-4c82-9f51-c85492ae6fba",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "a9a99909-3c3a-4091-a37e-77ccfd45b401",
   "metadata": {},
   "source": [
    "## Embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "608c5754-f07b-4744-a649-32f8939cbeb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import torch\n",
    "class DS02(DataSet):\n",
    "    \"\"\"Adult-income dataset split into categorical and continuous tensors.\n",
    "\n",
    "    Categorical columns are integer-encoded through ``self.label_encoders`` and\n",
    "    stored as a long tensor; continuous columns and the label are stored as\n",
    "    float32. Items are returned as ``([categorical, continuous], [label])``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        # Argument template; `data_src` is the CSV path consumed by `_setup`.\n",
    "        self.args = {\"data_src\":None}\n",
    "\n",
    "    def _setup(self, args):\n",
    "        \"\"\"Load ``args['data_src']``, encode it and cache the tensors.\n",
    "\n",
    "        Rows containing the ``'?'`` placeholder are dropped. Raises\n",
    "        ``ValueError`` (from ``pd.to_numeric``) when a categorical or label\n",
    "        value is neither a key of its mapping nor numeric.\n",
    "        \"\"\"\n",
    "\n",
    "        def encode(series, mapping):\n",
    "            # Explicit lookup instead of `Series.replace(mapping)`: replace()\n",
    "            # silently downcasts the object column, which pandas deprecates\n",
    "            # (FutureWarning). Values missing from `mapping` pass through\n",
    "            # unchanged and to_numeric rejects any that are not numeric.\n",
    "            return pd.to_numeric(series.map(lambda value: mapping.get(value, value)))\n",
    "\n",
    "        raw = pd.read_csv(args['data_src'])\n",
    "        # '?' is treated as a missing value: drop every row that contains one.\n",
    "        self.df = raw.replace('?', pd.NA).dropna()\n",
    "        # Define categorical and continuous columns\n",
    "        self.cat_cols = [\n",
    "            'workclass', 'education', 'marital_status', 'relationship', 'race',\n",
    "            'occupation', 'native_country'\n",
    "        ]\n",
    "        self.cont_cols = [\n",
    "            'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week'\n",
    "        ]\n",
    "        self.label_col = 'income'\n",
    "\n",
    "        # Category -> integer code for every categorical column.\n",
    "        self.label_encoders = {\n",
    "            'workclass': {\n",
    "                'Private': 0, 'Local-gov': 1, 'Self-emp-not-inc': 2, 'Federal-gov': 3,\n",
    "                'State-gov': 4, 'Self-emp-inc': 5, 'Without-pay': 6, 'Never-worked': 7\n",
    "            },\n",
    "            'education': {\n",
    "                '11th': 0, 'HS-grad': 1, 'Assoc-acdm': 2, 'Some-college': 3, '10th': 4,\n",
    "                'Prof-school': 5, '7th-8th': 6, 'Bachelors': 7, 'Masters': 8, '5th-6th': 9,\n",
    "                'Assoc-voc': 10, '9th': 11, 'Doctorate': 12, '12th': 13, '1st-4th': 14, 'Preschool': 15\n",
    "            },\n",
    "            'marital_status': {\n",
    "                'Never-married': 0, 'Married-civ-spouse': 1, 'Widowed': 2,\n",
    "                'Divorced': 3, 'Separated': 4, 'Married-spouse-absent': 5, 'Married-AF-spouse': 6\n",
    "            },\n",
    "            'relationship': {\n",
    "                'Own-child': 0, 'Husband': 1, 'Not-in-family': 2,\n",
    "                'Unmarried': 3, 'Wife': 4, 'Other-relative': 5\n",
    "            },\n",
    "            'race': {\n",
    "                'Black': 0, 'White': 1, 'Other': 2, 'Amer-Indian-Eskimo': 3, 'Asian-Pac-Islander': 4\n",
    "            },\n",
    "            'occupation': {\n",
    "                'Machine-op-inspct': 0, 'Farming-fishing': 1, 'Protective-serv': 2,\n",
    "                'Other-service': 3, 'Prof-specialty': 4, 'Craft-repair': 5,\n",
    "                'Adm-clerical': 6, 'Exec-managerial': 7, 'Tech-support': 8,\n",
    "                'Sales': 9, 'Priv-house-serv': 10, 'Transport-moving': 11,\n",
    "                'Handlers-cleaners': 12, 'Armed-Forces': 13\n",
    "            },\n",
    "            'native_country': {\n",
    "                'United-States': 0, 'Peru': 1, 'Guatemala': 2, 'Mexico': 3, 'Dominican-Republic': 4,\n",
    "                'Ireland': 5, 'Germany': 6, 'Philippines': 7, 'Thailand': 8, 'Haiti': 9, 'El-Salvador': 10,\n",
    "                'Puerto-Rico': 11, 'Vietnam': 12, 'South': 13, 'Columbia': 14, 'Japan': 15, 'India': 16,\n",
    "                'Cambodia': 17, 'Poland': 18, 'Laos': 19, 'England': 20, 'Cuba': 21, 'Taiwan': 22,\n",
    "                'Italy': 23, 'Canada': 24, 'Portugal': 25, 'China': 26, 'Nicaragua': 27, 'Honduras': 28,\n",
    "                'Iran': 29, 'Scotland': 30, 'Jamaica': 31, 'Ecuador': 32, 'Yugoslavia': 33, 'Hungary': 34,\n",
    "                'Hong': 35, 'Greece': 36, 'Trinadad&Tobago': 37, 'Outlying-US(Guam-USVI-etc)': 38,\n",
    "                'France': 39, 'Holand-Netherlands': 40\n",
    "            }\n",
    "        }\n",
    "\n",
    "        # Encode categorical variables\n",
    "        for col, mapping in self.label_encoders.items():\n",
    "            self.df[col] = encode(self.df[col], mapping)\n",
    "\n",
    "        # Encode label column\n",
    "        self.df[self.label_col] = encode(self.df[self.label_col], {'<=50K': 0, '>50K': 1})\n",
    "\n",
    "        # Convert everything to torch tensors once, so __getitem__ only indexes.\n",
    "        self.cat_data = torch.tensor(self.df[self.cat_cols].values, dtype=torch.long)\n",
    "        self.cont_data = torch.tensor(self.df[self.cont_cols].values, dtype=torch.float32)\n",
    "        self.labels = torch.tensor(self.df[self.label_col].values, dtype=torch.float32).unsqueeze(1)\n",
    "\n",
    "    def __len__(self):\n",
    "        \"\"\"Return the number of rows kept after dropping incomplete ones.\"\"\"\n",
    "        return len(self.df)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        \"\"\"Return ``([categorical, continuous], [label])`` for row ``idx``.\"\"\"\n",
    "        label = self.labels[idx]\n",
    "        return [self.cat_data[idx], self.cont_data[idx]], [label]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "e841ece6-fd51-45a3-8083-c5b2d56486c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.utils.data import Dataset,DataLoader\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "b35af12d-093b-481a-ad22-deb33779c149",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\stdML\\GitDesk\\PTLF2\\src\\PTLF\\utils.py:94: UserWarning: DS02 component is not saved. Make sure to save it in an appropriate location beforeinitiating an experiment, test, or report.\n",
      "  warnings.warn(\n",
      "C:\\Users\\BBEK-Anand\\AppData\\Local\\Temp\\ipykernel_2920\\854477436.py:64: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
      "  self.df[col] = self.df[col].replace(mapping)\n",
      "C:\\Users\\BBEK-Anand\\AppData\\Local\\Temp\\ipykernel_2920\\854477436.py:67: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
      "  self.df[self.label_col] = self.df[self.label_col].replace({'<=50K': 0, '>50K': 1})\n"
     ]
    }
   ],
   "source": [
    "dsargs = {\n",
    "        'loc':'DS02',\n",
    "        'args':{\n",
    "            \"data_src\": \"D:/stdML/Py310/Adult/Prepared/raw/test.csv\"\n",
    "        }\n",
    "}\n",
    "ds = load_component(**dsargs, setup=True)\n",
    "Dloader = DataLoader(ds,batch_size=32)\n",
    "dT = iter(Dloader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "012741b4-e879-49b7-9e3f-93ae7b38440f",
   "metadata": {},
   "outputs": [],
   "source": [
    "dt = next(dT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "ea233717-4962-490f-aa9c-8169a2fdd36e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[3.6000e+01, 3.2709e+04, 1.0000e+01, 3.3250e+03, 0.0000e+00, 4.5000e+01],\n",
       "        [1.9000e+01, 4.4381e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 3.0000e+01],\n",
       "        [2.3000e+01, 2.4040e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 1.5000e+01],\n",
       "        [4.9000e+01, 3.3087e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 7.0000e+01],\n",
       "        [3.5000e+01, 3.7646e+05, 1.0000e+01, 1.5024e+04, 0.0000e+00, 6.0000e+01],\n",
       "        [3.4000e+01, 1.9200e+05, 1.4000e+01, 0.0000e+00, 0.0000e+00, 5.5000e+01],\n",
       "        [2.2000e+01, 2.9293e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 1.0000e+01],\n",
       "        [4.3000e+01, 1.7023e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 6.0000e+01],\n",
       "        [3.2000e+01, 3.3154e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n",
       "        [4.6000e+01, 2.7377e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n",
       "        [3.5000e+01, 1.5231e+05, 7.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n",
       "        [4.2000e+01, 1.9512e+05, 4.0000e+00, 0.0000e+00, 0.0000e+00, 3.5000e+01],\n",
       "        [4.0000e+01, 1.2147e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n",
       "        [4.6000e+01, 1.6047e+05, 9.0000e+00, 0.0000e+00, 1.5900e+03, 4.3000e+01],\n",
       "        [3.9000e+01, 2.4505e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 5.0000e+01],\n",
       "        [5.3000e+01, 9.6062e+04, 1.0000e+01, 0.0000e+00, 1.7400e+03, 4.0000e+01],\n",
       "        [1.9000e+01, 2.3448e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 7.0000e+00],\n",
       "        [3.0000e+01, 1.5412e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 6.5000e+01],\n",
       "        [1.9000e+01, 1.3043e+05, 3.0000e+00, 0.0000e+00, 0.0000e+00, 3.6000e+01],\n",
       "        [2.2000e+01, 1.2497e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.5000e+01],\n",
       "        [4.8000e+01, 3.2466e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n",
       "        [5.0000e+01, 1.5895e+05, 4.0000e+00, 3.4110e+03, 0.0000e+00, 4.0000e+01],\n",
       "        [2.8000e+01, 4.1195e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n",
       "        [2.6000e+01, 2.9280e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 2.4000e+01],\n",
       "        [1.9000e+01, 2.8145e+04, 9.0000e+00, 0.0000e+00, 0.0000e+00, 5.2000e+01],\n",
       "        [2.5000e+01, 8.2560e+04, 1.2000e+01, 0.0000e+00, 0.0000e+00, 4.3000e+01],\n",
       "        [5.6000e+01, 6.5956e+05, 8.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n",
       "        [2.2000e+01, 1.3850e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 3.0000e+01],\n",
       "        [3.8000e+01, 3.1223e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 5.0000e+01],\n",
       "        [1.9000e+01, 2.5158e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 1.4000e+01],\n",
       "        [1.8000e+01, 4.3272e+04, 1.0000e+01, 0.0000e+00, 0.0000e+00, 2.0000e+01],\n",
       "        [3.5000e+01, 1.0371e+05, 1.4000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01]])"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dt[0][1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba8f6f14-a565-42bd-b078-4eb41468193e",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'embedding_info': [(7, 4),\n",
       "  (16, 8),\n",
       "  (7, 4),\n",
       "  (6, 3),\n",
       "  (5, 3),\n",
       "  (14, 7),\n",
       "  (40, 20)],\n",
       " 'continuous_dim': 6,\n",
       " 'hidden_dim': 64,\n",
       " 'output_dim': 1}"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def generate_model_params(dataset):\n",
    "    embedding_info = []\n",
    "\n",
    "    for col in dataset.cat_cols:\n",
    "        max_val = dataset.df[col].max()\n",
    "        num_categories = int(max_val + 1)  # include -1 if used\n",
    "        emb_dim = min(50, (num_categories + 1) // 2)\n",
    "        embedding_info.append((num_categories, emb_dim))\n",
    "\n",
    "    continuous_dim = len(dataset.cont_cols)\n",
    "    output_dim = 1  # binary classification\n",
    "\n",
    "    return {\n",
    "        \"embedding_info\": embedding_info,\n",
    "        \"continuous_dim\": continuous_dim,\n",
    "        \"hidden_dim\": 64,\n",
    "        \"output_dim\": output_dim\n",
    "    }\n",
    "\n",
    "# Example:\n",
    "params = generate_model_params(ds)\n",
    "params\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c7d0223-f39e-4e09-86e7-7150ce364f8a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "==========================================================================================\n",
       "Layer (type:depth-idx)                   Output Shape              Param #\n",
       "==========================================================================================\n",
       "SimpleNNe                                [32, 1]                   --\n",
       "├─ModuleList: 1-1                        --                        --\n",
       "│    └─Embedding: 2-1                    [32, 4]                   28\n",
       "│    └─Embedding: 2-2                    [32, 84]                  1,344\n",
       "│    └─Embedding: 2-3                    [32, 40]                  280\n",
       "│    └─Embedding: 2-4                    [32, 30]                  180\n",
       "│    └─Embedding: 2-5                    [32, 300]                 1,500\n",
       "│    └─Embedding: 2-6                    [32, 790]                 11,060\n",
       "│    └─Embedding: 2-7                    [32, 21]                  861\n",
       "├─Sequential: 1-2                        [32, 1]                   --\n",
       "│    └─Linear: 2-8                       [32, 648]                 826,848\n",
       "│    └─ReLU: 2-9                         [32, 648]                 --\n",
       "│    └─Dropout: 2-10                     [32, 648]                 --\n",
       "│    └─Linear: 2-11                      [32, 1]                   649\n",
       "==========================================================================================\n",
       "Total params: 842,750\n",
       "Trainable params: 842,750\n",
       "Non-trainable params: 0\n",
       "Total mult-adds (M): 26.97\n",
       "==========================================================================================\n",
       "Input size (MB): 0.00\n",
       "Forward/backward pass size (MB): 0.49\n",
       "Params size (MB): 3.37\n",
       "Estimated Total Size (MB): 3.86\n",
       "=========================================================================================="
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch.nn as nn\n",
    "from torch.nn import functional as F\n",
    "from torchinfo import summary\n",
    "\n",
    "class SimpleNNe(Model):\n",
    "    \"\"\"Embedding-based classifier: one embedding table per categorical column,\n",
    "    concatenated with the continuous features and fed to a small MLP that\n",
    "    emits a single logit.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        # Argument template for the values `_setup` reads.\n",
    "        self.args = dict.fromkeys((\"embedding_info\", \"continuous_dim\", \"hidden_dim\", \"drop\"))\n",
    "\n",
    "    def _setup(self, args):\n",
    "        \"\"\"Build the embedding tables and the MLP head.\n",
    "\n",
    "        ``args['embedding_info']`` is a sequence of ``(num_categories, emb_dim)``\n",
    "        pairs; ``continuous_dim``, ``hidden_dim`` and ``drop`` size the head.\n",
    "        \"\"\"\n",
    "        spec = args['embedding_info']\n",
    "        n_cont = args['continuous_dim']\n",
    "        width = args['hidden_dim']\n",
    "        p_drop = args['drop']\n",
    "\n",
    "        tables = nn.ModuleList()\n",
    "        emb_total = 0  # summed embedding widths = size of the concatenated vector\n",
    "        for vocab_size, dim in spec:\n",
    "            tables.append(nn.Embedding(vocab_size, dim))\n",
    "            emb_total += dim\n",
    "        self.embeddings = tables\n",
    "\n",
    "        self.continuous_dim = n_cont\n",
    "\n",
    "        head = [\n",
    "            nn.Linear(emb_total + n_cont, width),\n",
    "            nn.ReLU(),\n",
    "            nn.Dropout(p_drop),\n",
    "            nn.Linear(width, 1),\n",
    "        ]\n",
    "        self.fc = nn.Sequential(*head)\n",
    "\n",
    "    def forward(self, x_cat, x_cont):\n",
    "        \"\"\"Embed column ``i`` of ``x_cat`` with table ``i``, append ``x_cont``, run the head.\"\"\"\n",
    "        columns = []\n",
    "        for index, table in enumerate(self.embeddings):\n",
    "            columns.append(table(x_cat[:, index]))\n",
    "        embedded = torch.cat(columns, dim=1)\n",
    "        features = torch.cat([embedded, x_cont], dim=1)\n",
    "        return self.fc(features)\n",
    "\n",
    "\n",
    "# Hyper-parameters for the demo model; embedding_info holds one\n",
    "# (num_categories, emb_dim) pair per categorical column.\n",
    "args = {\n",
    "    'embedding_info': [\n",
    "        (7, 4), (16, 84), (7, 40), (6, 30), (5, 300), (14, 790), (41, 21),\n",
    "    ],\n",
    "    'continuous_dim': 6,\n",
    "    'hidden_dim': 648,\n",
    "    'drop': 0.3,\n",
    "}\n",
    "\n",
    "# NOTE(review): setup=True presumably runs SimpleNNe._setup(args) - confirm in PTLF.utils.\n",
    "model = load_component(loc='SimpleNNe', args=args, setup=True)\n",
    "# Optional: model.to('cuda') before summarising to place the model on the GPU.\n",
    "# `dt` is defined in an earlier cell; a direct forward pass would be model(*dt[0]).\n",
    "summary(model=model, input_data=dt[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c608fae-78d0-43f9-a48b-29bdd841acad",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "390786eb-e4d5-439e-9816-f0476dc2fc65",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Experiment configuration. Each component entry names the component to load (`loc`)\n",
    "# and the keyword arguments it is set up with (`args`).\n",
    "expargs = {\n",
    "    'dataset':{\n",
    "        'loc':'DS02',\n",
    "        'args':{ }\n",
    "    },\n",
    "    'model':{\n",
    "        'loc': 'SimpleNNe',\n",
    "        # One (num_categories, emb_dim) pair per categorical column.\n",
    "        'args': {'embedding_info': [(7, 4),\n",
    "                                  (16, 84),\n",
    "                                  (7, 40),\n",
    "                                  (6, 30),\n",
    "                                  (5, 300),\n",
    "                                  (14, 790),\n",
    "                                  (41, 21)],\n",
    "                 'continuous_dim': 6,\n",
    "                 'hidden_dim': 648, 'drop':0.3\n",
    "                }\n",
    "    },\n",
    "    \"loss\":{\n",
    "        'loc':\"BCElogit\",\n",
    "        'args':{},\n",
    "    },\n",
    "    'optimizer':{\n",
    "        'loc':'OptAdam',\n",
    "        'args':{}\n",
    "    },\n",
    "    # Metric label mapped to the component that computes it. Keep each label in sync\n",
    "    # with its `loc`, otherwise scores are reported under the wrong name.\n",
    "    \"metrics\":{\n",
    "        \"accuracy\":{\n",
    "            'loc':\"BinAcc\",\n",
    "            'args':{}\n",
    "        } ,\n",
    "        \"auroc\":{\n",
    "            'loc':\"AUROC\",\n",
    "            'args':{}\n",
    "        } ,\n",
    "        \"f1score\":{\n",
    "            'loc':\"F1Score\",\n",
    "            'args':{}\n",
    "        } ,\n",
    "        \"auprc\":{\n",
    "            'loc':\"AUPRC\",\n",
    "            'args':{}\n",
    "        }\n",
    "    },\n",
    "    # NOTE: machine-specific absolute paths; point these at your own train/validation CSVs.\n",
    "    \"train_data_src\": r\"D:\\stdML\\Py310\\Adult\\Prepared\\raw\\train.csv\",\n",
    "    \"val_data_src\": r\"D:\\stdML\\Py310\\Adult\\Prepared\\raw\\valid.csv\",\n",
    "    \"train_batch_size\":36,\n",
    "    \"val_batch_size\":36\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06874d40-346f-4913-b02b-a19366fcc800",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a111a48-0d3b-4291-bca3-aa060e15a4a7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}