{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "bafac90c-a1fd-4a6e-ba0f-fc5c9b2f0483", "metadata": {}, "outputs": [], "source": [ "from PTLF.lab import lab_setup\n", "lab_setup(settings_path='path/to/your/project/dir/project_name.json')" ] }, { "cell_type": "code", "execution_count": null, "id": "f3597210-b1aa-4b8a-8056-c3cfc29678c3", "metadata": {}, "outputs": [], "source": [ "from PTLF.utils import *" ] }, { "cell_type": "markdown", "id": "0fee6bae-8c2e-4090-a057-84414c4376f8", "metadata": {}, "source": [ "## Simple" ] }, { "cell_type": "code", "execution_count": null, "id": "30ea8e01-4da4-4009-8ceb-6d919c663ed0", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "class DS01(DataSet):\n", " def __init__(self):\n", " self.args = {\"data_src\":None}\n", "\n", " def _setup(self, args):\n", " self.df = pd.read_csv(args['data_src'])\n", "\n", " def __len__(self):\n", " return len(self.df)\n", "\n", " def __getitem__(self, idx):\n", " row = self.df.iloc[idx, :].values\n", " row = torch.tensor(row, dtype=torch.float32) # Convert entire row to float32 tensor\n", " label = row[-1]\n", " data = row[:-1]\n", " return [data], [label]\n" ] }, { "cell_type": "code", "execution_count": 37, "id": "7e1bc229-db7a-48b4-af0f-0cec13799bfd", "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import Dataset,DataLoader\n", "import torch" ] }, { "cell_type": "code", "execution_count": 38, "id": "51b0a928-bd7c-4b45-a269-219080747b27", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "D:\\stdML\\GitDesk\\PTLF2\\src\\PTLF\\utils.py:94: UserWarning: DS01 component is not saved. Make sure to save it in an appropriate location beforeinitiating an experiment, test, or report.\n", " warnings.warn(\n" ] } ], "source": [ "dsargs = {\n", " 'loc':'DS01',\n", " 'args':{\n", " \"data_src\": r\"D:\\stdML\\Py310\\Adult\\Prepared\\raw2\\test.csv\"\n", " }\n", "}\n", "ds = load_component(**dsargs, setup=True)\n", "Dloader = DataLoader(ds,batch_size=32)\n", "dT = iter(Dloader)" ] }, { "cell_type": "code", "execution_count": 39, "id": "bcb2168b-3c5a-45ae-a64d-68d0d128e682", "metadata": {}, "outputs": [], "source": [ "dt = next(dT)" ] }, { "cell_type": "code", "execution_count": 40, "id": "24ff7ca7-a248-4115-8ed0-e4c26b4c5dba", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[tensor([0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])]" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dt[1]" ] }, { "cell_type": "code", "execution_count": 41, "id": "cdb66a9d-d4db-436a-ad3a-0d18c24b02d9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([32, 14])" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dt[0][0].shape" ] }, { "cell_type": "code", "execution_count": null, "id": "afbd10f7-77da-4703-a112-48d89f21557a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "7beec04b-05c3-430f-bea1-49115357d8f2", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "D:\\stdML\\GitDesk\\PTLF2\\src\\PTLF\\utils.py:94: UserWarning: SimpleNN component is not saved. Make sure to save it in an appropriate location beforeinitiating an experiment, test, or report.\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "==========================================================================================\n", "Layer (type:depth-idx) Output Shape Param #\n", "==========================================================================================\n", "SimpleNN [32, 1] --\n", "├─Sequential: 1-1 [32, 200] --\n", "│ └─Linear: 2-1 [32, 120] 1,800\n", "│ └─ReLU: 2-2 [32, 120] --\n", "│ └─Linear: 2-3 [32, 100] 12,100\n", "│ └─ReLU: 2-4 [32, 100] --\n", "│ └─Linear: 2-5 [32, 200] 20,200\n", "│ └─ReLU: 2-6 [32, 200] --\n", "│ └─Linear: 2-7 [32, 200] 40,200\n", "│ └─ReLU: 2-8 [32, 200] --\n", "├─Dropout: 1-2 [32, 200] --\n", "├─Linear: 1-3 [32, 1] 201\n", "==========================================================================================\n", "Total params: 74,501\n", "Trainable params: 74,501\n", "Non-trainable params: 0\n", "Total mult-adds (M): 2.38\n", "==========================================================================================\n", "Input size (MB): 0.00\n", "Forward/backward pass size (MB): 0.16\n", "Params size (MB): 0.30\n", "Estimated Total Size (MB): 0.46\n", "==========================================================================================" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch.nn as nn\n", "from torch.nn import functional as F\n", "from torchinfo import summary\n", "\n", "class SimpleNN(Model):\n", " def __init__(self):\n", " super().__init__()\n", " self.args = {\"h1_dim\":None, \"h2_dim\":None,'drop':None}\n", " def _setup(self, args):\n", " h1_dim, h2_dim, drop = args['h1_dim'], args['h2_dim'], args['drop']\n", " self.seq = nn.Sequential(\n", " nn.Linear(14, h1_dim),\n", " nn.ReLU(),\n", " nn.Linear(h1_dim, h2_dim),\n", " nn.ReLU(),\n", " nn.Linear(h2_dim, h2_dim*2),\n", " nn.ReLU(),\n", " nn.Linear(h2_dim*2, h2_dim*2),\n", " nn.ReLU()\n", " )\n", "\n", " self.dropout = nn.Dropout(p=drop)\n", " self.final = nn.Linear(h2_dim*2, 1)\n", "\n", " def forward(self, x):\n", " x = self.seq(x)\n", " # print(x.shape)\n", " x = self.dropout(x)\n", " x = self.final(x)\n", " return x\n", "\n", "model = load_component(loc='SimpleNN', args={\"h1_dim\":120, \"h2_dim\":100, 'drop':0.3}, setup=True)\n", "# model.to('cuda')\n", "summary(model=model, input_data=dt[0])\n", "# model(*dt[0]).shape" ] }, { "cell_type": "code", "execution_count": 43, "id": "fc17ea54-bbbb-484c-835c-fc674747baa0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ -220.3309],\n", " [-3426.8130],\n", " [-1660.9607],\n", " [ -948.7296],\n", " [-1603.6975],\n", " [ -275.6187],\n", " [-1397.0565],\n", " [-1364.8336],\n", " [-1171.5076],\n", " [-1769.9922],\n", " [ -481.8759],\n", " [-1443.5280],\n", " [ -222.4868],\n", " [ -874.3320],\n", " [ -927.0721],\n", " [ -677.9394],\n", " [ -546.9390],\n", " [ -435.9732],\n", " [ -694.5280],\n", " [ -785.2214],\n", " [-1834.9946],\n", " [ -287.0668],\n", " [-2317.0957],\n", " [-1651.9097],\n", " [ -170.4194],\n", " [ -214.1748],\n", " [-5181.8262],\n", " [ -711.2175],\n", " [-1923.5010],\n", " [ -833.2474],\n", " [ -293.7393],\n", " [ -859.3155]], grad_fn=)" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model(*dt[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "cac7858a-efda-48f1-a2e9-85e187409564", "metadata": {}, "outputs": [], "source": [ "#Optimizer\n", "import torch.optim as optim\n", "\n", "class OptAdam(Optimizer):\n", " def __init__(self):\n", " super().__init__()\n", "\n", " def _setup(self,args):\n", " learning_rate = args.get('learning_rate', 0.001)\n", " self.optimizer = optim.Adam(args['model_parameters'], lr=learning_rate)\n", "\n", " def step(self, **kwargs):\n", " self.optimizer.step()\n", "\n", " def zero_grad(self):\n", " self.optimizer.zero_grad()\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9b35faed-5da0-450b-a87f-8572b233a712", "metadata": {}, "outputs": [], "source": [ "#Metrics\n", "import torch\n", "from PyTorchLabFlow.utils import Metric\n", "from torchmetrics.classification import BinaryAccuracy\n", "\n", "class BinAcc(Metric):\n", " def __init__(self):\n", " super().__init__()\n", " self.accuracy = BinaryAccuracy()\n", "\n", " def setup(self, args):\n", " if self.check_args(args):\n", "\n", " self.accuracy = BinaryAccuracy()\n", " return self\n", " def forward(self,y_pred, y_true):\n", " y_true = y_true[0]\n", " y_pred = y_pred.view_as(y_true)\n", " accuracy = self.accuracy(y_pred, y_true)\n", " return accuracy.item()\n", "\n", "\n", "import torch.nn as nn\n", "from sklearn.metrics import roc_auc_score\n", "class AUROC(Metric):\n", " def __init__(self):\n", " super().__init__()\n", " def setup(self, args):\n", " if self.check_args(args):\n", "\n", " self.accuracy = BinaryAccuracy()\n", " return self\n", " def forward(self, outputs, targets):\n", " # If outputs are raw logits, apply a sigmoid or softmax to get probabilities\n", " # For binary classification (sigmoid)\n", " targets = targets[0]\n", " # outputs = outputs.view_as(targets)\n", " if outputs.size(1) == 1:\n", " probabilities = torch.sigmoid(outputs).detach().cpu().numpy()\n", " targets = targets.detach().cpu().numpy()\n", " auroc = roc_auc_score(targets, probabilities)\n", " # For multi-class classification (softmax)\n", " else:\n", " probabilities = torch.softmax(outputs, dim=1).detach().cpu().numpy()\n", " targets = targets.detach().cpu().numpy()\n", " # One-hot encode targets for multi-class\n", " auroc = roc_auc_score(targets, probabilities, average='macro', multi_class='ovr')\n", "\n", " return auroc\n", "\n", "from sklearn.metrics import average_precision_score\n", "\n", "class AUPRC(Metric):\n", " def __init__(self):\n", " super().__init__()\n", " def setup(self, args):\n", " if self.check_args(args):\n", "\n", " self.accuracy = BinaryAccuracy()\n", " return self\n", " def forward(self, outputs, targets):\n", " # If outputs are raw logits, apply a sigmoid or softmax to get probabilities\n", " # For binary classification (sigmoid)\n", " targets = targets[0]\n", " if outputs.size(1) == 1:\n", " probabilities = torch.sigmoid(outputs).detach().cpu().numpy()\n", " targets = targets.detach().cpu().numpy()\n", " auprc = average_precision_score(targets, probabilities)\n", " # For multi-class classification (softmax)\n", " else:\n", " probabilities = torch.softmax(outputs, dim=1).detach().cpu().numpy()\n", " targets = targets.detach().cpu().numpy()\n", " # For multi-class, use average_precision_score for each class separately and average\n", " auprc = average_precision_score(targets, probabilities, average='macro', multi_class='ovr')\n", "\n", " return auprc\n", "\n", "from sklearn.metrics import f1_score\n", "class F1Score(Metric):\n", " def __init__(self):\n", " super().__init__()\n", " def setup(self, args):\n", " if self.check_args(args):\n", "\n", " self.accuracy = BinaryAccuracy()\n", " return self\n", " def forward(self, outputs, targets):\n", " # If outputs are raw logits, apply a sigmoid or softmax to get probabilities\n", " # For binary classification (sigmoid)\n", " targets = targets[0]\n", " if outputs.size(1) == 1:\n", " probabilities = torch.sigmoid(outputs).detach().cpu().numpy()\n", " predictions = (probabilities > 0.5).astype(int) # Convert to 0 or 1 (binary classification)\n", " targets = targets.detach().cpu().numpy()\n", " f1 = f1_score(targets, predictions)\n", " # For multi-class classification (softmax)\n", " else:\n", " probabilities = torch.softmax(outputs, dim=1).detach().cpu().numpy()\n", " predictions = probabilities.argmax(axis=1) # Choose the class with the highest probability\n", " targets = targets.detach().cpu().numpy()\n", " f1 = f1_score(targets, predictions, average='macro') # Macro-average for multi-class\n", " return f1\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d3ba6b90-69ee-4f0c-8757-feb135ef5d48", "metadata": {}, "outputs": [], "source": [ "#Loss\n", "from torch import nn\n", "from PyTorchLabFlow.utils import Loss\n", "\n", "class BCElogit(Loss):\n", " def __init__(self):\n", " super().__init__()\n", " self.args ={}\n", " def _setup(self,args):\n", " self.criterion = nn.BCEWithLogitsLoss()\n", "\n", " def forward(self, logits, y_true):\n", " print('inside loss',type(y_true), y_true.shape)\n", " y_true = y_true[0]\n", " logits = logits.view_as(y_true)\n", " # print('sdjkfndjnvjnf',type(logits), type(y_true))\n", " loss = self.criterion(logits, y_true.float())\n", " return loss\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5b643386-831e-4336-8b48-b552a3dd941f", "metadata": {}, "outputs": [], "source": [ "expargs = {\n", " 'dataset':{\n", " 'loc':'DS01',\n", " 'args':{ }\n", " },\n", " 'model':{\n", " 'loc': 'SimpleNN',\n", " 'args': {\"h1_dim\":120, \"h2_dim\":1000, 'drop':0.3}\n", " },\n", " \"loss\":{\n", " 'loc':\"BCElogit\",\n", " 'args':{},\n", " },\n", " 'optimizer':{\n", " 'loc':'OptAdam',\n", " 'args':{}\n", " },\n", " \"metrics\":{\n", " \"accuracy\":{\n", " 'loc':\"BinAcc\",\n", " 'args':{}\n", " } ,\n", " \"auroc\":{\n", " 'loc':\"AUROC\",\n", " 'args':{}\n", " } ,\n", " \"f1score\":{\n", " 'loc':\"AUPRC\",\n", " 'args':{}\n", " } ,\n", " \"auprc\":{\n", " 'loc':\"F1Score\",\n", " 'args':{}\n", " }\n", " },\n", " \"train_data_src\": r\"D:\\stdML\\Py310\\Adult\\Prepared\\raw2\\train.csv\",\n", " \"val_data_src\": r\"D:\\stdML\\Py310\\Adult\\Prepared\\raw2\\valid.csv\",\n", " \"train_batch_size\":36,\n", " \"val_batch_size\":36\n", "}" ] }, { "cell_type": "code", "execution_count": null, "id": "2b64ae29-6d16-4c82-9f51-c85492ae6fba", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "a9a99909-3c3a-4091-a37e-77ccfd45b401", "metadata": {}, "source": [ "## Embedding" ] }, { "cell_type": "code", "execution_count": null, "id": "608c5754-f07b-4744-a649-32f8939cbeb0", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import torch\n", "class DS02(DataSet):\n", " def __init__(self):\n", " self.args = {\"data_src\":None}\n", "\n", " def _setup(self, args):\n", " self.df = pd.read_csv(args['data_src'])\n", " self.df.replace('?', pd.NA, inplace=True)\n", " self.df = self.df.dropna()\n", " # Define categorical and continuous columns\n", " self.cat_cols = [\n", " 'workclass', 'education', 'marital_status', 'relationship', 'race',\n", " 'occupation', 'native_country'\n", " ]\n", " self.cont_cols = [\n", " 'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week'\n", " ]\n", " self.label_col = 'income'\n", "\n", " # Define mappings for categorical columns (ensure this matches your earlier mappings)\n", " self.label_encoders = {\n", " 'workclass': {\n", " 'Private': 0, 'Local-gov': 1, 'Self-emp-not-inc': 2, 'Federal-gov': 3,\n", " 'State-gov': 4, 'Self-emp-inc': 5, 'Without-pay': 6, 'Never-worked': 7\n", " },\n", " 'education': {\n", " '11th': 0, 'HS-grad': 1, 'Assoc-acdm': 2, 'Some-college': 3, '10th': 4,\n", " 'Prof-school': 5, '7th-8th': 6, 'Bachelors': 7, 'Masters': 8, '5th-6th': 9,\n", " 'Assoc-voc': 10, '9th': 11, 'Doctorate': 12, '12th': 13, '1st-4th': 14, 'Preschool': 15\n", " },\n", " 'marital_status': {\n", " 'Never-married': 0, 'Married-civ-spouse': 1, 'Widowed': 2,\n", " 'Divorced': 3, 'Separated': 4, 'Married-spouse-absent': 5, 'Married-AF-spouse': 6\n", " },\n", " 'relationship': {\n", " 'Own-child': 0, 'Husband': 1, 'Not-in-family': 2,\n", " 'Unmarried': 3, 'Wife': 4, 'Other-relative': 5\n", " },\n", " 'race': {\n", " 'Black': 0, 'White': 1, 'Other': 2, 'Amer-Indian-Eskimo': 3, 'Asian-Pac-Islander': 4\n", " },\n", " 'occupation': {\n", " 'Machine-op-inspct': 0, 'Farming-fishing': 1, 'Protective-serv': 2,\n", " 'Other-service': 3, 'Prof-specialty': 4, 'Craft-repair': 5,\n", " 'Adm-clerical': 6, 'Exec-managerial': 7, 'Tech-support': 8,\n", " 'Sales': 9, 'Priv-house-serv': 10, 'Transport-moving': 11,\n", " 'Handlers-cleaners': 12, 'Armed-Forces': 13\n", " },\n", " 'native_country': {\n", " 'United-States': 0, 'Peru': 1, 'Guatemala': 2, 'Mexico': 3, 'Dominican-Republic': 4,\n", " 'Ireland': 5, 'Germany': 6, 'Philippines': 7, 'Thailand': 8, 'Haiti': 9, 'El-Salvador': 10,\n", " 'Puerto-Rico': 11, 'Vietnam': 12, 'South': 13, 'Columbia': 14, 'Japan': 15, 'India': 16,\n", " 'Cambodia': 17, 'Poland': 18, 'Laos': 19, 'England': 20, 'Cuba': 21, 'Taiwan': 22,\n", " 'Italy': 23, 'Canada': 24, 'Portugal': 25, 'China': 26, 'Nicaragua': 27, 'Honduras': 28,\n", " 'Iran': 29, 'Scotland': 30, 'Jamaica': 31, 'Ecuador': 32, 'Yugoslavia': 33, 'Hungary': 34,\n", " 'Hong': 35, 'Greece': 36, 'Trinadad&Tobago': 37, 'Outlying-US(Guam-USVI-etc)': 38,\n", " 'France': 39, 'Holand-Netherlands': 40\n", " }\n", " }\n", "\n", " # Encode categorical variables\n", " for col, mapping in self.label_encoders.items():\n", " self.df[col] = self.df[col].replace(mapping)\n", "\n", " # Encode label column\n", " self.df[self.label_col] = self.df[self.label_col].replace({'<=50K': 0, '>50K': 1})\n", "\n", " # Convert everything to torch tensors\n", " self.cat_data = torch.tensor(self.df[self.cat_cols].values, dtype=torch.long)\n", " self.cont_data = torch.tensor(self.df[self.cont_cols].values, dtype=torch.float32)\n", " self.labels = torch.tensor(self.df[self.label_col].values, dtype=torch.float32).unsqueeze(1)\n", " def __len__(self):\n", " return len(self.df)\n", "\n", " def __getitem__(self, idx):\n", " label = self.labels[idx]\n", " return [self.cat_data[idx], self.cont_data[idx]], [label]\n" ] }, { "cell_type": "code", "execution_count": 57, "id": "e841ece6-fd51-45a3-8083-c5b2d56486c0", "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import Dataset,DataLoader\n", "import torch" ] }, { "cell_type": "code", "execution_count": 58, "id": "b35af12d-093b-481a-ad22-deb33779c149", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "D:\\stdML\\GitDesk\\PTLF2\\src\\PTLF\\utils.py:94: UserWarning: DS02 component is not saved. Make sure to save it in an appropriate location beforeinitiating an experiment, test, or report.\n", " warnings.warn(\n", "C:\\Users\\BBEK-Anand\\AppData\\Local\\Temp\\ipykernel_2920\\854477436.py:64: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", " self.df[col] = self.df[col].replace(mapping)\n", "C:\\Users\\BBEK-Anand\\AppData\\Local\\Temp\\ipykernel_2920\\854477436.py:67: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", " self.df[self.label_col] = self.df[self.label_col].replace({'<=50K': 0, '>50K': 1})\n" ] } ], "source": [ "dsargs = {\n", " 'loc':'DS02',\n", " 'args':{\n", " \"data_src\": \"D:/stdML/Py310/Adult/Prepared/raw/test.csv\"\n", " }\n", "}\n", "ds = load_component(**dsargs, setup=True)\n", "Dloader = DataLoader(ds,batch_size=32)\n", "dT = iter(Dloader)" ] }, { "cell_type": "code", "execution_count": 59, "id": "012741b4-e879-49b7-9e3f-93ae7b38440f", "metadata": {}, "outputs": [], "source": [ "dt = next(dT)" ] }, { "cell_type": "code", "execution_count": 60, "id": "ea233717-4962-490f-aa9c-8169a2fdd36e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[3.6000e+01, 3.2709e+04, 1.0000e+01, 3.3250e+03, 0.0000e+00, 4.5000e+01],\n", " [1.9000e+01, 4.4381e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 3.0000e+01],\n", " [2.3000e+01, 2.4040e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 1.5000e+01],\n", " [4.9000e+01, 3.3087e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 7.0000e+01],\n", " [3.5000e+01, 3.7646e+05, 1.0000e+01, 1.5024e+04, 0.0000e+00, 6.0000e+01],\n", " [3.4000e+01, 1.9200e+05, 1.4000e+01, 0.0000e+00, 0.0000e+00, 5.5000e+01],\n", " [2.2000e+01, 2.9293e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 1.0000e+01],\n", " [4.3000e+01, 1.7023e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 6.0000e+01],\n", " [3.2000e+01, 3.3154e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n", " [4.6000e+01, 2.7377e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n", " [3.5000e+01, 1.5231e+05, 7.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n", " [4.2000e+01, 1.9512e+05, 4.0000e+00, 0.0000e+00, 0.0000e+00, 3.5000e+01],\n", " [4.0000e+01, 1.2147e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n", " [4.6000e+01, 1.6047e+05, 9.0000e+00, 0.0000e+00, 1.5900e+03, 4.3000e+01],\n", " [3.9000e+01, 2.4505e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 5.0000e+01],\n", " [5.3000e+01, 9.6062e+04, 1.0000e+01, 0.0000e+00, 1.7400e+03, 4.0000e+01],\n", " [1.9000e+01, 2.3448e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 7.0000e+00],\n", " [3.0000e+01, 1.5412e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 6.5000e+01],\n", " [1.9000e+01, 1.3043e+05, 3.0000e+00, 0.0000e+00, 0.0000e+00, 3.6000e+01],\n", " [2.2000e+01, 1.2497e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.5000e+01],\n", " [4.8000e+01, 3.2466e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n", " [5.0000e+01, 1.5895e+05, 4.0000e+00, 3.4110e+03, 0.0000e+00, 4.0000e+01],\n", " [2.8000e+01, 4.1195e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n", " [2.6000e+01, 2.9280e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 2.4000e+01],\n", " [1.9000e+01, 2.8145e+04, 9.0000e+00, 0.0000e+00, 0.0000e+00, 5.2000e+01],\n", " [2.5000e+01, 8.2560e+04, 1.2000e+01, 0.0000e+00, 0.0000e+00, 4.3000e+01],\n", " [5.6000e+01, 6.5956e+05, 8.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],\n", " [2.2000e+01, 1.3850e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 3.0000e+01],\n", " [3.8000e+01, 3.1223e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 5.0000e+01],\n", " [1.9000e+01, 2.5158e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 1.4000e+01],\n", " [1.8000e+01, 4.3272e+04, 1.0000e+01, 0.0000e+00, 0.0000e+00, 2.0000e+01],\n", " [3.5000e+01, 1.0371e+05, 1.4000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01]])" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dt[0][1]" ] }, { "cell_type": "code", "execution_count": null, "id": "ba8f6f14-a565-42bd-b078-4eb41468193e", "metadata": { "jupyter": { "source_hidden": true } }, "outputs": [ { "data": { "text/plain": [ "{'embedding_info': [(7, 4),\n", " (16, 8),\n", " (7, 4),\n", " (6, 3),\n", " (5, 3),\n", " (14, 7),\n", " (40, 20)],\n", " 'continuous_dim': 6,\n", " 'hidden_dim': 64,\n", " 'output_dim': 1}" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def generate_model_params(dataset):\n", " embedding_info = []\n", "\n", " for col in dataset.cat_cols:\n", " max_val = dataset.df[col].max()\n", " num_categories = int(max_val + 1) # include -1 if used\n", " emb_dim = min(50, (num_categories + 1) // 2)\n", " embedding_info.append((num_categories, emb_dim))\n", "\n", " continuous_dim = len(dataset.cont_cols)\n", " output_dim = 1 # binary classification\n", "\n", " return {\n", " \"embedding_info\": embedding_info,\n", " \"continuous_dim\": continuous_dim,\n", " \"hidden_dim\": 64,\n", " \"output_dim\": output_dim\n", " }\n", "\n", "# Example:\n", "params = generate_model_params(ds)\n", "params\n" ] }, { "cell_type": "code", "execution_count": null, "id": "4c7d0223-f39e-4e09-86e7-7150ce364f8a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "==========================================================================================\n", "Layer (type:depth-idx) Output Shape Param #\n", "==========================================================================================\n", "SimpleNNe [32, 1] --\n", "├─ModuleList: 1-1 -- --\n", "│ └─Embedding: 2-1 [32, 4] 28\n", "│ └─Embedding: 2-2 [32, 84] 1,344\n", "│ └─Embedding: 2-3 [32, 40] 280\n", "│ └─Embedding: 2-4 [32, 30] 180\n", "│ └─Embedding: 2-5 [32, 300] 1,500\n", "│ └─Embedding: 2-6 [32, 790] 11,060\n", "│ └─Embedding: 2-7 [32, 21] 861\n", "├─Sequential: 1-2 [32, 1] --\n", "│ └─Linear: 2-8 [32, 648] 826,848\n", "│ └─ReLU: 2-9 [32, 648] --\n", "│ └─Dropout: 2-10 [32, 648] --\n", "│ └─Linear: 2-11 [32, 1] 649\n", "==========================================================================================\n", "Total params: 842,750\n", "Trainable params: 842,750\n", "Non-trainable params: 0\n", "Total mult-adds (M): 26.97\n", "==========================================================================================\n", "Input size (MB): 0.00\n", "Forward/backward pass size (MB): 0.49\n", "Params size (MB): 3.37\n", "Estimated Total Size (MB): 3.86\n", "==========================================================================================" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch.nn as nn\n", "from torch.nn import functional as F\n", "from torchinfo import summary\n", "\n", "class SimpleNNe(Model):\n", " def __init__(self):\n", " super().__init__()\n", " self.args = {\"embedding_info\":None, \"continuous_dim\":None,'hidden_dim':None, 'drop':None}\n", "\n", " def _setup(self, args):\n", " embedding_info, continuous_dim, hidden_dim, drop = args['embedding_info'], args['continuous_dim'], args['hidden_dim'], args['drop']\n", " self.embeddings = nn.ModuleList([\n", " nn.Embedding(num_categories, emb_dim)\n", " for num_categories, emb_dim in embedding_info\n", " ])\n", "\n", " self.continuous_dim = continuous_dim\n", " total_emb_dim = sum(emb_dim for _, emb_dim in embedding_info)\n", "\n", " self.fc = nn.Sequential(\n", " nn.Linear(total_emb_dim + continuous_dim, hidden_dim),\n", " nn.ReLU(),\n", " nn.Dropout(drop),\n", " nn.Linear(hidden_dim, 1)\n", " )\n", "\n", " def forward(self, x_cat, x_cont):\n", " x = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]\n", " x = torch.cat(x, dim=1)\n", " x = torch.cat([x, x_cont], dim=1)\n", " return self.fc(x)\n", "\n", "\n", "args = {'embedding_info': [(7, 4),\n", " (16, 84),\n", " (7, 40),\n", " (6, 30),\n", " (5, 300),\n", " (14, 790),\n", " (41, 21)],\n", " 'continuous_dim': 6,\n", " 'hidden_dim': 648, 'drop':0.3}\n", "\n", "model = load_component(loc='SimpleNNe', args=args, setup=True)\n", "# model.to('cuda')\n", "summary(model=model, input_data=dt[0])\n", "# model(*dt[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "3c608fae-78d0-43f9-a48b-29bdd841acad", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "390786eb-e4d5-439e-9816-f0476dc2fc65", "metadata": {}, "outputs": [], "source": [ "expargs = {\n", " 'dataset':{\n", " 'loc':'DS02',\n", " 'args':{ }\n", " },\n", " 'model':{\n", " 'loc': 'SimpleNNe',\n", " 'args': {'embedding_info': [(7, 4),\n", " (16, 84),\n", " (7, 40),\n", " (6, 30),\n", " (5, 300),\n", " (14, 790),\n", " (41, 21)],\n", " 'continuous_dim': 6,\n", " 'hidden_dim': 648, 'drop':0.3\n", " }\n", " },\n", " \"loss\":{\n", " 'loc':\"BCElogit\",\n", " 'args':{},\n", " },\n", " 'optimizer':{\n", " 'loc':'OptAdam',\n", " 'args':{}\n", " },\n", " \"metrics\":{\n", " \"accuracy\":{\n", " 'loc':\"BinAcc\",\n", " 'args':{}\n", " } ,\n", " \"auroc\":{\n", " 'loc':\"AUROC\",\n", " 'args':{}\n", " } ,\n", " \"f1score\":{\n", " 'loc':\"AUPRC\",\n", " 'args':{}\n", " } ,\n", " \"auprc\":{\n", " 'loc':\"F1Score\",\n", " 'args':{}\n", " }\n", " },\n", " \"train_data_src\": r\"D:\\stdML\\Py310\\Adult\\Prepared\\raw\\train.csv\",\n", " \"val_data_src\": r\"D:\\stdML\\Py310\\Adult\\Prepared\\raw\\valid.csv\",\n", " \"train_batch_size\":36,\n", " \"val_batch_size\":36\n", "}" ] }, { "cell_type": "code", "execution_count": null, "id": "06874d40-346f-4913-b02b-a19366fcc800", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "0a111a48-0d3b-4291-bca3-aa060e15a4a7", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }