[ ]:
# Initialise the PTLF lab from a project settings JSON file.
# NOTE(review): the settings path below is a placeholder and must be replaced
# with the real project file before this cell can run.
from PTLF.lab import lab_setup
lab_setup(settings_path='path/to/your/project/dir/project_name.json')
[ ]:
from PTLF.utils import *
Simple
[ ]:
import pandas as pd
class DS01(DataSet):
    """Tabular dataset that serves the rows of a CSV file as float32 tensors.

    Every column except the last one is returned as the feature vector and the
    last column is returned as the label.
    """

    def __init__(self):
        # Argument template: ``data_src`` is the only key read by ``_setup``.
        self.args = {"data_src": None}

    def _setup(self, args):
        """Read the CSV located at ``args['data_src']`` into ``self.df``."""
        csv_path = args['data_src']
        self.df = pd.read_csv(csv_path)

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``([features], [label])`` for row ``idx``.

        The whole row is converted to one float32 tensor first, then split
        into everything-but-last (features) and last element (label).
        """
        values = torch.tensor(self.df.iloc[idx, :].values, dtype=torch.float32)
        features, target = values[:-1], values[-1]
        return [features], [target]
[37]:
from torch.utils.data import Dataset,DataLoader
import torch
[38]:
# Build the DS01 component from a local CSV and wrap it in a DataLoader (batch size 32).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
dsargs = {
'loc':'DS01',
'args':{
"data_src": r"D:\stdML\Py310\Adult\Prepared\raw2\test.csv"
}
}
ds = load_component(**dsargs, setup=True)
Dloader = DataLoader(ds,batch_size=32)
# Iterator over batches; later cells pull from it with next(dT).
dT = iter(Dloader)
D:\stdML\GitDesk\PTLF2\src\PTLF\utils.py:94: UserWarning: DS01 component is not saved. Make sure to save it in an appropriate location beforeinitiating an experiment, test, or report.
warnings.warn(
[39]:
# Fetch one batch from the DataLoader iterator; later cells index it as dt[0] (inputs) and dt[1] (labels).
dt = next(dT)
[40]:
# Display the label part of the batch (second element of the pair returned by DS01.__getitem__).
dt[1]
[40]:
[tensor([0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])]
[41]:
# Display the shape of the batched feature tensor (first entry of the input list).
dt[0][0].shape
[41]:
torch.Size([32, 14])
[ ]:
[ ]:
import torch.nn as nn
from torch.nn import functional as F
from torchinfo import summary
class SimpleNN(Model):
    """Fully connected network producing one output value per sample.

    Architecture built in ``_setup``: four Linear+ReLU stages
    (14 -> h1_dim -> h2_dim -> 2*h2_dim -> 2*h2_dim), then dropout, then a
    Linear head with a single output unit.
    """

    def __init__(self):
        super().__init__()
        # Argument template listing the keys read by ``_setup``.
        self.args = {"h1_dim": None, "h2_dim": None, 'drop': None}

    def _setup(self, args):
        """Create the layers from ``args['h1_dim']``, ``args['h2_dim']`` and ``args['drop']``."""
        h1_dim = args['h1_dim']
        h2_dim = args['h2_dim']
        drop = args['drop']
        wide_dim = h2_dim * 2

        # (in_features, out_features) of each hidden stage; the input width is fixed at 14.
        stage_dims = [
            (14, h1_dim),
            (h1_dim, h2_dim),
            (h2_dim, wide_dim),
            (wide_dim, wide_dim),
        ]
        stages = []
        for in_features, out_features in stage_dims:
            stages.append(nn.Linear(in_features, out_features))
            stages.append(nn.ReLU())

        self.seq = nn.Sequential(*stages)
        self.dropout = nn.Dropout(p=drop)
        self.final = nn.Linear(wide_dim, 1)

    def forward(self, x):
        """Run the hidden stack, apply dropout, and return the head's output."""
        hidden = self.seq(x)
        return self.final(self.dropout(hidden))
# Instantiate SimpleNN via load_component with concrete layer sizes and dropout rate.
model = load_component(loc='SimpleNN', args={"h1_dim":120, "h2_dim":100, 'drop':0.3}, setup=True)
# model.to('cuda')
# Print a layer-by-layer summary, using the input part of the previously fetched batch (dt[0]).
summary(model=model, input_data=dt[0])
# model(*dt[0]).shape
D:\stdML\GitDesk\PTLF2\src\PTLF\utils.py:94: UserWarning: SimpleNN component is not saved. Make sure to save it in an appropriate location beforeinitiating an experiment, test, or report.
warnings.warn(
==========================================================================================
Layer (type:depth-idx) Output Shape Param #
==========================================================================================
SimpleNN [32, 1] --
├─Sequential: 1-1 [32, 200] --
│ └─Linear: 2-1 [32, 120] 1,800
│ └─ReLU: 2-2 [32, 120] --
│ └─Linear: 2-3 [32, 100] 12,100
│ └─ReLU: 2-4 [32, 100] --
│ └─Linear: 2-5 [32, 200] 20,200
│ └─ReLU: 2-6 [32, 200] --
│ └─Linear: 2-7 [32, 200] 40,200
│ └─ReLU: 2-8 [32, 200] --
├─Dropout: 1-2 [32, 200] --
├─Linear: 1-3 [32, 1] 201
==========================================================================================
Total params: 74,501
Trainable params: 74,501
Non-trainable params: 0
Total mult-adds (M): 2.38
==========================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.16
Params size (MB): 0.30
Estimated Total Size (MB): 0.46
==========================================================================================
[43]:
# Smoke-test the forward pass on one batch; dt[0] is a list of inputs, so it is unpacked.
# NOTE(review): the recorded outputs are very large in magnitude — the input features look unscaled; confirm.
model(*dt[0])
[43]:
tensor([[ -220.3309],
[-3426.8130],
[-1660.9607],
[ -948.7296],
[-1603.6975],
[ -275.6187],
[-1397.0565],
[-1364.8336],
[-1171.5076],
[-1769.9922],
[ -481.8759],
[-1443.5280],
[ -222.4868],
[ -874.3320],
[ -927.0721],
[ -677.9394],
[ -546.9390],
[ -435.9732],
[ -694.5280],
[ -785.2214],
[-1834.9946],
[ -287.0668],
[-2317.0957],
[-1651.9097],
[ -170.4194],
[ -214.1748],
[-5181.8262],
[ -711.2175],
[-1923.5010],
[ -833.2474],
[ -293.7393],
[ -859.3155]], grad_fn=<AddmmBackward0>)
[ ]:
#Optimizer
import torch.optim as optim
class OptAdam(Optimizer):
    """Optimizer component that wraps ``torch.optim.Adam``."""

    def __init__(self):
        super().__init__()

    def _setup(self, args):
        """Create the Adam optimizer.

        Reads ``args['model_parameters']`` (required) and
        ``args['learning_rate']`` (optional, defaults to 0.001).
        """
        lr = args.get('learning_rate', 0.001)
        params = args['model_parameters']
        self.optimizer = optim.Adam(params, lr=lr)

    def step(self, **kwargs):
        """Apply one optimizer update; any keyword arguments are ignored."""
        self.optimizer.step()

    def zero_grad(self):
        """Reset the gradients tracked by the wrapped optimizer."""
        self.optimizer.zero_grad()
[ ]:
#Metrics
import torch
from PyTorchLabFlow.utils import Metric
from torchmetrics.classification import BinaryAccuracy
class BinAcc(Metric):
    """Binary accuracy computed from raw model logits.

    Wraps ``torchmetrics.classification.BinaryAccuracy``.
    """

    def __init__(self):
        super().__init__()
        self.accuracy = BinaryAccuracy()

    def setup(self, args):
        """Create a fresh ``BinaryAccuracy`` when ``check_args`` accepts ``args``.

        Returns:
            ``self`` when the arguments pass ``check_args``.
        """
        # NOTE(review): when check_args fails this falls through and returns
        # None — confirm that is the intended contract.
        if self.check_args(args):
            self.accuracy = BinaryAccuracy()
            return self

    def forward(self, y_pred, y_true):
        """Return the accuracy of one batch as a Python float.

        Args:
            y_pred: raw logits from the model; reshaped to match the labels.
            y_true: list whose first element is the label tensor.
        """
        target = y_true[0]
        logits = y_pred.view_as(target)
        # Convert to probabilities explicitly. BinaryAccuracy only treats its
        # input as logits when some value lies outside [0, 1]; a batch whose
        # logits all happen to fall inside that range would otherwise be
        # thresholded at 0.5 as if it already contained probabilities.
        probabilities = torch.sigmoid(logits)
        return self.accuracy(probabilities, target).item()
import torch.nn as nn
from sklearn.metrics import roc_auc_score
class AUROC(Metric):
    """Area under the ROC curve, computed with ``sklearn.metrics.roc_auc_score``."""

    def __init__(self):
        super().__init__()

    def setup(self, args):
        """Configure the metric when ``check_args`` accepts ``args``; returns ``self`` in that case."""
        # NOTE(review): when check_args fails this returns None — confirm intended.
        if self.check_args(args):
            # This attribute is not read by ``forward``.
            self.accuracy = BinaryAccuracy()
            return self

    def forward(self, outputs, targets):
        """Compute AUROC for one batch.

        Args:
            outputs: raw logits; a single column is treated as binary
                (sigmoid), several columns as multi-class (softmax).
            targets: list whose first element is the label tensor.
        """
        labels = targets[0]
        is_binary = outputs.size(1) == 1

        if is_binary:
            scores = torch.sigmoid(outputs)
        else:
            scores = torch.softmax(outputs, dim=1)
        scores = scores.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()

        if is_binary:
            return roc_auc_score(labels, scores)
        # Multi-class: macro-averaged one-vs-rest AUROC.
        return roc_auc_score(labels, scores, average='macro', multi_class='ovr')
from sklearn.metrics import average_precision_score
class AUPRC(Metric):
    """Area under the precision-recall curve (average precision).

    Computed with ``sklearn.metrics.average_precision_score``.
    """

    def __init__(self):
        super().__init__()

    def setup(self, args):
        """Configure the metric when ``check_args`` accepts ``args``; returns ``self`` in that case."""
        # NOTE(review): when check_args fails this returns None — confirm intended.
        if self.check_args(args):
            # This attribute is not read by ``forward``.
            self.accuracy = BinaryAccuracy()
            return self

    def forward(self, outputs, targets):
        """Compute average precision for one batch.

        Args:
            outputs: raw logits; a single column is treated as binary
                (sigmoid), several columns as multi-class (softmax).
            targets: list whose first element is the label tensor. In the
                multi-class case it must hold integer class indices.
        """
        targets = targets[0]

        # Binary classification: one logit per sample.
        if outputs.size(1) == 1:
            probabilities = torch.sigmoid(outputs).detach().cpu().numpy()
            targets = targets.detach().cpu().numpy()
            return average_precision_score(targets, probabilities)

        # Multi-class classification. average_precision_score has no
        # ``multi_class`` parameter (passing one raises TypeError), so the
        # class indices are one-hot encoded and scored per class, then
        # macro-averaged.
        num_classes = outputs.size(1)
        probabilities = torch.softmax(outputs, dim=1).detach().cpu().numpy()
        class_indices = targets.detach().cpu().long().view(-1)
        one_hot_targets = torch.nn.functional.one_hot(
            class_indices, num_classes=num_classes
        ).numpy()
        return average_precision_score(one_hot_targets, probabilities, average='macro')
from sklearn.metrics import f1_score
class F1Score(Metric):
    """F1 score computed with ``sklearn.metrics.f1_score`` from raw logits."""

    def __init__(self):
        super().__init__()

    def setup(self, args):
        """Configure the metric when ``check_args`` accepts ``args``; returns ``self`` in that case."""
        # NOTE(review): when check_args fails this returns None — confirm intended.
        if self.check_args(args):
            # This attribute is not read by ``forward``.
            self.accuracy = BinaryAccuracy()
            return self

    def forward(self, outputs, targets):
        """Compute the F1 score for one batch.

        Args:
            outputs: raw logits; a single column is treated as binary
                (sigmoid, threshold 0.5), several columns as multi-class
                (softmax, arg-max).
            targets: list whose first element is the label tensor.
        """
        labels = targets[0]

        if outputs.size(1) == 1:
            # Binary: probability strictly above 0.5 counts as the positive class.
            probs = torch.sigmoid(outputs).detach().cpu().numpy()
            predicted = (probs > 0.5).astype(int)
            labels = labels.detach().cpu().numpy()
            return f1_score(labels, predicted)

        # Multi-class: pick the most probable class, macro-average across classes.
        probs = torch.softmax(outputs, dim=1).detach().cpu().numpy()
        predicted = probs.argmax(axis=1)
        labels = labels.detach().cpu().numpy()
        return f1_score(labels, predicted, average='macro')
[ ]:
#Loss
from torch import nn
from PyTorchLabFlow.utils import Loss
class BCElogit(Loss):
    """Binary cross-entropy loss on raw logits (wraps ``nn.BCEWithLogitsLoss``)."""

    def __init__(self):
        super().__init__()
        # No configurable arguments.
        self.args = {}

    def _setup(self, args):
        """Create the underlying criterion; ``args`` is not read."""
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self, logits, y_true):
        """Return the loss for one batch.

        Args:
            logits: raw model outputs; reshaped to match the label tensor.
            y_true: list whose first element is the label tensor.
        """
        # The former debug print accessed ``y_true.shape`` before indexing;
        # ``y_true`` is a list here, so that line raised AttributeError.
        target = y_true[0]
        logits = logits.view_as(target)
        return self.criterion(logits, target.float())
[ ]:
# Experiment configuration for the simple (pre-encoded features) pipeline.
# Each component is referenced by class name ('loc') plus its setup arguments.
expargs = {
    'dataset': {
        'loc': 'DS01',
        # NOTE(review): DS01 reads 'data_src'; presumably it is filled from
        # train_data_src / val_data_src below — confirm.
        'args': {}
    },
    'model': {
        'loc': 'SimpleNN',
        'args': {"h1_dim": 120, "h2_dim": 1000, 'drop': 0.3}
    },
    "loss": {
        'loc': "BCElogit",
        'args': {},
    },
    'optimizer': {
        'loc': 'OptAdam',
        'args': {}
    },
    "metrics": {
        "accuracy": {
            'loc': "BinAcc",
            'args': {}
        },
        "auroc": {
            'loc': "AUROC",
            'args': {}
        },
        # The two entries below were previously swapped, so F1 values were
        # reported under "auprc" and average precision under "f1score".
        "f1score": {
            'loc': "F1Score",
            'args': {}
        },
        "auprc": {
            'loc': "AUPRC",
            'args': {}
        }
    },
    "train_data_src": r"D:\stdML\Py310\Adult\Prepared\raw2\train.csv",
    "val_data_src": r"D:\stdML\Py310\Adult\Prepared\raw2\valid.csv",
    "train_batch_size": 36,
    "val_batch_size": 36
}
[ ]:
Embedding
[ ]:
import pandas as pd
import torch
class DS02(DataSet):
    """Dataset yielding integer-encoded categorical features, float continuous
    features and a binary label for each row of a CSV file.

    Rows containing the placeholder ``'?'`` (or any missing value) are dropped.
    Items are returned as ``([cat_tensor, cont_tensor], [label_tensor])``.
    """

    def __init__(self):
        # Argument template: ``data_src`` is the only key read by ``_setup``.
        self.args = {"data_src": None}

    @staticmethod
    def _encode_column(series, mapping, col_name):
        """Translate one column to integer codes, failing loudly on unknown values.

        Values that are keys of ``mapping`` are translated; values that are
        already valid codes pass through unchanged. Anything else raises
        ``ValueError`` instead of silently reaching the tensor conversion.
        """
        encoded = series.map(lambda value: mapping.get(value, value))
        valid_codes = set(mapping.values())
        unknown = sorted({repr(v) for v in encoded.unique() if v not in valid_codes})
        if unknown:
            raise ValueError(
                f"Column '{col_name}' contains values with no encoding: {', '.join(unknown)}"
            )
        # Explicit cast avoids pandas' deprecated silent downcasting in ``replace``.
        return encoded.astype('int64')

    def _setup(self, args):
        """Load the CSV at ``args['data_src']``, clean it, encode it and cache tensors.

        Raises:
            ValueError: if a categorical or label column holds a value that
                has no entry in its encoding table.
        """
        raw_df = pd.read_csv(args['data_src'])
        # '?' marks a missing entry; drop every row that has one.
        self.df = raw_df.replace('?', pd.NA).dropna()

        # Define categorical and continuous columns
        self.cat_cols = [
            'workclass', 'education', 'marital_status', 'relationship', 'race',
            'occupation', 'native_country'
        ]
        self.cont_cols = [
            'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week'
        ]
        self.label_col = 'income'

        # Define mappings for categorical columns (ensure this matches your earlier mappings)
        self.label_encoders = {
            'workclass': {
                'Private': 0, 'Local-gov': 1, 'Self-emp-not-inc': 2, 'Federal-gov': 3,
                'State-gov': 4, 'Self-emp-inc': 5, 'Without-pay': 6, 'Never-worked': 7
            },
            'education': {
                '11th': 0, 'HS-grad': 1, 'Assoc-acdm': 2, 'Some-college': 3, '10th': 4,
                'Prof-school': 5, '7th-8th': 6, 'Bachelors': 7, 'Masters': 8, '5th-6th': 9,
                'Assoc-voc': 10, '9th': 11, 'Doctorate': 12, '12th': 13, '1st-4th': 14, 'Preschool': 15
            },
            'marital_status': {
                'Never-married': 0, 'Married-civ-spouse': 1, 'Widowed': 2,
                'Divorced': 3, 'Separated': 4, 'Married-spouse-absent': 5, 'Married-AF-spouse': 6
            },
            'relationship': {
                'Own-child': 0, 'Husband': 1, 'Not-in-family': 2,
                'Unmarried': 3, 'Wife': 4, 'Other-relative': 5
            },
            'race': {
                'Black': 0, 'White': 1, 'Other': 2, 'Amer-Indian-Eskimo': 3, 'Asian-Pac-Islander': 4
            },
            'occupation': {
                'Machine-op-inspct': 0, 'Farming-fishing': 1, 'Protective-serv': 2,
                'Other-service': 3, 'Prof-specialty': 4, 'Craft-repair': 5,
                'Adm-clerical': 6, 'Exec-managerial': 7, 'Tech-support': 8,
                'Sales': 9, 'Priv-house-serv': 10, 'Transport-moving': 11,
                'Handlers-cleaners': 12, 'Armed-Forces': 13
            },
            'native_country': {
                'United-States': 0, 'Peru': 1, 'Guatemala': 2, 'Mexico': 3, 'Dominican-Republic': 4,
                'Ireland': 5, 'Germany': 6, 'Philippines': 7, 'Thailand': 8, 'Haiti': 9, 'El-Salvador': 10,
                'Puerto-Rico': 11, 'Vietnam': 12, 'South': 13, 'Columbia': 14, 'Japan': 15, 'India': 16,
                'Cambodia': 17, 'Poland': 18, 'Laos': 19, 'England': 20, 'Cuba': 21, 'Taiwan': 22,
                'Italy': 23, 'Canada': 24, 'Portugal': 25, 'China': 26, 'Nicaragua': 27, 'Honduras': 28,
                'Iran': 29, 'Scotland': 30, 'Jamaica': 31, 'Ecuador': 32, 'Yugoslavia': 33, 'Hungary': 34,
                'Hong': 35, 'Greece': 36, 'Trinadad&Tobago': 37, 'Outlying-US(Guam-USVI-etc)': 38,
                'France': 39, 'Holand-Netherlands': 40
            }
        }

        # Encode categorical variables
        for col, mapping in self.label_encoders.items():
            self.df[col] = self._encode_column(self.df[col], mapping, col)

        # Encode label column
        self.df[self.label_col] = self._encode_column(
            self.df[self.label_col], {'<=50K': 0, '>50K': 1}, self.label_col
        )

        # Convert everything to torch tensors once, so __getitem__ is a cheap index.
        self.cat_data = torch.tensor(self.df[self.cat_cols].values, dtype=torch.long)
        self.cont_data = torch.tensor(self.df[self.cont_cols].values, dtype=torch.float32)
        # Labels get a trailing dimension of size 1.
        self.labels = torch.tensor(self.df[self.label_col].values, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        """Return the number of rows kept after cleaning."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``([categorical, continuous], [label])`` tensors for row ``idx``."""
        label = self.labels[idx]
        return [self.cat_data[idx], self.cont_data[idx]], [label]
[57]:
from torch.utils.data import Dataset,DataLoader
import torch
[58]:
# Build the DS02 component from a local CSV and wrap it in a DataLoader (batch size 32).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
dsargs = {
'loc':'DS02',
'args':{
"data_src": "D:/stdML/Py310/Adult/Prepared/raw/test.csv"
}
}
ds = load_component(**dsargs, setup=True)
Dloader = DataLoader(ds,batch_size=32)
# Iterator over batches; later cells pull from it with next(dT).
dT = iter(Dloader)
D:\stdML\GitDesk\PTLF2\src\PTLF\utils.py:94: UserWarning: DS02 component is not saved. Make sure to save it in an appropriate location beforeinitiating an experiment, test, or report.
warnings.warn(
C:\Users\BBEK-Anand\AppData\Local\Temp\ipykernel_2920\854477436.py:64: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
self.df[col] = self.df[col].replace(mapping)
C:\Users\BBEK-Anand\AppData\Local\Temp\ipykernel_2920\854477436.py:67: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
self.df[self.label_col] = self.df[self.label_col].replace({'<=50K': 0, '>50K': 1})
[59]:
# Fetch one batch from the DS02 DataLoader iterator; dt[0] holds the inputs, dt[1] the labels.
dt = next(dT)
[60]:
# Display the continuous-feature tensor of the batch (second input returned by DS02.__getitem__).
dt[0][1]
[60]:
tensor([[3.6000e+01, 3.2709e+04, 1.0000e+01, 3.3250e+03, 0.0000e+00, 4.5000e+01],
[1.9000e+01, 4.4381e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 3.0000e+01],
[2.3000e+01, 2.4040e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 1.5000e+01],
[4.9000e+01, 3.3087e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 7.0000e+01],
[3.5000e+01, 3.7646e+05, 1.0000e+01, 1.5024e+04, 0.0000e+00, 6.0000e+01],
[3.4000e+01, 1.9200e+05, 1.4000e+01, 0.0000e+00, 0.0000e+00, 5.5000e+01],
[2.2000e+01, 2.9293e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 1.0000e+01],
[4.3000e+01, 1.7023e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 6.0000e+01],
[3.2000e+01, 3.3154e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],
[4.6000e+01, 2.7377e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],
[3.5000e+01, 1.5231e+05, 7.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],
[4.2000e+01, 1.9512e+05, 4.0000e+00, 0.0000e+00, 0.0000e+00, 3.5000e+01],
[4.0000e+01, 1.2147e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],
[4.6000e+01, 1.6047e+05, 9.0000e+00, 0.0000e+00, 1.5900e+03, 4.3000e+01],
[3.9000e+01, 2.4505e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 5.0000e+01],
[5.3000e+01, 9.6062e+04, 1.0000e+01, 0.0000e+00, 1.7400e+03, 4.0000e+01],
[1.9000e+01, 2.3448e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 7.0000e+00],
[3.0000e+01, 1.5412e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 6.5000e+01],
[1.9000e+01, 1.3043e+05, 3.0000e+00, 0.0000e+00, 0.0000e+00, 3.6000e+01],
[2.2000e+01, 1.2497e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.5000e+01],
[4.8000e+01, 3.2466e+05, 1.3000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01],
[5.0000e+01, 1.5895e+05, 4.0000e+00, 3.4110e+03, 0.0000e+00, 4.0000e+01],
[2.8000e+01, 4.1195e+05, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],
[2.6000e+01, 2.9280e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 2.4000e+01],
[1.9000e+01, 2.8145e+04, 9.0000e+00, 0.0000e+00, 0.0000e+00, 5.2000e+01],
[2.5000e+01, 8.2560e+04, 1.2000e+01, 0.0000e+00, 0.0000e+00, 4.3000e+01],
[5.6000e+01, 6.5956e+05, 8.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],
[2.2000e+01, 1.3850e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 3.0000e+01],
[3.8000e+01, 3.1223e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 5.0000e+01],
[1.9000e+01, 2.5158e+05, 1.0000e+01, 0.0000e+00, 0.0000e+00, 1.4000e+01],
[1.8000e+01, 4.3272e+04, 1.0000e+01, 0.0000e+00, 0.0000e+00, 2.0000e+01],
[3.5000e+01, 1.0371e+05, 1.4000e+01, 0.0000e+00, 0.0000e+00, 4.0000e+01]])
[ ]:
def generate_model_params(dataset):
    """Derive model hyper-parameters from an encoded dataset.

    For every categorical column the number of embedding rows is the largest
    integer code plus one. The largest code is taken from the observed data
    and, when the dataset exposes a ``label_encoders`` mapping for that
    column, from the encoder as well. Relying on the observed data alone
    under-sizes the table whenever the given split happens not to contain the
    highest-coded category, which later causes out-of-range embedding lookups
    on other splits.

    Args:
        dataset: object exposing ``cat_cols`` and ``cont_cols`` (lists of
            column names), ``df`` (DataFrame with integer-encoded categorical
            columns) and optionally ``label_encoders``
            (``{column: {raw_value: code}}``).

    Returns:
        dict with ``embedding_info`` (list of ``(num_categories, emb_dim)``),
        ``continuous_dim``, ``hidden_dim`` (fixed at 64) and ``output_dim``
        (fixed at 1, binary classification).
    """
    encoders = getattr(dataset, 'label_encoders', None) or {}

    embedding_info = []
    for col in dataset.cat_cols:
        max_val = dataset.df[col].max()
        num_categories = int(max_val + 1)  # codes start at 0

        mapping = encoders.get(col)
        if mapping:
            # Cover every category the encoder can emit, not only those observed here.
            num_categories = max(num_categories, int(max(mapping.values())) + 1)

        # Embedding width: roughly half the cardinality, capped at 50.
        emb_dim = min(50, (num_categories + 1) // 2)
        embedding_info.append((num_categories, emb_dim))

    continuous_dim = len(dataset.cont_cols)
    output_dim = 1  # binary classification
    return {
        "embedding_info": embedding_info,
        "continuous_dim": continuous_dim,
        "hidden_dim": 64,
        "output_dim": output_dim
    }
# Example: derive the parameter dict from the DS02 instance loaded above and display it.
params = generate_model_params(ds)
params
{'embedding_info': [(7, 4),
(16, 8),
(7, 4),
(6, 3),
(5, 3),
(14, 7),
(40, 20)],
'continuous_dim': 6,
'hidden_dim': 64,
'output_dim': 1}
[ ]:
import torch.nn as nn
from torch.nn import functional as F
from torchinfo import summary
class SimpleNNe(Model):
    """Network that embeds each categorical column, concatenates the embeddings
    with the continuous features, and maps them to a single output value.
    """

    def __init__(self):
        super().__init__()
        # Argument template listing the keys read by ``_setup``.
        self.args = {"embedding_info": None, "continuous_dim": None, 'hidden_dim': None, 'drop': None}

    def _setup(self, args):
        """Build the embedding tables and the classifier head.

        ``args['embedding_info']`` is a sequence of
        ``(num_categories, emb_dim)`` pairs, one per categorical column.
        """
        embedding_info = args['embedding_info']
        continuous_dim = args['continuous_dim']
        hidden_dim = args['hidden_dim']
        drop = args['drop']

        # One embedding table per categorical column, in column order.
        self.embeddings = nn.ModuleList()
        for n_categories, width in embedding_info:
            self.embeddings.append(nn.Embedding(n_categories, width))

        self.continuous_dim = continuous_dim

        # Width of all embeddings laid side by side.
        total_emb_dim = 0
        for _, width in embedding_info:
            total_emb_dim += width

        head_layers = [
            nn.Linear(total_emb_dim + continuous_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(hidden_dim, 1),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x_cat, x_cont):
        """Embed column ``i`` of ``x_cat`` with table ``i``, append ``x_cont``, and apply the head."""
        embedded_cols = []
        for col_idx, table in enumerate(self.embeddings):
            embedded_cols.append(table(x_cat[:, col_idx]))
        embedded = torch.cat(embedded_cols, dim=1)
        features = torch.cat([embedded, x_cont], dim=1)
        return self.fc(features)
# Hand-written SimpleNNe arguments: (num_categories, emb_dim) per categorical column,
# followed by the number of continuous features, hidden width and dropout rate.
# NOTE(review): several emb_dim values (84, 40, 30, 300, 790) differ from the
# generated ones shown earlier (8, 4, 3, 3, 7) — confirm they are intentional.
args = {'embedding_info': [(7, 4),
(16, 84),
(7, 40),
(6, 30),
(5, 300),
(14, 790),
(41, 21)],
'continuous_dim': 6,
'hidden_dim': 648, 'drop':0.3}
# Instantiate the embedding model via load_component and run its setup.
model = load_component(loc='SimpleNNe', args=args, setup=True)
# model.to('cuda')
# Print a layer-by-layer summary using the inputs of the previously fetched batch (dt[0]).
summary(model=model, input_data=dt[0])
# model(*dt[0])
==========================================================================================
Layer (type:depth-idx) Output Shape Param #
==========================================================================================
SimpleNNe [32, 1] --
├─ModuleList: 1-1 -- --
│ └─Embedding: 2-1 [32, 4] 28
│ └─Embedding: 2-2 [32, 84] 1,344
│ └─Embedding: 2-3 [32, 40] 280
│ └─Embedding: 2-4 [32, 30] 180
│ └─Embedding: 2-5 [32, 300] 1,500
│ └─Embedding: 2-6 [32, 790] 11,060
│ └─Embedding: 2-7 [32, 21] 861
├─Sequential: 1-2 [32, 1] --
│ └─Linear: 2-8 [32, 648] 826,848
│ └─ReLU: 2-9 [32, 648] --
│ └─Dropout: 2-10 [32, 648] --
│ └─Linear: 2-11 [32, 1] 649
==========================================================================================
Total params: 842,750
Trainable params: 842,750
Non-trainable params: 0
Total mult-adds (M): 26.97
==========================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.49
Params size (MB): 3.37
Estimated Total Size (MB): 3.86
==========================================================================================
[ ]:
[ ]:
# Experiment configuration for the embedding pipeline (DS02 + SimpleNNe).
# Each component is referenced by class name ('loc') plus its setup arguments.
expargs = {
    'dataset': {
        'loc': 'DS02',
        # NOTE(review): DS02 reads 'data_src'; presumably it is filled from
        # train_data_src / val_data_src below — confirm.
        'args': {}
    },
    'model': {
        'loc': 'SimpleNNe',
        'args': {
            # (num_categories, emb_dim) per categorical column.
            'embedding_info': [(7, 4),
                               (16, 84),
                               (7, 40),
                               (6, 30),
                               (5, 300),
                               (14, 790),
                               (41, 21)],
            'continuous_dim': 6,
            'hidden_dim': 648, 'drop': 0.3
        }
    },
    "loss": {
        'loc': "BCElogit",
        'args': {},
    },
    'optimizer': {
        'loc': 'OptAdam',
        'args': {}
    },
    "metrics": {
        "accuracy": {
            'loc': "BinAcc",
            'args': {}
        },
        "auroc": {
            'loc': "AUROC",
            'args': {}
        },
        # The two entries below were previously swapped, so F1 values were
        # reported under "auprc" and average precision under "f1score".
        "f1score": {
            'loc': "F1Score",
            'args': {}
        },
        "auprc": {
            'loc': "AUPRC",
            'args': {}
        }
    },
    "train_data_src": r"D:\stdML\Py310\Adult\Prepared\raw\train.csv",
    "val_data_src": r"D:\stdML\Py310\Adult\Prepared\raw\valid.csv",
    "train_batch_size": 36,
    "val_batch_size": 36
}
[ ]:
[ ]: