# Transformers

In [1]:
import pandas as pd
import datasets
from transformers import (
    RobertaTokenizer, 
    TextDataset, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    TrainingArguments, 
    RobertaForMaskedLM,
    EarlyStoppingCallback,
    DataCollator,
    RobertaForSequenceClassification
)
from sklearn.metrics import mean_squared_error
import torch
import numpy as np
import evaluate
import numpy as np
from typing import List, Set, Optional, Dict
import re

In [2]:
def _load_split(csv_path):
    """Read one intermediate CSV split into a DataFrame.

    ``index_col=False`` tells pandas not to use the first column as the index.
    """
    return pd.read_csv(csv_path, index_col=False)


# One frame per split; the paths are relative to the notebook's working directory.
unlabeled = _load_split("../../data/intermediate/unlabeled.csv")
train = _load_split("../../data/intermediate/train.csv")
valid = _load_split("../../data/intermediate/valid.csv")
test = _load_split("../../data/intermediate/test.csv")

In [3]:


def custom_data_collator(batch: List) -> Dict[str, torch.Tensor]:
    """Collate positional ``(input_ids, attention_mask, label)`` samples.

    Every sample in ``batch`` is indexed by position: element 0 feeds
    ``'input_ids'``, element 1 feeds ``'attention_mask'`` and element 2 feeds
    ``'labels'``.

    Args:
        batch: Samples that can each be indexed at 0, 1 and 2, yielding
            tensors whose shapes agree across samples for a given position.

    Returns:
        A dict with the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``; each value is the corresponding per-sample tensors
        stacked along a new leading dimension.
    """
    field_names = ('input_ids', 'attention_mask', 'labels')
    collated = {}
    for position, field in enumerate(field_names):
        # torch.stack adds the batch dimension in front of each sample tensor.
        collated[field] = torch.stack([sample[position] for sample in batch])
    return collated


# Metrics Function
def compute_metrics(eval_pred):
    """Compute the root-mean-squared error of a regression evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Both members are converted to
            NumPy arrays and flattened to 1-D before being compared, so a
            ``(n, 1)`` logits array lines up with ``(n,)`` labels.

    Returns:
        ``{'rmse': value}`` where ``value`` is a Python float.

    Raises:
        ValueError: If there are no samples, or if the flattened predictions
            and labels do not contain the same number of elements.
    """
    logits, labels = eval_pred
    # reshape(-1) instead of squeeze(): with a single evaluation sample,
    # squeeze() collapses the array to a 0-d scalar and the metric call fails.
    predictions = np.asarray(logits, dtype=np.float64).reshape(-1)
    targets = np.asarray(labels, dtype=np.float64).reshape(-1)
    if predictions.size == 0:
        raise ValueError("compute_metrics received no samples")
    if predictions.shape != targets.shape:
        raise ValueError(
            f"Got {predictions.size} predictions but {targets.size} labels"
        )
    # RMSE is computed directly: mean_squared_error(..., squared=False) is
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(np.mean((predictions - targets) ** 2)))
    return {'rmse': rmse}

# Data Preparation
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def _tokenize_abstracts(frame):
    """Tokenize the ``problem_abstract`` column of ``frame``.

    Missing abstracts are replaced with ``''`` first (the same treatment the
    test split receives) so the tokenizer is only ever handed strings.
    Sequences are truncated to 512 tokens; ``padding=True`` pads every
    sequence to the longest one in this call.
    """
    abstracts = frame['problem_abstract'].fillna('').tolist()
    return tokenizer(abstracts, truncation=True, padding=True, max_length=512)


def _build_tensors(encodings, labels, split_name):
    """Turn tokenizer output and labels into tensors plus a ``TensorDataset``.

    Args:
        encodings: Tokenizer output exposing ``'input_ids'`` and
            ``'attention_mask'``.
        labels: List of regression targets, converted to a float tensor.
        split_name: Label used only in the size-mismatch error message.

    Returns:
        ``(input_ids, attention_mask, labels_tensor, dataset)`` where
        ``dataset`` yields ``(input_ids, attention_mask, label)`` per sample,
        the positional layout ``custom_data_collator`` reads.

    Raises:
        AssertionError: If the three tensors disagree on the number of rows.
    """
    input_ids = torch.tensor(encodings['input_ids'])
    attention_mask = torch.tensor(encodings['attention_mask'])
    labels_tensor = torch.tensor(labels, dtype=torch.float)

    # Ensure that the tensors have the same size in the first dimension
    assert input_ids.size(0) == attention_mask.size(0) == labels_tensor.size(0), (
        f"{split_name}: row counts differ "
        f"({input_ids.size(0)}, {attention_mask.size(0)}, {labels_tensor.size(0)})"
    )

    dataset = torch.utils.data.TensorDataset(input_ids, attention_mask, labels_tensor)
    return input_ids, attention_mask, labels_tensor, dataset


train_encodings = _tokenize_abstracts(train)
valid_encodings = _tokenize_abstracts(valid)

train_labels = train['hours_to_resolve'].tolist()
valid_labels = valid['hours_to_resolve'].tolist()

# Convert encodings to tensors and create the TensorDatasets
(
    train_input_ids,
    train_attention_mask,
    train_labels_tensor,
    train_dataset,
) = _build_tensors(train_encodings, train_labels, "train")
(
    valid_input_ids,
    valid_attention_mask,
    valid_labels_tensor,
    valid_dataset,
) = _build_tensors(valid_encodings, valid_labels, "valid")

# Training Arguments
# All options are collected in one mapping and expanded into TrainingArguments.
training_config = dict(
    # Where checkpoints and logs are written (relative to the notebook).
    output_dir="../../models/roberta-regressor",
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=2,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Upper bound on epochs only; the early-stopping callback attached to the
    # trainer is expected to end training well before this.
    num_train_epochs=1000,
    do_train=True,
    do_eval=True,
    # Evaluate and checkpoint on the same schedule so the best checkpoint can
    # be restored when training finishes.
    evaluation_strategy="epoch",
    save_strategy='epoch',
    load_best_model_at_end=True,
    # "rmse" is the key returned by compute_metrics; lower values are better.
    metric_for_best_model="rmse",
    greater_is_better=False,
)
training_args = TrainingArguments(**training_config)

# Model Initialization & Training
# Start from the locally retrained checkpoint; num_labels=1 gives the head a
# single output, which compute_metrics treats as the predicted value.
model = RobertaForSequenceClassification.from_pretrained(
    "./../../models/roberta-retrained/checkpoint-1548",
    num_labels=1,
)

# Stop once the monitored metric has failed to improve by more than the
# threshold for two consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=1e-9,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=custom_data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)
trainer.add_callback(early_stopping_callback)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./../../models/roberta-retrained/checkpoint-1548 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse
1,No log,8559715.0,2925.699257
2,7768402.432000,8549193.0,2923.900283
3,7760449.536000,8539473.0,2922.237694
4,7760449.536000,8529882.0,2920.596111
5,7893727.744000,8520619.0,2919.009799
6,7742422.528000,8511487.0,2917.445361
7,7742422.528000,8502477.0,2915.900683
8,7802992.640000,8493420.0,2914.347385
9,7504704.000000,8484745.0,2912.858512
10,7504704.000000,8476118.0,2911.377314


TrainOutput(global_step=146880, training_loss=7396986.839215687, metrics={'train_runtime': 9229.7497, 'train_samples_per_second': 294.374, 'train_steps_per_second': 36.837, 'total_flos': 5.790417182760038e+16, 'train_loss': 7396986.839215687, 'epoch': 432.0})

In [4]:
# Data Preparation for the test set
# Replace missing abstracts with '' so the tokenizer only receives strings.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on the selected column: under pandas
# Copy-on-Write that chained in-place call no longer updates `test`.
test['problem_abstract'] = test['problem_abstract'].fillna('')
test_encodings = tokenizer(test['problem_abstract'].tolist(), truncation=True, padding=True, max_length=512)
test_labels = test['hours_to_resolve'].tolist()
test_input_ids = torch.tensor(test_encodings['input_ids'])
test_attention_mask = torch.tensor(test_encodings['attention_mask'])
test_labels_tensor = torch.tensor(test_labels, dtype=torch.float)
# Same positional (input_ids, attention_mask, label) layout as the training datasets.
test_dataset = torch.utils.data.TensorDataset(test_input_ids, test_attention_mask, test_labels_tensor)

# Evaluate the model on the test split.
# The dataset is passed explicitly instead of overwriting trainer.eval_dataset,
# so the trainer keeps its validation set and a later trainer.train() cannot
# early-stop on test data.
results = trainer.evaluate(eval_dataset=test_dataset)

# Print the RMSE
print(f"RMSE on Test Set: {results['eval_rmse']}")


RMSE on Test Set: 2412.7506847692334
