# Transformers

In [1]:
import pandas as pd
import datasets
from transformers import (
    RobertaTokenizer, 
    TextDataset, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    TrainingArguments, 
    RobertaForMaskedLM,
    EarlyStoppingCallback,
    RobertaForSequenceClassification
)
import evaluate
import numpy as np
from typing import List, Set, Optional, Dict
import re
import torch

In [2]:
# Read the four intermediate CSV splits in one pass; index_col=False keeps
# pandas from using the first column as the index.
unlabeled, train, valid, test = [
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (
        "../../data/intermediate/unlabeled.csv",
        "../../data/intermediate/train.csv",
        "../../data/intermediate/valid.csv",
        "../../data/intermediate/test.csv",
    )
]

## Fine-tune on masked language modeling

In [3]:
from transformers import RobertaTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, RobertaForMaskedLM
from sklearn.model_selection import train_test_split

# Data Preparation
# Drop missing abstracts so the literal string "nan" is never written into the
# language-modelling corpus.
unlabeled_texts = unlabeled['problem_abstract'].dropna().astype(str).tolist()
# A fixed seed makes the train/validation split identical on every re-run.
train_texts, val_texts = train_test_split(unlabeled_texts, test_size=0.1, random_state=42)

# Write one abstract per line. The encoding is pinned to UTF-8 so the files do
# not depend on the platform default encoding when they are read back.
with open('../../data/prepared/lang_modeling_train_texts.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % item for item in train_texts)

with open('../../data/prepared/lang_modeling_val_texts.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % item for item in val_texts)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# TextDataset caches tokenized features on disk and reuses them by default.
# overwrite_cache=True forces re-tokenization so the features always match the
# text files written just above instead of a split from an earlier run.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='../../data/prepared/lang_modeling_train_texts.txt',
    block_size=128,
    overwrite_cache=True
)
val_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='../../data/prepared/lang_modeling_val_texts.txt',
    block_size=128,
    overwrite_cache=True
)

# Randomly masks 15% of the tokens in each batch for the MLM objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Training Arguments
# Evaluate and checkpoint once per epoch; the checkpoint with the lowest
# eval_loss is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="../../models/roberta-retrained",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_dir='../../logs',
    logging_steps=500,
    do_train=True,
    do_eval=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Model Initialization & Training
model = RobertaForMaskedLM.from_pretrained("roberta-base")

# num_train_epochs is only an upper bound: training stops once eval_loss has
# not improved for 2 consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.000000001
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping_callback]
)

trainer.train()




Epoch,Training Loss,Validation Loss
1,No log,3.219258
2,No log,2.944488
3,No log,2.840271
4,3.293600,2.68289
5,3.293600,2.612758
6,3.293600,2.523335
7,3.293600,2.406949
8,2.564000,2.394855
9,2.564000,2.409589
10,2.564000,2.26025


TrainOutput(global_step=1548, training_loss=2.700213089777826, metrics={'train_runtime': 276.4566, 'train_samples_per_second': 373.295, 'train_steps_per_second': 46.662, 'total_flos': 815069874364416.0, 'train_loss': 2.700213089777826, 'epoch': 12.0})

## Fine-tune for text classification

In [9]:
from sklearn.metrics import f1_score

def custom_data_collator(batch: List) -> Dict[str, torch.Tensor]:
    """Collate ``(input_ids, attention_mask, labels)`` samples into a batch dict.

    Args:
        batch: Sequence of samples; positions 0, 1 and 2 of each sample are
            read as the input ids, attention mask and label tensors.

    Returns:
        Dict with keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``,
        each holding the per-sample tensors stacked along a new first dimension.
    """
    # Position in each sample tuple -> keyword name used in the returned dict.
    field_names = ('input_ids', 'attention_mask', 'labels')
    return {
        field: torch.stack([sample[position] for sample in batch])
        for position, field in enumerate(field_names)
    }

# Metrics Function
def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 score for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of the logits over the last axis.

    Returns:
        Dict with the single key ``'f1_micro'``.
    """
    class_scores, gold_labels = eval_pred
    micro_f1 = f1_score(gold_labels, np.argmax(class_scores, axis=-1), average='micro')
    return {'f1_micro': micro_f1}

# Data Preparation
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Missing abstracts are replaced by empty strings before tokenization (the
# tokenizer only accepts strings); fillna returns a copy, so the frames are
# left unmodified.
train_encodings = tokenizer(train['problem_abstract'].fillna('').tolist(), truncation=True, padding=True, max_length=512)
valid_encodings = tokenizer(valid['problem_abstract'].fillna('').tolist(), truncation=True, padding=True, max_length=512)

# The label space covers every team seen in any split, so the ids assigned
# here stay valid for the test set as well.
all_teams = pd.concat([train['team'], valid['team'], test['team']])
unique_teams = all_teams.unique().tolist()
# Dict lookup instead of list.index(): constant-time per row, same ids
# (position of the team in unique_teams).
team_to_id = {team: idx for idx, team in enumerate(unique_teams)}
train_labels = [team_to_id[team] for team in train['team']]
valid_labels = [team_to_id[team] for team in valid['team']]

# Convert encoding to tensors
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])
train_labels_tensor = torch.tensor(train_labels)

valid_input_ids = torch.tensor(valid_encodings['input_ids'])
valid_attention_mask = torch.tensor(valid_encodings['attention_mask'])
valid_labels_tensor = torch.tensor(valid_labels)

# Create TensorDataset
# Each sample is an (input_ids, attention_mask, label) tuple, which is the
# layout custom_data_collator expects.
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels_tensor)
valid_dataset = torch.utils.data.TensorDataset(valid_input_ids, valid_attention_mask, valid_labels_tensor)

# Model Initialization & Training
num_labels = len(unique_teams)  # The number of unique teams
# NOTE(review): this path pins the checkpoint written at global step 1548,
# which is the final step of the language-modelling run logged in this
# notebook. With early stopping the final checkpoint is generally not the one
# with the best eval loss, and the step number changes between runs - confirm
# this is the intended starting point.
model = RobertaForSequenceClassification.from_pretrained("./../../models/roberta-retrained/checkpoint-1548", num_labels=num_labels)

# Training Arguments
# Evaluate and checkpoint once per epoch; the checkpoint with the highest
# f1_micro on the validation set is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="../../models/roberta-classifier",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    do_train=True,
    do_eval=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    greater_is_better=True,  # f1_micro is a score, so higher is better
)

# num_train_epochs is only an upper bound: training stops once f1_micro has
# not improved for 3 consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.000000001
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    data_collator=custom_data_collator,
    callbacks=[early_stopping_callback]
)

trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./../../models/roberta-retrained/checkpoint-1548 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Micro
1,No log,1.665523,0.626932
2,1.930400,1.501243,0.663355
3,1.204100,1.441638,0.665563
4,1.204100,1.453268,0.704194
5,0.797200,1.558207,0.681015
6,0.569200,1.47033,0.708609
7,0.569200,1.61533,0.693157
8,0.390500,1.709719,0.706402
9,0.274800,1.740286,0.715232
10,0.274800,1.761961,0.720751


TrainOutput(global_step=7140, training_loss=0.4047142105610097, metrics={'train_runtime': 938.5825, 'train_samples_per_second': 2894.791, 'train_steps_per_second': 362.248, 'total_flos': 2816353059968448.0, 'train_loss': 0.4047142105610097, 'epoch': 21.0})

In [10]:
# Replace missing abstracts with empty strings. Plain assignment is used
# because fillna(..., inplace=True) on a column selection acts on a temporary
# object and is not guaranteed to update the DataFrame.
test['problem_abstract'] = test['problem_abstract'].fillna('')
test_encodings = tokenizer(test['problem_abstract'].tolist(), truncation=True, padding=True, max_length=512)
# Map each team to its position in unique_teams with a dict lookup rather
# than a linear list.index() scan per row.
team_to_id = {team: idx for idx, team in enumerate(unique_teams)}
test_labels = [team_to_id[team] for team in test['team']]

test_input_ids = torch.tensor(test_encodings['input_ids'])
test_attention_mask = torch.tensor(test_encodings['attention_mask'])
test_labels_tensor = torch.tensor(test_labels)

test_dataset = torch.utils.data.TensorDataset(test_input_ids, test_attention_mask, test_labels_tensor)

# Pass the test set explicitly instead of overwriting trainer.eval_dataset, so
# the trainer keeps using the validation set for any later training or
# model selection. The default metric prefix keeps the key 'eval_f1_micro'.
results = trainer.evaluate(eval_dataset=test_dataset)
print(f"F1-micro Score on Test Set: {results['eval_f1_micro']}")

F1-micro Score on Test Set: 0.7516556291390728
