In [2]:
from typing import List, Optional, Any
import torch
import torch.utils.data
import numpy as np
import random

import datetime

import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

import json

In [3]:
torch.manual_seed(21)
torch.cuda.manual_seed_all(21)
np.random.seed(21)
random.seed(21)
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [4]:
device = torch.device("cuda")

In [5]:
class DatasetNatural2CQL(torch.utils.data.Dataset):
    def __init__(self, path: Optional[str] = None) -> None:
        self.sentence_freq = []
        self.cql2nl = []
        self.nl2cql = []
        self.natural_language_rulebased = []
        self.cql = []
        self.natural_language = []

        if path is not None:
            self.load_tsv(path)

    def add_translation(self, freq: int, cql: str, natural_language_rulebased: str, natural_language: List[str]) -> None:
        cql_index = len(self.sentence_freq)
        self.sentence_freq.append(freq)
        self.cql.append(cql)
        self.natural_language_rulebased.append(natural_language_rulebased)
        self.cql2nl.append([])

        for sentence in natural_language:
            self.nl2cql.append(cql_index)
            self.cql2nl[-1].append(len(self.natural_language))
            self.natural_language.append(sentence)

    def load_tsv(self, path: str) -> None:
        with open(path, "r") as file_data:
            for line in file_data:
                line = line.strip()
                line = line.split("\t")
                texts_json = json.loads(line[4])
                texts_extracted = texts_json["data"][0]["content"][0]["text"]["value"].split("\n")
                self.add_translation(int(line[0]), line[2], line[3], texts_extracted)

    def __len__(self):
        return len(self.nl2cql)

    def __getitem__(self, idx):
        if idx < len(self.nl2cql):
            return self.natural_language[idx], self.cql[self.nl2cql[idx]]
        return None
        

In [6]:
class DatasetNatural2CQLTokenized(DatasetNatural2CQL):
    def __init__(self, tokenizer: Any, path: Optional[str] = None) -> None:
        super().__init__(path)
        self.tokenizer = tokenizer
        self.natural_language_tokenized = []
        self.natural_language_mask = []
        self.cql_tokenized = []
        if len(self) > 0:
            self.tokenize()

    def tokenize(self) -> None:
        for sentence in self.natural_language:
            sentence_tokenized = self.tokenizer.batch_encode_plus(
                ["translate: " + sentence.replace("/", "//")],
                return_tensors="pt",
            )
            self.natural_language_tokenized.append(sentence_tokenized.input_ids.squeeze().to(dtype=torch.long))
            self.natural_language_mask.append(sentence_tokenized.attention_mask.squeeze().to(dtype=torch.long))

        for c in self.cql:
            c_tokenized = self.tokenizer.batch_encode_plus(
                [c],
                return_tensors="pt",
            )
            self.cql_tokenized.append(c_tokenized.input_ids.squeeze().to(dtype=torch.long))

    def apply_padding(self, mx0, mx2) -> None:
        for i in range(len(self.natural_language_tokenized)):
            tmp = self.natural_language_mask[i]
            if len(tmp.shape) == 0:
                tmp = tmp.unsqueeze(0)
            self.natural_language_mask[i] = torch.zeros(mx0)
            self.natural_language_mask[i][:tmp.shape[0]] = tmp
            self.natural_language_mask[i] = self.natural_language_mask[i].to(dtype=torch.long)

            tmp = self.natural_language_tokenized[i]
            if len(tmp.shape) == 0:
                tmp = tmp.unsqueeze(0)
            self.natural_language_tokenized[i] = torch.zeros(mx0)
            self.natural_language_tokenized[i][:tmp.shape[0]] = tmp
            self.natural_language_tokenized[i] = self.natural_language_tokenized[i].to(dtype=torch.long)

        for i in range(len(self.cql_tokenized)):
            tmp = self.cql_tokenized[i]
            if len(tmp.shape) == 0:
                tmp = tmp.unsqueeze(0)
            self.cql_tokenized[i] = torch.zeros(mx2)
            self.cql_tokenized[i][:tmp.shape[0]] = tmp
            self.cql_tokenized[i] = self.cql_tokenized[i].to(dtype=torch.long)

    def __getitem__(self, idx):
        if idx < len(self.nl2cql):
            return self.natural_language_tokenized[idx], self.natural_language_mask[idx], self.cql_tokenized[self.nl2cql[idx]]
        return None
        

In [7]:
class DatasetNatural2CQLTokenizedSplited(DatasetNatural2CQL):
    def __init__(self, tokenizer: Any, path: Optional[str] = None) -> None:
        super().__init__(path)
        self.tokenizer = tokenizer
        self.natural_language_tokenized = []
        self.natural_language_mask = []
        self.cql_tokenized = []
        self.enabled_natural_language = []
        if path is not None:
            self.tokenize()

    def split_on_cql(self, p):
        s = self.cql2nl[:]
        random.shuffle(s)
        sp = int(len(s) * p / 100)
        return s[:sp], s[sp:]
    
    def enable_cql(self, cqls):
        self.enabled_natural_language = []
        for cql in cqls:
            for p in cql:
                self.enabled_natural_language.append(p)

    def tokenize(self) -> None:
        for sentence in self.natural_language:
            sentence_tokenized = self.tokenizer.batch_encode_plus(
                ["translate: " + sentence.replace("/", "//")],
                return_tensors="pt",
            )
            self.natural_language_tokenized.append(sentence_tokenized.input_ids.squeeze().to(dtype=torch.long))
            self.natural_language_mask.append(sentence_tokenized.attention_mask.squeeze().to(dtype=torch.long))

        for c in self.cql:
            c_tokenized = self.tokenizer.batch_encode_plus(
                [c],
                return_tensors="pt",
            )
            self.cql_tokenized.append(c_tokenized.input_ids.squeeze().to(dtype=torch.long))

    def apply_padding(self, mx0, mx2) -> None:
        for i in range(len(self.natural_language_tokenized)):
            tmp = self.natural_language_mask[i]
            if len(tmp.shape) == 0:
                tmp = tmp.unsqueeze(0)
            self.natural_language_mask[i] = torch.zeros(mx0)
            self.natural_language_mask[i][:tmp.shape[0]] = tmp
            self.natural_language_mask[i] = self.natural_language_mask[i].to(dtype=torch.long)

            tmp = self.natural_language_tokenized[i]
            if len(tmp.shape) == 0:
                tmp = tmp.unsqueeze(0)
            self.natural_language_tokenized[i] = torch.zeros(mx0)
            self.natural_language_tokenized[i][:tmp.shape[0]] = tmp
            self.natural_language_tokenized[i] = self.natural_language_tokenized[i].to(dtype=torch.long)

        for i in range(len(self.cql_tokenized)):
            tmp = self.cql_tokenized[i]
            if len(tmp.shape) == 0:
                tmp = tmp.unsqueeze(0)
            self.cql_tokenized[i] = torch.zeros(mx2)
            self.cql_tokenized[i][:tmp.shape[0]] = tmp
            self.cql_tokenized[i] = self.cql_tokenized[i].to(dtype=torch.long)

    def __len__(self):
        return len(self.enabled_natural_language)

    def __getitem__(self, idx):
        if idx < len(self.enabled_natural_language):
            return self.natural_language_tokenized[self.enabled_natural_language[idx]], self.natural_language_mask[self.enabled_natural_language[idx]], self.cql_tokenized[self.nl2cql[self.enabled_natural_language[idx]]]
        return None

In [8]:
model_name = "google-t5/t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
dataset_tokenized = DatasetNatural2CQLTokenizedSplited(tokenizer, "expand_natural_texts_0004.res.tsv")



In [9]:
#train_cqls, other_cqls = dataset_tokenized.split_on_cql(80)
#sp = int(len(other_cqls) * 80 / 100)
#valid_cqls, test_cqls = other_cqls[:sp], other_cqls[sp:]

train_cqls = []
with open("train_ids.json", "r") as file:
    train_cqls = json.load(file)

In [10]:
dataset_tokenized.enable_cql(train_cqls)

In [11]:
print(len(dataset_tokenized))
dataset_tokenized[len(dataset_tokenized)-1]

84984


(tensor([13959,    10,     3,    31,     9,    77,    31,    17,    31,     6,
             3,    31,    29,    32,    31,     6,    42,     3,    31,  2264,
            31,  2348,    57,    80,    42,   192,    73,  7576,  3676, 14145,
             7,    11,     3,  5490,    16,     3,    31,    29,    32,  8052,
            31,     6,     3,    31,    29,    32,  8352,    31,     6,    42,
             3,    31,    29,    32,  6965,    31,     5,     1]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 tensor([  784,  6051, 17592,     9,    77,    31,    17,  9175,    29,    32,
          9175,  2264,   121,   908,  6306,   908,     2,  4347,   357,     2,
          6306,  6051, 17592,    29,    32,  8052,  9175,    29,    32,  8352,
          9175,    29,    32,  6965,   121,   908,     1]))

In [12]:
mx0 = 0
mx2 = 0
for i, rec in enumerate(dataset_tokenized):
    if i >= len(dataset_tokenized):
        break
    if len(rec[0].shape) > 0 and len(rec[0]) > mx0:
        mx0 = rec[0].shape[0]
    if len(rec[2].shape) > 0 and len(rec[2]) > mx2:
        mx2 = rec[2].shape[0]
print("mx0 =", mx0, "mx2 =", mx2)

mx0 = 352 mx2 = 120


In [13]:
dataset_tokenized.apply_padding(mx0, mx2)

In [14]:
print(len(dataset_tokenized))
dataset_tokenized[len(dataset_tokenized)-1]

84984


(tensor([13959,    10,     3,    31,     9,    77,    31,    17,    31,     6,
             3,    31,    29,    32,    31,     6,    42,     3,    31,  2264,
            31,  2348,    57,    80,    42,   192,    73,  7576,  3676, 14145,
             7,    11,     3,  5490,    16,     3,    31,    29,    32,  8052,
            31,     6,     3,    31,    29,    32,  8352,    31,     6,    42,
             3,    31,    29,    32,  6965,    31,     5,     1,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [15]:
# inspired from: https://github.com/Shivanandroy/T5-Finetuning-PyTorch
def train(epoch, tokenizer, model, device, loader, optimizer, log_file, save_prefix):
    epoch_start = datetime.datetime.utcnow()
    model.train()
    for _, data in enumerate(loader, 0):
        y = data[2].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data[0].to(device, dtype=torch.long)
        mask = data[1].to(device, dtype=torch.long)
        
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        time_delta = datetime.datetime.utcnow() - epoch_start
        if _ % 100 == 0:
            remains_seconds = time_delta.seconds / (_ + 0.01) * (len(loader) - _)
            print("time: ", datetime.datetime.utcnow().isoformat(), time_delta.seconds , "sec | ETA:", "%.2f" % (remains_seconds / 3600, ) , "h | epoch: ", str(epoch), "| batch: ", str(_), "/", len(loader), "|", str(loss.item()), file=log_file)
            print("time: ", datetime.datetime.utcnow().isoformat(), time_delta.seconds , "sec | ETA:", "%.2f" % (remains_seconds / 3600, ) , "h | epoch: ", str(epoch), "| batch: ", str(_), "/", len(loader), "|", str(loss.item()))
            log_file.flush()

        if _ % 1000 == 0:
            torch.save(model.state_dict(), save_prefix + str(epoch) + "_" + str(_) + ".pt")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), save_prefix + str(epoch) + "_final.pt")


In [16]:
optimizer = torch.optim.Adam(
    params=model.parameters(), lr=1e-4
)

In [None]:
with open("playground_log", "a") as log_file:
    for i in range(200):
        train(i, tokenizer, model, device, torch.utils.data.DataLoader(dataset_tokenized, batch_size=14, shuffle=True, num_workers=0), optimizer, log_file, "models/google-t5_t5-base/model_train_data_")
        i += 1

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


time:  2025-04-01T21:31:55.792818 1 sec | ETA: 168.64 h | epoch:  0 | batch:  0 / 6071 | 8.359023094177246


In [9]:
model.eval()
sentence_tokenized = tokenizer(
    "translate: Any word followed by lemma door as tag noun and then starting with open within sentence.",
    return_tensors="pt",
)
print(sentence_tokenized.input_ids)
generated_ids = model.generate(
      sentence_tokenized.input_ids.to("cuda")
)
print(generated_ids)

tensor([[13959,    10,  2372,  1448,  2348,    57,    90,   635,     9,  1365,
            38,  7860,   150,   202,    11,   258,  1684,    28,   539,   441,
          7142,     5,     1]])
tensor([[    0,     3, 24519, 18030, 13407,    20,    90,   635,     9,  1365,
           212,  7375,  7375,   267,  4809,     6,   111, 18072,   123,   539,
           441]], device='cuda:0')


In [None]:
tokenizer.convert_ids_to_tokens(generated_ids[0])