In [1]:
import datetime
import json
import os
import random
from math import inf
from typing import List, Optional, Any

import numpy as np
import torch
import torch.utils.data
import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

import cql_checker
from cqlcmp import cqlcmp

In [2]:
# Seed every random number generator this notebook touches (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value so that the shuffled
# train/valid/test split is repeatable across fresh kernels.
SEED = 21
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# The cuDNN switches below are left disabled on purpose; re-enable them only if
# bit-for-bit reproducible GPU kernels are required.
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [3]:
# Handle for the CUDA device. NOTE(review): the cells visible in this notebook
# pass the literal "cuda" instead of this variable — confirm whether it is still needed.
device = torch.device("cuda")

In [4]:
class DatasetNatural2CQL(torch.utils.data.Dataset):
    """Parallel corpus of natural-language sentences and the CQL queries they describe.

    One CQL query can have several natural-language paraphrases. The dataset is
    indexed by paraphrase: item ``i`` is ``(sentence_i, cql_of_sentence_i)``.

    Per-query lists (one entry per call to :meth:`add_translation`):
        sentence_freq: the ``freq`` value given for the query.
        cql: the CQL query string.
        natural_language_rulebased: the rule-based description of the query.
        cql2nl: indices into ``natural_language`` of the query's sentences.

    Per-sentence lists:
        natural_language: the sentence text.
        nl2cql: index into ``cql`` of the query the sentence belongs to.
    """

    def __init__(self, path: Optional[str] = None) -> None:
        """Create an empty dataset and, if ``path`` is given, load it as TSV."""
        self.sentence_freq = []
        self.cql2nl = []
        self.nl2cql = []
        self.natural_language_rulebased = []
        self.cql = []
        self.natural_language = []

        if path is not None:
            self.load_tsv(path)

    def add_translation(self, freq: int, cql: str, natural_language_rulebased: str, natural_language: List[str]) -> None:
        """Register one CQL query together with all of its natural-language sentences."""
        cql_index = len(self.sentence_freq)
        self.sentence_freq.append(freq)
        self.cql.append(cql)
        self.natural_language_rulebased.append(natural_language_rulebased)

        sentence_indices = []
        self.cql2nl.append(sentence_indices)
        for sentence in natural_language:
            self.nl2cql.append(cql_index)
            sentence_indices.append(len(self.natural_language))
            self.natural_language.append(sentence)

    def load_tsv(self, path: str) -> None:
        """Append every record of a tab-separated file to the dataset.

        Columns read per line: 0 = frequency (int), 2 = CQL query,
        3 = rule-based description, 4 = JSON document whose
        ``data[0].content[0].text.value`` holds the sentences, one per line.
        Column 1 is not used. Blank lines in the file are skipped.

        Raises:
            IndexError, KeyError, ValueError: if a non-blank line does not have
                the layout described above.
        """
        # Explicit encoding so the result does not depend on the platform locale.
        with open(path, "r", encoding="utf-8") as file_data:
            for line in file_data:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line carries no record.
                    continue
                fields = line.split("\t")
                texts_json = json.loads(fields[4])
                # Note: empty lines inside the value become empty sentences.
                texts_extracted = texts_json["data"][0]["content"][0]["text"]["value"].split("\n")
                self.add_translation(int(fields[0]), fields[2], fields[3], texts_extracted)

    def __len__(self):
        """Number of natural-language sentences."""
        return len(self.nl2cql)

    def __getitem__(self, idx):
        """Return ``(sentence, cql)`` for sentence ``idx``.

        Raises:
            IndexError: if ``idx`` is out of range. Raising (rather than
                returning ``None``) is what lets ``for item in dataset``
                terminate.
        """
        return self.natural_language[idx], self.cql[self.nl2cql[idx]]
        

In [5]:
class DatasetNatural2CQLTokenized(DatasetNatural2CQL):
    """Tokenized view of :class:`DatasetNatural2CQL`.

    Item ``i`` is ``(input_ids, attention_mask, cql_ids)`` for sentence ``i``,
    where ``cql_ids`` are the token ids of the query the sentence belongs to.

    Attributes:
        tokenizer: object providing ``batch_encode_plus`` (a T5 tokenizer in this notebook).
        natural_language_tokenized: one id tensor per sentence.
        natural_language_mask: one attention-mask tensor per sentence.
        cql_tokenized: one id tensor per CQL query.
    """

    def __init__(self, tokenizer: Any, path: Optional[str] = None) -> None:
        """Load ``path`` (if given) and tokenize whatever was loaded."""
        super().__init__(path)
        self.tokenizer = tokenizer
        self.natural_language_tokenized = []
        self.natural_language_mask = []
        self.cql_tokenized = []
        if len(self) > 0:
            self.tokenize()

    def tokenize(self) -> None:
        """(Re)build the token tensors for all sentences and all CQL queries.

        Sentences are prefixed with ``"translate: "`` and every ``/`` is doubled
        before encoding; CQL queries are encoded unchanged. The lists are rebuilt
        from scratch so calling this again (e.g. after adding translations) keeps
        them aligned with ``natural_language`` / ``cql``.
        """
        self.natural_language_tokenized = []
        self.natural_language_mask = []
        self.cql_tokenized = []

        for sentence in self.natural_language:
            sentence_tokenized = self.tokenizer.batch_encode_plus(
                ["translate: " + sentence.replace("/", "//")],
                return_tensors="pt",
            )
            self.natural_language_tokenized.append(sentence_tokenized.input_ids.squeeze().to(dtype=torch.long))
            self.natural_language_mask.append(sentence_tokenized.attention_mask.squeeze().to(dtype=torch.long))

        for c in self.cql:
            c_tokenized = self.tokenizer.batch_encode_plus(
                [c],
                return_tensors="pt",
            )
            self.cql_tokenized.append(c_tokenized.input_ids.squeeze().to(dtype=torch.long))

    @staticmethod
    def _pad_to(sequence: torch.Tensor, length) -> torch.Tensor:
        """Return ``sequence`` right-padded with zeros to ``length`` as a 1-D long tensor."""
        if sequence.dim() == 0:
            # squeeze() in tokenize() turns a single-token sequence into a scalar.
            sequence = sequence.unsqueeze(0)
        # Allocate as long directly so ids never pass through float32.
        padded = torch.zeros(length, dtype=torch.long)
        padded[:sequence.shape[0]] = sequence
        return padded

    def apply_padding(self, mx0, mx2) -> None:
        """Zero-pad sentence ids and masks to length ``mx0`` and CQL ids to length ``mx2``.

        Raises:
            RuntimeError: (from torch) if a sequence is longer than its target length.
        """
        for i in range(len(self.natural_language_tokenized)):
            self.natural_language_mask[i] = self._pad_to(self.natural_language_mask[i], mx0)
            self.natural_language_tokenized[i] = self._pad_to(self.natural_language_tokenized[i], mx0)

        for i in range(len(self.cql_tokenized)):
            self.cql_tokenized[i] = self._pad_to(self.cql_tokenized[i], mx2)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, cql_ids)`` for sentence ``idx``.

        Raises:
            IndexError: if ``idx`` is out of range (instead of returning ``None``).
        """
        return (
            self.natural_language_tokenized[idx],
            self.natural_language_mask[idx],
            self.cql_tokenized[self.nl2cql[idx]],
        )
        

In [6]:
class DatasetNatural2CQLTokenizedSplited(DatasetNatural2CQL):
    """Tokenized dataset whose visible items are restricted to a chosen set of CQL queries.

    The data is split by CQL query (not by sentence), so all paraphrases of a
    query end up in the same subset. After :meth:`enable_cql`, ``len()`` and
    indexing only cover the sentences of the enabled queries; until then the
    dataset appears empty.

    Item ``i`` is ``(input_ids, attention_mask, cql_ids, cql_string)``.
    """

    def __init__(self, tokenizer: Any, path: Optional[str] = None) -> None:
        """Load ``path`` (if given) and tokenize it; no query is enabled initially."""
        super().__init__(path)
        self.tokenizer = tokenizer
        self.natural_language_tokenized = []
        self.natural_language_mask = []
        self.cql_tokenized = []
        # Indices into ``natural_language`` that are currently visible.
        self.enabled_natural_language = []
        if path is not None:
            self.tokenize()

    def split_on_cql(self, p):
        """Shuffle the queries and split them ``p`` percent / the rest.

        Uses the global ``random`` module, so the result depends on its seed.

        Returns:
            Two lists; each element is the list of sentence indices of one query
            (the format :meth:`enable_cql` expects).
        """
        s = self.cql2nl[:]
        random.shuffle(s)
        sp = int(len(s) * p / 100)
        return s[:sp], s[sp:]

    def enable_cql(self, cqls):
        """Make exactly the sentences listed in ``cqls`` visible (replaces any previous selection)."""
        self.enabled_natural_language = [p for cql in cqls for p in cql]

    def tokenize(self) -> None:
        """(Re)build the token tensors for all sentences and all CQL queries.

        Sentences are prefixed with ``"translate: "`` and every ``/`` is doubled
        before encoding; CQL queries are encoded unchanged. The lists are rebuilt
        from scratch so a repeated call keeps them aligned with the source lists.
        """
        self.natural_language_tokenized = []
        self.natural_language_mask = []
        self.cql_tokenized = []

        for sentence in self.natural_language:
            sentence_tokenized = self.tokenizer.batch_encode_plus(
                ["translate: " + sentence.replace("/", "//")],
                return_tensors="pt",
            )
            self.natural_language_tokenized.append(sentence_tokenized.input_ids.squeeze().to(dtype=torch.long))
            self.natural_language_mask.append(sentence_tokenized.attention_mask.squeeze().to(dtype=torch.long))

        for c in self.cql:
            c_tokenized = self.tokenizer.batch_encode_plus(
                [c],
                return_tensors="pt",
            )
            self.cql_tokenized.append(c_tokenized.input_ids.squeeze().to(dtype=torch.long))

    @staticmethod
    def _pad_to(sequence: torch.Tensor, length) -> torch.Tensor:
        """Return ``sequence`` right-padded with zeros to ``length`` as a 1-D long tensor."""
        if sequence.dim() == 0:
            # squeeze() in tokenize() turns a single-token sequence into a scalar.
            sequence = sequence.unsqueeze(0)
        # Allocate as long directly so ids never pass through float32.
        padded = torch.zeros(length, dtype=torch.long)
        padded[:sequence.shape[0]] = sequence
        return padded

    def apply_padding(self, mx0, mx2) -> None:
        """Zero-pad sentence ids and masks to length ``mx0`` and CQL ids to length ``mx2``.

        Raises:
            RuntimeError: (from torch) if a sequence is longer than its target length.
        """
        for i in range(len(self.natural_language_tokenized)):
            self.natural_language_mask[i] = self._pad_to(self.natural_language_mask[i], mx0)
            self.natural_language_tokenized[i] = self._pad_to(self.natural_language_tokenized[i], mx0)

        for i in range(len(self.cql_tokenized)):
            self.cql_tokenized[i] = self._pad_to(self.cql_tokenized[i], mx2)

    def __len__(self):
        """Number of currently enabled sentences."""
        return len(self.enabled_natural_language)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, cql_ids, cql_string)`` for enabled sentence ``idx``.

        Raises:
            IndexError: if ``idx`` is out of range (instead of returning ``None``).
        """
        sentence_index = self.enabled_natural_language[idx]
        cql_index = self.nl2cql[sentence_index]
        return (
            self.natural_language_tokenized[sentence_index],
            self.natural_language_mask[sentence_index],
            self.cql_tokenized[cql_index],
            self.cql[cql_index],
        )

In [7]:
# Base checkpoint; the fine-tuned weights are loaded on top of it in a later cell.
model_name = "google-t5/t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
# device_map="auto" leaves weight placement to the library; later cells send
# inputs to "cuda", so a CUDA device is expected to be available.
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
# Load and tokenize the whole corpus. The dataset reports length 0 until
# enable_cql() selects which queries are visible.
dataset_tokenized = DatasetNatural2CQLTokenizedSplited(tokenizer, "expand_natural_texts_0004.res.tsv")



In [9]:
# Split by CQL query: 80 % train, then the remaining 20 % is divided again
# 80 / 20 into validation and test (i.e. roughly 80 / 16 / 4 overall).
train_cqls, other_cqls = dataset_tokenized.split_on_cql(80)
sp = int(len(other_cqls) * 80 / 100)
valid_cqls = other_cqls[:sp]
test_cqls = other_cqls[sp:]

In [10]:
# Restrict the dataset to the test queries: from here on len() and indexing
# only cover the sentences that belong to test_cqls.
dataset_tokenized.enable_cql(test_cqls)

In [11]:
# Sanity check: size of the enabled (test) subset and its last item, shown as
# (input ids, attention mask, CQL ids, CQL string).
print(len(dataset_tokenized))
dataset_tokenized[len(dataset_tokenized)-1]

4280


(tensor([13959,    10,    71,   712, 14145,    28,     8,  1448,  6047,     5,
             1]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 tensor([  784,  6051, 17592, 19778,     7,   121,   908,     1]),
 '[word="channels"]')

In [12]:
# Replace the base weights with the fine-tuned checkpoint. weights_only=True
# restricts unpickling to tensor data rather than arbitrary pickled objects.
model.load_state_dict(torch.load("models/google-t5_t5-base/model_train_data_34_0.pt", weights_only=True))

<All keys matched successfully>

In [13]:
class CQLLogitsProcessor(transformers.generation.logits_process.LogitsProcessor):
    """Constrain generation to tokens accepted by a ``cql_checker.CQLChecker``.

    At every step the candidates of row 0 are tried from the highest score
    downwards; the first one the checker accepts gets score ``+inf`` and every
    candidate rejected before it gets ``-inf``.

    Limitations visible in this implementation:
      * only ``scores[0]`` is constrained, so it is meant for a single sequence
        (batch size 1, no beam search);
      * the checker is created once and is fed every accepted token.
        NOTE(review): confirm that the checker resets itself at EOS before one
        processor instance is reused for several ``generate`` calls.
    """

    def __init__(self, id2string, verbose = False):
        """
        Args:
            id2string: sequence mapping a token id to the text of that token.
            verbose: print every accepted / rejected candidate.
        """
        self.id2string = id2string
        self.checker = cql_checker.CQLChecker()
        # Id 1 is the end-of-sequence id of the tokenizer used in this notebook;
        # 0 and 2 are presumably its pad and unknown ids — TODO confirm.
        self.checker.eos_token_id = 1
        self.checker.ignore_tokens.append(0)
        self.checker.ignore_tokens.append(2)
        self.verbose = verbose

    def _token_text(self, token_id):
        """Printable text of ``token_id`` even when it lies outside ``id2string``."""
        if token_id < len(self.id2string):
            return self.id2string[token_id]
        return "<id " + str(token_id) + ">"

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        """Mask ``scores[0]`` in place so that only a checker-approved token can win.

        Returns:
            The same ``scores`` tensor, modified in place.

        Raises:
            RuntimeError: if the checker rejected every candidate token (the
                previous implementation looped forever in that case).
        """
        row = scores[0]  # view: writes below modify ``scores`` itself
        # Ids 2 and 3 are never allowed. Presumably the unknown token and the
        # bare word-boundary piece — TODO confirm.
        row[2] = -inf
        row[3] = -inf

        tried = set()
        while True:
            token_id = int(np.argmax(row.to("cpu")))
            if token_id in tried:
                # argmax can only return an already rejected id once every
                # score is -inf, i.e. there is nothing left to try.
                raise RuntimeError("CQL checker rejected every candidate token")
            tried.add(token_id)

            # Ids >= 32000 are never offered to the checker (presumably sentinel
            # and padding-vocabulary ids that may not exist in id2string).
            if token_id < 32000 and self.checker.add_string(self.id2string[token_id], token_id):
                row[token_id] = inf
                if self.verbose:
                    print("Accepted: ", self.id2string[token_id])
                break

            row[token_id] = -inf
            if self.verbose:
                print("Rejected: ", self._token_text(token_id))
        return scores

In [14]:
class CQLRemoteLogitsProcessor(transformers.generation.logits_process.LogitsProcessor):
    """Constrain generation to tokens accepted by a ``cql_checker.CQLRemoteChecker``.

    At every step the candidates of row 0 are tried from the highest score
    downwards; the first one the checker accepts gets score ``+inf`` and every
    candidate rejected before it gets ``-inf``.

    Limitations visible in this implementation:
      * only ``scores[0]`` is constrained, so it is meant for a single sequence
        (batch size 1, no beam search);
      * the checker is created once and is fed every accepted token.
        NOTE(review): confirm that the checker resets itself at EOS before one
        processor instance is reused for several ``generate`` calls.
    """

    def __init__(self, id2string, verbose = False, corpus = "preloaded/bnc2_tt21", api_key = None):
        """
        Args:
            id2string: sequence mapping a token id to the text of that token.
            verbose: print every accepted / rejected candidate.
            corpus: corpus name handed to the remote checker.
                NOTE(review): the evaluation cell compares against
                "preloaded/bnc2_tt31" — confirm which of the two is intended.
            api_key: key handed to the remote checker. Defaults to the
                ``SKETCH_ENGINE_API_KEY`` environment variable.
        """
        if api_key is None:
            # FIXME(security): the literal fallback keeps existing runs working,
            # but a credential must not live in the notebook — rotate it and
            # rely on the environment variable only.
            api_key = os.environ.get("SKETCH_ENGINE_API_KEY", "912b3694e685d7ff2b4cc8fdfe0e94cd")
        self.id2string = id2string
        self.checker = cql_checker.CQLRemoteChecker(corpus, api_key)
        # Id 1 is the end-of-sequence id of the tokenizer used in this notebook;
        # 0 and 2 are presumably its pad and unknown ids — TODO confirm.
        self.checker.eos_token_id = 1
        self.checker.ignore_tokens.append(0)
        self.checker.ignore_tokens.append(2)
        self.verbose = verbose

    def _token_text(self, token_id):
        """Printable text of ``token_id`` even when it lies outside ``id2string``."""
        if token_id < len(self.id2string):
            return self.id2string[token_id]
        return "<id " + str(token_id) + ">"

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        """Mask ``scores[0]`` in place so that only a checker-approved token can win.

        Returns:
            The same ``scores`` tensor, modified in place.

        Raises:
            RuntimeError: if the checker rejected every candidate token (the
                previous implementation looped forever in that case).
        """
        row = scores[0]  # view: writes below modify ``scores`` itself
        # Ids 2 and 3 are never allowed. Presumably the unknown token and the
        # bare word-boundary piece — TODO confirm.
        row[2] = -inf
        row[3] = -inf

        tried = set()
        while True:
            token_id = int(np.argmax(row.to("cpu")))
            if token_id in tried:
                # argmax can only return an already rejected id once every
                # score is -inf, i.e. there is nothing left to try.
                raise RuntimeError("CQL checker rejected every candidate token")
            tried.add(token_id)

            # Ids >= 32000 are never offered to the checker (presumably sentinel
            # and padding-vocabulary ids that may not exist in id2string).
            if token_id < 32000 and self.checker.add_string(self.id2string[token_id], token_id):
                row[token_id] = inf
                if self.verbose:
                    print("Accepted: ", self.id2string[token_id])
                break

            row[token_id] = -inf
            if self.verbose:
                print("Rejected: ", self._token_text(token_id))
        return scores

In [15]:
# Unconstrained smoke test: translate one sentence without any logits
# processor, to compare against the checker-constrained output further down.
model.eval()
sentence_tokenized = tokenizer(
    "translate: Any word followed by lemma door as tag noun and then followed by a word starting with open within sentence.",
    return_tensors="pt",
)
generated_ids = model.generate(sentence_tokenized.input_ids.to("cuda"), max_new_tokens=256)
# Drop the final generated token, glue the remaining pieces together and turn
# the word-boundary marker back into a space.
result = "".join(tokenizer.convert_ids_to_tokens(generated_ids[0])[:-1]).replace("▁", " ")
result

'<pad>][lemma="door" & tag="N.*"][word="open" & tag="N.*"] within <unk>s/>'

In [16]:
# Text of every token id in the tokenizer vocabulary; the checker is fed this
# text (not the raw piece) when it validates a candidate token.
id2token = tokenizer.convert_ids_to_tokens(list(range(tokenizer.vocab_size)))
id2string = [tokenizer.convert_tokens_to_string([piece]) for piece in id2token]

# One shared processor instance, wrapped in the list type generate() expects.
processor = CQLRemoteLogitsProcessor(id2string)
logits_processor_list = transformers.generation.logits_process.LogitsProcessorList([processor])

In [None]:
# Put the model in evaluation mode before running the evaluation below.
model.eval()

def average(av_sum, d):
    """Return a new dict holding every value of ``av_sum`` divided by ``d``.

    ``av_sum`` itself is left untouched.
    """
    return {metric: total / d for metric, total in av_sum.items()}

def dic_add(av_sum, to_add):
    """Add, in place, the value of ``to_add`` to ``av_sum`` for every key of ``av_sum``.

    Keys that exist only in ``to_add`` are ignored; a key of ``av_sum`` missing
    from ``to_add`` raises ``KeyError``.
    """
    for metric in av_sum.keys():
        av_sum[metric] += to_add[metric]

# Running totals of every metric accumulated over the evaluated examples;
# all of them start at zero.
average_sum = dict.fromkeys(
    (
        "n_grams_1",
        "n_grams_2",
        "n_grams_3",
        "n_grams_4",
        "n_grams_5",
        "n_grams_6",
        "bleu",
        "precision",
        "recall",
        "f1",
        "maximal intersection over union",
        "maximal intersection over union +-1",
        "maximal intersection over union +-2",
        "sentence_precision",
        "sentence_recall",
    ),
    0,
)

# Evaluate checker-constrained generation on a shuffled sample of the enabled
# subset, logging the running metric averages after every example.

# NOTE(review): the original stop condition `i > 500` evaluates 501 examples
# (i = 0..500). That count is kept here — confirm whether 500 was intended.
eval_limit = min(len(dataset_tokenized), 501)

# FIXME(security): the literal fallback keeps existing runs working, but a
# credential must not live in the notebook — rotate it and rely on the
# SKETCH_ENGINE_API_KEY environment variable only.
sketch_engine_api_key = os.environ.get("SKETCH_ENGINE_API_KEY", "912b3694e685d7ff2b4cc8fdfe0e94cd")

with open("eval_log_logits", "w") as elog:
    for i, gold in enumerate(torch.utils.data.DataLoader(dataset_tokenized, batch_size=1, shuffle=True)):
        if i >= eval_limit:
            break
        # gold = (input ids, attention mask, CQL ids, CQL string), each batched with size 1.
        cql_gold = gold[3][0]

        generated_ids = model.generate(
            gold[0].to("cuda"),
            logits_processor=logits_processor_list,
            max_new_tokens=256,
        )
        output_ids = generated_ids[0]
        # Strip the trailing end-of-sequence token only when it is really there;
        # a generation cut off by max_new_tokens ends in a regular token.
        if len(output_ids) > 0 and int(output_ids[-1]) == tokenizer.eos_token_id:
            output_ids = output_ids[:-1]
        result = "".join(tokenizer.convert_ids_to_tokens(output_ids))
        result = result.replace("▁", " ")
        result = result.replace("<pad>", " ")
        result = result.strip()

        # Restore a missing opening bracket: if the first "]" has no "[" before
        # it, prepend one.
        lb = result.find("[")
        rb = result.find("]")
        if rb != -1:
            if lb == -1 or lb > rb:
                result = "[" + result

        # NOTE(review): the remote logits processor is configured with corpus
        # "preloaded/bnc2_tt21" while this comparison uses "preloaded/bnc2_tt31" —
        # confirm that the difference is intentional.
        cmp_result = cqlcmp(cql_gold, result, "preloaded/bnc2_tt31", SKETCH_ENGINE_API_KEY = sketch_engine_api_key)
        dic_add(average_sum, cmp_result)
        # Progress is reported against the number of examples actually evaluated.
        print(str(i+1) + "/" + str(eval_limit), file=elog)
        print(average(average_sum, i+1), file=elog)
        elog.flush()
    
        

In [None]:
# Scratch cell with no effect on the analysis; safe to delete.
print("test")