In [1]:
from typing import List, Optional, Any
import torch
import torch.utils.data
import numpy as np
import random

import datetime

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from peft import LoraConfig
from trl import SFTTrainer
import datasets

import json

In [2]:
# Single source of truth for the seed of every RNG seeded in this notebook
# (PyTorch CPU + CUDA, NumPy, and Python's `random`).
SEED = 21

torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)
# Left disabled by the author; uncomment to request deterministic cuDNN behaviour.
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [3]:
# Device the tokenized prompts are moved to before `model.generate` in the inference cells.
# Hard-coded to CUDA: there is no CPU fallback in this notebook.
device = torch.device("cuda")

In [4]:
class DatasetNatural2CQL(torch.utils.data.Dataset):
    """Map-style dataset pairing natural-language sentences with CQL queries.

    Storage is split into two parallel groups of lists:

    * Per CQL query (index ``c``): ``sentence_freq[c]``, ``cql[c]``,
      ``natural_language_rulebased[c]`` and ``cql2nl[c]`` (the list of sentence
      indices that describe that query).
    * Per sentence (index ``s``): ``natural_language[s]`` and ``nl2cql[s]``
      (the index of the query the sentence describes).

    Only the sentences listed in ``enabled_natural_language`` are exposed through
    ``len()`` and indexing; the list starts empty, so a freshly loaded dataset has
    length 0 until :meth:`enable_cql` is called.
    """

    def __init__(self, path: Optional[str] = None) -> None:
        """Create an empty dataset and, if ``path`` is given, load it as TSV."""
        self.sentence_freq = []
        self.cql2nl = []
        self.nl2cql = []
        self.natural_language_rulebased = []
        self.cql = []
        self.natural_language = []
        self.enabled_natural_language = []

        if path is not None:
            self.load_tsv(path)

    def enable_cql(self, cqls):
        """Replace the enabled subset with the sentence indices found in ``cqls``.

        Args:
            cqls: An iterable of iterables; every inner element is an index into
                ``natural_language``. The nesting is flattened in order and
                duplicates are kept.
        """
        self.enabled_natural_language = [
            sentence_index for group in cqls for sentence_index in group
        ]

    def dump_json(self, filepath: str) -> None:
        """Write every enabled item to ``filepath`` as JSON Lines.

        Each line is one object with the keys ``"text"`` and ``"cql"``.
        """
        with open(filepath, "w", encoding="utf-8") as file:
            for i in range(len(self)):
                file.write(json.dumps(self[i]))
                file.write("\n")

    def add_translation(self, freq: int, cql: str, natural_language_rulebased: str, natural_language: List[str]) -> None:
        """Register one CQL query together with all sentences describing it.

        Args:
            freq: Frequency value stored in ``sentence_freq`` for this query.
            cql: The CQL query string.
            natural_language_rulebased: Rule-based description of the query.
            natural_language: Sentences describing the query; each one gets the
                next free sentence index.
        """
        cql_index = len(self.sentence_freq)
        self.sentence_freq.append(freq)
        self.cql.append(cql)
        self.natural_language_rulebased.append(natural_language_rulebased)

        sentence_indices = []
        self.cql2nl.append(sentence_indices)
        for sentence in natural_language:
            self.nl2cql.append(cql_index)
            sentence_indices.append(len(self.natural_language))
            self.natural_language.append(sentence)

    def load_tsv(self, path: str) -> None:
        """Append the translations stored in a tab-separated file.

        Columns used per line: 0 = frequency (int), 2 = CQL query, 3 = rule-based
        description, 4 = JSON whose ``data[0].content[0].text.value`` holds the
        sentences separated by newlines. Blank lines are skipped.
        """
        with open(path, "r", encoding="utf-8") as file_data:
            for line in file_data:
                line = line.strip()
                if not line:
                    continue
                fields = line.split("\t")
                texts_json = json.loads(fields[4])
                # Sentences are deliberately NOT filtered or stripped here: the
                # indices passed to `enable_cql` are positional, so dropping an
                # entry would shift every following sentence index.
                texts_extracted = texts_json["data"][0]["content"][0]["text"]["value"].split("\n")
                self.add_translation(int(fields[0]), fields[2], fields[3], texts_extracted)

    def __len__(self):
        """Return the number of enabled sentences."""
        return len(self.enabled_natural_language)

    def __getitem__(self, idx):
        """Return ``{"text": sentence, "cql": query}`` for the ``idx``-th enabled sentence.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        enabled_count = len(self.enabled_natural_language)
        # Bounds are checked against the enabled list (what __len__ reports),
        # and out-of-range access raises instead of silently returning None.
        if not -enabled_count <= idx < enabled_count:
            raise IndexError(
                f"index {idx} is out of range for dataset with {enabled_count} enabled sentences"
            )
        sentence_index = self.enabled_natural_language[idx]
        return {
            "text": self.natural_language[sentence_index],
            "cql": self.cql[self.nl2cql[sentence_index]],
        }
        

In [5]:
# Hugging Face Hub id of the base checkpoint that is loaded and fine-tuned below.
model_name = "meta-llama/Llama-3.2-1B"

# Quantization settings handed to `from_pretrained`: 4-bit weights using the
# "nf4" quant type, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Tokenizer and quantized causal LM are both loaded from `model_name`.
# device_map={"":0} maps the root module to device index 0.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map={"":0})

In [6]:
# Load every (sentence, CQL) translation from the TSV file. No sentence is enabled yet,
# so len(dataset) is 0 until `enable_cql` is called.
dataset = DatasetNatural2CQL("expand_natural_texts_0004.res.tsv")

In [7]:
# Nested lists of sentence indices selecting the training subset; passed to
# `dataset.enable_cql` in the next cell. JSON is read explicitly as UTF-8.
with open("train_ids.json", "r", encoding="utf-8") as file:
    train_cqls = json.load(file)

In [8]:
# Restrict the dataset to the training sentences; after this, len(dataset) and
# dataset[i] only cover the indices listed in `train_cqls`.
dataset.enable_cql(train_cqls)

In [9]:
# Sanity check: how many sentences are enabled, and what the last one looks like.
enabled_total = len(dataset)
print(enabled_total)
dataset[enabled_total - 1]

84984


{'text': "'ain't', 'no', or 'not' followed by one or two unspecified tokens and resulting in 'nothing', 'nowhere', or 'nobody'.",
 'cql': '[word="ain\'t|no|not"][]{1,2}[word="nothing|nowhere|nobody"]'}

In [10]:
# Build a few-shot prefix from 100 randomly drawn training pairs (sampling with
# replacement), one "sentence -> CQL" pair per line.
prefix_lines = []
for _ in range(100):
    picked = dataset[random.randint(0, len(dataset) - 1)]
    prefix_lines.append(picked["text"] + " -> " + picked["cql"] + "\n")
prompt_prefix = "".join(prefix_lines)

In [11]:
# Persist the sampled prefix so it can be reused later, then show it.
# The encoding is pinned so the file does not depend on the platform default.
with open("prompt_prefix.txt", "w", encoding="utf-8") as file:
    file.write(prompt_prefix)
print(prompt_prefix)

All forms of the lemma people. -> [lemma="people"]
The word is precisely selfish in lowercase.   -> [lc=="selfish"]
Identical to the lowercase form selfish.   -> [lc=="selfish"]
Verb lemma pick followed by a prepositional phrase and then a particle. -> [lemma="pick" & tag="V.*"][tag="PP.?"][tag="RP"]
Examples of adjectives followed by attitude.   -> [tag="J.*"][lemma="attitude"]
All forms of the verb know.   -> [lempos="know-v"]
The sequence consists of "in" (case insensitive), followed by "the", and ending with "sky".   -> [word="in|In"][word="the"][word="sky"]
The verb 'test' in all its forms. -> [lemma = "test" & tag = "V.*"]
Here are some natural descriptions for the provided Corpus Query Language example: -> [word!="(?i)as"] [word!="a|(?i)in"] [lemma="consequence"]
At least one lemma "very" preceding a token tagged as superlative adjective. -> [lemma="very"] + [tag="JJS"]
Token marked as 'burst-v'.   -> [lempos="burst-v"]
Lemma "as" followed by a noun.   -> [lemma="as"] [tag="N.*"

In [12]:
# Export the enabled (training) pairs as JSON Lines; the file is read back with
# `datasets.load_dataset` further down.
dataset.dump_json("_tmp.json")

In [13]:
# LoRA adapter settings handed to SFTTrainer via `peft_config`: adapters of rank 1024
# on the listed projection modules, for a causal-LM task.
# NOTE(review): `lora_alpha` is not set, so the PEFT default applies; with a rank this
# large the alpha/r scaling factor becomes very small — confirm this is intended.
lora_config = LoraConfig(
    r=1024,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

In [14]:
# Re-load the exported JSON Lines file as a Hugging Face dataset; the rows are
# accessed below through its "train" split.
dataset_fancy = datasets.load_dataset("json", data_files="_tmp.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [15]:
# Batched map that runs the tokenizer over the "text" column only; the "cql"
# column is not tokenized here.
dataset_fancy_tokenized = dataset_fancy.map(lambda samples: tokenizer(samples["text"]), batched=True)

Map:   0%|          | 0/84984 [00:00<?, ? examples/s]

In [16]:
def f_in(data):
    """Format dataset rows into training strings for SFTTrainer.

    Every row becomes
    ``prompt_prefix + "translate: " + text + " -> " + cql + <EOS token>``.

    Args:
        data: Either a single row (``data["text"]`` and ``data["cql"]`` are
            strings) or a batch (both are equally long lists of strings).

    Returns:
        One formatted string for a single row, or a list with one formatted
        string per row for a batch.
    """
    # Use the tokenizer's real end-of-sequence token; a hard-coded "<eos>" is only
    # a special token for some model families and would otherwise be plain text.
    eos = tokenizer.eos_token or ""
    texts, cqls = data["text"], data["cql"]

    if isinstance(texts, str):
        # Per-example call: indexing [0] here would pick a single character.
        return prompt_prefix + "translate: " + texts + " -> " + cqls + eos

    # Batched call: format every row, not only the first one.
    return [
        prompt_prefix + "translate: " + text + " -> " + cql + eos
        for text, cql in zip(texts, cqls)
    ]

In [17]:

def _format_for_sft(example):
    """Adapt `f_in` to both per-example and batched `formatting_func` calls.

    Each row is handed to `f_in` as a one-row batch and the single formatted
    string is unwrapped, so every row is formatted exactly once.
    """
    texts, cqls = example["text"], example["cql"]
    if isinstance(texts, str):
        return f_in({"text": [texts], "cql": [cqls]})[0]
    return [f_in({"text": [text], "cql": [cql]})[0] for text, cql in zip(texts, cqls)]


trainer = SFTTrainer(
    model=model,
    # Pass the raw split: when the dataset already has an `input_ids` column,
    # SFTTrainer treats it as pre-processed and ignores `formatting_func`, so the
    # model would be trained on the bare "text" column and never see the CQL target.
    train_dataset=dataset_fancy["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=20,
        warmup_steps=2,
        num_train_epochs=5,
        learning_rate=2e-4,
        # NOTE(review): the quantization config uses bfloat16 as compute dtype while
        # mixed precision here is fp16 — confirm the mismatch is intended.
        fp16=True,
        logging_steps=1000,
        save_steps=2000,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
    # NOTE(review): `f_in` prepends the 100-example `prompt_prefix` to every sample,
    # which presumably exceeds the trainer's default maximum sequence length, and the
    # inference prompts below do not use the prefix — verify before a long run.
    formatting_func=_format_for_sft,
)



Truncating train dataset:   0%|          | 0/84984 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run the fine-tuning loop configured above (5 epochs; checkpoints go to "outputs").
trainer.train()

Step,Training Loss


In [None]:
# Qualitative check on a multi-token query: encode the prompt, generate up to 20 new
# tokens, and print the decoded sequence (prompt included, special tokens removed).
prompt_text = "translate: Any word followed by lemma door as tag noun and then starting with open within sentence. ->"
inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=20)
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_text)

In [None]:
# Qualitative check on the simplest query: encode the prompt, generate up to 20 new
# tokens, and print the decoded sequence (prompt included, special tokens removed).
prompt_text = "translate: Any word. ->"
inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=20)
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_text)