In [1]:
from typing import List, Optional, Any
import torch
import torch.utils.data
import numpy as np
import random

import datetime

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from peft import LoraConfig
from trl import SFTTrainer
import datasets

import json

In [2]:
torch.manual_seed(21)
torch.cuda.manual_seed_all(21)
np.random.seed(21)
random.seed(21)
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [3]:
device = torch.device("cuda")

In [4]:
class DatasetNatural2CQL(torch.utils.data.Dataset):
    def __init__(self, path: Optional[str] = None) -> None:
        self.sentence_freq = []
        self.cql2nl = []
        self.nl2cql = []
        self.natural_language_rulebased = []
        self.cql = []
        self.natural_language = []
        self.enabled_natural_language = []

        if path is not None:
            self.load_tsv(path)

    def enable_cql(self, cqls):
        self.enabled_natural_language = []
        for cql in cqls:
            for p in cql:
                self.enabled_natural_language.append(p)

    def dump_json(self, filepath: str) -> None:
        with open(filepath, "w") as file:
            for i in range(len(self)):
                data = json.dumps(self[i])
                file.write(data)
                file.write("\n")

    def add_translation(self, freq: int, cql: str, natural_language_rulebased: str, natural_language: List[str]) -> None:
        cql_index = len(self.sentence_freq)
        self.sentence_freq.append(freq)
        self.cql.append(cql)
        self.natural_language_rulebased.append(natural_language_rulebased)
        self.cql2nl.append([])

        for sentence in natural_language:
            self.nl2cql.append(cql_index)
            self.cql2nl[-1].append(len(self.natural_language))
            self.natural_language.append(sentence)

    def load_tsv(self, path: str) -> None:
        with open(path, "r") as file_data:
            for line in file_data:
                line = line.strip()
                line = line.split("\t")
                texts_json = json.loads(line[4])
                texts_extracted = texts_json["data"][0]["content"][0]["text"]["value"].split("\n")
                self.add_translation(int(line[0]), line[2], line[3], texts_extracted)

    def __len__(self):
        return len(self.enabled_natural_language)

    def __getitem__(self, idx):
        if idx < len(self.nl2cql):
            return {"text": self.natural_language[self.enabled_natural_language[idx]], "cql": self.cql[self.nl2cql[self.enabled_natural_language[idx]]]}
        return None
        

In [5]:
model_name = "google/gemma-2-2b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map={"":0})

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
dataset = DatasetNatural2CQL("expand_natural_texts_0004.res.tsv")

In [7]:
train_cqls = []
with open("train_ids.json", "r") as file:
    train_cqls = json.load(file)

In [8]:
dataset.enable_cql(train_cqls)

In [9]:
print(len(dataset))
dataset[len(dataset)-1]

84984


{'text': "'ain't', 'no', or 'not' followed by one or two unspecified tokens and resulting in 'nothing', 'nowhere', or 'nobody'.",
 'cql': '[word="ain\'t|no|not"][]{1,2}[word="nothing|nowhere|nobody"]'}

In [10]:
dataset.dump_json("_tmp.json")

In [11]:
lora_config = LoraConfig(
    r=32,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

In [12]:
dataset_fancy = datasets.load_dataset("json", data_files="_tmp.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
dataset_fancy_tokenized = dataset_fancy.map(lambda samples: tokenizer(samples["text"]), batched=True)

Map:   0%|          | 0/84984 [00:00<?, ? examples/s]

In [14]:
def f_in(data):
    return ["""Translate Natural Language into CQL Queries
What is Corpus Query Language (CQL)?

Corpus Query Language (CQL) is a formal query syntax used to search linguistic corpora for complex patterns that involve words, lemmas, part-of-speech tags, and other annotations. CQL enables users to define conditions on individual tokens or sequences of tokens using features like:

Square bracket notation for token features:

Word dog followed by lemma run and then followed by a noun. CQL:
```
[word="dog"][lemma="run"][pos="NN"]
```

Regular expression-like syntax:

An adjective followed by a noun. CQL:
```
[pos="JJ"][pos="NN"]
```

Lemma "be" optionaly followed by word "not". CQL:
```
[lemma="be"][word="not"]? 
```

Answer in a code block (Line enclosed with ```). Write only CQL into the code blocks.

Translate: """ + data["text"][0] + " CQL:\n```\n" + data["cql"][0] + "```<eos>"]

In [15]:

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_fancy_tokenized["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        warmup_steps=2,
        num_train_epochs=5,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1000,
        save_steps=2000,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
    formatting_func=f_in,
)



Truncating train dataset:   0%|          | 0/84984 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
for i in range(1):
    trainer.train()

It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss
