In [1]:
from typing import List, Optional, Any
import torch
import torch.utils.data
import numpy as np
import random

import datetime

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from peft import LoraConfig
from trl import SFTTrainer
import datasets
import time

import json
import os
import cqlcmp

In [2]:
torch.manual_seed(21)
torch.cuda.manual_seed_all(21)
np.random.seed(21)
random.seed(21)
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [3]:
device = torch.device("cuda")

In [4]:
class DatasetNatural2CQL(torch.utils.data.Dataset):
    def __init__(self, path: Optional[str] = None) -> None:
        self.sentence_freq = []
        self.cql2nl = []
        self.nl2cql = []
        self.natural_language_rulebased = []
        self.cql = []
        self.natural_language = []
        self.enabled_natural_language = []

        if path is not None:
            self.load_tsv(path)

    def enable_cql(self, cqls):
        self.enabled_natural_language = []
        for cql in cqls:
            for p in cql:
                self.enabled_natural_language.append(p)

    def dump_json(self, filepath: str) -> None:
        with open(filepath, "w") as file:
            for i in range(len(self)):
                data = json.dumps(self[i])
                file.write(data)
                file.write("\n")

    def add_translation(self, freq: int, cql: str, natural_language_rulebased: str, natural_language: List[str]) -> None:
        cql_index = len(self.sentence_freq)
        self.sentence_freq.append(freq)
        self.cql.append(cql)
        self.natural_language_rulebased.append(natural_language_rulebased)
        self.cql2nl.append([])

        for sentence in natural_language:
            self.nl2cql.append(cql_index)
            self.cql2nl[-1].append(len(self.natural_language))
            self.natural_language.append(sentence)

    def load_tsv(self, path: str) -> None:
        with open(path, "r") as file_data:
            for line in file_data:
                line = line.strip()
                line = line.split("\t")
                texts_json = json.loads(line[4])
                texts_extracted = texts_json["data"][0]["content"][0]["text"]["value"].split("\n")
                self.add_translation(int(line[0]), line[2], line[3], texts_extracted)

    def __len__(self):
        return len(self.enabled_natural_language)

    def __getitem__(self, idx):
        if idx < len(self.nl2cql):
            return {"text": self.natural_language[self.enabled_natural_language[idx]], "cql": self.cql[self.nl2cql[self.enabled_natural_language[idx]]]}
        return None

In [5]:
dataset = DatasetNatural2CQL("expand_natural_texts_0004.res.tsv")

In [6]:
validation_cqls = []
with open("test_ids.json", "r") as file:
    validation_cqls = json.load(file)

In [7]:
dataset.enable_cql(validation_cqls)

In [8]:
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model_name = "meta-llama/Llama-3.2-1B"
adapter_model_name = "outputs/checkpoint-102000"

model_orig = AutoModelForCausalLM.from_pretrained(base_model_name, quantization_config=bnb_config, device_map={"":0})
model = PeftModel.from_pretrained(model_orig, adapter_model_name)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

In [9]:
import os

path = 'outputs'

# List all folders (directories) in the given path
folders = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]

In [10]:
def f_in(data):
    full_input = """Translate Natural Language into CQL Queries like:

Word dog followed by lemma run, and then followed by a noun. CQL:
```
[word="dog"][lemma="run"][pos="NN"]
```

Lemma "be" optionally followed by the word "not". CQL:
```
[lemma="be"][word="not"]? 
```

""" + data + " CQL:\n```\n"
    return full_input

In [11]:
# random.shuffle(dataset.enabled_natural_language)

i = 0
while i < len(dataset):
    dataset.natural_language[dataset.enabled_natural_language[i]] = f_in(dataset.natural_language[dataset.enabled_natural_language[i]])
    i += 1

In [12]:
tokenizer.pad_token = tokenizer.eos_token

In [13]:
def get_cql(text):
    text = text.split("<eos>")[0]
    text = text.split("CQL:")[3]
    text = text.split("```")[1]
    return text.strip()

In [14]:
def get_text(text):
    text = text = text.split("CQL:")[2]
    text = text = text.split("```")[-1]
    return text.strip()

In [None]:
with open("test_results.tsv", "w") as test_tsv:
    with open("test_log.txt", "w") as log:
        for folder in folders:
            start_time = int(time.time())
            adapter_model_name = "outputs/" + folder
    
            if folder not in ["checkpoint-95500"]:
                continue
            
            model = PeftModel.from_pretrained(model_orig, adapter_model_name)
            dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=False)
            blue_sum = 0
            blue_size = 0
            batch_id = 0
            for batch in dataloader:
                batch_id += 1
                inputs = tokenizer(batch["text"], truncation=True, max_length=1024, return_tensors="pt", padding=True).to(device)
                outputs = model.generate(**inputs, max_new_tokens=100)
                for i, output in enumerate(outputs):
                    cql = get_cql(tokenizer.decode(output, skip_special_tokens=True))
                    cql_ids = cqlcmp.cql_tokenizer(cql)
                    gold_cql_ids = cqlcmp.cql_tokenizer(batch["cql"][i])
                    bleu = cqlcmp.sentence_bleu([gold_cql_ids], cql_ids, weights=(0.25, 0.25, 0.25, 0.25))
                    blue_sum += bleu
                    blue_size += 1
                    test_result = [cql.replace("\n", " "), batch["cql"][i], get_text(batch["text"][i])]
                    print("\t".join(test_result))
                    print("\t".join(test_result), file=test_tsv)
                to_log = ""
                to_log += "Working on: " + adapter_model_name + " | "
                to_log += str(blue_size) + "/" + str(len(dataset)) + " | "
                to_log += "AVG Bleu: " + str(blue_sum/blue_size) + " | "
                to_log += "Time: " + str(int(time.time())-start_time) + " | "
                to_log += "ETA: " + str(int((int(time.time())-start_time)/(blue_size/len(dataset)*(1-(blue_size/len(dataset)))))) + " sec" + " | "
                
                print(to_log, file=log)
                log.flush()
                test_tsv.flush()
                print(to_log)
            

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A deco

Translate: [!tag="DT"&!tag="J.*"&!tag="CD"][word="man"]	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="man"]	All tokens that are not determiners, adjectives, cardinal numbers, or nouns followed by the word "man."
Translate: [!tag="DT|JJ|CD|N.*"][lemma="man"]	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="man"]	Any token that does not match the tags DT, JJ, CD, or is not a noun, followed by the word "man."
�[tag!="DT|J.*|RB.?|N.*"]{0,}[lemma="man"][tag!="DT|J.*|RB.?|N.*"]{0,}	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="man"]	Non-determiner, non-adjective, non-number, and non-noun tokens that precede the word "man."
Translate: [!tag="DT"&!tag="J.*"&!tag="N.*"][tag="N.*" & lemma="man"]	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="man"]	Tokens excluding determiners, adjectives, numbers, and nouns followed by "man."
Translate: "[tag!="DT"] [tag!="JJ"] [lemma="man"]	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Translate[lemma="determine"]	[lempos="determine-v"]	Forms of the verb determine.
Translate[lemma="determine"&tag="V.*"]	[lempos="determine-v"]	Only verb forms determine.
Translate[lemma="determine"]	[lempos="determine-v"]	All lexemes for the verb 'determine'.
Translate[lemma="determine"]	[lempos="determine-v"]	Any verb forms of determine.
Translate[lemma="determine"]	[lempos="determine-v"]	Lexical items with the verb lemma determine.
Translate[lemma="determine"]	[lempos="determine-v"]	All occurrences of the verb lemma as determine.
Translate[lemma="determine"&tag="V.*"]	[lempos="determine-v"]	Lexical forms labeled as determine in verb category.
ampay?ay	[word= "s?pr?l?ay"]	Any word that matches the pattern s?pr?l?ay.
Translate[lemma="spray"]	[word= "s?pr?l?ay"]	Words that contain variations of "spray."
Translate: "a"? ".*" "s?" "pr"? ".*" "l?" ".*" "ay"?	[word= "s?pr?l?ay"]	Words before or after a possible 's', 'pr', 'l', or 'ay'.
Translate["spray"]	[word= "s?pr?l?ay"]	Words resembling

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Translate[lemma="rude"]	[lemma="rude"]	All base forms of rude.
�[rue]	[lemma="rude"]	Rude as a lemma.
Translate[lemma="rude"]	[lemma="rude"]	Rude in its base form.
Translate[rile+"rube"]	[lemma="rude"]	Token with the lemma rude.
Translate["rude","rational"]	[lemma="rude"]	Examples of the lemma rude.
Translate[lemma="rude"]	[lemma="rude"]	Instances of lemma rude.
esperon"	[lemma="rude"]	Words with the lemma rude.
Translate["rude","jargon-like"]	[lemma="rude"]	Tokens with the lemma "rude".
Translate["rude","verb"]	[lemma="rude"]	Rude as a word form.
Translate["rude"]	[lemma="rude"]	All forms of rude.
Translate[lemma="rude"]	[lemma="rude"]	Rude in the corpus.
Translate[lemma="rude"]	[lemma="rude"]	Tokens defined by the lemma rude.
amp [lemma="remember"] [tag="VVG"]	[lemma="remember"][tag="VVG"]	Lemma remember followed by a verb in gerund form.
ample + v	[lemma="remember"][tag="VVG"]	All instances of the lemma remember followed by a gerund verb.
Translate[lemma="remember"][tag="VVG"]	[lemm

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Translate natural language to: "a" "number" "of"	[word=="a"] [word=="number"] [word=="of"]	Word "a" followed by the word "number" and then the word "of".
Translate+lemma="important"&tag="N.*"&word="number"]{1,2}[word="and"&tag="IN"][tag="N.*"]{1,2}	[word=="a"] [word=="number"] [word=="of"]	The phrase starts with "a", followed by "number", and ends with "of".
Translate natural into CQL queries like:	[word=="a"] [word=="number"] [word=="of"]	Three consecutive words: "a", "number", "of".
Translate natural into CQL queries like: "a" "number" "of"	[word=="a"] [word=="number"] [word=="of"]	The specific order of the words "a", "number", and "of".
Translate+tag="N.*" "a" "number" "of"	[word=="a"] [word=="number"] [word=="of"]	The tokens must include "a" first, then "number", followed by "of".
Translate "a", "number", "of", "the".	[word=="a"] [word=="number"] [word=="of"]	Looking for the exact phrase "a number of".
Translate+word="a"&tag="N.*"&word="number"&tag="N.*"&word="of"&tag="A.*"]	[word=

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Translate("are|Are"][word="you"][word="good"]	[word="are|Are"][word="you"][word="good"]	Sequences containing "are" or "Are" followed by "you" and concluding with "good".
Translate+word="are|Are"][word="you"][word="good"]	[word="are|Are"][word="you"][word="good"]	Using "are" or "Are" with "you" and "good".
Translate[lemma="are|Are"][word="you"][word="good"]	[word="are|Are"][word="you"][word="good"]	"Are" or "Are" appears directly before "you", which in turn comes before "good".
Translate[lemma="are|Are"][word="you"][word="good"]	[word="are|Are"][word="you"][word="good"]	The expressions that start with "are" or "Are", then "you", and finally "good".
Translate[lemma="are|Are"][word="you"][word="good"]	[word="are|Are"][word="you"][word="good"]	Identify the phrase made of "are" or "Are", followed by "you", and then "good".
Translate[lemma="think"][word="over"]	[lemma="think"] [word="over"]	Lemma think followed by the word over.
Translate[lemma="think"][word="over"]	[lemma="think"] [word="ov

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Translate[lemma="hard"&tag="J.*"]	[lempos="hard-j"]	A token containing the lemma hard tagged as an adjective.
Translate[lemma="hard"&tag="J.*"]	[lempos="hard-j"]	All instances of the adjective hard.
esperhton"v*ng"	[lempos="hard-j"]	Lemma of hard as an adjective.
Translate[lemma="hard"&tag="J.*"]	[lempos="hard-j"]	Only the forms of the lemma hard tagged with 'j' for adjective.
Translate[lemma="hard"&tag="J.*"]	[lempos="hard-j"]	Any occurrence of hard in its adjectival form.
ampiclead="hard"	[lempos="hard-j"]	Adjective hard in the corpus.
Translate[lemma="hard"&tag="J.*"]	[lempos="hard-j"]	All representations of the lemma hard as an adjective.
Translate["determine"]	[lc="determine"]	Lowercase form of determine.
Translate("determine", "into", "help")	[lc="determine"]	Token containing the lowercase word "determine."
Translate("determine", "into", "the")	[lc="determine"]	Only instances of "determine" in lowercase.
Translate lowercase="determine"	[lc="determine"]	All tokens that are in lowe

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Translate+lemma="pay"[][lemma="price"]	[lemma="pay"] [ ] [lemma="price"]	Words with the lemma pay followed by a word and then the lemma price.
�[] [lemma="price"] [] [lemma="price.*"]	[lemma="pay"] [ ] [lemma="price"]	Pay followed by any word and then followed by price.
Translate "pay" "" "price"	[lemma="pay"] [ ] [lemma="price"]	Any token coming after pay and before price.
Translate+lemma="pay" [][][lemma="price"]	[lemma="pay"] [ ] [lemma="price"]	Show me examples of pay with any token in between, followed by price.
Translate[lemma="pay" & tag="V.*"] [] [lemma="price"]	[lemma="pay"] [ ] [lemma="price"]	Finding occurrences of the verb pay, any word in between, and then price.
Translate[lemma="work"]	[word="work"]	Word work.
Translate[lemma="work"]	[word="work"]	Token containing the word work.
ampitude	[word="work"]	Only the word work.
Translate["work"]	[word="work"]	All instances of the word work.
ほろこし	[word="work"]	Any occurrence of the word work.
Translate[lemma="work"]	[word="work"]