In [1]:
from typing import List, Optional, Any
import torch
import torch.utils.data
import numpy as np
import random

import datetime

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from peft import LoraConfig
from trl import SFTTrainer
import datasets
import time

import json
import os
import cqlcmp

In [2]:
torch.manual_seed(21)
torch.cuda.manual_seed_all(21)
np.random.seed(21)
random.seed(21)
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [3]:
device = torch.device("cuda")

In [4]:
class DatasetNatural2CQL(torch.utils.data.Dataset):
    def __init__(self, path: Optional[str] = None) -> None:
        self.sentence_freq = []
        self.cql2nl = []
        self.nl2cql = []
        self.natural_language_rulebased = []
        self.cql = []
        self.natural_language = []
        self.enabled_natural_language = []

        if path is not None:
            self.load_tsv(path)

    def enable_cql(self, cqls):
        self.enabled_natural_language = []
        for cql in cqls:
            for p in cql:
                self.enabled_natural_language.append(p)

    def dump_json(self, filepath: str) -> None:
        with open(filepath, "w") as file:
            for i in range(len(self)):
                data = json.dumps(self[i])
                file.write(data)
                file.write("\n")

    def add_translation(self, freq: int, cql: str, natural_language_rulebased: str, natural_language: List[str]) -> None:
        cql_index = len(self.sentence_freq)
        self.sentence_freq.append(freq)
        self.cql.append(cql)
        self.natural_language_rulebased.append(natural_language_rulebased)
        self.cql2nl.append([])

        for sentence in natural_language:
            self.nl2cql.append(cql_index)
            self.cql2nl[-1].append(len(self.natural_language))
            self.natural_language.append(sentence)

    def load_tsv(self, path: str) -> None:
        with open(path, "r") as file_data:
            for line in file_data:
                line = line.strip()
                line = line.split("\t")
                texts_json = json.loads(line[4])
                texts_extracted = texts_json["data"][0]["content"][0]["text"]["value"].split("\n")
                self.add_translation(int(line[0]), line[2], line[3], texts_extracted)

    def __len__(self):
        return len(self.enabled_natural_language)

    def __getitem__(self, idx):
        if idx < len(self.nl2cql):
            return {"text": self.natural_language[self.enabled_natural_language[idx]], "cql": self.cql[self.nl2cql[self.enabled_natural_language[idx]]]}
        return None

In [5]:
dataset = DatasetNatural2CQL("expand_natural_texts_0004.res.tsv")

In [6]:
validation_cqls = []
with open("test_ids.json", "r") as file:
    validation_cqls = json.load(file)

In [7]:
dataset.enable_cql(validation_cqls)

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_name = "meta-llama/Llama-3.2-1B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_name, quantization_config=bnb_config, device_map={"":0})
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

In [9]:
def f_in(data):
    full_input = """Translate Natural Language into CQL Queries like:

Word dog followed by lemma run, and then followed by a noun. CQL:
```
[word="dog"][lemma="run"][pos="NN"]
```

Lemma "be" optionally followed by the word "not". CQL:
```
[lemma="be"][word="not"]? 
```

""" + data + " CQL:\n```\n"
    return full_input

In [10]:
# random.shuffle(dataset.enabled_natural_language)

i = 0
while i < len(dataset):
    dataset.natural_language[dataset.enabled_natural_language[i]] = f_in(dataset.natural_language[dataset.enabled_natural_language[i]])
    i += 1

In [11]:
tokenizer.pad_token = tokenizer.eos_token

In [12]:
def get_cql(text):
    text = text.split("<eos>")[0]
    text = text.split("CQL:")[3]
    text = text.split("```")[1]
    return text.strip()

In [13]:
def get_text(text):
    text = text = text.split("CQL:")[2]
    text = text = text.split("```")[-1]
    return text.strip()

In [14]:
with open("test_results.tsv", "w") as test_tsv:
    with open("test_log.txt", "w") as log:
        start_time = int(time.time())
        
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=False)
        blue_sum = 0
        blue_size = 0
        batch_id = 0
        for batch in dataloader:
            batch_id += 1
            inputs = tokenizer(batch["text"], truncation=True, max_length=1024, return_tensors="pt", padding=True).to(device)
            outputs = model.generate(**inputs, max_new_tokens=100)
            for i, output in enumerate(outputs):
                cql = get_cql(tokenizer.decode(output, skip_special_tokens=True))
                cql_ids = cqlcmp.cql_tokenizer(cql)
                gold_cql_ids = cqlcmp.cql_tokenizer(batch["cql"][i])
                bleu = cqlcmp.sentence_bleu([gold_cql_ids], cql_ids, weights=(0.25, 0.25, 0.25, 0.25))
                blue_sum += bleu
                blue_size += 1
                test_result = [cql.replace("\n", " "), batch["cql"][i], get_text(batch["text"][i])]
                print("\t".join(test_result))
                print("\t".join(test_result), file=test_tsv)
            to_log = ""
            to_log += "Working on: " + "baseline" + " | "
            to_log += str(blue_size) + "/" + str(len(dataset)) + " | "
            to_log += "AVG Bleu: " + str(blue_sum/blue_size) + " | "
            to_log += "Time: " + str(int(time.time())-start_time) + " | "
            to_log += "ETA: " + str(int((int(time.time())-start_time)/(blue_size/len(dataset)*(1-(blue_size/len(dataset)))))) + " sec" + " | "
            
            print(to_log, file=log)
            log.flush()
            test_tsv.flush()
            print(to_log)
            

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A deco

	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="man"]	All tokens that are not determiners, adjectives, cardinal numbers, or nouns followed by the word "man."
	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="man"]	Any token that does not match the tags DT, JJ, CD, or is not a noun, followed by the word "man."
- [non-determiner]	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="man"]	Non-determiner, non-adjective, non-number, and non-noun tokens that precede the word "man."
by[not[lemma="man"]][not[lemma="dog"]][not[lemma="cat"]]	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="man"]	Tokens excluding determiners, adjectives, numbers, and nouns followed by "man."
	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="man"]	Any token that is not tagged as DT, JJ, CD, or is a noun right before the word "man."
	[tag !="DT" & tag !="JJ.*" & tag !="CD" & tag !="N.*"] [word="man"]	A sequence where the preceding token fits none of the

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


The verb "to be" is followed by "not" or "not".	[lempos="determine-v"]	Forms of the verb determine.
We need to look for a verb form.	[lempos="determine-v"]	Only verb forms determine.
	[lempos="determine-v"]	All lexemes for the verb 'determine'.
#verb	[lempos="determine-v"]	Any verb forms of determine.
lemma=verb	[lempos="determine-v"]	Lexical items with the verb lemma determine.
#lemma	[lempos="determine-v"]	All occurrences of the verb lemma as determine.
-lemma="determine"	[lempos="determine-v"]	Lexical forms labeled as determine in verb category.
1. [word="s?pr?l?ay"]	[word= "s?pr?l?ay"]	Any word that matches the pattern s?pr?l?ay.
	[word= "s?pr?l?ay"]	Words that contain variations of "spray."
-lemma="s" or lemma="pr" or lemma="l" or lemma="ay"	[word= "s?pr?l?ay"]	Words before or after a possible 's', 'pr', 'l', or 'ay'.
This word is not a valid CQL query.	[word= "s?pr?l?ay"]	Words resembling the spelling of "spray".
	[word= "s?pr?l?ay"]	All words that include 'spray' with optional p

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


	[lemma="rude"]	All base forms of rude.
	[lemma="rude"]	Rude as a lemma.
def rude: (word)	[lemma="rude"]	Rude in its base form.
...[lemma="rude"]?	[lemma="rude"]	Token with the lemma rude.
"rude"	[lemma="rude"]	Examples of the lemma rude.
-lemma="rude"	[lemma="rude"]	All variations on the lemma rude.
[lemma="rude"]	[lemma="rude"]	Instances of lemma rude.
-lemma="rude"	[lemma="rude"]	Words with the lemma rude.
"rude"	[lemma="rude"]	Tokens with the lemma "rude".
1. rude	[lemma="rude"]	Rude as a word form.
	[lemma="rude"]	All forms of rude.
#rude	[lemma="rude"]	Rude in the corpus.
	[lemma="rude"]	Tokens defined by the lemma rude.
	[lemma="remember"][tag="VVG"]	Lemma remember followed by a verb in gerund form.
In order to remember the lemma "be" optionally followed by a gerund verb, we use the following query.	[lemma="remember"][tag="VVG"]	All instances of the lemma remember followed by a gerund verb.
\[\verb|remember|VVG]	[lemma="remember"][tag="VVG"]	"Remember" as a lemma followed by a v

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


-lemma="a"[word="number"][word="of"]?	[word=="a"] [word=="number"] [word=="of"]	Word "a" followed by the word "number" and then the word "of".
- [a] number - [of]	[word=="a"] [word=="number"] [word=="of"]	The phrase starts with "a", followed by "number", and ends with "of".
	[word=="a"] [word=="number"] [word=="of"]	Three consecutive words: "a", "number", "of".
	[word=="a"] [word=="number"] [word=="of"]	The specific order of the words "a", "number", and "of".
#a number of	[word=="a"] [word=="number"] [word=="of"]	The tokens must include "a" first, then "number", followed by "of".
	[word=="a"] [word=="number"] [word=="of"]	Looking for the exact phrase "a number of".
\w+ [a][\w+][\w+]	[word=="a"] [word=="number"] [word=="of"]	An exact match of the words "a", "number", and "of" in this order.
|word="a" |pos="N" |word="number" |pos="V" |word="of" |pos="N"	[word=="a"] [word=="number"] [word=="of"]	Finding the words where "a" precedes "number" and "of" comes next.
[lemma="de" ][word="de"]?	[

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


1. are [lemma="be"] word: "good" [pos: NNS] [lemma: "you"] [word: "good"] 2. are [lemma: "be"] word: "good" [pos: NN] [lemma: "you"] [word: "good"]	[word="are|Are"][word="you"][word="good"]	Sequences containing "are" or "Are" followed by "you" and concluding with "good".
# "are" or "Are" with "you" and "good"	[word="are|Are"][word="you"][word="good"]	Using "are" or "Are" with "you" and "good".
"Are" or "Are" appears directly before "you" which in turn comes before "good".	[word="are|Are"][word="you"][word="good"]	"Are" or "Are" appears directly before "you", which in turn comes before "good".
“are”[lemma=”you”][“good”]?	[word="are|Are"][word="you"][word="good"]	The expressions that start with "are" or "Are", then "you", and finally "good".
[lemma="are"][word="you"][word="good"]	[word="are|Are"][word="you"][word="good"]	Identify the phrase made of "are" or "Are", followed by "you", and then "good".
-lemma="think"[word="over"]?	[lemma="think"] [word="over"]	Lemma think followed by the wo

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


	[lempos="hard-j"]	A token containing the lemma hard tagged as an adjective.
AFTER[ADJ="hard"]	[lempos="hard-j"]	All instances of the adjective hard.
#lemma="hard"	[lempos="hard-j"]	Lemma of hard as an adjective.
def lemma("j")	[lempos="hard-j"]	Only the forms of the lemma hard tagged with 'j' for adjective.
The word "hard" in its adjectival form.	[lempos="hard-j"]	Any occurrence of hard in its adjectival form.
...[adjective="hard"]	[lempos="hard-j"]	Adjective hard in the corpus.
	[lempos="hard-j"]	All representations of the lemma hard as an adjective.
	[lc="determine"]	Lowercase form of determine.
AFTER	[lc="determine"]	Token containing the lowercase word "determine."
by [lemma="determine"]?	[lc="determine"]	Only instances of "determine" in lowercase.
	[lc="determine"]	All tokens that are in lowercase and match "determine."
	[lc="determine"]	Any lowercase occurrence of the word determine.
	[lc="determine"]	All instances of the lowercase word "determine."
1 determine?	[lc="determine"]	

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


We're trying to find the words that are pay and then followed by price.	[lemma="pay"] [ ] [lemma="price"]	Words with the lemma pay followed by a word and then the lemma price.
	[lemma="pay"] [ ] [lemma="price"]	Pay followed by any word and then followed by price.
The word "pay" followed by a token "and" followed by the word "price".	[lemma="pay"] [ ] [lemma="price"]	Any token coming after pay and before price.
	[lemma="pay"] [ ] [lemma="price"]	Show me examples of pay with any token in between, followed by price.
What are the prices of things that pay?	[lemma="pay"] [ ] [lemma="price"]	Finding occurrences of the verb pay, any word in between, and then price.
#word="work"	[word="work"]	Word work.
def word: work	[word="work"]	Token containing the word work.
	[word="work"]	Only the word work.
What is the work?	[word="work"]	All instances of the word work.
# work	[word="work"]	Any occurrence of the word work.
[word="work"][lemma="work"]?	[word="work"]	Examples of the word work.
#work	[word

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


"be" + word + "paint" + "ing"	[tag="VB.*"][]{0,1}[lemma="paint" & tag="VVG"]	Any verb followed by any token and then by "paint" as a verb ending in -ing.
	[tag="VB.*"][]{0,1}[lemma="paint" & tag="VVG"]	Verbs that may be followed by an optional word and then by the gerund form of paint.
	[tag="VB.*"][]{0,1}[lemma="paint" & tag="VVG"]	One or more verbs followed optionally by a token leading to the verb "painting."
We can paint [verb="paint"][gerund="paint"]	[tag="VB.*"][]{0,1}[lemma="paint" & tag="VVG"]	Verbs marked as actions followed by a possible token and then specifically the gerund "paint."
The verb "paint" in the continuous aspect.	[tag="VB.*"][]{0,1}[lemma="paint" & tag="VVG"]	All forms of verbs succeeded by an optional token and followed by the verb "paint" in the continuous aspect.
defn:verb of action     :action     :verb     :verbVVG     :verbVVG     :verbVVG     :verbVVG     :verbVVG     :verbVVG     :verbVVG     :verbVVG     :verbVVG     :verbVVG     :verbVVG     :verbVVG  

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


	[lemma="sing" & tag="V.*"]	- All occurrences of the verb with the lemma sing.
	[lemma="sing" & tag="V.*"]	- The lemma sing as a verb.
[word="of"][noun]	[tag="N.*"][word="of"][word=".*"]{0,2}[lemma="tree"][tag="N.*"]	A noun followed by the word "of" followed by zero to two unspecified words followed by the lemma tree and then another noun.
The word "of" followed by a noun, followed by up to two words, continuing with "tree" followed by a noun.	[tag="N.*"][word="of"][word=".*"]{0,2}[lemma="tree"][tag="N.*"]	Any noun preceded by "of" and followed by up to two words, continuing with "tree" followed by a noun.
	[tag="N.*"][word="of"][word=".*"]{0,2}[lemma="tree"][tag="N.*"]	Nouns that appear with "of" and potentially one or two words before "tree" and concluding with another noun.
# of "tree"	[tag="N.*"][word="of"][word=".*"]{0,2}[lemma="tree"][tag="N.*"]	Any noun comes first, is followed by "of," then possibly one or two other words, then the term "tree," and finishes with another noun.
d

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


The word "submit" followed by a noun.	[lemma="submit"]	Lemma submit.
	[lemma="submit"]	One token containing lemma submit.
A [lemma="not"]A	[lemma="submit"]	Only all lemmas submit.
	[lemma="submit"]	Lemma of submits.
1	[lemma="submit"]	Any lemma submit.
#baseForm	[lemma="submit"]	All base forms of submit.
	[lemma="submit"]	Token with lemma submit.
	[lemma="submit"]	Examples of the lemma submit.
In all instances of lemma="submit"	[lemma="submit"]	All instances of the lemma submit.
The word "submit" and "not" match.	[lemma="submit"]	Lemmas that match submit.
The submit token	[lemma="submit"]	Any token with submit as the lemma.
	[lc=="leg-like"]	Lowercase word "leg-like".
The word "leg-like"?	[lc=="leg-like"]	Only lowercase form "leg-like".
-lemma="leg-like"	[lc=="leg-like"]	Exact match for "leg-like" in lowercase.
	[lc=="leg-like"]	Token with lowercase "leg-like".
#word=leg-like	[lc=="leg-like"]	Case-sensitive match for the word "leg-like".
We want to match "leg-like" as a lowercase word.

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


[TO][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][verb][	<s>[tag="TO"][tag="V.*"][]{1,5}[tag="VB.*"] within <s/>	Examples of sentences where the sequence begins with TO followed by a verb, continues with up to five other tokens, and ends with a verb, all within a single sentence structure.
"TO" and "be" and "not" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN" and "NN"	<s>[tag="TO"][tag="V.*"][]{1,5}[tag="VB.*"] within <s/>	Identifying sentences that contain the tag TO followed by a verb, with a maximum of five words following, and ending with any verb within the sentence.
#lemma="be" or "is" or "isn't" or "was" or

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


# if the cue is "whether" then: [lemma="like"]	[tag="V.*"][word="whether"][tag="PP"][lemma="like"][tag="PP.?"][word="or"][word="not"]	A narrative thread of verbs following the cue of "whether", expanding into "like".
	[tag="V.*"][word="whether"][tag="PP"][lemma="like"][tag="PP.?"][word="or"][word="not"]	Compositions of verbs that unravel scenario possibilities presented by “whether” and "like".
What's the name of the verb "like"?	[tag="V.*"][word="whether"][tag="PP"][lemma="like"][tag="PP.?"][word="or"][word="not"]	Verbs innate in the exploration of paths defined by "whether", yielding attention to "like".
We are looking for a verb that falls before "whether" and is followed by "like".	[tag="V.*"][word="whether"][tag="PP"][lemma="like"][tag="PP.?"][word="or"][word="not"]	Syntax of verbs falling intimately before "whether", leading to discussions on "like".
	[tag="V.*"][word="whether"][tag="PP"][lemma="like"][tag="PP.?"][word="or"][word="not"]	Understanding verbs as a frontier to explor

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


#think in [noun]	[lemma="think"& tag="V.*"][tag="IN"][tag="N.*"] within <s/>	Any usage of the verb think, then "in," followed by a noun confined to sentences.
# think, in, in	[lemma="think"& tag="V.*"][tag="IN"][tag="N.*"] within <s/>	Instances of the verb think, a preposition "in," and a noun appearing within sentences.
The "sustainable" lemma.	[lemma_lc="sustainable"]	Lemma lowercase sustainable.
\w{lemma="sustainable"}/	[lemma_lc="sustainable"]	One token containing the lowercase lemma sustainable.
	[lemma_lc="sustainable"]	Only all lowercase lemmas sustainable.
The sustainable form of sustainable.	[lemma_lc="sustainable"]	Lowercase form of sustainable.
# lemma	[lemma_lc="sustainable"]	Any lowercase lemma sustainable.
def sustainable	[lemma_lc="sustainable"]	The base form sustainable in lowercase.
	[lemma_lc="sustainable"]	Occurrences of the term sustainable in lowercase.
	[lemma_lc="sustainable"]	All instances of the lowercase lemma sustainable.
	[lemma_lc="sustainable"]	Lowercase v

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


#noun	[tag="N.*|PP.?"][tag="MD"]?[tag="VH"]?[tag="VHD"]?[tag="VBN"]?[lemma="fume"&tag="V.*"][tag="DT|CD|PDT"]?[tag="N.*|PP.?"]	A noun or prepositional phrase that can be followed, but not required, by a modal verb.
#[noun|preposition|verb]	[tag="N.*|PP.?"][tag="MD"]?[tag="VH"]?[tag="VHD"]?[tag="VBN"]?[lemma="fume"&tag="V.*"][tag="DT|CD|PDT"]?[tag="N.*|PP.?"]	Nouns or prepositional phrases potentially followed by modality, then optionally followed by auxiliary or main verbs.
#verb1 verb2 verb3 verb4 verb5 verb6 verb7 verb8 verb9 verb10 verb11 verb12 verb13 verb14 verb15 verb16 verb17 verb18 verb19 verb20 verb21 verb22 verb23 verb24 verb25 verb26 verb27 verb28 verb29 verb30 verb31 verb32 verb33 verb34 verb35 verb36 verb37 verb38 verb39 verb40 verb41 verb42 verb43 verb44 verb45 verb46 verb47 verb48 verb49	[tag="N.*|PP.?"][tag="MD"]?[tag="VH"]?[tag="VHD"]?[tag="VBN"]?[lemma="fume"&tag="V.*"][tag="DT|CD|PDT"]?[tag="N.*|PP.?"]	A structure that allows for a noun or prepositional phrase follow

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


# want # [preposition] # verb # in # the # present # participle # form	[lemma="want"] [tag="PP"] [tag="VBG"]	The token want followed by a preposition and then a verb in the present participle form.
\[     [want="want"][prep="want"][action="want"] ]	[lemma="want"] [tag="PP"] [tag="VBG"]	Query for want followed by a preposition and then a continuous action (VBG).
In the phrase "want to", "wanting to" or "wanting" followed by a verb in the gerund.	[lemma="want"] [tag="PP"] [tag="VBG"]	Occurrences of the lemma want and then a phrase indicating a preposition followed by a verb in the gerund.
	[lemma="open"]	Lemma open.
\{lemma="open"}/	[lemma="open"]	Token with lemma open.
The word "open" or "openness"	[lemma="open"]	All instances of the lemma open.
	[lemma="open"]	Any form of open as lemma.
The word "open" followed by a noun.	[lemma="open"]	Tokens containing the lemma open.
	[lemma="open"]	Base form of the word open.
	[lemma="open"]	Occurrences of open as a lemma.
# lemma "be" open	[lemma=

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


#verb_tagged_as[verb_tagged_as="able"][verb_tagged_as="to"]#	[tag="VB|VBD|VBG|VBN|VBP|VBZ"][word="able"][word="to"]	Words tagged as verbs that directly precede "able" and "to".
We can say: [verb1:able][verb2:to]	[tag="VB|VBD|VBG|VBN|VBP|VBZ"][word="able"][word="to"]	A sequence of verbs followed by the word "able" and the word "to".
\verb[verb1\verb[verb2\verb[verb3]]	[tag="VB|VBD|VBG|VBN|VBP|VBZ"][word="able"][word="to"]	Tokens of various verb forms leading into "able" and "to".
	[tag="VB|VBD|VBG|VBN|VBP|VBZ"][word="able"][word="to"]	Instances where a verb comes before "able", followed by "to".
- [verb:able] - [verb:to] - [word="to" - [word="able"]	[tag="VB|VBD|VBG|VBN|VBP|VBZ"][word="able"][word="to"]	Verbs in any of the specified forms preceding "able" and "to".
The verb "able" followed by "to" and a noun.	[tag="VB|VBD|VBG|VBN|VBP|VBZ"][word="able"][word="to"]	Verbs that relate to "able" and are followed by "to".
1. verb form "able" + "to" + "able" + "to" + "able" + "to" + "able" + "

ZeroDivisionError: float division by zero