In [1]:
import transformers
import torch
import cql_checker
import numpy as np

In [2]:
prompt_prefix = ""
with open("prompt_prefix.txt", "r") as file:
    prompt_prefix = file.read()
print(prompt_prefix)

All forms of the lemma people. -> [lemma="people"]
The word is precisely selfish in lowercase.   -> [lc=="selfish"]
Identical to the lowercase form selfish.   -> [lc=="selfish"]
Verb lemma pick followed by a prepositional phrase and then a particle. -> [lemma="pick" & tag="V.*"][tag="PP.?"][tag="RP"]
Examples of adjectives followed by attitude.   -> [tag="J.*"][lemma="attitude"]
All forms of the verb know.   -> [lempos="know-v"]
The sequence consists of "in" (case insensitive), followed by "the", and ending with "sky".   -> [word="in|In"][word="the"][word="sky"]
The verb 'test' in all its forms. -> [lemma = "test" & tag = "V.*"]
Here are some natural descriptions for the provided Corpus Query Language example: -> [word!="(?i)as"] [word!="a|(?i)in"] [lemma="consequence"]
At least one lemma "very" preceding a token tagged as superlative adjective. -> [lemma="very"] + [tag="JJS"]
Token marked as 'burst-v'.   -> [lempos="burst-v"]
Lemma "as" followed by a noun.   -> [lemma="as"] [tag="N.*"

In [3]:
class CQLLogitsProcessor(transformers.generation.logits_process.LogitsProcessor):
    def __init__(self, id2string, verbose = False):
        self.id2string = id2string
        self.checker = cql_checker.CQLChecker()
        self.checker.eos_token_id = 1
        self.checker.ignore_tokens.append(0)
        self.checker.ignore_tokens.append(2)
        self.verbose = verbose

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        scores[0][2] = -inf
        scores[0][3] = -inf
        while True:
            token_id = int(np.argmax(scores[0].to("cpu")))
            if self.checker.add_string(self.id2string[token_id], token_id):
                scores[0][token_id] = inf
                if self.verbose:
                    print("Accepted: ", self.id2string[token_id])
                break
            scores[0][token_id] = -inf
            if self.verbose:
                print("Rejected: ", self.id2string[token_id])
        return scores

In [4]:
class CQLRemoteLogitsProcessor(transformers.generation.logits_process.LogitsProcessor):
    def __init__(self, id2string, verbose = False):
        self.id2string = id2string
        self.checker = cql_checker.CQLRemoteChecker("preloaded/bnc2_tt21", "912b3694e685d7ff2b4cc8fdfe0e94cd")
        self.checker.eos_token_id = 1
        self.checker.ignore_tokens.append(0)
        self.checker.ignore_tokens.append(2)
        self.verbose = verbose

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        scores[0][2] = 0
        scores[0][3] = 0
        while True:
            token_id = int(np.argmax(scores[0].to("cpu")))
            if self.checker.add_string(self.id2string[token_id], token_id):
                scores[0][token_id] = 1000000
                if self.verbose:
                    print("Accepted: ", self.id2string[token_id])
                break
            scores[0][token_id] = 0
            if self.verbose:
                print("Rejected: ", self.id2string[token_id])
        return scores

In [5]:
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_name = "meta-llama/Llama-3.2-1B"
adapter_model_name = "outputs/checkpoint-21245"

model = AutoModelForCausalLM.from_pretrained(base_model_name)
model = PeftModel.from_pretrained(model, adapter_model_name)

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

In [6]:
id2token = tokenizer.convert_ids_to_tokens(list(range(tokenizer.vocab_size)))

In [7]:
id2string = []
for token in id2token:
    id2string.append(tokenizer.convert_tokens_to_string([token]))

In [8]:
inputs = tokenizer("""translate: Any word -> []
translate: Any word followed by verb -> [][tag="V.*"]
translate: Any word followed by lemma door as tag noun and then starting with open within sentence. ->[""", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


translate: Any word -> []
translate: Any word followed by verb -> [][tag="V.*"]
translate: Any word followed by lemma door as tag noun and then starting with open within sentence. ->[][tag="N.*"][][tag="V.*"]
translate: Any word followed by lemma window


In [9]:
def f_in(data):
    return [prompt_prefix + "translate: " + data["text"][0] + " -> " + data["cql"][0] + "<eos>"]

In [15]:
inputs = tokenizer(prompt_prefix + """translate: Any word that begins with 'not' and ends with 'to'. ->""", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


All forms of the lemma people. -> [lemma="people"]
The word is precisely selfish in lowercase.   -> [lc=="selfish"]
Identical to the lowercase form selfish.   -> [lc=="selfish"]
Verb lemma pick followed by a prepositional phrase and then a particle. -> [lemma="pick" & tag="V.*"][tag="PP.?"][tag="RP"]
Examples of adjectives followed by attitude.   -> [tag="J.*"][lemma="attitude"]
All forms of the verb know.   -> [lempos="know-v"]
The sequence consists of "in" (case insensitive), followed by "the", and ending with "sky".   -> [word="in|In"][word="the"][word="sky"]
The verb 'test' in all its forms. -> [lemma = "test" & tag = "V.*"]
Here are some natural descriptions for the provided Corpus Query Language example: -> [word!="(?i)as"] [word!="a|(?i)in"] [lemma="consequence"]
At least one lemma "very" preceding a token tagged as superlative adjective. -> [lemma="very"] + [tag="JJS"]
Token marked as 'burst-v'.   -> [lempos="burst-v"]
Lemma "as" followed by a noun.   -> [lemma="as"] [tag="N.*"

In [11]:
inputs = tokenizer("translate: Any word. ->", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


translate: Any word. -> Any word. -> Any word. -> Any word. -> Any word. -> Any word. ->


In [12]:
processor = CQLRemoteLogitsProcessor(id2string, verbose=True)
logits_processor_list = transformers.generation.logits_process.LogitsProcessorList([
    processor,
])
sentence_tokenized = tokenizer(
    "translate: Any form of the door followed by verb within a sentence.",
    return_tensors="pt",
)
generated_ids = model.generate(
      sentence_tokenized.input_ids,
      logits_processor=logits_processor_list,
      max_length=200,
)
" ".join(tokenizer.convert_ids_to_tokens(generated_ids[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Accepted:    
Rejected:   Trans
Rejected:   Translate
Rejected:   The
Rejected:   translate
Rejected:  Translate
Rejected:  -translate
Rejected:  _translate
Rejected:  translate
Rejected:   *
Rejected:   Travel
Rejected:   Translation
Rejected:   trans
Rejected:   Example
Rejected:   Tr
Rejected:   TRAN
Rejected:  .translate
Rejected:   This
Rejected:   Gl
Rejected:   Examples
Rejected:   Tran
Rejected:   In
Rejected:   Talk
Rejected:   T
Rejected:  .Translate
Rejected:  _trans
Rejected:   TRANS
Rejected:   Transform
Rejected:   For
Rejected:   +
Rejected:   To
Rejected:   Each
Rejected:   An
Rejected:   All
Rejected:  Trans
Rejected:   A
Rejected:   translates
Rejected:   If
Accepted:   "
Accepted:  The
Accepted:  "
Accepted:   followed
Accepted:   by
Accepted:   a
Accepted:   noun
Accepted:  ,
Accepted:   then
Accepted:   "
Accepted:  of
Accepted:  ,"
Accepted:   another
Accepted:   noun
Accepted:  ,
Accepted:   and
Accepted:   finally
Accepted:   "
Accepted:  are
Rejected:  "
Accept

'<|begin_of_text|> translate : ĠAny Ġform Ġof Ġthe Ġdoor Ġfollowed Ġby Ġverb Ġwithin Ġa Ġsentence . ĠĠ Ġ" The " Ġfollowed Ġby Ġa Ġnoun , Ġthen Ġ" of ," Ġanother Ġnoun , Ġand Ġfinally Ġ" are "/ Ġ" were ." ĠĠĊ The Ġsequence Ġof Ġ" The ," Ġfollowed Ġby Ġ" problem ," Ġfollowed Ġby Ġ" that ." Ġ Ġ" The ," Ġfollowed Ġby Ġa Ġnoun , Ġand Ġthen Ġa Ġverb . Ġ Ġ" The ," Ġfollowed Ġby Ġa Ġnoun , Ġand Ġthen Ġa Ġverb . Ġ Ġ" The ," Ġfollowed Ġby Ġa Ġnoun , Ġand Ġthen Ġa Ġverb . Ġ Ġ" The ," Ġfollowed Ġby Ġa Ġnoun , Ġand Ġthen Ġa Ġverb . Ġ Ġ" The ," Ġfollowed Ġby Ġa Ġnoun , Ġand Ġthen Ġa Ġverb . Ġ Ġ" The ," Ġfollowed Ġby Ġa Ġnoun , Ġand Ġthen Ġa Ġverb . Ġ Ġ" The ," Ġfollowed Ġby Ġa Ġnoun , Ġand Ġthen Ġa Ġverb . Ġ Ġ" The ," Ġfollowed Ġby Ġa Ġnoun , Ġand Ġthen Ġa Ġverb . Ġ Ġ" The ," Ġfollowed Ġby Ġa Ġnoun , Ġand Ġthen Ġa Ġverb . Ġ Ġ" The ," Ġfollowed Ġby Ġa Ġnoun , Ġand Ġthen Ġa Ġverb . Ġ Ġ" The ,"'