-
Notifications
You must be signed in to change notification settings - Fork 2
/
generate.py
208 lines (176 loc) · 11.7 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import random
import torch
from dynamic_ecc import DynamicECC
from utils import PRF, start_model, tokenize, detokenize, binarize_setup, binarize_next, \
normalize_score, compute_score_function, consistent_perm, apply_perm
from compact_text import CompactText
###############################################
# The following code implements Algorithms 3 #
# and 4 from the paper #
# "Excuse me, sir? Your language model is #
# leaking (information)" #
# authored by Or Zamir (Tel Aviv University, #
# [email protected]). #
# Note that these algorithms are undetectable #
# only for a single query, #
# and for complete undetectability an #
# implementation of Algorithms 5 #
# and 6 is necessary. #
# The PRF implemented in utils is not #
# cryptographically secure and should be #
# replaced for secure applications. #
###############################################
# Normal response generation, with the reduction to binary token space
def generate_response_binarize(model, tokenizer, prompt, length=30):
    """Sample a plain (non-watermarked) continuation of ``prompt`` bit by bit.

    Every new token id is built from its most significant bit downwards:
    for each of the ``blen`` bit positions ``binarize_next`` returns the pair
    ``(p0, p1)`` for the next bit given the bits chosen so far, and the bit
    is set to 1 when ``random.random() < p1 / (p0 + p1)``.

    Args:
        model: Language model; called as ``model(input_ids, ...)`` and read
            through ``.logits`` and ``.past_key_values`` of its output.
        tokenizer: Tokenizer handed to ``tokenize``, ``detokenize`` and
            ``binarize_setup``; ``len(tokenizer)`` bounds the logits used.
        prompt: Prompt text.
        length: Number of tokens to generate.

    Returns:
        The result of ``detokenize`` applied to the prompt tokens followed by
        the generated tokens (the prompt is not stripped).
    """
    inputs = tokenize(prompt, tokenizer).to(model.device)
    attn = torch.ones_like(inputs)
    blen, _, _ = binarize_setup(tokenizer)
    past = None
    for _ in range(length):
        with torch.no_grad():
            if past:
                # Cached keys/values are available: feed only the newest token.
                output = model(inputs[:, -1:], past_key_values=past, attention_mask=attn)
            else:
                output = model(inputs)
        probs = torch.nn.functional.softmax(
            output.logits[:, -1, : len(tokenizer)], dim=-1
        ).cpu()[0, :]
        token_id = 0
        for ind in range(blen):
            p0, p1 = binarize_next(probs, ind, blen, token_id)
            token_id = token_id << 1
            if random.random() < p1 / (p0 + p1):
                token_id += 1
        # Create the new token on the same device as `inputs`; a CPU tensor
        # cannot be concatenated with inputs that live on an accelerator.
        token = torch.tensor([[token_id]], device=inputs.device)
        inputs = torch.cat([inputs, token], dim=-1)
        past = output.past_key_values
        attn = torch.cat([attn, attn.new_ones((attn.shape[0], 1))], dim=-1)
    return detokenize(inputs.detach().cpu()[0], tokenizer)
# Watermarked response generation, without a payload
def generate_watermarked_response(key, model, tokenizer, prompt, length=30):
    """Generate a watermarked continuation of ``prompt`` (no payload).

    Works like binarized sampling, except that the randomness for bit
    ``ind`` of the ``i``-th generated token comes from ``PRF(key, [i, ind])``
    instead of a fresh random draw: the bit is set to 1 when that value is
    below ``p1 / (p0 + p1)``, where ``(p0, p1)`` is returned by
    ``binarize_next`` for the bits chosen so far.

    Args:
        key: Watermarking key passed to ``PRF``.
        model: Language model; called as ``model(input_ids, ...)`` and read
            through ``.logits`` and ``.past_key_values`` of its output.
        tokenizer: Tokenizer handed to ``tokenize``, ``detokenize`` and
            ``binarize_setup``; ``len(tokenizer)`` bounds the logits used.
        prompt: Prompt text.
        length: Number of tokens to generate.

    Returns:
        The result of ``detokenize`` applied to the prompt tokens followed by
        the generated tokens (the prompt is not stripped).
    """
    inputs = tokenize(prompt, tokenizer).to(model.device)
    attn = torch.ones_like(inputs)
    blen, _, _ = binarize_setup(tokenizer)
    past = None
    for i in range(length):
        with torch.no_grad():
            if past:
                # Cached keys/values are available: feed only the newest token.
                output = model(inputs[:, -1:], past_key_values=past, attention_mask=attn)
            else:
                output = model(inputs)
        probs = torch.nn.functional.softmax(
            output.logits[:, -1, : len(tokenizer)], dim=-1
        ).cpu()[0, :]
        token_id = 0
        for ind in range(blen):
            p0, p1 = binarize_next(probs, ind, blen, token_id)
            token_id = token_id << 1
            # The PRF is indexed by (token position, bit position) so that a
            # detector holding the key can recompute the same values.
            if PRF(key, [i, ind]) < p1 / (p0 + p1):
                token_id += 1
        # Create the new token on the same device as `inputs`; a CPU tensor
        # cannot be concatenated with inputs that live on an accelerator.
        token = torch.tensor([[token_id]], device=inputs.device)
        inputs = torch.cat([inputs, token], dim=-1)
        past = output.past_key_values
        attn = torch.cat([attn, attn.new_ones((attn.shape[0], 1))], dim=-1)
    return detokenize(inputs.detach().cpu()[0], tokenizer)
# Watermark detection
def compute_score(key, text, tokenizer):
    """Compute the watermark detection score of ``text`` under ``key``.

    The text is tokenized, each token id is written as a ``blen``-character
    binary string, and ``compute_score_function(key, [i, ind], bit)`` is
    summed over every bit ``ind`` of every token ``i``.

    Returns:
        ``normalize_score`` applied to the summed score and the total number
        of bits (``blen`` times the number of tokens).
    """
    blen, _, _ = binarize_setup(tokenizer)
    tokens = tokenize(text, tokenizer)[0]
    total = 0
    for pos, tok in enumerate(tokens):
        # Left-pad with zeros to blen characters, keeping the low blen bits.
        bits = bin(tok)[2:].zfill(blen)[-blen:]
        for ind, bit in zip(range(blen), bits):
            total += compute_score_function(key, [pos, ind], bit)
    return normalize_score(total, blen * len(tokens))
# Generating a response with a payload (steganography)
def generate_payloaded_response(key, model, tokenizer, prompt, payload, length=30, threshold=2, bit_limit=None, temperature=1.0):
    """Generate a continuation of ``prompt`` that hides ``payload`` in it.

    Token ids are sampled bit by bit in a key-dependent permuted index space.
    The randomness for bit ``ind`` of the ``i``-th generated token is
    ``PRF(key, [i, ind, symbol])``, where ``symbol`` is the symbol currently
    requested by the ``DynamicECC`` encoder. After each scored bit, the
    running score of every candidate symbol (``'0'``, ``'1'``, ``'<'``) is
    updated. The first symbol whose normalized score exceeds ``threshold`` is
    reported to the encoder with ``ecc.update``, the next symbol is fetched,
    and the scores are reset.

    Args:
        key: Key passed to ``PRF``, ``compute_score_function`` and
            ``consistent_perm``.
        model: Language model; called as ``model(input_ids, ...)`` and read
            through ``.logits`` and ``.past_key_values`` of its output.
        tokenizer: Tokenizer handed to ``tokenize``, ``detokenize`` and
            ``binarize_setup``; ``len(tokenizer)`` bounds the logits used.
        prompt: Prompt text.
        payload: Payload handed to ``DynamicECC``.
        length: Number of tokens to generate.
        threshold: Normalized score a symbol must exceed to be emitted.
        bit_limit: If truthy, only the first ``bit_limit`` bits of each token
            update the scores (all bits are still sampled with the PRF).
            Must not exceed the number of bits per token.
        temperature: Divisor applied to the logits before the softmax.

    Returns:
        A ``(text, ecc)`` tuple: the detokenized sequence with its first
        ``len(prompt)`` characters removed, and the ``DynamicECC`` instance
        in its final state.
    """
    prompt_len = len(prompt)
    inputs = tokenize(prompt, tokenizer).to(model.device)
    attn = torch.ones_like(inputs)
    # Not necessary, but makes the token indices spread uniformly.
    perm, inv_perm = consistent_perm(key, len(tokenizer))
    blen, _, _ = binarize_setup(tokenizer)
    if bit_limit:
        assert bit_limit <= blen, "bit_limit must not exceed the number of bits per token"
    ecc = DynamicECC(payload)
    symbol = ecc.next_symbol()
    scores = {'0': 0, '1': 0, '<': 0}
    score_length = 0
    past = None
    for i in range(length):
        with torch.no_grad():
            if past:
                # Cached keys/values are available: feed only the newest token.
                output = model(inputs[:, -1:], past_key_values=past, attention_mask=attn)
            else:
                output = model(inputs)
        probs = torch.nn.functional.softmax(
            output.logits[:, -1, : len(tokenizer)] / temperature, dim=-1
        ).cpu()[0, :]
        probs_permed = apply_perm(probs, perm)
        token_id = 0
        for ind in range(blen):
            p0, p1 = binarize_next(probs_permed, ind, blen, token_id)
            token_id = token_id << 1
            if PRF(key, [i, ind, symbol]) < p1 / (p0 + p1):
                token_id += 1
            # Update symbol scores and ECC, only for the first bit_limit bits of each token
            if (not bit_limit) or (ind < bit_limit):
                score_length += 1
                for s in ['0', '1', '<']:
                    # token_id % 2 is the bit that was just chosen.
                    scores[s] += compute_score_function(key, [i, ind, s], str(token_id % 2))
                    if normalize_score(scores[s], score_length) > threshold:
                        ecc.update(s)
                        symbol = ecc.next_symbol()
                        scores = {'0': 0, '1': 0, '<': 0}
                        score_length = 0
                        break
        # Map back from the permuted index space to a real token id, and create
        # the tensor on the same device as `inputs` so torch.cat does not fail
        # when the model lives on an accelerator.
        token = torch.tensor([[inv_perm[token_id]]], device=inputs.device)
        inputs = torch.cat([inputs, token], dim=-1)
        past = output.past_key_values
        attn = torch.cat([attn, attn.new_ones((attn.shape[0], 1))], dim=-1)
    return detokenize(inputs.detach().cpu()[0], tokenizer)[prompt_len:], ecc
# Retrieving the payload
def extract_payload(key, text, tokenizer, threshold=2, bit_limit=None, skip_prefix=0):
    """Recover the payload hidden in ``text`` under ``key``.

    The text is tokenized, the first ``skip_prefix`` tokens are dropped, and
    each remaining token is mapped through the key-dependent permutation and
    written as a ``blen``-character binary string. For every scored bit the
    running score of each candidate symbol (``'0'``, ``'1'``, ``'<'``) is
    updated. The first symbol whose normalized score exceeds ``threshold`` is
    appended to the symbol stream and the scores are reset.

    Args:
        key: Key passed to ``consistent_perm`` and ``compute_score_function``.
        text: Text to decode.
        tokenizer: Tokenizer handed to ``tokenize`` and ``binarize_setup``.
        threshold: Normalized score a symbol must exceed to be emitted.
        bit_limit: If truthy, only the first ``bit_limit`` bits of each token
            are scored.
        skip_prefix: Number of leading tokens to ignore.

    Returns:
        The result of ``DynamicECC.decode`` on the recovered symbol stream.
    """
    symbols = ('0', '1', '<')

    def fresh_scores():
        # One zeroed counter per candidate symbol.
        return dict.fromkeys(symbols, 0)

    decoded = []
    tally = fresh_scores()
    bits_scored = 0
    # Not necessary, but makes the token indices spread uniformly.
    perm, _ = consistent_perm(key, len(tokenizer))
    blen, _, _ = binarize_setup(tokenizer)
    tokens = tokenize(text, tokenizer)[0][skip_prefix:]
    for pos, tok in enumerate(tokens):
        # Left-pad with zeros to blen characters, keeping the low blen bits.
        bits = bin(perm[tok])[2:].zfill(blen)[-blen:]
        for ind in range(blen):
            if bit_limit and ind >= bit_limit:
                continue
            bits_scored += 1
            for sym in symbols:
                tally[sym] += compute_score_function(key, [pos, ind, sym], bits[ind])
                if normalize_score(tally[sym], bits_scored) > threshold:
                    decoded.append(sym)
                    tally = fresh_scores()
                    bits_scored = 0
                    break
    return DynamicECC.decode(decoded)
if __name__ == '__main__':
    # --- Generating the example from the paper (Figures 1 and 3) ---
    # model, tokenizer = start_model("meta-llama/Llama-2-7b-chat-hf") # Requires a Llama token ID
    # res, ecc = generate_payloaded_response(424242, model, tokenizer, "[INST]Write an email asking my professor Prof. Hannity to not make the final exam in Machine Learning 101 too difficult. Begin directly with the body of the email.[\INST]Sure! Here is the body of such an email:", CompactText.text_to_bits("OZ"), 210, threshold=1.7, bit_limit=4, temperature=1.4)
    # assert(res == '\n\nSubject: Request for Consideration of Final Exam Difficulty in Machine Learning 101\n\nDear Professor Hannity,\n\nI hope this email finds you well. I am writing to respectfully request that you consider the level of difficulty for the final exam in Machine Learning 101. While I am confident in my understanding of the course materials and have put in a significant amount of effort throughout the semester, I do have concerns about the potential difficulty of the final exam.\n\nAs you may recall, several students in my previous sections of Machine Learning have found the final exam to be very challenging, leading to frustration and disappointment. While I understand that the course is intended to push students to their limits and beyond, I believe that some relaxation of the latter may be in order.\n\nI would kindly ask that you consider reducing the difficulty of the final exam or offering some additional supports or resources to help students prepare. I believe that this could enhance the learning experience or')
    # payload = extract_payload(424242, '\n\nSubject: Request for Consideration of Final Exam Difficulty in Machine Learning 101\n\nDear Professor Hannity,\n\nI hope this email finds you well. I am writing to respectfully request that you consider the level of difficulty for the final exam in Machine Learning 101. While I am confident in my understanding of the course materials and have put in a significant amount of effort throughout the semester, I do have concerns about the potential difficulty of the final exam.\n\nAs you may recall, several students in my previous sections of Machine Learning have found the final exam to be very challenging, leading to frustration and disappointment. While I understand that the course is intended to push students to their limits and beyond, I believe that some relaxation of the latter may be in order.\n\nI would kindly ask that you consider reducing the difficulty of the final exam or offering some additional supports or resources to help students prepare. I believe that this could enhance', tokenizer, threshold=1.7, bit_limit=4, skip_prefix=2)
    # assert(CompactText.bits_to_text(payload) == "OZ")

    # --- The plot from the paper (Figure 2) ---
    # For each response size, generate samples with random keys and prompts
    # and report how many payload bits were hidden on average.
    model, tokenizer = start_model("gpt2")
    prompts = [  # Taken from the GPT-2 official example prompts https://openai.com/research/better-language-models
        "In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.",
        "A train carriage containing controlled nuclear materials was stolen in Cincinnati today. Its whereabouts are unknown.",
        "Miley Cyrus was caught shoplifting from Abercrombie and Fitch on Hollywood Boulevard today.",
        "Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry.",
        "For today's homework assignment, please describe the reasons for the US Civil War.",
        "John F. Kennedy was just elected President of the United States after rising from the grave decades after his assassination. Due to miraculous developments in nanotechnology, Kennedy's brain was rebuilt from his remains and installed in the control center of a state-of-the art humanoid robot. Below is a transcript of his acceptance speech.",
    ]
    response_sizes = [20, 40, 60, 80, 100]
    samples_per_size = 100  # Set to 10 for a quicker run
    for size in response_sizes:
        total_bits = 0
        print(f"Making samples of size {size}:")
        for _ in range(samples_per_size):
            # Draw the key before the prompt so the RNG is consumed in a fixed order.
            sample_key = random.random()
            sample_prompt = random.choice(prompts)
            payload_bits = CompactText.text_to_bits("EXAMPLE PAYLOAD" * 5)
            res, ecc = generate_payloaded_response(
                sample_key, model, tokenizer, sample_prompt, payload_bits, size
            )
            hidden = ecc.last_index_written + 1
            print(f"Run ended while hiding {hidden} bits.")
            total_bits += hidden
        print(f"On average, encoded {total_bits / samples_per_size} bits.\n")