-
Notifications
You must be signed in to change notification settings - Fork 0
/
ner-classification-with-custom-models.py
296 lines (224 loc) · 14.1 KB
/
ner-classification-with-custom-models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
import evaluate
from datetime import datetime
import pandas as pd
import numpy as np
import json
from simpletransformers.ner import NERModel, NERArgs
from tqdm.autonotebook import tqdm as notebook_tqdm
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import logging
import sklearn
from numba import cuda
import argparse
import gc
import torch
import time
# Quieten the "transformers" logger to warnings and above,
# while the root logger is configured to report from INFO upwards.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)
# Dataset selection and loading.
# Disabled alternative: take a single dataset path from the command line.
#if __name__ == '__main__':
# parser = argparse.ArgumentParser()
# parser.add_argument("dataset", help="path to the dataset in JSON format")
# args = parser.parse_args()
# Disabled: path of the single dataset, taken from the parsed arguments
#dataset_path = args.dataset
# Disabled: the full list of four datasets; only the list assigned below is used
#datasets = ["datasets/hr500k.conllup_extracted.json", "datasets/reldi-normtagner-hr.conllup_extracted.json", "datasets/reldi-normtagner-sr.conllup_extracted.json", "datasets/set.sr.plus.conllup_extracted.json"]
datasets = ["datasets/reldi-normtagner-sr.conllup_extracted.json"]
#dataset_path = args.dataset
# Process each dataset in turn.
# NOTE(review): the extent of this loop's body cannot be told from this listing -
# confirm against the indented file which of the following statements belong to it.
for dataset_path in datasets:
# Load the JSON file; the keys "train", "test", "dev" and "labels" are read from it below
with open(dataset_path, "r") as file:
json_dict = json.load(file)
# Turn the train, test and dev entries into DataFrames
train_df = pd.DataFrame(json_dict["train"])
test_df = pd.DataFrame(json_dict["test"])
dev_df = pd.DataFrame(json_dict["dev"])
# Replace the sentence_id values with integer codes (computed separately for each split)
test_df['sentence_id'] = pd.factorize(test_df['sentence_id'])[0]
train_df['sentence_id'] = pd.factorize(train_df['sentence_id'])[0]
dev_df['sentence_id'] = pd.factorize(dev_df['sentence_id'])[0]
# The label inventory comes from the dataset file itself
LABELS = json_dict["labels"]
# Quick sanity output: the labels, the size of each split and the first training rows
print(LABELS)
print(train_df.shape, test_df.shape, dev_df.shape)
print(train_df.head())
# Base training arguments shared by all models; they are handed to NERModel as `args`.
model_args = dict(
    overwrite_output_dir=True,
    num_train_epochs=5,
    labels_list=LABELS,
    learning_rate=4e-05,
    train_batch_size=32,
    # Drop no_cache and no_save if you want to save the model
    no_cache=True,
    no_save=True,
    max_seq_length=256,
    save_steps=-1,
    silent=True,
)
# Checkpoint path -> short model name, for the models based on XLM-R-base
base_dict = {
    "/cache/nikolal/xlmrb_bcms_exp/checkpoint-12000": "xlmrb_bcms-12",
    "/cache/nikolal/xlmrb_bcms_exp/checkpoint-24000": "xlmrb_bcms-24",
    "/cache/nikolal/xlmrb_bcms_exp/checkpoint-36000": "xlmrb_bcms-36",
    "/cache/nikolal/xlmrb_bcms_exp/checkpoint-48000": "xlmrb_bcms-48",
    "/cache/nikolal/xlmrb_bcms_exp/checkpoint-60000": "xlmrb_bcms-60",
    "/cache/nikolal/xlmrb_bcms_exp/checkpoint-72000": "xlmrb_bcms-72",
    "/cache/nikolal/xlmrb_bcms_exp/checkpoint-84000": "xlmrb_bcms-84",
    "/cache/nikolal/xlmrb_bcms_exp/checkpoint-96000": "xlmrb_bcms-96",
}

# Checkpoint path -> short model name, for the models based on XLM-R-large
large_dict = {
    "/cache/nikolal/xlmrl_bcms_exp/checkpoint-6000": "xlmrl_bcms-6",
    "/cache/nikolal/xlmrl_bcms_exp/checkpoint-12000": "xlmrl_bcms-12",
    "/cache/nikolal/xlmrl_bcms_exp/checkpoint-18000": "xlmrl_bcms-18",
    "/cache/nikolal/xlmrl_bcms_exp/checkpoint-24000": "xlmrl_bcms-24",
    "/cache/nikolal/xlmrl_bcms_exp/checkpoint-30000": "xlmrl_bcms-30",
    "/cache/nikolal/xlmrl_bcms_exp/checkpoint-36000": "xlmrl_bcms-36",
    "/cache/nikolal/xlmrl_bcms_exp/checkpoint-42000": "xlmrl_bcms-42",
    "/cache/nikolal/xlmrl_bcms_exp/checkpoint-48000": "xlmrl_bcms-48",
    "/cache/nikolal/xlmrl_sl-bcms_exp/checkpoint-6000": "xlmrl_sl-bcms-6",
    "/cache/nikolal/xlmrl_sl-bcms_exp/checkpoint-12000": "xlmrl_sl-bcms-12",
    "/cache/nikolal/xlmrl_sl-bcms_exp/checkpoint-18000": "xlmrl_sl-bcms-18",
    "/cache/nikolal/xlmrl_sl-bcms_exp/checkpoint-24000": "xlmrl_sl-bcms-24",
    "/cache/nikolal/xlmrl_sl-bcms_exp/checkpoint-30000": "xlmrl_sl-bcms-30",
    "/cache/nikolal/xlmrl_sl-bcms_exp/checkpoint-36000": "xlmrl_sl-bcms-36",
    "/cache/nikolal/xlmrl_sl-bcms_exp/checkpoint-42000": "xlmrl_sl-bcms-42",
    "/cache/nikolal/xlmrl_sl-bcms_exp/checkpoint-48000": "xlmrl_sl-bcms-48",
}

# A single extra checkpoint, used for additional runs
add_dict = {"/cache/nikolal/xlmrl_sl-bcms_exp/checkpoint-42000": "xlmrl_sl-bcms-42"}

# Every checkpoint: the base entries followed by the large entries
path_list = {**base_dict, **large_dict}

# The checkpoint paths of each group, in insertion order
base_list = list(base_dict)
large_list = list(large_dict)
add_list = list(add_dict)
all_models_list = list(path_list)
def train_and_save_checkpoint(model_path, train_df, LABELS, model_args):
    """Fine-tune a custom pre-training checkpoint for one epoch and save it to ``models/``.

    Our own pre-trained checkpoints carry settings that force pre-training to
    resume from a specific step, which disables fine-tuning. Training once from
    the original checkpoint and saving the result overwrites those settings;
    the model saved in ``models/`` can then be fine-tuned like any other model.

    Args:
        model_path: Path of the pre-training checkpoint to start from.
        train_df: Training data, passed to ``NERModel.train_model``.
        LABELS: Label list, passed to ``NERModel`` as ``labels``.
        model_args: Base model arguments (a dict). It is NOT modified: the
            overrides needed here are applied to a copy, so that saving to
            ``models/`` and the one-epoch setting do not leak into later
            training runs that reuse the same dict.

    Returns:
        None. The fine-tuned model is written to ``models/``.
    """
    # Folder where the reusable model is saved
    new_model_path = "models/"

    # Overrides specific to this preparatory step, applied to a copy of the shared arguments.
    # The key must be "num_train_epochs" (plural) - the same key that the base arguments use.
    checkpoint_args = dict(model_args)
    checkpoint_args["output_dir"] = new_model_path
    checkpoint_args["no_save"] = False
    checkpoint_args["num_train_epochs"] = 1

    # Define the model
    current_model = NERModel(
        "xlmroberta",
        model_path,
        labels=LABELS,
        use_cuda=True,
        args=checkpoint_args,
    )

    print("Training of pre-trained model started. Current model: {}".format(model_path))

    # Fine-tune the model
    current_model.train_model(train_df)

    print("Training of pre-trained model completed.")
    print("Model saved in models/")

    # Drop the last reference to the model first, otherwise there is nothing to collect
    del current_model

    # Clean cache
    gc.collect()
    torch.cuda.empty_cache()
# Second stage: the model saved in models/ is fine-tuned properly and then tested
def train_and_test(model, train_df, test_df, dataset_path, LABELS, model_args):
    """Fine-tune the model stored in ``models/`` and evaluate it on ``test_df``.

    The number of epochs depends on the dataset (derived from ``dataset_path``)
    and on the model size (derived from the short model name ``model``):
    names containing "xlmrb" use the XLM-R-base values, names containing
    "xlmrl" the XLM-R-large values. The chosen value is written into
    ``model_args["num_train_epochs"]``; other names leave it as it is.

    Args:
        model: Short model name, e.g. "xlmrl_sl-bcms-42".
        train_df: Training data, passed to ``train_model``.
        test_df: Test data, passed to ``eval_model``; its ``labels`` column is the gold standard.
        dataset_path: Path of the dataset; used to pick the epochs and to name the run.
        LABELS: Label list, passed to ``NERModel`` as ``labels``.
        model_args: Model arguments (a dict); ``num_train_epochs`` may be overwritten in place.

    Returns:
        The dict returned by ``evaluate.testing``, extended with the keys
        "y_true", "y_pred" and "results_output".
    """
    # Epochs per dataset type: (XLM-R-base sized models, XLM-R-large sized models)
    # - when we extend the code for SL, change this
    epochs_per_dataset = {
        "standard_hr": (5, 7),
        "non_standard": (8, 11),
        "standard_sr": (6, 13),
    }

    # Work out which type of dataset we are using
    if "reldi" in dataset_path:
        dataset_type = "non_standard"
    elif "set.sr" in dataset_path:
        dataset_type = "standard_sr"
    else:
        dataset_type = "standard_hr"

    # Use the epochs of the matching model size
    base_epochs, large_epochs = epochs_per_dataset[dataset_type]
    if "xlmrb" in model:
        model_args["num_train_epochs"] = base_epochs
    elif "xlmrl" in model:
        model_args["num_train_epochs"] = large_epochs

    # The model is loaded from the folder written by the preparatory step
    current_model = NERModel(
        "xlmroberta",
        "models/",
        labels=LABELS,
        use_cuda=True,
        args=model_args,
    )

    print("Training started. Current model: {}".format(model))
    train_started = time.time()

    # Fine-tune the model
    current_model.train_model(train_df)

    print("Training completed.")
    training_time = round((time.time() - train_started) / 60, 2)
    print("It took {} minutes for {} instances.".format(training_time, train_df.shape[0]))

    # Clean cache
    gc.collect()
    torch.cuda.empty_cache()

    eval_started = time.time()

    # Evaluate the model
    results = current_model.eval_model(test_df)

    print("Evaluation completed.")
    evaluation_time = round((time.time() - eval_started) / 60, 2)
    print("It took {} minutes for {} instances.".format(evaluation_time, test_df.shape[0]))

    # The second element of the output is walked as sentences -> words -> score lists
    preds = results[1]
    id2label = current_model.config.id2label

    # One predicted label per word
    preds_list = []
    for sentence in preds:
        for word in sentence:
            # The label with the highest score, for every score list of the word
            word_labels = [id2label[scores.index(max(scores))] for scores in word]
            # Segmentation can result in multiple predictions for one word - keep the first only
            preds_list.append(word_labels[0])

    # Gold labels and the name of this run
    y_true = list(test_df.labels)
    run_name = "{}-{}".format(dataset_path, model)

    # Evaluate predictions
    metrics = evaluate.testing(
        y_true,
        preds_list,
        list(test_df.labels.unique()),
        run_name,
        show_matrix=True,
    )

    # Attach the gold labels, the predictions and the entire evaluation output
    metrics["y_true"] = y_true
    metrics["y_pred"] = preds_list
    metrics["results_output"] = results

    return metrics
# Main experiment: for each selected checkpoint, prepare a reusable model once,
# then fine-tune and test it in two runs (labelled 1 and 2).
# Other available selections: base_list, large_list, all_models_list
#model_list = base_list
#model_list = large_list
#model_list = all_models_list
model_list = add_list

for model_path in model_list:
    # First, save a fine-tuned version that we can use for proper fine-tuning
    train_and_save_checkpoint(model_path, train_df, LABELS, model_args)

    # Short model name that belongs to the checkpoint path
    model = path_list[model_path]

    for run in [1, 2]:
        current_results_dict = train_and_test(model, train_df, test_df, dataset_path, LABELS, model_args)

        # Add to the dict model name, dataset name and run
        current_results_dict["model"] = model
        current_results_dict["run"] = "{}-{}".format(model, run)
        current_results_dict["dataset"] = dataset_path

        # Append one tab-separated row with all important information to the results file
        with open("ner-results-all.txt", "a") as file:
            file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(datetime.now().strftime("%d/%m/%Y %H:%M:%S"), current_results_dict["model"], current_results_dict["run"], current_results_dict["dataset"], current_results_dict["micro F1"], current_results_dict["macro F1"], current_results_dict["label-report"]))

        # Add the predictions of this run to the original test_df
        test_df["y_pred_{}_{}".format(model, run)] = current_results_dict["y_pred"]

        # Save also y_pred and y_true. The dataset path is part of the file name and
        # brings its own directory component with it, so make sure that the target
        # directory exists - otherwise the results of a finished run would be lost here.
        backlog_path = "logs/{}-{}-{}-true-and-pred-backlog.txt".format(dataset_path, model, run)
        os.makedirs(os.path.dirname(backlog_path), exist_ok=True)
        with open(backlog_path, "w") as backlog:
            backlog.write("y-true\ty-pred\toutputs\n")
            backlog.write("{}\t{}\t{}\n".format(current_results_dict["y_true"], current_results_dict["y_pred"], current_results_dict["results_output"]))

        print("Run {} finished.".format(run))

    # Then delete the model from the /models folder
    folder_path = "models"

    # List all entries in the folder
    file_list = os.listdir(folder_path)

    # Delete each regular file (sub-folders are left in place)
    for file_name in file_list:
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            os.remove(file_path)

# At the end, save the test_df with all predictions
test_df.to_csv("{}-test_df-with-predictions-custom-models.csv".format(dataset_path))

# At the end, create a csv table with a summary of results.
# NOTE(review): read_csv takes the first line of ner-results-all.txt as the header, and the
# columns "Run", "Dataset" and "Macro F1" are used below, while the code above only appends
# data rows - confirm that the file is created with a matching header row beforehand.
results = pd.read_csv("ner-results-all.txt", sep="\t")
results["Macro F1"] = results["Macro F1"].round(2)

# One row per run, one column per dataset, Macro F1 as the values
pivot_df = results.pivot(index='Run', columns='Dataset', values='Macro F1')

# Reset the index to have 'Run' as a column
pivot_df.reset_index(inplace=True)

# Save the summary table
pivot_df.to_csv("ner-results-summary-table-large-models.csv")