diff --git a/AdvancedModel.py b/AdvancedModel.py index e530173..75e3f71 100644 --- a/AdvancedModel.py +++ b/AdvancedModel.py @@ -6,16 +6,16 @@ from torch.utils.data.dataloader import DataLoader from torchtext.vocab import Vocab from torch.utils.data.dataset import Dataset, TensorDataset -from collections import Counter +from collections import Counter, OrderedDict from chu_liu_edmonds import decode_mst import matplotlib.pyplot as plt from datetime import datetime from tqdm import tqdm from timeit import default_timer as timer import csv +from BasicModel import run_basic_model torch.manual_seed(0) - UNKNOWN_TOKEN = "" PAD_TOKEN = "" ROOT_TOKEN = "" @@ -55,23 +55,30 @@ def get_vocabs(list_of_paths): A POS and words indexes dictionaries. """ - words_dict = {PAD_TOKEN, ROOT_TOKEN, UNKNOWN_TOKEN} - pos_dict = {PAD_TOKEN, ROOT_TOKEN, UNKNOWN_TOKEN} + words_dict = OrderedDict([(PAD_TOKEN, 1), (ROOT_TOKEN, 1), (UNKNOWN_TOKEN, 1)]) + pos_dict = OrderedDict([(PAD_TOKEN, 1), (ROOT_TOKEN, 1), (UNKNOWN_TOKEN, 1)]) + for file_path in list_of_paths: with open(file_path) as f: for line in f: split_line = line.split('\t') if len(split_line) == 1: # the end of a sentence denotes by \n line. continue - word, pos_tag, head = split_line[1], split_line[3], int(split_line[6]) - words_dict.add(word) - pos_dict.add(pos_tag) + word, pos_tag = split_line[1], split_line[3] + if word in words_dict: + words_dict[word] = words_dict[word] + 1 + else: + words_dict[word] = 1 + if pos_tag in pos_dict: + pos_dict[pos_tag] = pos_dict[pos_tag] + 1 + else: + pos_dict[pos_tag] = 1 return words_dict, pos_dict class DataReader: - """ Read the data from the requested file and hold it's components. """ + """ Reads the data from the requested file and hold it's components. """ def __init__(self, word_dict, pos_dict, file_path, competition=False): """ @@ -288,7 +295,8 @@ def __init__(self, word_emb_dim, pos_emb_dim, hidden_dim, word_vocab_size, tag_v self.relu = nn.ReLU() def word_tag_dropout(self, words, postags, p_drop): - # can't work with batches + """ Word\tag dropout based on DEEP BIAFFINE ATTENTION FOR NEURAL DEPENDENCY PARSING + - Christopher D. Manning, Timothy Dozat """ p_matrix_word = torch.rand(size=words.shape, device=words.device) p_matrix_pos = torch.rand(size=words.shape, device=words.device) w_dropout_mask = (p_matrix_word > p_drop).long() @@ -361,7 +369,7 @@ def get_acc(edge_scores, headers_idx_tensors, batch_size, max_length, sentence_l return acc -def evaluate(model, words_dict, pos_dict, batch_size): +def evaluate(model, words_dict, pos_dict, batch_size, path_test): """ Evaluate our model on a validation set. Args: @@ -375,13 +383,13 @@ def evaluate(model, words_dict, pos_dict, batch_size): print("Evaluating Started") model.eval() - path_test = "Data/test.labeled" - test = DependencyDataset(words_dict, pos_dict, path_test, padding=True) - test_data_loader = DataLoader(test, batch_size=BATCH_SIZE, shuffle=False, num_workers=0) - num_of_sentences = len(test) - acc = 0 - num_of_words = 0 with torch.no_grad(): + test = DependencyDataset(words_dict, pos_dict, path_test, padding=True) + test_data_loader = DataLoader(test, batch_size=batch_size, shuffle=False, num_workers=0) + num_of_sentences = len(test) + acc = 0 + num_of_words = 0 + for batch_idx, input_data in enumerate(test_data_loader): words_idx_tensor, pos_idx_tensor, headers_idx_tensor, sentence_length = input_data headers_idx_tensors = [headers[:sentence_length[i]] for i, headers in enumerate(headers_idx_tensor)] @@ -392,9 +400,9 @@ def evaluate(model, words_dict, pos_dict, batch_size): acc += get_acc(batched_scores, headers_idx_tensors, batch_size, max_length, sentence_length) num_of_words += (max_length - 1) - acc = acc/num_of_words - print("Evaluating Ended") - return acc, _loss + acc = acc/num_of_words + print("Evaluating Ended") + return acc, _loss def print_plots(train_acc_list, train_loss_list, test_acc_list, test_loss_list, _time=''): @@ -545,30 +553,40 @@ def train(self): train_acc_list.append(float(acc)) train_loss_list.append(float(printable_loss)) # Runs a validation phase. - test_acc, test_loss = evaluate(encoder, words_dict, pos_dict, self.batch_size) + test_acc, test_loss = evaluate(encoder, words_dict, pos_dict, self.batch_size, self.path_test) test_acc_list.append(test_acc) test_loss_list.append(test_loss) + time_id = datetime.now().strftime("%m_%d_%Y_%H_%M_%S") + with open(r"{}_advanced_model_full_{}.pkl".format(epoch, time_id), "wb") as output_file: + torch.save(encoder.state_dict(), output_file) print("Epoch {} Completed,\tLoss {}\tAccuracy: {}\t Test Accuracy: {}".format(epoch + 1, train_loss_list[-1], train_acc_list[-1], test_acc)) + # Saves our model results. + with open('parser_results_info.csv', 'a') as f: + writer = csv.writer(f) + writer.writerow([epoch + 1, train_loss_list[-1], train_acc_list[-1], test_acc]) + # Saves our learned model and plot some graphs. time_id = datetime.now().strftime("%m_%d_%Y_%H_%M_%S") print_plots(train_acc_list, train_loss_list, test_acc_list, test_loss_list, time_id) end_time = timer() - torch.save(encoder, 'encoder{}.pth'.format(time_id)) + with open(r"advanced_model_full_{}.pkl".format(time_id), "wb") as output_file: + torch.save(encoder.state_dict(), output_file) print("the training took: {} sec ".format(round(end_time - start_time, 2))) return test_acc_list, time_id -if __name__ == '__main__': +def get_hyper_parameters(): + """Returns the hyper parameters of the model.""" + path_train = "Data/combined.labeled" + path_test = "Data/val.labeled" + return (100, 100, 100, 500, 1, 30, 0.002, path_train, path_test, 0.3, 0.3, 0.3) - path_train = "Data/train.labeled" - path_test = "Data/test.labeled" - hyper_parameters_list = [(100, 100, 100, 500, 1, 30, 0.002, path_train, path_test, 0.3, 0.3, 0.3), - (80, 100, 100, 400, 1, 30, 0.002, path_train, path_test, 0.5, 0.3, 0.1), - (80, 100, 100, 400, 1, 40, 0.005, path_train, path_test, 0.3, 0.3, 0.3), - (60, 100, 200, 400, 1, 20, 0.002, path_train, path_test, 0.3, 0.3, 0.3)] +if __name__ == '__main__': + + hyper_parameters_list = [get_hyper_parameters()] for hyper_parameters in hyper_parameters_list: EPOCHS, WORD_EMBEDDING_DIM, POS_EMBEDDING_DIM, HIDDEN_DIM, BATCH_SIZE, BATCH_ACCUMULATE, LEARNING_RATE, path_train, path_test, WORD_TAG_DROPOUT, EMBEDDING_DROPOUT, LSTM_DROPOUT = hyper_parameters @@ -581,8 +599,12 @@ def train(self): epoch_max = np.argmax(test_acc_list) # Saves our model hyper parameters settings. - with open('parser_results_info.csv', 'a') as f: + with open('parser_settings.csv', 'a') as f: writer = csv.writer(f) writer.writerow([time_id, max_test_acc, epoch_max, EPOCHS, WORD_EMBEDDING_DIM, POS_EMBEDDING_DIM, HIDDEN_DIM, BATCH_SIZE, BATCH_ACCUMULATE, LEARNING_RATE, WORD_TAG_DROPOUT, EMBEDDING_DROPOUT, LSTM_DROPOUT]) + print("Finished training the model, based on the following hyper parameters mix: {}".format(hyper_parameters)) + + # print("Runs the basic model training:") + # run_basic_model() diff --git a/BasicModel.py b/BasicModel.py index a033573..bc91bfe 100644 --- a/BasicModel.py +++ b/BasicModel.py @@ -4,22 +4,24 @@ import torch.nn.functional as F import torch.optim as optim from torch.utils.data.dataloader import DataLoader -from collections import defaultdict from torchtext.vocab import Vocab from torch.utils.data.dataset import Dataset, TensorDataset -from collections import Counter +from collections import Counter, OrderedDict from chu_liu_edmonds import decode_mst import matplotlib.pyplot as plt -import time +from datetime import datetime from tqdm import tqdm +from timeit import default_timer as timer +import csv + + +torch.manual_seed(0) UNKNOWN_TOKEN = "" PAD_TOKEN = "" ROOT_TOKEN = "" SPECIAL_TOKENS = [ROOT_TOKEN, PAD_TOKEN, UNKNOWN_TOKEN] -torch.manual_seed(1) - def OpTyNLLLOSS(true_headers, score_matrix, max_len): """ @@ -45,6 +47,7 @@ def OpTyNLLLOSS(true_headers, score_matrix, max_len): return -1*_loss + def get_vocabs(list_of_paths): """ Creates a POS-tags and words vocabulary dictionaries @@ -54,30 +57,38 @@ def get_vocabs(list_of_paths): A POS and words indexes dictionaries. """ - words_dict = {PAD_TOKEN, ROOT_TOKEN, UNKNOWN_TOKEN} - pos_dict = {PAD_TOKEN, ROOT_TOKEN, UNKNOWN_TOKEN} + words_dict = OrderedDict([(PAD_TOKEN, 1), (ROOT_TOKEN, 1), (UNKNOWN_TOKEN, 1)]) + pos_dict = OrderedDict([(PAD_TOKEN, 1), (ROOT_TOKEN, 1), (UNKNOWN_TOKEN, 1)]) + for file_path in list_of_paths: with open(file_path) as f: for line in f: split_line = line.split('\t') if len(split_line) == 1: # the end of a sentence denotes by \n line. continue - word, pos_tag, head = split_line[1], split_line[3], int(split_line[6]) - words_dict.add(word) - pos_dict.add(pos_tag) + word, pos_tag = split_line[1], split_line[3] + if word in words_dict: + words_dict[word] = words_dict[word] + 1 + else: + words_dict[word] = 1 + if pos_tag in pos_dict: + pos_dict[pos_tag] = pos_dict[pos_tag] + 1 + else: + pos_dict[pos_tag] = 1 return words_dict, pos_dict class DataReader: - """ Read the data from the requested file and hold it's components. """ + """ Reads the data from the requested file and hold it's components. """ - def __init__(self, word_dict, pos_dict, file_path): + def __init__(self, word_dict, pos_dict, file_path, competition=False): """ Args: file_path (str): holds the path to the requested file. words_dict, tags_dict: a dictionary - keys:words\tags, items: counts of appearances. """ + self.competition = competition self.file_path = file_path self.words_dict = word_dict self.pos_dict = pos_dict @@ -99,13 +110,14 @@ def __readData__(self): cur_sentence_pos = [ROOT_TOKEN] cur_sentence_headers = [-1] continue - word, pos_tag, head = split_line[1], split_line[3], int(split_line[6]) + if not self.competition: + word, pos_tag, head = split_line[1], split_line[3], int(split_line[6]) + else: + word, pos_tag, head = split_line[1], split_line[3], -2 cur_sentence_word.append(word) cur_sentence_pos.append(pos_tag) cur_sentence_headers.append(head) - - def get_num_sentences(self): """returns num of sentences in data.""" return len(self.sentences) @@ -116,18 +128,23 @@ class DependencyDataset(Dataset): Holds version of our data as a PyTorch's Dataset object. """ - def __init__(self, word_dict, pos_dict, file_path, padding=False, word_embeddings=None): + def __init__(self, word_dict, pos_dict, file_path, padding=False, word_embeddings=None, competition=False): + """ Args: - file_path (str): The path of the requested file. - padding (bool): Gets true if padding is required. - word_embeddings: A set of words mapping. + word_dict: + pos_dict: + file_path: The path of the requested file. + padding: Gets true if padding is required. + word_embeddings (str): A pretrained embedding path. + competition (bool): Gets True if it works on a file without gold headers. """ super().__init__() self.file_path = file_path - self.data_reader = DataReader(word_dict, pos_dict, self.file_path) + self.data_reader = DataReader(word_dict, pos_dict, self.file_path, competition) self.vocab_size = len(self.data_reader.words_dict) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if word_embeddings: self.words_idx_mappings, self.idx_words_mappings, self.words_vectors = word_embeddings @@ -180,11 +197,11 @@ def convert_sentences_to_dataset(self, padding): for word, pos_tag, header in zip(sentence[0], sentence[1], sentence[2]): headers_idx_list.append(header) - if word in words_dict: + if word in self.data_reader.words_dict: words_idx_list.append(self.words_idx_mappings.get(word)) else: words_idx_list.append(self.unknown_idx) - if pos_tag in pos_dict: + if pos_tag in self.data_reader.pos_dict: pos_idx_list.append(self.pos_idx_mappings.get(pos_tag)) else: pos_idx_list.append(self.unknown_idx) @@ -198,15 +215,15 @@ def convert_sentences_to_dataset(self, padding): sentence_pos_idx_list.append(pos_idx_list) sentence_headers_idx_list.append(headers_idx_list) else: - sentence_words_idx_list.append(torch.tensor(words_idx_list, dtype=torch.long, requires_grad=False)) - sentence_pos_idx_list.append(torch.tensor(pos_idx_list, dtype=torch.long, requires_grad=False)) - sentence_headers_idx_list.append(torch.tensor(headers_idx_list, dtype=torch.long, requires_grad=False)) + sentence_words_idx_list.append(torch.tensor(words_idx_list, dtype=torch.long, requires_grad=False).to(self.device)) + sentence_pos_idx_list.append(torch.tensor(pos_idx_list, dtype=torch.long, requires_grad=False).to(self.device)) + sentence_headers_idx_list.append(torch.tensor(headers_idx_list, dtype=torch.long, requires_grad=False).to(self.device)) if padding: - all_sentence_words_idx = torch.tensor(sentence_words_idx_list, dtype=torch.long, requires_grad=False) - all_sentence_tags_idx = torch.tensor(sentence_pos_idx_list, dtype=torch.long, requires_grad=False) - all_sentence_labels_idx = torch.tensor(sentence_headers_idx_list, dtype=torch.long, requires_grad=False) - all_sentence_len = torch.tensor(sentence_len_list, dtype=torch.long, requires_grad=False) + all_sentence_words_idx = torch.tensor(sentence_words_idx_list, dtype=torch.long, requires_grad=False).to(self.device, non_blocking=True) + all_sentence_tags_idx = torch.tensor(sentence_pos_idx_list, dtype=torch.long, requires_grad=False).to(self.device, non_blocking=True) + all_sentence_labels_idx = torch.tensor(sentence_headers_idx_list, dtype=torch.long, requires_grad=False).to(self.device, non_blocking=True) + all_sentence_len = torch.tensor(sentence_len_list, dtype=torch.long, requires_grad=False).to(self.device, non_blocking=True) return TensorDataset(all_sentence_words_idx, all_sentence_tags_idx, all_sentence_labels_idx, all_sentence_len) else: @@ -216,52 +233,54 @@ def convert_sentences_to_dataset(self, padding): sentence_len_list))} -class DependencyParser(nn.Module): - def __init__(self, word_emb_dim, pos_emb_dim, hidden_dim, word_vocab_size, tag_vocab_size): - super(DependencyParser, self).__init__() - torch.manual_seed(1) - - self.emb_dim = word_emb_dim + pos_emb_dim - torch.manual_seed(1) +class LSTMEncoder(nn.Module): + """ + Our model encoder, based on LSTM and Contrast. + """ + def __init__(self, batch_size, words_dict, word_to_idx, idx_to_word, word_emb_dim, pos_emb_dim, hidden_dim, word_vocab_size, tag_vocab_size): + """ + Args: + word_emb_dim: The dimension of the word embedding. + pos_emb_dim: The dimension of the POS tag embedding. + hidden_dim: The dimension of the LSTM's hidden size + word_vocab_size: The number of words in our vocabulary. + tag_vocab_size: The number of tags in our vocabulary + """ + super(LSTMEncoder, self).__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - torch.manual_seed(1) - - self.word_embedding = nn.Embedding(word_vocab_size, word_emb_dim) + self.idx_to_word = idx_to_word + self.word_to_idx = word_to_idx + self.batch_size = batch_size - torch.manual_seed(1) + self.words_dict = words_dict + self.hidden_dim = hidden_dim + self.emb_dim = word_emb_dim + pos_emb_dim + self.word_embedding = nn.Embedding(word_vocab_size, word_emb_dim) self.tag_embedding = nn.Embedding(tag_vocab_size, pos_emb_dim) - torch.manual_seed(1) self.encoder = nn.LSTM(input_size=self.emb_dim, hidden_size=hidden_dim, num_layers=2, bidirectional=True, batch_first=True) - torch.manual_seed(1) + self.mlp = nn.Sequential( - nn.Linear(self.emb_dim * 4, 100), + nn.Linear(self.hidden_dim * 4, 100), nn.Tanh(), nn.Linear(100, 1) ) - torch.manual_seed(1) - self.fc1 = nn.Linear(self.emb_dim * 4, 100) - torch.manual_seed(1) - self.tanh = nn.Tanh() - torch.manual_seed(1) - self.fc2 = nn.Linear(100, 1) - - def forward(self, words_idx_tensor, pos_idx_tensor, max_length, lengths): - torch.manual_seed(1) - words_embedded = self.word_embedding(words_idx_tensor[:, :max_length].to(self.device)) + def forward(self, words_idx_tensor, pos_idx_tensor, max_length, _evaluate=False): - torch.manual_seed(1) - - tags_embedded = self.tag_embedding(pos_idx_tensor[:, :max_length].to(self.device)) - torch.manual_seed(1) + # Goldberg and Kiperwasser dropout: + if not _evaluate: + mask = torch.rand((words_idx_tensor.shape[0], 250), dtype=torch.float).to(self.device) + drop_prob = torch.tensor([[0.25/(0.25+self.words_dict[self.idx_to_word[word_idx]]) + for word_idx in words_idx_tensor[i]] for i in range(self.batch_size)]).to(self.device) + words_idx_tensor = words_idx_tensor.where(mask > drop_prob, + torch.tensor(self.word_to_idx[UNKNOWN_TOKEN]).to(self.device)) + words_embedded = self.word_embedding(words_idx_tensor[:, :max_length].to(self.device, non_blocking=True)) + tags_embedded = self.tag_embedding(pos_idx_tensor[:, :max_length].to(self.device, non_blocking=True)) embeds = torch.cat([words_embedded, tags_embedded], 2) - - torch.manual_seed(1) - lstm_out, _ = self.encoder(embeds) features = [] @@ -272,13 +291,9 @@ def forward(self, words_idx_tensor, pos_idx_tensor, max_length, lengths): lstm_out[i].repeat(max_length, 1, 1)], -1).unsqueeze(1)) features = torch.cat(features, 1) - torch.manual_seed(1) - # features = self.mlp(features) - edge_scores = self.fc1(features) - torch.manual_seed(1) - edge_scores = self.tanh(edge_scores) - torch.manual_seed(1) - edge_scores = self.fc2(edge_scores) + + edge_scores = self.mlp(features) + return edge_scores @@ -286,12 +301,13 @@ def get_acc(edge_scores, headers_idx_tensors, batch_size, max_length, sentence_l """ Uses Chu Liu Edmonds algorithm to infer a parse tree and calculates the current batch accuracy. Args: - edge_scores: - headers_idx_tensors: - batch_size: - max_length: - sentence_length: + edge_scores: Edge scores matrix, gained our of our chosen model. + headers_idx_tensors: The gold headers to compare to. + batch_size: The number of sentences in a batch. + max_length: The maximum length of a sentence in the batch. + sentence_length: List of all the sentences length. Returns: + The summed accuracy of the current batch. """ acc = 0 trees = [] @@ -302,132 +318,235 @@ def get_acc(edge_scores, headers_idx_tensors, batch_size, max_length, sentence_l has_labels=False)[0]) for i in range(batch_size): - acc += torch.mean(torch.tensor(headers_idx_tensors[i].tolist() == trees[i], dtype=torch.float, requires_grad=False)) + acc += torch.sum(torch.tensor(headers_idx_tensors[i][1:].tolist() == trees[i][1:], dtype=torch.float, requires_grad=False)) return acc - -def evaluate(model, words_dict, pos_dict, batch_size): +def evaluate(model, path_test, words_dict, pos_dict, batch_size): """ + Evaluate our model on a validation set. Args: - words_dict: - pos_dict: - batch_size: + model: Our trained model. + words_dict: The word vocabulary the model trained with. + pos_dict: The POS tag vocabulary the model trained with. + batch_size: The number of sentences in a batch. Returns: + The given model's loss and accuracy gained on the validation set. """ print("Evaluating Started") - path_test = "Data/test.labeled" - test = DependencyDataset(words_dict, pos_dict, path_test, padding=True) - test_data_loader = DataLoader(test, batch_size=BATCH_SIZE, shuffle=False, num_workers=0) + model.eval() + test = DependencyDataset(words_dict, pos_dict, path_test, padding=True) + test_data_loader = DataLoader(test, batch_size=batch_size, shuffle=False, num_workers=0) + num_of_sentences = len(test) acc = 0 + num_of_words = 0 with torch.no_grad(): for batch_idx, input_data in enumerate(test_data_loader): words_idx_tensor, pos_idx_tensor, headers_idx_tensor, sentence_length = input_data headers_idx_tensors = [headers[:sentence_length[i]] for i, headers in enumerate(headers_idx_tensor)] max_length = max(sentence_length) - batched_scores = model(words_idx_tensor, pos_idx_tensor, max_length, sentence_length) + batched_scores = model(words_idx_tensor, pos_idx_tensor, max_length, _evaluate=True) + _loss = OpTyNLLLOSS(headers_idx_tensors, batched_scores, max_length).requires_grad_(False).item() acc += get_acc(batched_scores, headers_idx_tensors, batch_size, max_length, sentence_length) + num_of_words += sentence_length.sum()-batch_size - acc = acc / len(test) + acc = acc/num_of_words print("Evaluating Ended") - return acc + return acc, _loss -def print_plots(accuracy_list, loss_list): +def print_plots(train_acc_list, train_loss_list, test_acc_list, test_loss_list, _time=''): """ Prints two plot that describes our processes of learning through an NLLL loss function and the accuracy measure. Args: - accuracy_list: - loss_list: + train_acc_list: Contains the accuracy measure tracking through the training phase. + train_loss_list: Contains the loss measure tracking through the training phase. + test_acc_list: Contains the accuracy measure tracking through the evaluation phase. + test_loss_list: Contains the loss measure tracking through the evaluation phase. + _time: The time id to recognize the plot output. Returns: + Saves the plot in a jpeg file. """ - plt.plot(accuracy_list, c="red", label="Accuracy") - plt.xlabel("Epochs") - plt.ylabel("Value") - plt.legend() - plt.show() - - plt.plot(loss_list, c="blue", label="Loss") - plt.xlabel("Epochs") - plt.ylabel("Value") - plt.legend() - plt.show() -if __name__ == '__main__': + # sns.set_style("whitegrid") - start_time = time.time() + fig, ax = plt.subplots(2, 1, figsize=(10, 10)) + x_train = [a for a in range(len(train_loss_list))] + x_test = [a for a in range(len(test_loss_list))] - # hyper_parameters - EPOCHS = 200 - WORD_EMBEDDING_DIM = 100 - POS_EMBEDDING_DIM = 25 - HIDDEN_DIM = 125 - BATCH_SIZE = 10 - LEARNING_RATE = 0.007 + ax[0].plot(x_train, train_loss_list, label='Loss Train') + ax[0].plot(x_test, test_loss_list, label='Loss Test') - path_train = "Data/train.labeled" - path_test = "Data/test.labeled" - paths_list = [path_train] + ax[0].legend() + ax[0].set_title('Loss Convergence') + ax[0].set_xlabel('Num of Epochs') + ax[0].set_ylabel('Loss') - words_dict, pos_dict = get_vocabs(paths_list) # Gets all known vocabularies. + ax[1].plot(x_train, train_acc_list, label='Train UAS') + ax[1].plot(x_test, test_acc_list, label='Test UAS') + ax[1].legend() + ax[1].set_title('UAS') + ax[1].set_xlabel('Num of Epochs') + ax[1].set_ylabel('UAS') + fig.savefig('plots_{}.png'.format(_time)) - # Preparing the dataset - train = DependencyDataset(words_dict, pos_dict, path_train, padding=True) - train_data_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, num_workers=0) +class DependencyParser: + """ + A dependency parser model object, includes all the hyper parameters mix and train phase of the selected model. + """ + def __init__(self, epochs, word_embedding_dim, pos_embedding_dim, hidden_dim, batch_size, batch_accumulate, + learning_rate, path_train, path_test, word_tag_dropout): + """ + Args: + epochs: The number of epochs the model is trained on. + word_embedding_dim: The dimension of the word embedding. + pos_embedding_dim: The dimension of the POS tag embedding. + hidden_dim: The LSTM's hidden size dimension. + batch_size: The batch size - the number of sentences we get out of the data loader. + batch_accumulate: The accumulate batch size - The practical batch size, the number of sentences which we learn on parallel. + learning_rate: The learning rate of our optimizer. + path_train: The path to the train file. + path_test: The path to the test file. + word_tag_dropout: The probability to dropout a complete word\ag and replace it with it's matched word\tag. + """ + self.epochs = epochs + self.word_embedding_dim = word_embedding_dim + self.pos_embedding_dim = pos_embedding_dim + self.hidden_dim = hidden_dim + self.batch_size = batch_size + self.batch_accumulate = batch_accumulate + self.learning_rate = learning_rate + self.path_train = path_train + self.path_test = path_test + self.word_tag_dropout = word_tag_dropout + + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + def train(self): + """ Runs the training phase of our model. + Returns an accuracy tracking on a validation set and a unique ID to identify this run. + """ - word_vocab_size = len(words_dict) - pos_vocab_size = len(pos_dict) + start_time = timer() + torch.cuda.empty_cache() - OpTyParser = DependencyParser(WORD_EMBEDDING_DIM, POS_EMBEDDING_DIM, HIDDEN_DIM, word_vocab_size, pos_vocab_size) - use_cuda = torch.cuda.is_available() - device = torch.device("cuda:0" if use_cuda else "cpu") + paths_list = [self.path_train] - if use_cuda: - OpTyParser.cuda() + # Prepares the dataset. + words_dict, pos_dict = get_vocabs(paths_list) # Gets all known vocabularies. + train = DependencyDataset(words_dict, pos_dict, self.path_train, padding=True) + train_data_loader = DataLoader(train, batch_size=self.batch_size, shuffle=True, num_workers=0) + word_vocab_size = len(words_dict) + pos_vocab_size = len(pos_dict) + word_to_idx, idx_to_word, _= train.get_words_embeddings() - optimizer = optim.Adam(OpTyParser.parameters(), lr=LEARNING_RATE) + # Initialize an instance of our encoder with the chosen hyper parameters. + encoder = LSTMEncoder(self.batch_size, words_dict, word_to_idx, idx_to_word, self.word_embedding_dim, self.pos_embedding_dim, + self.hidden_dim, word_vocab_size, pos_vocab_size) - # Training start - print("Training Started") - accuracy_list = [] - loss_list = [] - for epoch in range(EPOCHS): - acc = 0 # to keep track of accuracy - printable_loss = 0 # To keep track of the loss value - i = 0 - for input_data in tqdm(train_data_loader): - i += 1 + if torch.cuda.is_available(): + encoder.cuda() - words_idx_tensor, pos_idx_tensor, headers_idx_tensor, sentence_length = input_data - headers_idx_tensors = [headers[:sentence_length[i]] for i, headers in enumerate(headers_idx_tensor)] - max_length = max(sentence_length) + # Initialize the chosen optimizer. + optimizer = optim.Adam(encoder.parameters(), lr=self.learning_rate) + + # Training start + print("Training Started") + + # To keep track of the loss and accuracy values. + train_acc_list = [] + train_loss_list = [] + test_acc_list = [] + test_loss_list = [] + + for epoch in range(self.epochs): + + acc = 0 + printable_loss = 0 + num_of_words = 0 + + for input_data in tqdm(train_data_loader): + + encoder.train() + + words_idx_tensor, pos_idx_tensor, headers_idx_tensor, sentence_length = input_data + + # In case we use batches (>1 sentences) we need to cut the padding out of the gold headers. + headers_idx_tensors = [headers[:sentence_length[i]] for i, headers in enumerate(headers_idx_tensor)] + max_length = max(sentence_length) + + # Feeding our model with the current batch. + batched_weights = encoder(words_idx_tensor, pos_idx_tensor, max_length) - batched_weights = OpTyParser(words_idx_tensor, pos_idx_tensor, max_length, sentence_length) - - loss = OpTyNLLLOSS(headers_idx_tensors, batched_weights, max_length) - loss.backward() - optimizer.step() - OpTyParser.zero_grad() - - printable_loss += loss.item() - - acc += get_acc(batched_weights, headers_idx_tensors, BATCH_SIZE, max_length, sentence_length) - - printable_loss = printable_loss / len(train) - acc = acc/len(train) - accuracy_list.append(float(acc)) - loss_list.append(float(printable_loss)) - test_acc = evaluate(OpTyParser, words_dict, pos_dict, BATCH_SIZE) - e_interval = i - print("Epoch {} Completed,\tLoss {}\tAccuracy: {}\t Test Accuracy: {}".format(epoch + 1, - np.mean(loss_list[-e_interval:]), - np.mean( - accuracy_list[-e_interval:]), - test_acc)) - - print_plots(accuracy_list, loss_list) - end_time = time.time() - torch.save(OpTyParser.state_dict(), 'OpTyParser{}.pkl '.format(start_time)) - print("the training took: ", end_time - start_time) \ No newline at end of file + loss = OpTyNLLLOSS(headers_idx_tensors, batched_weights, max_length) + loss.backward() + + optimizer.step() + encoder.zero_grad() + + printable_loss += loss.item() + + acc += get_acc(batched_weights, headers_idx_tensors, self.batch_size, max_length, sentence_length) + num_of_words += sentence_length.sum() - self.batch_size # We don't count the root as we don't count it in the accuracy. + + # Adds up the new tracking measures. + printable_loss = printable_loss / len(train) + acc = acc / num_of_words + train_acc_list.append(float(acc)) + train_loss_list.append(float(printable_loss)) + # Runs a validation phase. + test_acc, test_loss = evaluate(encoder, self.path_test, words_dict, pos_dict, self.batch_size) + test_acc_list.append(test_acc) + test_loss_list.append(test_loss) + + time_id = datetime.now().strftime("%m_%d_%Y_%H_%M_%S") + with open(r"{}_basic_model_{}.pkl".format(epoch, time_id), "wb") as output_file: + torch.save(encoder.state_dict(), output_file) + + print("Epoch {} Completed,\tLoss {}\tAccuracy: {}\t Test Accuracy: {}".format(epoch + 1, train_loss_list[-1], + train_acc_list[-1], test_acc)) + + # Saves our learned model and plot some graphs. + time_id = datetime.now().strftime("%m_%d_%Y_%H_%M_%S") + print_plots(train_acc_list, train_loss_list, test_acc_list, test_loss_list, time_id) + end_time = timer() + with open(r"basic_model_{}.pkl".format(time_id), "wb") as output_file: + torch.save(encoder.state_dict(), output_file) + print("the training took: {} sec ".format(round(end_time - start_time, 2))) + return test_acc_list, time_id + +def get_hyper_parameters(): + """Returns the hyper parameters of the model.""" + path_train = "Data/combined.labeled" + path_test = "Data/val.labeled" + return (30, 100, 25, 125, 10, 1, 0.001, path_train, path_test, 0) + + +def run_basic_model(): + torch.manual_seed(0) + + hyper_parameters_list = [get_hyper_parameters()] + + for hyper_parameters in hyper_parameters_list: + EPOCHS, WORD_EMBEDDING_DIM, POS_EMBEDDING_DIM, HIDDEN_DIM, BATCH_SIZE, BATCH_ACCUMULATE, LEARNING_RATE, path_train, path_test, WORD_TAG_DROPOUT= hyper_parameters + + parser = DependencyParser(EPOCHS, WORD_EMBEDDING_DIM, POS_EMBEDDING_DIM, HIDDEN_DIM, BATCH_SIZE, BATCH_ACCUMULATE, + LEARNING_RATE, path_train, path_test, WORD_TAG_DROPOUT) + + test_acc_list, time_id = parser.train() + max_test_acc = round(max(test_acc_list).item(), 3) + epoch_max = np.argmax(test_acc_list) + + # Saves our model hyper parameters settings ina csv file. + with open('parser_results_info.csv', 'a') as f: + writer = csv.writer(f) + writer.writerow([time_id, max_test_acc, epoch_max, EPOCHS, WORD_EMBEDDING_DIM, POS_EMBEDDING_DIM, HIDDEN_DIM, + BATCH_SIZE, BATCH_ACCUMULATE, LEARNING_RATE, WORD_TAG_DROPOUT]) + + print("Finished training the model, based on the following hyper parameters mix: {}".format(hyper_parameters)) + + +if __name__ == '__main__': + run_basic_model() diff --git a/generate_comp_tagged.py b/generate_comp_tagged.py index 823573d..2484a2f 100644 --- a/generate_comp_tagged.py +++ b/generate_comp_tagged.py @@ -1,39 +1,56 @@ import AdvancedModel +import BasicModel from chu_liu_edmonds import decode_mst import torch from tqdm import tqdm import numpy as np -from AdvancedModel import LSTMEncoder + torch.manual_seed(0) -def generate_comp_tagged_file(model_path, target_path): +def generate_comp_tagged_file(model_path, target_path, path_comp, model_module): """ Generates a tagged version of the competition file. Args: + model_path (str): The path to the chosen model's pth file. target_path (str): The path for the tagged version's file. - + path_comp: The path for the untagged version's file. + model_module: The module to use. """ - torch.manual_seed(0) - words_dict, pos_dict = AdvancedModel.get_vocabs(trained_on_path_list) - comp = AdvancedModel.DependencyDataset(words_dict, pos_dict, path_comp, padding=True, competition=True) - comp_data_loader = AdvancedModel.DataLoader(comp, batch_size=1, shuffle=False, num_workers=0) with torch.no_grad(): - model = torch.load(model_path) + words_dict, pos_dict = model_module.get_vocabs(trained_on_path_list) + word_vocab_size, pos_vocab_size = len(words_dict), len(pos_dict) + comp = model_module.DependencyDataset(words_dict, pos_dict, path_comp, padding=True, competition=True) + comp_data_loader = model_module.DataLoader(comp, batch_size=1, shuffle=False, num_workers=0) + word_to_idx, idx_to_word, _ = comp.get_words_embeddings() + + if model_module == BasicModel: + _, word_embedding_dim, pos_embedding_dim, hidden_dim, batch_size, _, _, _, _, word_tag_dropout \ + = model_module.get_hyper_parameters() + + model = model_module.LSTMEncoder(batch_size, words_dict, word_to_idx, idx_to_word, word_embedding_dim, + pos_embedding_dim, hidden_dim, word_vocab_size, pos_vocab_size) + else: + _, word_embedding_dim, pos_embedding_dim, hidden_dim, _, _, _, _, _, word_tag_dropout, embedding_dropout,\ + lstm_dropout = model_module.get_hyper_parameters() + + model = model_module.LSTMEncoder(word_embedding_dim, pos_embedding_dim, hidden_dim, word_vocab_size, + pos_vocab_size, word_tag_dropout, embedding_dropout, lstm_dropout) + + model.load_state_dict(torch.load(model_path)) + model = model.eval() if torch.cuda.is_available(): model.cuda() - model = model.eval() - trees = [] for input_data in tqdm(comp_data_loader): words_idx_tensor, pos_idx_tensor, headers_idx_tensor, sentence_length = input_data max_length = max(sentence_length) - score_matrix = model(words_idx_tensor, pos_idx_tensor, max_length) + score_matrix = model(words_idx_tensor, pos_idx_tensor, max_length, _evaluate=True) trees.append(decode_mst(np.array(score_matrix[:, 0].detach().cpu()).reshape((max_length, max_length)) [:sentence_length[0], :sentence_length[0]], sentence_length[0], has_labels=False)[0]) @@ -41,9 +58,6 @@ def generate_comp_tagged_file(model_path, target_path): current_tree_idx = 0 current_word_idx = 1 - # with open(target_path, 'w') as f_labeled: - # pass - with open(path_comp, 'r') as f_unlabeled: with open(target_path, 'w') as f_labeled: @@ -63,22 +77,20 @@ def generate_comp_tagged_file(model_path, target_path): if __name__ == '__main__': - print("Starting to evaluate") torch.cuda.empty_cache() path_train = "Data/train.labeled" path_test = "Data/test.labeled" + path_combined = "Data/combined.labeled" path_comp = "Data/comp.unlabeled" - trained_on_path_list = [path_train, path_test] + trained_on_path_list = [path_combined] path_comp_m1_labeled = 'comp_m1_203933551.labeled' path_comp_m2_labeled = 'comp_m2_203933551.labeled' - basic_model_path = 'encoder06_26_2020_14_55_17.pth' - advanced_model_path = 'encoder06_26_2020_14_55_17.pth' + basic_model_path = 'basic_model_full.pkl' + advanced_model_path = 'advanced_model_full.pkl' - generate_comp_tagged_file(basic_model_path, path_comp_m1_labeled) - generate_comp_tagged_file(advanced_model_path, path_comp_m2_labeled) + generate_comp_tagged_file(basic_model_path, path_comp_m1_labeled, path_comp, BasicModel) + generate_comp_tagged_file(advanced_model_path, path_comp_m2_labeled, path_comp, AdvancedModel) print("Evaluate end") - -