from torch.utils.data import Dataset, DataLoader


class FixedWindow(Dataset):
    """Dataset wrapper that maps a sequence of words to integer ids.

    Attributes set by the constructor:
        length_window: the window length passed by the caller (stored as-is).
        words: the ``words`` object passed by the caller (stored as-is).
        vocabulary: list of the known words.
        id_to_word: dict mapping each id back to its word.
        ids: list with one id per element of ``words``, in order.
    """

    def __init__(self, words, length_window):
        """Build the vocabulary and the id sequence for ``words``.

        Args:
            words: iterable of words. If it exposes a ``vocab`` attribute,
                ``words.vocab.stoi`` is used as the word -> id mapping
                (presumably a torchtext-style vocabulary; confirm against
                the caller). Otherwise the vocabulary is computed here from
                the unique words, with ids assigned in order of first
                appearance.
            length_window: window length; only stored by this constructor.
        """
        super().__init__()
        self.length_window = length_window
        self.words = words

        vocab = getattr(words, "vocab", None)
        if vocab is None:
            # No pre-built vocabulary: derive it from the words themselves.
            # A single pass assigns the next free id to each unseen word and
            # records the id of every word, so one-shot iterables work too.
            stoi = {}
            self.ids = [stoi.setdefault(word, len(stoi)) for word in words]
            self.id_to_word = {index: word for word, index in stoi.items()}
            self.vocabulary = list(stoi)
        else:
            stoi = vocab.stoi
            # Statement order is kept from the original: the reverse map and
            # the vocabulary are snapshotted before the per-word lookups, in
            # case `stoi` inserts keys on a missing lookup (defaultdict-like).
            self.id_to_word = {stoi[key]: key for key in stoi}
            self.vocabulary = list(stoi.keys())
            self.ids = [stoi[word] for word in words]