Overview of methods for creating sentence embeddings, Part 2

Hello — this is the continuation of the article about methods for creating sentence embeddings. This guide has few words and a lot of code, ready for Ctrl+C, Ctrl+V, improvements, and further testing.



Reading Part 1 first is essential.



4. BERT



import os

from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs
from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder
#     http://docs.deeppavlov.ai/en/master/features/pretrained_vectors.html


4.1 rubert_cased_L-12_H-768_A-12_pt



class RU_BERT_CLASS:
    """Thin wrapper around a DeepPavlov BERT embedder built from a local
    model directory; exposes a tokenizer and a sentence-embedding predictor."""

    def __init__(self, name):
        # Point the stock embedder config at the locally downloaded model dir.
        cfg = read_json(configs.embedder.bert_embedder)
        cfg['metadata']['variables']['BERT_PATH'] = os.path.join('./.', name)
        self.m = build_model(cfg)

    def vectorizer(self, sentences):
        # Plain whitespace tokenization: one token list per sentence.
        return [sentence.split() for sentence in sentences]

    def predict(self, tokens):
        # The DeepPavlov pipeline returns 7 outputs; index 5 is the
        # mean-pooled sentence embedding (index 4 would be max-pooled).
        outputs = self.m(tokens)
        return outputs[5]

# Plain RuBERT checkpoint: build the embedder and score it with the shared
# evaluation harness (similarity_values_wrapper / evaluate come from Part 1).
bert = RU_BERT_CLASS('rubert_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'rubert')


'rubert: 2895.7'



4.2 ru_conversational_cased_L-12_H-768_A-12_pt



# Conversational RuBERT checkpoint, evaluated with the same harness.
bert = RU_BERT_CLASS('ru_conversational_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'ru_conversational')


'ru_conversational: 3559.1'



4.3 sentence_ru_cased_L-12_H-768_A-12_pt



# Sentence RuBERT checkpoint, evaluated with the same harness.
bert = RU_BERT_CLASS('sentence_ru_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'sentence_ru')


'sentence_ru: 2660.2'



4.4 elmo_ru-news_wmt11-16_1.5M_steps



class ELMO_CLASS(RU_BERT_CLASS):
    """ELMo variant of the wrapper: reuses RU_BERT_CLASS.vectorizer and
    swaps the underlying embedder for a DeepPavlov ELMoEmbedder."""

    def __init__(self, name):
        # The model archive is fetched straight from the DeepPavlov file server.
        url = f"http://files.deeppavlov.ai/deeppavlov_data/{name}"
        self.m = ELMoEmbedder(url)

    def predict(self, tokens):
        # ELMoEmbedder already yields one vector per sentence; no unpacking needed.
        return self.m(tokens)


# ELMo trained on Russian news (WMT 2011-16, 1.5M steps).
elmo = ELMO_CLASS('elmo_ru-news_wmt11-16_1.5M_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'elmo_ru-news')


'elmo_ru-news: 4631.3'



4.5 elmo_ru-wiki_600k_steps



# ELMo trained on Russian Wikipedia (600k steps).
elmo = ELMO_CLASS('elmo_ru-wiki_600k_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'elmo_ru-wiki')


'elmo_ru-wiki: 4507.6'



4.6 elmo_ru-twitter_2013-01_2018-04_600k_steps



# ELMo trained on Russian Twitter (2013-01 to 2018-04, 600k steps).
elmo = ELMO_CLASS('elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'elmo_ru-twitter')


'elmo_ru-twitter: 2962.2'



plot_results()


png



5.



, , .



5.1 embedings -> embedings



def models_builder(data_generator):
    """Embeddings -> embeddings autoencoder.

    Builds a BiLSTM encoder squeezed into a dense bottleneck
    ('embeding_output') and a mirrored decoder that reconstructs the input
    sequence. Returns (autoencoder, embedder) where the embedder exposes the
    bottleneck as the sentence embedding.
    """

    def cosine_loss(y_true, y_pred):
        # Mean cosine similarity between the target and the reconstruction.
        return K.mean(cosine_similarity(y_true, y_pred, axis=-1))

    complexity = 300
    inputs = Input(shape=(data_generator.max_len, data_generator.embedding_size))

    # Encoder: two BiLSTMs, flattened, then dense layers down to the bottleneck.
    net = Bidirectional(LSTM(complexity, return_sequences=True))(inputs)
    net = Bidirectional(LSTM(int(complexity / 10), return_sequences=True))(net)
    net = Flatten()(net)
    net = Dense(complexity, activation='elu')(net)
    net = Dense(complexity, activation='elu')(net)
    net = Dense(complexity, activation='linear', name='embeding_output')(net)

    # Decoder: expand back to a (max_len, embedding_size) sequence.
    net = Dense(complexity, activation='elu')(net)
    net = Dense(data_generator.max_len * complexity, activation='elu')(net)
    net = Reshape((data_generator.max_len, complexity))(net)
    net = Bidirectional(LSTM(complexity, return_sequences=True))(net)
    net = Bidirectional(LSTM(complexity, return_sequences=True))(net)
    net = Dense(data_generator.embedding_size, activation='elu')(net)

    autoencoder = Model(inputs=inputs, outputs=net)
    autoencoder.compile(loss=cosine_loss, optimizer='adam')
    autoencoder.summary()

    embedder = Model(inputs=inputs, outputs=autoencoder.get_layer('embeding_output').output)
    return autoencoder, embedder

# Train the embeddings->embeddings autoencoder. Every 3rd epoch the embedder
# is re-scored; training stops once the score drops below the previous
# measurement (simple early stopping, not applied before epoch 20).
data_generator = EmbedingsDataGenerator(use_fasttext=False)
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize, distance_function=cosine_distances)


new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, ' embedings -> embedings')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    # Autoencoder: the input is its own reconstruction target.
    for x, y in data_generator:
        autoencoder.train_on_batch(x, x)


0 1770.2

3 212.6

6 138.8

9 84.8

12 78.1

15 106.4

18 112.7

21 79.7



5.2 embedings -> indexes



def models_builder(data_generator):
    """Embeddings -> token-indexes autoencoder.

    Same encoder/bottleneck shape as the embeddings->embeddings variant, but
    the decoder predicts a softmax over the vocabulary at each timestep.
    Returns (autoencoder, embedder); the embedder exposes 'embeding_output'.
    """
    complexity = 300
    inputs = Input(shape=(data_generator.max_len, data_generator.embedding_size))

    # Encoder: BiLSTM stack -> dense bottleneck used as the sentence embedding.
    net = Bidirectional(LSTM(complexity, return_sequences=True))(inputs)
    net = Bidirectional(LSTM(int(complexity / 10), return_sequences=True))(net)
    net = Flatten()(net)
    net = Dense(complexity, activation='elu')(net)
    net = Dense(complexity, activation='elu')(net)
    net = Dense(complexity, activation='linear', name='embeding_output')(net)

    # Decoder: expand back to a sequence and predict a token id per step.
    net = Dense(complexity, activation='elu')(net)
    net = Dense(data_generator.max_len * complexity, activation='elu')(net)
    net = Reshape((data_generator.max_len, complexity))(net)
    net = Bidirectional(LSTM(complexity, return_sequences=True))(net)
    net = Bidirectional(LSTM(complexity, return_sequences=True))(net)
    net = Dense(len(data_generator.token2index), activation='softmax')(net)

    autoencoder = Model(inputs=inputs, outputs=net)
    autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    autoencoder.summary()

    embedder = Model(inputs=inputs, outputs=autoencoder.get_layer('embeding_output').output)
    return autoencoder, embedder

# Train the embeddings->indexes autoencoder (targets are one-hot token ids).
data_generator = IndexesDataGenerator()
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)


new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, ' embedings -> indexes')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    # x_e: embedding sequences (input); x_i: one-hot index sequences (target).
    for x_e, x_i, y_i in data_generator:
        autoencoder.train_on_batch(x_e, x_i)


0 1352.9

3 43.6

6 41.7

9 8.1

12 -5.6

15 43.1

18 36.1

21 -3.7



5.3 LSTM -> LSTM



def models_builder(data_generator):
    """Seq2seq (LSTM encoder -> LSTM decoder) autoencoder over embeddings.

    Returns (autoencoder, embedder); the embedder exposes the dense
    bottleneck layer 'embeding_output' as the sentence embedding.
    """
    def cosine_loss(y_true, y_pred):
        # Mean cosine similarity between target and reconstruction.
        return K.mean(cosine_similarity(y_true, y_pred, axis=-1))

    complexity = 300
    inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
    X = inp
    # Encode the whole sequence into the final LSTM states, then squeeze
    # their concatenation into the bottleneck.
    X, state_h, state_c = LSTM(complexity, return_state=True)(X)
    X = Concatenate()([state_h, state_c])
    X = Dense(complexity, activation='linear', name='embeding_output')(X)

    # Project the bottleneck back into decoder initial states.
    state_c = Dense(complexity, activation='linear')(X)
    state_h = Dense(complexity, activation='linear')(X)
    inp_zeros = Input(shape=(data_generator.max_len, data_generator.embedding_size))

    # NOTE(review): Keras expects initial_state=[state_h, state_c]; the order
    # here is [state_c, state_h]. Both are fresh Dense projections of the same
    # tensor, so training can compensate, but the naming is misleading — verify.
    X = LSTM(complexity, return_sequences=True)(inp_zeros, [state_c, state_h])
    X = Dense(data_generator.embedding_size, activation='linear')(X)

    autoencoder = Model(inputs=[inp, inp_zeros], outputs=X)
    autoencoder.compile(loss=cosine_loss, optimizer='adam')
    autoencoder.summary()

    embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output)
    return autoencoder, embedder

# Train the LSTM->LSTM seq2seq autoencoder (section 5.3).
data_generator = EmbedingsDataGenerator(use_fasttext=False)
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)


# Dummy decoder input: the decoder is driven purely by its initial states.
zeros = np.zeros((data_generator.batch_size, data_generator.max_len, data_generator.embedding_size))
new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        # BUG FIX: this is section 5.3 (LSTM -> LSTM); the old label
        # ' embedings -> indexes' collided with section 5.2's result name,
        # so both experiments would be recorded under the same key.
        new_result = evaluate(get_similarity_values, ' LSTM -> LSTM')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        autoencoder.train_on_batch([x, zeros], x)


0 1903.6

3 1299.3

6 313.5

9 445.3

12 454.9

15 447.7

18 454.5

21 448.1



5.4 LSTM -> LSTM -> indexes



def models_builder(data_generator):
    """LSTM encoder -> LSTM decoder that predicts token indexes (softmax).

    Returns (autoencoder, embedder); the embedder exposes the dense
    bottleneck layer 'embeding_output' as the sentence embedding.
    """
    complexity = 300
    inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
    X = inp
    # Encode the sequence into the final LSTM states and squeeze them.
    X, state_h, state_c = LSTM(complexity, return_state=True)(X)
    X = Concatenate()([state_h, state_c])
    X = Dense(complexity, activation='linear', name='embeding_output')(X)
    # Project the bottleneck back into decoder initial states.
    state_c = Dense(complexity, activation='linear')(X)
    state_h = Dense(complexity, activation='linear')(X)
    inp_zeros = Input(shape=(data_generator.max_len, data_generator.embedding_size))

    # NOTE(review): Keras expects initial_state=[state_h, state_c]; the order
    # here is [state_c, state_h] — harmless (both are Dense projections of the
    # same tensor) but misleadingly named.
    X = LSTM(complexity, return_sequences=True)(inp_zeros, [state_c, state_h])
    X = Dense(len(data_generator.token2index), activation='softmax')(X)

    autoencoder = Model(inputs=[inp, inp_zeros], outputs=X)
    autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    autoencoder.summary()

    embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output)
    return autoencoder, embedder

# Train the LSTM->LSTM->indexes model; same early-stopping scheme as above.
data_generator = IndexesDataGenerator()
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)


# Dummy decoder input: the decoder is driven purely by its initial states.
zeros = np.zeros((data_generator.batch_size, data_generator.max_len, data_generator.embedding_size))
new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, '  LSTM -> LSTM -> indexes')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x_e, x_i, y_i in data_generator:
        autoencoder.train_on_batch([x_e, zeros], x_i)


0 1903.6

3 1483.3

6 1249.3

9 566.3

12 789.2

15 702.3

18 480.5

21 552.3

24 533.0





6. Transfer Learning



TEXTS_CORPUS_WITH_LABEL = [(sentence, topic) for topic in texts_for_training for sentence in texts_for_training[topic]]

class BowDataGenerator(EmbedingsDataGenerator):
    """Yields (bag-of-words matrix, one-hot topic) batches for classification.

    Samples are drawn with replacement via random.choice, so epochs are not
    permutations of the corpus.
    """
    def __init__(self, texts_topics=TEXTS_CORPUS_WITH_LABEL, batch_size=128, batches_per_epoch=100):
        self.texts_topics = texts_topics
        # topic -> index mapping. NOTE(review): built from a set, so the
        # numbering can differ between runs — confirm that nothing persists
        # these indexes across processes.
        self.topic2index = {topic: index for index, topic in enumerate({topic for text, topic in self.texts_topics})}
        self.batch_size = batch_size
        self.batches_per_epoch = batches_per_epoch
        self.count_vectorizer = CountVectorizer().fit([text_topic[0] for text_topic in self.texts_topics])
        # Inverse-frequency class weights to counter topic imbalance.
        counts = Counter([text_topic[1] for text_topic in self.texts_topics])
        self.class_weight = {self.topic2index[intent_id]:1/counts[intent_id] for intent_id in counts}

    def vectorize(self, sentences):
        # Dense bag-of-words matrix, one row per sentence.
        return self.count_vectorizer.transform(sentences).toarray()

    def __iter__(self):
        # One batch per yield, batches_per_epoch batches per epoch.
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            X_batch = []
            y_batch = []
            finished_batch = False
            while not finished_batch:
                text, topic = random.choice(self.texts_topics)
                X_batch.append(text)
                y_batch.append(self.topic2index[topic])

                if len(X_batch) >= self.batch_size:
                    X_batch = self.count_vectorizer.transform(X_batch).toarray()
                    y_batch = to_categorical(y_batch, num_classes=len(self.topic2index))
                    yield np.array(X_batch), np.array(y_batch)
                    finished_batch = True

data_generator = BowDataGenerator()


6.1 BOW



def models_builder(data_generator):
    """Topic classifier over bag-of-words input.

    Returns (model, embedder); 'embeding_output' (the penultimate dense
    layer, pre-activation) is used as the sentence embedding.
    """
    complexity = 500
    vocab_size = len(data_generator.count_vectorizer.get_feature_names())
    inputs = Input(shape=(vocab_size,))

    net = Dense(complexity)(inputs)
    net = Activation('elu')(net)
    net = Dense(complexity)(net)
    net = Activation('elu')(net)
    net = Dense(complexity, name='embeding_output')(net)
    net = Activation('elu')(net)
    net = Dense(len(data_generator.topic2index), activation='softmax')(net)

    model = Model(inputs=inputs, outputs=net)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    embedder = Model(inputs=inputs, outputs=model.get_layer('embeding_output').output)
    return model, embedder

# Train the BOW topic classifier with class weights; the penultimate layer
# doubles as the sentence embedding.
data_generator = BowDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)


new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, '  BOW')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)


0 601.4

3 1175.4

6 1187.0

9 1175.9

12 1097.9

15 1083.4

18 1083.8

21 1060.5



6.2 LSTM + MaxPooling (InferSent)



:

Arxiv

-



class LabelsDataGenerator(EmbedingsDataGenerator):
    """Yields (padded word-embedding sequences, one-hot topic) batches.

    Long sentences are split into chunks of at most target_len tokens.
    """
    def __init__(self, texts_topics=TEXTS_CORPUS_WITH_LABEL, target_len=20, batch_size=128, batches_per_epoch=100, use_word2vec=True, use_fasttext=True):
        self.texts_topics = texts_topics
        self.topic2index = {topic: index for index, topic in enumerate({topic for text, topic in self.texts_topics})}
        self.target_len = target_len
        self.batch_size = batch_size
        self.batches_per_epoch = batches_per_epoch
        self.use_word2vec = use_word2vec
        self.use_fasttext = use_fasttext
        # Probe the word vectorizer once to learn the embedding width.
        self.embedding_size = len(vectorize('token', use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext))
        # Inverse-frequency class weights to counter topic imbalance.
        counts = Counter([text_topic[1] for text_topic in self.texts_topics])
        self.class_weight = {self.topic2index[intent_id]:1/counts[intent_id] for intent_id in counts}       

    def vectorize(self, sentences):
        # tokens -> word vectors, padded/truncated to target_len per sentence.
        vectorized = []
        for text in sentences:
            tokens = str(text).split()
            x_vec = []
            for token in tokens:
                token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)                       
                x_vec.append(token_vec)
            vectorized.append(x_vec)

        vectorized = pad_sequences(vectorized, maxlen=self.target_len)
        return vectorized

    def __iter__(self):
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            X_batch = []
            y_batch = []
            finished_batch = False
            while not finished_batch:
                text, topic = random.choice(self.texts_topics)
                tokens = text.split()
                x_vec = []
                for token in tokens:
                    token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)
                    # NOTE(review): once x_vec reaches target_len it is appended
                    # to X_batch but the SAME list object keeps being extended
                    # below and can be appended again on later iterations
                    # (aliasing). pad_sequences later truncates each copy, so
                    # duplicated sentence tails can enter the batch — looks
                    # unintended; verify before relying on this sampling.
                    if len(x_vec) >= self.target_len:
                        X_batch.append(x_vec)
                        y_batch.append(self.topic2index[topic])
                        if len(X_batch) >= self.batch_size:
                            break
                    x_vec.append(token_vec)
                else:
                    # Sentence shorter than target_len: emit it once.
                    X_batch.append(x_vec)
                    y_batch.append(self.topic2index[topic])

                if len(X_batch) >= self.batch_size:
                    X_batch = pad_sequences(X_batch, maxlen=self.target_len)
                    y_batch = to_categorical(y_batch, num_classes=len(self.topic2index))
                    yield np.array(X_batch), np.array(y_batch)
                    finished_batch = True


def models_builder(data_generator):
    """BiLSTM + max-pooling (InferSent-style) topic classifier.

    Returns (model, embedder); 'embeding_output' is the embedding layer.
    """
    complexity = 768
    inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    X = inp
    X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
    # Swap time and feature axes so MaxPooling1D runs over hidden units.
    X = Permute((2,1))(X)
    # NOTE(review): pool_size=600 matches neither target_len (20) nor the
    # BiLSTM width (2*768) — presumably hand-tuned; confirm the intent.
    X = MaxPooling1D(pool_size=600)(X)
    X = Flatten()(X)
    X = Dense(complexity)(X)
    X = Activation('elu')(X)
    X = Dense(complexity, name='embeding_output')(X)
    X = Activation('sigmoid')(X)
    X = Dense(len(data_generator.topic2index), activation='softmax')(X)

    model = Model(inputs=inp, outputs=X)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
    return model, embedder

# Train the LSTM + MaxPooling classifier; same early-stopping scheme.
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)


new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, '  LSTM + MaxPooling')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)


0 87.0

3 152.1

6 110.5

9 146.7

12 166.2

15 79.8

18 47.2

21 84.0

24 144.8

27 83.8



6.3 LSTM + Conv1D + AveragePooling



def models_builder(data_generator):
    """Two-branch classifier: a BiLSTM stack and a Conv1D stack over the same
    input, merged and downsampled by Conv1D/AveragePooling1D blocks.

    Returns (model, embedder); 'embeding_output' is the embedding layer.
    """
    complexity = 600
    inputs = Input(shape=(data_generator.target_len, data_generator.embedding_size))

    # Recurrent branch.
    recurrent = Bidirectional(LSTM(complexity, return_sequences=True))(inputs)
    recurrent = Bidirectional(LSTM(complexity, return_sequences=True))(recurrent)

    # Convolutional branch.
    convolved = Conv1D(complexity, 3, strides=1, padding='same')(inputs)
    convolved = Conv1D(complexity, 3, strides=1, padding='same')(convolved)

    # Merge the branches, then downsample three times.
    net = Concatenate()([recurrent, convolved])
    net = AveragePooling1D(pool_size=2)(net)
    net = Conv1D(complexity, 3, strides=1, padding='same')(net)
    net = AveragePooling1D(pool_size=2)(net)
    net = Conv1D(complexity, 3, strides=1, padding='same')(net)
    net = AveragePooling1D(pool_size=2)(net)

    # Classifier head; the second dense layer is the sentence embedding.
    net = Flatten()(net)
    net = Dense(complexity)(net)
    net = Activation('sigmoid')(net)
    net = Dense(complexity, name='embeding_output')(net)
    net = Activation('elu')(net)
    net = Dense(len(data_generator.topic2index), activation='softmax')(net)

    model = Model(inputs=inputs, outputs=net)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    embedder = Model(inputs=inputs, outputs=model.get_layer('embeding_output').output)
    return model, embedder

# Build and wire up the evaluation for the LSTM + Conv1D + AveragePooling
# model. Unlike the other sections, no training loop is shown here in the
# article; presumably the same loop as the neighbouring sections was used.
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)


0 353.8

3 -147.8

6 7.6

9 5.5

12 -133.6

15 -133.6

18 9.0

21 9.0

24 -133.6



6.4 LSTM + Inception + Attention



def models_builder(data_generator):
    """Two-branch classifier: inception-style Conv1D stacks in one branch and
    BiLSTMs with multiplicative attention in the other, concatenated into a
    dense head. Returns (model, embedder); 'embeding_output' names the Dense
    inside the last dense_layer block (pre-activation/BN/Dropout).
    """
    rate = 0.20
    complexity = 500

    def inception_convolutional_layer(X, complexity, rate=0.2, regularizer=0):
        # Parallel Conv1D filters with kernel sizes 7..1, widths summing to
        # roughly `complexity`, concatenated then ELU + BN + Dropout.
        X_7 = Conv1D(int(complexity/7), kernel_size=7, strides=1, padding='same')(X)
        X_6 = Conv1D(int(complexity/6), kernel_size=6, strides=1, padding='same')(X)
        X_5 = Conv1D(int(complexity/5), kernel_size=5, strides=1, padding='same')(X)
        X_4 = Conv1D(int(complexity/4), kernel_size=4, strides=1, padding='same')(X)
        X_3 = Conv1D(int(complexity/3), kernel_size=3, strides=1, padding='same')(X)
        X_2 = Conv1D(int(complexity/2), kernel_size=2, strides=1, padding='same')(X)
        X_1 = Conv1D(int(complexity/1), kernel_size=1, strides=1, padding='same')(X)
        X = Concatenate()([X_7, X_6, X_5, X_4, X_3, X_2, X_1])
        X = Activation('elu')(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        return X

    def bi_LSTM(X, complexity, rate=0.2, regularizer=0):
        # BiLSTM (complexity/2 per direction) + BN + Dropout.
        X = Bidirectional(LSTM(int(complexity/2), return_sequences=True))(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        return X

    def dense_layer(X, complexity, activation='elu', rate=0.2, regularizer=0, name=None):
        # Dense (optionally named) + activation + BN + Dropout.
        X = Dense(int(complexity), name=name)(X)
        X = Activation(activation)(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        return X

    inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    # Convolutional branch: three inception blocks, then pool/conv pairs.
    X = inp
    X = inception_convolutional_layer(X, complexity)
    X = inception_convolutional_layer(X, complexity)
    X = inception_convolutional_layer(X, complexity)
    X = MaxPooling1D(pool_size=2)(X)
    X = inception_convolutional_layer(X, complexity)
    X = MaxPooling1D(pool_size=2)(X)
    X = inception_convolutional_layer(X, complexity)
    X = MaxPooling1D(pool_size=2)(X)

    # Recurrent branch with element-wise (multiplicative) attention.
    R = inp
    R = bi_LSTM(R, complexity)
    R = bi_LSTM(R, complexity/2)
    attention_probs = Dense(int(complexity/2), activation='sigmoid', name='attention_probs')(R)
    R = multiply([R, attention_probs], name='attention_mul')
    R = Dropout(rate)(R)
    R = MaxPooling1D(pool_size=2)(R)
    R = inception_convolutional_layer(R, complexity)
    R = MaxPooling1D(pool_size=2)(R)
    R = inception_convolutional_layer(R, complexity)
    R = MaxPooling1D(pool_size=2)(R)

    # Merge branches and classify.
    X = Concatenate(axis=-1)([X, R])
    X = Flatten()(X)
    X = BatchNormalization()(X)
    X = Dropout(rate)(X)

    X = dense_layer(X, complexity)
    X = dense_layer(X, complexity, activation='sigmoid')
    X = dense_layer(X, complexity, name='embeding_output')

    X = Dense(len(data_generator.topic2index), activation='softmax')(X)

    model = Model(inputs=inp, outputs=X)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
    return model, embedder

# Train the LSTM + Inception + Attention classifier; same early-stopping scheme.
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)


new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, '  LSTM + Inception + Attention')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)


0 275.0

3 126.8

6 173.9

9 155.5

12 168.4

15 287.2

18 382.8

21 303.4



plot_results()


png



7 Triplet loss



, , , . , , , .

Triplet loss



7.1 Triplet loss BOW



class TripletDataGeneratorIndexes(BowDataGenerator):
    """Yields (anchor, positive, negative) BOW batches for triplet training.

    anchor/positive come from the same topic (distinct texts), negative from
    a different topic.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Group texts by topic.
        self.database = {}
        for text, topic in self.texts_topics:
            if topic not in self.database:
                self.database[topic] = []
            self.database[topic].append(text)
        # Drop topics with 5 or fewer texts: anchor/positive pairs need
        # enough distinct texts per topic to sample from.
        sh_database = {}
        for topic in self.database:
            if len(self.database[topic]) > 5:
                sh_database[topic] = self.database[topic]
        self.database = sh_database

        self.all_topics = [topic for topic in self.database]

    def __iter__(self):
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            anchor = []
            positive = []
            negative = []

            for _ in range(self.batch_size):
                # Anchor and positive: two distinct texts from the same topic.
                anchor_topic = random.choice(self.all_topics)
                anchor_index = np.random.randint(len(self.database[anchor_topic]))
                positive_index = np.random.randint(len(self.database[anchor_topic]))
                while positive_index == anchor_index:
                    positive_index = np.random.randint(len(self.database[anchor_topic]))

                # Negative: any text from a different topic.
                negative_topic = random.choice(self.all_topics)
                while negative_topic == anchor_topic:
                    negative_topic = random.choice(self.all_topics)

                negative_index = np.random.randint(len(self.database[negative_topic]))

                anchor.append(self.database[anchor_topic][anchor_index])
                positive.append(self.database[anchor_topic][positive_index])
                negative.append(self.database[negative_topic][negative_index])

            yield self.vectorize(anchor), self.vectorize(positive), self.vectorize(negative)


def models_builder(data_generator):
    """Triplet network over BOW vectors.

    A single shared encoder is applied to anchor/positive/negative inputs;
    their encodings are concatenated and trained with a 'lossless' triplet
    loss. Returns (model, embedder).
    """
    sentence_embeding_size = 100
    def lossless_triplet_loss(y_true, y_pred, N=sentence_embeding_size, beta=100, epsilon=1e-8):
        """
        Implementation of the triplet loss function

        Arguments:
        y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
        y_pred -- python list containing three objects:
                anchor -- the encodings for the anchor data
                positive -- the encodings for the positive data (similar to anchor)
                negative -- the encodings for the negative data (different from anchor)
        N  --  The number of dimension 
        beta -- The scaling factor, N is recommended
        epsilon -- The Epsilon value to prevent ln(0)

        Returns:
        loss -- real number, value of the loss
        """
        # y_pred is the concat [anchor | positive | negative], N wide each.
        anchor = tf.convert_to_tensor(y_pred[:,0:N])
        positive = tf.convert_to_tensor(y_pred[:,N:N*2]) 
        negative = tf.convert_to_tensor(y_pred[:,N*2:N*3])

        # distance between the anchor and the positive
        pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor,positive)),1)
        # distance between the anchor and the negative
        neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor,negative)),1)

        # Non-linear (log) squashing so the loss stays bounded.
        pos_dist = -tf.math.log(-tf.math.divide((pos_dist),beta)+1+epsilon)
        neg_dist = -tf.math.log(-tf.math.divide((N-neg_dist),beta)+1+epsilon)

        # compute loss
        loss = neg_dist + pos_dist
        return loss

    def basic_sentence_vectorizer():
        # Shared encoder: BOW -> dense stack -> sentence embedding.
        inp = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
        X = inp
        X = Dense(complexity)(X)
        X = Activation('elu')(X)
        X = Dense(complexity)(X)
        X = Activation('elu')(X)
        X = Dense(complexity, name='embeding_output')(X)
        X = Activation('elu')(X)
        # BUG FIX: the final layer must emit exactly sentence_embeding_size
        # units — the loss slices y_pred into three N-wide chunks, but the
        # old final Dense(complexity) emitted 300 units, so N=100 sliced the
        # concatenated output at the wrong offsets and mixed the three
        # encodings. Sigmoid bounds coordinates to (0, 1), matching the
        # section 7.2 variant of this network.
        X = Dense(sentence_embeding_size, activation='sigmoid')(X)
        vectorizer = Model(inputs=inp, outputs=X)
        return vectorizer

    complexity = 300

    inp_anchor = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
    inp_positive = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
    inp_negative = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))

    # One encoder instance shared by all three inputs (weight tying).
    embedder = basic_sentence_vectorizer()

    anchor = embedder(inp_anchor)
    positive = embedder(inp_positive)
    negative = embedder(inp_negative)

    output = Concatenate(axis=1)([anchor, positive, negative])

    model = Model(inputs=[inp_anchor, inp_positive, inp_negative], outputs=output)
    model.compile(optimizer='adagrad', loss=lossless_triplet_loss)
    model.summary()
    return model, embedder

# Train the BOW triplet network; same early-stopping scheme as the other
# sections.
data_generator = TripletDataGeneratorIndexes(batch_size=128, batches_per_epoch=10000)
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)


# Dummy y_true: the triplet loss only reads y_pred.
zeros = np.zeros((data_generator.batch_size, 1, 1))
new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, 'triplet loss indexes')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for a, p, n in data_generator:
        model.train_on_batch([a, p, n], zeros)


0 724.1

3 -143.5

6 11.7

9 36.2

12 -123.5

15 150.1

18 -51.9

21 5.0

24 -43.5



7.2 Triplet loss embedings



class TripletDataGeneratorEmbedings(TripletDataGeneratorIndexes):
    """Triplet generator that yields padded word-embedding sequences instead
    of BOW vectors.

    BUG FIX: the old __init__ called super().__init__() with no arguments,
    silently discarding constructor options — e.g. batch_size=32 at the call
    site was ignored and the parent default (128) was used instead. The
    options the parent understands are now forwarded.
    """
    def __init__(self, *args, target_len=20, use_word2vec=True, use_fasttext=True, **kwargs):
        # Forward texts_topics / batch_size / batches_per_epoch to the parent
        # instead of dropping them.
        super().__init__(*args, **kwargs)
        self.target_len = target_len
        self.use_word2vec = use_word2vec
        self.use_fasttext = use_fasttext
        # Probe the word vectorizer once to learn the embedding width.
        self.embedding_size = len(vectorize('any_token'))

    def vectorize(self, sentences):
        # Reuse the sequence vectorizer (tokens -> padded embedding matrix).
        return LabelsDataGenerator.vectorize(self, sentences)


def models_builder(data_generator):
    """Triplet network over padded word-embedding sequences.

    The shared encoder is the two-branch (inception Conv1D + attention
    BiLSTM) architecture from section 6.4, ending in a sigmoid layer of
    sentence_embeding_size units; encodings of anchor/positive/negative are
    concatenated and trained with the 'lossless' triplet loss.
    Returns (model, embedder).
    """
    sentence_embeding_size = 300
    def lossless_triplet_loss(y_true, y_pred, N=sentence_embeding_size, beta=100, epsilon=1e-8):
        """
        Implementation of the triplet loss function

        Arguments:
        y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
        y_pred -- python list containing three objects:
                anchor -- the encodings for the anchor data
                positive -- the encodings for the positive data (similar to anchor)
                negative -- the encodings for the negative data (different from anchor)
        N  --  The number of dimension
        beta -- The scaling factor, N is recommended
        epsilon -- The Epsilon value to prevent ln(0)

        Returns:
        loss -- real number, value of the loss
        """
        # y_pred is the concat [anchor | positive | negative], N wide each.
        anchor = tf.convert_to_tensor(y_pred[:,0:N])
        positive = tf.convert_to_tensor(y_pred[:,N:N*2])
        negative = tf.convert_to_tensor(y_pred[:,N*2:N*3])

        # distance between the anchor and the positive
        pos_dist = tf.math.reduce_sum(tf.math.square(tf.math.subtract(anchor,positive)),1)
        # distance between the anchor and the negative
        neg_dist = tf.math.reduce_sum(tf.math.square(tf.math.subtract(anchor,negative)),1)

        # Non-linear (log) squashing so the loss stays bounded.
        pos_dist = -tf.math.log(-tf.math.divide((pos_dist),beta)+1+epsilon)
        neg_dist = -tf.math.log(-tf.math.divide((N-neg_dist),beta)+1+epsilon)

        # compute loss
        loss = neg_dist + pos_dist

        return loss

    def inception_convolutional_layer(X, complexity, rate=0.2, regularizer=0):
        # Parallel Conv1D filters (kernel sizes 7..1) + ELU + BN + Dropout.
        X_7 = Conv1D(int(complexity/7), kernel_size=7, strides=1, padding='same')(X)
        X_6 = Conv1D(int(complexity/6), kernel_size=6, strides=1, padding='same')(X)
        X_5 = Conv1D(int(complexity/5), kernel_size=5, strides=1, padding='same')(X)
        X_4 = Conv1D(int(complexity/4), kernel_size=4, strides=1, padding='same')(X)
        X_3 = Conv1D(int(complexity/3), kernel_size=3, strides=1, padding='same')(X)
        X_2 = Conv1D(int(complexity/2), kernel_size=2, strides=1, padding='same')(X)
        X_1 = Conv1D(int(complexity/1), kernel_size=1, strides=1, padding='same')(X)
        X = Concatenate()([X_7, X_6, X_5, X_4, X_3, X_2, X_1])
        X = Activation('elu')(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        return X

    def bi_LSTM(X, complexity, rate=0.2, regularizer=0):
        # BiLSTM (complexity/2 per direction) + BN + Dropout.
        X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(int(complexity/2), return_sequences=True))(X)
        X = tf.keras.layers.BatchNormalization()(X)
        X = tf.keras.layers.Dropout(rate)(X)
        return X

    def dense_layer(X, complexity, rate=0.2, regularizer=0):
        # Dense + ELU + BN + Dropout.
        X = tf.keras.layers.Dense(int(complexity))(X)
        X = tf.keras.layers.Activation('elu')(X)
        X = tf.keras.layers.BatchNormalization()(X)
        X = tf.keras.layers.Dropout(rate)(X)
        return X

    def basic_sentence_vectorizer():
        # Shared encoder applied (weight-tied) to all three triplet inputs.
        rate = 0.20
        complexity = 300
        inp = Input(shape = (data_generator.target_len, data_generator.embedding_size))

        # Convolutional branch.
        X = inp
        X = inception_convolutional_layer(X, complexity)
        X = inception_convolutional_layer(X, complexity)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)

        # Recurrent branch with multiplicative attention.
        R = inp
        R = bi_LSTM(R, complexity)
        R = bi_LSTM(R, complexity/2)
        attention_probs = tf.keras.layers.Dense(int(complexity/2), activation='sigmoid', name='attention_probs')(R)
        R = multiply([R, attention_probs], name='attention_mul')
        R = Dropout(rate)(R)
        R = MaxPooling1D(pool_size=2)(R)
        R = inception_convolutional_layer(R, complexity)
        R = MaxPooling1D(pool_size=2)(R)
        R = inception_convolutional_layer(R, complexity)
        R = MaxPooling1D(pool_size=2)(R)

        X = Concatenate(axis=-1)([X, R])
        X = Flatten()(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)

        X = dense_layer(X, complexity)
        X = dense_layer(X, complexity)
        X = dense_layer(X, complexity)

        # Sigmoid output — presumably to bound each coordinate to (0, 1) for
        # the log-based triplet loss; confirm against the loss derivation.
        X = Dense(sentence_embeding_size, activation='sigmoid')(X)
        vectorizer = Model(inputs=inp, outputs=X)
        return vectorizer

    inp_anchor = Input(shape = (data_generator.target_len, data_generator.embedding_size))
    inp_positive = Input(shape = (data_generator.target_len, data_generator.embedding_size))
    inp_negative = Input(shape = (data_generator.target_len, data_generator.embedding_size))

    embedder = basic_sentence_vectorizer()

    anchor = embedder(inp_anchor)
    positive = embedder(inp_positive)
    negative = embedder(inp_negative)

    output = Concatenate(axis=1)([anchor, positive, negative])

    model = Model(inputs=[inp_anchor, inp_positive, inp_negative], outputs=output)
    model.compile(optimizer='adagrad', loss=lossless_triplet_loss)
    model.summary()
    return model, embedder

# Train the embedding-sequence triplet network; same early-stopping scheme.
data_generator = TripletDataGeneratorEmbedings(target_len=20, batch_size=32, batches_per_epoch=10000)
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)


# Dummy y_true: the triplet loss only reads y_pred.
zeros = np.zeros((data_generator.batch_size, 1, 1))
new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, 'triplet loss embeding')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i>20:
            break
    for a, p, n in data_generator:
        model.train_on_batch([a, p, n], zeros)


0 283.9

3 334.2

6 218.1

9 219.6

12 262.8

15 282.4

18 289.7

21 274.9



plot_results()


png





, ELMO .. . , .



BOW . , .



. , . , . , .



Triplet loss embedings . , 100 .



Two methods — BOW over lemmas with stop words removed, and the tf-idf-weighted mean of word vectors — do not give outstanding average results, but for some sentences they perform very, very well. For these methods, everything depends on the data.



It is likely that in time there will be a Part 3, if I collect enough ideas.




All Articles