from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/Shared drives/Тяжелые проекты/ИАД/intro-to-dl-seminars/hw_5_text')
%tensorflow_version 2.x
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (Dense, Input, LSTM, Embedding, Dropout, Activation,
                                     Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D,
                                     concatenate, Conv1D, LeakyReLU, BatchNormalization,
                                     MaxPooling1D, Flatten)
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.optimizers import Adamax, Adam
from tqdm import tqdm_notebook
tf.test.gpu_device_name()
Based on the experiments, the embedding trained on Twitter performs best.
EMBEDDING_FILE='glove/glove.twitter.27B.100d.txt'
TRAIN_DATA_FILE='x_train.txt'
TEST_DATA_FILE='x_test.txt'
TRAIN_LABELS='y_train.csv'
!cat {EMBEDDING_FILE} | wc -l  # number of word vectors in the embedding file
embed_size = 100      # embedding dimensionality
max_features = 50000  # maximum vocabulary size (most frequent tokens kept by the tokenizer)
maxlen = 150          # maximum phrase length in tokens
# The separator "ЯЯЯЯЯЯЯЯ" never occurs in the texts, so each line is read as a single 'text' field
train = pd.read_csv(TRAIN_DATA_FILE, header=None, names=['text'], sep="ЯЯЯЯЯЯЯЯ", engine='python')
test = pd.read_csv(TEST_DATA_FILE, header=None, names=['text'], sep="ЯЯЯЯЯЯЯЯ", engine='python')
y = pd.read_csv(TRAIN_LABELS)['Probability'].values
list_sentences_train = train["text"].fillna("_na_").values
list_sentences_test = test["text"].fillna("_na_").values
%%time
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)
As you can see, the data in some of the vectors does not match the expected format (a few lines in the embedding file are malformed), so those entries have to be found and removed.
def get_coefs(word, *arr):
    # split one GloVe line into (word, vector)
    return word, np.asarray(arr, dtype='float32')

with open(EMBEDDING_FILE) as f:
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in tqdm_notebook(f))

# Find the entries whose vectors do not have the expected shape (malformed lines)
sh = embeddings_index['a'].shape
for k, v in embeddings_index.items():
    if v.shape != sh:
        print(k)

# Drop the broken entry revealed by the loop above
del embeddings_index['-0.32053']
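A more defensive way to read the file (a sketch, not the approach used above) would be to skip any line whose vector length differs from embed_size:

embeddings_index_clean = {}
with open(EMBEDDING_FILE) as f:
    for line in f:
        parts = line.rstrip().split(' ')
        if len(parts) != embed_size + 1:
            continue  # malformed line, e.g. a token that itself contains spaces
        embeddings_index_clean[parts[0]] = np.asarray(parts[1:], dtype='float32')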
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# Initialize the matrix with random vectors drawn from the pretrained distribution,
# then overwrite the rows of words that do have a pretrained vector
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
Summary of training a simple CNN (embedding, Conv1D, GlobalMaxPooling, Dense) on 10% of the data; the values are validation AUC, rows are the kernel size and columns the number of filters. The more filters, the better the model, while a moderate kernel size works best. A sketch of this baseline follows the table.
| kernel size \ filters | 32 | 64 | 128 | 256 |
|---|---|---|---|---|
| 3 | 0.9678 | 0.9712 | 0.9732 | 0.9754 |
| 5 | 0.9704 | 0.9720 | 0.9741 | 0.9758 |
| 6 | 0.9694 | 0.9713 | 0.9738 | 0.9758 |
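For reference, a minimal sketch of that baseline; the Dense width and the compile settings are assumptions, and the default arguments correspond to one cell of the table (filters=256, kernel_size=5):

def simple_cnn(filters=256, kernel_size=5):
    # Baseline from the table: pretrained embedding -> Conv1D -> GlobalMaxPooling -> Dense head
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(64, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)
    m = Model(inp, out)
    m.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['AUC'])
    return m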
Next, we take the two best-performing models of different natures and combine them.
There was also an attempt to add one more input based on text metadata (number of words, characters, unique words, etc.), but that model showed unsatisfactory quality; a sketch of the idea is given below.
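A minimal sketch of how such metadata features could be computed and wired in as a third input; the particular feature set and layer size here are assumptions, not the configuration that was actually tried:

def text_meta_features(texts):
    # Simple per-text statistics: word count, character count, unique-word count, mean word length
    feats = []
    for t in texts:
        words = str(t).split()
        n_words = len(words)
        feats.append([
            n_words,
            len(str(t)),
            len(set(words)),
            float(np.mean([len(w) for w in words])) if n_words else 0.0,
        ])
    return np.asarray(feats, dtype='float32')

meta_train = text_meta_features(list_sentences_train)
meta_input = Input(shape=(meta_train.shape[1],))
meta_branch = Dense(16, activation='relu')(meta_input)  # could be concatenated with the CNN/LSTM branches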
def cnn_layers(inp):
    # CNN branch: three Conv1D blocks (64 -> 128 -> 256 filters) with BatchNorm, LeakyReLU, pooling and dropout
    x = Embedding(max_features, embed_size, input_length=maxlen, weights=[embedding_matrix], trainable=True)(inp)
    x = Conv1D(filters=64, kernel_size=5)(x)
    x = LeakyReLU(0.1)(x)
    x = Conv1D(filters=64, kernel_size=5)(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(0.1)(x)
    x = MaxPooling1D(padding='same')(x)
    x = Dropout(0.2)(x)
    x = Conv1D(filters=128, kernel_size=5)(x)
    x = LeakyReLU(0.1)(x)
    x = Conv1D(filters=128, kernel_size=5)(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(0.1)(x)
    x = MaxPooling1D(padding='same')(x)
    x = Dropout(0.2)(x)
    x = Conv1D(filters=256, kernel_size=5)(x)
    x = LeakyReLU(0.1)(x)
    x = Conv1D(filters=256, kernel_size=5)(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(0.1)(x)
    x = MaxPooling1D(padding='same')(x)
    x = Dropout(0.2)(x)
    x = Flatten()(x)
    x = Dense(512)(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(0.1)(x)
    return x
def lstm_layers(inp):
    # LSTM branch: bidirectional LSTM followed by max- and average-pooling over time
    x = Embedding(max_features, embed_size, input_length=maxlen, weights=[embedding_matrix], trainable=True)(inp)
    x = Bidirectional(LSTM(80, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    gmx = GlobalMaxPool1D()(x)
    gax = GlobalAveragePooling1D()(x)
    concatenated = concatenate([gmx, gax])
    x = Dense(80, activation="relu")(concatenated)
    return x
input_text = Input(shape=(maxlen,))
cnn = cnn_layers(input_text)
lstm = lstm_layers(input_text)
concatenated = concatenate([cnn, lstm])
x = Dropout(0.1)(concatenated)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=input_text, outputs=x)
model.compile(loss='binary_crossentropy', optimizer=Adam(clipvalue=2, clipnorm=2), metrics=['accuracy', 'AUC'])
# To continue from a previously saved checkpoint instead of training from scratch,
# load it explicitly, e.g.: model = keras.models.load_model('<path_to_checkpoint>.hdf5')
model.summary()
class ModelSaveCallback(keras.callbacks.Callback):
    """Saves the model to `file_name` (formatted with the epoch number) after every epoch."""
    def __init__(self, file_name):
        super(ModelSaveCallback, self).__init__()
        self.file_name = file_name

    def on_epoch_end(self, epoch, logs=None):
        filename = self.file_name.format(epoch)
        keras.models.save_model(self.model, filename)
class ModelPredictCallback(keras.callbacks.Callback):
    """Predicts on X_test after every epoch and writes a submission CSV named `file_name`."""
    def __init__(self, file_name):
        super(ModelPredictCallback, self).__init__()
        self.file_name = file_name

    def on_epoch_end(self, epoch, logs=None):
        filename = self.file_name.format(epoch)
        result = self.model.predict(X_test, batch_size=batch_size * 16, verbose=0)
        df = pd.DataFrame()
        df['Id'] = range(1, len(X_test) + 1)
        df['Probability'] = result
        df.to_csv(filename, index=None)
TAKE = 28  # attempt (run) number
batch_size = 512
NAME = 'kernel_conv_lstm_nofeatures_glove.twitter.27B.100d'
model_filename = '{0:02d}_{1:s}_{{0:02d}}.hdf5'.format(TAKE, NAME)
predict_filename = '{0:02d}_{1:s}_{{0:02d}}.csv'.format(TAKE, NAME)
model.fit(X_train, y, batch_size=batch_size, epochs=1, validation_split=0.1, verbose=1,
          initial_epoch=0,
          callbacks=[
              ModelSaveCallback(model_filename),
              ModelPredictCallback(predict_filename),
          ])
from IPython.display import Audio
Audio('First_epoch.mp3', autoplay=True)  # play a notification sound once the first epoch is done
model.fit(X_train, y, batch_size=batch_size, epochs=7, validation_split=0.1, verbose=2,
          initial_epoch=1,
          callbacks=[
              ModelSaveCallback(model_filename),
              ModelPredictCallback(predict_filename),
          ])
The model is now fully trained; let's add an element of surprise and fine-tune it for a few more epochs on small random subsets of the training data with a smaller batch size.
for i in range(20):
    print(i)
    # fine-tune for one epoch on a random ~5% subset of the training data
    idx = np.random.randint(X_train.shape[0], size=int(X_train.shape[0] / 20))
    X_train1 = X_train[idx, :]
    y1 = y[idx]
    model.fit(X_train1, y1, batch_size=int(batch_size / 2), epochs=i + 4, verbose=1, initial_epoch=i + 3)
result = model.predict(X_test, batch_size=batch_size*16, verbose=0)
df = pd.DataFrame()
df['Id'] = range(1, len(X_test) + 1)
df['Probability'] = result
df.to_csv('night_sgd.csv', index=None)
model.fit(X_train, y, batch_size=batch_size, epochs=3, validation_split=0.1, verbose=1,
          initial_epoch=0,
          callbacks=[
              ModelSaveCallback(model_filename),
              ModelPredictCallback(predict_filename),
          ])