This work was done by Подчезерцев Алексей, ИАД5
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/Shared drives/Тяжелые проекты/ИАД/applied-ds/hw_01')
!mkdir -p ~/.kaggle
!cp secret/kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
# !pip install --upgrade -q kaggle
import numpy as np
import pandas as pd
import json
import pickle  # used below to save LightFM model checkpoints
from tqdm import tqdm_notebook
import multiprocessing
multiprocessing.cpu_count()
from numba import cuda
device = cuda.get_current_device()
device.reset()
!nvidia-smi
def lazy_train_reader(fname):
data = []
with open(fname) as f:
for line in tqdm_notebook(f, total=42977):
obj = json.loads(line)
userId = int(obj['userId'])
for itemId, event in obj['trainRatings'].items():
data.append((userId, int(itemId), int(event)))
return pd.DataFrame(data, columns=['userId', 'itemId', 'event'], dtype=np.uint32)
def lazy_item_reader(fname):
items_list=[]
with open(fname) as f:
for line in tqdm_notebook(f, total=328050):
item=json.loads(line)
if isinstance(item['image'], float):
item['image']=[0 for _ in range(96)]
item['image']=np.array(item['image'])
items_list.append(item)
return pd.DataFrame(items_list).set_index('itemId')
items = lazy_item_reader('items.json')
items.to_pickle('items.pkl')
train = lazy_train_reader('train.json')
train.to_pickle('train.pkl')
if 'items' not in vars():
items = pd.read_pickle('items.pkl')
if 'train' not in vars():
train = pd.read_pickle('train.pkl')
if 'random_benchmark' not in vars():
random_benchmark = pd.read_csv('random_benchmark.csv')
# Reweight events: clicks (event == 1) get weight (#skips / #clicks), skips get -1,
# so positives and negatives cancel out; dividing by 20 just rescales the magnitudes.
train['event_float'] = train['event'].astype(np.float32).replace(1, 10).replace(0, -1)
train['event_float'] = (train['event_float'].replace(10, (train['event'] == 0).sum()/(train['event'] == 1).sum())/20)
With these weights the mean of event_float is zero.
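A quick sanity check (a sketch; assumes the train frame from above is in memory) that the reweighting really centers event_float at zero:
# The positive weight was chosen as (#skips / #clicks), so positives and negatives cancel out.
n_pos = (train['event'] == 1).sum()
n_neg = (train['event'] == 0).sum()
print(n_pos * (n_neg / n_pos) - n_neg)   # ~0 (zero up to floating-point rounding)
print(train['event_float'].mean())       # likewise ~0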
train['event_float'].unique()
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
%%time
matrix = csr_matrix((train['event_float'], (train['userId'], train['itemId'])))
%%time
u,s,vt = svds(matrix, k=100)
s = np.diag(s)
random_groups = random_benchmark.groupby('userId')['itemId'].agg(lambda x: list(x))
result = []
for userId, items_ in tqdm_notebook(random_groups.iteritems(), total=len(random_groups)):
for itemId in items_:
result.append({'userId': userId, 'itemId': itemId, 'score': u[userId] @ s @ vt[:, itemId]})
result = pd.DataFrame(result)
result = result.sort_values(['userId', 'score'], ascending=[True, False])
result
result[['userId', 'itemId']].to_csv('21_offtop_svd20.csv', index=None)
!pip install lightfm
from lightfm import LightFM
from sklearn.feature_extraction.text import TfidfVectorizer
%%time
vect_t = TfidfVectorizer(min_df=3, max_df=0.1)
title_tf = vect_t.fit_transform(items['title'])
%%time
vect_c = TfidfVectorizer(min_df=10, max_df=0.1, max_features=5000)
content_tf = vect_c.fit_transform(items['content'])
title_tf.shape, content_tf.shape
title_tf
content_tf
model = LightFM(no_components=100)
rg = random_benchmark.groupby('userId')['itemId'].agg(lambda x: list(x))
for i in tqdm_notebook(range(30)):
    # break  # uncomment to skip retraining and keep the previously fitted model
model.fit_partial(matrix, item_features=title_tf,
epochs=1,
num_threads=multiprocessing.cpu_count(),
verbose=False)
with open(f'32_lightfm_rating_{i:02d}.pkl', 'wb') as fle:
pickle.dump(model, fle)
if i % 10 == 9:
try:
result = []
            for userId, items_ in tqdm_notebook(rg.iteritems(), total=len(rg)):  # items_ avoids shadowing the items DataFrame
                proba = model.predict(userId, np.array(items_), item_features=title_tf)
                result += list(zip([userId] * len(items_), items_, proba))
result = pd.DataFrame(result, columns=['userId', 'itemId', 'event'])
result.sort_values(['userId', 'event'], ascending=[True, False])[['userId', 'itemId']].to_csv(f'32_lightfm_rating_{i:02d}.csv', index=None)
except Exception as e:
pass
model = LightFM(no_components=100)
for i in tqdm_notebook(range(10)):
    # break  # uncomment to skip retraining and keep the previously fitted model
model.fit_partial(matrix, item_features=content_tf,
epochs=1,
num_threads=multiprocessing.cpu_count(),
verbose=False)
with open(f'34_lightfm_rating_{i:02d}.pkl', 'wb') as fle:
pickle.dump(model, fle)
if i % 5 == 4:
try:
result = []
            for userId, items_ in tqdm_notebook(rg.iteritems(), total=len(rg)):  # items_ avoids shadowing the items DataFrame
                proba = model.predict(userId, np.array(items_), item_features=content_tf)
                result += list(zip([userId] * len(items_), items_, proba))
result = pd.DataFrame(result, columns=['userId', 'itemId', 'event'])
result.sort_values(['userId', 'event'], ascending=[True, False])[['userId', 'itemId']].to_csv(f'34_lightfm_rating_{i:02d}.csv', index=None)
except Exception as e:
pass
df1 = pd.read_csv('21_offtop_svd20.csv')
df2 = pd.read_csv('submits/32_lightfm_rating_09.csv')
df3 = pd.read_csv('submits/34_lightfm_rating_04.csv')
df1.reset_index(inplace=True)
df2.reset_index(inplace=True)
df3.reset_index(inplace=True)
# Each submission is already sorted best-first per user, so the row position serves as a rank;
# dividing by the total row count (presumably 3,018,185 rows per file) normalizes it to [0, 1).
df1['index'] /= 3018185
df2['index'] /= 3018185
df3['index'] /= 3018185
df1=df1.groupby(['userId', 'itemId'])['index'].mean()
df2=df2.groupby(['userId', 'itemId'])['index'].mean()
df3=df3.groupby(['userId', 'itemId'])['index'].mean()
df1= pd.DataFrame(df1)
df2= pd.DataFrame(df2)
df3= pd.DataFrame(df3)
df = df1.join(df2, on=['userId', 'itemId'],rsuffix='_2').join(df3, on=['userId', 'itemId'], rsuffix='_3')
df.columns = ['score_1', 'score_2', 'score_3']
# A lower summed rank means the item appeared closer to the top across all three submissions.
df['score_'] = df['score_1'] + df['score_2'] + df['score_3']
df.reset_index(inplace=True)
df.sort_values(['userId', 'score_'], ascending=[True, True])[['userId', 'itemId']].to_csv(f'50_multisubmit_32_lightfm_rating_09+34_lightfm_rating_09+21_offtop_svd20.csv', index=None)
df.sort_values(['userId', 'score_'], ascending=[True, True])
!kaggle competitions submit recsys-iad-challenge -f '50_multisubmit_32_lightfm_rating_09+34_lightfm_rating_09+21_offtop_svd20.csv' -m "Colab auto submit"
Below are sketches of code that never took off.
The main idea: fit a model that predicts each item's mean score, so we can compute a score even for items with no history (NDCG@20 == 0.120).
We take the outputs of the network's next-to-last layer for all items and feed them in as the initial weights of the matrix factorization.
From the factorization we can then take the weights of the embedding layers, and/or get predictions from its next-to-last and last layers.
On the resulting embeddings we fit a clustering algorithm, then train a separate SGD classifier on each cluster (CatBoost scored lower) and use it for prediction.
All this hoop-jumping buys about 0.125 NDCG@20.
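The NDCG@20 figures above come from the leaderboard; a minimal local sketch of the metric (binary relevance, standard log2 discounting; the column names are assumptions, not the competition's evaluation code) might look like this:
def ndcg_at_k(relevance, k=20):
    # relevance: true labels of one user's items, in the predicted (best-first) order
    rel = np.asarray(relevance, dtype=float)
    discounts = 1.0 / np.log2(np.arange(2, k + 2))
    dcg = (rel[:k] * discounts[:len(rel[:k])]).sum()
    ideal = np.sort(rel)[::-1][:k]
    idcg = (ideal * discounts[:len(ideal)]).sum()
    return dcg / idcg if idcg > 0 else 0.0

def mean_ndcg_at_20(ranked_df):
    # ranked_df: columns userId, itemId, event, already sorted best-first within each user
    return ranked_df.groupby('userId')['event'].apply(ndcg_at_k).mean()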
%tensorflow_version 2.x
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, concatenate, Conv1D, LeakyReLU,BatchNormalization, MaxPooling1D, Flatten
from tensorflow.keras.models import Model
# from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.optimizers import Adam
class ModelSaveCallback(keras.callbacks.Callback):
def __init__(self, file_name):
super(ModelSaveCallback, self).__init__()
self.file_name = file_name
def on_epoch_end(self, epoch, logs=None):
filename = self.file_name.format(epoch)
keras.models.save_model(self.model, filename)
def asciishell_score(data, min_views, count_col='count', mean_col='mean'):
rating = (data[count_col]/ (data[count_col] + min_views)) * data[mean_col] +\
(min_views / (min_views + data[count_col])) * data[mean_col].mean()
rating += ((data[mean_col] != 0) - 1)
return rating
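asciishell_score is essentially a Bayesian-average shrinkage: an item's mean event rate is pulled toward the global mean when it has few views, and items whose mean is exactly 0 are additionally pushed down by 1. A toy check with made-up numbers:
toy = pd.DataFrame({'count': [2, 200], 'mean': [1.0, 0.5]})  # global mean of 'mean' is 0.75
# item 0: (2/12)*1.0 + (10/12)*0.75 ≈ 0.79 -- heavily shrunk toward the global mean
# item 1: (200/210)*0.5 + (10/210)*0.75 ≈ 0.51 -- barely shrunk
print(asciishell_score(toy, min_views=10))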
item_rating = train.groupby('itemId')['event'].agg(['mean', 'count'])
item_rating['asciishell'] = asciishell_score(item_rating, 10)
df = items.join(item_rating, on='itemId')
%%time
max_features=100000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list((df["title"] + ' ' + df["content"]).values))
max_content_size=150
max_title_size=10
embed_size=300
EMBEDDING_FILE = '/content/drive/Shared drives/Тяжелые проекты/text_embeddings/cc.ru.300.vec'
def get_embedding_matrix(tokenizer, max_features, embed_size, embed_file):
def get_coefs(word,*arr):
return word, np.asarray(arr, dtype='float32')
with open(embed_file) as f:
f.readline()
embeddings_index = dict(get_coefs(*o.strip().split()) for o in tqdm_notebook(f, total=2000000))
skip_, add_, not_found_, tot_ = 0,0,0,0
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
tot_ += 1
if i >= max_features:
skip_ += 1
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
add_ += 1
embedding_matrix[i] = embedding_vector
else:
not_found_ += 1
    print('Skip {:.3f}, Not found {:.3f}, Add {:.3f}, total {:d}'.format(skip_/tot_, not_found_/tot_, add_/tot_, tot_))
return embedding_matrix
embedding_matrix = get_embedding_matrix(tokenizer, max_features, embed_size, EMBEDDING_FILE)
T_content = pad_sequences(tokenizer.texts_to_sequences(df['content'].values), maxlen=max_content_size)
T_title = pad_sequences(tokenizer.texts_to_sequences(df['title'].values), maxlen=max_title_size)
T_image = np.stack(df['image'].values)
def get_lstm(emb, lstm_units):
x = Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(emb)
gmx = GlobalMaxPool1D()(x)
gax = GlobalAveragePooling1D()(x)
concatenated = concatenate([gmx, gax])
return Dense(lstm_units, activation=LeakyReLU())(concatenated)
def get_cnn(emb, out_units=512,
kernel_size=5, initial_filters=64, pack_count=3,
leaky_relu_rate=0.1, dropout_rate=0.2):
x = emb
for pack in range(pack_count):
x = Conv1D(filters=initial_filters,
kernel_size=kernel_size,
activation=LeakyReLU(leaky_relu_rate))(x)
x = Conv1D(filters=initial_filters,
kernel_size=kernel_size,
activation=LeakyReLU(leaky_relu_rate))(x)
x = BatchNormalization()(x)
x = MaxPooling1D(padding='same')(x)
x = Dropout(dropout_rate)(x)
initial_filters *= 2
x = Flatten()(x)
    x = Dense(out_units, activation=LeakyReLU(leaky_relu_rate))(x)
x = BatchNormalization()(x)
return x
def get_model():
input_content = Input(shape=(max_content_size, ))
input_title = Input(shape=(max_title_size, ))
input_image = Input(shape=(96, ))
emb_title = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = True)(input_title)
emb_conte = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = True)(input_content)
lstm_content = get_lstm(emb_conte, 80)
lstm_title = get_lstm(emb_title, 10)
    cnn_content = get_cnn(emb_conte, out_units=128)
    cnn_title = get_cnn(emb_title, out_units=32, initial_filters=8, pack_count=1)
dense_image = Dense(48, activation='relu')(input_image)
concatenated = concatenate([lstm_content, lstm_title, cnn_content, cnn_title, dense_image])
x = Dropout(0.1)(concatenated)
x = Dense(80, activation='sigmoid', name='last_hidden_layer')(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=[input_content, input_title, input_image], outputs=x)
model.compile(loss='mse', optimizer=Adam(clipvalue=2, clipnorm=2))
intermediate_model = Model(inputs=model.input, outputs=model.get_layer("last_hidden_layer").output)
return model, intermediate_model
model, intermediate_model = get_model()
model.summary()
index_valid = df[~df['asciishell'].isna()].index
TAKE=25
batch_size=512
NAME='kernel_lstm_cnn_image_ru_embedding_kill_me_pls'
model_filename = '{0:02d}_{1:s}_{{0:02d}}.hdf5'.format(TAKE, NAME)
target = df['mean'][index_valid].values
target.min(), target.mean(), np.median(target), target.max()
model.fit(x=[T_content[index_valid], T_title[index_valid], T_image[index_valid]],
y=target,
batch_size=batch_size,
callbacks=[ModelSaveCallback(model_filename)],
validation_split=0.05,
initial_epoch=0, epochs=5)
model = keras.models.load_model('25_kernel_lstm_cnn_image_ru_embedding_kill_me_pls_01.hdf5', custom_objects={'LeakyReLU': LeakyReLU})
intermediate_model = Model(inputs=model.input, outputs=model.get_layer("last_hidden_layer").output)
from sklearn.metrics import roc_auc_score
_proba = model.predict([T_content[index_valid], T_title[index_valid], T_image[index_valid]], batch_size=batch_size*8, verbose=1)
roc_auc_score((target > np.median(target) * 0.5).astype(np.uint8),_proba), \
roc_auc_score((target > np.median(target) * 0.7).astype(np.uint8),_proba), \
roc_auc_score((target > np.median(target) * 1.0).astype(np.uint8),_proba), \
roc_auc_score((target > np.median(target) * 1.5).astype(np.uint8),_proba),
features = intermediate_model.predict([T_content, T_title, T_image], batch_size=batch_size*8, verbose=1)
features.dump('25_features.pkl')
df['mean_prob'] = model.predict([T_content, T_title, T_image], batch_size=batch_size*8, verbose=1)[:, 0]
df[['mean', 'mean_prob']].to_pickle('25_df.pkl')
df['mean_merge'] = df['mean'].fillna(0) + df['mean_prob'] * df['mean'].isna()
res1 = random_benchmark.join(df[['mean_prob', 'mean_merge']], on='itemId')
res1
res1.sort_values(['userId', 'mean_prob'], ascending=[True, False])[['userId', 'itemId']].to_csv('25_submit_mean_prob.csv', index=None)
res1.sort_values(['userId', 'mean_merge'], ascending=[True, False])[['userId', 'itemId']].to_csv('25_submit_mean_merge.csv', index=None)
!kaggle competitions submit recsys-iad-challenge -f '25_submit_mean_merge.csv' -m "Colab auto submit"
features = np.load('25_features.pkl', allow_pickle=True)
n_item=items.index.max()+1
n_users=len(train['userId'].unique())
n_latent_factors=features.shape[1]
user_input=Input(shape=(1,),name='user_input',dtype='int64')
user_embedding=Embedding(n_users,n_latent_factors,name='user_embedding')(user_input)
user_vec =Flatten(name='FlattenUsers')(user_embedding)
user_vec=Dropout(0.3)(user_vec)
item_input=Input(shape=(1,),name='item_input',dtype='int64')
item_embedding=Embedding(n_item,n_latent_factors,name='item_embedding', weights=[features], trainable = True)(item_input)
item_vec=Flatten(name='FlattenItems')(item_embedding)
item_vec=Dropout(0.3)(item_vec)
sim=keras.layers.dot([user_vec,item_vec],name='Simalarity-Dot-Product',axes=1)
model = keras.models.Model([user_input, item_input],sim)
model.compile(optimizer=Adam(),loss='mse')
model.summary()
batch_size=65536
TAKE=26
NAME='base_matrix_factorization_kill_me_pls'
model_filename = '{0:02d}_{1:s}_{{0:02d}}.hdf5'.format(TAKE, NAME)
# Note: event_float was already rescaled above, so there are no 10s left to replace here
# and this line effectively just divides the already-scaled values by 20 once more.
target = (train['event_float'].replace(10, (train['event'] == 0).sum()/(train['event'] == 1).sum())/20)
model.fit(x=[train['userId'], train['itemId']],
y=target,
batch_size=batch_size,
callbacks=[ModelSaveCallback(model_filename)],
validation_split=0.005,
initial_epoch=0, epochs=30)
model = keras.models.load_model('26_base_matrix_factorization_kill_me_pls_29.hdf5')
item_embedding_w=model.get_layer('item_embedding').get_weights()[0]
user_embedding_w=model.get_layer('user_embedding').get_weights()[0]
# Note: the dot-product model above has no layers named 'dense_4'/'dense_5', so get_layer
# raises a ValueError here; these two lines appear to be leftovers from a deeper variant.
user_model = Model(inputs=model.input, outputs=model.get_layer("dense_4").output)
item_model = Model(inputs=model.input, outputs=model.get_layer("dense_5").output)
Let's cluster the users into categories; the main requirement is a roughly equal number of users per cluster.
We use KMeans: small clusters are merged into one, large clusters are split further recursively. This continues until every cluster meets the size constraints or the recursion reaches a fixed depth.
Then, on each cluster, we can train its own model with better generalization ability.
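A toy illustration (synthetic data, not the real embeddings) of why the recursive splitting is needed: a single plain KMeans pass gives clusters of uneven size, which is exactly what run_deep_kmeans below corrects by merging the small ones and re-splitting the big ones.
import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.default_rng(42)
toy_points = rng.normal(size=(2000, 2))
toy_labels = MiniBatchKMeans(n_clusters=4, random_state=42).fit_predict(toy_points)
print(np.bincount(toy_labels))  # cluster sizes are typically uneven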
from sklearn.cluster import MiniBatchKMeans
import random
def run_deep_kmeans(data, deep=5, too_low=100, too_high=500, n_clusters=None, pre_c=None, prefix=''):
if deep <= 0:
return pre_c
if n_clusters is None:
n_clusters = int(round(len(data) / too_high))
if n_clusters < 5:
n_clusters = 5
if n_clusters > 40:
n_clusters = 40
kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(data)
prefix = str(prefix) + str(random.randint(0, 9999)) + "_"
clusters = np.char.add(np.array([prefix] * len(data)).astype('<U999'),
kmeans.predict(data).astype('<U999'))
for i in range(3):
index_low = []
c_low = []
index_ok = []
c_ok = []
index_high = []
c_high = []
for c in np.unique(clusters):
cnt = (clusters == c).sum()
c_index = np.where(clusters == c)[0]
if cnt < too_low:
index_low.extend(c_index)
c_low.append(c)
elif cnt < too_high:
index_ok.extend(c_index)
c_ok.append(c)
else:
index_high.append(c_index)
c_high.append(c)
if (len(index_low) == 0 and len(index_high) == 0) or deep < 3:
break
if len(index_low) > 0:
if len(index_low) < too_high:
clusters[index_low] = str(c_low[0]) + "_low"
else:
clusters[index_low] = run_deep_kmeans(data[index_low],
deep - 1,
prefix=prefix,
too_low=too_low, too_high=too_high,
pre_c=np.array([str(c_low[0]) + "_low"] * len(index_low)).astype('<U999'),
)
for c_index, c in zip(index_high, c_high):
prefix_high = str(c) + "_high_"
clusters[c_index] = run_deep_kmeans(data[c_index],
deep - 1,
prefix=prefix_high,
too_low=too_low, too_high=too_high,
pre_c=np.array([prefix_high] * len(c_index)).astype('<U999'))
return clusters
clu = run_deep_kmeans(user_embedding_w, too_low=300, too_high=600,)
clu = pd.factorize(clu)[0]
for c in np.unique(clu):
cnt = (clu == c).sum()
print(c, cnt, sep='\t')
!pip install catboost -q
from catboost import CatBoostClassifier, Pool, FeaturesData
from sklearn.linear_model import SGDClassifier
num_features = [f"item {i:03d}" for i in range(item_embedding_w.shape[1])] + [f"user {i:03d}" for i in range(user_embedding_w.shape[1])] + ['prod']
result = []
for cluster_id in tqdm_notebook(np.unique(clu)):
uids = np.where(clu == cluster_id)[0]
# uids_t, uids_f = train_test_split(uids, test_size=0.1)
# df_train = train[train['userId'].isin(uids_t)]
# df_test = train[train['userId'].isin(uids_f)]
df_train = train[train['userId'].isin(uids)]
df_test = random_benchmark[random_benchmark['userId'].isin(uids)]
user_train = user_embedding_w[df_train['userId']]
user_test = user_embedding_w[df_test['userId']]
item_train = item_embedding_w[df_train['itemId']]
item_test = item_embedding_w[df_test['itemId']]
prod_train = np.array([item_row @ user_row for item_row, user_row in zip(item_train, user_train)])
prod_test = np.array([item_row @ user_row for item_row, user_row in zip(item_test, user_test)])
all_train = np.hstack([item_train, user_train, prod_train.reshape((len(prod_train), 1))])
all_test = np.hstack([item_test, user_test, prod_test.reshape((len(prod_test), 1))])
train_features = FeaturesData(all_train, df_train['userId'].values.astype(str).astype(object).reshape((len(df_train['userId']), 1)), num_features, ['user_id'])
train_features = Pool(train_features, df_train['event'].values)
test_features = FeaturesData(all_test, df_test['userId'].values.astype(str).astype(object).reshape((len(df_test['userId']), 1)), num_features, ['user_id'])
# test_features = Pool(test_features, df_test['event'].values)
test_features = Pool(test_features)
# clf = CatBoostClassifier(task_type='GPU', learning_rate=0.14, num_trees=500, verbose=1, eval_metric='AUC')
# clf.fit(train_features, eval_set=test_features)
# break
clf_cat = CatBoostClassifier(task_type='GPU', learning_rate=0.14, num_trees=500, verbose=0)
clf_cat.fit(train_features, )
proba = clf_cat.predict_proba(test_features)[:,1]
clf_sgd = SGDClassifier(loss='log', random_state=42, n_jobs=-1)
clf_sgd.fit(all_train, df_train['event'].values)
proba_sgd = clf_sgd.predict_proba(all_test)[:, 1]
result += list(zip(df_test['userId'], df_test['itemId'], proba, proba_sgd))
# break
result = pd.DataFrame(result, columns=['userId', 'itemId', 'event_catboost', 'event_sgd'])
result.to_pickle('26_kmean_deep_catboost_raw.pkl')
result.sort_values(['userId', 'event_sgd'], ascending=[True, False])
result.sort_values(['userId', 'event_catboost'], ascending=[True, False])[['userId', 'itemId']].to_csv('26_kmean_deep_catboost.csv', index=None)
# !kaggle competitions submit recsys-iad-challenge -f '26_kmean_deep_catboost.csv' -m "Colab auto submit"
result.sort_values(['userId', 'event_sgd'], ascending=[True, False])[['userId', 'itemId']].to_csv('26_kmean_deep_sgd.csv', index=None)
!kaggle competitions submit recsys-iad-challenge -f '26_kmean_deep_sgd.csv' -m "Colab auto submit"