This work was done by Подчезерцев Алексей, ИАД5
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/Shared drives/Тяжелые проекты/ИАД/applied-ds/hw_01')
!mkdir -p ~/.kaggle
!cp secret/kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
# !pip install --upgrade -q kaggle
import numpy as np
import pandas as pd
import json
import pickle  # used below to save LightFM model checkpoints
from tqdm import tqdm_notebook
import multiprocessing
multiprocessing.cpu_count()
from numba import cuda
device = cuda.get_current_device()
device.reset()
!nvidia-smi
def lazy_train_reader(fname):
data = []
with open(fname) as f:
for line in tqdm_notebook(f, total=42977):
obj = json.loads(line)
userId = int(obj['userId'])
for itemId, event in obj['trainRatings'].items():
data.append((userId, int(itemId), int(event)))
return pd.DataFrame(data, columns=['userId', 'itemId', 'event'], dtype=np.uint32)
def lazy_item_reader(fname):
items_list=[]
with open(fname) as f:
for line in tqdm_notebook(f, total=328050):
item=json.loads(line)
if isinstance(item['image'], float):
item['image']=[0 for _ in range(96)]
item['image']=np.array(item['image'])
items_list.append(item)
return pd.DataFrame(items_list).set_index('itemId')
items = lazy_item_reader('items.json')
items.to_pickle('items.pkl')
train = lazy_train_reader('train.json')
train.to_pickle('train.pkl')
if 'items' not in vars():
items = pd.read_pickle('items.pkl')
if 'train' not in vars():
train = pd.read_pickle('train.pkl')
if 'random_benchmark' not in vars():
random_benchmark = pd.read_csv('random_benchmark.csv')
# Reweight events: clicks (event == 1) get weight (#skips / #clicks), skips get -1,
# so positives and negatives cancel out; dividing by 20 just rescales the magnitudes.
train['event_float'] = train['event'].astype(np.float32).replace(1, 10).replace(0, -1)
train['event_float'] = (train['event_float'].replace(10, (train['event'] == 0).sum()/(train['event'] == 1).sum())/20)
With these weights the mean of event_float is zero.
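A quick sanity check (a sketch; assumes the train frame from above is in memory) that the reweighting really centers event_float at zero:
# The positive weight was chosen as (#skips / #clicks), so positives and negatives cancel out.
n_pos = (train['event'] == 1).sum()
n_neg = (train['event'] == 0).sum()
print(n_pos * (n_neg / n_pos) - n_neg)   # ~0 (zero up to floating-point rounding)
print(train['event_float'].mean())       # likewise ~0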
train['event_float'].unique()
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
%%time
matrix = csr_matrix((train['event_float'], (train['userId'], train['itemId'])))
%%time
u,s,vt = svds(matrix, k=100)
s = np.diag(s)
random_groups = random_benchmark.groupby('userId')['itemId'].agg(lambda x: list(x))
result = []
for userId, items_ in tqdm_notebook(random_groups.iteritems(), total=len(random_groups)):
for itemId in items_:
result.append({'userId': userId, 'itemId': itemId, 'score': u[userId] @ s @ vt[:, itemId]})
result = pd.DataFrame(result)
result = result.sort_values(['userId', 'score'], ascending=[True, False])
result
result[['userId', 'itemId']].to_csv('21_offtop_svd20.csv', index=None)
!pip install lightfm
from lightfm import LightFM
from sklearn.feature_extraction.text import TfidfVectorizer
%%time
vect_t = TfidfVectorizer(min_df=3, max_df=0.1)
title_tf = vect_t.fit_transform(items['title'])
%%time
vect_c = TfidfVectorizer(min_df=10, max_df=0.1, max_features=5000)
content_tf = vect_c.fit_transform(items['content'])
title_tf.shape, content_tf.shape
title_tf
content_tf
model = LightFM(no_components=100)
rg = random_benchmark.groupby('userId')['itemId'].agg(lambda x: list(x))
for i in tqdm_notebook(range(30)):
    # break  # uncomment to skip retraining and keep the previously fitted model
model.fit_partial(matrix, item_features=title_tf,
epochs=1,
num_threads=multiprocessing.cpu_count(),
verbose=False)
with open(f'32_lightfm_rating_{i:02d}.pkl', 'wb') as fle:
pickle.dump(model, fle)
if i % 10 == 9:
try:
result = []
            for userId, items_ in tqdm_notebook(rg.iteritems(), total=len(rg)):  # items_ avoids shadowing the items DataFrame
                proba = model.predict(userId, np.array(items_), item_features=title_tf)
                result += list(zip([userId] * len(items_), items_, proba))
result = pd.DataFrame(result, columns=['userId', 'itemId', 'event'])
result.sort_values(['userId', 'event'], ascending=[True, False])[['userId', 'itemId']].to_csv(f'32_lightfm_rating_{i:02d}.csv', index=None)
except Exception as e:
pass
model = LightFM(no_components=100)
for i in tqdm_notebook(range(10)):
    # break  # uncomment to skip retraining and keep the previously fitted model
model.fit_partial(matrix, item_features=content_tf,
epochs=1,
num_threads=multiprocessing.cpu_count(),
verbose=False)
with open(f'34_lightfm_rating_{i:02d}.pkl', 'wb') as fle:
pickle.dump(model, fle)
if i % 5 == 4:
try:
result = []
            for userId, items_ in tqdm_notebook(rg.iteritems(), total=len(rg)):  # items_ avoids shadowing the items DataFrame
                proba = model.predict(userId, np.array(items_), item_features=content_tf)
                result += list(zip([userId] * len(items_), items_, proba))
result = pd.DataFrame(result, columns=['userId', 'itemId', 'event'])
result.sort_values(['userId', 'event'], ascending=[True, False])[['userId', 'itemId']].to_csv(f'34_lightfm_rating_{i:02d}.csv', index=None)
except Exception as e:
pass
df1 = pd.read_csv('21_offtop_svd20.csv')
df2 = pd.read_csv('submits/32_lightfm_rating_09.csv')
df3 = pd.read_csv('submits/34_lightfm_rating_04.csv')
df1.reset_index(inplace=True)
df2.reset_index(inplace=True)
df3.reset_index(inplace=True)
# Each submission is already sorted best-first per user, so the row position serves as a rank;
# dividing by the total row count (presumably 3,018,185 rows per file) normalizes it to [0, 1).
df1['index'] /= 3018185
df2['index'] /= 3018185
df3['index'] /= 3018185
df1=df1.groupby(['userId', 'itemId'])['index'].mean()
df2=df2.groupby(['userId', 'itemId'])['index'].mean()
df3=df3.groupby(['userId', 'itemId'])['index'].mean()
df1= pd.DataFrame(df1)
df2= pd.DataFrame(df2)
df3= pd.DataFrame(df3)
df = df1.join(df2, on=['userId', 'itemId'],rsuffix='_2').join(df3, on=['userId', 'itemId'], rsuffix='_3')
df.columns = ['score_1', 'score_2', 'score_3']
# A lower summed rank means the item appeared closer to the top across all three submissions.
df['score_'] = df['score_1'] + df['score_2'] + df['score_3']
df.reset_index(inplace=True)
df.sort_values(['userId', 'score_'], ascending=[True, True])[['userId', 'itemId']].to_csv(f'50_multisubmit_32_lightfm_rating_09+34_lightfm_rating_09+21_offtop_svd20.csv', index=None)
df.sort_values(['userId', 'score_'], ascending=[True, True])
!kaggle competitions submit recsys-iad-challenge -f '50_multisubmit_32_lightfm_rating_09+34_lightfm_rating_09+21_offtop_svd20.csv' -m "Colab auto submit"
Below are sketches of code that never took off.
The main idea: fit a model that predicts each item's mean score, so we can compute a score even for items with no history (NDCG@20 == 0.120).
We take the outputs of the network's next-to-last layer for all items and feed them in as the initial weights of the matrix factorization.
From the factorization we can then take the weights of the embedding layers, and/or get predictions from its next-to-last and last layers.
On the resulting embeddings we fit a clustering algorithm, then train a separate SGD classifier on each cluster (CatBoost scored lower) and use it for prediction.
All this hoop-jumping buys about 0.125 NDCG@20.
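The NDCG@20 figures above come from the leaderboard; a minimal local sketch of the metric (binary relevance, standard log2 discounting; the column names are assumptions, not the competition's evaluation code) might look like this:
def ndcg_at_k(relevance, k=20):
    # relevance: true labels of one user's items, in the predicted (best-first) order
    rel = np.asarray(relevance, dtype=float)
    discounts = 1.0 / np.log2(np.arange(2, k + 2))
    dcg = (rel[:k] * discounts[:len(rel[:k])]).sum()
    ideal = np.sort(rel)[::-1][:k]
    idcg = (ideal * discounts[:len(ideal)]).sum()
    return dcg / idcg if idcg > 0 else 0.0

def mean_ndcg_at_20(ranked_df):
    # ranked_df: columns userId, itemId, event, already sorted best-first within each user
    return ranked_df.groupby('userId')['event'].apply(ndcg_at_k).mean()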
%tensorflow_version 2.x
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, concatenate, Conv1D, LeakyReLU,BatchNormalization, MaxPooling1D, Flatten
from tensorflow.keras.models import Model
# from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.optimizers import Adam
class ModelSaveCallback(keras.callbacks.Callback):
def __init__(self, file_name):
super(ModelSaveCallback, self).__init__()
self.file_name = file_name
def on_epoch_end(self, epoch, logs=None):
filename = self.file_name.format(epoch)
keras.models.save_model(self.model, filename)
def asciishell_score(data, min_views, count_col='count', mean_col='mean'):
rating = (data[count_col]/ (data[count_col] + min_views)) * data[mean_col] +\
(min_views / (min_views + data[count_col])) * data[mean_col].mean()
rating += ((data[mean_col] != 0) - 1)
return rating
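asciishell_score is essentially a Bayesian-average shrinkage: an item's mean event rate is pulled toward the global mean when it has few views, and items whose mean is exactly 0 are additionally pushed down by 1. A toy check with made-up numbers:
toy = pd.DataFrame({'count': [2, 200], 'mean': [1.0, 0.5]})  # global mean of 'mean' is 0.75
# item 0: (2/12)*1.0 + (10/12)*0.75 ≈ 0.79 -- heavily shrunk toward the global mean
# item 1: (200/210)*0.5 + (10/210)*0.75 ≈ 0.51 -- barely shrunk
print(asciishell_score(toy, min_views=10))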
item_rating = train.groupby('itemId')['event'].agg(['mean', 'count'])
item_rating['asciishell'] = asciishell_score(item_rating, 10)
df = items.join(item_rating, on='itemId')
%%time
max_features=100000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list((df["title"] + ' ' + df["content"]).values))
max_content_size=150
max_title_size=10
embed_size=300
EMBEDDING_FILE = '/content/drive/Shared drives/Тяжелые проекты/text_embeddings/cc.ru.300.vec'
def get_embedding_matrix(tokenizer, max_features, embed_size, embed_file):
def get_coefs(word,*arr):
return word, np.asarray(arr, dtype='float32')
with open(embed_file) as f:
f.readline()
embeddings_index = dict(get_coefs(*o.strip().split()) for o in tqdm_notebook(f, total=2000000))
skip_, add_, not_found_, tot_ = 0,0,0,0
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
tot_ += 1
if i >= max_features:
skip_ += 1
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
add_ += 1
embedding_matrix[i] = embedding_vector
else:
not_found_ += 1
    print('Skip {:.3f}, Not found {:.3f}, Add {:.3f}, total {:d}'.format(skip_/tot_, not_found_/tot_, add_/tot_, tot_))
return embedding_matrix
embedding_matrix = get_embedding_matrix(tokenizer, max_features, embed_size, EMBEDDING_FILE)
T_content = pad_sequences(tokenizer.texts_to_sequences(df['content'].values), maxlen=max_content_size)
T_title = pad_sequences(tokenizer.texts_to_sequences(df['title'].values), maxlen=max_title_size)
T_image = np.stack(df['image'].values)
def get_lstm(emb, lstm_units):
x = Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(emb)
gmx = GlobalMaxPool1D()(x)
gax = GlobalAveragePooling1D()(x)
concatenated = concatenate([gmx, gax])
return Dense(lstm_units, activation=LeakyReLU())(concatenated)
def get_cnn(emb, out_units=512,
kernel_size=5, initial_filters=64, pack_count=3,
leaky_relu_rate=0.1, dropout_rate=0.2):
x = emb
for pack in range(pack_count):
x = Conv1D(filters=initial_filters,
kernel_size=kernel_size,
activation=LeakyReLU(leaky_relu_rate))(x)
x = Conv1D(filters=initial_filters,
kernel_size=kernel_size,
activation=LeakyReLU(leaky_relu_rate))(x)
x = BatchNormalization()(x)
x = MaxPooling1D(padding='same')(x)
x = Dropout(dropout_rate)(x)
initial_filters *= 2
x = Flatten()(x)
    x = Dense(out_units, activation=LeakyReLU(leaky_relu_rate))(x)
x = BatchNormalization()(x)
return x
def get_model():
input_content = Input(shape=(max_content_size, ))
input_title = Input(shape=(max_title_size, ))
input_image = Input(shape=(96, ))
emb_title = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = True)(input_title)
emb_conte = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = True)(input_content)
lstm_content = get_lstm(emb_conte, 80)
lstm_title = get_lstm(emb_title, 10)
    cnn_content = get_cnn(emb_conte, out_units=128)
    cnn_title = get_cnn(emb_title, out_units=32, initial_filters=8, pack_count=1)
dense_image = Dense(48, activation='relu')(input_image)
concatenated = concatenate([lstm_content, lstm_title, cnn_content, cnn_title, dense_image])
x = Dropout(0.1)(concatenated)
x = Dense(80, activation='sigmoid', name='last_hidden_layer')(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=[input_content, input_title, input_image], outputs=x)
model.compile(loss='mse', optimizer=Adam(clipvalue=2, clipnorm=2))
intermediate_model = Model(inputs=model.input, outputs=model.get_layer("last_hidden_layer").output)
return model, intermediate_model
model, intermediate_model = get_model()
model.summary()
index_valid = df[~df['asciishell'].isna()].index
TAKE=25
batch_size=512
NAME='kernel_lstm_cnn_image_ru_embedding_kill_me_pls'
model_filename = '{0:02d}_{1:s}_{{0:02d}}.hdf5'.format(TAKE, NAME)
target = df['mean'][index_valid].values
target.min(), target.mean(), np.median(target), target.max()
model.fit(x=[T_content[index_valid], T_title[index_valid], T_image[index_valid]],
y=target,
batch_size=batch_size,
callbacks=[ModelSaveCallback(model_filename)],
validation_split=0.05,
initial_epoch=0, epochs=5)
model = keras.models.load_model('25_kernel_lstm_cnn_image_ru_embedding_kill_me_pls_01.hdf5', custom_objects={'LeakyReLU': LeakyReLU})
intermediate_model = Model(inputs=model.input, outputs=model.get_layer("last_hidden_layer").output)
from sklearn.metrics import roc_auc_score
_proba = model.predict([T_content[index_valid], T_title[index_valid], T_image[index_valid]], batch_size=batch_size*8, verbose=1)
roc_auc_score((target > np.median(target) * 0.5).astype(np.uint8),_proba), \
roc_auc_score((target > np.median(target) * 0.7).astype(np.uint8),_proba), \
roc_auc_score((target > np.median(target) * 1.0).astype(np.uint8),_proba), \
roc_auc_score((target > np.median(target) * 1.5).astype(np.uint8),_proba),
features = intermediate_model.predict([T_content, T_title, T_image], batch_size=batch_size*8, verbose=1)
features.dump('25_features.pkl')
df['mean_prob'] = model.predict([T_content, T_title, T_image], batch_size=batch_size*8, verbose=1)[:, 0]
df[['mean', 'mean_prob']].to_pickle('25_df.pkl')
df['mean_merge'] = df['mean'].fillna(0) + df['mean_prob'] * df['mean'].isna()
res1 = random_benchmark.join(df[['mean_prob', 'mean_merge']], on='itemId')
res1
res1.sort_values(['userId', 'mean_prob'], ascending=[True, False])[['userId', 'itemId']].to_csv('25_submit_mean_prob.csv', index=None)
res1.sort_values(['userId', 'mean_merge'], ascending=[True, False])[['userId', 'itemId']].to_csv('25_submit_mean_merge.csv', index=None)
!kaggle competitions submit recsys-iad-challenge -f '25_submit_mean_merge.csv' -m "Colab auto submit"
features = np.load('25_features.pkl', allow_pickle=True)
n_item=items.index.max()+1
n_users=len(train['userId'].unique())
n_latent_factors=features.shape[1]
user_input=Input(shape=(1,),name='user_input',dtype='int64')
user_embedding=Embedding(n_users,n_latent_factors,name='user_embedding')(user_input)
user_vec =Flatten(name='FlattenUsers')(user_embedding)
user_vec=Dropout(0.3)(user_vec)
item_input=Input(shape=(1,),name='item_input',dtype='int64')
item_embedding=Embedding(n_item,n_latent_factors,name='item_embedding', weights=[features], trainable = True)(item_input)
item_vec=Flatten(name='FlattenItems')(item_embedding)
item_vec=Dropout(0.3)(item_vec)
sim=keras.layers.dot([user_vec,item_vec],name='Simalarity-Dot-Product',axes=1)
model = keras.models.Model([user_input, item_input],sim)
model.compile(optimizer=Adam(),loss='mse')
model.summary()
batch_size=65536
TAKE=26
NAME='base_matrix_factorization_kill_me_pls'
model_filename = '{0:02d}_{1:s}_{{0:02d}}.hdf5'.format(TAKE, NAME)
# Note: event_float was already rescaled above, so there are no 10s left to replace here
# and this line effectively just divides the already-scaled values by 20 once more.
target = (train['event_float'].replace(10, (train['event'] == 0).sum()/(train['event'] == 1).sum())/20)
model.fit(x=[train['userId'], train['itemId']],
y=target,
batch_size=batch_size,
callbacks=[ModelSaveCallback(model_filename)],
validation_split=0.005,
initial_epoch=0, epochs=30)
model = keras.models.load_model('26_base_matrix_factorization_kill_me_pls_29.hdf5')
item_embedding_w=model.get_layer('item_embedding').get_weights()[0]
user_embedding_w=model.get_layer('user_embedding').get_weights()[0]
# Note: the dot-product model above has no layers named 'dense_4'/'dense_5', so get_layer
# raises a ValueError here; these two lines appear to be leftovers from a deeper variant.
user_model = Model(inputs=model.input, outputs=model.get_layer("dense_4").output)
item_model = Model(inputs=model.input, outputs=model.get_layer("dense_5").output)
Let's cluster the users into categories; the main requirement is a roughly equal number of users per cluster.
We use KMeans: small clusters are merged into one, large clusters are split further recursively. This continues until every cluster meets the size constraints or the recursion reaches a fixed depth.
Then, on each cluster, we can train its own model with better generalization ability.
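A toy illustration (synthetic data, not the real embeddings) of why the recursive splitting is needed: a single plain KMeans pass gives clusters of uneven size, which is exactly what run_deep_kmeans below corrects by merging the small ones and re-splitting the big ones.
import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.default_rng(42)
toy_points = rng.normal(size=(2000, 2))
toy_labels = MiniBatchKMeans(n_clusters=4, random_state=42).fit_predict(toy_points)
print(np.bincount(toy_labels))  # cluster sizes are typically uneven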
from sklearn.cluster import MiniBatchKMeans
import random
def run_deep_kmeans(data, deep=5, too_low=100, too_high=500, n_clusters=None, pre_c=None, prefix=''):
if deep <= 0:
return pre_c
if n_clusters is None:
n_clusters = int(round(len(data) / too_high))
if n_clusters < 5:
n_clusters = 5
if n_clusters > 40:
n_clusters = 40
kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(data)
prefix = str(prefix) + str(random.randint(0, 9999)) + "_"
clusters = np.char.add(np.array([prefix] * len(data)).astype('<U999'),
kmeans.predict(data).astype('<U999'))
for i in range(3):
index_low = []
c_low = []
index_ok = []
c_ok = []
index_high = []
c_high = []
for c in np.unique(clusters):
cnt = (clusters == c).sum()
c_index = np.where(clusters == c)[0]
if cnt < too_low:
index_low.extend(c_index)
c_low.append(c)
elif cnt < too_high:
index_ok.extend(c_index)
c_ok.append(c)
else:
index_high.append(c_index)
c_high.append(c)
if (len(index_low) == 0 and len(index_high) == 0) or deep < 3:
break
if len(index_low) > 0:
if len(index_low) < too_high:
clusters[index_low] = str(c_low[0]) + "_low"
else:
clusters[index_low] = run_deep_kmeans(data[index_low],
deep - 1,
prefix=prefix,
too_low=too_low, too_high=too_high,
pre_c=np.array([str(c_low[0]) + "_low"] * len(index_low)).astype('<U999'),
)
for c_index, c in zip(index_high, c_high):
prefix_high = str(c) + "_high_"
clusters[c_index] = run_deep_kmeans(data[c_index],
deep - 1,
prefix=prefix_high,
too_low=too_low, too_high=too_high,
pre_c=np.array([prefix_high] * len(c_index)).astype('<U999'))
return clusters
clu = run_deep_kmeans(user_embedding_w, too_low=300, too_high=600,)
clu = pd.factorize(clu)[0]
for c in np.unique(clu):
cnt = (clu == c).sum()
print(c, cnt, sep='\t')
!pip install catboost -q
from catboost import CatBoostClassifier, Pool, FeaturesData
from sklearn.linear_model import SGDClassifier
num_features = [f"item {i:03d}" for i in range(item_embedding_w.shape[1])] + [f"user {i:03d}" for i in range(user_embedding_w.shape[1])] + ['prod']
result = []
for cluster_id in tqdm_notebook(np.unique(clu)):
uids = np.where(clu == cluster_id)[0]
# uids_t, uids_f = train_test_split(uids, test_size=0.1)
# df_train = train[train['userId'].isin(uids_t)]
# df_test = train[train['userId'].isin(uids_f)]
df_train = train[train['userId'].isin(uids)]
df_test = random_benchmark[random_benchmark['userId'].isin(uids)]
user_train = user_embedding_w[df_train['userId']]
user_test = user_embedding_w[df_test['userId']]
item_train = item_embedding_w[df_train['itemId']]
item_test = item_embedding_w[df_test['itemId']]
prod_train = np.array([item_row @ user_row for item_row, user_row in zip(item_train, user_train)])
prod_test = np.array([item_row @ user_row for item_row, user_row in zip(item_test, user_test)])
all_train = np.hstack([item_train, user_train, prod_train.reshape((len(prod_train), 1))])
all_test = np.hstack([item_test, user_test, prod_test.reshape((len(prod_test), 1))])
train_features = FeaturesData(all_train, df_train['userId'].values.astype(str).astype(object).reshape((len(df_train['userId']), 1)), num_features, ['user_id'])
train_features = Pool(train_features, df_train['event'].values)
test_features = FeaturesData(all_test, df_test['userId'].values.astype(str).astype(object).reshape((len(df_test['userId']), 1)), num_features, ['user_id'])
# test_features = Pool(test_features, df_test['event'].values)
test_features = Pool(test_features)
# clf = CatBoostClassifier(task_type='GPU', learning_rate=0.14, num_trees=500, verbose=1, eval_metric='AUC')
# clf.fit(train_features, eval_set=test_features)
# break
clf_cat = CatBoostClassifier(task_type='GPU', learning_rate=0.14, num_trees=500, verbose=0)
clf_cat.fit(train_features, )
proba = clf_cat.predict_proba(test_features)[:,1]
clf_sgd = SGDClassifier(loss='log', random_state=42, n_jobs=-1)
clf_sgd.fit(all_train, df_train['event'].values)
proba_sgd = clf_sgd.predict_proba(all_test)[:, 1]
result += list(zip(df_test['userId'], df_test['itemId'], proba, proba_sgd))
# break
result = pd.DataFrame(result, columns=['userId', 'itemId', 'event_catboost', 'event_sgd'])
result.to_pickle('26_kmean_deep_catboost_raw.pkl')
result.sort_values(['userId', 'event_sgd'], ascending=[True, False])
result.sort_values(['userId', 'event_catboost'], ascending=[True, False])[['userId', 'itemId']].to_csv('26_kmean_deep_catboost.csv', index=None)
# !kaggle competitions submit recsys-iad-challenge -f '26_kmean_deep_catboost.csv' -m "Colab auto submit"
result.sort_values(['userId', 'event_sgd'], ascending=[True, False])[['userId', 'itemId']].to_csv('26_kmean_deep_sgd.csv', index=None)
!kaggle competitions submit recsys-iad-challenge -f '26_kmean_deep_sgd.csv' -m "Colab auto submit"