diff --git a/.gitignore b/.gitignore index bd32b905..45019cb0 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,4 @@ _site .coverage tags -settings.json \ No newline at end of file +settings.json diff --git a/surprise/__init__.py b/surprise/__init__.py index e87ca980..33d958a3 100644 --- a/surprise/__init__.py +++ b/surprise/__init__.py @@ -12,6 +12,8 @@ from .prediction_algorithms import NMF from .prediction_algorithms import SlopeOne from .prediction_algorithms import CoClustering +from .prediction_algorithms import Lasso +from .prediction_algorithms import FM from .prediction_algorithms import PredictionImpossible from .prediction_algorithms import Prediction @@ -30,6 +32,7 @@ 'KNNWithMeans', 'KNNBaseline', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering', 'PredictionImpossible', 'Prediction', 'Dataset', 'Reader', 'Trainset', 'evaluate', 'print_perf', 'GridSearch', - 'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection'] + 'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection', + 'Lasso', 'FM'] __version__ = get_distribution('scikit-surprise').version diff --git a/surprise/dataset.py b/surprise/dataset.py index 17638b6c..e31d08bc 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -53,6 +53,12 @@ class Dataset: def __init__(self, reader): self.reader = reader + self.user_features_nb = 0 + self.item_features_nb = 0 + self.user_features = {} + self.item_features = {} + self.user_features_labels = [] + self.item_features_labels = [] @classmethod def load_builtin(cls, name='ml-100k'): @@ -165,6 +171,42 @@ def load_from_df(cls, df, reader): return DatasetAutoFolds(reader=reader, df=df) + def load_features_df(self, features_df, user_features=True): + """Load features from a pandas dataframe into a dataset. + + Use this if you want to add user or item features to a dataset. Only + certain prediction algorithms in the :mod:`prediction_algorithms` + package support this additional data. 
+ + Args: + features_df(`Dataframe`): The dataframe containing the features. It + must have two columns or more, corresponding to the user or + item (raw) ids, and the features, in this order. + user_features(:obj:`bool`): Whether the features are for the users + or the items. Default is ``True``. + """ + + if len(features_df.columns) < 2: + raise ValueError('features_df requires at least 2 columns.') + + if not features_df.iloc[:, 0].is_unique: + raise ValueError('first column of features_df must be unique ids.') + + if user_features: + self.user_features_df = features_df + for tup in features_df.itertuples(index=False): + self.user_features[tup[0]] = list(tup[1:]) + self.user_features_labels = features_df.columns.values.tolist()[1:] + self.user_features_nb = len(self.user_features_labels) + else: + self.item_features_df = features_df + for tup in features_df.itertuples(index=False): + self.item_features[tup[0]] = list(tup[1:]) + self.item_features_labels = features_df.columns.values.tolist()[1:] + self.item_features_nb = len(self.item_features_labels) + + return self + def read_ratings(self, file_name): """Return a list of ratings (user, item, rating, timestamp) read from file_name""" @@ -208,20 +250,36 @@ def construct_trainset(self, raw_trainset): ur = defaultdict(list) ir = defaultdict(list) + u_features = defaultdict(list) + i_features = defaultdict(list) + # user raw id, item raw id, translated rating, time stamp - for urid, irid, r, timestamp in raw_trainset: + for urid, irid, r, _ in raw_trainset: try: uid = raw2inner_id_users[urid] except KeyError: uid = current_u_index raw2inner_id_users[urid] = current_u_index current_u_index += 1 + if self.user_features_nb > 0: + try: + u_features[uid] = self.user_features[urid] + except KeyError: + raise ValueError('Features are defined for all users' + 'but user {}'.format(urid)) + try: iid = raw2inner_id_items[irid] except KeyError: iid = current_i_index raw2inner_id_items[irid] = current_i_index current_i_index += 
1 + if self.item_features_nb > 0: + try: + i_features[iid] = self.item_features[irid] + except KeyError: + raise ValueError('Features are defined for all items' + 'but item {}'.format(irid)) ur[uid].append((iid, r)) ir[iid].append((uid, r)) @@ -232,8 +290,14 @@ def construct_trainset(self, raw_trainset): trainset = Trainset(ur, ir, + u_features, + i_features, n_users, n_items, + self.user_features_nb, + self.item_features_nb, + self.user_features_labels, + self.item_features_labels, n_ratings, self.reader.rating_scale, self.reader.offset, @@ -244,8 +308,25 @@ def construct_trainset(self, raw_trainset): def construct_testset(self, raw_testset): - return [(ruid, riid, r_ui_trans) - for (ruid, riid, r_ui_trans, _) in raw_testset] + testset = [] + for (ruid, riid, r_ui_trans, _) in raw_testset: + if self.user_features_nb > 0: + try: # add features if available + u_features = self.user_features[ruid] + except KeyError: + u_features = [] + else: + u_features = [] + if self.item_features_nb > 0: + try: # add features if available + i_features = self.item_features[riid] + except KeyError: + i_features = [] + else: + i_features = [] + testset.append((ruid, riid, u_features, i_features, r_ui_trans)) + + return testset class DatasetUserFolds(Dataset): diff --git a/surprise/evaluate.py b/surprise/evaluate.py index 55764d8b..65ff2b86 100644 --- a/surprise/evaluate.py +++ b/surprise/evaluate.py @@ -301,6 +301,7 @@ class CaseInsensitiveDefaultDict(defaultdict): Used for the returned dict, so that users can use perf['RMSE'] or perf['rmse'] indifferently. 
""" + def __setitem__(self, key, value): super(CaseInsensitiveDefaultDict, self).__setitem__(key.lower(), value) @@ -333,4 +334,5 @@ def seed_and_eval(seed, *args): different processes.""" random.seed(seed) + return evaluate(*args, verbose=0) diff --git a/surprise/model_selection/search.py b/surprise/model_selection/search.py index 0510c60f..d1811218 100644 --- a/surprise/model_selection/search.py +++ b/surprise/model_selection/search.py @@ -294,6 +294,7 @@ class GridSearchCV(BaseSearchCV): into a pandas `DataFrame` (see :ref:`example `). """ + def __init__(self, algo_class, param_grid, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=1, pre_dispatch='2*n_jobs', joblib_verbose=0): @@ -410,6 +411,7 @@ class RandomizedSearchCV(BaseSearchCV): into a pandas `DataFrame` (see :ref:`example `). """ + def __init__(self, algo_class, param_distributions, n_iter=10, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=1, diff --git a/surprise/model_selection/split.py b/surprise/model_selection/split.py index 14697911..5c656565 100644 --- a/surprise/model_selection/split.py +++ b/surprise/model_selection/split.py @@ -372,7 +372,7 @@ def split(self, data): Args: data(:obj:`Dataset`): The data containing - ratings that will be devided into trainsets and testsets. + ratings that will be divided into trainsets and testsets. 
Yields: tuple of (trainset, testset) diff --git a/surprise/prediction_algorithms/__init__.py b/surprise/prediction_algorithms/__init__.py index d5ce8288..5005c581 100644 --- a/surprise/prediction_algorithms/__init__.py +++ b/surprise/prediction_algorithms/__init__.py @@ -32,6 +32,8 @@ from .matrix_factorization import NMF from .slope_one import SlopeOne from .co_clustering import CoClustering +from .linear import Lasso +from .factorization_machines import FM from .predictions import PredictionImpossible from .predictions import Prediction @@ -39,4 +41,4 @@ __all__ = ['AlgoBase', 'NormalPredictor', 'BaselineOnly', 'KNNBasic', 'KNNBaseline', 'KNNWithMeans', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering', 'PredictionImpossible', 'Prediction', - 'KNNWithZScore'] + 'KNNWithZScore', 'Lasso', 'FM'] diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index 9becf7bd..e482b374 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -37,7 +37,7 @@ def __init__(self, **kwargs): self.skip_train = False if (guf(self.__class__.fit) is guf(AlgoBase.fit) and - guf(self.__class__.train) is not guf(AlgoBase.train)): + guf(self.__class__.train) is not guf(AlgoBase.train)): warnings.warn('It looks like this algorithm (' + str(self.__class__) + ') implements train() ' @@ -96,7 +96,8 @@ def fit(self, trainset): return self - def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): + def predict(self, uid, iid, u_features=[], i_features=[], r_ui=None, + clip=True, verbose=False): """Compute the rating prediction for given user and item. The ``predict`` method converts raw ids to inner ids and then calls the @@ -108,6 +109,10 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): Args: uid: (Raw) id of the user. See :ref:`this note`. iid: (Raw) id of the item. See :ref:`this note`. 
+ u_features: List of user features in the same order as used in + the ``fit`` method. Optional, default is ``[]``. + i_features: List of item features in the same order as used in + the ``fit`` method. Optional, default is ``[]``. r_ui(float): The true rating :math:`r_{ui}`. Optional, default is ``None``. clip(bool): Whether to clip the estimation into the rating scale. @@ -143,7 +148,7 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): details = {} try: - est = self.estimate(iuid, iiid) + est = self.estimate(iuid, iiid, u_features, i_features) # If the details dict was also returned if isinstance(est, tuple): @@ -207,9 +212,12 @@ def test(self, testset, verbose=False): # The ratings are translated back to their original scale. predictions = [self.predict(uid, iid, + u_features, + i_features, r_ui_trans - self.trainset.offset, verbose=verbose) - for (uid, iid, r_ui_trans) in testset] + for (uid, iid, u_features, i_features, r_ui_trans) + in testset] return predictions def compute_baselines(self): diff --git a/surprise/prediction_algorithms/baseline_only.py b/surprise/prediction_algorithms/baseline_only.py index a5b3036e..25b22103 100644 --- a/surprise/prediction_algorithms/baseline_only.py +++ b/surprise/prediction_algorithms/baseline_only.py @@ -22,10 +22,10 @@ class BaselineOnly(AlgoBase): computation. See :ref:`baseline_estimates_configuration` for accepted options. verbose(bool): Whether to print trace messages of bias estimation, - similarity, etc. Default is True. + similarity, etc. Default is False. 
""" - def __init__(self, bsl_options={}, verbose=True): + def __init__(self, bsl_options={}, verbose=False): AlgoBase.__init__(self, bsl_options=bsl_options) self.verbose = verbose @@ -37,7 +37,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): est = self.trainset.global_mean if self.trainset.knows_user(u): diff --git a/surprise/prediction_algorithms/co_clustering.pyx b/surprise/prediction_algorithms/co_clustering.pyx index 408780fc..85837718 100644 --- a/surprise/prediction_algorithms/co_clustering.pyx +++ b/surprise/prediction_algorithms/co_clustering.pyx @@ -62,7 +62,7 @@ class CoClustering(AlgoBase): self.n_cltr_u = n_cltr_u self.n_cltr_i = n_cltr_i self.n_epochs = n_epochs - self.verbose=verbose + self.verbose = verbose self.random_state = random_state def fit(self, trainset): @@ -236,7 +236,7 @@ class CoClustering(AlgoBase): return avg_cltr_u, avg_cltr_i, avg_cocltr - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): return self.trainset.global_mean diff --git a/surprise/prediction_algorithms/factorization_machines.py b/surprise/prediction_algorithms/factorization_machines.py new file mode 100644 index 00000000..9752cf40 --- /dev/null +++ b/surprise/prediction_algorithms/factorization_machines.py @@ -0,0 +1,450 @@ +""" +the :mod:`factorization_machines` module includes the FM algorithm. +""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import copy + +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +import torch +from torch import nn + +# from .predictions import PredictionImpossible +from .algo_base import AlgoBase + + +class FMtorchNN(nn.Module): + """ The PyTorch model for factorization machine. This class is used by + `FM`. The initialization is done as in Rendle (2012). + + Args: + n_features: int + Defines the number of features in x. 
+ n_factors: int, default: 20 + Defines the number of factors in the interaction terms. + init_std: float, default: 0.01 + The standard deviation of the normal distribution for + initialization. + """ + + def __init__(self, n_features, n_factors=20, init_std=0.01): + super(FMtorchNN, self).__init__() + self.n_features = n_features + self.n_factors = n_factors + self.init_std = init_std + + # Initialize bias term + self.b = nn.Parameter(torch.Tensor(1), + requires_grad=True) + self.b.data.fill_(0.) + # self.b.data.normal_(init_mean, init_std) + # self.b.data.uniform_(-0.01, 0.01) + + # Initialize linear terms + self.w = nn.Parameter(torch.Tensor(self.n_features, 1), + requires_grad=True) + self.w.data.fill_(0.) + # self.w.data.normal_(init_mean, init_std) + # self.w.data.uniform_(-0.01, 0.01) + + # Initialize interaction terms + self.V = nn.Parameter(torch.Tensor(self.n_features, self.n_factors), + requires_grad=True) + self.V.data.normal_(0., self.init_std) + # self.V.data.uniform_(-0.01, 0.01) + + def forward(self, x): + + # The linear part + total_linear = torch.sum(torch.mm(x, self.w), dim=1) + + # The interaction part + # O(kn) formulation from Steffen Rendle + total_inter_1 = torch.mm(x, self.V) ** 2 + total_inter_2 = torch.mm(x ** 2, self.V ** 2) + total_inter = 0.5 * torch.sum(total_inter_1 - total_inter_2, dim=1) + + # Compute predictions + y_pred = self.b + total_linear + total_inter + + return y_pred + + +class FM(AlgoBase): + """A factorization machine algorithm implemented using pytorch. + + Args: + rating_lst : list of str or `None`, default : ['userID', 'itemID'] + This list specifies what information from the `raw_ratings` to put + in the `x` vector. Accepted list values are 'userID', 'itemID', + 'imp_u_rating' and 'exp_u_rating'. Implicit and explicit user + rating values are scaled by the number of values. If `None`, no + info is added. 
+ user_lst : list of str or `None`, default : `None` + This list specifies what information from the `user_features` to + put in the `x` vector. Accepted list values consist of the names of + features. If `None`, no info is added. + item_lst : list of str or `None`, default : `None` + This list specifies what information from the `item_features` to + put in the `x` vector. Accepted list values consist of the names of + features. If `None`, no info is added. + n_factors : int, default: 20 + Number of latent factors in low-rank approximation. + n_epochs : int, default : 30 + Number of epochs. All epochs are run but only the best model out of + all epochs is kept. + dev_ratio : float, default : 0.3 + Ratio of `trainset` to dedicate to development data set to identify + best model. Should be either positive and smaller than the number + of samples or a float in the (0, 1) range. + init_std: float, default : 0.01 + The standard deviation of the normal distribution for + initialization. + lr : float, default: 0.001 + Learning rate for optimization method. + reg : float, default: 0.02 + Strength of L2 regularization. It can be disabled by setting it to + zero. + random_state : int, default: `None` + Determines the RNG that will be used for initialization. If + int, ``random_state`` will be used as a seed for a new RNG. This is + useful to get the same initialization over multiple calls to + ``fit()``. If ``None``, the current RNG from torch is used. + verbose : bool, default: False + Whether to print training progress. 
+ """ + + def __init__(self, rating_lst=['userID', 'itemID'], user_lst=None, + item_lst=None, n_factors=20, n_epochs=30, dev_ratio=0.3, + init_std=0.01, lr=0.001, reg=0.02, random_state=None, + verbose=False, **kwargs): + + AlgoBase.__init__(self, **kwargs) + self.rating_lst = rating_lst + self.user_lst = user_lst + self.item_lst = item_lst + self.n_factors = n_factors + self.n_epochs = n_epochs + self.dev_ratio = dev_ratio + self.init_std = init_std + self.lr = lr + self.reg = reg + self.random_state = random_state + self.verbose = verbose + + torch.set_default_dtype(torch.float64) # use float64 + + def fit(self, trainset): + + AlgoBase.fit(self, trainset) + + # Construct data and initialize model + # Initialization needs to be done in fit() since it depends on the + # trainset + if self.random_state: + np.random.seed(self.random_state) + torch.manual_seed(self.random_state) + self._construct_FM_data() + self.model = FMtorchNN(self.n_features, self.n_factors, self.init_std) + params = FM._add_weight_decay(self.model, self.reg) + self.optimizer = torch.optim.Adam(params, lr=self.lr) + + # Define data (TODO : sample_weights) + x = self.libsvm_df.loc[ + :, self.libsvm_df.columns != 'rating'].values.astype('float64') + y = self.libsvm_df.loc[ + :, 'rating'].values.astype('float64') + sample_weights = None + if sample_weights: + x_train, x_dev, y_train, y_dev, w_train, w_dev = train_test_split( + x, y, sample_weights, test_size=self.dev_ratio, + random_state=self.random_state) + w_train = torch.Tensor(w_train) + w_dev = torch.Tensor(w_dev) + else: + x_train, x_dev, y_train, y_dev = train_test_split( + x, y, test_size=self.dev_ratio, random_state=self.random_state) + w_train = None + w_dev = None + x_train = torch.Tensor(x_train) + y_train = torch.Tensor(y_train) + x_dev = torch.Tensor(x_dev) + y_dev = torch.Tensor(y_dev) + + best_loss = np.inf + best_model = None + for epoch in range(self.n_epochs): + # Switch to training mode, clear gradient accumulators + 
self.model.train() + self.optimizer.zero_grad() + # Forward pass + y_pred = self.model(x_train) + # Compute loss + self.train_loss = self._compute_loss(y_pred, y_train, w_train) + # Backward pass and update weights + self.train_loss.backward() + self.optimizer.step() + + # Switch to eval mode and evaluate with development data + # See https://github.com/pytorch/examples/blob/master/snli/train.py + self.model.eval() + y_pred = self.model(x_dev) + self.dev_loss = self._compute_loss(y_pred, y_dev, w_dev) + + if self.verbose: + print(epoch, self.train_loss.item(), self.dev_loss.item()) + + if self.dev_loss.item() < best_loss: + best_model = copy.deepcopy(self.model) + best_loss = self.dev_loss.item() + if self.verbose: + print('A new best model have been found!') + + self.model = best_model + + return self + + def _add_weight_decay(model, reg, skip_list=[]): + """ Add weight_decay with no regularization for bias. + """ + + decay, no_decay = [], [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if ((len(param.shape) == 1) or name.endswith(".bias") or + (name in skip_list)): + no_decay.append(param) + else: + decay.append(param) + + return [{'params': no_decay, 'weight_decay': 0.}, + {'params': decay, 'weight_decay': reg}] + + def _compute_loss(self, y_pred, y, sample_weights=None): + """ Computes a different loss depending on whether `sample_weights` are + defined. + """ + + if sample_weights is not None: + criterion = nn.MSELoss(reduction='none') + loss = criterion(y_pred, y) + loss = torch.dot(sample_weights, loss) / y.shape[0] + else: + criterion = nn.MSELoss() + loss = criterion(y_pred, y) + + return loss + + def _construct_FM_data(self): + """ Construct the data needed by `FM`. + + It is assumed that the user and item features are correctly encoded. + These dummies are created (if needed) using only the info in the + trainset. 
+ """ + + if self.user_lst and (self.trainset.n_user_features == 0): + raise ValueError('user_lst cannot be used since ' + 'there are no user_features') + if self.item_lst and (self.trainset.n_item_features == 0): + raise ValueError('item_lst cannot be used since ' + 'there are no item_features') + + n_ratings = self.trainset.n_ratings + # n_users = self.trainset.n_users + n_items = self.trainset.n_items + + # Construct ratings_df from trainset + # The IDs are unique and start at 0 + ratings_df = pd.DataFrame([tup for tup in self.trainset.all_ratings()], + columns=['userID', 'itemID', 'rating']) + + # Initialize df with rating values + libsvm_df = pd.DataFrame(ratings_df['rating']) + + # Add rating features + if self.rating_lst: + for feature in self.rating_lst: + if feature == 'userID': + libsvm_df = pd.concat([libsvm_df, pd.get_dummies( + ratings_df['userID'], prefix='userID')], axis=1) + elif feature == 'itemID': + libsvm_df = pd.concat([libsvm_df, pd.get_dummies( + ratings_df['itemID'], prefix='itemID')], axis=1) + elif feature == 'imp_u_rating': + temp = np.zeros((n_ratings, n_items)) + for row in ratings_df.itertuples(): + iid = row.itemID + all_u_ratings = self.trainset.ur[row.userID] + for other_iid, rating in all_u_ratings: + if other_iid != iid: # only the other ratings + temp[row.Index, other_iid] = 1 + count = np.count_nonzero(temp, axis=1)[:, None] + count[count == 0] = 1 # remove zeros for division + temp = temp / count + cols = ['imp_u_rating_{}'.format(i) + for i in range(n_items)] + libsvm_df = pd.concat([libsvm_df, pd.DataFrame( + temp, columns=cols)], axis=1) + elif feature == 'exp_u_rating': + # a rating is at least 1 with the offset + temp = np.zeros((n_ratings, n_items)) + for row in ratings_df.itertuples(): + iid = row.itemID + all_u_ratings = self.trainset.ur[row.userID] + for other_iid, rating in all_u_ratings: + if other_iid != iid: # only the other ratings + temp[row.Index, other_iid] = rating + count = np.count_nonzero(temp, axis=1)[:, 
None] + count[count == 0] = 1 # remove zeros for division + temp = temp / count + cols = ['exp_u_rating_{}'.format(i) + for i in range(n_items)] + libsvm_df = pd.concat([libsvm_df, pd.DataFrame( + temp, columns=cols)], axis=1) + else: + raise ValueError('{} is not an accepted value ' + 'for rating_lst'.format(feature)) + + # Add user features + if self.user_lst: + temp = pd.DataFrame( + [self.trainset.u_features[uid] + for uid in ratings_df['userID']], + columns=self.trainset.user_features_labels) + for feature in self.user_lst: + if feature in self.trainset.user_features_labels: + libsvm_df[feature] = temp[feature] + else: + raise ValueError( + '{} is not part of user_features'.format(feature)) + + # Add item features + if self.item_lst: + temp = pd.DataFrame( + [self.trainset.i_features[iid] + for iid in ratings_df['itemID']], + columns=self.trainset.item_features_labels) + for feature in self.item_lst: + if feature in self.trainset.item_features_labels: + libsvm_df[feature] = temp[feature] + else: + raise ValueError( + '{} is not part of item_features'.format(feature)) + + self.libsvm_df = libsvm_df + self.n_features = libsvm_df.shape[1] - 1 + + def estimate(self, u, i, u_features, i_features): + + # Estimate rating + x = self._construct_estimate_input(u, i, u_features, i_features) + x = torch.Tensor(x[None, :]) # add dimension + est = float(self.model(x)) + + # Construct details + details = {} + if self.trainset.knows_user(u) and self.trainset.knows_item(i): + details['knows_user'] = True + details['knows_item'] = True + elif self.trainset.knows_user(u): + details['knows_user'] = True + details['knows_item'] = False + elif self.trainset.knows_item(i): + details['knows_user'] = False + details['knows_item'] = True + else: + details['knows_user'] = False + details['knows_item'] = False + + return est, details + + def _construct_estimate_input(self, u, i, u_features, i_features): + """ Construct the input for the model. 
+ + It is assumed that if features are given in u_features or i_features, + they are all given and in the same order as in the trainset. + """ + + if (self.user_lst and u_features and ( + len(u_features) != len(self.trainset.user_features_labels))): + raise ValueError('If u_features are provided for predict(), they' + 'should all be provided as in trainset') + if (self.item_lst and i_features and ( + len(i_features) != len(self.trainset.item_features_labels))): + raise ValueError('If i_features are provided for predict(), they' + 'should all be provided as in trainset') + + n_users = self.trainset.n_users + n_items = self.trainset.n_items + + x = [] + + # Add rating features + if self.rating_lst: + for feature in self.rating_lst: + if feature == 'userID': + temp = [0.] * n_users + if self.trainset.knows_user(u): + temp[u] = 1. + x.extend(temp) + elif feature == 'itemID': + temp = [0.] * n_items + if self.trainset.knows_item(i): + temp[i] = 1. + x.extend(temp) + elif feature == 'imp_u_rating': + temp = [0.] * n_items + if self.trainset.knows_user(u): + all_u_ratings = self.trainset.ur[u] + for other_i, rating in all_u_ratings: + if other_i != i: # only the other ratings + temp[other_i] = 1. + temp = np.array(temp) + count = np.count_nonzero(temp) + if count == 0: + count = 1 + temp = list(temp / count) + x.extend(temp) + elif feature == 'exp_u_rating': + # a rating is at least 1 with the offset + temp = [0.] * n_items + if self.trainset.knows_user(u): + all_u_ratings = self.trainset.ur[u] + for other_i, rating in all_u_ratings: + if other_i != i: # only the other ratings + temp[other_i] = rating + temp = np.array(temp) + count = np.count_nonzero(temp) + if count == 0: + count = 1 + temp = list(temp / count) + x.extend(temp) + + # Add user features + if self.user_lst: + temp = [0.] * len(self.user_lst) + if u_features: + # It is assumed that if features are given, they are all given. 
+ temp_df = pd.Series( + u_features, index=self.trainset.user_features_labels) + for idx, feature in enumerate(self.user_lst): + temp[idx] = temp_df[feature] + x.extend(temp) + + # Add item features + if self.item_lst: + temp = [0.] * len(self.item_lst) + if i_features: + # It is assumed that if features are given, they are all given. + temp_df = pd.Series( + i_features, index=self.trainset.item_features_labels) + for idx, feature in enumerate(self.item_lst): + temp[idx] = temp_df[feature] + x.extend(temp) + + return np.array(x) diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index 0a15e1d4..9307db18 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -27,7 +27,7 @@ class SymmetricAlgo(AlgoBase): reversed. """ - def __init__(self, sim_options={}, verbose=True, **kwargs): + def __init__(self, sim_options={}, verbose=False, **kwargs): AlgoBase.__init__(self, sim_options=sim_options, **kwargs) self.verbose = verbose @@ -83,10 +83,10 @@ class KNNBasic(SymmetricAlgo): measure. See :ref:`similarity_measures_configuration` for accepted options. verbose(bool): Whether to print trace messages of bias estimation, - similarity, etc. Default is True. + similarity, etc. Default is False. """ - def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, verbose=False, **kwargs): SymmetricAlgo.__init__(self, sim_options=sim_options, verbose=verbose, **kwargs) @@ -100,10 +100,10 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): - raise PredictionImpossible('User and/or item is unkown.') + raise PredictionImpossible('User and/or item is unknown.') x, y = self.switch(u, i) @@ -161,10 +161,10 @@ class KNNWithMeans(SymmetricAlgo): measure. See :ref:`similarity_measures_configuration` for accepted options. 
verbose(bool): Whether to print trace messages of bias estimation, - similarity, etc. Default is True. + similarity, etc. Default is False. """ - def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, verbose=False, **kwargs): SymmetricAlgo.__init__(self, sim_options=sim_options, verbose=verbose, **kwargs) @@ -183,10 +183,10 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): - raise PredictionImpossible('User and/or item is unkown.') + raise PredictionImpossible('User and/or item is unknown.') x, y = self.switch(u, i) @@ -254,17 +254,15 @@ class KNNBaseline(SymmetricAlgo): measure. See :ref:`similarity_measures_configuration` for accepted options. It is recommended to use the :func:`pearson_baseline ` similarity measure. - bsl_options(dict): A dictionary of options for the baseline estimates computation. See :ref:`baseline_estimates_configuration` for accepted options. verbose(bool): Whether to print trace messages of bias estimation, - similarity, etc. Default is True. - + similarity, etc. Default is False. """ def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={}, - verbose=True, **kwargs): + verbose=False, **kwargs): SymmetricAlgo.__init__(self, sim_options=sim_options, bsl_options=bsl_options, verbose=verbose, @@ -282,7 +280,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): est = self.trainset.global_mean if self.trainset.knows_user(u): @@ -354,10 +352,10 @@ class KNNWithZScore(SymmetricAlgo): measure. See :ref:`similarity_measures_configuration` for accepted options. verbose(bool): Whether to print trace messages of bias estimation, - similarity, etc. Default is True. + similarity, etc. Default is False. 
""" - def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, verbose=False, **kwargs): SymmetricAlgo.__init__(self, sim_options=sim_options, verbose=verbose, **kwargs) @@ -384,10 +382,10 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): - raise PredictionImpossible('User and/or item is unkown.') + raise PredictionImpossible('User and/or item is unknown.') x, y = self.switch(u, i) diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py new file mode 100644 index 00000000..03b7109b --- /dev/null +++ b/surprise/prediction_algorithms/linear.py @@ -0,0 +1,131 @@ +""" +the :mod:`linear` module includes linear features-based algorithms. +""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import numpy as np +from sklearn import linear_model + +from .predictions import PredictionImpossible +from .algo_base import AlgoBase + + +class Lasso(AlgoBase): + """A basic lasso algorithm with user-item interaction terms. + + The prediction :math:`\\hat{r}_{ui}` is set as: + + .. math:: + \\hat{r}_{ui} = \\alpha_1 + \\alpha_2^\\top y_u + \\alpha_3^\\top z_i + + \\alpha_4^\\top \\text{vec}(y_u \\otimes z_i) + + where :math:`\\alpha_1 \\in \\mathbb{R}, \\alpha_2 \\in \\mathbb{R}^o, \\alpha_3 + \\in \\mathbb{R}^p` and :math:`\\alpha_4 \\in \\mathbb{R}^{op}` are coefficient + vectors, and :math:`\\otimes` represents the Kronecker product of two vectors + (i.e., all possible cross-product combinations). + + Args: + add_interactions(bool): Whether to add user-item interaction terms. + Optional, default is True. + other args: See ``sklearn`` documentation for ``linear_model.Lasso``. 
+ """ + + def __init__(self, add_interactions=True, alpha=1.0, fit_intercept=True, + normalize=False, precompute=False, max_iter=1000, tol=0.0001, + positive=False, random_state=None, selection='cyclic', + **kwargs): + + AlgoBase.__init__(self, **kwargs) + self.add_interactions = add_interactions + self.alpha = alpha + self.fit_intercept = fit_intercept + self.normalize = normalize + self.precompute = precompute + self.max_iter = max_iter + self.tol = tol + self.positive = positive + self.random_state = random_state + self.selection = selection + + def fit(self, trainset): + + AlgoBase.fit(self, trainset) + self.lasso(trainset) + + return self + + def lasso(self, trainset): + + if (self.trainset.n_user_features == 0 or + self.trainset.n_item_features == 0): + raise ValueError('trainset does not contain user and/or item ' + 'features.') + + n_ratings = self.trainset.n_ratings + n_uf = self.trainset.n_user_features + n_if = self.trainset.n_item_features + u_features = self.trainset.u_features + i_features = self.trainset.i_features + uf_labels = self.trainset.user_features_labels + if_labels = self.trainset.item_features_labels + + X = np.empty((n_ratings, n_uf + n_if)) + y = np.empty((n_ratings,)) + for k, (uid, iid, rating) in enumerate(self.trainset.all_ratings()): + y[k] = rating + + try: + X[k, :n_uf] = u_features[uid] + except KeyError: + raise ValueError('No features for user ' + + str(self.trainset.to_raw_uid(uid))) + + try: + X[k, n_uf:] = i_features[iid] + except KeyError: + raise ValueError('No features for item ' + + str(self.trainset.to_raw_iid(iid))) + + coef_labels = uf_labels + if_labels + if self.add_interactions: + temp = np.array([X[:, v] * X[:, j] for v in range(n_uf) + for j in range(n_uf, n_uf + n_if)]).T + X = np.concatenate([X, temp], axis=1) + temp = [coef_labels[v] + '*' + coef_labels[j] for v in range(n_uf) + for j in range(n_uf, n_uf + n_if)] + coef_labels += temp + + reg = linear_model.Lasso( + alpha=self.alpha, 
fit_intercept=self.fit_intercept, + normalize=self.normalize, precompute=self.precompute, + max_iter=self.max_iter, tol=self.tol, positive=self.positive, + random_state=self.random_state, selection=self.selection) + reg.fit(X, y) + + self.X = X + self.y = y + self.coef = reg.coef_ + self.coef_labels = coef_labels + self.intercept = reg.intercept_ + + def estimate(self, u, i, u_features, i_features): + + n_uf = self.trainset.n_user_features + n_if = self.trainset.n_item_features + + if (len(u_features) != n_uf or + len(i_features) != n_if): + raise PredictionImpossible( + 'User and/or item features are missing.') + + X = np.concatenate([u_features, i_features]) + + if self.add_interactions: + temp = np.array([X[v] * X[j] for v in range(n_uf) + for j in range(n_uf, n_uf + n_if)]) + X = np.concatenate([X, temp]) + + est = self.intercept + np.dot(X, self.coef) + + return est diff --git a/surprise/prediction_algorithms/matrix_factorization.pyx b/surprise/prediction_algorithms/matrix_factorization.pyx index 0e898632..7a3cede5 100644 --- a/surprise/prediction_algorithms/matrix_factorization.pyx +++ b/surprise/prediction_algorithms/matrix_factorization.pyx @@ -253,7 +253,7 @@ class SVD(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i): + def estimate(self, u, i, *_): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -275,7 +275,7 @@ class SVD(AlgoBase): if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: - raise PredictionImpossible('User and item are unkown.') + raise PredictionImpossible('User and item are unknown.') return est @@ -484,7 +484,7 @@ class SVDpp(AlgoBase): self.qi = qi self.yj = yj - def estimate(self, u, i): + def estimate(self, u, i, *_): est = self.trainset.global_mean @@ -715,7 +715,7 @@ class NMF(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i): + def estimate(self, u, i, *_): # Should we cythonize this as well? 
known_user = self.trainset.knows_user(u) @@ -737,6 +737,6 @@ class NMF(AlgoBase): if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: - raise PredictionImpossible('User and item are unkown.') + raise PredictionImpossible('User and item are unknown.') return est diff --git a/surprise/prediction_algorithms/slope_one.pyx b/surprise/prediction_algorithms/slope_one.pyx index 8049a6cf..f986e496 100644 --- a/surprise/prediction_algorithms/slope_one.pyx +++ b/surprise/prediction_algorithms/slope_one.pyx @@ -79,7 +79,7 @@ class SlopeOne(AlgoBase): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') diff --git a/surprise/trainset.py b/surprise/trainset.py index ebb95204..bf69c850 100644 --- a/surprise/trainset.py +++ b/surprise/trainset.py @@ -33,21 +33,37 @@ class Trainset: ir(:obj:`defaultdict` of :obj:`list`): The items ratings. This is a dictionary containing lists of tuples of the form ``(user_inner_id, rating)``. The keys are item inner ids. + u_features(:obj:`defaultdict` of :obj:`list`): The user features. This + is a dictionary containing lists of features. The keys are user + inner ids. + i_features(:obj:`defaultdict` of :obj:`list`): The item features. This + is a dictionary containing lists of features. The keys are item + inner ids. n_users: Total number of users :math:`|U|`. n_items: Total number of items :math:`|I|`. + n_user_features: Total number of user features. + n_item_features: Total number of item features. n_ratings: Total number of ratings :math:`|R_{train}|`. rating_scale(tuple): The minimum and maximal rating of the rating scale. global_mean: The mean of all ratings :math:`\\mu`. 
""" - def __init__(self, ur, ir, n_users, n_items, n_ratings, rating_scale, - offset, raw2inner_id_users, raw2inner_id_items): + def __init__(self, ur, ir, u_features, i_features, n_users, n_items, + n_user_features, n_item_features, user_features_labels, + item_features_labels, n_ratings, rating_scale, offset, + raw2inner_id_users, raw2inner_id_items): self.ur = ur self.ir = ir + self.u_features = u_features + self.i_features = i_features self.n_users = n_users self.n_items = n_items + self.n_user_features = n_user_features + self.n_item_features = n_item_features + self.user_features_labels = user_features_labels + self.item_features_labels = item_features_labels self.n_ratings = n_ratings self.rating_scale = rating_scale self.offset = offset @@ -87,6 +103,30 @@ def knows_item(self, iid): return iid in self.ir + def has_user_features(self, uid): + """Indicate if the user features are part of the trainset. + + Args: + uid(int): The (inner) user id. See :ref:`this + note`. + Returns: + ``True`` if user features are part of the trainset, else ``False``. + """ + + return uid in self.u_features + + def has_item_features(self, iid): + """Indicate if the item features are part of the trainset. + + Args: + iid(int): The (inner) item id. See :ref:`this + note`. + Returns: + ``True`` if item features are part of the trainset, else ``False``. + """ + + return iid in self.i_features + def to_inner_uid(self, ruid): """Convert a **user** raw id to an inner id. @@ -200,8 +240,14 @@ def build_testset(self): cases where you want to to test your algorithm on the trainset. 
""" - return [(self.to_raw_uid(u), self.to_raw_iid(i), r) - for (u, i, r) in self.all_ratings()] + testset = [] + for (u, i, r) in self.all_ratings(): + u_features = self.u_features.get(u, []) + i_features = self.i_features.get(i, []) + testset.append((self.to_raw_uid(u), self.to_raw_iid(i), u_features, + i_features, r)) + + return testset def build_anti_testset(self, fill=None): """Return a list of ratings that can be used as a testset in the @@ -228,9 +274,13 @@ def build_anti_testset(self, fill=None): anti_testset = [] for u in self.all_users(): user_items = set([j for (j, _) in self.ur[u]]) - anti_testset += [(self.to_raw_uid(u), self.to_raw_iid(i), fill) for - i in self.all_items() if - i not in user_items] + anti_testset += [(self.to_raw_uid(u), + self.to_raw_iid(i), + self.u_features.get(u, []), + self.i_features.get(i, []), + fill) + for i in self.all_items() + if i not in user_items] return anti_testset def all_users(self): diff --git a/tests/test_accuracy.py b/tests/test_accuracy.py index e56821de..1d264f40 100644 --- a/tests/test_accuracy.py +++ b/tests/test_accuracy.py @@ -49,8 +49,8 @@ def test_rmse(): def test_fcp(): """Tests for the FCP function.""" - predictions = [pred(0, 0, u0='u1'), pred(1, 1, u0='u1'), pred(2, 2, - u0='u2'), pred(100, 100, u0='u2')] + predictions = [pred(0, 0, u0='u1'), pred(1, 1, u0='u1'), + pred(2, 2, u0='u2'), pred(100, 100, u0='u2')] assert fcp(predictions) == 1 predictions = [pred(0, 0, u0='u1'), pred(0, 0, u0='u1')] @@ -61,8 +61,8 @@ def test_fcp(): with pytest.raises(ValueError): fcp(predictions) - predictions = [pred(0, 1, u0='u1'), pred(1, 0, u0='u1'), pred(2, 0.5, - u0='u2'), pred(0, 0.6, u0='u2')] + predictions = [pred(0, 1, u0='u1'), pred(1, 0, u0='u1'), + pred(2, 0.5, u0='u2'), pred(0, 0.6, u0='u2')] assert fcp(predictions) == 0 with pytest.raises(ValueError): diff --git a/tests/test_algorithms.py b/tests/test_algorithms.py index 15cfd5f1..50cfa77a 100644 --- a/tests/test_algorithms.py +++ b/tests/test_algorithms.py 
@@ -7,6 +7,7 @@ import os import pytest +import pandas as pd from surprise import NormalPredictor from surprise import BaselineOnly @@ -18,16 +19,20 @@ from surprise import NMF from surprise import SlopeOne from surprise import CoClustering +from surprise import KNNWithZScore +from surprise import Lasso +from surprise import FM from surprise import Dataset from surprise import Reader -from surprise import KNNWithZScore from surprise.model_selection import PredefinedKFold from surprise.model_selection import train_test_split def test_unknown_user_or_item(): """Ensure that all algorithms act gracefully when asked to predict a rating - of an unknown user, an unknown item, and when both are unknown. + of an unknown user and/or an unknown item with unknown or known user + features and/or unknown or known item features. Also, test how they react + when features are missing/added for fit. """ reader = Reader(line_format='user item rating', sep=' ', skip_lines=3, @@ -35,22 +40,160 @@ def test_unknown_user_or_item(): file_path = os.path.dirname(os.path.realpath(__file__)) + '/custom_dataset' + # df with all users + u_features_df = pd.DataFrame( + {'urid': ['user0', 'user2', 'user3', 'user1', 'user4'], + 'isMale': [False, True, False, True, False]}, + columns=['urid', 'isMale']) + + # df with all items + i_features_df = pd.DataFrame( + {'irid': ['item0', 'item1'], + 'isNew': [False, True], + 'webRating': [4, 3], + 'isComedy': [True, False]}, + columns=['irid', 'isNew', 'webRating', 'isComedy']) + + # df with missing user + u_features_m_df = pd.DataFrame( + {'urid': ['user0', 'user2', 'user3', 'user1'], + 'isMale': [False, True, False, True]}, + columns=['urid', 'isMale']) + + # df with missing item + i_features_m_df = pd.DataFrame( + {'irid': ['item0'], + 'isNew': [False], + 'webRating': [4], + 'isComedy': [True]}, + columns=['irid', 'isNew', 'webRating', 'isComedy']) + data = Dataset.load_from_file(file_path=file_path, reader=reader) trainset = data.build_full_trainset() + 
data_u = Dataset.load_from_file(file_path=file_path, reader=reader) + data_u.load_features_df(u_features_df, user_features=True) + trainset_u = data_u.build_full_trainset() + + data_i = Dataset.load_from_file(file_path=file_path, reader=reader) + data_i.load_features_df(i_features_df, user_features=False) + trainset_i = data_i.build_full_trainset() + + data_ui = Dataset.load_from_file(file_path=file_path, reader=reader) + data_ui.load_features_df(u_features_df, user_features=True) + data_ui.load_features_df(i_features_df, user_features=False) + trainset_ui = data_ui.build_full_trainset() + + data_ui_mu = Dataset.load_from_file(file_path=file_path, reader=reader) + data_ui_mu.load_features_df(u_features_m_df, user_features=True) + data_ui_mu.load_features_df(i_features_df, user_features=False) + with pytest.raises(ValueError): + data_ui_mu.build_full_trainset() + + data_ui_mi = Dataset.load_from_file(file_path=file_path, reader=reader) + data_ui_mi.load_features_df(u_features_df, user_features=True) + data_ui_mi.load_features_df(i_features_m_df, user_features=False) + with pytest.raises(ValueError): + data_ui_mi.build_full_trainset() + + # algos not using features klasses = (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, KNNBaseline, SVD, SVDpp, NMF, SlopeOne, CoClustering, KNNWithZScore) for klass in klasses: algo = klass() algo.fit(trainset) - algo.predict('user0', 'unknown_item', None) - algo.predict('unkown_user', 'item0', None) - algo.predict('unkown_user', 'unknown_item', None) + algo.fit(trainset_u) + algo.fit(trainset_i) + algo.fit(trainset_ui) + algo.predict('user0', 'unknown_item') + algo.predict('unkown_user', 'item0') + algo.predict('unkown_user', 'unknown_item') + algo.predict('user0', 'unknown_item', [], []) + algo.predict('unkown_user', 'item0', [], []) + algo.predict('unkown_user', 'unknown_item', [], []) + algo.predict('user0', 'unknown_item', [False], []) + algo.predict('unkown_user', 'item0', [False], []) + algo.predict('unkown_user', 
'unknown_item', [False], []) + algo.predict('user0', 'unknown_item', [], [False, 4, True]) + algo.predict('unkown_user', 'item0', [], [False, 4, True]) + algo.predict('unkown_user', 'unknown_item', [], [False, 4, True]) + algo.predict('user0', 'unknown_item', [False], [False, 4, True]) + algo.predict('unkown_user', 'item0', [False], [False, 4, True]) + algo.predict('unkown_user', 'unknown_item', [False], [False, 4, True]) + + # algos using user and item features + klasses_ui = (Lasso,) + for klass in klasses_ui: + algo = klass() + with pytest.raises(ValueError): + algo.fit(trainset) + with pytest.raises(ValueError): + algo.fit(trainset_u) + with pytest.raises(ValueError): + algo.fit(trainset_i) + algo.fit(trainset_ui) + algo.predict('user0', 'unknown_item') + algo.predict('unkown_user', 'item0') + algo.predict('unkown_user', 'unknown_item') + algo.predict('user0', 'unknown_item', [], []) + algo.predict('unkown_user', 'item0', [], []) + algo.predict('unkown_user', 'unknown_item', [], []) + algo.predict('user0', 'unknown_item', [False], []) + algo.predict('unkown_user', 'item0', [False], []) + algo.predict('unkown_user', 'unknown_item', [False], []) + algo.predict('user0', 'unknown_item', [], [False, 4, True]) + algo.predict('unkown_user', 'item0', [], [False, 4, True]) + algo.predict('unkown_user', 'unknown_item', [], [False, 4, True]) + algo.predict('user0', 'unknown_item', [False], [False, 4, True]) + algo.predict('unkown_user', 'item0', [False], [False, 4, True]) + algo.predict('unkown_user', 'unknown_item', [False], [False, 4, True]) + + # FM using (optionally) user and item features + rating_lst_opt = ['userID', 'itemID', 'imp_u_rating', 'exp_u_rating'] + user_lst_opt = ['isMale'] + item_lst_opt = ['isNew', 'webRating', 'isComedy'] + algos = (FM(rating_lst=rating_lst_opt), FM(user_lst=user_lst_opt), + FM(item_lst=item_lst_opt), + FM(rating_lst=rating_lst_opt, user_lst=user_lst_opt, + item_lst=item_lst_opt)) + for algo in algos: + if algo.user_lst: + with 
pytest.raises(ValueError): + algo.fit(trainset) + with pytest.raises(ValueError): + algo.fit(trainset_i) + if algo.item_lst: + with pytest.raises(ValueError): + algo.fit(trainset) + with pytest.raises(ValueError): + algo.fit(trainset_u) + algo.fit(trainset_ui) + if algo.user_lst: + with pytest.raises(ValueError): + algo.predict('user0', 'item0', [False, 'AddedFeature'], []) + if algo.item_lst: + with pytest.raises(ValueError): + algo.predict('user0', 'item0', [], ['NotEnoughFeatures']) + algo.predict('user0', 'unknown_item') + algo.predict('unkown_user', 'item0') + algo.predict('unkown_user', 'unknown_item') + algo.predict('user0', 'unknown_item', [], []) + algo.predict('unkown_user', 'item0', [], []) + algo.predict('unkown_user', 'unknown_item', [], []) + algo.predict('user0', 'unknown_item', [False], []) + algo.predict('unkown_user', 'item0', [False], []) + algo.predict('unkown_user', 'unknown_item', [False], []) + algo.predict('user0', 'unknown_item', [], [False, 4, True]) + algo.predict('unkown_user', 'item0', [], [False, 4, True]) + algo.predict('unkown_user', 'unknown_item', [], [False, 4, True]) + algo.predict('user0', 'unknown_item', [False], [False, 4, True]) + algo.predict('unkown_user', 'item0', [False], [False, 4, True]) + algo.predict('unkown_user', 'unknown_item', [False], [False, 4, True]) # unrelated, but test the fit().test() one-liner: - trainset, testset = train_test_split(data, test_size=2) - for klass in klasses: + trainset, testset = train_test_split(data_ui, test_size=2) + for klass in (klasses + klasses_ui + (FM,)): algo = klass() algo.fit(trainset).test(testset) with pytest.warns(UserWarning): diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 69311404..c29c989f 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -23,6 +23,7 @@ def test_wrong_file_name(): """Ensure file names are checked when creating a (custom) Dataset.""" + wrong_files = [('does_not_exist', 'does_not_either')] with pytest.raises(ValueError): @@ 
-40,8 +41,77 @@ def test_build_full_trainset(): assert len(trainset.ur) == 5 assert len(trainset.ir) == 2 + assert len(trainset.u_features) == 0 + assert len(trainset.i_features) == 0 assert trainset.n_users == 5 assert trainset.n_items == 2 + assert trainset.n_user_features == 0 + assert trainset.n_item_features == 0 + assert len(trainset.user_features_labels) == 0 + assert len(trainset.item_features_labels) == 0 + + +def test_load_features_df_columns_number(): + """Ensure number of columns in features DataFrame is checked.""" + + custom_dataset_path = (os.path.dirname(os.path.realpath(__file__)) + + '/custom_dataset') + data = Dataset.load_from_file(file_path=custom_dataset_path, reader=reader) + onecol_df = pd.DataFrame({'test': [False, True]}, columns=['test']) + + with pytest.raises(ValueError): + data.load_features_df(onecol_df) + + +def test_load_features_df_unique_ids(): + """Ensure that there is a check for unique values in the first column of + the features DataFrame.""" + + custom_dataset_path = (os.path.dirname(os.path.realpath(__file__)) + + '/custom_dataset') + data = Dataset.load_from_file(file_path=custom_dataset_path, reader=reader) + nonunique_df = pd.DataFrame( + {'ids': ['user0', 'user1', 'user0'], + 'feature': [True, False, True]}, + columns=['ids', 'feature']) + + with pytest.raises(ValueError): + data.load_features_df(nonunique_df) + + +def test_build_full_trainset_ui_features(): + """Test the build_full_trainset method with user and item features.""" + + custom_dataset_path = (os.path.dirname(os.path.realpath(__file__)) + + '/custom_dataset') + data = Dataset.load_from_file(file_path=custom_dataset_path, reader=reader) + + u_features_df = pd.DataFrame( + {'urid': ['user0', 'user2', 'user3', 'user1', 'user4'], + 'isMale': [False, True, False, True, False]}, + columns=['urid', 'isMale']) + data = data.load_features_df(u_features_df, user_features=True) + + i_features_df = pd.DataFrame( + {'irid': ['item0', 'item1'], + 'isNew': [False, True], + 
'webRating': [4, 3], + 'isComedy': [True, False]}, + columns=['irid', 'isNew', 'webRating', 'isComedy']) + data = data.load_features_df(i_features_df, user_features=False) + + trainset = data.build_full_trainset() + + assert len(trainset.ur) == 5 + assert len(trainset.ir) == 2 + assert len(trainset.u_features) == 5 + assert len(trainset.i_features) == 2 + assert trainset.n_users == 5 + assert trainset.n_items == 2 + assert trainset.n_user_features == 1 + assert trainset.n_item_features == 3 + assert len(trainset.user_features_labels) == 1 + assert len(trainset.item_features_labels) == 3 def test_no_call_to_split(): @@ -141,16 +211,133 @@ def test_trainset_testset(): assert trainset.n_ratings == 6 assert trainset.rating_scale == (1, 5) + # test user features + u_features = trainset.u_features + assert u_features[0] == [] # no u_features_df added + assert u_features[1] == [] # no u_features_df added + assert u_features[3] == [] # no u_features_df added + assert u_features[40] == [] # not in trainset and no u_features_df + assert trainset.user_features_labels == [] + assert trainset.n_user_features == 0 + + # test item features + i_features = trainset.i_features + assert i_features[0] == [] # no i_features_df added + assert i_features[1] == [] # no i_features_df added + assert i_features[20000] == [] # not in trainset and no i_features_df + assert trainset.item_features_labels == [] + assert trainset.n_item_features == 0 + + # test raw2inner + for i in range(4): + assert trainset.to_inner_uid('user' + str(i)) == i + with pytest.raises(ValueError): + trainset.to_inner_uid('unknown_user') + + for i in range(2): + assert trainset.to_inner_iid('item' + str(i)) == i + with pytest.raises(ValueError): + trainset.to_inner_iid('unknown_item') + + # test inner2raw + assert trainset._inner2raw_id_users is None + assert trainset._inner2raw_id_items is None + for i in range(4): + assert trainset.to_raw_uid(i) == 'user' + str(i) + for i in range(2): + assert trainset.to_raw_iid(i) 
== 'item' + str(i) + assert trainset._inner2raw_id_users is not None + assert trainset._inner2raw_id_items is not None + + # Test the build_testset() method + algo = BaselineOnly() + algo.fit(trainset) + testset = trainset.build_testset() + algo.test(testset) # ensure an algorithm can manage the data + assert ('user0', 'item0', [], [], 4) in testset + assert ('user3', 'item1', [], [], 5) in testset + assert ('user3', 'item1', [], [], 0) not in testset + + # Test the build_anti_testset() method + algo = BaselineOnly() + algo.fit(trainset) + testset = trainset.build_anti_testset() + algo.test(testset) # ensure an algorithm can manage the data + assert ('user0', 'item0', [], [], trainset.global_mean) not in testset + assert ('user3', 'item1', [], [], trainset.global_mean) not in testset + assert ('user0', 'item1', [], [], trainset.global_mean) in testset + assert ('user3', 'item0', [], [], trainset.global_mean) in testset + + +def test_trainset_testset_ui_features(): + """Test the construct_trainset and construct_testset methods with user and + item features.""" + + current_dir = os.path.dirname(os.path.realpath(__file__)) + folds_files = [(current_dir + '/custom_train', + current_dir + '/custom_test')] + + data = Dataset.load_from_folds(folds_files=folds_files, reader=reader) + + u_features_df = pd.DataFrame( + {'urid': ['user0', 'user2', 'user3', 'user1'], + 'isMale': [False, True, False, True]}, + columns=['urid', 'isMale']) + data = data.load_features_df(u_features_df, user_features=True) + + i_features_df = pd.DataFrame( + {'irid': ['item0', 'item1'], + 'isNew': [False, True], + 'webRating': [4, 3], + 'isComedy': [True, False]}, + columns=['irid', 'isNew', 'webRating', 'isComedy']) + data = data.load_features_df(i_features_df, user_features=False) + + with pytest.warns(UserWarning): + trainset, testset = next(data.folds()) + + # test ur + ur = trainset.ur + assert ur[0] == [(0, 4)] + assert ur[1] == [(0, 4), (1, 2)] + assert ur[40] == [] # not in the trainset + + 
# test ir + ir = trainset.ir + assert ir[0] == [(0, 4), (1, 4), (2, 1)] + assert ir[1] == [(1, 2), (2, 1), (3, 5)] + assert ir[20000] == [] # not in the trainset + + # test n_users, n_items, n_ratings, rating_scale + assert trainset.n_users == 4 + assert trainset.n_items == 2 + assert trainset.n_ratings == 6 + assert trainset.rating_scale == (1, 5) + + # test user features + u_features = trainset.u_features + assert u_features[0] == [False] + assert u_features[40] == [] # not in trainset and u_features_df + assert trainset.user_features_labels == ['isMale'] + assert trainset.n_user_features == 1 + + # test item features + i_features = trainset.i_features + assert i_features[0] == [False, 4, True] + assert i_features[20000] == [] # not in trainset and i_features_df + assert trainset.item_features_labels == ['isNew', 'webRating', 'isComedy'] + assert trainset.n_item_features == 3 + # test raw2inner for i in range(4): assert trainset.to_inner_uid('user' + str(i)) == i with pytest.raises(ValueError): - trainset.to_inner_uid('unkown_user') + trainset.to_inner_uid('unknown_user') for i in range(2): assert trainset.to_inner_iid('item' + str(i)) == i with pytest.raises(ValueError): - trainset.to_inner_iid('unkown_item') + trainset.to_inner_iid('unknown_item') # test inner2raw assert trainset._inner2raw_id_users is None @@ -167,19 +354,24 @@ def test_trainset_testset(): algo.fit(trainset) testset = trainset.build_testset() algo.test(testset) # ensure an algorithm can manage the data - assert ('user0', 'item0', 4) in testset - assert ('user3', 'item1', 5) in testset - assert ('user3', 'item1', 0) not in testset + assert ('user0', 'item0', [False], [False, 4, True], 4) in testset + assert ('user2', 'item1', [True], [True, 3, False], 1) in testset + assert ('user3', 'item1', [False], [True, 3, False], 5) in testset + assert ('user3', 'item1', [False], [True, 3, False], 0) not in testset # Test the build_anti_testset() method algo = BaselineOnly() algo.fit(trainset) testset = 
trainset.build_anti_testset() algo.test(testset) # ensure an algorithm can manage the data - assert ('user0', 'item0', trainset.global_mean) not in testset - assert ('user3', 'item1', trainset.global_mean) not in testset - assert ('user0', 'item1', trainset.global_mean) in testset - assert ('user3', 'item0', trainset.global_mean) in testset + assert (('user0', 'item0', [False], [False, 4, True], trainset.global_mean) + not in testset) + assert (('user3', 'item1', [False], [True, 3, False], trainset.global_mean) + not in testset) + assert (('user0', 'item1', [False], [True, 3, False], trainset.global_mean) + in testset) + assert (('user3', 'item0', [False], [False, 4, True], trainset.global_mean) + in testset) def test_load_form_df(): @@ -238,11 +430,11 @@ def test_build_anti_testset(): # fill with some specific value for fillvalue in (0, 42., -1): anti = trainset.build_anti_testset(fill=fillvalue) - for (u, i, r) in anti: + for (u, i, u_f, i_f, r) in anti: assert r == fillvalue # fill with global_mean anti = trainset.build_anti_testset(fill=None) - for (u, i, r) in anti: + for (u, i, u_f, i_f, r) in anti: assert r == trainset.global_mean expect = trainset.n_users * trainset.n_items assert trainset.n_ratings + len(anti) == expect diff --git a/tests/test_dump.py b/tests/test_dump.py index 128442f9..a6be8522 100644 --- a/tests/test_dump.py +++ b/tests/test_dump.py @@ -34,19 +34,21 @@ def test_dump(): algo.fit(trainset) predictions = algo.test(testset) - with tempfile.NamedTemporaryFile() as tmp_file: + with tempfile.NamedTemporaryFile(delete=False) as tmp_file: dump.dump(tmp_file.name, predictions, algo) predictions_dumped, algo_dumped = dump.load(tmp_file.name) predictions_algo_dumped = algo_dumped.test(testset) assert predictions == predictions_dumped assert predictions == predictions_algo_dumped + os.remove(tmp_file.name) def test_dump_nothing(): """Ensure that by default None objects are dumped.""" - with tempfile.NamedTemporaryFile() as tmp_file: + with 
tempfile.NamedTemporaryFile(delete=False) as tmp_file: dump.dump(tmp_file.name) predictions, algo = dump.load(tmp_file.name) assert predictions is None assert algo is None + os.remove(tmp_file.name) diff --git a/tests/test_grid_search.py b/tests/test_grid_search.py index e54eeff3..1ba4cb68 100644 --- a/tests/test_grid_search.py +++ b/tests/test_grid_search.py @@ -99,15 +99,17 @@ def test_same_splits(): data.split(3) # all RMSE should be the same (as param combinations are the same) - param_grid = {'n_epochs': [1, 1], 'lr_all': [.5, .5]} + param_grid = {'n_epochs': [1, 1], 'lr_all': [.5, .5], 'random_state': [0]} with pytest.warns(UserWarning): - grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], n_jobs=-1) - grid_search.evaluate(data) + grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], n_jobs=1) + with pytest.warns(UserWarning): + grid_search.evaluate(data) rmse_scores = [s['RMSE'] for s in grid_search.cv_results['scores']] assert len(set(rmse_scores)) == 1 # assert rmse_scores are all equal # evaluate grid search again, to make sure that splits are still the same. 
- grid_search.evaluate(data) + with pytest.warns(UserWarning): + grid_search.evaluate(data) rmse_scores += [s['RMSE'] for s in grid_search.cv_results['scores']] assert len(set(rmse_scores)) == 1 diff --git a/tests/test_split.py b/tests/test_split.py index 0c12cb53..d55eb5ad 100644 --- a/tests/test_split.py +++ b/tests/test_split.py @@ -299,7 +299,7 @@ def test_LeaveOneOut(): # Make sure only one rating per user is present in the testset loo = LeaveOneOut() for _, testset in loo.split(data): - cnt = Counter([uid for (uid, _, _) in testset]) + cnt = Counter([uid for (uid, _, _, _, _) in testset]) assert all(val == 1 for val in itervalues(cnt)) # test the min_n_ratings parameter diff --git a/tests/test_train2fit.py b/tests/test_train2fit.py index ab0634e4..b2993031 100644 --- a/tests/test_train2fit.py +++ b/tests/test_train2fit.py @@ -35,7 +35,7 @@ def fit(self, trainset): self.bu, self.bi = 1, 1 self.cnt += 1 - def estimate(self, u, i): + def estimate(self, u, i, *_): return self.est algo = CustomAlgoFit() @@ -91,7 +91,7 @@ def train(self, trainset): self.bu, self.bi = 1, 1 self.cnt += 1 - def estimate(self, u, i): + def estimate(self, u, i, *_): return self.est with pytest.warns(UserWarning):