Factorization machines #172

Open

wants to merge 69 commits into base: user_item_features

Commits (69)
8a3532f
added asym_rmse and asym_mae
martincousi Mar 26, 2018
de2cd0c
Merge pull request #1 from NicolasHug/master
martincousi Mar 26, 2018
6d18af6
Merge pull request #2 from martincousi/asymetric-measures
martincousi Mar 26, 2018
3f6b1d0
disable print in AlgoBase.compute_baselines()
martincousi Mar 27, 2018
daab1ba
Cancel printing of computation of similarities
martincousi Mar 28, 2018
05ef072
Cancel printing of similiraty computation
martincousi Mar 28, 2018
902246f
add load_features_df() method
martincousi Mar 29, 2018
fb64e98
modified construct_trainset() and load_features_df()
martincousi Mar 29, 2018
13f3a28
modified Trainset.__init__()
martincousi Mar 29, 2018
900c0c0
corrected bugs in print statement
martincousi Mar 29, 2018
68ccfca
use user_features_nb to test if initialized
martincousi Mar 30, 2018
f7fa4d8
revert back changes to accuracy.py
martincousi Mar 30, 2018
c6591ae
revert back changes to AlgoBase
martincousi Mar 30, 2018
e31e857
Update .gitignore
martincousi Mar 30, 2018
7d67963
Update .gitignore
martincousi Mar 30, 2018
73bea50
fixed python 2 compatibility
martincousi Mar 30, 2018
4063da8
construction of Lasso.fit()
martincousi Apr 4, 2018
34dd04b
modified predict and estimate methods
martincousi Apr 4, 2018
d275f84
include features in testset and prediction objects
martincousi Apr 4, 2018
14d1248
update matrix factorization estimate method
martincousi Apr 4, 2018
a2b87c4
adapt estimate methods for all prediction algorithms
martincousi Apr 4, 2018
3c5f7e6
add sklearn arguments to Lasso
martincousi Apr 5, 2018
7b82e78
single underscore for dummy variable
martincousi Apr 5, 2018
bf335c2
update documentation for Lasso and change filename
martincousi Apr 5, 2018
e34a5f9
correct conflict with master
martincousi Apr 5, 2018
4081244
add interaction terms in Lasso
martincousi Apr 5, 2018
d3dd0dd
add interaction terms to Lasso.estimate
martincousi Apr 5, 2018
47ff477
correct conflicts with master
martincousi Apr 5, 2018
1279424
correct verbose conflicts in knns
martincousi Apr 5, 2018
62ccd84
add add_interactions to self in Lasso
martincousi Apr 5, 2018
53b8697
change add_interactions fn name
martincousi Apr 5, 2018
7e34298
remove add_interactions_fn
martincousi Apr 5, 2018
39c2601
correct bad index
martincousi Apr 5, 2018
4f3c3a8
pep8 and description
martincousi Apr 5, 2018
aab90a5
add feature labels
martincousi Apr 5, 2018
bfb2b8d
resolve conflicts
martincousi Apr 5, 2018
f9255e1
remove Lasso
martincousi Apr 5, 2018
1d74bef
Merge branch 'master' into features-dataset
martincousi Apr 5, 2018
e63d5ad
Merge pull request #3 from martincousi/features-dataset
martincousi Apr 5, 2018
ed7180d
Revert "Features dataset"
martincousi Apr 5, 2018
40cc8d2
Merge pull request #4 from martincousi/revert-3-features-dataset
martincousi Apr 5, 2018
e3de208
Revert "Revert "Features dataset""
martincousi Apr 5, 2018
c52d707
Merge pull request #5 from martincousi/revert-4-revert-3-features-dat…
martincousi Apr 5, 2018
4fabe29
add lasso
martincousi Apr 5, 2018
e7adc87
Merge pull request #6 from martincousi/lasso
martincousi Apr 5, 2018
4fc4242
Merge pull request #7 from NicolasHug/master
martincousi Apr 6, 2018
885aab2
add factorization_machines.py
martincousi Apr 9, 2018
69c281f
add FMAlgo and FMBasic
martincousi Apr 11, 2018
b8132e5
solve bugs
martincousi Apr 12, 2018
b06a05d
add indication of features in Prediction
martincousi Apr 12, 2018
458a419
add FMBasic based on polylearn
martincousi Apr 13, 2018
d8cea2f
Remove features from Prediction
martincousi Apr 13, 2018
4337b24
changed tests
martincousi Apr 13, 2018
a0765a2
correct tests
martincousi Apr 13, 2018
1b6ee31
correct tests
martincousi Apr 13, 2018
bef04b9
correct FMBasicPL
martincousi Apr 13, 2018
c998024
add tests for datasets with features
martincousi Apr 20, 2018
531b536
add test for missing user or item features
martincousi Apr 20, 2018
991f30f
Add doc to FMBasic and FMBasicPL
martincousi Apr 24, 2018
9550933
Add FMImplicit and FMExplicit
martincousi Apr 25, 2018
e1cea2c
Add FMFeatures
martincousi Apr 25, 2018
037bf10
Added incomplete implementation of FM (without sample_weights)
martincousi May 28, 2019
011105c
Implemented FM with Pytorch
martincousi May 29, 2019
c808cde
Removed FM deprecated implementations
martincousi May 29, 2019
efe2b85
Added dev set functionnality to FM
martincousi May 29, 2019
1053a60
Update linear.py
martincousi May 29, 2019
e65e68a
Merge remote-tracking branch 'origin/lasso' into factorization-machines
martincousi May 29, 2019
cc09a6d
Change Baseline_only default verbose value
martincousi May 29, 2019
723afd7
Updated tests
martincousi Jun 3, 2019
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -23,4 +23,4 @@ _site

.coverage
tags
settings.json
settings.json
5 changes: 4 additions & 1 deletion surprise/__init__.py
@@ -12,6 +12,8 @@
from .prediction_algorithms import NMF
from .prediction_algorithms import SlopeOne
from .prediction_algorithms import CoClustering
from .prediction_algorithms import Lasso
from .prediction_algorithms import FM

from .prediction_algorithms import PredictionImpossible
from .prediction_algorithms import Prediction
@@ -30,6 +32,7 @@
'KNNWithMeans', 'KNNBaseline', 'SVD', 'SVDpp', 'NMF', 'SlopeOne',
'CoClustering', 'PredictionImpossible', 'Prediction', 'Dataset',
'Reader', 'Trainset', 'evaluate', 'print_perf', 'GridSearch',
'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection']
'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection',
'Lasso', 'FM']

__version__ = get_distribution('scikit-surprise').version
87 changes: 84 additions & 3 deletions surprise/dataset.py
@@ -53,6 +53,12 @@ class Dataset:
def __init__(self, reader):

self.reader = reader
self.user_features_nb = 0
self.item_features_nb = 0
self.user_features = {}
self.item_features = {}
self.user_features_labels = []
self.item_features_labels = []

@classmethod
def load_builtin(cls, name='ml-100k'):
@@ -165,6 +171,42 @@ def load_from_df(cls, df, reader):

return DatasetAutoFolds(reader=reader, df=df)

def load_features_df(self, features_df, user_features=True):
"""Load features from a pandas dataframe into a dataset.

Use this if you want to add user or item features to a dataset. Only
certain prediction algorithms in the :mod:`prediction_algorithms`
package support this additional data.

Args:
features_df(`Dataframe`): The dataframe containing the features. It
must have two columns or more, corresponding to the user or
item (raw) ids, and the features, in this order.
user_features(:obj:`bool`): Whether the features are for the users
or the items. Default is ``True``.
"""

if len(features_df.columns) < 2:
raise ValueError('features_df requires at least 2 columns.')

if not features_df.iloc[:, 0].is_unique:
raise ValueError('first column of features_df must be unique ids.')

if user_features:
self.user_features_df = features_df
for tup in features_df.itertuples(index=False):
self.user_features[tup[0]] = list(tup[1:])
self.user_features_labels = features_df.columns.values.tolist()[1:]
self.user_features_nb = len(self.user_features_labels)
else:
self.item_features_df = features_df
for tup in features_df.itertuples(index=False):
self.item_features[tup[0]] = list(tup[1:])
self.item_features_labels = features_df.columns.values.tolist()[1:]
self.item_features_nb = len(self.item_features_labels)

return self
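
A minimal sketch of what `load_features_df` builds from a features dataframe, using a hypothetical two-user frame (the column names `uid`, `age`, and `premium` are invented for illustration; pandas is assumed available):

```python
import pandas as pd

# Hypothetical user features: the first column holds the raw user ids,
# the remaining columns hold the feature values, matching the layout
# load_features_df expects.
features_df = pd.DataFrame({
    'uid': ['u1', 'u2'],
    'age': [25, 31],
    'premium': [1, 0],
})

# Reproduce the dict construction performed inside load_features_df.
user_features = {}
for tup in features_df.itertuples(index=False):
    user_features[tup[0]] = list(tup[1:])

user_features_labels = features_df.columns.values.tolist()[1:]

# user_features maps raw id -> feature list, e.g. {'u1': [25, 1], ...}
# (values may be numpy scalars depending on the dtype).
print(user_features)
print(user_features_labels)  # ['age', 'premium']
```

Since the first column must contain unique raw ids, a duplicate id would silently overwrite an earlier row here, which is why the method validates `is_unique` up front.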

def read_ratings(self, file_name):
"""Return a list of ratings (user, item, rating, timestamp) read from
file_name"""
@@ -208,20 +250,36 @@ def construct_trainset(self, raw_trainset):
ur = defaultdict(list)
ir = defaultdict(list)

u_features = defaultdict(list)
i_features = defaultdict(list)

# user raw id, item raw id, translated rating, time stamp
for urid, irid, r, timestamp in raw_trainset:
for urid, irid, r, _ in raw_trainset:
try:
uid = raw2inner_id_users[urid]
except KeyError:
uid = current_u_index
raw2inner_id_users[urid] = current_u_index
current_u_index += 1
if self.user_features_nb > 0:
try:
u_features[uid] = self.user_features[urid]
except KeyError:
raise ValueError('Features are defined for all users '
'but user {}'.format(urid))

try:
iid = raw2inner_id_items[irid]
except KeyError:
iid = current_i_index
raw2inner_id_items[irid] = current_i_index
current_i_index += 1
if self.item_features_nb > 0:
try:
i_features[iid] = self.item_features[irid]
except KeyError:
raise ValueError('Features are defined for all items '
'but item {}'.format(irid))

ur[uid].append((iid, r))
ir[iid].append((uid, r))
@@ -232,8 +290,14 @@ def construct_trainset(self, raw_trainset):

trainset = Trainset(ur,
ir,
u_features,
i_features,
n_users,
n_items,
self.user_features_nb,
self.item_features_nb,
self.user_features_labels,
self.item_features_labels,
n_ratings,
self.reader.rating_scale,
self.reader.offset,
@@ -244,8 +308,25 @@

def construct_testset(self, raw_testset):

return [(ruid, riid, r_ui_trans)
for (ruid, riid, r_ui_trans, _) in raw_testset]
testset = []
for (ruid, riid, r_ui_trans, _) in raw_testset:
if self.user_features_nb > 0:
try: # add features if available
u_features = self.user_features[ruid]
except KeyError:
u_features = []
else:
u_features = []
if self.item_features_nb > 0:
try: # add features if available
i_features = self.item_features[riid]
except KeyError:
i_features = []
else:
i_features = []
testset.append((ruid, riid, u_features, i_features, r_ui_trans))

return testset
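
The per-rating entries built above can be sketched as follows; the feature dicts and raw ids are hypothetical, and `dict.get` stands in for the try/except fallback to `[]`:

```python
# Hypothetical feature dicts, as produced by load_features_df.
user_features = {'u1': [25, 1]}   # 'u2' has no features on record
item_features = {}                # no item features loaded at all

raw_testset = [('u1', 'i9', 3.5, None), ('u2', 'i9', 4.0, None)]

testset = []
for ruid, riid, r_ui_trans, _ in raw_testset:
    # Same effect as the try/except above: missing features become [].
    u_feat = user_features.get(ruid, [])
    i_feat = item_features.get(riid, [])
    testset.append((ruid, riid, u_feat, i_feat, r_ui_trans))

print(testset[0])  # ('u1', 'i9', [25, 1], [], 3.5)
print(testset[1])  # ('u2', 'i9', [], [], 4.0)
```

Note the asymmetry with `construct_trainset`: the trainset raises on a missing feature vector, while the testset silently falls back to an empty list.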


class DatasetUserFolds(Dataset):
2 changes: 2 additions & 0 deletions surprise/evaluate.py
@@ -301,6 +301,7 @@ class CaseInsensitiveDefaultDict(defaultdict):
Used for the returned dict, so that users can use perf['RMSE'] or
perf['rmse'] interchangeably.
"""

def __setitem__(self, key, value):
super(CaseInsensitiveDefaultDict, self).__setitem__(key.lower(), value)

@@ -333,4 +334,5 @@ def seed_and_eval(seed, *args):
different processes."""

random.seed(seed)

return evaluate(*args, verbose=0)
2 changes: 2 additions & 0 deletions surprise/model_selection/search.py
@@ -294,6 +294,7 @@ class GridSearchCV(BaseSearchCV):
into a pandas `DataFrame` (see :ref:`example
<cv_results_example>`).
"""

def __init__(self, algo_class, param_grid, measures=['rmse', 'mae'],
cv=None, refit=False, return_train_measures=False, n_jobs=1,
pre_dispatch='2*n_jobs', joblib_verbose=0):
@@ -410,6 +411,7 @@ class RandomizedSearchCV(BaseSearchCV):
into a pandas `DataFrame` (see :ref:`example
<cv_results_example>`).
"""

def __init__(self, algo_class, param_distributions, n_iter=10,
measures=['rmse', 'mae'], cv=None, refit=False,
return_train_measures=False, n_jobs=1,
2 changes: 1 addition & 1 deletion surprise/model_selection/split.py
@@ -372,7 +372,7 @@ def split(self, data):

Args:
data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
ratings that will be devided into trainsets and testsets.
ratings that will be divided into trainsets and testsets.

Yields:
tuple of (trainset, testset)
4 changes: 3 additions & 1 deletion surprise/prediction_algorithms/__init__.py
@@ -32,11 +32,13 @@
from .matrix_factorization import NMF
from .slope_one import SlopeOne
from .co_clustering import CoClustering
from .linear import Lasso
from .factorization_machines import FM

from .predictions import PredictionImpossible
from .predictions import Prediction

__all__ = ['AlgoBase', 'NormalPredictor', 'BaselineOnly', 'KNNBasic',
'KNNBaseline', 'KNNWithMeans', 'SVD', 'SVDpp', 'NMF', 'SlopeOne',
'CoClustering', 'PredictionImpossible', 'Prediction',
'KNNWithZScore']
'KNNWithZScore', 'Lasso', 'FM']
16 changes: 12 additions & 4 deletions surprise/prediction_algorithms/algo_base.py
@@ -37,7 +37,7 @@ def __init__(self, **kwargs):
self.skip_train = False

if (guf(self.__class__.fit) is guf(AlgoBase.fit) and
guf(self.__class__.train) is not guf(AlgoBase.train)):
guf(self.__class__.train) is not guf(AlgoBase.train)):
warnings.warn('It looks like this algorithm (' +
str(self.__class__) +
') implements train() '
@@ -96,7 +96,8 @@ def fit(self, trainset):

return self

def predict(self, uid, iid, r_ui=None, clip=True, verbose=False):
def predict(self, uid, iid, u_features=[], i_features=[], r_ui=None,
clip=True, verbose=False):
"""Compute the rating prediction for given user and item.

The ``predict`` method converts raw ids to inner ids and then calls the
@@ -108,6 +109,10 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False):
Args:
uid: (Raw) id of the user. See :ref:`this note<raw_inner_note>`.
iid: (Raw) id of the item. See :ref:`this note<raw_inner_note>`.
u_features: List of user features in the same order as used in
the ``fit`` method. Optional, default is ``[]``.
i_features: List of item features in the same order as used in
the ``fit`` method. Optional, default is ``[]``.
r_ui(float): The true rating :math:`r_{ui}`. Optional, default is
``None``.
clip(bool): Whether to clip the estimation into the rating scale.
@@ -143,7 +148,7 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False):

details = {}
try:
est = self.estimate(iuid, iiid)
est = self.estimate(iuid, iiid, u_features, i_features)

# If the details dict was also returned
if isinstance(est, tuple):
@@ -207,9 +212,12 @@ def test(self, testset, verbose=False):
# The ratings are translated back to their original scale.
predictions = [self.predict(uid,
iid,
u_features,
i_features,
r_ui_trans - self.trainset.offset,
verbose=verbose)
for (uid, iid, r_ui_trans) in testset]
for (uid, iid, u_features, i_features, r_ui_trans)
in testset]
return predictions
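
The new five-element testset entries are unpacked and forwarded to `predict` as sketched below, with a stub `predict` standing in for the real method (the returned tuple shape and the zero offset are simplifications for illustration):

```python
# Stub mirroring the new predict signature; a real AlgoBase.predict would
# convert raw ids to inner ids and call
# self.estimate(iuid, iiid, u_features, i_features) internally.
def predict(uid, iid, u_features, i_features, r_ui, verbose=False):
    est = 3.0  # stub estimate
    return (uid, iid, r_ui, est)

offset = 0  # hypothetical trainset offset
testset = [('u1', 'i9', [25, 1], [], 3.5)]

# Same list comprehension shape as in test() above.
predictions = [predict(uid, iid, u_features, i_features,
                       r_ui_trans - offset)
               for (uid, iid, u_features, i_features, r_ui_trans)
               in testset]
print(predictions[0])  # ('u1', 'i9', 3.5, 3.0)
```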

def compute_baselines(self):
6 changes: 3 additions & 3 deletions surprise/prediction_algorithms/baseline_only.py
@@ -22,10 +22,10 @@ class BaselineOnly(AlgoBase):
computation. See :ref:`baseline_estimates_configuration` for
accepted options.
verbose(bool): Whether to print trace messages of bias estimation,
similarity, etc. Default is True.
similarity, etc. Default is False.
"""

def __init__(self, bsl_options={}, verbose=True):
def __init__(self, bsl_options={}, verbose=False):

AlgoBase.__init__(self, bsl_options=bsl_options)
self.verbose = verbose
@@ -37,7 +37,7 @@ def fit(self, trainset):

return self

def estimate(self, u, i):
def estimate(self, u, i, *_):

est = self.trainset.global_mean
if self.trainset.knows_user(u):
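
The `estimate(self, u, i, *_)` signature change above lets algorithms that ignore features still accept the extra positional arguments now passed by `predict`; a minimal sketch with invented classes:

```python
# Invented classes illustrating the calling convention only.
class IgnoresFeatures:
    def estimate(self, u, i, *_):
        # Extra feature arguments are swallowed by *_ and discarded,
        # as in BaselineOnly or CoClustering.
        return 3.0

class UsesFeatures:
    def estimate(self, u, i, u_features, i_features):
        # A feature-aware algorithm consumes the same extra arguments.
        return 3.0 + 0.1 * len(u_features)

# Both are called with the identical four-argument form.
print(IgnoresFeatures().estimate(0, 1, [25, 1], []))  # 3.0
print(UsesFeatures().estimate(0, 1, [25, 1], []))     # 3.2
```

This keeps a single call site in `AlgoBase.predict` without requiring every existing algorithm to grow feature parameters.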
4 changes: 2 additions & 2 deletions surprise/prediction_algorithms/co_clustering.pyx
@@ -62,7 +62,7 @@ class CoClustering(AlgoBase):
self.n_cltr_u = n_cltr_u
self.n_cltr_i = n_cltr_i
self.n_epochs = n_epochs
self.verbose=verbose
self.verbose = verbose
self.random_state = random_state

def fit(self, trainset):
@@ -236,7 +236,7 @@

return avg_cltr_u, avg_cltr_i, avg_cocltr

def estimate(self, u, i):
def estimate(self, u, i, *_):

if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
return self.trainset.global_mean