diff --git a/setup.py b/setup.py index b37582f1..d99610a4 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ name='spotlight', version=version, packages=find_packages(), + install_requires=['dynarray'], license='MIT', classifiers=['Development Status :: 3 - Alpha', 'License :: OSI Approved :: MIT License', diff --git a/spotlight/evaluation.py b/spotlight/evaluation.py index 54909e82..cd9693e4 100644 --- a/spotlight/evaluation.py +++ b/spotlight/evaluation.py @@ -83,19 +83,22 @@ def sequence_mrr_score(model, test, exclude_preceding=False): Array of MRR scores for each sequence in test. """ - sequences = test.sequences[:, :-1] - targets = test.sequences[:, -1:] - mrrs = [] - for i in range(len(sequences)): + for sequence in test: + + subsequence = sequence[:-1] + target = sequence[-1:] + + if not len(subsequence): + continue - predictions = -model.predict(sequences[i]) + predictions = -model.predict(subsequence) if exclude_preceding: - predictions[sequences[i]] = FLOAT_MAX + predictions[subsequence] = FLOAT_MAX - mrr = (1.0 / st.rankdata(predictions)[targets[i]]).mean() + mrr = (1.0 / st.rankdata(predictions)[target]).mean() mrrs.append(mrr) diff --git a/spotlight/interactions.py b/spotlight/interactions.py index 25b9f2fc..f8f5584c 100644 --- a/spotlight/interactions.py +++ b/spotlight/interactions.py @@ -7,6 +7,12 @@ import scipy.sparse as sp +import torch + +from dynarray import DynamicArray + +from spotlight.torch_utils import gpu + def _sliding_window(tensor, window_size, step_size=1): @@ -14,10 +20,28 @@ def _sliding_window(tensor, window_size, step_size=1): yield tensor[max(i - window_size, 0):i] -def _generate_sequences(user_ids, item_ids, - indices, +def _generate_sequences(interactions, max_sequence_length, - step_size): + min_sequence_length=1, + step_size=1): + + if interactions.timestamps is None: + raise ValueError('Cannot convert to sequences, ' + 'timestamps not available.') + + if step_size is None: + step_size = max_sequence_length + + # Sort 
first by user id, then by timestamp + sort_indices = np.lexsort((interactions.timestamps, + interactions.user_ids)) + + user_ids = interactions.user_ids[sort_indices] + item_ids = interactions.item_ids[sort_indices] + + user_ids, indices, counts = np.unique(user_ids, + return_index=True, + return_counts=True) for i in range(len(indices)): @@ -32,9 +56,30 @@ max_sequence_length, step_size): + if len(seq) < min_sequence_length: + continue + yield (user_ids[i], seq) +def _pack_sequences(sequences): + + packed = {} + + for user_id, sequence in sequences: + + (packed.setdefault(len(sequence), + DynamicArray((None, + len(sequence)), + dtype=np.int64)) + .append(sequence)) + + for value in packed.values(): + value.shrink_to_fit() + + return {key: value[:] for (key, value) in packed.items()} + + class Interactions(object): """ Interactions object. Contains (at a minimum) pair of user-item @@ -167,45 +212,44 @@ def tocsr(self): return self.tocoo().tocsr() - def to_sequence(self, max_sequence_length=10, min_sequence_length=None, step_size=None): + def to_sequence(self, max_sequence_length=10, min_sequence_length=1, step_size=None): """ Transform to sequence form. User-item interaction pairs are sorted by their timestamps, and sequences of up to max_sequence_length events are arranged - into a (zero-padded from the left) matrix with dimensions - (num_sequences x max_sequence_length). + are returned. The returned sequences are not padded, + taking advantage of PyTorch's flexibility. Valid subsequences of users' interactions are returned. For example, if a user interacted with items [1, 2, 3, 4, 5], the - returned interactions matrix at sequence length 5 and step size + returned interactions set at sequence length 5 and step size 1 will be be given by: .. code-block:: python [[1, 2, 3, 4, 5], - [0, 1, 2, 3, 4], - [0, 0, 1, 2, 3], - [0, 0, 0, 1, 2], - [0, 0, 0, 0, 1]] + [1, 2, 3, 4], + [1, 2, 3], + [1, 2], + [1]] At step size 2: .. 
code-block:: python [[1, 2, 3, 4, 5], - [0, 0, 1, 2, 3], - [0, 0, 0, 0, 1]] + [1, 2, 3], + [1]] Parameters ---------- max_sequence_length: int, optional - Maximum sequence length. Subsequences shorter than this - will be left-padded with zeros. + Maximum sequence length. min_sequence_length: int, optional If set, only sequences with at least min_sequence_length - non-padding elements will be returned. + elements will be returned. step-size: int, optional The returned subsequences are the effect of moving a a sliding window over the input. This parameter @@ -219,63 +263,32 @@ def to_sequence(self, max_sequence_length=10, min_sequence_length=None, step_siz The resulting sequence interactions. """ - if self.timestamps is None: - raise ValueError('Cannot convert to sequences, ' - 'timestamps not available.') - if 0 in self.item_ids: raise ValueError('0 is used as an item id, conflicting ' 'with the sequence padding value.') - if step_size is None: - step_size = max_sequence_length - - # Sort first by user id, then by timestamp - sort_indices = np.lexsort((self.timestamps, - self.user_ids)) - - user_ids = self.user_ids[sort_indices] - item_ids = self.item_ids[sort_indices] - - user_ids, indices, counts = np.unique(user_ids, - return_index=True, - return_counts=True) - - num_subsequences = int(np.ceil(counts / float(step_size)).sum()) - - sequences = np.zeros((num_subsequences, max_sequence_length), - dtype=np.int32) - sequence_users = np.empty(num_subsequences, - dtype=np.int32) - for i, (uid, - seq) in enumerate(_generate_sequences(user_ids, - item_ids, - indices, - max_sequence_length, - step_size)): - sequences[i][-len(seq):] = seq - sequence_users[i] = uid - - if min_sequence_length is not None: - long_enough = sequences[:, -min_sequence_length] != 0 - sequences = sequences[long_enough] - sequence_users = sequence_users[long_enough] + sequences = _pack_sequences( + _generate_sequences(self, + max_sequence_length, + min_sequence_length, + step_size) + ) return 
(SequenceInteractions(sequences, - user_ids=sequence_users, num_items=self.num_items)) class SequenceInteractions(object): """ - Interactions encoded as a sequence matrix. + Interactions encoded as sequences. This object is not normally constructed + directly, but rather returned from :func:`~Interactions.to_sequence`. Parameters ---------- - sequences: array of np.int32 of shape (num_sequences x max_sequence_length) - The interactions sequence matrix, as produced by - :func:`~Interactions.to_sequence` + sequences: dict of np.int64 arrays of shape (num_sequences x sequence_length) + The interactions sequence, dictionary grouping all + subsequences into matrices by their length. num_items: int, optional The number of distinct items in the data @@ -289,24 +302,80 @@ class Interactions(object): def __init__(self, sequences, - user_ids=None, num_items=None): + num_items=None): self.sequences = sequences - self.user_ids = user_ids - self.max_sequence_length = sequences.shape[1] + + self._num_sequences = sum(x.shape[0] for x in self.sequences.values()) + self._max_sequence_length = max(x.shape[1] for x in self.sequences.values()) if num_items is None: - self.num_items = sequences.max() + 1 + self.num_items = max(x.max() + 1 for x in self.sequences.values()) else: self.num_items = num_items def __repr__(self): - num_sequences, sequence_length = self.sequences.shape - return ('' + 'sequences x {sequence_length} max sequence length)>' .format( - num_sequences=num_sequences, - sequence_length=sequence_length, + num_sequences=self._num_sequences, + sequence_length=self._max_sequence_length, )) + + def __iter__(self): + + for value in self.sequences.values(): + for row in value: + yield row + + def minibatch(self, batch_size, random_state=None, use_cuda=False, only_full_batches=True): + """ + Iterate over minibatches of the dataset. Each minibatch has the same sequence length, + but the lengths of each minibatch can differ to avoid padding. 
Minibatch order as + well as sequences within a minibatch are shuffled on each epoch. + + Parameters + ---------- + + batch_size: int + The size of each minibatch. Minibatches smaller than this will not be emitted + if `only_full_batches` is `True` (default). + random_state: instance of numpy.random.RandomState, optional + Random generator to use when returning the minibatches. + use_cuda: bool, optional + Whether to send the minibatch data to the GPU. + only_full_batches: bool, optional + If `True`, minibatches smaller than `batch_size` are omitted. This is helpful + if loss values are averaged across minibatch. In that case, minibatches with + few examples would have a disproportionately large effect on the gradients. + """ + + if random_state is None: + random_state = np.random.RandomState() + + # Shuffle the sequences within blocks of same length. + for value in self.sequences.values(): + random_state.shuffle(value) + + # Build a list of minibatches to execute that + # randomly alternates between the lengths. + minibatches = [] + + for key, value in self.sequences.items(): + for i in range(0, len(value), batch_size): + minibatches.append([key, i, i + batch_size]) + + minibatches = np.array(minibatches, dtype=np.int64) + random_state.shuffle(minibatches) + + # Convert to tensors (and possibly transfer to GPU). 
+ tensor_data = {k: gpu(torch.from_numpy(v), use_cuda) for (k, v) in self.sequences.items()} + + for (key, start, stop) in minibatches: + btch = tensor_data[key][start:stop] + + if len(btch) != batch_size: + continue + + yield btch diff --git a/spotlight/sequence/implicit.py b/spotlight/sequence/implicit.py index ba0d2171..86f6414c 100644 --- a/spotlight/sequence/implicit.py +++ b/spotlight/sequence/implicit.py @@ -12,13 +12,14 @@ from torch.autograd import Variable from spotlight.helpers import _repr_model +from spotlight.interactions import SequenceInteractions from spotlight.losses import (adaptive_hinge_loss, bpr_loss, hinge_loss, pointwise_loss) from spotlight.sequence.representations import PADDING_IDX, CNNNet, LSTMNet, PoolNet from spotlight.sampling import sample_items -from spotlight.torch_utils import cpu, gpu, minibatch, set_seed, shuffle +from spotlight.torch_utils import cpu, gpu, set_seed class ImplicitSequenceModel(object): @@ -70,7 +71,7 @@ class ImplicitSequenceModel(object): .. code-block:: python [[1, 2, 3, 4, 5], - [0, 0, 7, 1, 4]] + [7, 1, 4]] In this case, the loss for the first example will be the mean loss @@ -175,7 +176,9 @@ def _initialize(self, interactions): def _check_input(self, item_ids): - if isinstance(item_ids, int): + if isinstance(item_ids, SequenceInteractions): + item_id_max = item_ids.num_items - 1 + elif isinstance(item_ids, int): item_id_max = item_ids else: item_id_max = item_ids.max() @@ -199,25 +202,19 @@ def fit(self, interactions, verbose=False): The input sequence dataset. 
""" - sequences = interactions.sequences.astype(np.int64) - if not self._initialized: self._initialize(interactions) - self._check_input(sequences) + self._check_input(interactions) for epoch_num in range(self._n_iter): - sequences = shuffle(sequences, - random_state=self._random_state) - - sequences_tensor = gpu(torch.from_numpy(sequences), - self._use_cuda) - epoch_loss = 0.0 - for minibatch_num, batch_sequence in enumerate(minibatch(sequences_tensor, - batch_size=self._batch_size)): + for (minibatch_num, + batch_sequence) in enumerate(interactions + .minibatch(batch_size=self._batch_size, + random_state=self._random_state)): sequence_var = Variable(batch_sequence) diff --git a/spotlight/torch_utils.py b/spotlight/torch_utils.py index f425fa30..287204b0 100644 --- a/spotlight/torch_utils.py +++ b/spotlight/torch_utils.py @@ -32,6 +32,19 @@ def minibatch(*tensors, **kwargs): yield tuple(x[i:i + batch_size] for x in tensors) +def minibatch_indices(*tensors, **kwargs): + + batch_size = kwargs.get('batch_size', 128) + + if len(tensors) == 1: + tensor = tensors[0] + for i in range(0, len(tensor), batch_size): + yield tensor[i:i + batch_size] + else: + for i in range(0, len(tensors[0]), batch_size): + yield tuple(x[i:i + batch_size] for x in tensors) + + def shuffle(*arrays, **kwargs): random_state = kwargs.get('random_state') diff --git a/tests/sequence/test_sequence_implicit.py b/tests/sequence/test_sequence_implicit.py index 378c1cc8..d0d2ea79 100644 --- a/tests/sequence/test_sequence_implicit.py +++ b/tests/sequence/test_sequence_implicit.py @@ -103,7 +103,7 @@ def test_implicit_lstm_synthetic(randomness, expected_mrr): embedding_dim=EMBEDDING_DIM, learning_rate=1e-2, l2=1e-7, - n_iter=NUM_EPOCHS * 5, + n_iter=NUM_EPOCHS * 10, random_state=random_state, use_cuda=CUDA) @@ -204,7 +204,7 @@ def test_implicit_pooling_losses(loss, expected_mrr): @pytest.mark.parametrize('compression_ratio, expected_mrr', [ (0.2, 0.14), (0.5, 0.30), - (1.0, 0.5), + (1.0, 0.48), ]) def 
test_bloom_cnn(compression_ratio, expected_mrr): diff --git a/tests/test_interactions.py b/tests/test_interactions.py index 70f13610..5ca21e75 100644 --- a/tests/test_interactions.py +++ b/tests/test_interactions.py @@ -4,59 +4,39 @@ from spotlight.cross_validation import random_train_test_split from spotlight.datasets import movielens -from spotlight.interactions import Interactions +from spotlight.interactions import Interactions, _generate_sequences -def _test_just_padding(sequences): - """ - There should be no rows with only padding in them. - """ - - row_sum = sequences.sum(axis=1) - - assert len(row_sum) == sequences.shape[0] - assert np.all(row_sum > 0) - - -def _test_final_column_no_padding(sequences): - """ - The final column should always have an interaction. - """ - - assert np.all(sequences[:, -1] > 0) - - -def _test_shifted(sequence_users, sequences, step_size): +def _test_shifted(sequences, step_size): """ Unless there was a change of user, row i + 1's interactions should contain row i's interactions shifted to the right by step size. 
""" - for i in range(1, len(sequences)): + previous_uid = None + previous_sequence = None - if sequence_users[i] != sequence_users[i - 1]: - # Change of user - continue + for user_id, sequence in sequences: + if previous_uid == user_id: + assert (np.all(sequence[-len(previous_sequence) + step_size:] == + previous_sequence[:-step_size])) - assert np.all(sequences[i][step_size:] == sequences[i - 1][:-step_size]) + previous_uid = user_id + previous_sequence = sequence -def _test_temporal_order(sequence_users, sequences, interactions): +def _test_temporal_order(sequences, interactions): interaction_matrix = interactions.tocoo() interaction_matrix.data = interactions.timestamps interaction_matrix = interaction_matrix.tocsr().todense() - for i, sequence in enumerate(sequences): - - user_id = sequence_users[i] - nonpadded_sequence = sequence[sequence != 0] + for user_id, sequence in sequences: + for j in range(0, len(sequence) - 1): + item_id = sequence[j] - for j in range(0, len(nonpadded_sequence) - 1): - item_id = nonpadded_sequence[j] - - next_item_id = nonpadded_sequence[j + 1] + next_item_id = sequence[j + 1] item_timestamp = interaction_matrix[user_id, item_id] next_item_timestamp = interaction_matrix[user_id, next_item_id] @@ -69,18 +49,19 @@ def test_known_output_step_1(): interactions = Interactions(np.zeros(5), np.arange(5) + 1, timestamps=np.arange(5)) - sequences = interactions.to_sequence(max_sequence_length=5, - step_size=1).sequences + sequences = list(v.tolist() for (_, v) in _generate_sequences(interactions, + max_sequence_length=5, + step_size=1)) - expected = np.array([ + expected = [ [1, 2, 3, 4, 5], - [0, 1, 2, 3, 4], - [0, 0, 1, 2, 3], - [0, 0, 0, 1, 2], - [0, 0, 0, 0, 1] - ]) + [1, 2, 3, 4], + [1, 2, 3], + [1, 2], + [1] + ] - assert np.all(sequences == expected) + assert sequences == expected def test_known_output_step_2(): @@ -88,16 +69,17 @@ def test_known_output_step_2(): interactions = Interactions(np.zeros(5), np.arange(5) + 1, 
timestamps=np.arange(5)) - sequences = interactions.to_sequence(max_sequence_length=5, - step_size=2).sequences + sequences = list(v.tolist() for (_, v) in _generate_sequences(interactions, + max_sequence_length=5, + step_size=2)) - expected = np.array([ + expected = [ [1, 2, 3, 4, 5], - [0, 0, 1, 2, 3], - [0, 0, 0, 0, 1], - ]) + [1, 2, 3], + [1], + ] - assert np.all(sequences == expected) + assert sequences == expected @pytest.mark.parametrize('max_sequence_length, step_size', [ @@ -113,23 +95,15 @@ def test_to_sequence(max_sequence_length, step_size): interactions = movielens.get_movielens_dataset('100K') _, interactions = random_train_test_split(interactions) - sequences = interactions.to_sequence( - max_sequence_length=max_sequence_length, - step_size=step_size) - - if step_size == 1: - assert sequences.sequences.shape == (len(interactions), - max_sequence_length) - else: - assert sequences.sequences.shape[1] == max_sequence_length + def seqs(): + return _generate_sequences( + interactions, + max_sequence_length=max_sequence_length, + step_size=step_size) - _test_just_padding(sequences.sequences) - _test_final_column_no_padding(sequences.sequences) - _test_shifted(sequences.user_ids, - sequences.sequences, + _test_shifted(seqs(), step_size) - _test_temporal_order(sequences.user_ids, - sequences.sequences, + _test_temporal_order(seqs(), interactions) @@ -138,12 +112,16 @@ def test_to_sequence_min_length(): min_sequence_length = 10 interactions = movielens.get_movielens_dataset('100K') + def seqs(min_sequence_length): + return _generate_sequences( + interactions, + max_sequence_length=10, + min_sequence_length=min_sequence_length, + step_size=1) + # Check that with default arguments there are sequences # that are shorter than we want - sequences = interactions.to_sequence(max_sequence_length=20) - assert np.any((sequences.sequences != 0).sum(axis=1) < min_sequence_length) + assert any(len(v) < min_sequence_length for (_, v) in seqs(min_sequence_length=1)) # But 
no such sequences after we specify min length. - sequences = interactions.to_sequence(max_sequence_length=20, - min_sequence_length=min_sequence_length) - assert not np.any((sequences.sequences != 0).sum(axis=1) < min_sequence_length) + assert not any(len(v) < min_sequence_length for (_, v) in seqs(min_sequence_length=20))