
Commit 387ccd1

feat: Improve docstrings and variable naming (Pringled#3)

* Renamed alpha to lambda_param
* Updated docstrings
* Updated typing
* Renamed relevances to scores

1 parent 659cef9 commit 387ccd1

File tree

8 files changed: +117 -91 lines changed

src/pyversity/core.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -8,29 +8,29 @@
 
 def diversify(
     strategy: Strategy,
-    relevances: np.ndarray,
     embeddings: np.ndarray,
+    scores: np.ndarray,
     k: int,
     **kwargs: Any,
 ) -> tuple[np.ndarray, np.ndarray]:
     """
     Diversify a retrieval result using a selected strategy.
 
     :param strategy: The diversification strategy to apply. Supported strategies are: MMR, MSD, COVER, and DPP.
-    :param relevances: Array of relevance scores for the items.
     :param embeddings: Array of embeddings for the items.
+    :param scores: Array of relevance scores for the items.
     :param k: The number of items to select in the diversified result.
     :param **kwargs: Additional keyword arguments passed to the specific strategy function.
     :return: A tuple containing an array of indices of the selected items
         and an array of corresponding relevance scores for the selected items.
     :raises ValueError: If the provided strategy is not recognized.
     """
     if strategy == Strategy.MMR:
-        return mmr(relevances, embeddings, k, **kwargs)
+        return mmr(scores, embeddings, k, **kwargs)
     if strategy == Strategy.MSD:
-        return msd(relevances, embeddings, k, **kwargs)
+        return msd(scores, embeddings, k, **kwargs)
     if strategy == Strategy.COVER:
-        return cover(relevances, embeddings, k, **kwargs)
+        return cover(scores, embeddings, k, **kwargs)
     if strategy == Strategy.DPP:
-        return dpp(relevances, embeddings, k, **kwargs)
+        return dpp(scores, embeddings, k, **kwargs)
     raise ValueError(f"Unknown strategy: {strategy}")
```

src/pyversity/datatypes.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -2,12 +2,16 @@
 
 
 class Strategy(str, Enum):
+    """Supported diversification strategies."""
+
     MMR = "mmr"
     MSD = "msd"
     COVER = "cover"
     DPP = "dpp"
 
 
 class Metric(str, Enum):
+    """Supported similarity metrics."""
+
     COSINE = "cosine"
     DOT = "dot"
```

src/pyversity/strategies/cover.py

Lines changed: 24 additions & 16 deletions
```diff
@@ -5,8 +5,8 @@
 
 
 def cover(
-    relevances: np.ndarray,
     embeddings: np.ndarray,
+    scores: np.ndarray,
     k: int,
     theta: float = 0.5,
     gamma: float = 0.5,
@@ -19,8 +19,8 @@ def cover(
     This strategy chooses `k` items by combining pure relevance with
     diversity-driven coverage using a concave submodular formulation.
 
-    :param relevances: 1D array of relevance scores for each item.
     :param embeddings: 2D array of shape (n_samples, n_features).
+    :param scores: 1D array of relevance scores for each item.
     :param k: Number of items to select.
     :param theta: Trade-off between relevance and coverage in [0, 1].
         1.0 = pure relevance, 0.0 = pure coverage.
@@ -31,47 +31,55 @@
     :raises ValueError: If theta is not in [0, 1].
     :raises ValueError: If gamma is not in (0, 1].
     """
+    # Validate parameters
     if not (0.0 <= float(theta) <= 1.0):
         raise ValueError("theta must be in [0, 1]")
     if not (0.0 < float(gamma) <= 1.0):
         raise ValueError("gamma must be in (0, 1]")
 
-    relevance_scores, feature_matrix, top_k, early_exit = prepare_inputs(relevances, embeddings, k)
+    # Prepare inputs
+    relevance_scores, feature_matrix, top_k, early_exit = prepare_inputs(scores, embeddings, k)
     if early_exit:
+        # Nothing to select: return empty arrays
         return np.empty(0, np.int32), np.empty(0, np.float32)
 
     if metric == Metric.COSINE and normalize:
+        # Normalize feature vectors to unit length for cosine similarity
         feature_matrix = normalize_rows(feature_matrix)
 
-    # Pure relevance: short-circuit
     if float(theta) == 1.0:
+        # Pure relevance: select top-k by relevance scores
         topk = np.argsort(-relevance_scores)[:top_k].astype(np.int32)
         gains = relevance_scores[topk].astype(np.float32, copy=False)
         return topk, gains
 
-    # Nonnegative similarities for coverage to avoid concave-power NaNs
+    # Compute non-negative similarities for coverage to avoid concave-power NaNs
     similarity_matrix = pairwise_similarity(feature_matrix, metric)
-    transposed_similarity = similarity_matrix.T
+    transposed_similarity_matrix = similarity_matrix.T
 
-    n = similarity_matrix.shape[0]
-    accumulated_coverage = np.zeros(n, dtype=np.float32)
-    selected_mask = np.zeros(n, dtype=bool)
+    # Initialize selection state
+    accumulated_coverage = np.zeros(similarity_matrix.shape[0], dtype=np.float32)
+    selected_mask = np.zeros(similarity_matrix.shape[0], dtype=bool)
     selected_indices = np.empty(top_k, dtype=np.int32)
     marginal_gains = np.empty(top_k, dtype=np.float32)
 
-    for t in range(top_k):
+    for step in range(top_k):
+        # Compute coverage gains using concave transformation
         concave_before = np.power(accumulated_coverage, gamma)
-        concave_after = np.power(transposed_similarity + accumulated_coverage[None, :], gamma)
+        concave_after = np.power(transposed_similarity_matrix + accumulated_coverage[None, :], gamma)
         coverage_gains = (concave_after - concave_before[None, :]).sum(axis=1)
 
+        # Combine relevance and coverage gains
         candidate_scores = theta * relevance_scores + (1.0 - theta) * coverage_gains
         candidate_scores[selected_mask] = -np.inf
 
-        chosen = int(np.argmax(candidate_scores))
-        selected_indices[t] = chosen
-        marginal_gains[t] = float(candidate_scores[chosen])
-        selected_mask[chosen] = True
+        # Select item with highest combined score
+        best_index = int(np.argmax(candidate_scores))
+        selected_indices[step] = best_index
+        marginal_gains[step] = float(candidate_scores[best_index])
+        selected_mask[best_index] = True
 
-        accumulated_coverage += similarity_matrix[:, chosen]
+        # Update accumulated coverage
+        accumulated_coverage += similarity_matrix[:, best_index]
 
     return selected_indices, marginal_gains
```
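For readers skimming the hunk above: telescoping the `concave_after - concave_before` gains shows which set function the greedy loop maximizes. The following is inferred from the code (and assumes a symmetric similarity matrix), not stated in the diff:

```latex
% Set function implied by the greedy loop: S = selected set, r_i = scores,
% s_{ij} = pairwise similarity, theta = relevance/coverage trade-off,
% gamma in (0, 1] = concave exponent.
F(S) = \theta \sum_{i \in S} r_i
     + (1 - \theta) \sum_{j=1}^{n} \Big( \sum_{i \in S} s_{ij} \Big)^{\gamma}
```

The concave power gives diminishing returns: once an item j is already well covered, further similar picks add little, which keeps F(S) submodular and makes each `best_index` a marginal-gain argmax.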

src/pyversity/strategies/dpp.py

Lines changed: 28 additions & 19 deletions
```diff
@@ -12,8 +12,8 @@ def _exp_zscore_weights(relevance: np.ndarray, beta: float) -> np.ndarray:
 
 
 def dpp(
-    relevances: np.ndarray,
     embeddings: np.ndarray,
+    scores: np.ndarray,
     k: int,
     beta: float = 1.0,
 ) -> tuple[np.ndarray, np.ndarray]:
@@ -24,58 +24,67 @@
     maximizing the determinant of a kernel matrix that balances item relevance
     and pairwise similarity.
 
-    :param relevances: 1D array of relevance scores for each item.
     :param embeddings: 2D array of shape (n_samples, n_features).
+    :param scores: 1D array of relevance scores for each item.
     :param k: Number of items to select.
     :param beta: Controls the influence of relevance scores in the DPP kernel.
         Higher values increase the emphasis on relevance.
     :return: Tuple of selected indices and their marginal gains.
     """
-    relevance_scores, feature_matrix, top_k, early_exit = prepare_inputs(relevances, embeddings, k)
+    # Prepare inputs
+    relevance_scores, feature_matrix, top_k, early_exit = prepare_inputs(scores, embeddings, k)
     if early_exit:
+        # Nothing to select: return empty arrays
         return np.empty(0, np.int32), np.empty(0, np.float32)
 
+    # Normalize feature vectors to unit length for cosine similarity
     feature_matrix = normalize_rows(feature_matrix)
 
     num_items = feature_matrix.shape[0]
     weights = _exp_zscore_weights(relevance_scores, beta)
 
-    # Diagonal of L plus jitter is the initial residual variance.
+    # Initial residual variance is the weighted self-similarity
     residual_variance = (weights * weights + float(EPS32)).astype(np.float32, copy=False)
 
-    # Columns will store orthogonalized update components.
+    # Initialize selection state
     component_matrix = np.zeros((num_items, top_k), dtype=np.float32)
-
     selected_indices = np.empty(top_k, dtype=np.int32)
     marginal_gains = np.empty(top_k, dtype=np.float32)
     selected_mask = np.zeros(num_items, dtype=bool)
 
-    t = 0
-    for t in range(top_k):
+    step = 0
+    for step in range(top_k):
+        # Select item with highest residual variance
         residual_variance[selected_mask] = -np.inf
         best_index = int(np.argmax(residual_variance))
-        best_gain = float(residual_variance[best_index])
+        best_score = float(residual_variance[best_index])
 
-        selected_indices[t] = best_index
-        marginal_gains[t] = best_gain
+        selected_indices[step] = best_index
+        marginal_gains[step] = best_score
         selected_mask[best_index] = True
 
-        if t == top_k - 1 or best_gain <= 0.0:
-            t += 1
+        if step == top_k - 1 or best_score <= 0.0:
+            # No more items to select or no positive gain
+            step += 1
             break
 
+        # Update residual variance using the new component
         weighted_similarity_to_best = (weights * (feature_matrix @ feature_matrix[best_index])) * weights[best_index]
 
-        if t > 0:
-            projected_component: np.ndarray = component_matrix[:, :t] @ component_matrix[best_index, :t]
+        if step > 0:
+            # Project out the component in the span of previously selected items
+            projected_component: np.ndarray = component_matrix[:, :step] @ component_matrix[best_index, :step]
         else:
+            # No previous components, so projection is zero
            projected_component = np.zeros(num_items, dtype=np.float32)
 
-        sqrt_best_gain = np.float32(np.sqrt(best_gain))
-        update_component = (weighted_similarity_to_best - projected_component) / (sqrt_best_gain + EPS32)
+        # Compute update component
+        sqrt_best_score = np.float32(np.sqrt(best_score))
+        update_component = (weighted_similarity_to_best - projected_component) / (sqrt_best_score + EPS32)
 
-        component_matrix[:, t] = update_component
+        # Update component matrix and residual variance
+        component_matrix[:, step] = update_component
         residual_variance -= update_component * update_component
         np.maximum(residual_variance, 0.0, out=residual_variance)
 
-    return selected_indices[:t], marginal_gains[:t]
+    return selected_indices[:step], marginal_gains[:step]
```
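The loop above reads as the standard greedy MAP routine for a determinantal point process, built on incremental Cholesky-style updates. A sketch of the kernel and update it appears to implement (the weighting comes from `_exp_zscore_weights`, whose body is not in this diff):

```latex
% Kernel implied by the code: w_i = relevance-derived weights,
% x_i = unit-normalized embedding rows.
L_{ij} = w_i \, (x_i^{\top} x_j) \, w_j
% Greedy step t: pick i^* maximizing the residual variance
d_i^{2} = L_{ii} - \sum_{t' < t} c_{i t'}^{2},
% then store the orthogonalized column used to deflate the remaining items:
c_{\cdot\, t} = \frac{L_{\cdot\, i^*} - C_{\cdot,\, <t} \, C_{i^*,\, <t}^{\top}}{d_{i^*}}
```

In exact arithmetic `best_score` equals d_{i*}^2, the determinant ratio det(L_{S ∪ {i*}}) / det(L_S), so `marginal_gains` records how much each pick grows the kernel determinant.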

src/pyversity/strategies/mmr.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -5,10 +5,10 @@
 
 
 def mmr(
-    relevances: np.ndarray,
     embeddings: np.ndarray,
+    scores: np.ndarray,
     k: int,
-    alpha: float = 0.5,
+    lambda_param: float = 0.5,
     metric: Metric = Metric.COSINE,
     normalize: bool = True,
 ) -> tuple[np.ndarray, np.ndarray]:
@@ -19,21 +19,21 @@
     iteratively choosing items that maximize a combination of their relevance
     and their dissimilarity to already selected items.
 
-    :param relevances: 1D array of relevance scores for each item.
     :param embeddings: 2D array of shape (n_samples, n_features).
+    :param scores: 1D array of relevance scores for each item.
     :param k: Number of items to select.
-    :param alpha: Trade-off parameter in [0, 1].
+    :param lambda_param: Trade-off parameter in [0, 1].
         1.0 = pure relevance, 0.0 = pure diversity.
     :param metric: Similarity metric to use. Default is Metric.COSINE.
     :param normalize: Whether to normalize embeddings before computing similarity.
     :return: Tuple of selected indices and their marginal gains.
     """
     return greedy_select(
         "mmr",
-        relevances,
+        scores,
         embeddings,
         k,
         metric=metric,
         normalize=normalize,
-        alpha=alpha,
+        lambda_param=lambda_param,
     )
```
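The `alpha` → `lambda_param` rename lines up with the λ in the classic MMR criterion (Carbonell & Goldstein, 1998). The actual marginal gain is computed inside `greedy_select`, which this diff does not show, so the textbook form below is a reference point rather than a quote of the code:

```latex
% Classic MMR: balance relevance against similarity to the selected set S.
\operatorname*{argmax}_{i \notin S} \Big[ \lambda\, r_i
    - (1 - \lambda) \max_{j \in S} \mathrm{sim}(x_i, x_j) \Big]
```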

src/pyversity/strategies/msd.py

Lines changed: 7 additions & 6 deletions
```diff
@@ -5,10 +5,10 @@
 
 
 def msd(
-    relevances: np.ndarray,
     embeddings: np.ndarray,
+    scores: np.ndarray,
     k: int,
-    alpha: float = 0.5,
+    lambda_param: float = 0.5,
     metric: Metric = Metric.COSINE,
     normalize: bool = True,
 ) -> tuple[np.ndarray, np.ndarray]:
@@ -18,10 +18,11 @@ def msd(
     This strategy selects `k` items that balance relevance and diversity by
     iteratively choosing items that maximize a combination of their relevance
     and their total distance to already selected items.
-    :param relevances: 1D array of relevance scores for each item.
+
     :param embeddings: 2D array of shape (n_samples, n_features).
+    :param scores: 1D array of relevance scores for each item.
     :param k: Number of items to select.
-    :param alpha: Trade-off parameter in [0, 1].
+    :param lambda_param: Trade-off parameter in [0, 1].
         1.0 = pure relevance, 0.0 = pure diversity.
 
     :param metric: Similarity metric to use. Default is Metric.COSINE.
@@ -30,10 +31,10 @@ def msd(
     """
     return greedy_select(
         "msd",
-        relevances,
+        scores,
         embeddings,
         k,
         metric=metric,
         normalize=normalize,
-        alpha=alpha,
+        lambda_param=lambda_param,
     )
```
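By contrast with MMR, Max-Sum Diversification rewards the total distance to everything already selected rather than penalizing only the nearest neighbor. Again, the exact gain lives in `greedy_select` (not shown); the textbook criterion the docstring describes is:

```latex
% Max-sum diversification: total distance to the selected set S.
\operatorname*{argmax}_{i \notin S} \Big[ \lambda\, r_i
    + (1 - \lambda) \sum_{j \in S} \mathrm{dist}(x_i, x_j) \Big]
```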
