import warnings
from typing import List, Tuple

import numpy as np
from scipy.spatial.distance import cityblock, euclidean, cosine, chebyshev, canberra, braycurtis
from scipy.stats import pearsonr, rankdata, spearmanr
from sklearn.metrics import pairwise_distances

from wordviz.loading import EmbeddingLoader
[docs]
def word_distance(loader: EmbeddingLoader, word1: str, word2: str, dist: str = 'cosine') -> float:
'''
Computes distance between two words given by user. Also supports sentence distance.
Parameters
-----------
loader: EmbeddingLoader
Object used to load embeddings
word1, word2: str
Word to compute distance between
dist: str, default='cosine'
Type of distance to use:
- 'braycurtis'
- 'canberra'
- 'chebyshev'
- 'cosine'
- 'dot'
- 'euclidean'
- 'manhattan'
- 'pearson'
- 'pearson'
Returns
--------
distance: float
'''
warnings.warn(
"The parameter names word1/word2 will be renamed to item1/item2 in a future release. "
"Please update your code accordingly.",
FutureWarning
)
words = loader.tokens
missing = [w for w in (word1, word2) if w not in words]
if missing:
raise ValueError(f"Word(s) not in vocabulary: {', '.join(missing)}")
vec1 = loader.get_embedding(word1)
vec2 = loader.get_embedding(word2)
emb_matrix = loader.embeddings
match dist:
case 'braycurtis':
distance = braycurtis(vec1, vec2)
case 'canberra':
distance = canberra(vec1, vec2)
case 'chebyshev':
distance = chebyshev(vec1, vec2)
case 'cosine':
distance = cosine(vec1, vec2)
case 'dot':
distance = -np.dot(vec1, vec2)
case 'euclidean':
distance = euclidean(vec1, vec2)
case 'manhattan':
distance = cityblock(vec1, vec2)
case 'pearson':
pearson_corr, _ = pearsonr(vec1, vec2)
distance = 1 - pearson_corr
case 'spearman':
spearman_corr, _ = spearmanr(vec1, vec2)
distance = 1 - spearman_corr
return distance
[docs]
def n_most_similar(loader: EmbeddingLoader, target_word: str, dist: str = 'cosine', n: int = 10) -> Tuple[List[str], np.ndarray, List[float]]:
    '''
    Finds the n vocabulary items closest to a target item under a given distance metric.

    The target itself is excluded from the candidates. Candidates are
    processed in batches, and for every batch only the distances from the
    target to the batch are computed (one row), never the full pairwise
    matrix of the batch.

    Parameters
    -----------
    loader : EmbeddingLoader
        Embedding loader; ``loader.tokens`` is used as the vocabulary and
        ``loader.get_embedding`` to fetch vectors.
    target_word : str
        The word for which to find the most similar neighbors.
    dist : str, default='cosine'
        The distance metric to use: 'euclidean', 'cosine', 'manhattan',
        'braycurtis', 'canberra', 'chebyshev', 'dot' (``1 - dot``),
        'pearson' or 'spearman' (``1 - correlation``). These follow the
        definitions used by ``compute_distances``.
    n : int, default=10
        The number of most similar words to retrieve. If the vocabulary
        holds fewer candidates, all of them are returned.

    Returns
    --------
    words : list of str
        The most similar words found, closest first.
    vectors : np.ndarray
        Embedding vectors corresponding to the most similar words.
    distances : list of float
        Distances from the target word to each of the most similar words.

    Raises
    -------
    ValueError
        If ``target_word`` is not in the vocabulary, ``n`` is negative, or
        ``dist`` is not a supported metric.

    Warns
    ------
    FutureWarning
        Emitted on every call to announce the upcoming parameter rename.
    '''
    warnings.warn(
        "The parameter names target_word will be renamed to target in a future release. "
        "Please update your code accordingly.",
        FutureWarning,
        stacklevel=2,  # attribute the warning to the caller, not to this module
    )
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    words = loader.tokens
    if target_word not in words:
        raise ValueError(f'{target_word} is not in vocabulary')

    target_vector = np.asarray(loader.get_embedding(target_word))
    target_index = words.index(target_word)
    # Drop only the target's own position; every other token is a candidate.
    candidates = [word for idx, word in enumerate(words) if idx != target_index]

    # Process in batches so that only one batch of vectors is held at a time.
    batch_size = 10000
    chunks = []
    for start in range(0, len(candidates), batch_size):
        batch_words = candidates[start:start + batch_size]
        batch_vectors = np.array([loader.get_embedding(word) for word in batch_words])
        chunks.append(_distances_to_target(target_vector, batch_vectors, dist))
    all_distances = np.concatenate(chunks) if chunks else np.empty(0)

    if n >= all_distances.shape[0]:
        nearest = np.argsort(all_distances)
    else:
        # Partial selection of the n smallest, then order just those n.
        # max(..., 0) keeps the pivot index valid when n == 0.
        nearest = np.argpartition(all_distances, max(n - 1, 0))[:n]
        nearest = nearest[np.argsort(all_distances[nearest])]

    result_words = [candidates[i] for i in nearest]
    result_distances = list(all_distances[nearest])
    result_vectors = np.array([loader.get_embedding(word) for word in result_words])
    return result_words, result_vectors, result_distances


def _distances_to_target(target, candidates, metric):
    '''Returns a 1-D array with the distance from ``target`` to each row of ``candidates``.'''
    if metric in ('euclidean', 'cosine', 'manhattan', 'braycurtis', 'canberra', 'chebyshev'):
        return pairwise_distances(target.reshape(1, -1), candidates, metric=metric)[0]
    if metric == 'dot':
        return 1 - candidates @ target
    if metric == 'pearson':
        return 1 - _correlation_with_target(target, candidates)
    if metric == 'spearman':
        # Spearman correlation is the Pearson correlation of the ranks.
        return 1 - _correlation_with_target(rankdata(target), rankdata(candidates, axis=1))
    raise ValueError(f"Unknown metric: {metric}")


def _correlation_with_target(target, candidates):
    '''Returns the Pearson correlation between ``target`` and each row of ``candidates``.'''
    target = np.asarray(target, dtype=float)
    candidates = np.asarray(candidates, dtype=float)
    centered_target = target - target.mean()
    centered_rows = candidates - candidates.mean(axis=1, keepdims=True)
    norms = np.linalg.norm(centered_rows, axis=1) * np.linalg.norm(centered_target)
    # A constant vector has zero norm, so its correlation comes out as nan.
    # The clip only removes rounding overshoot beyond [-1, 1].
    return np.clip((centered_rows @ centered_target) / norms, -1.0, 1.0)
[docs]
def compute_distances(X, metric='euclidean'):
    '''
    Computes the square matrix of pairwise distances between the rows of X.

    Parameters
    -----------
    X : array-like, 2-D
        One vector per row.
    metric : str, default='euclidean'
        Distance to use:
        - 'euclidean', 'cosine', 'manhattan', 'braycurtis', 'canberra',
          'chebyshev': delegated to ``sklearn.metrics.pairwise_distances``
        - 'dot': ``1 - X @ X.T``
        - 'pearson': ``1 - Pearson correlation`` between rows
        - 'spearman': ``1 - Spearman rank correlation`` between rows

    Returns
    --------
    distances : np.ndarray
        Matrix whose entry ``[i, j]`` is the distance between row ``i`` and
        row ``j`` of X.

    Raises
    -------
    ValueError
        If ``metric`` is not one of the supported names.

    Notes
    ------
    'dot' is defined here as ``1 - dot`` whereas ``word_distance`` returns
    ``-dot``; the two differ by a constant offset of 1.
    '''
    if metric in ('euclidean', 'cosine', 'manhattan', 'braycurtis', 'canberra', 'chebyshev'):
        return pairwise_distances(X, metric=metric)

    X = np.asarray(X)
    if metric == 'dot':
        return 1 - (X @ X.T)
    if metric == 'pearson':
        # np.corrcoef collapses to a scalar for a single row; atleast_2d
        # keeps the result a (1, 1) matrix in that case.
        return np.atleast_2d(1 - np.corrcoef(X))
    if metric == 'spearman':
        # Spearman correlation is the Pearson correlation of the per-row
        # ranks, so one vectorised corrcoef call replaces a pairwise loop.
        ranks = rankdata(X, axis=1)
        return np.atleast_2d(1 - np.corrcoef(ranks))
    raise ValueError(f"Unknown metric: {metric}")