Source code for wordviz.similarity

import numpy as np
from scipy.spatial.distance import cityblock, euclidean, cosine, chebyshev, canberra, braycurtis
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import pairwise_distances
from typing import List, Tuple
import warnings
from wordviz.loading import EmbeddingLoader

[docs] def word_distance(loader: EmbeddingLoader, word1: str, word2: str, dist: str = 'cosine') -> float: ''' Computes distance between two words given by user. Also supports sentence distance. Parameters ----------- loader: EmbeddingLoader Object used to load embeddings word1, word2: str Word to compute distance between dist: str, default='cosine' Type of distance to use: - 'braycurtis' - 'canberra' - 'chebyshev' - 'cosine' - 'dot' - 'euclidean' - 'manhattan' - 'pearson' - 'pearson' Returns -------- distance: float ''' warnings.warn( "The parameter names word1/word2 will be renamed to item1/item2 in a future release. " "Please update your code accordingly.", FutureWarning ) words = loader.tokens missing = [w for w in (word1, word2) if w not in words] if missing: raise ValueError(f"Word(s) not in vocabulary: {', '.join(missing)}") vec1 = loader.get_embedding(word1) vec2 = loader.get_embedding(word2) emb_matrix = loader.embeddings match dist: case 'braycurtis': distance = braycurtis(vec1, vec2) case 'canberra': distance = canberra(vec1, vec2) case 'chebyshev': distance = chebyshev(vec1, vec2) case 'cosine': distance = cosine(vec1, vec2) case 'dot': distance = -np.dot(vec1, vec2) case 'euclidean': distance = euclidean(vec1, vec2) case 'manhattan': distance = cityblock(vec1, vec2) case 'pearson': pearson_corr, _ = pearsonr(vec1, vec2) distance = 1 - pearson_corr case 'spearman': spearman_corr, _ = spearmanr(vec1, vec2) distance = 1 - spearman_corr return distance
def _distances_to_target(target_vector, batch_vectors, metric):
    # Distances from one target vector to every row of batch_vectors (1-D result).
    # The metric names and formulas mirror compute_distances, but only the
    # single row that n_most_similar needs is computed instead of a full
    # square matrix over the whole batch.
    if metric in ('euclidean', 'cosine', 'manhattan', 'braycurtis', 'canberra', 'chebyshev'):
        return pairwise_distances(target_vector.reshape(1, -1), batch_vectors, metric=metric)[0]
    if metric == 'dot':
        return 1 - batch_vectors @ target_vector
    if metric == 'pearson':
        centered_target = target_vector - target_vector.mean()
        centered_batch = batch_vectors - batch_vectors.mean(axis=1, keepdims=True)
        norms = np.sqrt(
            (centered_target @ centered_target)
            * np.einsum('ij,ij->i', centered_batch, centered_batch)
        )
        corr = (centered_batch @ centered_target) / norms
        # Guard against rounding pushing a correlation slightly outside [-1, 1].
        return 1 - np.clip(corr, -1.0, 1.0)
    if metric == 'spearman':
        row_distances = []
        for row in batch_vectors:
            rho, _ = spearmanr(target_vector, row)
            row_distances.append(1 - rho)
        return np.array(row_distances)
    raise ValueError(f"Unknown metric: {metric}")


def n_most_similar(loader: EmbeddingLoader, target_word: str, dist: str = 'cosine', n: int = 10) -> Tuple[List[str], np.ndarray, List[float]]:
    '''
    Finds the n most similar words to a given target word using a specified distance metric.

    The target itself is excluded from the candidates. Candidates are
    processed in batches, and for each batch only the distances from the
    target to the batch members are computed.

    Parameters
    -----------
    loader : EmbeddingLoader
        An instance of the embedding loader containing word vectors. Only
        ``loader.tokens`` and ``loader.get_embedding`` are used here.
    target_word : str
        The word for which to find the most similar neighbors.
    dist : str, default='cosine'
        The distance metric to use: 'euclidean', 'cosine', 'manhattan',
        'braycurtis', 'canberra', 'chebyshev', 'dot', 'pearson' or 'spearman'.
    n : int, default=10
        The number of most similar words to retrieve. If fewer candidates
        exist, all of them are returned.

    Returns
    --------
    words : list of str
        The most similar words found, ordered by increasing distance.
    vectors : np.ndarray
        Embedding vectors corresponding to the most similar words.
    distances : list of float
        Distances from the target word to each of the most similar words.

    Raises
    -------
    ValueError
        If ``target_word`` is not in the vocabulary, if ``n`` is negative, or
        if ``dist`` is not a supported metric.

    Warns
    ------
    FutureWarning
        Emitted on every call to announce the upcoming parameter rename.
    '''
    warnings.warn(
        "The parameter names target_word will be renamed to target in a future release. "
        "Please update your code accordingly.",
        FutureWarning,
        stacklevel=2,  # attribute the warning to the caller, not to this module
    )

    if n < 0:
        # A negative n would otherwise be interpreted as a from-the-end
        # position by argpartition/slicing and return an arbitrary subset.
        raise ValueError(f"n must be non-negative, got {n}")

    words = loader.tokens
    try:
        target_index = words.index(target_word)
    except ValueError:
        raise ValueError(f'{target_word} is not in vocabulary') from None

    target_vector = np.asarray(loader.get_embedding(target_word))
    filtered_words = [word for i, word in enumerate(words) if i != target_index]

    # process in batch
    batch_size = 10000
    distance_chunks = []
    for start in range(0, len(filtered_words), batch_size):
        batch_words = filtered_words[start:start + batch_size]
        batch_vectors = np.array([loader.get_embedding(word) for word in batch_words])
        distance_chunks.append(
            np.asarray(_distances_to_target(target_vector, batch_vectors, dist))
        )
    # Position i in all_distances corresponds to filtered_words[i].
    all_distances = np.concatenate(distance_chunks) if distance_chunks else np.empty(0)

    # select indices, then sort the selection by distance
    k = min(n, all_distances.size)
    if k == 0:
        top_n_indices = np.empty(0, dtype=np.intp)
    elif k < all_distances.size:
        # argpartition finds the k smallest without fully sorting everything.
        candidates = np.argpartition(all_distances, k - 1)[:k]
        top_n_indices = candidates[np.argsort(all_distances[candidates])]
    else:
        top_n_indices = np.argsort(all_distances)

    result_words = [filtered_words[i] for i in top_n_indices]
    result_distances = list(all_distances[top_n_indices])
    result_vectors = np.array([loader.get_embedding(word) for word in result_words])

    return result_words, result_vectors, result_distances
def compute_distances(X: np.ndarray, metric: str = 'euclidean') -> np.ndarray:
    '''
    Computes the square matrix of pairwise distances between the rows of X.

    Parameters
    -----------
    X : np.ndarray
        2-D array with one vector per row.
    metric : str, default='euclidean'
        The distance metric to use:
        - 'euclidean', 'cosine', 'manhattan', 'braycurtis', 'canberra',
          'chebyshev': delegated to ``sklearn.metrics.pairwise_distances``
        - 'dot': 1 - dot product
        - 'pearson': 1 - Pearson correlation
        - 'spearman': 1 - Spearman rank correlation

    Returns
    --------
    distances : np.ndarray
        Matrix whose entry (i, j) is the distance between rows i and j of X.

    Raises
    -------
    ValueError
        If ``metric`` is not one of the names listed above.
    '''
    if metric in ('euclidean', 'cosine', 'manhattan', 'braycurtis', 'canberra', 'chebyshev'):
        return pairwise_distances(X, metric=metric)

    if metric == 'dot':
        X = np.asarray(X)
        return 1 - (X @ X.T)

    if metric == 'pearson':
        # np.corrcoef collapses a single-row input to a scalar; atleast_2d
        # keeps the documented matrix shape in that case.
        return 1 - np.atleast_2d(np.corrcoef(X))

    if metric == 'spearman':
        # Spearman correlation is the Pearson correlation of the ranks, so
        # ranking every row once and calling corrcoef replaces one spearmanr
        # call per pair of rows.
        from scipy.stats import rankdata

        ranks = rankdata(np.asarray(X), axis=1)
        return 1 - np.atleast_2d(np.corrcoef(ranks))

    raise ValueError(f"Unknown metric: {metric}")