# Source code for compshs.utils.metrics

"""
Created in 2025
@author: Simon Delarue <simon.delarue@telecom-paris.fr>
"""
import itertools
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def diversity(top_words: dict) -> float:
    """Diversity over topics.

    Ratio between the number of distinct top words across all topics and the
    total number of top words summed over topics. With set-valued topics, a
    value of 1 means that no word is shared between two topics.

    Parameters
    ----------
    top_words: dict
        Topic index as keys, top-word sets as values.

    Returns
    -------
    float
        Diversity, or 0.0 when there is no top word at all (no topic, or only
        empty topics).
    """
    word_sets = list(top_words.values())
    n_words = sum(len(words) for words in word_sets)

    # The ratio is undefined without any word; return 0.0 instead of dividing
    # by zero (which would produce nan and a RuntimeWarning).
    if n_words == 0:
        return 0.0

    unique_words = set().union(*word_sets)

    return len(unique_words) / n_words


def topic_coherence(topic_words, dtm, vocab_index):
    """Topic coherence as average NPMI over characteristic words of the topic.

    For every pair of topic words found in the vocabulary, the normalized
    pointwise mutual information (NPMI) is computed from document
    co-occurrence frequencies. Pairs that never co-occur are skipped, and the
    coherence is the mean NPMI over the remaining pairs.

    Parameters
    ----------
    topic_words
        Iterable of words characterizing the topic. Words missing from
        ``vocab_index`` are ignored.
    dtm
        Document-term matrix with documents as rows and terms as columns.
        Either a sparse matrix (anything whose column slice exposes
        ``toarray``) or a dense array. Any non-zero entry is treated as
        "the term occurs in the document".
    vocab_index: dict
        Maps each vocabulary word to its column index in ``dtm``.

    Returns
    -------
    float
        Average NPMI over co-occurring word pairs, or 0.0 when no pair can be
        scored.
    """
    topic_words = list(topic_words)
    n_docs = dtm.shape[0]
    if n_docs == 0:
        return 0.0

    # Extract each word's column once (instead of once per pair) and reduce it
    # to a boolean presence mask, so raw counts or float weights do not break
    # the co-occurrence computation.
    presence = {}
    doc_freq = {}
    for word in topic_words:
        if word in presence or word not in vocab_index:
            continue
        column = dtm[:, vocab_index[word]]
        if hasattr(column, "toarray"):
            column = column.toarray()
        mask = np.asarray(column).ravel() != 0
        presence[word] = mask
        doc_freq[word] = int(mask.sum())

    scores = []
    for word_source, word_target in itertools.combinations(topic_words, 2):
        if word_source not in presence or word_target not in presence:
            continue

        n_ij = int((presence[word_source] & presence[word_target]).sum())
        if n_ij == 0:
            # Words never co-occur: pair is skipped.
            continue

        if n_ij == n_docs:
            # Both words occur in every document: -log(p_ij) is 0, so the
            # ratio is 0/0. NPMI is 1 for complete co-occurrence.
            scores.append(1.0)
            continue

        p_i = doc_freq[word_source] / n_docs
        p_j = doc_freq[word_target] / n_docs
        p_ij = n_ij / n_docs

        pmi = np.log(p_ij / (p_i * p_j))
        scores.append(pmi / (-np.log(p_ij)))

    if not scores:
        return 0.0
    return float(np.mean(scores))


def coherence(corpus: list, word_sets: dict) -> float:
    """Coherence over topics defined by word_sets.

    The corpus is turned into a binary document-term matrix over unigrams and
    bigrams (tokens of at least four ASCII letters, at most 30000 features).
    Each topic is scored with ``topic_coherence`` and the scores are averaged.

    Parameters
    ----------
    corpus: list
        Corpus of documents.
    word_sets: dict
        Topic index as keys, word sets as values.

    Returns
    -------
    float
        Overall topic coherence, i.e. the mean coherence over all topics, or
        0.0 when ``word_sets`` is empty.
    """
    # Nothing to score: skip the vectorization entirely.
    if not word_sets:
        return 0.0

    vectorizer_bin = CountVectorizer(
        binary=True,
        ngram_range=(1, 2),
        token_pattern=r"(?u)\b[a-zA-Z]{4,}\b",
        max_features=30000,
    )
    dtm_bin = vectorizer_bin.fit_transform(corpus)
    vocab = vectorizer_bin.get_feature_names_out()
    vocab_index = {word: i for i, word in enumerate(vocab)}

    # Iterate over the values directly so topic keys need not be 0..n-1.
    topic_scores = [
        topic_coherence(topic_words, dtm_bin, vocab_index)
        for topic_words in word_sets.values()
    ]

    # Average over topics (returning a single topic's score would not be an
    # overall measure).
    return float(np.mean(topic_scores))

def average_pairwise_similarity(values_source, values_target) -> float:
    r"""Average pairwise similarity between two arrays of values.

    Given two arrays of values :math:`I,J`, average pairwise similarity, denoted with
    :math:`psim(I,J)` is computed as:

    .. math::

        psim(I,J)=\dfrac{\sum_{i\in I}\sum_{j \in J}sim(i,j)}{|I||J|}

    where :math:`sim` is the cosine similarity.

    Parameters
    ----------
    values_source
        Array of values. Each element is reshaped to a single row vector
        before the similarity is computed.
    values_target
        Array of values, handled like ``values_source``.

    Returns
    -------
    float
        Average pairwise similarity, or 0.0 when either input is empty.
    """
    n_pairs = len(values_source) * len(values_target)

    # No pair to compare: avoid the 0/0 division.
    if n_pairs == 0:
        return 0.0

    # Reshape every vector once rather than once per pair.
    sources = [i.reshape(1, -1) for i in values_source]
    targets = [j.reshape(1, -1) for j in values_target]

    total_sim = np.sum([cosine_similarity(i, j) for j in targets for i in sources])
    return float(total_sim / n_pairs)