Source code for compshs.text.topic_modelling

"""
Created in 2025
@author: Simon Delarue <simon.delarue@telecom-paris.fr>
"""
import numpy as np
from scipy import sparse
from sklearn.decomposition import LatentDirichletAllocation, NMF
from typing import Tuple, Union

from compshs.text.base import BaseText


[docs]class TopicModeler(BaseText):
    """Topic modeler.

    Parameters
    ----------
    model_name: str
        Model name.
            - ``'LDA'``, Latent Dirichlet Allocation.
            - ``'NMF'``, Non-Negative Matrix Factorization.
    n_components: int
        Number of topics.
    """
    def __init__(self, model_name: str = 'LDA', n_components: int = 10):
        super().__init__()
        self.model_name = model_name
        self.n_components = n_components
        self.modeler = self._get_modeler(self.model_name, self.n_components)

    @staticmethod
    def _get_modeler(model_name: str, n_components: int):
        """Get modeler.

        Parameters
        ----------
        model_name: str
            Model name.
                - ``'LDA'``, Latent Dirichlet Allocation.
                - ``'NMF'``, Non-Negative Matrix Factorization.
        n_components: int
            Number of topics.

        Returns
        -------
        :class:`LatentDirichletAllocation` or :class:`NMF()`.
        """
        if model_name == 'LDA':
            return LatentDirichletAllocation(n_components=n_components)
        elif model_name == 'NMF':
            return NMF(n_components=n_components)
        else:
            raise ValueError(f"Unknown model_name: {model_name}; must be in {'LDA', 'NMF'}.")

[docs]    def fit(self, matrix: Union[sparse.csr_matrix, np.ndarray]) -> 'TopicModeler':
        """Fit algorithm to the document term matrix.

        Parameters
        ----------
        matrix: sparse.csr_matrix, np.ndarray
            Document term matrix (n_documents, n_words).

        Returns
        -------
        self: :class:`TopicModeler`
        """
        self.modeler = self.modeler.fit(matrix)
        return self

[docs]    def transform(self, matrix: Union[sparse.csr_matrix, np.ndarray]) -> Tuple:
        """Transform data according to the fitted model.

        Parameters
        ----------
        matrix: sparse.csr_matrix, np.ndarray
            Document term matrix (n_documents, n_words).

        Returns
        -------
        tuple
            Tuple of topic names and document topic distribution matrix (n_samples, n_components).
        """
        topic_distribution = self.modeler.transform(matrix)
        topic_names = self.modeler.get_feature_names_out()

        return topic_names, topic_distribution

[docs]    def fit_transform(self, matrix: Union[sparse.csr_matrix, np.ndarray], *args, **kwargs) -> Tuple:
        """Fit and transform data.

        Parameters
        ----------
        matrix: sparse.csr_matrix, np.ndarray
            Document term matrix (n_documents, n_words).

        Returns
        -------
        tuple
            Tuple of topic names and document topic distribution matrix (n_documents, n_components).
        """
        _ = self.fit(matrix)
        return self.transform(matrix)

[docs]    def get_word_contributions(self) -> np.ndarray:
        """Get matrix of word (n_components, n_words) contributions to each topic."""
        return self.modeler.components_