Source code for compshs.semantics.shift

"""
Created in 2025
@author: Simon Delarue <simon.delarue@telecom-paris.fr>
"""
import itertools
import numpy as np
import pandas as pd

from compshs.semantics.base import BaseSemanticShift
from compshs.utils.metrics import average_pairwise_similarity


class SSTA(BaseSemanticShift):
    """Time-aware Self Similarity between word embeddings.

    .. math::

        SS_{TA}(w, a, k) = psim(I_{w,a,k}, I_{w,a,k+1})

    where:

        - :math:`w` is a word
        - :math:`a` is an attribute
        - :math:`k` is a timestep

    References
    ----------
    Soler, A. G., Labeau, M., & Clavel, C. (2023). Measuring lexico-semantic alignment in debates with contextualized word representations. In Proceedings of the First Workshop on Social Influence in Conversations (SICon 2023) (pp. 50-63). Association for Computational Linguistics.
    """
    def __init__(self):
        super().__init__()

    def attribute_exist_at_timedates(self, embeddings, attribute,
                                     timedate_source, timedate_target) -> bool:
        """True if attribute exists in contextual embeddings at two reference timedates.

        A timedate that is missing from ``embeddings`` is treated as "attribute absent"
        (returns ``False``) instead of raising.
        """
        return (attribute in embeddings.get(timedate_source, {})
                and attribute in embeddings.get(timedate_target, {}))

    def keyword_exist_at_timedates(self, embeddings, keyword, attribute,
                                   timedate_source, timedate_target) -> bool:
        """True if keyword exists in contextual embeddings at two reference timedates.

        The keyword is looked up under ``attribute`` at each timedate. A missing timedate
        or attribute is treated as "keyword absent" (returns ``False``) instead of raising.
        """
        source_keywords = embeddings.get(timedate_source, {}).get(attribute, {})
        target_keywords = embeddings.get(timedate_target, {}).get(attribute, {})
        return keyword in source_keywords and keyword in target_keywords

    def transform(self, embeddings, timedates, attributes, keywords,
                  group_embeddings: bool = True, group_output: bool = False,
                  groups: list = None, group_names: list = None) -> pd.DataFrame:
        """Compute time-aware self similarity.

        Every pair of consecutive (sorted, unique) timedates is compared, including the
        last one. Each output row is labelled with the *later* timedate of its pair.

        Parameters
        ----------
        embeddings: list
            List of Dictionaries of embeddings in the form of :meth:`ContextualEmbedding.transform()` output.
        timedates: np.ndarray
            Array of time values.
        attributes: np.ndarray
            Define subcorporas. Array of attribute values.
        keywords: list
            List of keywords.
        group_embeddings: bool
            If ``True``, group embeddings by timedate, attribute, keyword (default).
        group_output: bool
            If ``True``, group output dataframe using `groups`. Average is used for grouping.
        groups: list
            When group_output is set to ``True``, use this parameter to group multiple keywords together. Expected format is a list of lists of keywords.
        group_names: list
            When group_output is set to ``True``, use this parameter to rename groups of keywords.

        Returns
        -------
        :class:`pd.DataFrame()`
            DataFrame of time-aware similarities between embeddings at different timedates.
            Columns are ``timedate, attribute, keyword, similarity`` or, when
            ``group_output`` is ``True``, ``timedate, attribute, similarity, group``.

        Raises
        ------
        ValueError
            If ``group_output`` is ``True`` but ``groups`` or ``group_names`` is missing.
        """
        if group_output and (groups is None or group_names is None):
            raise ValueError("`groups` and `group_names` are required when group_output=True.")

        if group_embeddings:
            embeddings = self.group_embeddings(embeddings, timedates, attributes)

        unique_timedates = sorted(np.unique(timedates))
        unique_attributes = np.unique(attributes)

        # Rows are accumulated in a list and turned into a DataFrame once; growing a
        # DataFrame with pd.concat inside the loops is quadratic.
        rows = []

        # Pair every timedate with its successor so that the last pair is not dropped.
        for prev_time, curr_time in zip(unique_timedates, unique_timedates[1:]):
            if prev_time not in embeddings or curr_time not in embeddings:
                continue
            for attribute in unique_attributes:
                if not self.attribute_exist_at_timedates(embeddings, attribute, curr_time, prev_time):
                    continue
                for keyword in keywords:
                    if not self.keyword_exist_at_timedates(embeddings, keyword, attribute, curr_time, prev_time):
                        continue

                    # retrieve embeddings at source and target timedates
                    emb_source = embeddings[prev_time][attribute][keyword]
                    emb_target = embeddings[curr_time][attribute][keyword]

                    rows.append({
                        'timedate': curr_time,
                        'attribute': attribute,
                        'keyword': keyword,
                        'similarity': average_pairwise_similarity(emb_source, emb_target),
                    })

        df_psim = pd.DataFrame(rows)

        if not group_output:
            return df_psim

        grouped_columns = ['timedate', 'attribute', 'similarity', 'group']
        if df_psim.empty:
            # Nothing to aggregate; avoid a KeyError on the missing 'keyword' column.
            return pd.DataFrame(columns=grouped_columns)

        grouped_frames = []
        for group, group_name in zip(groups, group_names):
            subset = df_psim[df_psim['keyword'].isin(group)]
            if subset.empty:
                continue
            # Select the column explicitly: the first positional argument of
            # GroupBy.mean is `numeric_only`, not a column name.
            averaged = subset.groupby(['timedate', 'attribute'], as_index=False)['similarity'].mean()
            averaged['group'] = group_name
            grouped_frames.append(averaged)

        if not grouped_frames:
            return pd.DataFrame(columns=grouped_columns)
        return pd.concat(grouped_frames, ignore_index=True)


class SApp(BaseSemanticShift):
    r"""Symmetric approaching between word embeddings.

    .. math::

        sApp(w) = psim(I_{w,a,k+1}, I_{w,a^{\prime},k+1}) - psim(I_{w,a,k}, I_{w,a^{\prime},k})

    where :math:`I_{w,a,k}` is the set of contextual embeddings of word :math:`w`, with attribute :math:`a`, at timestep :math:`k`.

    A positive value for :math:`sApp(w)` indicates that two subcorporas achieved a closer word semantics over time. Conversely, a negative value indicates that word representations diverged over time.

    References
    ----------
    Soler, A. G., Labeau, M., & Clavel, C. (2023). Measuring lexico-semantic alignment in debates with contextualized word representations. In Proceedings of the First Workshop on Social Influence in Conversations (SICon 2023) (pp. 50-63). Association for Computational Linguistics.
    """
    def __init__(self):
        super().__init__()
        # Column name of the metric in output rows: the lower-cased class name ('sapp').
        self.metric_name = self.__class__.__name__.lower()

    def sapp_keywords_strategy(self, embeddings: list, attribute_x: str, attribute_y: str, timedate_x: str, timedate_y: str):
        """Compute intersection between sets of keywords at different timedates.

        Intersects the common keywords of both attributes at ``timedate_y`` with the
        common keywords of both attributes at ``timedate_x``.
        """
        common_curr = self.get_common_keywords(embeddings, attribute_x, attribute_y, timedate_y, timedate_y)
        common_prev = self.get_common_keywords(embeddings, attribute_x, attribute_y, timedate_x, timedate_x)
        return common_curr.intersection(common_prev)

    def sapp_embeddings_strategy(self, embeddings: list, attribute_x: str, attribute_y: str, timedate_x: str, timedate_y: str, keyword: str):
        """Returns embeddings (embeddings for both attributes at current timedate, embeddings for both attributes at previous timedate).

        ``timedate_y`` is the current timedate and ``timedate_x`` the previous one.
        """
        current_pair = (embeddings[timedate_y][attribute_x][keyword],
                        embeddings[timedate_y][attribute_y][keyword])
        previous_pair = (embeddings[timedate_x][attribute_x][keyword],
                         embeddings[timedate_x][attribute_y][keyword])
        return (current_pair, previous_pair)

    def compute_sapp_for_timepair(self, embeddings: list, timedate_source: str, timedate_target: str, attribute_x: str, attribute_y: str) -> list:
        """Compute sapp metric for a timepair.

        Delegates to ``compute_metric_for_timepair`` with the sApp keyword and
        embedding strategies defined on this class.

        Parameters
        ----------
        embeddings: list
            List of Dictionaries of embeddings in the form of :meth:`ContextualEmbedding.transform()` output.
        timedate_source: str
            Source timedate
        timedate_target: str
            Target timedate
        attribute_x: str
            First attribute
        attribute_y: str
            Second attribute

        Returns
        -------
        list
            List of dictionaries with metric information for all keywords.
        """
        return self.compute_metric_for_timepair(
            embeddings, timedate_source, timedate_target, attribute_x, attribute_y,
            keywords_strategy=self.sapp_keywords_strategy,
            embeddings_strategy=self.sapp_embeddings_strategy,
            metric_name=self.metric_name)

    def transform(self, embeddings: list, timedates: np.ndarray = None,
                  attributes: np.ndarray = None, group_embeddings: bool = True,
                  mode: str = 'sequential', time_pair: tuple = None,
                  group_output: bool = False, groups: list = None, group_names: list = None) -> pd.DataFrame:
        """Compute symmetric approaching for all available keywords and attributes in contextual embeddings.

        Parameters
        ----------
        embeddings: list
            List of Dictionaries of embeddings in the form of :meth:`ContextualEmbedding.transform()` output.
        timedates: np.ndarray
            Array of time values.
        attributes: np.ndarray
            Define subcorporas. Array of attribute values.
        group_embeddings: bool
            If ``True``, group embeddings by timedate, attribute, keyword (default).
        mode: str
            ``'sequential'`` or ``'fixed'``. In sequential mode every pair of consecutive
            timedates is compared and each unordered attribute pair is evaluated once
            (sorted); in fixed mode only ``time_pair`` is compared and every ordered
            attribute pair, including self-pairs, is evaluated.
        time_pair: tuple
            Tuple of timedates in str format (prev_time, curr_time), if mode = ``'fixed'``.
        group_output: bool
            If ``True``, group output dataframe using `groups`. Average is used for grouping.
        groups: list
            When group_output is set to ``True``, use this parameter to group multiple keywords together. Expected format is a list of lists of keywords.
        group_names: list
            When group_output is set to ``True``, use this parameter to rename groups of keywords.

        Returns
        -------
        :class:`pd.DataFrame()`
            DataFrame of symmetric approaching similarities between embeddings at different timedates.

        Raises
        ------
        ValueError
            If ``mode`` is not ``'sequential'`` or ``'fixed'``, or if ``mode='fixed'``
            and ``time_pair`` is not provided.
        """
        # Fail early with a clear message instead of a NameError / unpacking TypeError later.
        if mode not in ('sequential', 'fixed'):
            raise ValueError(f"mode must be 'sequential' or 'fixed', got {mode!r}.")
        if mode == 'fixed' and time_pair is None:
            raise ValueError("time_pair must be provided when mode='fixed'.")

        if group_embeddings:
            embeddings = self.group_embeddings(embeddings, timedates, attributes)

        if mode == 'sequential':
            unique_timedates = sorted(np.unique(timedates))
            time_pairs = list(zip(unique_timedates, unique_timedates[1:]))
        else:
            time_pairs = [time_pair]

        # Per-pair frames are collected and concatenated once at the end.
        frames = []

        for prev_time, curr_time in time_pairs:
            if prev_time not in embeddings or curr_time not in embeddings:
                continue

            # sApp requires that word embeddings exist for both attributes at two timesteps
            common_attributes = self.get_common_attributes(embeddings, prev_time, curr_time)

            # A pair needs two attributes; requiring more than two would silently skip
            # corpora made of exactly two subcorpora.
            if len(common_attributes) < 2:
                continue

            if mode == 'sequential':
                attribute_pairs = [tuple(sorted(pair))
                                   for pair in itertools.combinations(common_attributes, 2)]
            else:
                attribute_pairs = list(itertools.product(common_attributes, repeat=2))

            for attribute_x, attribute_y in attribute_pairs:
                rows = self.compute_sapp_for_timepair(embeddings,
                                                      prev_time,
                                                      curr_time,
                                                      attribute_x,
                                                      attribute_y)
                frame = pd.DataFrame(rows)
                if not frame.empty:
                    frames.append(frame)

        df_sapp = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        if group_output:
            if mode == 'sequential':
                df_sapp = self.group_output_sequential(df_sapp,
                                                       metric=self.metric_name,
                                                       groups=groups,
                                                       group_names=group_names)
            else:
                # Fixed-mode grouping also returns a keyword ordering, kept on the instance.
                df_sapp, self.ordered_keywords = self.group_output_fixed(df_sapp,
                                                                         groups=groups,
                                                                         group_names=group_names)

        return df_sapp
        

class AsApp(BaseSemanticShift):
    r"""Asymmetric Approaching between word embeddings.

    .. math::

        asApp(w, a) = psim(I_{w,a,t+1}, I_{w,a^{\prime},t}) - psim(I_{w,a,t}, I_{w,a^{\prime},t})

    where :math:`I_{w,a,t}` is the set of contextual embeddings of word :math:`w`, with attribute :math:`a`, at timestep :math:`t`.

    A positive value of :math:`asApp` indicates that attribute :math:`a` in the pair :math:`(a,a^{\prime})` has a recent (time :math:`t+1`) representation of a word :math:`w` that is close to the one initially used by attribute :math:`a^{\prime}` (time :math:`t`).

    A negative value of :math:`asApp` indicates that attribute :math:`a` in the pair :math:`(a,a^{\prime})` has a recent (time :math:`t+1`) representation of a word :math:`w` that moves away from the one initially used by attribute :math:`a^{\prime}` (time :math:`t`).

    References
    ----------
    Soler, A. G., Labeau, M., & Clavel, C. (2023). Measuring lexico-semantic alignment in debates with contextualized word representations. In Proceedings of the First Workshop on Social Influence in Conversations (SICon 2023) (pp. 50-63). Association for Computational Linguistics.
    """
    def __init__(self):
        super().__init__()
        # Column name of the metric in output rows: the lower-cased class name ('asapp').
        self.metric_name = self.__class__.__name__.lower()

    def asapp_keywords_strategy(self, embeddings: list, attribute_x: str, attribute_y: str, timedate_x: str, timedate_y: str):
        """Compute intersection between sets of keywords at different timedates.

        Intersects the common keywords obtained for timedates ``(timedate_y, timedate_x)``
        with those obtained for ``(timedate_x, timedate_x)``.
        """
        common_first = self.get_common_keywords(embeddings, attribute_x, attribute_y, timedate_y, timedate_x)
        common_second = self.get_common_keywords(embeddings, attribute_x, attribute_y, timedate_x, timedate_x)
        return common_first.intersection(common_second)

    def asapp_embeddings_strategy(self, embeddings: list, attribute_x: str, attribute_y: str, timedate_x: str, timedate_y: str, keyword: str):
        """Returns embeddings (embeddings for both attributes at current and previous timedates, embeddings for both attributes at previous timedate).

        The first pair is ``attribute_x`` at ``timedate_y`` (current) against
        ``attribute_y`` at ``timedate_x`` (previous); the second pair is both
        attributes at ``timedate_x``.
        """
        crossed_pair = (embeddings[timedate_y][attribute_x][keyword],
                        embeddings[timedate_x][attribute_y][keyword])
        previous_pair = (embeddings[timedate_x][attribute_x][keyword],
                         embeddings[timedate_x][attribute_y][keyword])
        return (crossed_pair, previous_pair)

    def compute_asapp_for_timepair(self, embeddings: list, timedate_source: str, timedate_target: str, attribute_x: str, attribute_y: str) -> list:
        """Compute asapp metric for a timepair.

        Delegates to ``compute_metric_for_timepair`` with the asApp keyword and
        embedding strategies defined on this class.

        Parameters
        ----------
        embeddings: list
            List of Dictionaries of embeddings in the form of :meth:`ContextualEmbedding.transform()` output.
        timedate_source: str
            Source timedate
        timedate_target: str
            Target timedate
        attribute_x: str
            First attribute
        attribute_y: str
            Second attribute

        Returns
        -------
        list
            List of dictionaries with metric information for all keywords.
        """
        return self.compute_metric_for_timepair(
            embeddings, timedate_source, timedate_target, attribute_x, attribute_y,
            keywords_strategy=self.asapp_keywords_strategy,
            embeddings_strategy=self.asapp_embeddings_strategy,
            metric_name=self.metric_name)

    def transform(self, embeddings: list, timedates: np.ndarray = None,
                  attributes: np.ndarray = None, group_embeddings: bool = True,
                  mode: str = 'sequential', time_pair: tuple = None,
                  group_output: bool = False, groups: list = None, group_names: list = None) -> pd.DataFrame:
        """Compute asymmetric approaching for all available keywords and attributes in contextual embeddings.

        The metric is asymmetric, so every ordered pair of distinct attributes is evaluated.

        Parameters
        ----------
        embeddings: list
            List of Dictionaries of embeddings in the form of :meth:`ContextualEmbedding.transform()` output.
        timedates: np.ndarray
            Array of time values.
        attributes: np.ndarray
            Define subcorporas. Array of attribute values.
        group_embeddings: bool
            If ``True``, group embeddings by timedate, attribute, keyword (default).
        mode: str
            ``'sequential'`` or ``'fixed'``.
        time_pair: tuple
            Tuple of timedates in str format (prev_time, curr_time), if mode = ``'fixed'``.
        group_output: bool
            If ``True``, group output dataframe using `groups`. Average is used for grouping.
        groups: list
            When group_output is set to ``True``, use this parameter to group multiple keywords together. Expected format is a list of lists of keywords.
        group_names: list
            When group_output is set to ``True``, use this parameter to rename groups of keywords.

        Returns
        -------
        :class:`pd.DataFrame()`
            DataFrame of asymmetric approaching similarities between embeddings at different timedates.

        Raises
        ------
        ValueError
            If ``mode`` is not ``'sequential'`` or ``'fixed'``, or if ``mode='fixed'``
            and ``time_pair`` is not provided.
        """
        # Fail early with a clear message instead of a NameError / unpacking TypeError later.
        if mode not in ('sequential', 'fixed'):
            raise ValueError(f"mode must be 'sequential' or 'fixed', got {mode!r}.")
        if mode == 'fixed' and time_pair is None:
            raise ValueError("time_pair must be provided when mode='fixed'.")

        if group_embeddings:
            embeddings = self.group_embeddings(embeddings, timedates, attributes)

        if mode == 'sequential':
            unique_timedates = sorted(np.unique(timedates))
            time_pairs = list(zip(unique_timedates, unique_timedates[1:]))
        else:
            time_pairs = [time_pair]

        # Per-pair frames are collected and concatenated once at the end.
        frames = []

        for prev_time, curr_time in time_pairs:
            if prev_time not in embeddings or curr_time not in embeddings:
                continue

            # asApp requires that word embeddings exist for both attributes at two timesteps
            common_attributes = self.get_common_attributes(embeddings, curr_time, prev_time)

            # A pair needs two attributes; requiring more than two would silently skip
            # corpora made of exactly two subcorpora.
            if len(common_attributes) < 2:
                continue

            for attribute_x, attribute_y in itertools.product(common_attributes, repeat=2):
                # do not account for self comparisons between attributes
                if attribute_x == attribute_y:
                    continue

                rows = self.compute_asapp_for_timepair(embeddings,
                                                       prev_time,
                                                       curr_time,
                                                       attribute_x,
                                                       attribute_y)
                frame = pd.DataFrame(rows)
                if not frame.empty:
                    frames.append(frame)

        df_asapp = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        if group_output:
            if mode == 'sequential':
                df_asapp = self.group_output_sequential(df_asapp,
                                                        metric=self.metric_name,
                                                        groups=groups,
                                                        group_names=group_names)
            else:
                # Fixed-mode grouping also returns a keyword ordering, kept on the instance.
                df_asapp, self.ordered_keywords = self.group_output_fixed(df_asapp,
                                                                          groups=groups,
                                                                          group_names=group_names)

        return df_asapp
    

class DS(BaseSemanticShift):
    r"""Driving Strength between word embeddings.

    .. math::

        DS(w,a) = \dfrac{asApp(w,a)}{|asApp(w,a)|+|asApp(w,a^{\prime})|}

    Driving Strength is an asymmetric time-aware normalised measure indicating how much of the total approaching
    between two subcorporas is done by one side.

    References
    ----------
    Soler, A. G., Labeau, M., & Clavel, C. (2023). Measuring lexico-semantic alignment in debates with contextualized word representations. In Proceedings of the First Workshop on Social Influence in Conversations (SICon 2023) (pp. 50-63). Association for Computational Linguistics.
    """
    def __init__(self):
        super().__init__()
        # Column name of the metric in output rows: the lower-cased class name ('ds').
        self.metric_name = self.__class__.__name__.lower()

    def transform(self, embeddings: list, timedates: np.ndarray = None,
                  attributes: np.ndarray = None, group_embeddings: bool = True,
                  time_pair: tuple = None,
                  groups: list = None, group_names: list = None) -> pd.DataFrame:
        """Compute driving strength metric for all available keywords and attributes in contextual embeddings.

        Asymmetric approaching is first computed with :class:`AsApp` in ``'fixed'`` mode
        with grouped output; DS is then derived for every ordered attribute pair and
        keyword for which asApp exists in both directions.

        Parameters
        ----------
        embeddings: list
            List of Dictionaries of embeddings in the form of :meth:`ContextualEmbedding.transform()` output.
        timedates: np.ndarray
            Array of time values.
        attributes: np.ndarray
            Define subcorporas. Array of attribute values.
        group_embeddings: bool
            If ``True``, group embeddings by timedate, attribute, keyword (default).
        time_pair: tuple
            Tuple of timedates in str format (prev_time, curr_time).
        groups: list
            Use this parameter to group multiple keywords together. Expected format is a list of lists of keywords.
        group_names: list
            Use this parameter to rename groups of keywords.

        Returns
        -------
        :class:`pd.DataFrame()`
            DataFrame of driving strength similarities between embeddings at different timedates.
            The metric is ``NaN`` where both asApp values are zero (DS is undefined there).

        Raises
        ------
        ValueError
            If ``time_pair`` is not provided.
        """
        if time_pair is None:
            raise ValueError("time_pair must be provided to compute driving strength.")

        # Keyword arguments keep this call correct even if AsApp.transform's
        # parameter order changes.
        asapp = AsApp()
        df_asapp = asapp.transform(embeddings,
                                   timedates=timedates,
                                   attributes=attributes,
                                   group_embeddings=group_embeddings,
                                   mode='fixed',
                                   time_pair=time_pair,
                                   group_output=True,
                                   groups=groups,
                                   group_names=group_names)

        asapp_column = asapp.metric_name
        rows = []

        for attribute_x, attribute_y in itertools.product(np.unique(attributes), repeat=2):
            # Row masks for the two directions of the attribute pair (independent of keyword).
            forward = (df_asapp['attribute_x'] == attribute_x) & (df_asapp['attribute_y'] == attribute_y)
            backward = (df_asapp['attribute_x'] == attribute_y) & (df_asapp['attribute_y'] == attribute_x)

            for keyword in asapp.ordered_keywords:
                is_keyword = df_asapp['keyword'] == keyword
                asapp_x = df_asapp[forward & is_keyword][asapp_column]
                asapp_y = df_asapp[backward & is_keyword][asapp_column]

                # DS needs asApp in both directions.
                if len(asapp_x) == 0 or len(asapp_y) == 0:
                    continue

                value_x = asapp_x.item()
                value_y = asapp_y.item()
                total = abs(value_x) + abs(value_y)

                # Compute DS; it is undefined (NaN) when neither side moved at all.
                ds = value_x / total if total != 0 else np.nan

                rows.append({
                    'timedate': time_pair[1],
                    'attribute_x': attribute_x,
                    'attribute_y': attribute_y,
                    'keyword': keyword,
                    self.metric_name: ds
                })

        df_ds = pd.DataFrame(rows)

        # Fixed-mode grouping also returns a keyword ordering, kept on the instance.
        df_ds, self.ordered_keywords = self.group_output_fixed(df_ds,
                                                               groups=groups,
                                                               group_names=group_names)

        return df_ds