Source code for compshs.semantics.shift
"""
Created in 2025
@author: Simon Delarue <simon.delarue@telecom-paris.fr>
"""
import itertools
import numpy as np
import pandas as pd
from compshs.semantics.base import BaseSemanticShift
from compshs.utils.metrics import average_pairwise_similarity
class SSTA(BaseSemanticShift):
    """Time-aware Self Similarity between word embeddings.

    .. math::
        SS_{TA}(w, a, k) = psim(I_{w,a,k}, I_{w,a,k+1})

    where:

    - :math:`w` is a word
    - :math:`a` is an attribute
    - :math:`k` is a timestep

    References
    ----------
    Soler, A. G., Labeau, M., & Clavel, C. (2023). Measuring lexico-semantic alignment in debates with contextualized word representations. In Proceedings of the First Workshop on Social Influence in Conversations (SICon 2023) (pp. 50-63). Association for Computational Linguistics.
    """

    def __init__(self):
        super().__init__()

    def attribute_exist_at_timedates(self, embeddings, attribute,
                                     timedate_source, timedate_target) -> bool:
        """True if attribute exists in contextual embeddings at two reference timedates.

        A timedate that is absent from ``embeddings`` counts as "attribute
        not present" instead of raising.
        """
        return (attribute in embeddings.get(timedate_source, {})
                and attribute in embeddings.get(timedate_target, {}))

    def keyword_exist_at_timedates(self, embeddings, keyword, attribute,
                                   timedate_source, timedate_target) -> bool:
        """True if keyword exists in contextual embeddings at two reference timedates.

        A timedate or attribute that is absent from ``embeddings`` counts as
        "keyword not present" instead of raising.
        """
        return (keyword in embeddings.get(timedate_source, {}).get(attribute, {})
                and keyword in embeddings.get(timedate_target, {}).get(attribute, {}))

    def transform(self, embeddings, timedates, attributes, keywords,
                  group_embeddings: bool = True, group_output: bool = False,
                  groups: list = None, group_names: list = None) -> pd.DataFrame:
        """Compute time-aware self similarity.

        Parameters
        ----------
        embeddings: list
            List of Dictionaries of embeddings in the form of :meth:`ContextualEmbedding.transform()` output.
            When ``group_embeddings`` is ``False``, it is used directly as a nested
            mapping ``timedate -> attribute -> keyword -> embeddings``.
        timedates: np.ndarray
            Array of time values.
        attributes: np.ndarray
            Define subcorporas. Array of attribute values.
        keywords: list
            List of keywords.
        group_embeddings: bool
            If ``True``, group embeddings by timedate, attribute, keyword (default).
        group_output: bool
            If ``True``, group output dataframe using `groups`. Average is used for grouping.
        groups: list
            When group_output is set to ``True``, use this parameter to group multiple keywords together. Expected format is a list of lists of keywords.
        group_names: list
            When group_output is set to ``True``, use this parameter to rename groups of keywords.

        Returns
        -------
        :class:`pd.DataFrame()`
            DataFrame of time-aware similarities between embeddings at different timedates.
            Columns are ``timedate`` (the later timedate of each consecutive pair),
            ``attribute``, ``keyword`` and ``similarity``; when ``group_output`` is
            ``True`` the columns are ``timedate``, ``attribute``, ``similarity`` and ``group``.

        Raises
        ------
        ValueError
            If ``group_output`` is ``True`` but ``groups`` or ``group_names`` is missing.
        """
        if group_output and (groups is None or group_names is None):
            raise ValueError("`groups` and `group_names` are required when `group_output` is True.")

        if group_embeddings:
            embeddings = self.group_embeddings(embeddings, timedates, attributes)

        unique_timedates = sorted(np.unique(timedates))
        unique_attributes = np.unique(attributes)

        # Collect plain rows and build the frame once: cheaper than growing a
        # DataFrame with pd.concat inside the loop, and yields a clean index.
        rows = []
        # Pair every timedate with its predecessor, the last timedate included.
        for source_time, target_time in zip(unique_timedates, unique_timedates[1:]):
            if source_time not in embeddings or target_time not in embeddings:
                continue
            for attribute in unique_attributes:
                if not self.attribute_exist_at_timedates(embeddings, attribute, target_time, source_time):
                    continue
                for keyword in keywords:
                    if not self.keyword_exist_at_timedates(embeddings, keyword, attribute, target_time, source_time):
                        continue
                    # retrieve embeddings at source and target timedates
                    emb_source = embeddings[source_time][attribute][keyword]
                    emb_target = embeddings[target_time][attribute][keyword]
                    rows.append({
                        'timedate': target_time,
                        'attribute': attribute,
                        'keyword': keyword,
                        'similarity': average_pairwise_similarity(emb_source, emb_target),
                    })

        df_psim = pd.DataFrame(rows, columns=['timedate', 'attribute', 'keyword', 'similarity'])
        if not group_output:
            return df_psim

        grouped_columns = ['timedate', 'attribute', 'similarity', 'group']
        if df_psim.empty:
            # Nothing to average: return an empty frame with the grouped schema.
            return pd.DataFrame(columns=grouped_columns)

        grouped_frames = []
        for group, group_name in zip(groups, group_names):
            in_group = df_psim[df_psim['keyword'].isin(group)]
            # Average the similarity of all keywords of the group, per (timedate, attribute).
            averaged = (in_group.groupby(['timedate', 'attribute'])['similarity']
                        .mean()
                        .reset_index())
            averaged['group'] = group_name
            grouped_frames.append(averaged)

        if not grouped_frames:
            return pd.DataFrame(columns=grouped_columns)
        return pd.concat(grouped_frames, ignore_index=True)
class SApp(BaseSemanticShift):
    r"""Symmetric approaching between word embeddings.

    .. math::
        sApp(w) = psim(I_{w,a,k+1}, I_{w,a^{\prime},k+1}) - psim(I_{w,a,k}, I_{w,a^{\prime},k})

    where :math:`I_{w,a,k}` is the set of contextual embeddings of word :math:`w`, with attribute :math:`a`, at timestep :math:`k`.

    A positive value for :math:`sApp(w)` indicates that two subcorporas achieved a closer word semantics over time. Conversely, a negative value indicates that word representations diverged over time.

    References
    ----------
    Soler, A. G., Labeau, M., & Clavel, C. (2023). Measuring lexico-semantic alignment in debates with contextualized word representations. In Proceedings of the First Workshop on Social Influence in Conversations (SICon 2023) (pp. 50-63). Association for Computational Linguistics.
    """

    def __init__(self):
        super().__init__()
        # Lower-cased class name ('sapp'); passed as the metric name to the base-class helpers.
        self.metric_name = self.__class__.__name__.lower()

    def sapp_keywords_strategy(self, embeddings: dict, attribute_x: str, attribute_y: str, timedate_x: str, timedate_y: str):
        """Compute intersection between sets of keywords at different timedates.

        Intersects the keywords common to both attributes at ``timedate_y``
        with those common to both attributes at ``timedate_x``.
        """
        common_curr = self.get_common_keywords(embeddings, attribute_x, attribute_y, timedate_y, timedate_y)
        common_prev = self.get_common_keywords(embeddings, attribute_x, attribute_y, timedate_x, timedate_x)
        return common_curr.intersection(common_prev)

    def sapp_embeddings_strategy(self, embeddings: dict, attribute_x: str, attribute_y: str, timedate_x: str, timedate_y: str, keyword: str):
        """Returns embeddings (embeddings for both attributes at current timedate, embeddings for both attributes at previous timedate)."""
        current_pair = (embeddings[timedate_y][attribute_x][keyword],
                        embeddings[timedate_y][attribute_y][keyword])
        previous_pair = (embeddings[timedate_x][attribute_x][keyword],
                         embeddings[timedate_x][attribute_y][keyword])
        return current_pair, previous_pair

    def compute_sapp_for_timepair(self, embeddings: dict, timedate_source: str, timedate_target: str, attribute_x: str, attribute_y: str) -> list:
        """Compute sapp metric for a timepair.

        Parameters
        ----------
        embeddings: dict
            Grouped embeddings, indexed as ``embeddings[timedate][attribute][keyword]``.
        timedate_source: str
            Source timedate
        timedate_target: str
            Target timedate
        attribute_x: str
            First attribute
        attribute_y: str
            Second attribute

        Returns
        -------
        list
            List of dictionaries with metric information for all keywords.
        """
        return self.compute_metric_for_timepair(
            embeddings, timedate_source, timedate_target, attribute_x, attribute_y,
            keywords_strategy=self.sapp_keywords_strategy,
            embeddings_strategy=self.sapp_embeddings_strategy,
            metric_name=self.metric_name)

    def transform(self, embeddings: list, timedates: np.ndarray = None,
                  attributes: np.ndarray = None, group_embeddings: bool = True,
                  mode: str = 'sequential', time_pair: tuple = None,
                  group_output: bool = False, groups: list = None, group_names: list = None) -> pd.DataFrame:
        """Compute symmetric approaching for all available keywords and attributes in contextual embeddings.

        Parameters
        ----------
        embeddings: list
            List of Dictionaries of embeddings in the form of :meth:`ContextualEmbedding.transform()` output.
        timedates: np.ndarray
            Array of time values.
        attributes: np.ndarray
            Define subcorporas. Array of attribute values.
        group_embeddings: bool
            If ``True``, group embeddings by timedate, attribute, keyword (default).
        mode: str
            ``'sequential'`` (every pair of consecutive timedates) or ``'fixed'`` (only ``time_pair``).
        time_pair: tuple
            Tuple of timedates in str format (prev_time, curr_time), if mode = ``'fixed'``.
        group_output: bool
            If ``True``, group output dataframe using `groups`. Average is used for grouping.
        groups: list
            When group_output is set to ``True``, use this parameter to group multiple keywords together. Expected format is a list of lists of keywords.
        group_names: list
            When group_output is set to ``True``, use this parameter to rename groups of keywords.

        Returns
        -------
        :class:`pd.DataFrame()`
            DataFrame of symmetric approaching similarities between embeddings at different timedates.

        Raises
        ------
        ValueError
            If ``mode`` is neither ``'sequential'`` nor ``'fixed'``, or if
            ``mode='fixed'`` and ``time_pair`` is not a pair of timedates.
        """
        if mode not in ('sequential', 'fixed'):
            raise ValueError(f"`mode` must be 'sequential' or 'fixed', got {mode!r}.")
        if mode == 'fixed' and (time_pair is None or len(time_pair) != 2):
            raise ValueError("`time_pair` must be a (prev_time, curr_time) tuple when mode='fixed'.")

        if group_embeddings:
            embeddings = self.group_embeddings(embeddings, timedates, attributes)

        if mode == 'sequential':
            unique_timedates = sorted(np.unique(timedates))
            time_pairs = list(zip(unique_timedates, unique_timedates[1:]))
        else:
            time_pairs = [time_pair]

        # Gather all rows first and build the frame once instead of
        # concatenating onto an (initially empty) DataFrame in the loop.
        all_rows = []
        for prev_time, curr_time in time_pairs:
            if prev_time not in embeddings or curr_time not in embeddings:
                continue
            # sApp requires that word embeddings exist for both attributes at two timesteps
            common_attributes = self.get_common_attributes(embeddings, prev_time, curr_time)
            # One pair of attributes (i.e. two shared attributes) is enough.
            if len(common_attributes) < 2:
                continue
            if mode == 'sequential':
                # Unordered pairs, each reported with its attributes sorted.
                attribute_pairs = [tuple(sorted(pair))
                                   for pair in itertools.combinations(common_attributes, 2)]
            else:
                # Fixed mode keeps every ordered pair, self-pairs included.
                attribute_pairs = list(itertools.product(common_attributes, repeat=2))
            for attribute_x, attribute_y in attribute_pairs:
                all_rows.extend(self.compute_sapp_for_timepair(embeddings,
                                                               prev_time,
                                                               curr_time,
                                                               attribute_x,
                                                               attribute_y))
        df_sapp = pd.DataFrame(all_rows)

        if group_output:
            if mode == 'sequential':
                df_sapp = self.group_output_sequential(df_sapp,
                                                       metric=self.metric_name,
                                                       groups=groups,
                                                       group_names=group_names)
            else:
                df_sapp, self.ordered_keywords = self.group_output_fixed(df_sapp,
                                                                         groups=groups,
                                                                         group_names=group_names)
        return df_sapp
class AsApp(BaseSemanticShift):
    r"""Asymmetric Approaching between word embeddings.

    .. math::
        asApp(w, a) = psim(I_{w,a,t+1}, I_{w,a^{\prime},t}) - psim(I_{w,a,t}, I_{w,a^{\prime},t})

    where :math:`I_{w,a,t}` is the set of contextual embeddings of word :math:`w`, with attribute :math:`a`, at timestep :math:`t`.

    A positive value of :math:`asApp` indicates that attribute :math:`a` in the pair :math:`(a,a^{\prime})` has a recent (time :math:`t+1`) representation of a word :math:`w` that is close to the one initially used by attribute :math:`a^{\prime}` (time :math:`t`).

    A negative value of :math:`asApp` indicates that attribute :math:`a` in the pair :math:`(a,a^{\prime})` has a recent (time :math:`t+1`) representation of a word :math:`w` that moves away from the one initially used by attribute :math:`a^{\prime}` (time :math:`t`).

    References
    ----------
    Soler, A. G., Labeau, M., & Clavel, C. (2023). Measuring lexico-semantic alignment in debates with contextualized word representations. In Proceedings of the First Workshop on Social Influence in Conversations (SICon 2023) (pp. 50-63). Association for Computational Linguistics.
    """

    def __init__(self):
        super().__init__()
        # Lower-cased class name ('asapp'); passed as the metric name to the base-class helpers.
        self.metric_name = self.__class__.__name__.lower()

    def asapp_keywords_strategy(self, embeddings: dict, attribute_x: str, attribute_y: str, timedate_x: str, timedate_y: str):
        """Compute intersection between sets of keywords at different timedates.

        Intersects the common keywords obtained for the timedates
        ``(timedate_y, timedate_x)`` with those obtained for
        ``(timedate_x, timedate_x)``.
        """
        common_first = self.get_common_keywords(embeddings, attribute_x, attribute_y, timedate_y, timedate_x)
        common_second = self.get_common_keywords(embeddings, attribute_x, attribute_y, timedate_x, timedate_x)
        return common_first.intersection(common_second)

    def asapp_embeddings_strategy(self, embeddings: dict, attribute_x: str, attribute_y: str, timedate_x: str, timedate_y: str, keyword: str):
        """Returns embeddings (embeddings for both attributes at current and previous timedates, embeddings for both attributes at previous timedate)."""
        # First pair: attribute_x at the current timedate vs attribute_y at the previous one.
        crossed_pair = (embeddings[timedate_y][attribute_x][keyword],
                        embeddings[timedate_x][attribute_y][keyword])
        # Second pair: both attributes at the previous timedate.
        previous_pair = (embeddings[timedate_x][attribute_x][keyword],
                         embeddings[timedate_x][attribute_y][keyword])
        return crossed_pair, previous_pair

    def compute_asapp_for_timepair(self, embeddings: dict, timedate_source: str, timedate_target: str, attribute_x: str, attribute_y: str) -> list:
        """Compute asapp metric for a timepair.

        Parameters
        ----------
        embeddings: dict
            Grouped embeddings, indexed as ``embeddings[timedate][attribute][keyword]``.
        timedate_source: str
            Source timedate
        timedate_target: str
            Target timedate
        attribute_x: str
            First attribute
        attribute_y: str
            Second attribute

        Returns
        -------
        list
            List of dictionaries with metric information for all keywords.
        """
        return self.compute_metric_for_timepair(
            embeddings, timedate_source, timedate_target, attribute_x, attribute_y,
            keywords_strategy=self.asapp_keywords_strategy,
            embeddings_strategy=self.asapp_embeddings_strategy,
            metric_name=self.metric_name)

    def transform(self, embeddings: list, timedates: np.ndarray = None,
                  attributes: np.ndarray = None, group_embeddings: bool = True,
                  mode: str = 'sequential', time_pair: tuple = None,
                  group_output: bool = False, groups: list = None, group_names: list = None) -> pd.DataFrame:
        """Compute asymmetric approaching for all available keywords and attributes in contextual embeddings.

        Parameters
        ----------
        embeddings: list
            List of Dictionaries of embeddings in the form of :meth:`ContextualEmbedding.transform()` output.
        timedates: np.ndarray
            Array of time values.
        attributes: np.ndarray
            Define subcorporas. Array of attribute values.
        group_embeddings: bool
            If ``True``, group embeddings by timedate, attribute, keyword (default).
        mode: str
            ``'sequential'`` (every pair of consecutive timedates) or ``'fixed'`` (only ``time_pair``).
        time_pair: tuple
            Tuple of timedates in str format (prev_time, curr_time), if mode = ``'fixed'``.
        group_output: bool
            If ``True``, group output dataframe using `groups`. Average is used for grouping.
        groups: list
            When group_output is set to ``True``, use this parameter to group multiple keywords together. Expected format is a list of lists of keywords.
        group_names: list
            When group_output is set to ``True``, use this parameter to rename groups of keywords.

        Returns
        -------
        :class:`pd.DataFrame()`
            DataFrame of asymmetric approaching similarities between embeddings at different timedates.

        Raises
        ------
        ValueError
            If ``mode`` is neither ``'sequential'`` nor ``'fixed'``, or if
            ``mode='fixed'`` and ``time_pair`` is not a pair of timedates.
        """
        if mode not in ('sequential', 'fixed'):
            raise ValueError(f"`mode` must be 'sequential' or 'fixed', got {mode!r}.")
        if mode == 'fixed' and (time_pair is None or len(time_pair) != 2):
            raise ValueError("`time_pair` must be a (prev_time, curr_time) tuple when mode='fixed'.")

        if group_embeddings:
            embeddings = self.group_embeddings(embeddings, timedates, attributes)

        if mode == 'sequential':
            unique_timedates = sorted(np.unique(timedates))
            time_pairs = list(zip(unique_timedates, unique_timedates[1:]))
        else:
            time_pairs = [time_pair]

        # Gather all rows first and build the frame once instead of
        # concatenating onto an (initially empty) DataFrame in the loop.
        all_rows = []
        for prev_time, curr_time in time_pairs:
            if prev_time not in embeddings or curr_time not in embeddings:
                continue
            # asApp requires that word embeddings exist for both attributes at two timesteps
            common_attributes = self.get_common_attributes(embeddings, curr_time, prev_time)
            # One pair of distinct attributes (i.e. two shared attributes) is enough.
            if len(common_attributes) < 2:
                continue
            # The metric is asymmetric, so both (x, y) and (y, x) are evaluated;
            # self comparisons are left out.
            ordered_pairs = [(attribute_x, attribute_y)
                             for attribute_x, attribute_y in itertools.product(common_attributes, repeat=2)
                             if attribute_x != attribute_y]
            for attribute_x, attribute_y in ordered_pairs:
                all_rows.extend(self.compute_asapp_for_timepair(embeddings,
                                                                prev_time,
                                                                curr_time,
                                                                attribute_x,
                                                                attribute_y))
        df_asapp = pd.DataFrame(all_rows)

        if group_output:
            if mode == 'sequential':
                df_asapp = self.group_output_sequential(df_asapp,
                                                        metric=self.metric_name,
                                                        groups=groups,
                                                        group_names=group_names)
            else:
                df_asapp, self.ordered_keywords = self.group_output_fixed(df_asapp,
                                                                          groups=groups,
                                                                          group_names=group_names)
        return df_asapp
class DS(BaseSemanticShift):
    r"""Driving Strength between word embeddings.

    .. math::
        DS(w,a) = \dfrac{asApp(w,a)}{|asApp(w,a)|+|asApp(w,a^{\prime})|}

    Driving Strength is an asymmetric time-aware normalised measure indicating how much of the total approaching
    between two subcorporas is done by one side.

    References
    ----------
    Soler, A. G., Labeau, M., & Clavel, C. (2023). Measuring lexico-semantic alignment in debates with contextualized word representations. In Proceedings of the First Workshop on Social Influence in Conversations (SICon 2023) (pp. 50-63). Association for Computational Linguistics.
    """

    def __init__(self):
        super().__init__()
        # Lower-cased class name ('ds'); used as the name of the metric column.
        self.metric_name = self.__class__.__name__.lower()

    def transform(self, embeddings: list, timedates: np.ndarray = None,
                  attributes: np.ndarray = None, group_embeddings: bool = True,
                  time_pair: tuple = None,
                  groups: list = None, group_names: list = None) -> pd.DataFrame:
        """Compute driving strength metric for all available keywords and attributes in contextual embeddings.

        The metric is derived from :class:`AsApp` computed in ``'fixed'`` mode
        on ``time_pair`` with grouped output.

        Parameters
        ----------
        embeddings: list
            List of Dictionaries of embeddings in the form of :meth:`ContextualEmbedding.transform()` output.
        timedates: np.ndarray
            Array of time values.
        attributes: np.ndarray
            Define subcorporas. Array of attribute values.
        group_embeddings: bool
            If ``True``, group embeddings by timedate, attribute, keyword (default).
        time_pair: tuple
            Tuple of timedates in str format (prev_time, curr_time).
        groups: list
            Use this parameter to group multiple keywords together. Expected format is a list of lists of keywords.
        group_names: list
            Use this parameter to rename groups of keywords.

        Returns
        -------
        :class:`pd.DataFrame()`
            DataFrame of driving strength similarities between embeddings at different timedates.

        Raises
        ------
        ValueError
            If ``time_pair`` is not a pair of timedates.
        """
        if time_pair is None or len(time_pair) != 2:
            raise ValueError("`time_pair` must be a (prev_time, curr_time) tuple.")

        asapp = AsApp()
        # Keyword arguments keep this call valid even if AsApp.transform's
        # parameter order changes.
        df_asapp = asapp.transform(embeddings,
                                   timedates=timedates,
                                   attributes=attributes,
                                   group_embeddings=group_embeddings,
                                   mode='fixed',
                                   time_pair=time_pair,
                                   group_output=True,
                                   groups=groups,
                                   group_names=group_names)

        rows = []
        for attribute_x, attribute_y in itertools.product(np.unique(attributes), repeat=2):
            for keyword in asapp.ordered_keywords:
                same_keyword = df_asapp['keyword'] == keyword
                # asApp of x towards y, and of y towards x, for this keyword.
                asapp_x = df_asapp[(df_asapp['attribute_x'] == attribute_x)
                                   & (df_asapp['attribute_y'] == attribute_y)
                                   & same_keyword]['asapp']
                asapp_y = df_asapp[(df_asapp['attribute_x'] == attribute_y)
                                   & (df_asapp['attribute_y'] == attribute_x)
                                   & same_keyword]['asapp']
                if len(asapp_x) > 0 and len(asapp_y) > 0:
                    value_x = asapp_x.item()
                    value_y = asapp_y.item()
                    total = abs(value_x) + abs(value_y)
                    # Compute DS; it is undefined (NaN) when neither side moved,
                    # which avoids a 0/0 runtime warning.
                    ds = value_x / total if total > 0 else np.nan
                    rows.append({
                        'timedate': time_pair[1],
                        'attribute_x': attribute_x,
                        'attribute_y': attribute_y,
                        'keyword': keyword,
                        self.metric_name: ds
                    })

        df_ds = pd.DataFrame(rows)
        df_ds, self.ordered_keywords = self.group_output_fixed(df_ds,
                                                               groups=groups,
                                                               group_names=group_names)
        return df_ds