"""
Created in 2025
@author: Simon Delarue <simon.delarue@telecom-paris.fr>
"""
from collections import defaultdict
import numpy as np
import pandas as pd
from typing import Tuple
from compshs.utils.metrics import average_pairwise_similarity
class BaseSemanticShift:
    """Base class for Semantic Shift Detection.

    Note: Semantic shift detection requires contextual word embeddings with
    additional attributes, e.g. time information.
    """

    def __init__(self):
        pass

    def group_embeddings(self, embeddings: list, timedates: np.ndarray, attributes: np.ndarray) -> dict:
        """Group dictionaries of contextual embeddings by:
            - time
            - attribute
            - keyword

        Parameters
        ----------
        embeddings: list
            List of dictionaries of embeddings in the form of :class:`ContextualEmbedding.transform()` output.
            Each dictionary is read as ``{keyword: {context: {word: embedding}}}``.
        timedates: np.ndarray
            Array of time values, aligned with ``embeddings``.
        attributes: np.ndarray
            Array of attribute values, aligned with ``embeddings``.

        Returns
        -------
        Single dictionary of keyword embeddings indexed by timedate, attribute, keyword.
        The three levels are ``defaultdict`` instances, so indexing a missing key with
        ``[]`` creates it; use ``.get()`` for lookups without side effects.
        """
        grouped_embeddings = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
        for timedate, attribute, embedding in zip(timedates, attributes, embeddings):
            for keyword, contexts in embedding.items():
                for words in contexts.values():
                    # Only the embedding of the keyword itself is kept; a context
                    # that holds no entry for the keyword is skipped.
                    context_emb = words.get(keyword)
                    if context_emb is not None:
                        grouped_embeddings[timedate][attribute][keyword].append(context_emb)
        return grouped_embeddings

    def get_common_keywords(self, embeddings: dict, attribute_x: str,
                            attribute_y: str, timedate_x: str,
                            timedate_y: str) -> set:
        """Compute the set of common keywords between subcorporas at two timesteps and for two attributes.

        Parameters
        ----------
        embeddings: dict
            Embeddings indexed by timedate, attribute, keyword (see :meth:`group_embeddings`).
        attribute_x: str
            Attribute looked up at ``timedate_x``.
        attribute_y: str
            Attribute looked up at ``timedate_y``.
        timedate_x: str
            First timedate.
        timedate_y: str
            Second timedate.

        Returns
        -------
        Set of keywords present in both subcorporas. Empty when either timedate or
        attribute is missing from ``embeddings``.
        """
        # ``.get`` with an empty fallback keeps the lookup free of side effects on
        # defaultdict inputs and turns a missing subcorpus into "nothing in common".
        keywords_x = embeddings.get(timedate_x, {}).get(attribute_x, {})
        keywords_y = embeddings.get(timedate_y, {}).get(attribute_y, {})
        return set(keywords_x).intersection(keywords_y)

    def get_common_attributes(self, embeddings: dict, timedate_x: str,
                              timedate_y: str) -> set:
        """Compute the set of common attributes between subcorporas at two different timesteps.

        Parameters
        ----------
        embeddings: dict
            Embeddings indexed by timedate, attribute, keyword (see :meth:`group_embeddings`).
        timedate_x: str
            First timedate.
        timedate_y: str
            Second timedate.

        Returns
        -------
        Set of attributes present at both timedates. Empty when either timedate is
        missing from ``embeddings``.
        """
        attributes_x = embeddings.get(timedate_x, {})
        attributes_y = embeddings.get(timedate_y, {})
        return set(attributes_x).intersection(attributes_y)

    def get_group(self, keyword, groups, group_names):
        """Return the name of the first group containing a keyword.

        Parameters
        ----------
        keyword
            Keyword to look up.
        groups
            Sequence of keyword collections.
        group_names
            Sequence of names, where ``group_names[i]`` names ``groups[i]``.

        Returns
        -------
        Name of the first matching group, or ``'NA'`` if no group contains the keyword.
        """
        for i, group in enumerate(groups):
            if keyword in group:
                return group_names[i]
        return 'NA'

    def group_output_sequential(self, similarities: pd.DataFrame, metric: str, groups: list, group_names: list) -> pd.DataFrame:
        """Group approaching-based semantic detection output for sequential mode.

        Parameters
        ----------
        similarities: pd.DataFrame
            DataFrame containing approaching-based semantic similarities (see `shift.py` classes).
            Must provide the columns ``keyword``, ``timedate``, ``attribute_x`` and ``attribute_y``.
        metric: str
            Metric name. Kept for interface compatibility: every numeric column
            (the metric column included) is averaged.
        groups: list
            When group_output is set to ``True``, use this parameter to group multiple keywords together. Expected format is a list of lists of keywords.
        group_names: list
            When group_output is set to ``True``, use this parameter to rename groups of keywords.

        Returns
        -------
        Grouped similarities for approaching-based semantic detection methods: one row per
        (group, timedate, label), where label is ``'<attribute_x> x <attribute_y>'``.
        An empty DataFrame when no groups are given.
        """
        frames = []
        for group, group_name in zip(groups, group_names):
            subset = similarities[similarities['keyword'].isin(group)]
            subset = subset.assign(label=subset['attribute_x'] + ' x ' + subset['attribute_y'])
            # ``mean`` takes ``numeric_only`` as its first positional argument, so it is
            # requested explicitly; non-numeric columns (keyword, attributes) are dropped.
            averaged = subset.groupby(['timedate', 'label']).mean(numeric_only=True).reset_index()
            averaged['group'] = group_name
            frames.append(averaged)
        if not frames:
            # ``pd.concat`` rejects an empty list.
            return pd.DataFrame()
        # Concatenate once instead of growing a DataFrame inside the loop.
        return pd.concat(frames)

    def group_output_fixed(self, similarities: pd.DataFrame, groups: list, group_names: list) -> Tuple[pd.DataFrame, list]:
        """Group approaching-based semantic detection output for fixed mode.

        Parameters
        ----------
        similarities: pd.DataFrame
            DataFrame containing approaching-based semantic similarities (see `shift.py` classes).
            Must provide a ``keyword`` column. It is not modified.
        groups: list
            When group_output is set to ``True``, use this parameter to group multiple keywords together. Expected format is a list of lists of keywords.
        group_names: list
            When group_output is set to ``True``, use this parameter to rename groups of keywords.

        Returns
        -------
        Tuple of grouped similarities for approaching-based semantic detection methods and ordered keywords list.
        The DataFrame is a copy of ``similarities`` with an extra ``group`` column
        (``'NA'`` for keywords that belong to no group).
        """
        ordered_keywords = [word for group in groups for word in group]
        group_labels = similarities['keyword'].apply(lambda x: self.get_group(x, groups, group_names))
        # ``assign`` returns a new DataFrame, leaving the caller's data untouched.
        return similarities.assign(group=group_labels), ordered_keywords

    def compute_metric_for_timepair(self, embeddings, timedate_source, timedate_target, attribute_x, attribute_y, keywords_strategy, embeddings_strategy, metric_name) -> list:
        """Generic computation for metrics between embeddings over time.

        Parameters
        ----------
        embeddings
            Embeddings container, passed unchanged to both strategies
            (presumably the output of :meth:`group_embeddings`; confirm against callers).
        timedate_source: str
            Source timedate
        timedate_target: str
            Target timedate
        attribute_x: str
            First attribute
        attribute_y: str
            Second attribute
        keywords_strategy
            Strategy for selecting keyword intersection at different timedates. Called as
            ``keywords_strategy(embeddings, attribute_x, attribute_y, timedate_source, timedate_target)``.
        embeddings_strategy
            Strategy for selecting embeddings at different timedates. Called with the same
            arguments plus the keyword; must return two pairs of embedding collections.
        metric_name: str
            Metric name, used as the key of the computed value in each output row.

        Returns
        -------
        List of dictionaries with metric information for all keywords. Each row is stamped
        with the target timedate, and the metric is the average pairwise similarity of the
        first pair minus that of the second pair.
        """
        rows = []
        common_keywords = keywords_strategy(embeddings, attribute_x, attribute_y, timedate_source, timedate_target)
        for keyword in common_keywords:
            (vec_a, vec_b), (vec_c, vec_d) = embeddings_strategy(embeddings, attribute_x, attribute_y, timedate_source, timedate_target, keyword)
            psim_first = average_pairwise_similarity(vec_a, vec_b)
            psim_second = average_pairwise_similarity(vec_c, vec_d)
            rows.append({
                'timedate': timedate_target,
                'attribute_x': attribute_x,
                'attribute_y': attribute_y,
                'keyword': keyword,
                metric_name: psim_first - psim_second
            })
        return rows