Source code for compshs.visualization.plot

"""
Created in 2025
@author: Simon Delarue <simon.delarue@telecom-paris.fr>
"""
import os
from pathlib import Path
from typing import Union
import webbrowser

import altair as alt
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.lda_model
import scattertext as st
from scattertext import produce_scattertext_explorer
from scipy import sparse

from compshs.text import FrequencyCounter, TopicModeler
from compshs.utils import top_k


def plot_top_words(topic_modeler: TopicModeler, token_names: np.ndarray, k: int = 5, title: str = None) -> Figure:
    """Plot top :math:`k` tokens in each modeled topic.

    One horizontal bar chart is drawn per topic, all on a single row and
    sharing the same x-axis.

    Parameters
    ----------
    topic_modeler: :class:`TopicModeler`
        Fitted topic modeler. Its ``n_components`` attribute sets the number of
        subplots and ``get_word_contributions()`` provides one array of token
        contributions per topic.
    token_names: np.ndarray
        Array of token names, indexed with the indices returned by ``top_k``.
    k: int
        Number of tokens displayed per topic (default = 5).
    title: str
        Plot title. If ``None``, the name of the model is used.

    Returns
    -------
    Figure

    References
    ----------
    Scikit-learn documentation
    (see `<https://scikit-learn.org/stable/lite/lab/index.html>`_).
    """
    # squeeze=False always yields a 2-D array of Axes; without it a model with
    # a single topic returns a bare Axes object that has no ``flatten`` method.
    fig, axes = plt.subplots(
        1,
        topic_modeler.n_components,
        figsize=(30, 15),
        sharex=True,
        squeeze=False,
    )
    axes = axes.flatten()

    for topic_idx, words_contributions in enumerate(topic_modeler.get_word_contributions()):
        top_token_indices = top_k(words_contributions, k)
        top_token_names = token_names[top_token_indices]
        values = words_contributions[top_token_indices]

        ax = axes[topic_idx]
        ax.barh(top_token_names, values, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        # Keep only the bottom spine so the bars are not boxed in.
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)

    if title is None:
        title = topic_modeler.model_name
    fig.suptitle(title, fontsize=40)

    # Adjust the figure we created rather than whatever pyplot considers current.
    fig.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    return fig
def plot_pyLDA(topic_modeler: TopicModeler, matrix: Union[sparse.csr_matrix, np.ndarray], counter: FrequencyCounter):
    """Prepare an LDA visualization using the pyLDAvis library.

    Parameters
    ----------
    topic_modeler: :class:`TopicModeler`
        Fitted topic modeler. Its ``model_name`` must be ``'LDA'``; the
        underlying estimator is read from its ``modeler`` attribute.
    matrix: sparse.csr_matrix, np.ndarray
        Document term matrix (n_documents, n_words).
    counter: :class:`FrequencyCounter`
        Frequency counter whose ``vectorizer`` attribute is passed to pyLDAvis.

    Returns
    -------
    Visualization object returned by ``pyLDAvis.lda_model.prepare``.

    Raises
    ------
    ValueError
        If ``topic_modeler.model_name`` is not ``'LDA'``.
    """
    if topic_modeler.model_name != 'LDA':
        # ValueError subclasses Exception, so callers that caught the former
        # bare ``Exception`` keep working while getting an actionable message.
        raise ValueError(
            f"plot_pyLDA only supports 'LDA' topic modelers, got {topic_modeler.model_name!r}."
        )

    return pyLDAvis.lda_model.prepare(topic_modeler.modeler, matrix, counter.vectorizer)
def plot_ssta(similarities: pd.DataFrame):
    """Plot SSTA.

    Builds a grid of line charts (one column per ``attribute``, one row per
    ``group``), saves it as ``ssta_plot.html`` in the current working
    directory, prints the saved path and opens the file in the default web
    browser.

    Parameters
    ----------
    similarities: pd.DataFrame
        DataFrame of embedding similarities (such as :meth:`SSTA.transform()`
        output). The chart reads the columns ``timedate``, ``similarity``,
        ``group`` and ``attribute``.
    """
    lines = alt.Chart(similarities).mark_line(point=True).encode(
        x=alt.X('timedate:O', title=''),
        y=alt.Y('similarity:Q', title="Self similarity"),
        color=alt.Color('group:N').legend(None),
    )

    base_chart = alt.layer(lines).properties(
        width=150,
        height=70,
    )

    # Facet: sector (attribute) as columns, concept (group) as rows.
    chart = base_chart.facet(
        column=alt.Column('attribute:N', title=None),
        row=alt.Row('group:N', title=None),
    )

    output_path = os.path.abspath('ssta_plot.html')
    chart.save(output_path)
    print(f'Chart saved here : {output_path}')

    # Path.as_uri() produces a well-formed, percent-encoded file URI on every
    # platform; plain 'file://' + path is malformed for Windows drive paths.
    webbrowser.open(Path(output_path).as_uri())
def plot_sequential_approaching(similarities: pd.DataFrame, metric: str = 'sapp'):
    """Heatmap for approaching-based semantic shift results in sequential mode.

    Builds one heatmap per ``group`` (laid out three per row), saves the result
    as ``<metric>_plot_sequential.html`` in the current working directory,
    prints the saved path and opens the file in the default web browser.

    Parameters
    ----------
    similarities: pd.DataFrame
        Semantic shift detection results. The chart reads the columns
        ``timedate``, ``label``, ``group`` and the column named by ``metric``.
    metric: str
        Name of semantic shift detection metric to plot
        ({``'sapp'``, ``'asapp'``}).
    """
    base = alt.Chart(similarities).mark_rect(stroke='white', strokeWidth=1).encode(
        x=alt.X('timedate:O', title='Year'),
        y=alt.Y('label:N', title='', axis=alt.Axis(labelAngle=0, labelLimit=400, labelFontSize=10)),
        color=alt.Color(
            f'{metric}:Q',
            title=metric,
            scale=alt.Scale(scheme='redblue', type='sqrt', reverse=True),
        ),
        tooltip=['timedate', 'label', 'group', metric],
    ).properties(
        width=150,
        height=300,
    )

    # Facet grid per group
    facet = base.facet(
        facet=alt.Facet('group:N', title=''),
        columns=3,
    )

    output_path = os.path.abspath(f'{metric}_plot_sequential.html')
    facet.save(output_path)
    print(f'Chart saved here : {output_path}')

    # Path.as_uri() produces a well-formed, percent-encoded file URI on every
    # platform; plain 'file://' + path is malformed for Windows drive paths.
    webbrowser.open(Path(output_path).as_uri())
def plot_fixed_approaching(similarities: pd.DataFrame, metric: str, keyword_order: list, keyword_colors: dict):
    """Barchart for approaching-based semantic shift detection results for fixed mode.

    Builds a grid of bar charts (rows by ``attribute_x``, columns by
    ``attribute_y``), saves it as ``<metric>_plot_fixed.html`` in the current
    working directory, prints the saved path and opens the file in the default
    web browser.

    Parameters
    ----------
    similarities: pd.DataFrame
        Semantic shift detection results. The chart reads the columns
        ``keyword``, ``attribute_x``, ``attribute_y`` and the column named by
        ``metric``.
    metric: str
        Name of semantic shift detection metric to plot
        ({``'sapp'``, ``'asapp'``}).
    keyword_order: list
        Legend ordering (also used to sort the bars on the y-axis).
    keyword_colors: dict
        Dictionary with keywords as keys and corresponding colour as values
        (both str).
    """
    # Keys and values are taken in the same dict iteration order so that every
    # keyword in the colour domain lines up with its colour in the range.
    color_scale = alt.Scale(
        domain=list(keyword_colors.keys()),
        range=list(keyword_colors.values()),
    )

    bars = alt.Chart(similarities).mark_bar().encode(
        y=alt.Y('keyword:O', title='', sort=keyword_order),
        x=alt.X(f'{metric}:Q', title=metric),
        color=alt.Color("keyword:N", sort=keyword_order, scale=color_scale),
    )

    base_chart = alt.layer(bars).properties(
        width=250,
        height=230,
    )

    chart = base_chart.facet(
        row=alt.Row("attribute_x:N").title("Sector").header(labelAngle=0),
        column=alt.Column("attribute_y:N").title("Sector"),
        title=f'{metric} between fixed dates.',
    )

    output_path = os.path.abspath(f'{metric}_plot_fixed.html')
    chart.save(output_path)
    print(f'Chart saved here : {output_path}')

    # Path.as_uri() produces a well-formed, percent-encoded file URI on every
    # platform; plain 'file://' + path is malformed for Windows drive paths.
    webbrowser.open(Path(output_path).as_uri())
def plot_feature_selection(corpus, category, category_name, not_category_name, min_term_frequency):
    """Plot feature selection in html using ``ScatterText`` library.

    The explorer is rendered with a log-odds-ratio (uninformative Dirichlet
    prior) term significance and written, UTF-8 encoded, to
    ``feature_selection_plot.html`` in the current working directory.

    Parameters
    ----------
    corpus:
        ScatterText corpus forwarded to ``produce_scattertext_explorer``.
    category:
        Category of interest, forwarded as ``category``.
    category_name:
        Display name of the category, forwarded as ``category_name``.
    not_category_name:
        Display name for everything outside the category, forwarded as
        ``not_category_name``.
    min_term_frequency:
        Forwarded as ``minimum_term_frequency`` (presumably the minimum number
        of occurrences for a term to be plotted; see the ScatterText
        documentation).
    """
    html = produce_scattertext_explorer(
        corpus,
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        minimum_term_frequency=min_term_frequency,
        width_in_pixels=1000,
        term_significance=st.LogOddsRatioUninformativeDirichletPrior(),
    )

    filename = 'feature_selection_plot.html'
    # Context manager guarantees the handle is flushed and closed, even if the
    # write fails; the original left the file object to the garbage collector.
    with open(filename, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))