# Source code for compshs.text.feature_selection

"""
Created in 2025
@author: Simon Delarue <simon.delarue@telecom-paris.fr>
"""
import pandas as pd
import re
import scattertext as st
from scattertext import AssociationCompactor
import spacy
from spacy.tokens import Doc
from tqdm import tqdm

from compshs.text.base import BaseText


class FeatureSelection(BaseText):
    """Feature selection.

    Converts a corpus of documents, each labelled with an attribute, into a
    ``scattertext`` unigram corpus compacted with
    :class:`scattertext.AssociationCompactor`.

    Attributes
    ----------
    formatted_corpus
        Corpus produced by the last call to :meth:`transform`
        (``None`` until :meth:`transform` has been called).
    """

    def __init__(self):
        super().__init__()
        self.formatted_corpus = None

    def get_df_from_corpus(self, corpus: list, attributes: list) -> pd.DataFrame:
        """Convert a list of documents with attribute information into a pandas DataFrame object.

        Parameters
        ----------
        corpus: list
            List of documents.
        attributes: list
            List (with same length as ``corpus``) of attributes.

        Returns
        -------
        :class:`pd.DataFrame()`
            DataFrame with one row per document and columns ``doc_id``
            (position of the document in ``corpus``), ``txt`` and ``attribute``.
            The three columns are present even when ``corpus`` is empty.

        Raises
        ------
        ValueError
            If ``corpus`` and ``attributes`` do not have the same length.
        """
        # Materialise the inputs so that arbitrary iterables are accepted and
        # their lengths can be compared before pairing them.
        corpus = list(corpus)
        attributes = list(attributes)
        if len(corpus) != len(attributes):
            # zip() would silently drop the unmatched tail otherwise.
            raise ValueError(
                "corpus and attributes must have the same length, "
                f"got {len(corpus)} and {len(attributes)}."
            )

        rows = [
            {'doc_id': i, 'txt': txt, 'attribute': attribute}
            for i, (txt, attribute) in tqdm(
                enumerate(zip(corpus, attributes)), total=len(corpus)
            )
        ]
        # Explicit columns keep the schema stable for an empty corpus.
        return pd.DataFrame(rows, columns=['doc_id', 'txt', 'attribute'])

    def spacy_doc_from_txt(self, txt: str, input_type: str = 'words') -> Doc:
        r"""Create a Spacy :class:`Doc()` from text content.

        Note: Words with length :math:`\leq` 2 are filtered out.

        Parameters
        ----------
        txt: str
            Text content to convert.
        input_type: str
            - ``'words'``: ``txt`` contains only words separated by whitespaces.
              Useful in case of preprocessed text. The whole document is
              considered as a single sentence.
            - ``'sentences'``: ``txt`` contains sentences ending with ``.``,
              ``!`` or ``?`` followed by one or more spaces. Useful in case of
              raw text.

        Returns
        -------
        Spacy :class:`Doc()`.

        Raises
        ------
        ValueError
            If ``input_type`` is neither ``'words'`` nor ``'sentences'``.
        """
        if input_type not in ('words', 'sentences'):
            raise ValueError(
                f"input_type must be 'words' or 'sentences', got {input_type!r}."
            )

        # Only the vocabulary of the blank pipeline is needed; build it once
        # per instance instead of once per document.
        nlp = getattr(self, '_nlp', None)
        if nlp is None:
            nlp = spacy.blank("en")
            self._nlp = nlp

        words = []
        sent_starts = []
        if input_type == 'sentences':
            for sent in re.split(r'(?<=[.!?]) +', txt):
                kept = [word for word in sent.split() if len(word) > 2]
                words.extend(kept)
                # Flag the first *kept* word, so a sentence whose first word
                # was filtered out still gets a sentence start.
                sent_starts.extend(k == 0 for k in range(len(kept)))
        else:
            words = [word for word in txt.split() if len(word) > 2]
            # The whole document is considered as a sentence.
            sent_starts = [k == 0 for k in range(len(words))]

        # Every token is followed by a whitespace, except the last one.
        spaces = [True] * len(words)
        if spaces:
            spaces[-1] = False

        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        for token, is_start in zip(doc, sent_starts):
            token.is_sent_start = is_start

        return doc

    def transform(self, corpus: list, attributes: list, max_tokens: int = 2000, input_type: str = 'words'):
        """Transform corpus of documents into `scattertext` format using attribute information.

        Parameters
        ----------
        corpus: list
            List of documents.
        attributes: list
            List (with same length as ``corpus``) of attributes.
        max_tokens: int
            Maximum number of tokens to keep.
        input_type: str
            - ``'words'``: each document contains only words separated by
              whitespaces. Useful in case of preprocessed text.
            - ``'sentences'``: each document contains sentences ending with
              ``.``, ``!`` or ``?`` followed by one or more spaces. Useful in
              case of raw text.

        Returns
        -------
        :class:`scattertext` corpus.
            The same object is also stored in ``self.formatted_corpus``.
        """
        print('Transforming data...')
        df_corpus = self.get_df_from_corpus(corpus, attributes)

        # convert corpus to Spacy Doc
        tqdm.pandas()
        # Forward ``input_type`` explicitly; passing the bound method alone
        # would always parse with the default ``'words'`` mode.
        df_corpus['parsed'] = df_corpus.txt.progress_apply(
            lambda txt: self.spacy_doc_from_txt(txt, input_type=input_type)
        )

        unigram_corpus = st.CorpusFromParsedDocuments(
            df_corpus,
            category_col='attribute',
            parsed_col='parsed'
        ).build(
            show_progress=True
        ).get_unigram_corpus(
        ).compact(AssociationCompactor(max_tokens))
        print('Done!')

        self.formatted_corpus = unigram_corpus

        return unigram_corpus