Source code for compshs.text.preprocess

from collections import defaultdict
import re
from tqdm.auto import tqdm

from compshs.text.base import BaseText
from compshs.utils import load_lang


class Preprocess(BaseText):
    """Preprocessing of a corpus of documents.

    Parameters
    ----------
    lang: str
        Spacy language model name (``'en_core_web_sm'``).
    exclude_stop_words: bool
        If ``True``, exclude stopwords (default).
    exclude_punctuation: bool
        If ``True``, exclude punctuation (default).
    exclude_numbers: bool
        If ``True``, exclude numbers and currency symbols (default).
    lemmatize: bool
        If ``True``, lemmatize tokens (default).
    batch_size: int
        Number of documents to process in each batch (default = 10).
    chunk_size: int
        Maximum length of a piece of text. Beyond this length, the document is divided into chunks
        (default = 500000).

    Attributes
    ----------
    nlp:
        Spacy model built upon the ``lang`` parameter. ``None`` until :meth:`fit` has been called.
    """

    # Patterns are compiled once at class level because ``_clean_text`` runs on every document.
    _INVISIBLE_SPACES = re.compile(r'[\u200b\xa0]')
    _URL = re.compile(r"http\S+")
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, lang: str = 'en_core_web_sm', exclude_stop_words: bool = True,
                 exclude_punctuation: bool = True, exclude_numbers: bool = True, lemmatize: bool = True,
                 batch_size: int = 10, chunk_size: int = 500000):
        super().__init__()
        self.lang = lang
        self.exclude_stop_words = exclude_stop_words
        self.exclude_punctuation = exclude_punctuation
        self.exclude_numbers = exclude_numbers
        self.lemmatize = lemmatize
        self.batch_size = batch_size
        self.chunk_size = chunk_size
        self.nlp = None

    def _chunk_document(self, document: str) -> list:
        """Chunk a document into a list of subdocuments.

        Cuts are placed on a space whenever the window contains one, so that a word is not split
        into two meaningless tokens. A hard cut at ``chunk_size`` is used only when the window holds
        no usable space. Concatenating the chunks always gives back the original document.

        Parameters
        ----------
        document: str
            Textual document.

        Returns
        -------
        list
            List of subdocuments, each of maximal length equals to ``chunk_size``. Empty list if the
            document is empty.

        Raises
        ------
        ValueError
            If ``chunk_size`` is not a positive integer.
        """
        size = self.chunk_size
        if size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {size!r}.")

        chunks = []
        start, length = 0, len(document)
        while start < length:
            end = start + size
            if end < length and document[end] != " ":
                # The hard limit falls inside a word: back up to the last space of the window.
                # ``space > start`` guarantees a non-empty chunk, hence progress.
                space = document.rfind(" ", start, end)
                if space > start:
                    end = space
            chunks.append(document[start:end])
            start = end
        return chunks

    def _clean_text(self, text: str) -> str:
        """Normalise raw text before tokenisation.

        Steps, in order: re-join words hyphenated across a line break, turn zero-width and
        non-breaking spaces into regular spaces, remove URLs, collapse every run of whitespace into
        a single space and strip both ends.

        Parameters
        ----------
        text: str
            Raw text.

        Returns
        -------
        str
            Cleaned text.
        """
        text = text.replace("-\n", "")
        text = self._INVISIBLE_SPACES.sub(' ', text)
        text = self._URL.sub("", text)
        # Whitespace is collapsed last: the two substitutions above can leave runs of spaces behind.
        text = self._WHITESPACE.sub(' ', text)
        return text.strip()

    def _is_excluded(self, token) -> bool:
        """Tell whether a Spacy token must be dropped given the exclusion flags of the instance."""
        if token.is_space:
            # Whitespace tokens carry no content and would only add stray blanks to the output.
            return True
        if self.exclude_stop_words and token.is_stop:
            return True
        if self.exclude_punctuation and token.is_punct:
            return True
        return bool(self.exclude_numbers and (token.like_num or token.is_currency))

    def _extract_tokens(self, document) -> list:
        """Return the lowercase tokens (lemmas if ``lemmatize``) kept from a Spacy document."""
        if self.lemmatize:
            return [token.lemma_.lower() for token in document if not self._is_excluded(token)]
        return [token.text.lower() for token in document if not self._is_excluded(token)]

    def fit(self):
        """Fit algorithm to the data.

        Loads the language model designated by ``lang`` into ``nlp``.

        Returns
        -------
        self
        """
        self.nlp = load_lang(self.lang)
        return self

    def transform(self, corpus: list) -> list:
        """Preprocess corpus:

        - remove stopwords
        - remove punctuation
        - remove numbers
        - extract lemmatized tokens
        - set tokens in lowercase

        Each removal step depends on the corresponding flag of the instance. Whitespace tokens are
        always removed. If the language model has not been loaded yet, it is loaded through
        :meth:`fit` first.

        Parameters
        ----------
        corpus: list
            List of documents.

        Returns
        -------
        list
            List of preprocessed documents, one string of space-separated tokens per input document.
        """
        if self.nlp is None:
            self.fit()

        # Chunk large documents, remembering the document each chunk comes from.
        chunks = []
        owners = []
        for index, document in enumerate(corpus):
            for chunk in self._chunk_document(self._clean_text(document)):
                chunks.append(chunk)
                owners.append(index)

        # Preprocess chunks and merge their tokens back into their document, in order.
        document_tokens = [[] for _ in range(len(corpus))]
        pipeline = self.nlp.pipe(chunks, disable=['ner', 'parser'], batch_size=self.batch_size)
        # The progress bar comes first in ``zip`` so that it is exhausted (and closed) normally.
        for processed, index in zip(tqdm(pipeline, total=len(chunks), desc='preprocessing'), owners):
            document_tokens[index].extend(self._extract_tokens(processed))

        return [" ".join(tokens) for tokens in document_tokens]