# Source code for compshs.text.tests.test_preprocess
import unittest
from compshs.text.preprocess import Preprocess
class TestPreprocess(unittest.TestCase):
    """Unit tests for :class:`compshs.text.preprocess.Preprocess`.

    Every test starts from a freshly fitted preprocessor using the
    ``'en_core_web_sm'`` language model and a small two-document corpus.
    """

    @staticmethod
    def _fitted_preprocessor(**options):
        """Build a ``Preprocess`` with *options* and call ``fit()`` on it.

        Args:
            **options: Extra keyword arguments forwarded to ``Preprocess``
                (e.g. ``exclude_stop_words``, ``lemmatize``, ``batch_size``).

        Returns:
            The preprocessor instance after ``fit()`` has been called.
        """
        preprocessor = Preprocess(lang='en_core_web_sm', **options)
        preprocessor.fit()
        return preprocessor

    def setUp(self):
        """Create the default fitted preprocessor and the shared corpus."""
        self.preprocessor = self._fitted_preprocessor()
        self.corpus = ['The quick brown fox.', 'The dog is lazier.']

    def test_fit(self):
        """``fit()`` must leave a non-``None`` model on the ``nlp`` attribute."""
        self.assertIsNotNone(
            self.preprocessor.nlp,
            'Model should be loaded after calling .fit().',
        )

    def test_transform(self):
        """Check ``transform()`` under the default and each toggled option.

        Each scenario runs in its own ``subTest`` so that a failure in one
        configuration is reported without hiding the results of the others.
        Scenario-specific preprocessors are kept in local variables so the
        ``self.preprocessor`` fixture is never overwritten mid-test.
        """
        with self.subTest('default settings'):
            result = self.preprocessor.transform(self.corpus)
            self.assertEqual(len(result), 2)
            for doc in result:
                # Every document is expected to be a sequence of plain strings
                # with the stop word, the punctuation and the unlemmatized
                # form all absent.
                self.assertTrue(all(isinstance(token, str) for token in doc))
                self.assertNotIn('is', doc)
                self.assertNotIn('.', doc)
                self.assertNotIn('lazier', doc)

        with self.subTest('including stopwords'):
            preprocessor = self._fitted_preprocessor(exclude_stop_words=False)
            result = preprocessor.transform(self.corpus)
            self.assertIn('the', result[0])

        with self.subTest('punctuation included'):
            preprocessor = self._fitted_preprocessor(exclude_punctuation=False)
            result = preprocessor.transform(self.corpus)
            self.assertIn('.', result[0])

        with self.subTest('without lemmatization'):
            preprocessor = self._fitted_preprocessor(lemmatize=False)
            result = preprocessor.transform(self.corpus)
            self.assertIn('lazier', result[1])

        with self.subTest('empty corpus'):
            # Uses the default fixture rather than whichever configuration
            # happened to be built last.
            result = self.preprocessor.transform([])
            self.assertEqual(result, [])

        with self.subTest('batch size'):
            preprocessor = self._fitted_preprocessor(batch_size=2)
            result = preprocessor.transform(self.corpus)
            self.assertEqual(len(result), len(self.corpus))