Source code for compshs.data.parse

"""
Created in 2025
@author: Simon Delarue <simon.delarue@telecom-paris.fr>
"""
from contextlib import contextmanager
import sqlite3
from glob import glob
import os
from typing import Optional, Tuple

from compshs.data import Dataset
from compshs.utils import check_sql_identifiers, check_exist_table_name, check_exist_column_name


[docs]def from_directory(directory_path: str, dataset_name: str = None) -> Dataset: """Load a corpus from a directory containing text files. Parameters ---------- directory_path: str Path to the directory containing text files (.txt). dataset_name: str Name of the returned Dataset. Directory name is used if not specified. Returns ------- dataset: :class:`Dataset` Example ------- >>> directory_path = 'path_to_txt_files' >>> dataset = from_directory(directory_path) >>> dataset.name 'path_to_txt_files' """ directory_path = os.path.expanduser(directory_path) txt_files = glob(os.path.join(directory_path, "*.txt")) corpus = [] for file_path in txt_files: with open(file_path, 'r', encoding='utf-8') as f: corpus.append(f.read()) if dataset_name is None: dataset_name = os.path.basename(os.path.normpath(directory_path)) return Dataset(name=dataset_name, corpus=corpus)
@contextmanager def get_db_connection(database_path: str): conn = sqlite3.connect(os.path.expanduser(database_path)) try: yield conn finally: conn.close()
[docs]def from_sql(database_path: str, dataset_name: str, table_name: str, document_column_name: str, document_name_field: str, document_text_field: str, filter_field: Optional[str] = None, excluded_value: Optional[str] = None, included_value: Optional[str] = None) -> Dataset: """Load a corpus from a sqlite database where documents are stored as JSON entries. Parameters ---------- database_path: str Path to the `.db` file. dataset_name: str Name of the returned Dataset. table_name: str Name of the database table containing JSON documents. document_column_name: str Name of the table column containing JSON documents. document_name_field: str Name of the JSON field containing document label. document_text_field: str Name of the JSON field containing document textual content. filter_field: str, optional JSON key to filter by (default=None). excluded_value: str, optional Value to exclude from filtering the filter_field key (default=None). included_value: str, optional Value to include from filtering the filter_field key (default=None). Returns ------- :class:`Dataset` """ (table_name, document_column_name, document_name_field, document_text_field ) = check_sql_identifiers(( table_name, document_column_name, document_name_field, document_text_field)) with get_db_connection(database_path) as conn: cursor = conn.cursor() check_exist_table_name(conn, table_name) check_exist_column_name(conn, table_name, document_column_name) query, params = get_query(table_name, document_column_name, document_name_field, document_text_field, filter_field, excluded_value, included_value) # Execute query cursor.execute(query, params) corpus = cursor.fetchall() conn.close() if dataset_name is None: dataset_name = table_name return Dataset(name=dataset_name, corpus=corpus)
[docs]def get_query(table_name: str, document_column_name: str, document_name_field: str, document_text_field: str, filter_field: Optional[str] = None, excluded_value: Optional[str] = None, included_value: Optional[str] = None) -> Tuple: """ Generate a parametrized SQL query. Parameters ---------- table_name: str Name of the database table containing JSON documents. document_column_name: str Name of the table column containing JSON documents. document_name_field: str Name of the JSON field containing document label. document_text_field: str Name of the JSON field containing document textual content. filter_field: str, optional JSON key to filter by (default=None). excluded_value: str, optional Value to exclude from filtering the filter_field key (default=None). included_value: str, optional Value to include from filtering the filter_field key (default=None). Returns ------- tuple Tuple of query and parameters. """ if included_value is not None and excluded_value is not None: raise ValueError('Either included_value or excluded_value should be used, not both.') query = f''' SELECT json_extract("{document_column_name}", '$.{document_name_field}') AS document_name, json_extract("{document_column_name}", '$.{document_text_field}') AS txt FROM "{table_name}" ''' params = () if filter_field and excluded_value is not None: excluded_value = excluded_value.lower() query += f''' WHERE json_extract({document_column_name}, '$.{filter_field}') IS NULL OR LOWER(json_extract({document_column_name}, '$.{filter_field}')) != ? ''' params = (excluded_value,) if filter_field and included_value is not None: included_value = included_value.lower() query += f''' WHERE json_extract({document_column_name}, '$.{filter_field}') IS NULL OR LOWER(json_extract({document_column_name}, '$.{filter_field}')) = ? ''' params = (included_value,) return query, params