Source code for pylexique.pylexique

# -*- coding: utf-8 -*-

"""Main module of pylexique."""

from collections import OrderedDict, defaultdict
from collections.abc import Sequence
import pkg_resources
import json
from math import isnan
# import faster_than_csv as csv
import csv
from csv import reader
import pandas as pd
from dataclasses import dataclass
from typing import DefaultDict, Dict, List, Optional, Tuple, Union, Generator, Any, Iterator

__all__ = ['Lexique383', 'LexItem', 'LexEntryTypes']

try:
    from utils import logger
except (ModuleNotFoundError, ImportError):
    from .utils import logger

_RESOURCE_PACKAGE = __name__

HOME_PATH = '/'.join(('Lexique', ''))
_RESOURCE_PATH_csv = pkg_resources.resource_filename(_RESOURCE_PACKAGE, 'Lexique383/Lexique383.txt')
_VALUE_ERRORS_PATH = pkg_resources.resource_filename(_RESOURCE_PACKAGE, 'errors/value_errors.json')
_LENGTH_ERRORS_PATH = pkg_resources.resource_filename(_RESOURCE_PACKAGE, 'errors/length_errors.json')

LEXIQUE383_FIELD_NAMES = ['ortho', 'phon', 'lemme', 'cgram', 'genre', 'nombre', 'freqlemfilms2', 'freqlemlivres',
                          'freqfilms2',
                          'freqlivres', 'infover', 'nbhomogr', 'nbhomoph', 'islem', 'nblettres', 'nbphons', 'cvcv',
                          'p_cvcv',
                          'voisorth', 'voisphon', 'puorth', 'puphon', 'syll', 'nbsyll', 'cv_cv', 'orthrenv', 'phonrenv',
                          'orthosyll', 'cgramortho', 'deflem', 'defobs', 'old20', 'pld20', 'morphoder', 'nbmorph']

ConvertedRow = Tuple[str, str, str, str, str, str, float, float, float, float, str, int, int, bool,
                     int, int, str, str, int, int, int, int, str, int, str, str, str, str, str, float,
                     int, float, float, str, int]
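
# ConvertedRow mirrors LEXIQUE383_FIELD_NAMES positionally: 35 types, one per field
# (e.g. index 0 is 'ortho' -> str, index 6 is 'freqlemfilms2' -> float,
# index 13 is 'islem' -> bool).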


@dataclass(init=True, repr=False, eq=True, order=False, unsafe_hash=False, frozen=True)
class LexEntryTypes:
    """
    Type information about all the lexical attributes in a LexItem object.
    """
    ortho: str
    phon: str
    lemme: str
    cgram: str
    genre: str
    nombre: str
    freqlemfilms2: float
    freqlemlivres: float
    freqfilms2: float
    freqlivres: float
    infover: str
    nbhomogr: int
    nbhomoph: int
    islem: bool
    nblettres: int
    nbphons: int
    cvcv: str
    p_cvcv: str
    voisorth: int
    voisphon: int
    puorth: int
    puphon: int
    syll: str
    nbsyll: int
    cv_cv: str
    orthrenv: str
    phonrenv: str
    orthosyll: str
    cgramortho: str
    deflem: float
    defobs: int
    old20: float
    pld20: float
    morphoder: str
    nbmorph: int


@dataclass(init=True, repr=False, eq=True, order=False, unsafe_hash=False, frozen=True)
class LexItem(LexEntryTypes):
    """
    | This class defines the lexical items in Lexique383.
    | It uses slots for memory efficiency.
    """
    _s = LEXIQUE383_FIELD_NAMES
    __slots__ = _s

    def __repr__(self) -> str:
        return '{0}({1}, {2}, {3})'.format(self.__class__.__name__, self.ortho, self.lemme, self.cgram)

    def to_dict(self) -> Dict[str, Union[str, float, int, bool]]:
        """
        | Converts the LexItem to a dict containing its attributes and their values.

        :return: OrderedDict. Dictionary mapping each LexItem attribute to its value.
        :raises: AttributeError.
        """
        attributes = []
        for attr in self.__slots__:
            try:
                value = getattr(self, attr)
            except AttributeError as e:
                logger.warning(e)
                continue
            attributes.append((attr, value))
        result = OrderedDict(attributes)
        return result
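
# Illustrative sketch (not executed at import): reading attributes from a LexItem
# and dumping it to a dict. 'chien' is an assumed headword; a homograph would come
# back from the lexique dict as a list of LexItems instead of a single one.
# >>> lexicon = Lexique383()
# >>> item = lexicon.lexique['chien']
# >>> item.ortho, item.lemme
# ('chien', 'chien')
# >>> item.to_dict()  # OrderedDict with all 35 fields: 'ortho', 'phon', 'lemme', ...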


class Lexique383:
    """
    This is the class handling the lexique database.
    It provides methods for interacting with the Lexique DB and retrieving lexical items.
    All the lexical items are stored in an OrderedDict.

    :param lexique_path: string. Path to the lexique file.
    :param parser_type: string. 'pandas_csv' and 'csv' are valid values. 'csv' is the default value.
    :cvar lexique: Dictionary containing all the LexItem objects indexed by orthography.
    :cvar lemmes: Dictionary containing all the LexItem objects indexed by lemma.
    :cvar anagrams: Dictionary containing all the LexItem objects indexed by anagram form.
    """
    lexique: Dict[str, Any] = OrderedDict()
    value_errors: List[Any] = []
    length_errors: List[Any] = []
    lemmes: Dict[str, List[LexItem]] = defaultdict(list)
    anagrams: Dict[str, List[LexItem]] = defaultdict(list)

    def __init__(self, lexique_path: Optional[str] = None, parser_type: str = 'csv') -> None:
        self.lexique_path = lexique_path
        if parser_type not in {'pandas_csv', 'csv'}:
            raise ValueError(f"The value {parser_type} is not permitted. Only 'pandas_csv' and 'csv' are valid values.")
        if lexique_path:
            if not isinstance(lexique_path, str):
                raise TypeError(f"Argument 'lexique_path' must be of type String, not {type(lexique_path)}")
            try:
                self._parse_lexique(lexique_path, parser_type)
            except UnicodeDecodeError as e:
                raise UnicodeError(f"There was a unicode error while parsing {lexique_path}.") from e
            except FileNotFoundError as e:
                raise ValueError("Argument 'lexique_path' must be a valid path to Lexique383") from e
        else:
            try:
                # Tries to load the pre-shipped Lexique38X if no path to the lexicon is provided.
                self._parse_lexique(_RESOURCE_PATH_csv, parser_type)
            except UnicodeDecodeError as e:
                raise UnicodeError(f"There was a unicode error while parsing {_RESOURCE_PATH_csv}.") from e
            except FileNotFoundError as e:
                raise ValueError("Argument 'lexique_path' must be a valid path to Lexique383") from e
        return

    def __repr__(self) -> str:
        return '{0}.{1}'.format(__name__, self.__class__.__name__)

    def __len__(self) -> int:
        return len(self.lexique)

    @staticmethod
    def _parse_csv(lexique_path: str) -> Generator[list, Any, None]:  # type: ignore[type-arg]
        """
        :param lexique_path: string. Path to the lexique file.
        :return: generator of rows: Content of the Lexique38x database.
        """
        with open(lexique_path, 'r', encoding='utf-8', errors='ignore') as csv_file:
            raw_content = csv_file.readlines()
        # Skip the header row and split each remaining line on tabs.
        content = (row.strip().split('\t') for row in raw_content[1:])
        return content

    def _parse_lexique(self, lexique_path: str, parser_type: str) -> None:
        """
        | Parses the given lexique file and creates 2 hash tables to store the data.

        :param lexique_path: string. Path to the lexique file.
        :param parser_type: string. Can be either 'csv' or 'pandas_csv'.
        :return:
        """
        try:
            if parser_type == 'pandas_csv':
                df = pd.read_csv(lexique_path, delimiter='\t')
                content = (list(row) for row in df.values)
            else:
                content = self._parse_csv(lexique_path)
        except UnicodeDecodeError:
            logger.warning(f"There was an issue while parsing the file {lexique_path}."
                           f" Trying again with the built-in csv parser.")
            content = self._parse_csv(lexique_path)
        self._create_db(content)
        if self.value_errors:
            self._save_errors(self.value_errors, _VALUE_ERRORS_PATH)
        if self.length_errors:
            self._save_errors(self.length_errors, _LENGTH_ERRORS_PATH)
        return

    def _create_db(self, lexicon: Generator[list, Any, None]) -> None:  # type: ignore[type-arg]
        """
        | Creates 2 hash tables populated with the entries in lexique if they do not exist yet.
        | One hash table holds the LexItems, the other holds the same data but grouped by lemma
        | to give access to all lexical forms of a word.

        :param lexicon: Iterable. Iterable containing the lexique383 entries.
        :return:
        """
        for row in lexicon:
            try:
                converted_row_fields = self._convert_entries(row)
            except ValueError:
                continue
            lexical_entry = LexItem(*converted_row_fields)
            self.lemmes[lexical_entry.lemme].append(lexical_entry)
            sorted_form = ''.join(sorted(lexical_entry.ortho))
            self.anagrams[sorted_form].append(lexical_entry)
            ortho = converted_row_fields[0]
            if ortho in self.lexique and not isinstance(self.lexique[ortho], list):
                # First homograph encountered: promote the single LexItem to a list.
                self.lexique[ortho] = [self.lexique[ortho]]
                self.lexique[ortho].append(lexical_entry)
            elif ortho in self.lexique and isinstance(self.lexique[ortho], list):
                self.lexique[ortho].append(lexical_entry)
            else:
                self.lexique[ortho] = lexical_entry
        return
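
    # Resulting shape: self.lexique maps an orthography to a single LexItem when it
    # is unique, and to a list of LexItems for homographs; self.lemmes and
    # self.anagrams always map their keys to lists of LexItems.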

    def _convert_entries(self, row_fields: Union[List[str], List[Union[str, float, int, bool]]]) -> ConvertedRow:
        """
        | Converts entries from `strings` to `int`, `bool` or `float` and generates
        | a new list with typed entries.

        :param row_fields: List of column entries representing a row.
        :return: ConvertedRow: List of typed column entries representing a typed row.
        """
        errors = defaultdict(list)
        converted_row_fields = []
        for attr, value in zip(LEXIQUE383_FIELD_NAMES, row_fields):
            if isinstance(value, float) and isnan(value):
                value = ''
            if attr in {'freqlemfilms2', 'freqlemlivres', 'freqfilms2', 'freqlivres', 'old20', 'pld20'}:
                if not isinstance(value, float):
                    # Lexique383 uses comma decimal separators; normalize before casting.
                    if value not in ('', ' ') and ',' in value:
                        value = value.replace(',', '.')
                    value = float(value)
            if attr == 'islem':
                if isinstance(value, str):
                    value = value.strip()
                if value != '' and value not in ('0', '1', 0, 1):
                    value = 0
                try:
                    value = bool(int(value))
                except ValueError:
                    errors[row_fields[0]].append({attr: value})
                    self.value_errors.append(errors)
            if attr in {'nbhomogr', 'nbhomoph', 'nblettres', 'nbphons', 'voisorth', 'voisphon',
                        'puorth', 'puphon', 'nbsyll'}:
                if value not in ('', ' '):
                    try:
                        value = int(value)
                    except ValueError:
                        errors[row_fields[0]].append({attr: value})
                        self.value_errors.append(errors)
            converted_row_fields.append(value)
        if len(converted_row_fields) != 35:
            self.length_errors.append((converted_row_fields, row_fields))
            raise ValueError
        return converted_row_fields  # type: ignore[return-value]
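
    # Coercion sketch (illustrative values): a frequency read as the string '1,52'
    # becomes the float 1.52, 'islem' flags '0'/'1' become False/True, and count
    # columns such as 'nblettres' become ints; values that resist coercion are
    # recorded in value_errors rather than aborting the row.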

    def get_lex(self, words: Union[Tuple[str, ...], str]) -> Dict[str, Union[LexItem, List[LexItem]]]:
        """
        Recovers the lexical entries for the words in the sequence.

        :param words: A string or a tuple of multiple strings for getting the LexItems for multiple words.
        :return: Dictionary of LexItems.
        :raises: TypeError.
        """
        results = OrderedDict()
        if isinstance(words, str):
            try:
                results[words] = self.lexique[words.lower()]
            except KeyError:
                logger.warning('The word {} is not in Lexique383'.format(words))
        elif isinstance(words, Sequence):
            for word in words:
                if isinstance(word, str):
                    try:
                        results[word] = self.lexique[word.lower()]
                    except KeyError:
                        logger.warning('The word {} is not in Lexique383\n'.format(word))
                        continue
                else:
                    logger.warning('{} is not a valid string'.format(word))
                    raise TypeError
        else:
            raise TypeError
        return results

    def get_all_forms(self, word: str) -> List[LexItem]:
        """
        Gets all lexical forms of a given word.

        :param word: String.
        :return: List of LexItem objects sharing the same root lemma.
        :raises: ValueError.
        :raises: TypeError.
        """
        try:
            lex_entry = self.lexique[word.lower()]
        except KeyError as e:
            logger.warning('The word {} is not in Lexique383\n'.format(word))
            raise ValueError from e
        if isinstance(lex_entry, LexItem):
            lemmes = self.lemmes[lex_entry.lemme]
        elif isinstance(lex_entry, OrderedDict):
            lemmes = self.lemmes[lex_entry['lemme']]
        elif isinstance(lex_entry, list):
            # Homographs: collect the forms of every distinct lemma.
            distinct = {elmt.lemme for elmt in lex_entry}
            lemmes = []
            for lemme in distinct:
                lemmes.extend(self.lemmes[lemme])
        else:
            raise TypeError
        return lemmes

    def get_anagrams(self, word: str) -> List[LexItem]:
        """
        Gets all the anagrams of a given word.

        :param word: String.
        :return: List of LexItem objects which are anagrams of the given word.
        :raises: ValueError.
        :raises: TypeError.
        """
        try:
            lex_entry = self.lexique[word.lower()]
        except KeyError as e:
            logger.warning('The word {} is not in Lexique383\n'.format(word))
            raise ValueError from e
        if isinstance(lex_entry, LexItem):
            sorted_form = ''.join(sorted(lex_entry.ortho))
        elif isinstance(lex_entry, OrderedDict):
            sorted_form = ''.join(sorted(lex_entry['ortho']))
        elif isinstance(lex_entry, list):
            sorted_form = ''.join(sorted(lex_entry[0].ortho))
        else:
            raise TypeError
        anagrams = self.anagrams[sorted_form]
        # Exclude the query word itself from its own anagram list.
        final_anagrams = [lex_item for lex_item in anagrams if lex_item.ortho != word.lower()]
        return final_anagrams

    @staticmethod
    def _save_errors(errors: Union[List[Tuple[List[Union[str, float, int, bool]], List[str]]],
                                   List[DefaultDict[str, List[Dict[str, str]]]]],
                     errors_path: str) -> None:
        """
        Saves the mismatched key/values in Lexique383 based on type coercion.

        :param errors: List of errors encountered while parsing Lexique38x.
        :param errors_path: Path to save the errors.
        :return:
        """
        with open(errors_path, 'w', encoding='utf-8') as json_file:
            json.dump(errors, json_file, indent=4)
        return


if __name__ == "__main__":
    pass
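    # Minimal usage sketch (illustrative only; assumes the bundled Lexique383
    # resource is present and that the sample words occur in the database):
    # lexicon = Lexique383()
    # lexicon.get_lex(('avions', 'courent'))  # dict mapping each word to its LexItem(s)
    # lexicon.get_all_forms('avions')         # every form sharing a lemma with 'avions'
    # lexicon.get_anagrams('chien')           # LexItems whose letters rearrange those of 'chien'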