diff --git a/main.py b/main.py index 49839a9..752da34 100644 --- a/main.py +++ b/main.py @@ -1,30 +1,23 @@ #!/usr/bin/env python3 import sys -from typing import List from src.myontology import MyOntology from src.nlp import NLP from src.parse_tree.parse_tree import ParseTree -from src.speech import Speech from src.syntax import Syntax def _main(wav_file: str): - text: str = Speech().run_recognition(wav_file) - # text: str = 'как получить кредит на обучение' + # text: str = Speech().run_recognition(wav_file) + text: str = 'У меня кредит в другом банке. Можно ли его перевести в ваш банк? ' print(f'Text: {text}') parse_tree: ParseTree = Syntax().get_parse_tree(text) print(f'Parse tree:\n{parse_tree}') - nlp: NLP = NLP() - terms: List[str] = [] - terms.extend(nlp.get_adj_noun(parse_tree)) - terms.extend(nlp.get_nouns(parse_tree)) - if len(terms) == 0: - terms.extend(nlp.get_terms_by_template(parse_tree, ['NOUN', 'VERB', 'ADJ'])) + terms = NLP().get_terms(parse_tree) print(f'Extracted terms:\n{", ".join(terms)}') result: str = MyOntology().get_event_description(terms) - print(f'Test:\n{result}') + print(f'Result:\n{result}') if __name__ == '__main__': diff --git a/src/nlp.py b/src/nlp.py index 8cce3ad..cb1da1d 100644 --- a/src/nlp.py +++ b/src/nlp.py @@ -9,26 +9,48 @@ from src.parse_tree.parse_tree_node import ParseTreeNode class NLP: + + def __init__(self) -> None: + self.__model = ru_core_news_sm.load() + + def __lemmatizer(self, text: str): + doc = self.__model(text) + tokens = [token.lemma_ for token in doc] + return ' '.join(tokens) + @staticmethod def __get_nodes_by_template(tree: ParseTree, template: str) -> list: + msg_none = f'Can\'t find some terms by template {template}' + msg_success = f'Success terms extraction by template {template}' top = tree.get_tree_root() r = Resolver('upos') nodes: list = [] try: nodes.extend(r.glob(top, template)) except (ChildResolverError, ResolverError): - print(f'Can\'t find some terms by template {template}') + pass + if len(nodes) == 0: + print(msg_none) + else: + print(msg_success) return nodes - def __get_terms_by_template(self, tree: ParseTree, template: str) -> List[str]: + def __get_terms_by_template(self, tree: ParseTree, template: str, reverse: bool = False) -> List[str]: terms: Set[str] = set() nodes: list = [] + tmplt = template + if '/' in template: + split = template.split('/', 1) + tmplt = f'*/{split.pop(0)}*/{"".join(split)}' nodes.extend(self.__get_nodes_by_template(tree, f'/{template}')) - nodes.extend(self.__get_nodes_by_template(tree, template)) + nodes.extend(self.__get_nodes_by_template(tree, tmplt)) for node in nodes: if node.parent is None: continue - terms.add(f'{self.lemmatizer(node.lemma)} {self.lemmatizer(node.parent.lemma)}') + if not reverse: + terms.add(f'{self.__lemmatizer(node.lemma)} {self.__lemmatizer(node.parent.lemma)}') + else: + terms.add(f'{self.__lemmatizer(node.parent.lemma)} {self.__lemmatizer(node.lemma)}') return list(terms) def __get_terms_by_upos(self, tree: ParseTree, upos: str) -> List[str]: @@ -38,32 +60,61 @@ class NLP: for node in nodes: if node.upos != upos: continue - upos_terms.append(self.lemmatizer(node.lemma)) + upos_terms.append(self.__lemmatizer(node.lemma)) if len(upos_terms) < 2: continue terms.add(" ".join(upos_terms)) if len(terms) == 0: print(f'Can\'t find some terms by template NOUN/./NOUN') + else: + print(f'Success terms extraction by template NOUN/./NOUN') return list(terms) - def lemmatizer(self, text: str): - doc = ru_core_news_sm.load()(text) - tokens = [token.lemma_ for token in doc] - return ' '.join(tokens) - - def get_adj_noun(self, tree: ParseTree) -> List[str]: + def __get_adj_noun(self, tree: ParseTree) -> List[str]: return self.__get_terms_by_template(tree, 'NOUN/ADJ') - def get_nouns(self, tree: ParseTree) -> List[str]: + def __get_nouns(self, tree: ParseTree) -> List[str]: terms: List[str] = [] terms.extend(self.__get_terms_by_upos(tree, 'NOUN')) - terms.extend(self.__get_terms_by_template(tree, 'NOUN/NOUN')) + terms.extend(self.__get_terms_by_template(tree, 'NOUN/NOUN', True)) return terms - def get_terms_by_template(self, tree: ParseTree, template: List[str]) -> List[str]: + def __get_verb_noun(self, tree: ParseTree) -> List[str]: + return self.__get_terms_by_template(tree, 'VERB/NOUN', True) + + def __get_single_terms_by_template(self, tree: ParseTree, template: List[str]) -> List[str]: nouns: List[ParseTreeNode] = [] for node in LevelOrderIter(tree.get_tree_root()): if node.upos not in template: continue nouns.append(node) - return list(set([self.lemmatizer(noun.lemma) for noun in nouns])) + return list(set([self.__lemmatizer(noun.lemma) for noun in nouns])) + + @staticmethod + def __merge_nouns(nouns: List[str], verb_nouns: List[str]) -> List[str]: + terms: Set[str] = set() + if len(nouns) == 0 or len(verb_nouns) == 0: + return list(terms) + for verb_noun in verb_nouns: + split = verb_noun.split(' ') + current_noun = split.pop() + current_verb = split.pop() + for noun in nouns: + if current_noun in noun.split(' '): + terms.add(f'{current_verb} {noun}') + break + return list(terms) + + def get_terms(self, tree: ParseTree) -> List[str]: + terms: List[str] = [] + terms.extend(self.__get_adj_noun(tree)) + nouns: List[str] = self.__get_nouns(tree) + terms.extend(nouns) + # verb_nouns: List[str] = self.__get_verb_noun(tree) + # terms.extend(verb_nouns) + # merged: List[str] = self.__merge_nouns(nouns, verb_nouns) + # if len(merged) > 0: + # return merged + if len(terms) == 0: + terms.extend(self.__get_single_terms_by_template(tree, ['NOUN', 'VERB', 'ADJ'])) + return terms