diff --git a/main.py b/main.py index 752da34..745dc6d 100644 --- a/main.py +++ b/main.py @@ -10,7 +10,7 @@ from src.syntax import Syntax def _main(wav_file: str): # text: str = Speech().run_recognition(wav_file) - text: str = 'У меня кредит в другом банке. Можно ли его перевести в ваш банк? ' + text: str = 'Можно ли рефинансировать ипотеку?' print(f'Text: {text}') parse_tree: ParseTree = Syntax().get_parse_tree(text) print(f'Parse tree:\n{parse_tree}') diff --git a/requirements.txt b/requirements.txt index dbe6fb9..3d29a8c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ requests==2.27.1 anytree==2.8.0 spacy==3.3.0 https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.3.0/ru_core_news_sm-3.3.0.tar.gz -Owlready2==0.36 \ No newline at end of file +Owlready2==0.36 +ordered-set==4.1.0 \ No newline at end of file diff --git a/src/nlp.py b/src/nlp.py index cb1da1d..530522d 100644 --- a/src/nlp.py +++ b/src/nlp.py @@ -3,14 +3,17 @@ from typing import List, Set import ru_core_news_sm from anytree import LevelOrderIter, Resolver, ChildResolverError, LevelOrderGroupIter, \ ResolverError +from ordered_set import OrderedSet from src.parse_tree.parse_tree import ParseTree from src.parse_tree.parse_tree_node import ParseTreeNode +from src.semantic_tree.semantic_tree import SemanticTree +from src.semantic_tree.semantic_tree_node import SemanticTreeNode class NLP: - def __init__(self) -> None: + self.__tree = SemanticTree() self.__model = ru_core_news_sm.load() def __lemmatizer(self, text: str): @@ -36,14 +39,15 @@ class NLP: return nodes def __get_terms_by_template(self, tree: ParseTree, template: str, reverse: bool = False) -> List[str]: - terms: Set[str] = set() + terms: OrderedSet[str] = OrderedSet() nodes: list = [] tmplt = template if '/' in template: split = template.split('/', 1) - tmplt = f'*/{split.pop(0)}*/{"".join(split)}' + tmplt = f'{split.pop(0)}*/{"".join(split)}' nodes.extend(self.__get_nodes_by_template(tree, f'/{template}')) nodes.extend(self.__get_nodes_by_template(tree, tmplt)) + nodes.extend(self.__get_nodes_by_template(tree, f'*/{tmplt}')) for node in nodes: if node.parent is None: continue @@ -80,7 +84,10 @@ class NLP: return terms def __get_verb_noun(self, tree: ParseTree) -> List[str]: - return self.__get_terms_by_template(tree, 'VERB/NOUN', True) + terms: List[str] = [] + terms.extend(self.__get_terms_by_template(tree, 'VERB/NOUN', True)) + terms.extend(self.__get_terms_by_template(tree, 'NOUN/VERB', True)) + return terms def __get_single_terms_by_template(self, tree: ParseTree, template: List[str]) -> List[str]: nouns: List[ParseTreeNode] = [] @@ -90,31 +97,40 @@ class NLP: nouns.append(node) return list(set([self.__lemmatizer(noun.lemma) for noun in nouns])) - @staticmethod - def __merge_nouns(nouns: List[str], verb_nouns: List[str]) -> List[str]: - terms: Set[str] = set() - if len(nouns) == 0 or len(verb_nouns) == 0: - return list(terms) - for verb_noun in verb_nouns: - split = verb_noun.split(' ') - current_noun = split.pop() - current_verb = split.pop() - for noun in nouns: - if current_noun in noun.split(' '): - terms.add(f'{current_verb} {noun}') - break - return list(terms) + def __add_to_semantic_tree(self, terms: List[str], term_type: str): + if terms is None or len(terms) == 0: + return + for term in terms: + split = term.split(' ') + if len(split) < 2: + return + noun = split.pop() + parent_node = self.__tree.add_to_tree(noun, 'noun') + for child in split: + self.__tree.add_to_tree(child, term_type, parent_node) + + def __merge_terms(self, leaves: List[List[SemanticTreeNode]]) -> List[str]: + terms: List[str] = [] + for group in leaves: + if len(group) == 0: + continue + current_term = ' '.join([leaf.name for leaf in group]) + current_leaf = group[0] + while current_leaf.parent is not None: + current_leaf = current_leaf.parent + current_term = f'{current_term} {current_leaf.name}' + terms.append(current_term.strip()) + return terms def get_terms(self, tree: ParseTree) -> List[str]: terms: List[str] = [] - terms.extend(self.__get_adj_noun(tree)) nouns: List[str] = self.__get_nouns(tree) - terms.extend(nouns) - # verb_nouns: List[str] = self.__get_verb_noun(tree) - # terms.extend(verb_nouns) - # merged: List[str] = self.__merge_nouns(nouns, verb_nouns) - # if len(merged) > 0: - # return merged - if len(terms) == 0: - terms.extend(self.__get_single_terms_by_template(tree, ['NOUN', 'VERB', 'ADJ'])) + self.__add_to_semantic_tree(nouns, 'noun') + verb_nouns: List[str] = self.__get_verb_noun(tree) + self.__add_to_semantic_tree(verb_nouns, 'verb') + adj_nouns = self.__get_adj_noun(tree) + self.__add_to_semantic_tree(adj_nouns, 'adj') + print(f'Semantic tree:\n{self.__tree}') + terms: List[str] = self.__merge_terms(self.__tree.get_leaves()) + # terms.extend(self.__get_single_terms_by_template(tree, ['NOUN', 'VERB', 'ADJ'])) return terms diff --git a/src/semantic_tree/semantic_tree.py b/src/semantic_tree/semantic_tree.py new file mode 100644 index 0000000..71ca457 --- /dev/null +++ b/src/semantic_tree/semantic_tree.py @@ -0,0 +1,32 @@ +from typing import List, Dict + +from anytree import RenderTree, PreOrderIter + +from src.semantic_tree.semantic_tree_node import SemanticTreeNode + + +class SemanticTree: + + def __init__(self) -> None: + self.__tree_root = SemanticTreeNode('') + self.__nodes = {} + + def add_to_tree(self, name: str, node_type: str, parent: SemanticTreeNode = None) -> SemanticTreeNode: + new_node = self.__nodes.get(name) + if new_node is None: + new_node = SemanticTreeNode(name, node_type, + self.__tree_root if parent is None else parent) + self.__nodes[name] = new_node + return new_node + + def get_leaves(self) -> List[List[SemanticTreeNode]]: + leaves: List[SemanticTreeNode] = list(PreOrderIter(self.__tree_root, filter_=lambda node: node.is_leaf)) + leaves_dict: Dict[List[str]] = {} + for leaf in leaves: + if leaves_dict.get(leaf.parent) is None: + leaves_dict[leaf.parent] = [] + leaves_dict[leaf.parent].append(leaf) + return list(leaves_dict.values()) + + def __repr__(self) -> str: + return '\n'.join([f'{pre}{node}' for pre, fill, node in RenderTree(self.__tree_root)]) diff --git a/src/semantic_tree/semantic_tree_node.py b/src/semantic_tree/semantic_tree_node.py new file mode 100644 index 0000000..691b62c --- /dev/null +++ b/src/semantic_tree/semantic_tree_node.py @@ -0,0 +1,12 @@ +from anytree import NodeMixin + + +class SemanticTreeNode(NodeMixin): + def __init__(self, name: str, node_type: str = None, parent: NodeMixin = None): + super(SemanticTreeNode, self).__init__() + self.type = node_type + self.name = name + self.parent = parent + + def __repr__(self) -> str: + return f'{"ROOT" if not self.name else self.name} {self.type}'