From b59505fc25973a182e219bd58ffaa81aabdaff2a Mon Sep 17 00:00:00 2001
From: Aleksey Filippov
Date: Mon, 24 Jan 2022 17:04:54 +0400
Subject: [PATCH] Add nouns extractor and lemmatizer

---
 requirements.txt             |  4 +++-
 src/main.py                  | 12 +++++++-----
 src/nlp.py                   | 26 ++++++++++++++++++++++++++
 src/parse_tree/parse_tree.py | 10 ++++------
 4 files changed, 40 insertions(+), 12 deletions(-)
 create mode 100644 src/nlp.py

diff --git a/requirements.txt b/requirements.txt
index 3045a83..f471c42 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
 scipy==1.7.3
 requests==2.27.1
-anytree==2.8.0
\ No newline at end of file
+anytree==2.8.0
+spacy==3.2.1
+https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0.tar.gz
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
index b624020..c7a7f0c 100644
--- a/src/main.py
+++ b/src/main.py
@@ -3,7 +3,7 @@
 import sys
 import warnings
 
-from speech import Speech
+from src.nlp import NLP
 from syntax import Syntax
 
 if not sys.warnoptions:
@@ -14,14 +14,16 @@ def _main():
     if len(sys.argv) < 2:
         print(f'Usage: {sys.argv[0]} FILE')
         exit(1)
-    wav_file = sys.argv[1]
-    speech_server = 'http://vosk.athene.tech'
-    text = Speech().run(wav_file, speech_server)
+    # wav_file = sys.argv[1]
+    # speech_server = 'http://vosk.athene.tech'
+    # text = Speech().run(wav_file, speech_server)
+    text = 'не могу оплатить из-за ограничений карты'
     print(f'Text: {text}')
     syntax_server = 'http://syntaxnet.athene.tech'
     parse_tree = Syntax().run(text, syntax_server)
     print(f'Parse tree:\n{parse_tree}')
-    print(f'Nouns:\n{parse_tree.get_nouns()}')
+    nouns = NLP().run(parse_tree)
+    print(f'Nouns:\n{nouns}')
 
 
 if __name__ == '__main__':
diff --git a/src/nlp.py b/src/nlp.py
new file mode 100644
index 0000000..504516a
--- /dev/null
+++ b/src/nlp.py
@@ -0,0 +1,26 @@
+from typing import List
+
+import ru_core_news_sm
+from anytree import LevelOrderIter
+
+from src.parse_tree.parse_tree import ParseTree
+from src.parse_tree.parse_tree_node import ParseTreeNode
+
+
+class NLP:
+    @staticmethod
+    def _lemmatizer(text: str):
+        doc = ru_core_news_sm.load()(text)
+        tokens = [token.lemma_ for token in doc]
+        return ' '.join(tokens)
+
+    def _get_nouns(self, tree: ParseTreeNode) -> str:
+        nouns: List[ParseTreeNode] = []
+        for node in LevelOrderIter(tree):
+            if node.upos != 'NOUN':
+                continue
+            nouns.append(node)
+        return self._lemmatizer(' '.join([noun.lemma for noun in nouns]))
+
+    def run(self, tree: ParseTree) -> str:
+        return self._get_nouns(tree.get_tree_root())
diff --git a/src/parse_tree/parse_tree.py b/src/parse_tree/parse_tree.py
index ed2cab9..7a04deb 100644
--- a/src/parse_tree/parse_tree.py
+++ b/src/parse_tree/parse_tree.py
@@ -1,6 +1,6 @@
 from typing import List, Dict, Optional
 
-from anytree import RenderTree, Resolver
+from anytree import RenderTree
 
 from src.parse_tree.parse_tree_node import ParseTreeNode
 
@@ -45,10 +45,8 @@ class ParseTree:
                 break
         return root
 
-    def get_nouns(self) -> str:
-        r = Resolver('upos')
-        nodes: List[ParseTreeNode] = r.glob(self._tree, '*/NOUN/ADJ/*')
-        return ' '.join([node.lemma for node in reversed(nodes)])
-
     def __repr__(self) -> str:
         return '\n'.join([f'{pre}{node}' for pre, fill, node in RenderTree(self._tree)])
+
+    def get_tree_root(self) -> ParseTreeNode:
+        return self._tree