Add nouns extractor and lemmatizer

This commit is contained in:
Aleksey Filippov 2022-01-24 17:04:54 +04:00
parent e761a4e4ac
commit b59505fc25
4 changed files with 40 additions and 12 deletions

View File

@ -1,3 +1,5 @@
scipy==1.7.3
requests==2.27.1
anytree==2.8.0
anytree==2.8.0
spacy==3.2.1
https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0.tar.gz

View File

@ -3,7 +3,7 @@
import sys
import warnings
from speech import Speech
from src.nlp import NLP
from syntax import Syntax
if not sys.warnoptions:
@ -14,14 +14,16 @@ def _main():
if len(sys.argv) < 2:
print(f'Usage: {sys.argv[0]} FILE')
exit(1)
wav_file = sys.argv[1]
speech_server = 'http://vosk.athene.tech'
text = Speech().run(wav_file, speech_server)
# wav_file = sys.argv[1]
# speech_server = 'http://vosk.athene.tech'
# text = Speech().run(wav_file, speech_server)
text = 'не могу оплатить из-за ограничений карты'
print(f'Text: {text}')
syntax_server = 'http://syntaxnet.athene.tech'
parse_tree = Syntax().run(text, syntax_server)
print(f'Parse tree:\n{parse_tree}')
print(f'Nouns:\n{parse_tree.get_nouns()}')
nouns = NLP().run(parse_tree)
print(f'Nouns:\n{nouns}')
if __name__ == '__main__':

26
src/nlp.py Normal file
View File

@ -0,0 +1,26 @@
from typing import List
import ru_core_news_sm
from anytree import LevelOrderIter
from src.parse_tree.parse_tree import ParseTree
from src.parse_tree.parse_tree_node import ParseTreeNode
class NLP:
    """Collect the noun nodes of a parse tree and lemmatize them with spaCy.

    The ``ru_core_news_sm`` pipeline is loaded lazily on first use and then
    kept on the class, so repeated calls do not reload the model.
    """

    # Cached spaCy pipeline shared by every instance; ``None`` until the
    # first call to ``_lemmatizer``.
    _model = None

    @classmethod
    def _lemmatizer(cls, text: str) -> str:
        """Return *text* with each token replaced by its spaCy lemma.

        The lemmas are joined with single spaces.
        """
        # Loading the pipeline is expensive and independent of the input,
        # so it is done once instead of on every call.
        if cls._model is None:
            cls._model = ru_core_news_sm.load()
        doc = cls._model(text)
        return ' '.join(token.lemma_ for token in doc)

    def _get_nouns(self, tree: ParseTreeNode) -> str:
        """Return the lemmatized nouns found under *tree* as one string.

        Nodes are visited with ``LevelOrderIter``; only nodes whose ``upos``
        equals ``'NOUN'`` are kept. Their ``lemma`` attributes are joined
        with spaces and passed through ``_lemmatizer``.
        """
        nouns: List[ParseTreeNode] = [
            node for node in LevelOrderIter(tree) if node.upos == 'NOUN'
        ]
        return self._lemmatizer(' '.join(noun.lemma for noun in nouns))

    def run(self, tree: ParseTree) -> str:
        """Extract and lemmatize the nouns of *tree*, starting at its root."""
        return self._get_nouns(tree.get_tree_root())

View File

@ -1,6 +1,6 @@
from typing import List, Dict, Optional
from anytree import RenderTree, Resolver
from anytree import RenderTree
from src.parse_tree.parse_tree_node import ParseTreeNode
@ -45,10 +45,8 @@ class ParseTree:
break
return root
def get_nouns(self) -> str:
    """Return the lemmas of the nodes matching ``*/NOUN/ADJ/*``.

    The pattern is resolved against ``self._tree`` by a ``Resolver`` that
    uses the ``upos`` attribute as the path component. The matched nodes
    are emitted in reverse order, separated by single spaces.
    """
    resolver = Resolver('upos')
    matched: List[ParseTreeNode] = resolver.glob(self._tree, '*/NOUN/ADJ/*')
    lemmas = [match.lemma for match in matched]
    # The original joins the matches back to front.
    lemmas.reverse()
    return ' '.join(lemmas)
def __repr__(self) -> str:
    """Return the rendered tree, one line per row yielded by ``RenderTree``.

    Each line is the row's prefix followed by the formatted node.
    """
    rendered_rows = []
    for prefix, _fill, tree_node in RenderTree(self._tree):
        rendered_rows.append(f'{prefix}{tree_node}')
    return '\n'.join(rendered_rows)
def get_tree_root(self) -> ParseTreeNode:
    """Return the node stored in ``self._tree``."""
    root = self._tree
    return root