Add nouns extractor and lemmatizer
parent e761a4e4ac
commit b59505fc25
requirements.txt
@@ -1,3 +1,5 @@
 scipy==1.7.3
 requests==2.27.1
 anytree==2.8.0
+spacy==3.2.1
+https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0.tar.gz
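Not part of the commit: a minimal sketch that checks the pinned Russian pipeline actually loads after installing requirements.txt. The sample sentence is the test phrase hard-coded in src/main.py below; the printed output is not claimed here.

# Sketch, not repository code: confirm the pinned ru_core_news_sm-3.2.0
# package loads and lemmatizes after `pip install -r requirements.txt`.
import ru_core_news_sm

nlp = ru_core_news_sm.load()
doc = nlp('не могу оплатить из-за ограничений карты')
print([(token.text, token.lemma_, token.pos_) for token in doc])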
12 src/main.py
@@ -3,7 +3,7 @@
 import sys
 import warnings
 
-from speech import Speech
+from src.nlp import NLP
 from syntax import Syntax
 
 if not sys.warnoptions:
@@ -14,14 +14,16 @@ def _main():
     if len(sys.argv) < 2:
         print(f'Usage: {sys.argv[0]} FILE')
         exit(1)
-    wav_file = sys.argv[1]
-    speech_server = 'http://vosk.athene.tech'
-    text = Speech().run(wav_file, speech_server)
+    # wav_file = sys.argv[1]
+    # speech_server = 'http://vosk.athene.tech'
+    # text = Speech().run(wav_file, speech_server)
+    text = 'не могу оплатить из-за ограничений карты'
     print(f'Text: {text}')
     syntax_server = 'http://syntaxnet.athene.tech'
     parse_tree = Syntax().run(text, syntax_server)
     print(f'Parse tree:\n{parse_tree}')
-    print(f'Nouns:\n{parse_tree.get_nouns()}')
+    nouns = NLP().run(parse_tree)
+    print(f'Nouns:\n{nouns}')
 
 
 if __name__ == '__main__':
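The commit swaps the Vosk speech step for a hard-coded test sentence and routes the parse tree through the new NLP class instead of ParseTree.get_nouns(). If the speech path should stay usable alongside the test sentence, one hypothetical variant (not in the commit; it assumes the Speech import is kept and the argv check is relaxed, and the _get_text helper name is made up for illustration) is to fall back to the fixed string only when no WAV file is passed:

# Hypothetical variant, not part of the commit: keep the Vosk path when a
# WAV file is given, otherwise fall back to the fixed test sentence.
import sys

from speech import Speech  # would need to stay imported for this variant


def _get_text() -> str:
    if len(sys.argv) > 1:
        return Speech().run(sys.argv[1], 'http://vosk.athene.tech')
    return 'не могу оплатить из-за ограничений карты'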
26 src/nlp.py Normal file
@@ -0,0 +1,26 @@
+from typing import List
+
+import ru_core_news_sm
+from anytree import LevelOrderIter
+
+from src.parse_tree.parse_tree import ParseTree
+from src.parse_tree.parse_tree_node import ParseTreeNode
+
+
+class NLP:
+    @staticmethod
+    def _lemmatizer(text: str):
+        doc = ru_core_news_sm.load()(text)
+        tokens = [token.lemma_ for token in doc]
+        return ' '.join(tokens)
+
+    def _get_nouns(self, tree: ParseTreeNode) -> str:
+        nouns: List[ParseTreeNode] = []
+        for node in LevelOrderIter(tree):
+            if node.upos != 'NOUN':
+                continue
+            nouns.append(node)
+        return self._lemmatizer(' '.join([noun.lemma for noun in nouns]))
+
+    def run(self, tree: ParseTree) -> str:
+        return self._get_nouns(tree.get_tree_root())
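One note on the new class: _lemmatizer calls ru_core_news_sm.load() on every invocation, which rebuilds the whole spaCy pipeline each time. A minimal sketch of a possible follow-up, not part of this commit, that loads the pipeline once and reuses it:

# Sketch, not part of the commit: cache the spaCy pipeline instead of
# reloading it on every _lemmatizer call; other methods stay unchanged.
import ru_core_news_sm


class NLP:
    _nlp = ru_core_news_sm.load()  # loaded once, shared by all calls

    @staticmethod
    def _lemmatizer(text: str) -> str:
        return ' '.join(token.lemma_ for token in NLP._nlp(text))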
src/parse_tree/parse_tree.py
@@ -1,6 +1,6 @@
 from typing import List, Dict, Optional
 
-from anytree import RenderTree, Resolver
+from anytree import RenderTree
 
 from src.parse_tree.parse_tree_node import ParseTreeNode
 
@@ -45,10 +45,8 @@ class ParseTree:
                 break
         return root
 
-    def get_nouns(self) -> str:
-        r = Resolver('upos')
-        nodes: List[ParseTreeNode] = r.glob(self._tree, '*/NOUN/ADJ/*')
-        return ' '.join([node.lemma for node in reversed(nodes)])
-
     def __repr__(self) -> str:
         return '\n'.join([f'{pre}{node}' for pre, fill, node in RenderTree(self._tree)])
+
+    def get_tree_root(self) -> ParseTreeNode:
+        return self._tree
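The Resolver-based get_nouns() (a upos glob over '*/NOUN/ADJ/*') is removed; noun selection now happens in NLP via a level-order walk, so ParseTree only needs to expose its root through get_tree_root(). A small illustration, not repository code, of that traversal using plain anytree nodes carrying the same upos/lemma attributes as ParseTreeNode; the tree shape and attribute values here are arbitrary:

# Illustration, not repository code: the level-order noun filter that NLP
# now applies to the tree root, shown on plain anytree nodes.
from anytree import Node, LevelOrderIter

root = Node('оплатить', upos='VERB', lemma='оплатить')
Node('ограничений', parent=root, upos='NOUN', lemma='ограничение')
Node('карты', parent=root, upos='NOUN', lemma='карта')

nouns = [node.lemma for node in LevelOrderIter(root) if node.upos == 'NOUN']
print(' '.join(nouns))  # -> ограничение карта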