Add nouns extractor and lemmatizer
This commit is contained in:
parent
e761a4e4ac
commit
b59505fc25
@ -1,3 +1,5 @@
|
|||||||
scipy==1.7.3
|
scipy==1.7.3
|
||||||
requests==2.27.1
|
requests==2.27.1
|
||||||
anytree==2.8.0
|
anytree==2.8.0
|
||||||
|
spacy==3.2.1
|
||||||
|
https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0.tar.gz
|
12
src/main.py
12
src/main.py
@ -3,7 +3,7 @@
|
|||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from speech import Speech
|
from src.nlp import NLP
|
||||||
from syntax import Syntax
|
from syntax import Syntax
|
||||||
|
|
||||||
if not sys.warnoptions:
|
if not sys.warnoptions:
|
||||||
@ -14,14 +14,16 @@ def _main():
|
|||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
print(f'Usage: {sys.argv[0]} FILE')
|
print(f'Usage: {sys.argv[0]} FILE')
|
||||||
exit(1)
|
exit(1)
|
||||||
wav_file = sys.argv[1]
|
# wav_file = sys.argv[1]
|
||||||
speech_server = 'http://vosk.athene.tech'
|
# speech_server = 'http://vosk.athene.tech'
|
||||||
text = Speech().run(wav_file, speech_server)
|
# text = Speech().run(wav_file, speech_server)
|
||||||
|
text = 'не могу оплатить из-за ограничений карты'
|
||||||
print(f'Text: {text}')
|
print(f'Text: {text}')
|
||||||
syntax_server = 'http://syntaxnet.athene.tech'
|
syntax_server = 'http://syntaxnet.athene.tech'
|
||||||
parse_tree = Syntax().run(text, syntax_server)
|
parse_tree = Syntax().run(text, syntax_server)
|
||||||
print(f'Parse tree:\n{parse_tree}')
|
print(f'Parse tree:\n{parse_tree}')
|
||||||
print(f'Nouns:\n{parse_tree.get_nouns()}')
|
nouns = NLP().run(parse_tree)
|
||||||
|
print(f'Nouns:\n{nouns}')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
26
src/nlp.py
Normal file
26
src/nlp.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
from typing import List
|
||||||
|
|
||||||
|
import ru_core_news_sm
|
||||||
|
from anytree import LevelOrderIter
|
||||||
|
|
||||||
|
from src.parse_tree.parse_tree import ParseTree
|
||||||
|
from src.parse_tree.parse_tree_node import ParseTreeNode
|
||||||
|
|
||||||
|
|
||||||
|
class NLP:
    """Extract the nouns of a dependency parse tree and lemmatize them.

    The heavy spaCy pipeline (``ru_core_news_sm``) is loaded lazily, once,
    and shared by all instances — the original implementation reloaded the
    whole model on every ``_lemmatizer`` call, which is very expensive.
    """

    # Cached spaCy pipeline; populated on first use by _model().
    _pipeline = None

    @classmethod
    def _model(cls):
        """Return the shared spaCy pipeline, loading it on first use."""
        if cls._pipeline is None:
            # ru_core_news_sm.load() builds the full Russian pipeline;
            # doing this per call dominated runtime, so cache it.
            cls._pipeline = ru_core_news_sm.load()
        return cls._pipeline

    @classmethod
    def _lemmatizer(cls, text: str) -> str:
        """Return *text* with every token replaced by its lemma.

        Tokens are re-joined with single spaces, as before.
        """
        doc = cls._model()(text)
        return ' '.join(token.lemma_ for token in doc)

    def _get_nouns(self, tree: ParseTreeNode) -> str:
        """Collect all NOUN nodes of *tree* (level order) and lemmatize them.

        Nodes whose ``upos`` is not ``'NOUN'`` are skipped; the surviving
        nodes' ``lemma`` attributes are space-joined and run through the
        lemmatizer so each word ends up in its dictionary form.
        """
        nouns: List[ParseTreeNode] = [
            node for node in LevelOrderIter(tree) if node.upos == 'NOUN'
        ]
        return self._lemmatizer(' '.join(noun.lemma for noun in nouns))

    def run(self, tree: ParseTree) -> str:
        """Entry point: return the lemmatized nouns of *tree* as one string."""
        return self._get_nouns(tree.get_tree_root())
@ -1,6 +1,6 @@
|
|||||||
from typing import List, Dict, Optional
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
from anytree import RenderTree, Resolver
|
from anytree import RenderTree
|
||||||
|
|
||||||
from src.parse_tree.parse_tree_node import ParseTreeNode
|
from src.parse_tree.parse_tree_node import ParseTreeNode
|
||||||
|
|
||||||
@ -45,10 +45,8 @@ class ParseTree:
|
|||||||
break
|
break
|
||||||
return root
|
return root
|
||||||
|
|
||||||
def get_nouns(self) -> str:
|
|
||||||
r = Resolver('upos')
|
|
||||||
nodes: List[ParseTreeNode] = r.glob(self._tree, '*/NOUN/ADJ/*')
|
|
||||||
return ' '.join([node.lemma for node in reversed(nodes)])
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
    """Render the parse tree as multi-line text, one node per line.

    Each line carries the ASCII branch prefix produced by anytree's
    ``RenderTree`` followed by the node's own representation.
    """
    rendered = []
    for prefix, _fill, node in RenderTree(self._tree):
        rendered.append(f'{prefix}{node}')
    return '\n'.join(rendered)
|
||||||
|
|
||||||
|
def get_tree_root(self) -> ParseTreeNode:
    """Expose the root node of the wrapped parse tree (no copy is made)."""
    root = self._tree
    return root
|
||||||
|
Loading…
Reference in New Issue
Block a user