24 lines
696 B
Python
24 lines
696 B
Python
from typing import List
|
|
|
|
import ru_core_news_sm
|
|
from anytree import LevelOrderIter
|
|
|
|
from src.parse_tree.parse_tree import ParseTree
|
|
from src.parse_tree.parse_tree_node import ParseTreeNode
|
|
|
|
|
|
class NLP:
    """Helpers for extracting lemmatized nouns from a parse tree."""

    # spaCy pipeline returned by ``ru_core_news_sm.load()``. It is created
    # on first use and then reused, so the model is not reloaded for every
    # string that gets lemmatized.
    _model = None

    @staticmethod
    def _lemmatizer(text: str) -> str:
        """Return *text* with every token replaced by its lemma.

        The text is run through the ``ru_core_news_sm`` pipeline and the
        ``lemma_`` of each resulting token is joined with single spaces.

        Args:
            text: The string to lemmatize.

        Returns:
            The space-separated lemmas of the tokens found in *text*.
        """
        if NLP._model is None:
            NLP._model = ru_core_news_sm.load()
        doc = NLP._model(text)
        return ' '.join(token.lemma_ for token in doc)

    def get_nouns(self, tree: ParseTree) -> List[str]:
        """Collect the unique lemmatized nouns found in *tree*.

        The tree is walked in level order starting from
        ``tree.get_tree_root()``. Nodes whose ``upos`` is ``'NOUN'`` have
        their ``lemma`` passed through the lemmatizer.

        Args:
            tree: The parse tree to scan.

        Returns:
            The distinct lemmatized nouns, in the order they are first
            encountered during the level-order walk.
        """
        lemmas = [
            self._lemmatizer(node.lemma)
            for node in LevelOrderIter(tree.get_tree_root())
            if node.upos == 'NOUN'
        ]
        # dict.fromkeys removes duplicates while keeping first-seen order,
        # which makes the result deterministic across runs (a plain set
        # yields an arbitrary order for strings).
        return list(dict.fromkeys(lemmas))
|