Move term extraction rules to nlp class, add some new rules
This commit is contained in:
parent
0a8e192789
commit
e876220057
15
main.py
15
main.py
@ -1,30 +1,23 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from src.myontology import MyOntology
|
from src.myontology import MyOntology
|
||||||
from src.nlp import NLP
|
from src.nlp import NLP
|
||||||
from src.parse_tree.parse_tree import ParseTree
|
from src.parse_tree.parse_tree import ParseTree
|
||||||
from src.speech import Speech
|
|
||||||
from src.syntax import Syntax
|
from src.syntax import Syntax
|
||||||
|
|
||||||
|
|
||||||
def _main(wav_file: str):
|
def _main(wav_file: str):
|
||||||
text: str = Speech().run_recognition(wav_file)
|
# text: str = Speech().run_recognition(wav_file)
|
||||||
# text: str = 'как получить кредит на обучение'
|
text: str = 'У меня кредит в другом банке. Можно ли его перевести в ваш банк? '
|
||||||
print(f'Text: {text}')
|
print(f'Text: {text}')
|
||||||
parse_tree: ParseTree = Syntax().get_parse_tree(text)
|
parse_tree: ParseTree = Syntax().get_parse_tree(text)
|
||||||
print(f'Parse tree:\n{parse_tree}')
|
print(f'Parse tree:\n{parse_tree}')
|
||||||
nlp: NLP = NLP()
|
terms = NLP().get_terms(parse_tree)
|
||||||
terms: List[str] = []
|
|
||||||
terms.extend(nlp.get_adj_noun(parse_tree))
|
|
||||||
terms.extend(nlp.get_nouns(parse_tree))
|
|
||||||
if len(terms) == 0:
|
|
||||||
terms.extend(nlp.get_terms_by_template(parse_tree, ['NOUN', 'VERB', 'ADJ']))
|
|
||||||
print(f'Extracted terms:\n{", ".join(terms)}')
|
print(f'Extracted terms:\n{", ".join(terms)}')
|
||||||
result: str = MyOntology().get_event_description(terms)
|
result: str = MyOntology().get_event_description(terms)
|
||||||
print(f'Test:\n{result}')
|
print(f'Result:\n{result}')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
81
src/nlp.py
81
src/nlp.py
@ -9,26 +9,48 @@ from src.parse_tree.parse_tree_node import ParseTreeNode
|
|||||||
|
|
||||||
|
|
||||||
class NLP:
|
class NLP:
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.__model = ru_core_news_sm.load()
|
||||||
|
|
||||||
|
def __lemmatizer(self, text: str):
|
||||||
|
doc = self.__model(text)
|
||||||
|
tokens = [token.lemma_ for token in doc]
|
||||||
|
return ' '.join(tokens)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def __get_nodes_by_template(tree: ParseTree, template: str) -> list:
|
def __get_nodes_by_template(tree: ParseTree, template: str) -> list:
|
||||||
|
msg_none = f'Can\'t find some terms by template {template}'
|
||||||
|
msg_success = f'Success terms extraction by template {template}'
|
||||||
top = tree.get_tree_root()
|
top = tree.get_tree_root()
|
||||||
r = Resolver('upos')
|
r = Resolver('upos')
|
||||||
nodes: list = []
|
nodes: list = []
|
||||||
try:
|
try:
|
||||||
nodes.extend(r.glob(top, template))
|
nodes.extend(r.glob(top, template))
|
||||||
except (ChildResolverError, ResolverError):
|
except (ChildResolverError, ResolverError):
|
||||||
print(f'Can\'t find some terms by template {template}')
|
pass
|
||||||
|
if len(nodes) == 0:
|
||||||
|
print(msg_none)
|
||||||
|
else:
|
||||||
|
print(msg_success)
|
||||||
return nodes
|
return nodes
|
||||||
|
|
||||||
def __get_terms_by_template(self, tree: ParseTree, template: str) -> List[str]:
|
def __get_terms_by_template(self, tree: ParseTree, template: str, reverse: bool = False) -> List[str]:
|
||||||
terms: Set[str] = set()
|
terms: Set[str] = set()
|
||||||
nodes: list = []
|
nodes: list = []
|
||||||
|
tmplt = template
|
||||||
|
if '/' in template:
|
||||||
|
split = template.split('/', 1)
|
||||||
|
tmplt = f'*/{split.pop(0)}*/{"".join(split)}'
|
||||||
nodes.extend(self.__get_nodes_by_template(tree, f'/{template}'))
|
nodes.extend(self.__get_nodes_by_template(tree, f'/{template}'))
|
||||||
nodes.extend(self.__get_nodes_by_template(tree, template))
|
nodes.extend(self.__get_nodes_by_template(tree, tmplt))
|
||||||
for node in nodes:
|
for node in nodes:
|
||||||
if node.parent is None:
|
if node.parent is None:
|
||||||
continue
|
continue
|
||||||
terms.add(f'{self.lemmatizer(node.lemma)} {self.lemmatizer(node.parent.lemma)}')
|
if not reverse:
|
||||||
|
terms.add(f'{self.__lemmatizer(node.lemma)} {self.__lemmatizer(node.parent.lemma)}')
|
||||||
|
else:
|
||||||
|
terms.add(f'{self.__lemmatizer(node.parent.lemma)} {self.__lemmatizer(node.lemma)}')
|
||||||
return list(terms)
|
return list(terms)
|
||||||
|
|
||||||
def __get_terms_by_upos(self, tree: ParseTree, upos: str) -> List[str]:
|
def __get_terms_by_upos(self, tree: ParseTree, upos: str) -> List[str]:
|
||||||
@ -38,32 +60,61 @@ class NLP:
|
|||||||
for node in nodes:
|
for node in nodes:
|
||||||
if node.upos != upos:
|
if node.upos != upos:
|
||||||
continue
|
continue
|
||||||
upos_terms.append(self.lemmatizer(node.lemma))
|
upos_terms.append(self.__lemmatizer(node.lemma))
|
||||||
if len(upos_terms) < 2:
|
if len(upos_terms) < 2:
|
||||||
continue
|
continue
|
||||||
terms.add(" ".join(upos_terms))
|
terms.add(" ".join(upos_terms))
|
||||||
if len(terms) == 0:
|
if len(terms) == 0:
|
||||||
print(f'Can\'t find some terms by template NOUN/./NOUN')
|
print(f'Can\'t find some terms by template NOUN/./NOUN')
|
||||||
|
else:
|
||||||
|
print(f'Success terms extraction by template NOUN/./NOUN')
|
||||||
return list(terms)
|
return list(terms)
|
||||||
|
|
||||||
def lemmatizer(self, text: str):
|
def __get_adj_noun(self, tree: ParseTree) -> List[str]:
|
||||||
doc = ru_core_news_sm.load()(text)
|
|
||||||
tokens = [token.lemma_ for token in doc]
|
|
||||||
return ' '.join(tokens)
|
|
||||||
|
|
||||||
def get_adj_noun(self, tree: ParseTree) -> List[str]:
|
|
||||||
return self.__get_terms_by_template(tree, 'NOUN/ADJ')
|
return self.__get_terms_by_template(tree, 'NOUN/ADJ')
|
||||||
|
|
||||||
def get_nouns(self, tree: ParseTree) -> List[str]:
|
def __get_nouns(self, tree: ParseTree) -> List[str]:
|
||||||
terms: List[str] = []
|
terms: List[str] = []
|
||||||
terms.extend(self.__get_terms_by_upos(tree, 'NOUN'))
|
terms.extend(self.__get_terms_by_upos(tree, 'NOUN'))
|
||||||
terms.extend(self.__get_terms_by_template(tree, 'NOUN/NOUN'))
|
terms.extend(self.__get_terms_by_template(tree, 'NOUN/NOUN', True))
|
||||||
return terms
|
return terms
|
||||||
|
|
||||||
def get_terms_by_template(self, tree: ParseTree, template: List[str]) -> List[str]:
|
def __get_verb_noun(self, tree: ParseTree) -> List[str]:
|
||||||
|
return self.__get_terms_by_template(tree, 'VERB/NOUN', True)
|
||||||
|
|
||||||
|
def __get_single_terms_by_template(self, tree: ParseTree, template: List[str]) -> List[str]:
|
||||||
nouns: List[ParseTreeNode] = []
|
nouns: List[ParseTreeNode] = []
|
||||||
for node in LevelOrderIter(tree.get_tree_root()):
|
for node in LevelOrderIter(tree.get_tree_root()):
|
||||||
if node.upos not in template:
|
if node.upos not in template:
|
||||||
continue
|
continue
|
||||||
nouns.append(node)
|
nouns.append(node)
|
||||||
return list(set([self.lemmatizer(noun.lemma) for noun in nouns]))
|
return list(set([self.__lemmatizer(noun.lemma) for noun in nouns]))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def __merge_nouns(nouns: List[str], verb_nouns: List[str]) -> List[str]:
|
||||||
|
terms: Set[str] = set()
|
||||||
|
if len(nouns) == 0 or len(verb_nouns) == 0:
|
||||||
|
return list(terms)
|
||||||
|
for verb_noun in verb_nouns:
|
||||||
|
split = verb_noun.split(' ')
|
||||||
|
current_noun = split.pop()
|
||||||
|
current_verb = split.pop()
|
||||||
|
for noun in nouns:
|
||||||
|
if current_noun in noun.split(' '):
|
||||||
|
terms.add(f'{current_verb} {noun}')
|
||||||
|
break
|
||||||
|
return list(terms)
|
||||||
|
|
||||||
|
def get_terms(self, tree: ParseTree) -> List[str]:
|
||||||
|
terms: List[str] = []
|
||||||
|
terms.extend(self.__get_adj_noun(tree))
|
||||||
|
nouns: List[str] = self.__get_nouns(tree)
|
||||||
|
terms.extend(nouns)
|
||||||
|
# verb_nouns: List[str] = self.__get_verb_noun(tree)
|
||||||
|
# terms.extend(verb_nouns)
|
||||||
|
# merged: List[str] = self.__merge_nouns(nouns, verb_nouns)
|
||||||
|
# if len(merged) > 0:
|
||||||
|
# return merged
|
||||||
|
if len(terms) == 0:
|
||||||
|
terms.extend(self.__get_single_terms_by_template(tree, ['NOUN', 'VERB', 'ADJ']))
|
||||||
|
return terms
|
||||||
|
Loading…
Reference in New Issue
Block a user