# Source code for kyoto_reader.base_phrase

import logging
from typing import List, Dict, Optional, Iterator

from pyknp import Tag, Morpheme

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# The level is pinned to WARNING here, so the info-level message emitted by
# BasePhrase._get_content_word is dropped unless the level is lowered elsewhere.
logger.setLevel(logging.WARNING)


class BasePhrase:
    """A base phrase that appears in a sentence.

    Holds a pyknp ``Tag`` together with the document-wide IDs assigned to the
    tag itself and to each of its morphemes.

    Two base phrases compare equal (and hash equal) when they share the same
    sentence ID and document-wide tag ID.

    Attributes:
        tag (Tag): Tag object in pyknp.
        sid (str): Sentence ID.
        doc_id (str): Document ID.
        dtid (int): Document-wide tag ID.
        content (Morpheme): The content word of the base phrase (see ``_get_content_word``).
        content_dmid (int): Document-wide morpheme ID of the content word in the base phrase.
        parent (Optional[BasePhrase]): Dependency parent.
        children (List[BasePhrase]): Dependency children.
    """

    # Parts of speech regarded as ancillary when computing ``core``
    # (particles, special symbols, copulas).
    _ANCILLARY_POS = ('助詞', '特殊', '判定詞')

    def __init__(self,
                 tag: Tag,
                 dmid_offset: int,
                 dtid: int,
                 sid: str,
                 doc_id: str,
                 parent: Optional['BasePhrase'] = None,
                 children: Optional[List['BasePhrase']] = None,
                 ):
        """
        Args:
            tag (Tag): Tag object in pyknp.
            dmid_offset (int): Document-wide morpheme ID given to the first morpheme of this
                base phrase; following morphemes are numbered consecutively from it.
            dtid (int): Document-wide tag ID.
            sid (str): Sentence ID.
            doc_id (str): Document ID.
            parent (Optional[BasePhrase]): Dependency parent.
            children (Optional[List[BasePhrase]]): Dependency children. ``None`` (the default)
                results in a fresh empty list.

        Raises:
            IndexError: Presumably when ``tag`` contains no morphemes, because the content-word
                lookup falls back to the first morpheme (assumes ``tag.mrph_list()`` is a list).
        """
        self.tag: Tag = tag
        self.dtid: int = dtid
        self.sid: str = sid
        self.doc_id: str = doc_id

        # Number the morphemes consecutively, starting at ``dmid_offset``.
        self._mrph2dmid: Dict[Morpheme, int] = {}
        for dmid, mrph in enumerate(tag.mrph_list(), start=dmid_offset):
            self._mrph2dmid[mrph] = dmid

        self.content: Morpheme = self._get_content_word()
        self.content_dmid: int = self._mrph2dmid[self.content]
        self.parent: Optional['BasePhrase'] = parent
        self.children: List['BasePhrase'] = children if children is not None else []

    def _get_content_word(self) -> Morpheme:
        """Return the first morpheme that is a content word if any. Otherwise, return the first morpheme."""
        for mrph in self.tag.mrph_list():
            if '<内容語>' in mrph.fstring:
                return mrph
        # No morpheme carries the content-word feature: fall back to the first one.
        logger.info('%s: cannot find content word in: %s. Use first mrph instead', self.sid, self.tag.midasi)
        return self.tag.mrph_list()[0]

    @property
    def dmid(self) -> int:
        """Document-wide morpheme ID (that of the content word)."""
        return self.content_dmid

    @property
    def tid(self) -> int:
        """Tag ID in pyknp."""
        return self.tag.tag_id

    @property
    def core(self) -> str:
        """A core expression without ancillary words.

        Leading and trailing morphemes whose part of speech is listed in ``_ANCILLARY_POS`` are
        stripped. If nothing remains, the whole surface string of the tag is returned instead.
        """
        mrph_list = self.tag.mrph_list()
        # Index of the first non-ancillary morpheme (0 when every morpheme is ancillary).
        sidx = next((i for i, mrph in enumerate(mrph_list)
                     if mrph.hinsi not in self._ANCILLARY_POS), 0)
        # Number of ancillary morphemes at the tail (0 when every morpheme is ancillary).
        n_trailing = next((i for i, mrph in enumerate(reversed(mrph_list))
                           if mrph.hinsi not in self._ANCILLARY_POS), 0)
        eidx = len(mrph_list) - n_trailing
        ret = ''.join(mrph.midasi for mrph in mrph_list[sidx:eidx])
        return ret or self.tag.midasi

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """A mapping from morpheme to its document-wide ID."""
        return self._mrph2dmid

    @property
    def mrphs(self) -> List[Morpheme]:
        """A list of morphemes (a new list on every access, in insertion order)."""
        return list(self._mrph2dmid.keys())

    @property
    def dmids(self) -> List[int]:
        """A list of document-wide morpheme IDs (a new list on every access)."""
        return list(self._mrph2dmid.values())

    @property
    def surf(self) -> str:
        """A surface expression."""
        return self.tag.midasi

    def mrph_list(self) -> List[Morpheme]:
        """A list of morphemes (method form of the ``mrphs`` property)."""
        return self.mrphs

    def __len__(self) -> int:
        """Number of morphemes in the base phrase."""
        return len(self._mrph2dmid)

    def __getitem__(self, mid: int) -> Optional[Morpheme]:
        """Return the ``mid``-th morpheme of the base phrase.

        An index outside ``0 <= mid < len(self)`` (negative indices included) does not raise:
        an error is logged and ``None`` is returned.
        """
        if 0 <= mid < len(self):
            return self.mrphs[mid]
        logger.error('%s: morpheme id: %s out of range', self.sid, mid)
        return None

    def __iter__(self) -> Iterator[Morpheme]:
        """Iterate over the morphemes of the base phrase."""
        return iter(self.mrphs)

    def __eq__(self, other: object) -> bool:
        """Two base phrases are equal when both their sentence ID and document-wide tag ID match."""
        if not isinstance(other, BasePhrase):
            # Let Python try the reflected comparison instead of failing on a missing attribute.
            return NotImplemented
        return self.sid == other.sid and self.dtid == other.dtid

    def __hash__(self) -> int:
        """Hash on the same fields as ``__eq__`` so instances can be used in sets and as dict keys."""
        # Defining __eq__ alone would set __hash__ to None and make instances unhashable.
        return hash((self.sid, self.dtid))

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'BasePhrase(dtid: {self.dtid}, mrphs: {" ".join(m.midasi for m in self)}, sid: {self.sid})'