Source code for kyoto_reader.sentence

import logging
from collections import ChainMap
from typing import List, Dict, Optional, Iterator

from pyknp import BList, Morpheme

from .base_phrase import BasePhrase

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)


[docs]class Sentence: """A class to represent a single sentence. Attributes: blist (BList): BList object of pyknp. doc_id (str): The document ID of this sentence. bps (List[BasePhrase]): Base phrases in this sentence. """
[docs] def __init__(self, knp_string: str, dtid_offset: int, dmid_offset: int, doc_id: str, ) -> None: """ Args: knp_string(str): KNP format string of this sentence. dtid_offset (int): The document-wide tag ID of the previous base phrase. dmid_offset (int): The document-wide morpheme ID of the previous morpheme. doc_id(str): The document ID of this sentence. """ self.blist = BList(knp_string) self.doc_id: str = doc_id self.bps: List[BasePhrase] = [] dtid = dtid_offset dmid = dmid_offset for tag in self.blist.tag_list(): base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id) self.bps.append(base_phrase) dtid += 1 dmid += len(base_phrase) self._mrph2dmid: Dict[Morpheme, int] = dict(ChainMap(*(bp.mrph2dmid for bp in self.bps))) for bp in self.bps: if bp.tag.parent_id >= 0: bp.parent = self.bps[bp.tag.parent_id] for child in bp.tag.children: bp.children.append(self.bps[child.tag_id])
@property def sid(self) -> str: """A sentence ID.""" return self.blist.sid @property def dtids(self) -> List[int]: """A document-wide tag ID.""" return [bp.dtid for bp in self.bps] @property def mrph2dmid(self) -> Dict[Morpheme, int]: """A mapping from morpheme to its document-wide ID.""" return self._mrph2dmid @property def surf(self) -> str: """A surface expression""" return ''.join(bp.surf for bp in self.bps)
[docs] def bnst_list(self): """Return list of Bunsetsu object in pyknp.""" return self.blist.bnst_list()
[docs] def tag_list(self): """Return list of Tag object in pyknp.""" return self.blist.tag_list()
[docs] def mrph_list(self): """Return list of Morpheme object in pyknp.""" return self.blist.mrph_list()
def __len__(self) -> int: """Number of base phrases in this sentence""" return len(self.bps) def __getitem__(self, tid: int) -> Optional[BasePhrase]: if 0 <= tid < len(self): return self.bps[tid] else: logger.error(f'base phrase: {tid} out of range') return None def __iter__(self) -> Iterator[BasePhrase]: return iter(self.bps) def __eq__(self, other: 'Sentence') -> bool: return self.sid == other.sid def __str__(self) -> str: return self.surf def __repr__(self) -> str: return f'Sentence(\'{self.surf}\', sid: {self.sid})'