Source code for kyoto_reader.reader

import gzip
import io
import logging
import os
import pickle
import tarfile
import zipfile
from collections import ChainMap
from concurrent import futures
from contextlib import contextmanager, nullcontext
from enum import Enum
from functools import partial
from pathlib import Path
from typing import List, Dict, Optional, Union, Iterable, Collection, Any, BinaryIO, TextIO

from .constants import ALL_CASES, ALL_COREFS, SID_PTN, SID_PTN_KWDLC, SID_PTN_WAC
from .document import Document

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)


class ArchiveType(Enum):
    """Enum for file collection types."""
    TAR_GZ = '.tar.gz'
    ZIP = '.zip'


ArchiveFile = Union[tarfile.TarFile, zipfile.ZipFile]


class ArchiveHandler:
    def __init__(self, path: Path) -> None:
        self.path: Path = path
        self.type: ArchiveType = self._get_type(path)
        self.members: List[str] = self._list_members()

    @staticmethod
    def _get_type(path: Path) -> ArchiveType:
        assert path.is_file()
        if str(path).endswith(ArchiveType.TAR_GZ.value):
            return ArchiveType.TAR_GZ
        elif str(path).endswith(ArchiveType.ZIP.value):
            return ArchiveType.ZIP
        else:
            raise ValueError(f'Unsupported archive type: {path}')

    def _list_members(self) -> List[str]:
        if self.type == ArchiveType.TAR_GZ:
            with tarfile.open(self.path, mode='r') as f:
                return f.getnames()
        elif self.type == ArchiveType.ZIP:
            with zipfile.ZipFile(self.path, mode='r') as f:
                return f.namelist()
        else:
            raise ValueError(f'Unsupported archive type: {self.type}')

    @contextmanager
    def open(self) -> ArchiveFile:
        file = None
        try:
            if self.type == ArchiveType.TAR_GZ:
                file = tarfile.open(self.path, mode='r')
            elif self.type == ArchiveType.ZIP:
                file = zipfile.ZipFile(self.path, mode='r')
            else:
                raise ValueError(f'Unsupported archive type: {self.type}')
            yield file
        finally:
            if file is not None:
                file.close()

    @contextmanager
    def open_member(self, archive: ArchiveFile, member: str) -> BinaryIO:
        file = None
        try:
            if self.type == ArchiveType.TAR_GZ:
                file = archive.extractfile(member)
            elif self.type == ArchiveType.ZIP:
                file = archive.open(member)
            else:
                raise ValueError(f'Unsupported archive type: {self.type}')
            yield file
        finally:
            if file is not None:
                file.close()

    @classmethod
    def is_supported_path(cls, path: Path) -> bool:
        return any(str(path).endswith(t.value) for t in ArchiveType)
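

# Usage sketch (illustrative only, not part of the library; the path
# 'corpus.tar.gz' is hypothetical): ArchiveHandler lists an archive's members
# once at construction time, then open() / open_member() give context-managed
# access to the archive object and to individual member files as binary streams.
#
#     handler = ArchiveHandler(Path('corpus.tar.gz'))
#     with handler.open() as archive:
#         for member in handler.members:
#             with handler.open_member(archive, member) as f:
#                 data: bytes = f.read()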


class FileType(Enum):
    """Enum for file types."""
    GZ = '.gz'
    # XZ = '.xz'
    UNCOMPRESSED = ''


class FileHandler:
    def __init__(self, path: Path) -> None:
        self.path: Path = path
        self.type: FileType = self._get_type(path)

    @property
    def content_basename(self) -> str:
        if self.type == FileType.UNCOMPRESSED:
            return self.path.name
        return self.path.name[:-len(self.type.value)]

    @staticmethod
    def _get_type(path: Path) -> FileType:
        if path.suffix == FileType.GZ.value:
            return FileType.GZ
        return FileType.UNCOMPRESSED

    @contextmanager
    def open(self, *args, **kwargs) -> TextIO:
        file = None
        try:
            if self.type == FileType.GZ:
                file = gzip.open(self.path, *args, **kwargs)
            elif self.type == FileType.UNCOMPRESSED:
                file = self.path.open(*args, **kwargs)
            else:
                raise ValueError(f'Unsupported file type: {self.type}')
            yield file
        finally:
            if file is not None:
                file.close()

    def __lt__(self, other) -> bool:
        return self.path < other.path
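

# Usage sketch (illustrative only; the file name is hypothetical): FileHandler
# transparently opens plain and gzip-compressed files, and content_basename
# strips the compression suffix so both variants resolve to the same name.
#
#     fh = FileHandler(Path('w201106-0000060050.knp.gz'))
#     assert fh.content_basename == 'w201106-0000060050.knp'
#     with fh.open(mode='rt') as f:
#         knp_string = f.read()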


class KyotoReader:
    """A class to manage a set of corpus documents.

    Compressed files are supported. However, nested compression (e.g. a .knp.gz file
    inside a zip archive) is not.

    Args:
        source (Union[Path, str]): Path to the input document(s). If a directory is given,
            all files under it are read.
        target_cases (Optional[Collection[str]]): Cases to extract. (default: all cases)
        target_corefs (Optional[Collection[str]]): Coreference relations (e.g. "=") to extract.
            (default: all relations)
        extract_nes (bool): Whether to extract named entities from the corpus. (default: True)
        relax_cases (bool): Whether to treat relations such as "ガ≒" as the "ガ" case. (default: False)
        knp_ext (str): Extension of KWDLC or KC files. (default: .knp)
        pickle_ext (str): Extension used when reading a Document in pickle format. (default: .pkl)
        use_pas_tag (bool): Whether to read PAS from the <述語項構造:> tag instead of the <rel> tag.
            (default: False)
        n_jobs (int): Number of parallel processes used to read documents. 0: no parallelism,
            -1: use all CPU cores. (default: -1)
        did_from_sid (bool): Determine the document ID from the S-IDs in the document. (default: True)

    Note:
        Supported source paths (i.e. the `source` argument):

        - a single file (.knp, .knp.gz, .pkl, .pkl.gz)
        - a directory containing such files
        - an archive file (.tar.gz, .zip) containing uncompressed files
    """

    def __init__(self,
                 source: Union[Path, str],
                 target_cases: Optional[Collection[str]] = None,
                 target_corefs: Optional[Collection[str]] = None,
                 extract_nes: bool = True,
                 relax_cases: bool = False,
                 use_pas_tag: bool = False,
                 knp_ext: str = '.knp',
                 pickle_ext: str = '.pkl',
                 n_jobs: int = -1,
                 did_from_sid: bool = True,
                 ) -> None:
        if not isinstance(source, (Path, str)):
            raise TypeError(f"document source must be Path or str type, but got '{type(source)}' type")
        source = Path(source)

        self.archive_handler = None
        if source.is_dir():
            logger.info('got a directory path, files in the directory are treated as source files')
            file_paths: List[FileHandler] = sorted(FileHandler(p) for p in source.glob('**/*') if p.is_file())
        elif ArchiveHandler.is_supported_path(source):
            logger.info('got an archive file path, files in the archive are treated as source files')
            self.archive_handler = ArchiveHandler(source)
            file_paths: List[FileHandler] = sorted(FileHandler(Path(p)) for p in self.archive_handler.members)
        elif source.is_file():
            logger.info('got a single file path, this file is treated as a source file')
            file_paths: List[FileHandler] = [FileHandler(source)]
        else:
            raise ValueError(f'document source: {source} not found')

        # If True, determine the document ID from the sentence IDs (S-IDs) in the document.
        self.did_from_sid: bool = did_from_sid

        self._did2pkl = {file.path.stem: file for file in file_paths if file.content_basename.endswith(pickle_ext)}

        if n_jobs == -1:
            self.n_jobs = os.cpu_count()
        elif n_jobs >= 0:
            self.n_jobs = n_jobs
        else:
            raise ValueError(f'n_jobs must be >= 0 or -1, but got {n_jobs}')
        if self.n_jobs > 0 and self.archive_handler is not None:
            logger.info('Multiprocessing with archive is too slow, so it is disabled')
            logger.info(
                'Running without multiprocessing can be relatively slow, consider unarchiving the input file in advance'
            )
            self.n_jobs = 0

        self._did2knp: Dict[str, str] = {}
        self._did2file: Dict[str, FileHandler] = {}
        if self.did_from_sid is True:
            with (self.archive_handler.open() if self.archive_handler else nullcontext()) as archive:
                args_iter = (
                    (self, file, archive) for file in file_paths if file.content_basename.endswith(knp_ext)
                )
                if self.n_jobs > 0:
                    with futures.ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
                        rets: Iterable[Dict[str, str]] = executor.map(KyotoReader._read_knp_wrapper, *zip(*args_iter))
                else:
                    rets: List[Dict[str, str]] = [KyotoReader._read_knp_wrapper(*args) for args in args_iter]
                self._did2knp.update(dict(ChainMap(*rets)))
        else:
            self._did2file.update(
                {file.path.stem: file for file in file_paths if file.content_basename.endswith(knp_ext)}
            )
        self.doc_ids: List[str] = sorted({*self._did2knp.keys(), *self._did2pkl.keys(), *self._did2file.keys()})

        self.target_cases: Collection[str] = self._get_targets(target_cases, ALL_CASES, 'case')
        self.target_corefs: Collection[str] = self._get_targets(target_corefs, ALL_COREFS, 'coref')
        self.relax_cases: bool = relax_cases
        self.extract_nes: bool = extract_nes
        self.use_pas_tag: bool = use_pas_tag
        self.knp_ext: str = knp_ext
        self.pickle_ext: str = pickle_ext

    def get_knp(self, did: str) -> str:
        if did in self._did2knp:
            return self._did2knp[did]
        with (self.archive_handler.open() if self.archive_handler else nullcontext()) as archive:
            if did in self._did2file:
                self._did2knp.update(self._read_knp_wrapper(self._did2file[did], archive))
                return self._did2knp[did]
            if did in self._did2pkl:
                if archive is not None:
                    with self.archive_handler.open_member(archive, str(self._did2pkl[did].path)) as f:
                        document = pickle.load(f)
                else:
                    with self._did2pkl[did].open(mode='rb') as f:
                        document = pickle.load(f)
                self._did2knp[did] = document.knp_string
                return self._did2knp[did]
        raise ValueError(f'document id: {did} not found')

    def _read_knp_wrapper(self,
                          file: FileHandler,
                          archive: Optional[ArchiveFile] = None,
                          ) -> Dict[str, str]:
        """Read a KNP format file located at the specified path. The file can contain multiple documents.

        Args:
            file (FileHandler): A file handler indicating a path to a KNP format file.
            archive (Optional[ArchiveFile]): An archive to read the document from.

        Returns:
            Dict[str, str]: A mapping from a document ID to a KNP format string.
        """
        if archive is not None:
            with self.archive_handler.open_member(archive, str(file.path)) as f:
                return self._read_knp(io.TextIOWrapper(f, encoding='utf-8'), file.path, did_from_sid=self.did_from_sid)
        else:
            with file.open(mode='rt') as f:
                return self._read_knp(f, file.path, did_from_sid=self.did_from_sid)

    @staticmethod
    def _read_knp(file: TextIO,
                  path: Path,
                  did_from_sid: bool,
                  ) -> Dict[str, str]:
        buff = ''
        did = sid = None
        did2knps = {}
        for line in file:
            if line.startswith('# S-ID:') and did_from_sid:
                sid_string = line[7:].strip().split()[0]
                match = SID_PTN_KWDLC.match(sid_string) or SID_PTN_WAC.match(sid_string) or SID_PTN.match(sid_string)
                if match is None:
                    raise ValueError(f'unsupported S-ID format: {sid_string} in {path}')
                # Flush the buffer when the document ID changes or the same sentence ID repeats.
                if did != match.group('did') or sid == match.group('sid'):
                    if did is not None:
                        did2knps[did] = buff
                        buff = ''
                    did = match.group('did')
                    sid = match.group('sid')
            buff += line
        if did_from_sid is False:
            did = path.stem
        if did is not None and buff:
            did2knps[did] = buff
        else:
            logger.warning(f'empty file found and skipped: {path}')
        return did2knps

    @staticmethod
    def _get_targets(input_: Optional[Collection],
                     all_: Collection[Any],
                     type_: str,
                     ) -> Collection[Any]:
        """Return a list of known relations."""
        if input_ is None:
            return all_
        target = []
        for item in input_:
            if item not in all_:
                logger.warning(f'unknown target {type_}: {item}')
                continue
            target.append(item)
        return target

    def process_document(self, doc_id: str, archive: Optional[ArchiveFile] = None) -> Optional[Document]:
        """Process one document following the given document ID.

        Args:
            doc_id (str): An ID of a document to process.
            archive (Optional[ArchiveFile]): An archive to read the document from.
        """
        if doc_id in self._did2pkl:
            if archive is not None:
                with self.archive_handler.open_member(archive, str(self._did2pkl[doc_id].path)) as f:
                    return pickle.load(f)
            else:
                with self._did2pkl[doc_id].open(mode='rb') as f:
                    return pickle.load(f)
        return Document(self.get_knp(doc_id),
                        doc_id,
                        self.target_cases,
                        self.target_corefs,
                        self.relax_cases,
                        self.extract_nes,
                        self.use_pas_tag)

    def process_documents(self,
                          doc_ids: Iterable[str],
                          n_jobs: Optional[int] = None,
                          ) -> List[Optional[Document]]:
        """Process multiple documents following the given document IDs.

        Args:
            doc_ids (Iterable[str]): IDs of documents to process.
            n_jobs (Optional[int]): The number of processes spawned to finish this task.
                (default: inherit from self)
        """
        if n_jobs is None:
            n_jobs = self.n_jobs
        elif n_jobs == -1:
            n_jobs = os.cpu_count()
        elif n_jobs < -1:
            raise ValueError(f'n_jobs must be >= 0 or -1, but got {n_jobs}')
        if self.archive_handler is not None:
            assert n_jobs == 0
        with (self.archive_handler.open() if self.archive_handler else nullcontext()) as archive:
            process_document = partial(KyotoReader.process_document, self, archive=archive)
            if n_jobs > 0:
                with futures.ProcessPoolExecutor(max_workers=n_jobs) as executor:
                    rets: Iterable[Optional[Document]] = executor.map(process_document, doc_ids)
            else:
                rets: Iterable[Optional[Document]] = map(process_document, doc_ids)
            return list(rets)

    def process_all_documents(self,
                              n_jobs: Optional[int] = None,
                              ) -> List[Optional[Document]]:
        """Process all documents that KyotoReader has loaded.

        Args:
            n_jobs (Optional[int]): The number of processes spawned to finish this task.
                (default: inherit from self)
        """
        return self.process_documents(self.doc_ids, n_jobs)

    def __len__(self):
        return len(self.doc_ids)

    def __getitem__(self, doc_id: str) -> Document:
        return self.process_document(doc_id)
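

# Usage sketch (illustrative only; 'kwdlc/knp/' is a hypothetical corpus path):
# KyotoReader indexes all documents under `source` at construction time and
# exposes them via doc_ids, __len__, __getitem__, and the process_* methods.
#
#     reader = KyotoReader('kwdlc/knp/', target_cases=['ガ', 'ヲ', 'ニ'])
#     print(len(reader))                             # number of indexed documents
#     document = reader.process_document(reader.doc_ids[0])
#     # or process everything at once, in parallel on all CPU cores by default:
#     documents = reader.process_all_documents()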