Source code for cosmel.corpus.parsed

#!/usr/bin/env python3
# -*- coding:utf-8 -*-


__author__    = 'Mu Yang <emfomy@gmail.com>'
__copyright__ = 'Copyright 2017-2018'


import collections.abc
import os
import re

from cosmel.util import *
from cosmel.corpus.article import *

class ParsedArticle(collections.abc.Sequence):
    """The parsed article object (contains list of sentences).

    * Item: the parsed sentence (str)

    Args:
        file_path (str): the path to the parsed article file.
        article (:class:`.Article`): the article this parsed text belongs to.
    """

    # Leading marker of the form ``#1:1.[0] `` found at the start of a line.
    # It is matched as a whole token: ``str.lstrip`` would treat it as a *set*
    # of characters and also eat the beginning of the sentence itself.
    _PREFIX_RE = re.compile(r'#\d+:\d+\.\[\d+\] ')

    def __init__(self, file_path, article):
        super().__init__()
        # NOTE(review): the file is opened with the platform default encoding,
        # as before -- confirm whether an explicit encoding is wanted.
        with open(file_path) as fin:
            self.__data = [self.__load_line(line) for line in fin]
        self.__article = article
        self.__path = file_path

    @staticmethod
    def __load_line(line):
        """Extract the sentence part of one line of a parsed file.

        The leading marker (if present) is removed, then everything from the
        first remaining ``#`` onwards is dropped.

        Args:
            line (str): one raw line of the parsed file.

        Returns:
            str: the text between the leading marker and the next ``#``.
            When the line has no such ``#`` the remainder is returned
            unchanged, including any trailing newline.
        """
        marker = ParsedArticle._PREFIX_RE.match(line)
        if marker:
            line = line[marker.end():]
        return line.split('#', 1)[0]

    def __contains__(self, item):
        return item in self.__data

    def __getitem__(self, key):
        return self.__data[key]

    def __iter__(self):
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)

    def __str__(self):
        return '\n'.join(map(str, self.__data))

    def __repr__(self):
        return '\n'.join(map(repr, self.__data))

    def __hash__(self):
        # Hash on the article ID.
        return hash(self.aid)

    @property
    def article(self):
        """:class:`.Article`: the article of this bundle."""
        return self.__article

    @property
    def aid(self):
        """str: the article ID (with leading author name and underscore)."""
        return self.__article.aid

    @property
    def path(self):
        """str: the related file path."""
        return self.__path
class ParsedArticleSet(collections.abc.Collection):
    """The set of parsed articles.

    * Item: the parsed article object (:class:`.ParsedArticle`)

    Args:
        parsed_root (str):                  the path to the folder containing parsed article files.
        article_set (:class:`.ArticleSet`): the set of articles.

    Notes:
        * One parsed file is loaded for every article in ``article_set``; its
          path is obtained by passing the article path, ``article_set.path``,
          ``parsed_root`` and ``'.parse'`` to ``transform_path``.
        * Each loaded :class:`.ParsedArticle` is also attached to its article.
    """

    def __init__(self, parsed_root, article_set):
        super().__init__()

        # Total count as text; its width pads the progress counter.
        n = str(len(article_set))

        loaded = []
        for i, article in enumerate(article_set):
            loaded.append(self.__parsed_article(article, article_set.path, parsed_root, i, n))

        self.__data = loaded
        self.__path = parsed_root

        # Emit a newline once loading is done (presumably to end the progress
        # line written by ``printr`` -- verify against cosmel.util).
        print()

    @staticmethod
    def __parsed_article(article, article_root, parsed_root, i, n):
        """Load the parsed file of one article and link it to that article.

        Args:
            article:            the article object to load the parsed file for.
            article_root (str): the root path of the articles.
            parsed_root (str):  the root path of the parsed files.
            i (int):            zero-based position of the article (for the progress message).
            n (str):            total number of articles, as a string.

        Returns:
            :class:`.ParsedArticle`: the loaded parsed article.
        """
        file_path = transform_path(article.path, article_root, parsed_root, '.parse')
        printr(f'{i+1:0{len(n)}}/{n}\tReading {file_path}')

        # Store the result in the article's name-mangled private attribute.
        article._Article__parsed = result = ParsedArticle(file_path, article)
        return result

    def __contains__(self, item):
        return item in self.__data

    def __iter__(self):
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)

    @property
    def path(self):
        """str: the root path of the articles."""
        return self.__path
class Id2ParsedArticle(collections.abc.Mapping):
    """The dictionary maps article ID to parsed article.

    * Key:  the article ID (str).
    * Item: the parsed article (:class:`.ParsedArticle`).

    Args:
        id_to_article (:class:`.Id2Article`): the dictionary maps article ID to article object.

    Notes:
        * This is a read-only view: values are looked up through the
          ``parsed`` attribute of the wrapped articles on every access.
    """

    def __init__(self, id_to_article):
        super().__init__()
        self.__data = id_to_article

    def __contains__(self, key):
        return key in self.__data

    def __getitem__(self, key):
        return self.__data[key].parsed

    def __iter__(self):
        # A mapping iterates over its keys; the ``Mapping`` mixins
        # (``keys``, ``items``, ``values``) build on that and reach the
        # parsed articles through ``__getitem__``.
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)