#!/usr/bin/env python3
# -*- coding:utf-8 -*-
__author__ = 'Mu Yang <emfomy@gmail.com>'
__copyright__ = 'Copyright 2017-2018'
import collections.abc
import os
from cosmel.util import *
from cosmel.corpus.article import *
class ParsedArticle(collections.abc.Sequence):
    """The parsed article object (contains list of sentences).

    * Item: the parsed sentence (str)

    Args:
        file_path (str): the path to the parsed article file.
        article (:class:`.Article`): the article this parse belongs to.
    """

    def __init__(self, file_path, article):
        # NOTE(review): the file is opened with the locale's default encoding;
        # confirm whether ``encoding='utf-8'`` should be pinned here.
        with open(file_path) as fin:
            # One cleaned-up sentence per line of the file.
            self.__data = [self.__load_line(line) for line in fin]
        self.__article = article
        self.__path = file_path

    @staticmethod
    def __load_line(line):
        """Clean one raw line of the parsed file.

        Args:
            line (str): a raw line as read from the file.

        Returns:
            str: the line without its leading marker characters, cut at the
            first remaining ``'#'``.
        """
        # ``str.lstrip`` treats its argument as a *set of characters*, not as
        # a prefix: every leading '#', '1', ':', '.', '[', '0', ']' or ' ' is
        # removed until some other character is met.
        line = line.lstrip('#1:1.[0] ')
        # Keep only the text before the next '#'.  When the line has no '#',
        # it is returned whole (including its trailing newline, if any).
        line = line.split('#', 1)[0]
        return line

    def __contains__(self, item):
        return item in self.__data

    def __getitem__(self, key):
        return self.__data[key]

    def __iter__(self):
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)

    def __str__(self):
        return '\n'.join(map(str, self.__data))

    def __repr__(self):
        return '\n'.join(map(repr, self.__data))

    def __hash__(self):
        # Hash by article ID.
        return hash(self.aid)

    @property
    def article(self):
        """:class:`.Article`: the article of this bundle."""
        return self.__article

    @property
    def aid(self):
        """str: the article ID (with leading author name and underscore)."""
        return self.__article.aid

    @property
    def path(self):
        """str: the related file path."""
        return self.__path
class ParsedArticleSet(collections.abc.Collection):
    """The set of parsed articles.

    * Item: the parsed article object (:class:`.ParsedArticle`)

    Args:
        parsed_root (str): the path to the folder containing parsed article files.
        article_set (:class:`.ArticleSet`): the set of articles.

    Notes:
        * One parsed file is read for every article in ``article_set``; its
          path is ``transform_path(article.path, article_set.path,
          parsed_root, '.parse')``.
        * Each loaded :class:`.ParsedArticle` is also stored on its article
          (see the assignment in ``__parsed_article``).
    """

    def __init__(self, parsed_root, article_set):
        # Total count as a string: used both as the printed total and, via
        # its length, as the zero-padding width of the running index.
        n = str(len(article_set))
        self.__data = [
            self.__parsed_article(article, article_set.path, parsed_root, i, n)
            for i, article in enumerate(article_set)
        ]
        self.__path = parsed_root

    @staticmethod
    def __parsed_article(article, article_root, parsed_root, i, n):
        """Load the parsed file of one article and attach it to the article.

        Args:
            article (:class:`.Article`): the article to load the parse of.
            article_root (str): the root path of the article set.
            parsed_root (str): the root path of the parsed files.
            i (int): zero-based index of the article (reported as ``i+1``).
            n (str): the total number of articles, as a string.

        Returns:
            :class:`.ParsedArticle`: the loaded parsed article.
        """
        file_path = transform_path(article.path, article_root, parsed_root, '.parse')
        # ``printr`` comes from cosmel.util -- presumably a progress printer;
        # confirm there.  The index is zero-padded to the width of ``n``.
        printr(f'{i+1:0{len(n)}}/{n}\tReading {file_path}')
        parsed = ParsedArticle(file_path, article)
        # Writes the article's name-mangled private attribute directly
        # (presumably what backs ``Article.parsed`` -- verify in
        # cosmel.corpus.article).
        article._Article__parsed = parsed
        return parsed

    def __contains__(self, item):
        return item in self.__data

    def __iter__(self):
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)

    @property
    def path(self):
        """str: the root path of the articles."""
        return self.__path
class Id2ParsedArticle(collections.abc.Mapping):
    """The dictionary maps article ID to parsed article.

    * Key: the article ID (str).
    * Item: the parsed article (:class:`.ParsedArticle`).

    Args:
        id_to_article (:class:`.Id2Article`): the dictionary maps article ID to article object.
    """

    def __init__(self, id_to_article):
        # Kept as a view over the article mapping: parsed articles are looked
        # up through each article's ``parsed`` attribute on access.
        self.__data = id_to_article

    def __contains__(self, key):
        return key in self.__data

    def __getitem__(self, key):
        return self.__data[key].parsed

    def __iter__(self):
        # A Mapping must iterate over its *keys*; the ``keys()``, ``items()``,
        # ``values()`` and ``get()`` mixins are built on ``__iter__`` plus
        # ``__getitem__``.  The keys are those of the wrapped mapping.
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)