Source code for cosmel.corpus.mention

#!/usr/bin/env python3
# -*- coding:utf-8 -*-


__author__    = 'Mu Yang <emfomy@gmail.com>'
__copyright__ = 'Copyright 2017-2018'


import collections.abc
import itertools
import json
import operator
import os

from cosmel.util import *
from cosmel.corpus.article import *


class Mention:
    """The mention class.

    A mention is identified by its article, its sentence index and its index
    inside that sentence.  The span handled here always covers exactly one
    word: it starts at ``mid`` and ends at ``mid + 1``.

    Args:
        article (:class:`.Article`): the article containing this mention.
        sid (int): the sentence index in the article.
        mid (int): the mention index in the sentence.
        gid (str): the golden product ID.
        nid (str): the network-predicted product ID.
        rid (str): the rule-labeled product ID.
        rule (str): the rule.
        **kwargs: any other keyword attributes (the original documentation
            lists ``idxs``); they are not interpreted here, only stored and
            re-emitted by :attr:`attrs`.

    Note:
        Extra positional arguments after ``mid`` are rejected by an assertion.
    """

    def __init__(self, article, sid, mid, *args, gid='', nid='', rid='', rule='', **kwargs):
        assert len(args) == 0
        super().__init__()

        self.__article = article
        self.__sid = int(sid)
        self.__mid = int(mid)
        self.__gid = gid
        self.__nid = nid
        self.__rid = rid
        self.__rule = rule
        # The mention span is the single word located at ``mid``.
        self.__start = self.__mid
        self.__end = self.__mid+1
        self.__kwargs = kwargs

    def __render(self, fmt):
        """Format the sentence with the mention bracketed and colored, using ``fmt`` on each part."""
        return f'{fmt(self.sentence_pre)} [{colored("0;95", fmt(self.mention))}] {fmt(self.sentence_post)}'

    def __str__(self):
        return self.__render(str)

    def __repr__(self):
        return self.__render(repr)

    def __txtstr__(self):
        return self.__render(txtstr)

    def __roledstr__(self):
        return self.__render(roledstr)

    def __hash__(self):
        # Hash on the (article ID, sentence ID, mention ID) tuple.  The
        # attribute must be read through ``self``; a bare ``ids`` is an
        # undefined name and raises NameError.
        return hash(self.ids)

    @property
    def article(self):
        """:class:`.Article`: the article containing this mention."""
        return self.__article

    @property
    def sentence(self):
        """:class:`.WsWords`: the sentence containing this mention."""
        return self.article[self.__sid]

    @property
    def sentence_pre(self):
        """:class:`.WsWords`: the words before this mention in the sentence."""
        return self.sentence[self._slice_pre]

    def sentence_pre_(self, with_mention=True):
        """:class:`.WsWords`: the words before this mention in the sentence (with/without mention itself)."""
        return self.sentence[self._slice_pre_(with_mention)]

    @property
    def sentence_post(self):
        """:class:`.WsWords`: the words after this mention in the sentence."""
        return self.sentence[self._slice_post]

    def sentence_post_(self, with_mention=True):
        """:class:`.WsWords`: the words after this mention in the sentence (with/without mention itself)."""
        return self.sentence[self._slice_post_(with_mention)]

    @property
    def mention(self):
        """:class:`.WsWords`: this mention."""
        return self.sentence[self._slice]

    @property
    def bundle(self):
        """:class:`.MentionBundle`: the mention bundle containing this mention."""
        return self.__article.bundle

    @property
    def asmid(self):
        """tuple: the tuple of article ID, sentence ID, and mention ID."""
        return (self.aid, self.sid, self.mid,)

    @property
    def ids(self):
        """tuple: alias of :attr:`asmid`."""
        return self.asmid

    @property
    def aid(self):
        """str: the article ID."""
        return self.__article.aid

    @property
    def sid(self):
        """int: the sentence ID (the sentence index in the article)."""
        return self.__sid

    @property
    def mid(self):
        """int: the mention ID (the mention index in the sentence)."""
        return self.__mid

    @property
    def start_idx(self):
        """int: the starting index of the mention in the sentence."""
        return self.__start

    @property
    def end_idx(self):
        """int: the ending index of the mention in the sentence."""
        return self.__end

    @property
    def last_idx(self):
        """int: the index of the last word of the mention in the sentence."""
        return self.__end-1

    @property
    def _slice(self):
        """slice: the slice index of this mention in the sentence."""
        return slice(self.__start, self.__end)

    @property
    def _slice_pre(self):
        """slice: the slice index of the words before this mention in the sentence."""
        return slice(None, self.__start)

    def _slice_pre_(self, with_mention=True):
        """slice: the slice index of the words before this mention in the sentence (with/without mention)."""
        return slice(None, self.__end) if with_mention else slice(None, self.__start)

    @property
    def _slice_post(self):
        """slice: the slice index of the words after this mention in the sentence."""
        return slice(self.__end, None)

    def _slice_post_(self, with_mention=True):
        """slice: the slice index of the words after this mention in the sentence (with/without mention)."""
        return slice(self.__start, None) if with_mention else slice(self.__end, None)

    @property
    def gid(self):
        """str: the golden product ID."""
        return self.__gid

    @property
    def nid(self):
        """str: the network-predicted product ID."""
        return self.__nid

    @property
    def rid(self):
        """str: the rule-labeled product ID."""
        return self.__rid

    @property
    def rule(self):
        """str: the rule for the product ID."""
        return self.__rule

    @property
    def head_ws(self):
        """:class:`.WsWords`: the word-segmented head word."""
        return self.sentence[self.__mid]

    @property
    def head(self):
        """str: the head word (alias of :attr:`head_txt`)."""
        return self.head_txt

    @property
    def head_txt(self):
        """str: the head word."""
        return self.sentence.txts[self.__mid]

    @property
    def head_tag(self):
        """str: the tag of the head word."""
        return self.sentence.tags[self.__mid]

    @property
    def head_role(self):
        """str: the head role."""
        return self.sentence.roles[self.__mid]

    @property
    def attrs(self):
        """dict: the xml attributes; a fresh dictionary is built on every access."""
        return dict(sid=self.__sid, mid=self.__mid, gid=self.__gid, nid=self.__nid, rid=self.__rid, \
                rule=self.__rule, **self.__kwargs)

    @property
    def start_xml(self):
        """str: the starting XML tag."""
        return self.start_xml_()

    def start_xml_(self, **kwargs):
        """str: the starting XML tag with custom attributes.

        Keyword arguments override or extend the values of :attr:`attrs` for
        this call only; the mention itself is not modified.
        """
        attrs = self.attrs
        attrs.update(kwargs)
        return '<product ' + ' '.join(f'{k}="{v}"' for k, v in attrs.items()) + '>'

    @property
    def end_xml(self):
        """str: the ending XML tag."""
        return '</product>'

    @property
    def json(self):
        """str: :attr:`attrs` serialized as a JSON string."""
        return json.dumps(self.attrs)

    def set_gid(self, gid):
        """Sets the golden product ID."""
        self.__gid = gid

    def set_nid(self, nid):
        """Sets the network-predicted product ID."""
        self.__nid = nid

    def set_rid(self, rid):
        """Sets the rule-labeled product ID."""
        self.__rid = rid

    def set_rule(self, rule):
        """Sets the rule for the product ID."""
        self.__rule = rule
class MentionSet(collections.abc.Collection):
    """The flat collection of every mention found in a set of mention bundles.

    * Item: mention (:class:`.Mention`)

    Args:
        mention_bundles (:class:`.MentionBundleSet`): the set of mention bundles;
            each bundle is iterated once, in order, and its ``path`` is kept.
    """

    def __init__(self, mention_bundles):
        super().__init__()
        # Flatten bundle by bundle, preserving the order of both levels.
        flattened = [mention for bundle in mention_bundles for mention in bundle]
        self.__data = flattened
        self.__path = mention_bundles.path

    def __contains__(self, item):
        return item in self.__data

    def __iter__(self):
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)

    @property
    def path(self):
        """str: the root path of the mentions."""
        return self.__path
class MentionBundle(collections.abc.Sequence):
    """The bundle of mentions in an article.

    * Item: mention (:class:`.Mention`)

    The bundle file holds one JSON object per line; each object is expanded
    into the keyword arguments of :class:`.Mention`.

    Args:
        file_path (str):            the path to the mention bundle.
        article (:class:`.Article`): the article containing this mention bundle.
    """

    def __init__(self, file_path, article):
        super().__init__()

        # Read with an explicit encoding so the result does not depend on the
        # platform default, and ignore blank lines (e.g. a trailing newline),
        # which are not valid JSON documents.
        with open(file_path, encoding='utf-8') as fin:
            self.__data = [Mention(article, **json.loads(line)) for line in fin if line.strip()]
        self.__article = article
        self.__path = file_path

    def __contains__(self, item):
        return item in self.__data

    def __getitem__(self, key):
        return self.__data[key]

    def __iter__(self):
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)

    def __str__(self):
        return '\n'.join(map(str, self.__data))

    def __repr__(self):
        return '\n'.join(map(repr, self.__data))

    def __txtstr__(self):
        return '\n'.join(map(txtstr, self.__data))

    def __roledstr__(self):
        return '\n'.join(map(roledstr, self.__data))

    def __hash__(self):
        return hash(self.aid)

    @property
    def article(self):
        """:class:`.Article`: the article of this bundle."""
        return self.__article

    @property
    def aid(self):
        """str: the article ID (with leading author name and underscore)."""
        return self.__article.aid

    @property
    def path(self):
        """str: the related file path."""
        return self.__path

    def save(self, file_path):
        """Save the mention bundle to json file.

        One JSON object is written per line.  Missing parent directories are
        created first.

        Args:
            file_path (str): the output file path.
        """
        # ``os.path.dirname`` is '' for a bare file name, and
        # ``os.makedirs('')`` raises FileNotFoundError, so only create the
        # directory when there is one.
        folder = os.path.dirname(file_path)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as fout:
            fout.writelines(mention.json+'\n' for mention in self)
class MentionBundleSet(collections.abc.Collection):
    """The set of mention bundles.

    * Item: mention bundle (:class:`.MentionBundle`)

    One bundle is loaded per article.  The bundle file path is derived from
    the article path with ``transform_path``; progress is reported through
    ``printr``.  Each loaded bundle is also attached to its article.

    Args:
        mention_root (str):               the path to the folder containing mention files.
        article_set (:class:`.ArticleSet`): the set of articles.
    """

    def __init__(self, mention_root, article_set):
        super().__init__()
        # ``n`` is kept as a string: its length gives the zero-padding width
        # of the progress counter.
        n = str(len(article_set))
        self.__data = [self.__mention_bundle(article, article_set.path, mention_root, i, n) \
                for i, article in enumerate(article_set)]
        self.__path = mention_root
        print()

    @staticmethod
    def __mention_bundle(article, article_root, mention_root, i, n):
        """Load the bundle of ``article`` and attach it to the article object."""
        file_path = transform_path(article.path, article_root, mention_root, '.json')
        printr(f'{i+1:0{len(n)}}/{n}\tReading {file_path}')
        bundle = MentionBundle(file_path, article)
        # Writes the name-mangled private attribute of Article directly.
        article._Article__bundle = bundle
        return bundle

    @staticmethod
    def _swap_root(path, old_root, new_root):
        """Return ``path`` with its leading ``old_root`` replaced by ``new_root``.

        Only the prefix is rewritten, so a root string that happens to occur
        again further down the path is left alone.  When ``path`` does not
        start with ``old_root`` the plain substring replacement is used as a
        fallback.
        """
        if path.startswith(old_root):
            return new_root + path[len(old_root):]
        return path.replace(old_root, new_root)

    def __contains__(self, item):
        return item in self.__data

    def __iter__(self):
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)

    @property
    def path(self):
        """str: the root path of the mentions."""
        return self.__path

    def save(self, output_root):
        """Save all mention bundles to files.

        Each bundle is written to its own path with the mention root swapped
        for ``output_root``.

        Args:
            output_root (str): the path to the folder receiving the mention files.
        """
        n = str(len(self))
        for i, bundle in enumerate(self):
            file_path = self._swap_root(bundle.path, self.__path, output_root)
            printr(f'{i+1:0{len(n)}}/{n}\t{file_path}')
            bundle.save(file_path)
        print()
class Id2Mention(collections.abc.Mapping):
    """Read-only lookup from a mention's ID tuple to the mention object.

    * Key:  the article ID, sentence ID, and mention ID (tuple).
    * Item: the mention object (:class:`.Mention`).

    Keys are taken from the ``asmid`` attribute of each mention; when two
    mentions share a key the later one wins.

    Args:
        mention_set (:class:`.MentionSet`): the mention set.
    """

    def __init__(self, mention_set):
        super().__init__()
        lookup = {}
        for item in mention_set:
            lookup[item.asmid] = item
        self.__data = lookup

    def __contains__(self, key):
        return key in self.__data

    def __getitem__(self, key):
        return self.__data[key]

    def __iter__(self):
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)
class Id2MentionBundle(collections.abc.Mapping):
    """The dictionary maps article ID to mention bundle.

    * Key:  the article ID (str).
    * Item: the mention bundle (:class:`.MentionBundle`).

    This is a view over ``id_to_article``: the bundle is read from the
    ``bundle`` attribute of the article at lookup time.

    Args:
        id_to_article (:class:`.Id2Article`): the dictionary maps article ID to article object.
    """

    def __init__(self, id_to_article):
        super().__init__()
        self.__data = id_to_article

    def __contains__(self, key):
        return key in self.__data

    def __getitem__(self, key):
        return self.__data[key].bundle

    def __iter__(self):
        # A Mapping iterates over its keys; ``keys()``, ``items()`` and
        # ``values()`` are all derived from this together with __getitem__.
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)
class Head2MentionList(collections.abc.Mapping):
    """Read-only lookup from a head word to the mentions sharing that head.

    * Key:  mention head word (str).
    * Item: :class:`.ReadOnlyList` of mention object (:class:`.Mention`).

    Looking up a head word that was never seen returns an empty
    :class:`.ReadOnlyList` instead of raising, although ``in`` still reports
    such a key as absent.

    Args:
        mention_set (:class:`.MentionSet`): the mention set.
    """

    def __init__(self, mention_set):
        super().__init__()

        # Group the mentions by head word, keeping first-seen order of heads
        # and the original order of mentions inside each group.
        grouped = {}
        for mention in mention_set:
            grouped.setdefault(mention.head, []).append(mention)

        self.__data = {head: ReadOnlyList(members) for head, members in grouped.items()}
        self.__empty_collection = ReadOnlyList()

    def __contains__(self, key):
        return key in self.__data

    def __getitem__(self, key):
        try:
            return self.__data[key]
        except KeyError:
            return self.__empty_collection

    def __iter__(self):
        return iter(self.__data)

    def __len__(self):
        return len(self.__data)