Source code for cosmel.util.word

#!/usr/bin/env python3
# -*- coding:utf-8 -*-


__author__    = 'Mu Yang <emfomy@gmail.com>'
__copyright__ = 'Copyright 2017-2018'


import collections.abc

from cosmel.util.core import *


class WsWords(collections.abc.Sequence):
    """The sequence of word-segmented words.

    Every word consists of a text, a post-tag, and a (possibly empty) role,
    written as ``text(post-tag)role``.

    Args:
        chars (str): the text with tag. (the format should be several
            'text(post-tag)'s separated by U+3000 (ideographic space)
            characters.)

    Raises:
        ValueError: if any word of ``chars`` is not of the form
            ``text(post-tag)role``.
    """

    # Word separator. Written as an escape so that the ideographic space
    # cannot be silently confused with (or normalised to) an ASCII space.
    _SEP = '\u3000'

    @classmethod
    def __split(cls, w):
        """Split one ``text(tag)role`` token into ``(text, tag, role)``."""
        try:
            txt, rest = w.split('(', 1)
            tag, role = rest.split(')', 1)
        except ValueError:
            # Same exception type as the bare unpacking error, clearer message.
            raise ValueError(
                f'malformed word {w!r}: expected the form text(tag)role'
            ) from None
        return txt, tag, role

    def __init__(self, chars):
        chars_seg = [
            self.__split(w)
            for w in chars.strip().split(self._SEP)
            if w != ''
        ]
        self.__txts = [w[0] for w in chars_seg]
        self.__tags = [w[1] for w in chars_seg]
        self.__roles = [w[2] for w in chars_seg]

    def index(self, word, *args):
        """int -- returns the index of the first word.

        Args:
            word (tuple): the tuple of text and tag, (optional) and role.
            *args: optional start/stop bounds, forwarded to :meth:`list.index`.

        Raises:
            ValueError: if ``word`` is not a 2- or 3-tuple, or is not present.
        """
        if isinstance(word, tuple):
            if len(word) == 2:
                return list(self.zip2).index(word, *args)
            if len(word) == 3:
                return list(self.zip3).index(word, *args)
        raise ValueError(f'{word} is not in list')

    def __contains__(self, key):
        """Test whether a (text, tag) or (text, tag, role) tuple is present."""
        return key in self.zip2 or key in self.zip3

    def __getitem__(self, idxs):
        """Return the selected words as a new :class:`WsWords`.

        Args:
            idxs (int or slice): the position(s) to select. An integer yields
                a :class:`WsWords` holding exactly one word.

        Raises:
            IndexError: if an integer index is out of range.
        """
        if isinstance(idxs, int):
            size = len(self.__txts)
            # The Sequence mixins (__iter__, count, ...) rely on IndexError to
            # terminate, so an out-of-range integer must not be swallowed by
            # slicing.
            if not -size <= idxs < size:
                raise IndexError('WsWords index out of range')
            # Normalise negative indices first; slice(-1, 0) would be empty.
            idxs %= size
            idxs = slice(idxs, idxs + 1)
        retval = WsWords('')
        retval.__txts = self.__txts[idxs]
        retval.__tags = self.__tags[idxs]
        retval.__roles = self.__roles[idxs]
        return retval

    def __len__(self):
        """Return the number of words."""
        return len(self.__txts)

    def __str__(self):
        """Return the ``text(tag)`` words joined by the word separator."""
        return self._SEP.join([f'{txt}({tag})' for txt, tag in self.zip2])

    def __repr__(self):
        return str(self)

    def __txtstr__(self):
        """Return all texts concatenated without any separator."""
        return ''.join(self.__txts)

    def __roledstr__(self):
        """Return the ``text(tag)role`` words joined by the word separator."""
        return self._SEP.join(
            [f'{txt}({tag}){role}' for txt, tag, role in self.zip3]
        )

    def __roledtxtstr__(self):
        """Return all ``text`` + ``role`` pairs concatenated without separator."""
        return ''.join([f'{txt}{role}' for txt, _, role in self.zip3])

    @property
    def txts(self):
        """:class:`list` -- the texts."""
        return self.__txts

    @property
    def tags(self):
        """:class:`list` -- the post-tags."""
        return self.__tags

    @property
    def roles(self):
        """:class:`list` -- the roles."""
        return self.__roles

    @property
    def zip(self):
        """zip -- the zip iterator of the texts, the tags, and the roles. (= :attr:`zip3`)."""
        return zip(self.__txts, self.__tags, self.__roles)

    @property
    def zip2(self):
        """zip -- the zip iterator of the texts and the tags."""
        return zip(self.__txts, self.__tags)

    @property
    def zip3(self):
        """zip -- the zip iterator of the texts, the tags, and the roles."""
        return zip(self.__txts, self.__tags, self.__roles)
def txtstr(obj):
    """str -- the string of texts (obj.txts), produced by ``obj.__txtstr__()``."""
    render = obj.__txtstr__
    return render()
def roledstr(obj):
    """str -- the string with role (obj.txts, obj.tags, obj.roles), produced by ``obj.__roledstr__()``."""
    render = obj.__roledstr__
    return render()
def roledtxtstr(obj):
    """str -- the string with texts and role (obj.txts, obj.roles), produced by ``obj.__roledtxtstr__()``."""
    render = obj.__roledtxtstr__
    return render()