Source code for cosmel.util.ckipws

#!/usr/bin/env python3
# -*- coding:utf-8 -*-


__author__    = 'Mu Yang <emfomy@gmail.com>'
__copyright__ = 'Copyright 2017-2018'


import ctypes
import itertools
import os
import re
import sys

from ckipws import CkipWS

from cosmel.util.core import *


class CkipWs():
    """The word segmentation driver."""

    def __init__(self, ini_file, lex_files, compound_files, input_encoding=None, output_encoding=None, verbose=True):

        # Locate the [CTextLexicon] FileName entry in the INI file.
        with open(ini_file, encoding=input_encoding) as fin:
            lines = fin.read().splitlines()
            if '[CTextLexicon]' not in lines:
                raise Exception(f'INI file "{ini_file}" contains no [CTextLexicon]')
            idx = lines.index('[CTextLexicon]')
            for line in lines[idx+1:]:
                if line.startswith('['):
                    raise Exception(f'INI file "{ini_file}" contains no [CTextLexicon] FileName')
                if 'FileName' in line:
                    lex_file = line.split('=')[1]
                    break
            else:
                raise Exception(f'INI file "{ini_file}" contains no [CTextLexicon] FileName')

        # Merge the given lexicon files into the lexicon file declared in the INI.
        os.makedirs(os.path.dirname(lex_file), exist_ok=True)
        with open(lex_file, 'w', encoding=output_encoding) as fout:
            for file in lex_files:
                with open(file, encoding=input_encoding) as fin:
                    fout.write(fin.read()+'\n')

        # Compile the compound-word replacement rules (one "<surface>\t<replacement>" pair per line),
        # plus a final rule that drops the □(SP) placeholder tokens.
        self.__regexes = []
        for line in itertools.chain.from_iterable(map(open, compound_files)):
            if line.strip() == '':
                continue
            seg = line.strip().split('\t')
            self.__regexes.append((re.compile(rf'(\A|(?<=\n| )){re.escape(seg[0])}\([A-Za-z0-9]*?\)'), seg[1], seg[0]))
        self.__regexes.append((re.compile(r' □\(SP\)'), '', '□'))

        self.__core = CkipWS(ini_file)
        if verbose:
            print(f'Initialize CKIPWS with INI "{ini_file}" using lexicon "{lex_file}"')
    def ws_file(self, input_file, output_file, verbose=True):
        if verbose:
            print(f'Processing Word Segment on {input_file} to {output_file}')
        self.__core.apply_file(input_file, output_file)
    def ws_list(self, input_file, output_file, verbose=True):
        if verbose:
            print(f'Processing Word Segment on {input_file} to {output_file}')
        with open(input_file) as fin, open(output_file, 'w') as fout:
            fout.write('\n'.join(self.__core.apply_article(fin.readlines())))
    def ws_line(self, input_file, output_file, verbose=True):
        if verbose:
            print(f'Processing Word Segment on {input_file} to {output_file}')
        with open(input_file) as fin, open(output_file, 'w') as fout:
            lines = fin.readlines()
            n = str(len(lines))
            for i, line in enumerate(lines):
                if verbose:
                    printr(f'{i+1:0{len(n)}}/{n}\t{line[:8].strip()}...')
                # Chunk each line into 80-character pieces before segmentation.
                fout.write(' '.join(self.__core.apply_article([line[j:j+80] for j in range(0, len(line), 80)]))+'\n')
        if verbose:
            print()
    def replace(self, input_file, output_file, input_encoding=None, output_encoding=None, verbose=True):
        with open(input_file, encoding=input_encoding) as fin, open(output_file, 'w', encoding=output_encoding) as fout:
            lines = fin.read()
            for regex in self.__regexes:
                if verbose:
                    printr(regex[2])
                lines = regex[0].sub(regex[1], lines)
            fout.write(lines)
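
A minimal usage sketch (not part of the module above). The file paths and lexicon layout are hypothetical, and the INI file is assumed to contain a [CTextLexicon] section with a FileName entry, as the constructor requires.

    from cosmel.util.ckipws import CkipWs

    # Hypothetical paths; adjust to the local CKIPWS installation and corpus layout.
    ws = CkipWs(
        ini_file='etc/ckipws.ini',
        lex_files=['etc/lexicon/main.lex'],
        compound_files=['etc/compound.txt'],
    )

    # Segment an article line by line, then apply the compound replacement rules.
    ws.ws_line('corpus/original/1001.txt', 'corpus/segmented/1001.txt')
    ws.replace('corpus/segmented/1001.txt', 'corpus/purged/1001.txt')

The replace step rewrites segmented compound tokens such as "word(Na)" according to the tab-separated rules loaded from compound_files, and drops the □(SP) placeholder tokens.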