# Source code for cosmel.util.ckipws

#!/usr/bin/env python3
# -*- coding:utf-8 -*-


__author__    = 'Mu Yang <emfomy@gmail.com>'
__copyright__ = 'Copyright 2017-2018'


import ctypes
import itertools
import os
import re
import sys

from ckipws import CkipWS

from cosmel.util.core import *


class CkipWs():
	"""The word segmentation driver.

	Wraps a ``CkipWS`` core object (imported from ``ckipws``) together with a
	list of compound-word rewrite rules that :meth:`replace` applies to text.
	"""

	def __init__(self, ini_file, lex_files, compound_files, input_encoding=None, output_encoding=None, verbose=True):
		"""Prepare the lexicon, compile the compound rules and create the core.

		The lexicon path is taken from the ``FileName`` entry of the
		``[CTextLexicon]`` section of ``ini_file``; the contents of every file
		in ``lex_files`` are concatenated into that path before the core is
		constructed with ``CkipWS(ini_file)``.

		Args:
			ini_file:        path of the INI file.
			lex_files:       iterable of lexicon files to merge.
			compound_files:  iterable of compound rule files (see
			                 :meth:`_compile_compound_rules`).
			input_encoding:  encoding used to read ``ini_file`` and ``lex_files``.
			output_encoding: encoding used to write the merged lexicon.
			verbose:         print a summary line when done.

		Raises:
			ValueError: if ``ini_file`` has no ``[CTextLexicon]`` section or that
				section has no usable ``FileName`` entry.
		"""
		lex_file = self._find_lexicon_file(ini_file, input_encoding)

		# ``os.makedirs('')`` fails, so only create a directory when the lexicon
		# path actually has a directory component.
		lex_dir = os.path.dirname(lex_file)
		if lex_dir:
			os.makedirs(lex_dir, exist_ok=True)
		with open(lex_file, 'w', encoding=output_encoding) as fout:
			for file in lex_files:
				with open(file, encoding=input_encoding) as fin:
					fout.write(fin.read()+'\n')

		self.__regexes = self._compile_compound_rules(compound_files)

		self.__core = CkipWS(ini_file)
		if verbose: print(f'Initialize CKIPWS with INI "{ini_file}" using lexicon "{lex_file}"')

	@staticmethod
	def _find_lexicon_file(ini_file, encoding=None):
		"""Return the ``FileName`` value of the ``[CTextLexicon]`` INI section.

		The value is everything after the first ``=`` of the first line in the
		section that contains ``FileName``; it is returned without stripping.

		Raises:
			ValueError: if the section is missing, or if it ends (next ``[...]``
				header or end of file) before a ``FileName=...`` line is found.
		"""
		with open(ini_file, encoding=encoding) as fin:
			lines = fin.read().splitlines()

		# ``list.index`` raises instead of returning a negative index.
		try:
			start = lines.index('[CTextLexicon]')
		except ValueError:
			raise ValueError(f'INI file "{ini_file}" contains no [CTextLexicon]') from None

		for line in lines[start+1:]:
			# ``startswith`` is safe on blank lines, unlike ``line[0]``.
			if line.startswith('['):
				break
			if 'FileName' in line:
				_, sep, value = line.partition('=')
				if sep:
					return value
		raise ValueError(f'INI file "{ini_file}" contains no [CTextLexicon] FileName')

	@staticmethod
	def _compile_compound_rules(compound_files):
		"""Build the ``(pattern, replacement, label)`` rules used by :meth:`replace`.

		Every non-blank line of each file is split on tabs: the first field is
		the word to look for, the second is the replacement text. The pattern
		matches the word followed by a parenthesised alphanumeric tag, at the
		start of the text or right after a newline or a full-width space. A
		final rule removes every ``　□(SP)`` token.

		NOTE(review): files are opened with the default encoding rather than
		the constructor's ``input_encoding`` — kept as before; confirm whether
		that is intended.
		"""
		rules = []
		for file in compound_files:
			# Context manager so each handle is closed once it has been read.
			with open(file) as fin:
				for line in fin:
					entry = line.strip()
					if not entry: continue
					seg = entry.split('\t')
					rules.append((re.compile(rf'(\A|(?<=\n|　)){re.escape(seg[0])}\([A-Za-z0-9]*?\)'), seg[1], seg[0]))
		rules.append((re.compile(r'　□\(SP\)'), '', '□'))
		return rules

	def ws_file(self, input_file, output_file, verbose=True):
		"""Segment ``input_file`` into ``output_file`` via the core's ``apply_file``."""
		if verbose: print(f'Processing Word Segment on {input_file} to {output_file}')
		self.__core.apply_file(input_file, output_file)

	def ws_list(self, input_file, output_file, verbose=True):
		"""Segment all lines of ``input_file`` in a single ``apply_article`` call.

		The returned items are written to ``output_file`` joined by newlines.
		"""
		if verbose: print(f'Processing Word Segment on {input_file} to {output_file}')
		with open(input_file) as fin, open(output_file, 'w') as fout:
			fout.write('\n'.join(self.__core.apply_article(fin.readlines())))

	def ws_line(self, input_file, output_file, verbose=True):
		"""Segment ``input_file`` one line at a time.

		Each line is cut into chunks of at most 80 characters, the chunks are
		passed to the core's ``apply_article``, and the results are joined with
		a full-width space and written as one output line.
		"""
		if verbose: print(f'Processing Word Segment on {input_file} to {output_file}')
		with open(input_file) as fin, open(output_file, 'w') as fout:
			lines = fin.readlines()
			total = str(len(lines))
			width = len(total)
			for idx, line in enumerate(lines):
				if verbose: printr(f'{idx+1:0{width}}/{total}\t{line[:8].strip()}...')
				chunks = [line[pos:pos+80] for pos in range(0, len(line), 80)]
				fout.write('　'.join(self.__core.apply_article(chunks))+'\n')
		if verbose: print()

	def replace(self, input_file, output_file, input_encoding=None, output_encoding=None, verbose=True):
		"""Apply every compound rule, in order, to the whole of ``input_file``.

		The rewritten text is written to ``output_file``. Replacements go
		through ``re.sub``, so backslash escapes and group references in a
		rule's replacement text are interpreted.
		"""
		with open(input_file, encoding=input_encoding) as fin, open(output_file, 'w', encoding=output_encoding) as fout:
			text = fin.read()
			for pattern, replacement, label in self.__regexes:
				if verbose: printr(label)
				text = pattern.sub(replacement, text)
			fout.write(text)