# -*- coding: utf-8 -*-

__doc__ = """Functions related to reading morph.xml-like XML files
and writing TEI-like disambiguated output.
Adam Radziszewski, 19.08.2009
"""

import re, codecs
from xml.sax import saxutils

# Line-oriented patterns used by the primitive XCES reader. All of them are
# applied with match(), i.e. anchored at the beginning of a line (leading
# whitespace is allowed). All are raw strings, so that regex escapes such
# as \s are never interpreted as string escapes.

# a whole <lex> element on one line; groups: lex attributes, base form, tag
plex = re.compile(r'\s*<lex(.*)>\s*<base>(.*)</base>\s*<ctag>(.*)</ctag>\s*</lex>')
# orthographic form of a token; group: the (still escaped) orth
porth = re.compile(r'\s*<orth>(.*)</orth>')
# end of a token
pendtok = re.compile(r'\s*</tok(.*)>')
# chunk beginning; group: '>' alone or the attributes followed by '>'
pbegchunk = re.compile(r'\s*<chunk(>|\s[^>]+>)')
# chunk end
pendchunk = re.compile(r'\s*</chunk>')
# no-space marker
pns = re.compile(r'\s*<ns\s*/>')
# other tags (div and head, opening or closing)
pother = re.compile(r'\s*(<div|</div|<head|</head)')

# additional entities handled on top of those known to saxutils
unescaping = {"&apos;":"'", "&quot;":'"'}
escaping = {"'":"&apos;", '"':"&quot;"} # <, >, & are already included

# beginning of the TEI output (%s is the text id) and its end
tei_opening = '<text id="%s" lang="pl">\n<body>\n'
tei_closing = '</body>\n</text>\n'


def esc(what):
	"""Returns the given text escaped for XML output.
	
	The replacements made by saxutils.escape itself are extended
	with those listed in the module-level escaping table (quotes)."""
	escaped = saxutils.escape(what, entities=escaping)
	return escaped

def unesc(what):
	"""Returns the given XML text with entity references unescaped.
	
	The replacements made by saxutils.unescape itself are extended
	with those listed in the module-level unescaping table (quotes)."""
	plain = saxutils.unescape(what, entities=unescaping)
	return plain

def from_xces(in_fname, details):
	"""Prepares a generator yielding consecutive items.
	The items are whatever _read_one returns: KIPI-like tokens,
	beginnings/endings of a portion of text (chunk markers) or other
	tags which are to be preserved.
	
	The input filename is assumed to be in KIPI XCES format. The reader
	is primitive and not fully compliant to XML specs.
	
	in_fname -- path of the input file; it is read as UTF-8
	details -- passed on unchanged to _read_one
	
	Each chunk end marker gets the name, xref and type of the matching
	chunk beginning. Raises IndexError if a chunk end has no matching
	beginning. The input file is closed when the generator is exhausted,
	when it fails and when it is closed by the consumer."""
	
	in_str = codecs.open(in_fname, 'r', 'utf-8')
	chunkstack = [] # chunk beginnings which have not been closed yet
	try:
		while True:
			try:
				item = _read_one(in_str, details)
			except StopIteration:
				# end of input
				break
			if isinstance(item, ChunkMarker):
				if item.is_beg:
					chunkstack.append(item)
				else:
					if not chunkstack:
						raise IndexError('Chunk end without a matching chunk beginning (file %s)' % in_fname)
					last_beg = chunkstack.pop()
					item.take_data_from(last_beg)
			yield item
	finally:
		# executed on normal end as well as on errors and on generator close
		in_str.close()

def to_tei(out_fname, details):
	"""Returns a function ready to handle an iterable sequence (or whatever)
	of items. The function will store the items in TEI XML format.
	
	out_fname -- path of the output file; it is written as UTF-8
	details -- an object whose get_text_id(out_fname) gives the text id
	
	Nothing is opened or written until the returned function is called.
	The output file is closed also when writing an item fails."""
	
	def handler(item_source):
		# paragraph numbering: name_base.num.subnum.subsubnum...
		# (the list is passed to every item, chunk markers update it in place)
		name_base = details.get_text_id(out_fname)
		numbering = [name_base.rstrip('.'), 1]
		
		out_str = codecs.open(out_fname, 'w', 'utf-8')
		try:
			out_str.write(tei_opening % esc(name_base))
			for item in item_source:
				item.write_tei(out_str, numbering)
			out_str.write(tei_closing)
		finally:
			out_str.close()
	return handler

class Item:
	"""Common base of the items read from XCES format:
	a token, a chunk beginning or a chunk end."""
	
	def write_tei(self, out_fname, numbering = None):
		"""To be overridden by subclasses; this base version always
		raises NotImplementedError."""
		raise NotImplementedError('Abstract item')

class Other(Item):
	"""Any other XML tag; it is written to the output the way it was
	read, except for surrounding whitespace."""
	
	def __init__(self, data):
		# keep the text without leading and trailing whitespace
		stripped = data.strip()
		self.content = stripped
	
	def write_tei(self, out_str, numbering = None):
		"""Writes the stored text followed by a newline; numbering is not used."""
		output_line = u'%s\n' % self.content
		out_str.write(output_line)

class ChunkMarker(Item):
	"""Beginning or end of a chunk (a <chunk ...> or a </chunk> tag).
	
	Attributes:
	is_beg -- True for a chunk beginning, False for a chunk end
	name -- value of the id attribute or None
	xref -- value of the xlink:href attribute or None
	type -- value of the type attribute or None"""
	
	UNNAMED = u'UNNAMED'
	
	# One attribute: its name, '=' and a value in double or in single quotes.
	_ATTR_RE = re.compile(r"""([^\s=<>/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')""")
	
	def __init__(self, is_beg):
		self.is_beg = is_beg
		self.name = None
		self.xref = None
		self.type = None
	
	def parse_data(self, data):
		"""Sets type, name and xref from the attribute part of a chunk tag.
		
		data is the text following '<chunk' (up to the closing '>').
		Attributes other than type, id and xlink:href are ignored and so is
		anything that is not a name="value" pair. Values may contain spaces
		and '=' characters.
		
		NOTE(review): the values are stored as written in the input, entity
		references are not unescaped here, while write_tei escapes the name
		again -- confirm whether this is intended."""
		for key, dquoted, squoted in self._ATTR_RE.findall(data):
			# only one of the two value groups may be non-empty
			value = dquoted or squoted
			if key == 'type':
				self.type = value
			elif key == 'id':
				self.name = value
			elif key == 'xlink:href':
				self.xref = value
	
	def take_data_from(self, chunk_beg):
		"""Copies name, xref and type from the given marker (used to make
		a chunk end carry the data of its chunk beginning)."""
		self.name = chunk_beg.name
		self.xref = chunk_beg.xref
		self.type = chunk_beg.type
	
	def __unicode__(self):
		"""Represents as simplified KIPI-like XML. For debugging purposes."""
		s_type = (u' type="%s"' % self.type) if self.type else u''
		s_xref = (u' xlink:href="%s"' % self.xref) if self.xref else u''
		s_name = (u' id="%s"' % self.name) if self.name else u''
		s_end = u'' if self.is_beg else '/'
		return u'<%schunk%s%s%s>' % (s_end, s_name, s_xref, s_type)
	
	def write_tei(self, out_str, numbering = None):
		"""Writes in TEI XML format.
		
		out_str -- output stream (an object with a write method)
		numbering -- None or a list [name_base, num, subnum, ...]; the list
		is updated in place: a chunk beginning appends a new subnumber,
		a chunk end removes the last subnumber and advances the one before it.
		
		A chunk of type 's' becomes <s>, any other type becomes <p>."""
		
		# If no chunk type is given, the chunk marker will be ignored
		if not self.type:
			return
		# if not sentence, make it a paragraph
		chtype = 's' if self.type == 's' else 'p'
		if self.is_beg:
			if numbering is None:
				name = ChunkMarker.UNNAMED
			else:
				name = u'.'.join(map(unicode, numbering))
			if self.name:
				# append the name we've got
				name += '-' + self.name
			out_str.write(u'<%s id="%s">\n' % (chtype, esc(name)))
			if numbering is not None:
				numbering += [1] # append next subnumber
		else: # chunk end
			if numbering is not None:
				numbering.pop() # remove last subnumber
				numbering[-1] += 1 # advance last remaining subnumber
			out_str.write(u'</%s>\n' % chtype)

class Punctuation(Item):
	"""An item holding just an orthographic form, written as a <c> element."""
	
	def __init__(self, orth):
		self.orth = orth
	
	def write_tei(self, out_str, numbering = None):
		"""Writes the escaped orth inside <c>...</c>; numbering is not used."""
		escaped_orth = esc(self.orth)
		out_str.write(u'<c>%s</c>\n' % escaped_orth)

class Token(Item):
	"""A token: its orthographic form and its lexems.
	
	A lexem is a two-element list [tag, lemma]. Lexems are kept in three
	places: the selected disamb lexem, the other disamb lexems and
	the non-disamb lexems."""
	
	def __init__(self, orth, no_space = False):
		self.orth = orth
		self.no_space = no_space
		# selected disamb lexem: a list [tag, lemma]; None until selected
		self.sel_d_lexem = None
		# other disamb lexems: a list of lists [tag, lemma]
		self.disamb_lexems = []
		# non-disamb lexems: a list of lists [tag, lemma]
		self.ndisamb_lexems = []
	
	def __unicode__(self):
		"""Abbreviated KIPI-like representation (orth only)."""
		prefix = u''
		if self.no_space:
			prefix = u'<ns/> '
		return prefix + (u'<tok><orth>%s</orth>...</tok>' % self.orth)
	
	def foreach_lexem(self):
		"""Yields all lexems: first the disamb ones (beginning with
		the selected one), then the non-disamb ones."""
		for lexem in self.foreach_d_lexem():
			yield lexem
		for lexem in self.ndisamb_lexems:
			yield lexem
	
	def foreach_d_lexem(self):
		"""Yields the selected disamb lexem and then the other disamb lexems."""
		yield self.sel_d_lexem
		for lexem in self.disamb_lexems:
			yield lexem
	
	def select_lexems(self, details):
		"""Chooses the selected disamb lexem among the disamb lexems.
		
		A sole disamb lexem is taken as it is, otherwise details.select
		is asked for the index of the one to take. If details.clean_lex
		is set, all the remaining lexems (disamb or not) are dropped.
		
		Raises ValueError when a lexem has already been selected or
		when there are no disamb lexems at all."""
		
		if self.sel_d_lexem is not None:
			raise ValueError('Disamb already selected (orth=%s, disamb=%s)' % (self.orth, self.sel_d_lexem))
		candidates = self.disamb_lexems
		if not candidates:
			raise ValueError('A token with no disamb lexems (orth=%s)' % self.orth)
		
		if len(candidates) == 1:
			self.sel_d_lexem, self.disamb_lexems = candidates[0], []
		else:
			chosen_idx = details.select(candidates)
			self.sel_d_lexem = candidates.pop(chosen_idx)
		
		# clean non-disamb and rejected-disamb lexems
		if details.clean_lex:
			self.disamb_lexems = []
			self.ndisamb_lexems = []
	
	def _without_duplicates(self, lexems):
		"""Gives the given lexems as a sorted list of tuples with no repetitions."""
		unique = set(tuple(lexem) for lexem in lexems)
		return sorted(unique)
	
	def write_tei(self, out_str, numbering = None):
		"""Writes the token in TEI XML format: the selected lexem as a <w>
		element, then every other lexem as <w disamb="0">.
		
		Raises ValueError if no disamb lexem has been selected."""
		chosen = self.sel_d_lexem
		if chosen is None:
			raise ValueError('No disamb selected for token (orth=%s)' % self.orth)
		# the selected disamb lexem goes first
		out_str.write(u'<w lemma="%s" ana="%s">%s</w>\n' % (esc(chosen[1]), esc(chosen[0]), esc(self.orth)))
		# then rejected disambs followed by non-disamb lexems (if any)
		rejected_fmt = u'<w disamb="0" lemma="%s" ana="%s">%s</w>\n'
		for group in (self.disamb_lexems, self.ndisamb_lexems):
			for lexem in self._without_duplicates(group):
				out_str.write(rejected_fmt % (esc(lexem[1]), esc(lexem[0]), esc(self.orth)))

def _read_one(in_str, details):
	"""Reads lines until a complete item has been read and returns the item.
	
	in_str -- an iterator over the lines of input (any iterator will do)
	details -- passed on unchanged to _read_tok
	
	Returns a token (as read by _read_tok), a ChunkMarker or an Other item.
	Lines which match none of the known patterns are skipped.
	Raises StopIteration when the input ends before an item is found."""
	no_space = False
	while True:
		line = next(in_str)
		# maybe ns: remember it for the token to come
		if pns.match(line):
			no_space = True
			continue
		# maybe token orth
		m = porth.match(line)
		if m:
			orth = unesc(m.group(1))
			return _read_tok(in_str, orth, no_space, details)
		# maybe chunk beg or end
		m = pbegchunk.match(line)
		if m:
			item = ChunkMarker(True) # chunk beg
			item.parse_data(m.group(1))
			return item
		if pendchunk.match(line):
			return ChunkMarker(False) # chunk end
		if pother.match(line): # other xml stuff to output unchanged
			return Other(line)

def _read_tok(in_str, orth, no_space, details):
	"""Reads the rest of a token whose orth has just been read.
	
	in_str -- an iterator over the lines of input (any iterator will do)
	orth -- the (already unescaped) orthographic form
	no_space -- stored in the token as its no_space flag
	details -- passed to the select_lexems method of the token
	
	Lines are consumed up to and including the end-of-token line. Each line
	holding a whole <lex> element adds a [tag, base] lexem to the token.
	Returns the token after lexem selection. StopIteration is raised if
	the input ends within the token; errors of select_lexems propagate."""
	tok = Token(orth, no_space)
	line = next(in_str)
	while not pendtok.match(line):
		# look for lexems
		m = plex.match(line)
		if m:
			tag = unesc(m.group(3))
			base = unesc(m.group(2))
			# NOTE(review): any '1' within the attributes of <lex> marks
			# the lexem as disamb -- confirm this is strict enough for
			# inputs which carry other attributes as well
			is_disamb = ('1' in m.group(1))
			target = tok.disamb_lexems if is_disamb else tok.ndisamb_lexems
			target.append([tag, base])
		line = next(in_str)
	tok.select_lexems(details)
	return tok


