#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, os
from optparse import OptionParser
import io, conv_table, details

__doc__ = """Tagset+representation converter for Polish. Reads XCES
morph.xml-like input files assuming the KIPI tagset, produces TEI-like
disambiguated output with MTE tagset.

MTE specifications for Polish: Natalia Kotsyba, Ivan Derzhanski, Adam Radziszewski, 06.08.2009
Conversion tables: Natalia Kotsyba, 06.09.2009
Converter code: Adam Radziszewski, 01.09.2009
"""

def _add_choice(parser, str1, str2, opt, dest, help):
	"""Registers a multiple-choice option on the given option parser.

	parser -- the OptionParser instance to extend
	str1, str2 -- the short and the long option string (e.g. '-d', '--disamb')
	opt -- non-empty sequence of pairs; the first element of each pair is
	       an allowed choice and the first pair provides the default
	dest -- name of the attribute the chosen value is stored under
	help -- help text shown for the option

	Raises IndexError when opt is empty.
	"""
	# optparse requires choices to be a real list or tuple. Building the list
	# directly (instead of going through dict.keys()) also keeps the choices
	# in the order in which they were declared.
	choices = [pair[0] for pair in opt]
	parser.add_option(str1, str2, choices=choices, default=choices[0], dest=dest, help=help)
	
def go():
	"""Parses the command line and runs the conversion.

	Exactly one positional argument is expected: an input file, or a
	directory when -w/--walk-dir is given. On incorrect usage a message
	is printed and the program exits with status 1.
	"""
	parser = OptionParser(usage='%prog INPUT_FILE [options]\n\n')
	parser.add_option('-o', '--out', type='string', action='store', default='', dest='output',
		help='output filename')
	_add_choice(parser, '-d', '--disamb-selection', details.Details.SELECT_DISAMB,
		'select', 'disamb tag selection method')
	_add_choice(parser, '-n', '--text-naming', details.Details.TEXT_ID,
		'textid', 'text name generation method')
	# note: giving -a switches clean_lex *off* (it is on by default)
	parser.add_option('-a', '--leave-ambiguities', action='store_false', default=True, dest='clean_lex',
		help='preserve ambiguity info in output')
	parser.add_option('-w', '--walk-dir', action='store_true', default=False, dest='walk_dir',
		help='treat input file as directory to traverse for .xml files')
	
	options, args = parser.parse_args()
	
	if len(args) != 1:
		if options.walk_dir:
			complaint = 'You need to provide an input directory (or an input file)'
		else:
			complaint = 'You need to provide an input file (morph.xml-like)'
		print(complaint)
		print('')
		parser.print_help()
		sys.exit(1)
	
	if options.walk_dir and options.output:
		# in directory mode every output name is derived from its input name
		print('When processing multiple files, you cannot specify output name (the default must be used)')
		sys.exit(1)
	
	source = args[0]
	# one converter instance is shared by all the files processed in this run
	converter = conv_table.Converter()
	settings = details.Details(options)
	if options.walk_dir:
		process_dir(converter, source, settings)
	else:
		process_file(converter, source, options.output, settings)

def _walk_dir(dir_name, ext = '.xml'):
	"""Yields paths of files with the given extension found under dir_name.

	The directory is traversed recursively; subdirectories that are
	symbolic links are not entered. The extension is compared
	case-insensitively. Entries of each directory are visited in sorted
	order, so the output order is reproducible.

	dir_name -- directory to traverse; if it is not a directory, a notice
	            is printed and nothing is yielded
	ext -- file name suffix to look for (default: '.xml')
	"""
	if not os.path.isdir(dir_name):
		print('Skipping %s (not a directory)' % dir_name)
		return
	# the candidate path is lower-cased below, so the suffix must be as well
	suffix = ext.lower()
	for entry in sorted(os.listdir(dir_name)):
		fullpath = os.path.join(dir_name, entry)
		if os.path.isdir(fullpath):
			# skip links
			if not os.path.islink(fullpath):
				# hand the extension down, so that subdirectories are
				# filtered by the same suffix as the top-level directory
				for subpath in _walk_dir(fullpath, ext):
					yield subpath
		elif fullpath.lower().endswith(suffix):
			# a file to process
			yield fullpath

def process_dir(converter, dir_name, ds):
	"""Converts each file that _walk_dir finds under dir_name.

	A progress line is printed before each file. No output name is passed
	on, so process_file picks the default one for every file.
	"""
	# gather all the paths first: the progress line needs the total count
	paths = list(_walk_dir(dir_name))
	total = len(paths)
	for index, path in enumerate(paths):
		print('Converting file %d / %d (%s)' % (index + 1, total, path))
		process_file(converter, path, None, ds)
def process_file(converter, in_fname, out_fname, ds):
	"""Converts a single file with an already initialised converter.

	When out_fname is empty or None, the output name is the input name
	with details.Details.EXT_CONVD appended.
	"""
	target = out_fname or (in_fname + details.Details.EXT_CONVD)
	
	# NOTE(review): io is presumably the project's own module rather than the
	# standard library one (which has no from_xces/to_tei) -- confirm
	reader = io.from_xces(in_fname, ds)
	writer = io.to_tei(target, ds)
	writer(converter.process(reader))

# Script entry point: run the converter only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
	go()
