#tab-width: 3
import sys
import yaml
import random
import itertools
import re
from string import *

# Arguments: filename, root, numWords, IPAmode, HTMLmode

#Python 3 only: uses print(..., file=...) and open(..., encoding=...)

# Seed the shared generator with no explicit value, i.e. from the default
# entropy source (OS randomness if available, otherwise the current time).
random.seed()

def chooseFrom(data, list, depth=-16):
	"""Select a weighted-random entry from ``list``, recursing on references.

	Each entry is a dict with optional keys "val" (a format-style string whose
	``{Name}`` fields refer to other nodes of ``data``), "ipa" and "freq"
	(selection weight, default 1; entries with a falsy freq are never chosen).

	Arguments:
		data:  mapping from node name to a list of entries.
		list:  the entries to choose from (name kept for compatibility even
		       though it shadows the builtin).
		depth: negative recursion budget; it is incremented on every
		       reference and expansion stops once it reaches 0.

	Returns a new dict {"val": ..., "ipa": ...} with every reference expanded.
	When the depth budget is exhausted, a warning is written to stderr and the
	chosen entry itself is returned unexpanded.

	Raises IndexError if no entry is selectable, and KeyError if "val" names a
	node that does not exist in ``data``.
	"""
	candidates = [x for x in list if x.get("freq", 1)]
	if not candidates:
		# Same exception type as the bare index error this used to produce,
		# but with a message that points at the data file.
		raise IndexError("chooseFrom: no selectable entries (list is empty or every freq is 0)")
	weights = [x.get("freq", 1) for x in candidates]

	# Walk the cumulative weights until the random offset is used up.
	# This needs no normalization because values are never directly compared.
	a = random.uniform(0, sum(weights))
	# If float rounding leaves a tiny positive remainder, the draw was at the
	# very top of the range, so the last entry is the right fallback.
	stop = len(candidates) - 1
	for i, c in enumerate(weights):
		a -= c
		if a <= 0:
			stop = i
			break
	chosen = candidates[stop]

	if depth >= 0:
		#Recursion depth reached
		print("wordgen.py: recursion depth reached", file=sys.stderr)
		return chosen

	val = chosen.get("val", "")
	ipa = chosen.get("ipa", "")
	#If val is empty, only the ipa is meaningful
	if not val:
		return {"val": "", "ipa": ipa}

	outVal = ""
	outIpa = ""
	for literal, reference, _spec, _conversion in Formatter().parse(val):
		if reference:
			# Throws a KeyError on invalid reference. Not caught because
			# the Python default error message is good enough and there's
			# nothing for the code to do with an error.
			tmp = chooseFrom(data, data[reference], depth + 1)
			outVal += literal + tmp.get("val", "")
			if literal:
				#Literal text before the reference: this entry's own ipa
				#comes first, then the ipa of the expansion
				outIpa += ipa + tmp.get("ipa", "")
			else:
				outIpa += tmp.get("ipa", "")
		else:
			#No reference, only literal text
			outVal += literal
			outIpa += ipa
	return {"val": outVal, "ipa": outIpa}

def isDigraphData(data, s1, s2):
	"""Check to see if the boundary between syllables s1 and s2 is ambiguous.

	Reads data["digraphProtection"], which may contain:
		builtins.geminates: if truthy, a boundary where the last character of
		                    s1 equals the first character of s2 is ambiguous.
		sets:               pairs [endings, beginnings]; the boundary is
		                    ambiguous when s1 ends with one of the endings
		                    and s2 starts with one of the beginnings. An
		                    empty side of a pair matches anything.
		insertChar:         the string to insert at an ambiguous boundary.

	Returns insertChar if the boundary is ambiguous, otherwise "".
	"""
	protection = data["digraphProtection"]
	builtinRules = protection.get("builtins") or {}
	# An empty syllable (leading, trailing or doubled separator) has no
	# character to compare, so it can never form a geminate.
	if builtinRules.get("geminates") and s1 and s2 and s1[-1] == s2[0]:
		return protection["insertChar"]
	for D in protection.get("sets") or []:
		endsAmbiguously = any(s1.endswith(c) for c in D[0] or [""])
		if endsAmbiguously and any(s2.startswith(c) for c in D[1] or [""]):
			return protection["insertChar"]
	return ""

def _compileStages(stages):
	"""Turn replacement stages into lists of (compiled pattern, replacement)."""
	# Patterns are anchored with '$' because rules are applied to the growing
	# prefix of a word, one character at a time (see _applyStages).
	# Compiling once up front avoids redoing it for every generated word and
	# leaves the caller's data untouched.
	return [
		[(re.compile(filterRE(rule["m"]) + '$'), filterRE(rule.get("r", "."))) for rule in stage]
		for stage in stages or []
	]

def _applyStages(text, stages, keepHistory):
	"""Run text through every compiled stage in order and return the result."""
	for stage in stages:
		# Rebuild the text character by character, giving every rule of the
		# stage a chance to rewrite the end of the prefix built so far.
		cline = ''
		for c in text:
			cline += c
			for pattern, replacement in stage:
				cline = pattern.sub(replacement, cline)
		if keepHistory:
			if text != cline:
				text = text + "→" + cline
		else:
			text = cline
	return text

def makeWords(Data, n, root, depth_limit=16, keepHistory=False):
	"""Generate a list of n random descendants of {root}.

	Arguments:
		Data:        the loaded data file (mapping of node names to entries).
		             The optional keys "digraphProtection", "replacement" and
		             "replaceIPA" enable the matching post-processing steps.
		n:           number of words to generate.
		root:        name of the node every word is expanded from.
		depth_limit: maximum reference depth passed on to chooseFrom.
		keepHistory: if true, each replacement stage that changes a word
		             appends "→" plus the new form instead of replacing it.

	Returns a list of [spelling, ipa] lists.
	"""
	# For very complex data files, the depth limit may need to be increased.
		# 16 should handle up to ~6-8 syllables for any sensible language
		# If you see {Syllable} or the like in the output, either you have a
		# reference loop or you need to increase this.
	# Assuming a roughly tail-recursive file:
	# num_syllables ~= depth_limit - (1 + syllable_complexity)
		# where syllable_complexity (~4.5 for Cūrórayn) is the average number
		# of recursions between a {Syllable} node and an output string.
		# The constant 1 is for the root node.
	# The theoretical maximum is node_width^depth_limit, which is obviously
		# much greater, so "wide" data files are able to produce much more
		# output.
			# See recursive.yml for a simple example of this with node_width=2
		# The limit is mostly there to avoid loops, though, so this is
		# actually alright. Increase it or decrease it if needed.

	# TL;DR: Computation is dependent on total expansions, which is bounded
		# by an exponential in depth_limit.
	spellingStages = _compileStages(Data.get("replacement"))
	ipaStages = _compileStages(Data.get("replaceIPA"))

	words = []
	for _ in range(n):
		cword = chooseFrom(Data, Data[root], -depth_limit)
		spelling = cword["val"]
		ipa = cword["ipa"]
		if "digraphProtection" in Data:
			#add apostrophes between syllables that need them
			separator = Data.get(".",[{}])[0].get("val",".")
			syllables = spelling.split(separator)
			pieces = [
				left + isDigraphData(Data, left, right)
				for left, right in zip(syllables, syllables[1:])
			]
			pieces.append(syllables[-1])
			spelling = ''.join(pieces)
		# Always a list: the digraph branch used to produce a tuple here,
		# which made the replacement stages fail on item assignment.
		words.append([
			_applyStages(spelling, spellingStages, keepHistory),
			_applyStages(ipa, ipaStages, keepHistory),
		])
	return words

def filterRE(RE):
	"""Preprocessing hook for a regex string read from the data file.

	No transformation is applied at present: the argument comes back
	unchanged, and in particular it is not sanitized.
	"""
	processed = RE
	return processed

def listAll(data, node):
	"""Placeholder for printing all possible descendants of node.

	Not implemented yet: the call does nothing and returns None.
	"""
	# TODO: implement the enumeration.
	return None
	

def printWords(words, IPAmode=True, HTMLmode=False):
	"""Print generated words to stdout, one per line or as an HTML table.

	Arguments:
		words:    iterable of entries where item 0 is the spelling and item 1
		          the IPA transcription.
		IPAmode:  if true, the IPA is printed next to each spelling.
		HTMLmode: if true, the output is an HTML <table>; otherwise plain text.
	"""
	# Plain text: "spelling: <tab>/ipa/" or just the spelling.
	if not HTMLmode:
		for entry in words:
			if IPAmode:
				print(entry[0] + ': \t/' + entry[1] + '/')
			else:
				print(entry[0])
		return

	# HTML: header row first, then one table row per word.
	print("<table><tr><th>Words</th>")
	if IPAmode:
		print("<th>IPA</th>")
	print("</tr>")
	for entry in words:
		row = "<tr><td>" + entry[0]
		if IPAmode:
			row = row + '</td><td>/' + entry[1] + '/</td></tr>'
		else:
			row = row + "</td></tr>"
		print(row)
	print("</table>")


# Print $3 descendants of $2 from datafile $1 with mode flags $4 and $5
# ($4 is passed as IPAmode and $5 as HTMLmode; both are parsed as integers).
# Guarded so that importing this file as a module does not run the generator.
if __name__ == "__main__":
	# Load the data inside a with-block so the file handle is closed as soon
	# as parsing is done.
	with open(sys.argv[1], 'r', encoding="utf8") as dataFile:
		Data = yaml.safe_load(dataFile)
	printWords(makeWords(Data, int(sys.argv[3]), sys.argv[2]), int(sys.argv[4]), int(sys.argv[5]))