"""Random word generator driven by a YAML data file.

Usage:
    wordgen.py DATAFILE ROOT NUM_WORDS [IPA_MODE [HTML_MODE]]

Prints NUM_WORDS random descendants of the node ROOT from DATAFILE.
IPA_MODE and HTML_MODE are integer flags (0 or 1).

Requires Python 3: the code relies on ``print(..., file=...)`` and
``open(..., encoding=...)``.
"""
import sys
import random
import itertools
import re
from string import *  # provides Formatter, used by chooseFrom

random.seed()


def chooseFrom(data, list, depth=-16):
    """Select a random entry from ``list``, recursing on references.

    Each entry is a dict that may carry "val" (a format-style template whose
    ``{Name}`` fields refer to other nodes of ``data``), "ipa" and "freq"
    (selection weight, default 1).  Entries whose "freq" is falsy are never
    chosen.

    Args:
        data: mapping from node name to a list of entries.
        list: the entries to choose from.
        depth: negative recursion budget; incremented on every nested
            reference.  Expansion stops once it reaches 0.

    Returns:
        ``{"val": ..., "ipa": ...}`` with all references expanded, or the raw
        chosen entry (after a warning on stderr) once the depth budget is
        exhausted.

    Raises:
        IndexError: if there is nothing to choose from.
        KeyError: if a template references a node that is not in ``data``.
    """
    entries = [entry for entry in list if entry.get("freq", 1)]
    if not entries:
        raise IndexError(
            "chooseFrom: no selectable entries (list is empty or every freq is 0)")

    # Weighted pick: walk the weights until the random offset is used up.
    # The weights need no normalization because they are only ever summed.
    weights = [entry.get("freq", 1) for entry in entries]
    remaining = random.uniform(0, sum(weights))
    # Fall back to the last entry so float rounding cannot silently select
    # the first one when the loop runs to completion.
    stop = len(entries) - 1
    for i, weight in enumerate(weights):
        remaining -= weight
        if remaining <= 0:
            stop = i
            break
    chosen = entries[stop]

    if depth >= 0:
        # Recursion depth reached: hand back the unexpanded entry.
        print("wordgen.py: recursion depth reached", file=sys.stderr)
        return chosen

    template = chosen.get("val", "")
    own_ipa = chosen.get("ipa", "")
    # If val is empty, the entry contributes only its ipa.
    if not template:
        return {"val": "", "ipa": own_ipa}

    val = ""
    ipa = ""
    for literal, reference, _spec, _conversion in Formatter().parse(template):
        if reference:
            # An invalid reference raises KeyError.  It is deliberately not
            # caught: the default message is good enough and there is nothing
            # useful the code could do with the error.
            tmp = chooseFrom(data, data[reference], depth + 1)
            val += literal + tmp.get("val", "")
            if literal:
                # Literal text followed by a reference: the entry's own ipa
                # stands in for the literal part.
                ipa += own_ipa
            ipa += tmp.get("ipa", "")
        else:
            # No reference, only literal text.
            val += literal
            ipa += own_ipa
    return {"val": val, "ipa": ipa}


def isDigraphData(data, s1, s2):
    """Check whether the boundary between syllables s1 and s2 is ambiguous.

    Returns ``data["digraphProtection"]["insertChar"]`` when the boundary
    needs protecting, otherwise the empty string.  A boundary is protected
    when the "geminates" builtin is enabled and s1 ends with the character s2
    starts with, or when some pair in ``data["digraphProtection"]["sets"]``
    matches (s1 ends with an item of the pair's first list and s2 starts with
    an item of its second list; an empty list matches anything).
    """
    protection = data["digraphProtection"]
    # Guard against empty syllables (leading, trailing or doubled separator),
    # which used to raise IndexError here.
    if protection["builtins"]["geminates"] and s1 and s2 and s1[-1] == s2[0]:
        return protection["insertChar"]
    for endings, beginnings in (pair[:2] for pair in protection["sets"]):
        if (any(s1.endswith(c) for c in endings or [""])
                and any(s2.startswith(c) for c in beginnings or [""])):
            return protection["insertChar"]
    return ""


def _compileRules(stages):
    """Cache a compiled, end-anchored pattern in each rule under key "c"."""
    for stage in stages:
        for rule in stage:
            rule["c"] = re.compile(filterRE(rule["m"]) + '$')


def _applyStages(text, stages, keepHistory):
    """Run every replacement stage over ``text`` and return the result.

    Within a stage the text is rebuilt one character at a time, and after
    each character every rule is applied to the end of the partial string
    (the patterns are anchored with ``$``).  With ``keepHistory`` the stage
    result is appended after "→" instead of replacing the text; the next
    stage then iterates over that whole string, history included.
    """
    for stage in stages:
        cline = ''
        for c in text:
            cline += c
            for rule in stage:
                # If the rule matches at the end of cline, replace it.
                cline = rule["c"].sub(filterRE(rule.get("r", ".")), cline)
        if keepHistory:
            if text != cline:
                text = text + "→" + cline
        else:
            text = cline
    return text


def makeWords(Data, n, root, depth_limit=16, keepHistory=False):
    """Generate a list of n random descendants of {root}.

    Each result is a two-item list ``[spelling, ipa]``.  If Data contains
    "digraphProtection", the spelling is split on the syllable separator (the
    "val" of the first entry of node ".", or "." when absent) and re-joined
    with the protection character where isDigraphData asks for it.  The
    stages in "replacement" are then applied to the spelling and those in
    "replaceIPA" to the ipa.

    For very complex data files, the depth limit may need to be increased.
    16 should handle up to ~6-8 syllables for any sensible language.  If you
    see {Syllable} or the like in the output, either you have a reference
    loop or you need to increase it.  Assuming a roughly tail-recursive file:
        num_syllables ~= depth_limit - (1 + syllable_complexity)
    where syllable_complexity is the average number of recursions between a
    {Syllable} node and an output string, and the constant 1 is for the root
    node.  The theoretical maximum is node_width^depth_limit, so "wide" data
    files can produce much more output.  The limit is mostly there to avoid
    loops; computation depends on total expansions, which is bounded by an
    exponential in depth_limit.
    """
    spelling_stages = Data.get("replacement") if "replacement" in Data else None
    ipa_stages = Data.get("replaceIPA") if "replaceIPA" in Data else None
    # Compile once up front instead of once per generated word.
    if spelling_stages is not None:
        _compileRules(spelling_stages)
    if ipa_stages is not None:
        _compileRules(ipa_stages)

    words = []
    for _ in range(n):
        cword = chooseFrom(Data, Data[root], 0 - depth_limit)
        # Always a list: the replacement stages assign to its items (the old
        # tuple built after digraph protection made that raise TypeError).
        word = [cword["val"], cword["ipa"]]
        if "digraphProtection" in Data:
            # Add the protection character between syllables that need it.
            separator = Data.get(".", [{}])[0].get("val", ".")
            syllables = cword["val"].split(separator)
            spelling = [
                current + isDigraphData(Data, current, following)
                for current, following in zip(syllables, syllables[1:])
            ]
            spelling.append(syllables[-1])
            word[0] = ''.join(spelling)
        if spelling_stages is not None:
            word[0] = _applyStages(word[0], spelling_stages, keepHistory)
        if ipa_stages is not None:
            word[1] = _applyStages(word[1], ipa_stages, keepHistory)
        words.append(word)
    return words


def filterRE(RE):
    """Process a regex from the data file for use.

    Currently does nothing: the expression is returned unchanged and is NOT
    sanitized.
    """
    return RE


def listAll(data, node):
    """Print all possible descendants of node (not implemented yet)."""
    return


def printWords(words, IPAmode=True, HTMLmode=False):
    """Print the generated words, one per line or as an HTML table.

    Args:
        words: sequence of ``(spelling, ipa)`` pairs.
        IPAmode: if true, print the ipa (between slashes) next to each word.
        HTMLmode: if true, emit an HTML table instead of plain text.
    """
    if HTMLmode:
        # NOTE(review): the markup inside these string literals was lost in
        # the copy of the file this was restored from; the tags below are a
        # reconstruction (a plain table with a "Words"/"IPA" header) and
        # should be confirmed against the original.  Values are inserted
        # unescaped.
        print("<table>")
        if IPAmode:
            print("<thead><tr><th>Words</th><th>IPA</th></tr></thead>")
        print("<tbody>")
        if IPAmode:
            for a in words:
                print("<tr><td>" + a[0] + '</td><td>/' + a[1] + '/</td></tr>')
        else:
            for a in words:
                print("<tr><td>" + a[0] + "</td></tr>")
        print("</tbody></table>")
    else:
        if IPAmode:
            for a in words:
                print(a[0] + ': \t/' + a[1] + '/')
        else:
            for a in words:
                print(a[0])


def main(argv=None):
    """Print $3 descendants of $2 from datafile $1 with mode flags $4 and $5.

    The two mode flags are optional and default to printWords' defaults
    (IPA on, HTML off).
    """
    # PyYAML is only needed to read the data file, so it is imported here;
    # the generator functions above stay importable without it.
    import yaml

    args = sys.argv if argv is None else argv
    if len(args) < 4:
        print("usage: wordgen.py DATAFILE ROOT NUM_WORDS [IPA_MODE [HTML_MODE]]",
              file=sys.stderr)
        return 2
    with open(args[1], 'r', encoding="utf8") as datafile:
        data = yaml.safe_load(datafile)
    ipa_mode = int(args[4]) if len(args) > 4 else True
    html_mode = int(args[5]) if len(args) > 5 else False
    printWords(makeWords(data, int(args[3]), args[2]), ipa_mode, html_mode)
    return 0


if __name__ == "__main__":
    sys.exit(main())