#tab-width: 3
"""Generate random words from a YAML phonology description.

Usage: <script> filename numWords [IPAmode [HTMLmode]]

The data file maps names to lists of entries.  Each entry is a mapping with
the keys "val" (spelling, may contain "{Name}" references to other lists and
"." as a syllable separator), "ipa" (pronunciation) and "freq" (relative
weight).  Generation starts from the list named "root".  The
"digraphProtection" mapping controls which character is inserted between
syllables whose boundary would otherwise be ambiguous.

Requires Python 3: the data file is opened with open(..., encoding=...).
"""
import sys
import random
import itertools
from string import Formatter

random.seed()


def chooseFrom(data, list, depth=-16):
    """Select a random entry from ``list``, recursing on references.

    Arguments:
        data:  the whole parsed data file; "{Name}" references are looked up
               in it as data["Name"].
        list:  the entries to choose from (each has "val", "ipa", "freq").
               The name shadows the builtin but is kept so that existing
               keyword callers keep working.
        depth: recursion counter.  It starts negative and is incremented on
               every nested reference; expansion stops once it reaches 0.

    Returns:
        While depth < 0: a new dict {"val": ..., "ipa": ...} with every
        reference in the chosen entry expanded.
        Once depth >= 0: the chosen entry itself, unexpanded.

    Raises:
        KeyError on a reference to a name missing from ``data``.  It is
        deliberately not caught: the default message is good enough and
        there is nothing sensible to do with the error here.
        IndexError if ``list`` is empty.
    """
    entries = list
    # Frequencies are relative weights; they need no normalization because
    # they are only ever measured against their own sum.
    weights = [entry["freq"] for entry in entries]
    remaining = random.uniform(0, sum(weights))

    # If floating-point rounding keeps the walk from ever reaching zero, the
    # draw was at the very top of the range, which belongs to the last entry
    # (the original code fell back to the first one).
    stop = len(entries) - 1
    for index, weight in enumerate(weights):
        remaining -= weight
        if remaining <= 0:
            stop = index
            break
    chosen = entries[stop]

    if depth >= 0:
        # Recursion depth reached
        return chosen

    # The value is a mix of literal text and "{Name}" references.
    # Formatter().parse() yields (literal, field_name, spec, conversion).
    val_parts = []
    ipa_parts = []
    for literal, reference, _spec, _conversion in Formatter().parse(chosen["val"]):
        if reference:
            # Recurse on the reference and splice the result in.
            sub = chooseFrom(data, data[reference], depth + 1)
            val_parts.append(literal + sub["val"])
            # The entry's own IPA accompanies its literal text only.
            if literal:
                ipa_parts.append(chosen["ipa"])
            ipa_parts.append(sub["ipa"])
        else:
            # No reference, only literal text
            val_parts.append(literal)
            ipa_parts.append(chosen["ipa"])
    return {"val": "".join(val_parts), "ipa": "".join(ipa_parts)}


def isDigraph(s1, s2):
    """Check whether the boundary between two syllables is ambiguous.

    Hardcoded rule set; nothing in this file calls it (makeWords uses the
    data-driven isDigraphData instead).

    Arguments:
        s1: spelling of the first syllable.
        s2: spelling of the following syllable.

    Returns:
        True if the spelling s1+s2 is considered ambiguous, else False.
        An empty syllable is never ambiguous.
    """
    if not s1 or not s2:
        return False
    last, first = s1[-1], s2[0]

    # Duplicated consonants (and like vowels) are confusing
    if last == first:
        return True
    # Special case
    if last == 'k' and first == 'c':
        return True
    # Vowels at start of syllable are likely to be ambiguous
    if first in u'aeiíoóuūy':
        return True
    # vowel+r is ambiguous
    if last in 'aeiou' and first == 'r':
        return True
    if last == 'r':
        return True
    # May be difficult to know which syllable S's are in
    if last == 's' and first in 'pktnml':
        return True
    if len(s1) > 1 and s1[-2] == 's':
        return True
    return False


def isDigraphData(data, s1, s2):
    """Check whether a syllable boundary is ambiguous, using the data file.

    Arguments:
        data: parsed data file; reads data["digraphProtection"] with the
              keys "builtins" -> "geminates", "sets" and "insertChar".
        s1:   spelling of the first syllable.
        s2:   spelling of the following syllable.

    Returns:
        data["digraphProtection"]["insertChar"] if the boundary needs a
        separator, otherwise the empty string.
    """
    protection = data["digraphProtection"]

    # Built-in rule: same character on both sides of the boundary.  Empty
    # syllables are skipped here so they cannot raise IndexError.
    if protection["builtins"]["geminates"] and s1 and s2 and s1[-1] == s2[0]:
        return protection["insertChar"]

    # Each set is a pair (endings, beginnings); an empty side matches any
    # syllable because every string ends/starts with "".
    for endings, beginnings in ((D[0], D[1]) for D in protection["sets"]):
        if (any(s1.endswith(c) for c in endings or [""])
                and any(s2.startswith(c) for c in beginnings or [""])):
            return protection["insertChar"]
    return ""


def _joinSyllables(data, val):
    """Join the "."-separated syllables of val, inserting separators where needed."""
    syllables = val.split('.')
    pieces = [
        current + isDigraphData(data, current, following)
        for current, following in zip(syllables, syllables[1:])
    ]
    pieces.append(syllables[-1])
    return ''.join(pieces)


def makeWords(filename, n):
    """Generate a list of n words.

    Arguments:
        filename: path of the UTF-8 YAML data file.
        n:        number of words to generate.

    Returns:
        A list of (spelling, ipa) tuples.
    """
    # Imported here so the helpers above stay importable without PyYAML.
    import yaml

    with open(filename, 'r', encoding="utf8") as handle:
        Data = yaml.safe_load(handle)

    # For very complex data files, the depth limit may need to be increased.
    # 16 should handle up to ~6-8 syllables for any sensible language.
    # If you see {Syllable} or the like in the output, either you have a
    # reference loop or you need to increase this.
    # num_syllables ~= depth_limit - (~2 + syllable_complexity)
    # where syllable_complexity = ~4 for Cūrórayn.
    depth_limit = -16

    words = []
    for _ in range(n):
        cword = chooseFrom(Data, Data["root"], depth_limit)
        # Add the separator between syllables that need one.
        words.append((_joinSyllables(Data, cword["val"]), cword["ipa"]))
    return words


def listAll():
    # I'll eventually implement this function
    pass


def printWords(words, IPAmode=True, HTMLmode=False):
    """Print the generated words to standard output.

    Arguments:
        words:    iterable of (spelling, ipa) tuples.
        IPAmode:  if true, print the pronunciation next to each spelling.
        HTMLmode: if true, print an HTML table instead of plain text.
    """
    if HTMLmode:
        # NOTE(review): the original HTML literals were lost (tags stripped,
        # leaving an unterminated string).  This is a reconstruction as a
        # plain table -- confirm the exact markup against the upstream file.
        # Values are written unescaped, as before.
        print("<table>")
        if IPAmode:
            print("<thead><tr><th>Words</th><th>IPA</th></tr></thead>")
        print("<tbody>")
        if IPAmode:
            for a in words:
                print("<tr><td>" + a[0] + '</td><td>/' + a[1] + '/</td></tr>')
        else:
            for a in words:
                print("<tr><td>" + a[0] + "</td></tr>")
        print("</tbody></table>")
    elif IPAmode:
        for a in words:
            print(a[0] + ':\t/' + a[1] + '/')
    else:
        for a in words:
            print(a[0])


def _main(argv):
    """Run the generator from command-line arguments; return the exit code."""
    if len(argv) < 3:
        sys.stderr.write(
            "usage: " + argv[0] + " filename numWords [IPAmode [HTMLmode]]\n")
        return 2
    # IPAmode / HTMLmode are 0/1 flags; when omitted they take the same
    # defaults as printWords.
    ipa_mode = int(argv[3]) if len(argv) > 3 else 1
    html_mode = int(argv[4]) if len(argv) > 4 else 0
    printWords(makeWords(argv[1], int(argv[2])), ipa_mode, html_mode)
    return 0


if __name__ == "__main__":
    sys.exit(_main(sys.argv))