import sys import random import time import re import optparse import copy import collections import html import operator from unicodedata import normalize # import pdb import gmpy2 import yaml from tr import tr sep = re.compile('[^0-9.]*[^0-9.+*=]') sep2 = re.compile('[^0-9.:]*[^0-9.:]') aChar = re.compile('(.)') expansionCount = 0 one = float('1') channels = {} retable = dict() def refParse(refstr): """Basically copy Formatter().parse() but treat ! differently: Only one of : or ! is allowed to follow the refname, and | is used to separate arguments. Also, the subtle points of {{ and }} interpretation are surely different. """ lit = "" name = None flist = None ilist = None args = [] state = "lit" ostate = "lit" for c in refstr: # print((c,state)) if state == "lit": if c == "{": ostate = state state = "{" elif c == "}": ostate = state state = "}" elif c == '\\': ostate = state state = "esc" else: lit += c elif state == "{": if c == "{": if ostate == "lit": lit += c elif ostate == "name": name += c elif ostate == "flist": flist += c elif ostate == "ilist": ilist += c elif ostate == "args": args[-1] += c state = ostate elif c == "}": if ostate == "lit": state = "}" ostate = "name" name = "" else: raise ValueError("{ encountered in reference") elif c == ":": name = "" state = "flist" flist = "" elif c == "!": name = "" state = "ilist" ilist = "" elif c == "|": name = "" state = "args" args = [""] elif c == "\\": name = "" ostate = state state = "esc" else: state = "name" name = c elif state == "}": if c == "}": if ostate == "lit": lit += c elif ostate == "name": name += c elif ostate == "flist": flist += c elif ostate == "ilist": ilist += c elif ostate == "args": args[-1] += c state = ostate else: if ostate == "lit": raise ValueError("Single '}' encountered", refstr) elif c == "{": yield (lit, name, flist, ilist, args) state = "{" lit = "" name = None flist = None ilist = None args = None else: yield (lit, name, flist, ilist, args) state = "lit" lit = c name = None flist = None ilist = None args = None elif state == "name": if c == ":": state = "flist" flist = "" elif c == "!": state = "ilist" ilist = "" elif c == "|": state = "args" args = [""] elif c == "\\": ostate = state state = "esc" elif c == "}": ostate = state state = "}" elif c == "{": ostate = state state = "{" else: name += c elif state == "flist": if c == "}": ostate = state state = "}" elif c == "{": ostate = state state = "{" else: flist += c elif state == "ilist": if c == "}": ostate = state state = "}" elif c == "{": ostate = state state = "{" else: ilist += c elif state == "args": if c == ":": state = "flist" flist = "" elif c == "!": state = "ilist" ilist = "" elif c == "|": args.append("") elif c == "}": ostate = state state = "}" elif c == "{": ostate = state state = "{" elif c == "\\": ostate = state state = "esc" else: args[-1] += c elif state == "esc": if c in "\\\"':!|{}<>": if ostate == "lit": lit += c elif ostate == "name": name += c elif ostate == "flist": flist += c elif ostate == "ilist": ilist += c elif ostate == "args": args[-1] += c # if ostate == "args": # args[-1] = c # elif ostate == "name": # name += c state = ostate else: raise ValueError("Illegal escape", "\\" + c, refstr) if state == "lit": if not args: args = None yield (lit, name, flist, ilist, args) elif state == "}": if ostate != "lit": if not args: args = None yield (lit, name, flist, ilist, args) else: raise ValueError("stray '}' outside reference") elif state == "esc": raise ValueError(("Unterminated escape: state = " + repr(state) + " ostate = " + repr(ostate)), (" values = " + repr((lit, name, flist, ilist, args))), refstr) else: raise ValueError(("Unterminated reference: state = " + repr(state) + " ostate = " + repr(ostate)), (" values = " + repr((lit, name, flist, ilist, args))), refstr) def argExpand_stack(arg, args, depth, maxDepth): """Prototype stack-based expander, for recursive argrefs. Not yet used. """ argref_cexpr = re.compile(r"([^<>()]*)\(([^<>]*)\)") tok_literal = 1 tok_push_func = 2 tok_call_func = 3 tok_arg = 4 # tok_ellipsis = 5 token = collections.namedtuple('token', ['type', 'text']) def get_tokens(arg): type = tok_literal text = "" for ch in arg: if ch == '#' and not text: type = tok_arg elif ch == '|': yield token(type, text) text = "" type = tok_literal elif ch == ')': yield token(type, text) yield token(tok_call_func, ')') text = "" type = tok_literal elif ch == '(': if type == tok_literal: yield token(tok_push_func, text) text = "" type = tok_literal elif type == tok_arg: raise ValueError('#' + text + " cannot be used as function name") else: text = text + ch return token(type, text) # Set up the "registers" for the hybrid stack machine registers = { # Special registers: # for escaping 'p': '|', 'lt': '<', 'gt': '>', 'b': "\\", 'lb': '\\{', 'rb': '\\}', # Expansion depth of the node being expanded 'd': depth, # Maximum depth 'D': maxDepth, # Number of arguments passed to this template 'c': len(args['a']), # Number of arguments declared 'C': args['c'], # Full list of declared arguments 'a': args[:args['c']], # 'Extra' arguments '...': args['a'][args['c']:], # Full list of arguments 'A': args['a'], } for i in range(len(args['a'])): registers[str(i)] = args[i] def valOf(a, b): pass # Declare the instruction set def mather(op, init, regs, *vals): acc = init for v in vals: if isinstance(v, list): for i in v: acc = op(acc, float(i)) else: acc = op(acc, float(v)) return str(int(acc)) if acc.is_integer() else str(acc) def flatten(ln): for el in ln: if (isinstance(el, collections.Iterable) and not isinstance(el, (str, bytes))): yield from flatten(el) else: yield el def pn(regs, *params): ops = { "+": lambda a, b: float(a) + float(b), "-": lambda a, b: float(a) - float(b), "*": lambda a, b: float(a) * float(b), "/": lambda a, b: float(a) / float(b), ".": lambda a, b: a + b, "^": lambda a, b: float(a) ** float(b), } opstack = [] valstack = [] pending = False exparams = flatten(params) for p in exparams: if p in ops: opstack.append(p) pending = False else: if pending: while len(valstack): v1 = valstack.pop() op = opstack.pop() try: p = str(ops[op](v1, p)) except ZeroDivisionError as err: err.args += v1, p valstack.append(p) pending = True return valstack.pop() def comparer(op, regs, a, b, t, f): return t if op(a, b) else f def repeat(regs, *params): acc = "" params = list(params) # Handle odd lists params.append("1") for p in zip(params[::2], params[1::2]): acc += p[0] * int(p[1]) return acc def oneof(args, *params): exparams = flatten([valOf(args, p) for p in params]) return random.choice(exparams) def truthy(regs, val): if not val: return False if not float(val): return False return True funs = { # Length of a list "len": lambda r, v: str(len(v) if isinstance(v, list) else 1), # Math functions "+": lambda r, *v: mather(operator.add, 0, r, *v), "-": lambda r, *v: mather(operator.sub, 0, r, *v), "*": lambda r, *v: mather(operator.mul, 1, r, *v), "/": lambda r, *v: mather(operator.truediv, 1, r, *v), "^": lambda r, *v: mather(operator.pow, 1, r, *v), # Polish notation evaluator "math": pn, "calc": pn, # Branching functions "if": lambda r, v, t, f: t if truthy(r, v) else f, "gt": lambda *v: comparer(operator.gt, *v), "gte": lambda *v: comparer(operator.ge, *v), "lt": lambda *v: comparer(operator.lt, *v), "lte": lambda *v: comparer(operator.le, *v), "=": lambda *v: comparer(operator.eq, *v), "eq": lambda *v: comparer(operator.eq, *v), "!=": lambda *v: comparer(operator.ne, *v), "ne": lambda *v: comparer(operator.ne, *v), # Logic "not": lambda r, v: not truthy(r, v), # Indexing functions "?": lambda r, c, *v: v[c], "i": lambda r, c, *v: v[c], "which": lambda r, c, *v: v.index(c), # Why does this exist? "num": lambda r, v: str(float(v)), # String concatenation ".": lambda r, *v: "".join(v), ".*": repeat, "oneof": oneof, } if argref_cexpr.fullmatch(arg): datastack = [] funcstack = [] argstack = [] for tok in get_tokens(arg): if tok.type == tok_push_func: funcstack.append(tok.text) argstack.append(0) elif tok.type == tok_literal: datastack.append(tok.text) argstack[-1] += 1 elif tok.type == tok_arg: datastack.append(registers[tok.text]) argstack[-1] += 1 elif tok.type == tok_call_func: count_args = argstack.pop() computed_args = datastack[-count_args:] del datastack[-count_args:] fun = funcstack.pop() datastack.append(funs[fun](registers, *computed_args)) argstack[-1] += 1 if len(argstack) > 0: raise ValueError("Unterminated function") if len(datastack) > 1: raise ValueError("| encountered outside function") assert(len(datastack) == 1) ret = datastack.pop() if isinstance(ret, list): ret = '|'.join(ret) return ret elif arg == "...": return '|'.join(args['a'][args['c']:]) else: return registers[arg] def argsExpand(argstr, args, depth, maxDepth): argref = re.compile(r"(<<|>>|[^<>]*)(<([^<>]*)>)?") argfun = re.compile(r"([^<>()]*)\(([^<>()]*)\)") outstr = "" # print(argstr, args) # print(argstr, argref.findall(argstr)) global expansionCount def valOf(args, val): registers = { # Special registers: # for escaping 'p': '|', 'lt': '<', 'gt': '>', 'b': "\\", 'lb': '\\{', 'rb': '\\}', # Expansion depth of the node being expanded 'd': depth, # Maximum depth 'D': maxDepth, 'e': expansionCount, 'E': maxDepth ** 2, # Number of arguments passed to this template 'c': len(args['a']), # Number of arguments declared 'C': args['c'], # Full list of declared arguments 'a': args['a'][:args['c']], # 'Extra' arguments '...': args['a'][args['c']:], # Full list of arguments 'A': args['a'], } def impl(args, val): if len(val) == 0: return val if val[0] == "#": if val[1:] in registers: return str(registers[val[1:]]) else: return args['a'][int(val[1:])] elif val == "...": return args['a'][args['c']:] elif argfun.fullmatch(val): fun, argname = argfun.fullmatch(val).groups() return funs[fun](args, *argname.split('|')) else: return val v = impl(args, val) # print("Computed: ", repr(val), " = ", repr(v), file=sys.stderr) return v def mather(op, init, a, *vals): if init is None: acc = float(valOf(args, vals[0])) for v in vals[1:]: if v == "...": for i in range(a['c'], len(a['a']), 1): acc = op(acc, float(valOf(args, a['a'][i]))) else: acc = op(acc, float(valOf(args, v))) # elif v[0] == "#": # acc = op(acc, float(a['a'][int(v[1:])])) # else: # acc = op(acc, float(v)) return str(int(acc)) if acc.is_integer() else str(acc) else: acc = init for v in vals: if v == "...": for i in range(a['c'], len(a['a']), 1): acc = op(acc, float(a['a'][i])) else: acc = op(acc, float(valOf(args, v))) # elif v[0] == "#": # acc = op(acc, float(a['a'][int(v[1:])])) # else: # acc = op(acc, float(v)) return str(int(acc)) if acc.is_integer() else str(acc) def select(args, choose, *vals): return valOf(args, vals[valOf(args, choose)]) # if choose[0] != "#": # raise ValueError("? expects first argument to be an argument") # return vals[int(args['a'][int(choose[1:])])] def which(args, choose, *vals): return [valOf(args, _) for _ in vals].index(valOf(args, choose)) # if choose[0] != "#": # raise ValueError("which expects first argument to be an argument") # return vals.index(args['a'][int(choose[1:])]) def num(args, val): return str(float(valOf(args, val))) def greater(args, var, pivot, t, f): if valOf(args, var) > valOf(args, pivot): return valOf(args, t) else: return valOf(args, f) def lesser(args, var, pivot, t, f): if valOf(args, var) < valOf(args, pivot): return valOf(args, t) else: return valOf(args, f) def equal(args, var, pivot, t, f): if valOf(args, var) == valOf(args, pivot): return valOf(args, t) else: return valOf(args, f) def concat(args, *params): return "".join([valOf(args, p) for p in params]) def repeat(args, *params): acc = "" # Handle odd lists params = list(params) params.append("1") for p in zip(params[::2], params[1::2]): acc += valOf(args, p[0]) * int(valOf(args, p[1])) return acc def flatten(ln): for el in ln: if (isinstance(el, collections.abc.Iterable) and not isinstance(el, (str, bytes))): yield from flatten(el) else: yield el def pn(args, *params): ops = { "+": lambda a, b: float(a) + float(b), "-": lambda a, b: float(a) - float(b), "*": lambda a, b: float(a) * float(b), "/": lambda a, b: float(a) / float(b), ".": lambda a, b: a + b, "^": lambda a, b: float(a) ** float(b), } opstack = [] valstack = [] pending = False exparams = flatten([valOf(args, p) for p in params]) for _ in exparams: p = valOf(args, _) if p in ops: opstack.append(p) pending = False else: if pending: while len(valstack): v1 = valstack.pop() op = opstack.pop() try: p = str(ops[op](v1, p)) except ZeroDivisionError as err: err.args += v1, p raise valstack.append(p) pending = True return valstack.pop() def rpn(args, *params): ops = { "+": lambda a, b: float(a) + float(b), "-": lambda a, b: float(a) - float(b), "*": lambda a, b: float(a) * float(b), "/": lambda a, b: float(a) / float(b), ".": lambda a, b: a + b, "^": lambda a, b: float(a) ** float(b), } valstack = [] exparams = flatten([valOf(args, p) for p in params]) for _ in exparams: p = valOf(args, _) if p in ops: v2 = valstack.pop() v1 = valstack.pop() try: valstack.append(str(ops[p](v1, v2))) except ZeroDivisionError as err: err.args += v1, v2, p raise else: valstack.append(p) return valstack.pop() def mapper(args, text, set1, set2): return tr(valOf(args, set1), valOf(args, set2), valOf(args, text)) def replacer(args, text, *params): params = list(params) text = valOf(args, text) if len(params) % 2: raise ValueError("'replace' requires its arguments to be in pairs.") for p in zip(params[::2], params[1::2]): m = valOf(args, p[0]) if m not in retable: retable[m] = re.compile(m) text = re.sub(retable[m], valOf(args, p[1]), text) return text def oneof(args, *params): exparams = flatten([valOf(args, p) for p in params]) return random.choice(exparams) def truthy(args, arg): val = valOf(args, arg) if not val: return False if not float(val): return False return True funs = { # "len": lambda a, v: "1" if not v == "..." else str(len(a['a']) - a['c']), "len": lambda a, v: str(len(a['a']) - a['c']) if v == "..." else "1", "+": lambda a, *v: mather(operator.add, 0, a, *v), "-": lambda a, *v: mather(operator.sub, None, a, *v), "*": lambda a, *v: mather(operator.mul, 1, a, *v), "/": lambda a, *v: mather(operator.truediv, None, a, *v), "^": lambda a, *v: mather(operator.pow, 1, a, *v), "?": select, "which": which, "num": num, "if": lambda r, v, t, f: t if truthy(r, v) else f, "gt": greater, "lt": lesser, "=": equal, "eq": equal, ".": concat, ".*": repeat, "math": pn, "calc": pn, "pn": pn, "rpn": rpn, "map": mapper, "replace": replacer, "oneof": oneof, None: lambda a, v: v } for m in argref.finditer(argstr): # print(m.groups()) lit, _, arg = m.groups("") outstr += lit if argfun.fullmatch(arg): fun, argname = argfun.fullmatch(arg).groups() outstr += funs[fun](args, *argname.split('|')) elif arg == "...": outstr += '|'.join(args['a'][args['c']:]) elif arg: outstr += valOf(args, '#' + arg) return outstr def processRef(Data, s): node = None subargs = {'c': 0, 'a': []} if s[1] is not None: ref = s[1] if s[4] is not None: if ref + '|' * len(s[4]) in Data: ref += '|' * len(s[4]) subargs = {'c': len(s[4]), 'a': s[4]} else: for a in range(len(s[4]), 0, -1): if ref + '|' * a + '...' in Data: ref += '|' * a + '...' break else: raise KeyError("Node " + ref + " with " + str(len(s[4])) + " arguments not found") subargs = {'c': a, 'a': s[4]} # print(ref) node = copy.deepcopy(Data[ref]) if s[2]: _ = re.match(sep, s[2]) if _: _ = _.end() flist = re.split(sep, s[2][_:]) # NYI # mode = "assign" for i in range(min(len(flist), len(node))): if flist[i][0] == "*": # mode = "multiply" flist[i] = flist[i][1:] elif flist[i][0] == "+": # mode = "add" flist[i] = flist[i][1:] elif flist[i][0] == "=": # mode = "assign" flist[i] = flist[i][1:] d = float(flist[i]) node[i]['freq'] = d # nstr += ','+str(d) # Data[nstr] = node elif s[3]: # print(s[3]) _ = re.match(sep2, s[3]) if _: _ = _.end() ilist = re.split(sep2, s[3][_:]) # print(ilist) node2 = [] for i in ilist: ival = i.split(":") # print(node) node2.append(node[int(ival[0])]) if len(ival) > 1: node2[-1]['freq'] = float(ival[1]) node = node2 return (s[0], node, subargs) def showNodes(Data, nodes, depth=-16, maxDepth=16): for s in refParse(nodes): _, node, args = processRef(Data, s) if args: onode = copy.deepcopy(node) for x in node: oldliteral = copy.deepcopy(x) if args: for ch in x: if isinstance(x[ch], str): try: x[ch] = argsExpand(x[ch], args, -depth, maxDepth) except ValueError as err: err.args += x, raise x["val"] = x.get("val", "") try: x["freq"] = float(x.get("freq", one)) except ValueError as err: err.args += args, {'Original': oldliteral}, x, raise if x["freq"] < 0: x["freq"] = 0 # Ensure that every channel named exists for ch in channels: x[ch] = x.get(ch, "") print("Original: ", onode) print("Computed: ", node) def chooseFrom(Data, branches, depth=-16, maxDepth=16, args=None): """Select a random value from the branches, recursing on references. This function implements the essential algorithm of wordgen. """ global expansionCount specialChannels = set(["val", "freq", "path"]) # print(branches) if args is None: args = {'c': 0, 'a': []} if isinstance(branches, dict): branches["val"] = branches.get("val", "") branches["freq"] = one branches["path"] = None return branches expansionCount += 1 obranch = copy.deepcopy(branches) for x in branches: oldliteral = copy.deepcopy(x) for ch in x: if isinstance(x[ch], str): try: x[ch] = argsExpand(x[ch], args, maxDepth + depth, maxDepth) except ValueError as err: err.args += x, raise # print(branches) # for x in branches: x["val"] = x.get("val", "") try: x["freq"] = float(x.get("freq", one)) except ValueError as err: err.args += args, {'Original': oldliteral}, x, raise if x["freq"] < 0: x["freq"] = 0 # Ensure that every channel named exists for ch in channels: x[ch] = x.get(ch, "") branchesSum = sum([x["freq"] for x in branches]) if not branchesSum: rets = { "val": "", "path": [0], "freq": 1 } for ch in channels: rets[ch] = rets.get(ch, "") return rets a = float(random.uniform(0, float(branchesSum))) stop = 0 # This needs no normalization because values are never directly compared. for i, c in enumerate([x["freq"] for x in branches]): a -= c if a <= 0: stop = i break obranch = obranch[stop] other_channels = ( set([_ for _ in branches[stop]]) - specialChannels ) if "path" in branches[stop]: pass if expansionCount >= maxDepth ** 2: # Expansion limit reached print("wordgen.py: expansion limit reached", file=sys.stderr) rets = { "val": branches[stop]["val"], "path": [0], "freq": branches[stop]["freq"] / branchesSum } for ch in other_channels: rets[ch] = branches[stop].get(ch, "") return rets elif depth >= 0: # Recursion limit reached print("wordgen.py: depth limit reached", file=sys.stderr) rets = { "val": branches[stop]["val"], "path": [0], "freq": branches[stop]["freq"] / branchesSum } for ch in other_channels: rets[ch] = branches[stop].get(ch, "") return rets rets = {"val": "", "freq": one / branchesSum, "path": [stop]} # If val is empty, simply return the other channels if not branches[stop]["val"]: for ch in other_channels: rets[ch] = branches[stop].get(ch, "") return rets # Determine which is a string and which is a reference # print(branches[stop]["val"]) # print(list(refParse(branches[stop]["val"]))) try: for s in refParse(branches[stop]["val"]): # Recurse on reference and insert results into string # print(s) text, node, subargs = processRef(Data, s) if text: rets["val"] = rets["val"] + text for ch in other_channels: rets[ch] = rets.get(ch, "") + branches[stop].get(ch, "") if node: # Throws a KeyError on invalid reference. Not caught # because the Python default error message is good # enough and there's nothing for the code to do with # an error. # Fill reference tmp = chooseFrom(Data, node, depth + 1, maxDepth, subargs) other_channels.update( set([_ for _ in tmp]) - specialChannels ) rets["val"] = rets["val"] + tmp["val"] rets["freq"] = rets["freq"] * tmp["freq"] rets["path"].append(tmp["path"]) for ch in other_channels: rets[ch] = rets.get(ch, "") + tmp.get(ch, "") except ValueError as err: err.args += args, obranch, raise return rets def filterRE(RE): """Processes regex from file for use. Currently no-op.""" return RE def applyRE(Data, word, keepHistory=False, KHSep=" → "): """Applies regular expressions in Data to word.""" def doStagedMatchReplace(regexes, word, fullword): def defaultPlaceholder(defStr, c): # return aChar.sub(str, c) out = "" for t in refParse(defStr): out += t[0] if t[1] is not None: out += c return out def matchesSet(set, c): return True if tr(set, "", c, "cd") else False def doMaps(maps, matches, c): def doFSMMatch(map1, map2, c, S): if tr(map1, "", c, "cd"): return (True, tr(map1, map2, c), S) return (False, "", None, None) for map in maps: m = doFSMMatch(map[0], map[1], c, map[2] if len(map) > 2 else None) if m[0]: return (m[1], m[2]) for match in matches: if matchesSet(match[0], c): return (c, match[1]) return False ret = [word] for stage in regexes: if isinstance(stage, dict) and "S" in stage: # Order of Operations: # Most specific rule first: # 1. character rules # 2. set rules # like generalized character rules # 3. map rules # 4. match rules (like maps with set2 = set1) # 5. default rule # 6. return rule # (equivalent to default: ["{}", ]) # pdb.set_trace() state = "S" cline = "" # print("begin: "+ret[-1]) if "reversed" in stage and stage["reversed"] & 1: ret[-1] = ret[-1][::-1] for c in ret[-1]: s = stage[state] m = doMaps(s.get("map", []), s.get("match", []), c) if c in s: r = s[c] cline += defaultPlaceholder(r[0], c) if len(r) > 1: state = r[1] elif "set" in s: for r in s["set"]: if matchesSet(r[0], c): cline += defaultPlaceholder(r[1], c) if len(r) > 2: state = r[2] elif m: cline += m[0] if m[1]: state = m[1] elif "default" in s: r = s["default"] cline += defaultPlaceholder(r[0], c) if len(r) > 1: state = r[1] else: cline += c if "return" in s: state = s["return"] if "end" in stage[state]: cline += stage[state]["end"] if "reversed" in stage and stage["reversed"] & 1: # unreverse the input ret[-1] = ret[-1][::-1] if "reversed" in stage and stage["reversed"] & 2: cline = cline[::-1] ret.append(cline[:]) elif isinstance(stage, dict) and "normalize" in stage: form = stage["normalize"] if form == "default": form = "NFC" ret.append(normalize(form.upper(), ret[-1])) elif isinstance(stage, dict) and "repeat" in stage: pass elif isinstance(stage, list) and len(stage) > 0 and "m" in stage[0]: for rule in stage: if "c" in rule: # not continue because rules are never added break rule["c"] = re.compile(filterRE(rule["m"])) cline = ret[-1] for rule in stage: cline = rule["c"].sub(rule["r"], cline) ret.append(cline[:]) elif "assign" in stage: tmp = "" for ref in refParse(stage["assign"]): tmp += ref[0] if ref[1] is not None: tmp += fullword.get(ref[1], "") ret.append(tmp) else: print("replace stage invalid: {0!r}".format(stage), file=sys.stderr) return ret ret = {} if "replace" in Data: assert "path" not in Data["replace"], \ "path is not a valid channel for replacement rules" for channel in Data["replace"]: # if channel in word: ret[channel] = ( doStagedMatchReplace( Data["replace"][channel], word.get(channel, ""), word ) ) else: # Compatibility if "replacement" in Data: ret["val"] = ( doStagedMatchReplace( Data["replacement"], word["val"], word ) ) if "replaceIPA" in Data: ret["ipa"] = ( doStagedMatchReplace( Data["replaceIPA"], word["ipa"], word ) ) if keepHistory: for channel in ret: word[channel] = KHSep.join(ret[channel]) else: for channel in ret: word[channel] = ret[channel][-1] return word def listAll(Data, node, opts={ "ipa": True, "HTML": False, "path": False, "depth": -16, "keepHistory": False, "keepHistorySep": "→", "ignoreZeros": True }): '''Traverse all descendants of node''' def listWords(Data, node, depth, opts, path=[], flist=None): pass def nextPath(Data, node, path): return path global expansionCount # tmpbuf = [] ret = DFSPrint(listAllR( Data, node, opts["depth"], opts["ignoreZeros"] )) for word in ret: yield formatWord( applyRE(Data, { "val": word[0], "ipa": word[1], # DFSPrint doesn't work with paths "path": [0], "freq": word[2]}), opts) expansionCount = 0 # newword = applyRE(Data, {"val":word[0], "ipa":word[1]}) # word = (newword["val"], newword["ipa"], word[2]) # tmpbuf.append(word[0]+' :\t'+word[1]+'\t'+str(word[2])) time.sleep(0.0001) # return '\n'.join(tmpbuf) def listAllR(Data, node, depth, ignoreZeros, path=[], flist=None): '''Implementation of listAll. Do not call.''' if node in path: return {"t": 'V', "node": node} elif depth < 0: path.append(node) list = [] if not flist: flist = [float(x.get("freq", one)) for x in Data[node]] listSum = sum(flist) if ignoreZeros: for i in range(len(Data[node])): if i < len(flist): # Copy Data[node][i] so that Data is not altered N = dict(Data[node][i]) N["freq"] = flist[i] if flist[i]: list.append(N) else: if Data[node][i].get("freq"): list.append(Data[node][i]) else: list = Data[node] matches = [] for child in list: # 1+ elements are strings and 1+ elements are references to arrays # Determine which is a string and which is a reference matches.append({ "t": 'A', "freq": child.get("freq", one) / listSum, "Acontents": [] }) # If no val, insert IPA anyway if not child.get("val", ""): matches[-1]["Acontents"].append( {"t": 'L', "val": '', "ipa": child.get("ipa", "")} ) else: for s in refParse(child["val"]): # Recurse on reference and insert results into string if s[1]: nstr = s[1] node = Data[s[1]] if s[2]: _ = re.match('[^0-9.]+', s[2]) if _: _ = _.end() flist = re.split('[^0-9.]+', s[2][_:]) nstr = s[1] for i in range(min(len(flist), len(node))): d = float(flist[i]) node[i]['freq'] = d # Flist.append(d) nstr += ',' + str(d) if nstr not in Data: Data[nstr] = node else: flist = None # Throws a KeyError on invalid reference. Not caught because # the Python default error message is good enough and # there's nothing for the code to do with an error. # Fill reference tmp = listAllR(Data, nstr, depth + 1, ignoreZeros, path, None) if s[0]: # If reference+literal text, insert matches[-1]["Acontents"].append({ "t": 'L', "val": s[0], "ipa": child.get("ipa", "") }) matches[-1]["Acontents"].append(tmp) else: matches[-1]["Acontents"].append(tmp) # No reference, only literal text else: matches[-1]["Acontents"].append({ "t": 'L', "val": s[0], "ipa": child.get("ipa", "") }) # path.pop() return {"t": 'N', "node": node, "sum": listSum, "Ncontents": matches} else: # Recursion depth reached print("wordgen.py: recursion depth reached", file=sys.stderr) return {"t": 'T', "node": node, "raw": Data[node]} def DFSPrint(Node, freq=1): '''Generate list of words suitable for printing from tree structure.''' # Main case def f_A(Node, freq): buf1 = [("", "", 1)] for n in Node["Acontents"]: # tfreq = freq*Node["freq"] buf2 = DFSPrint(n, freq) # print('n: '+str(Node)) # print('1: '+str(buf1)) # print('2: '+str(buf2)) buf3 = [] for i in buf1: for j in buf2: # print('---\ni: '+str(i)+'\nj:'+str(j)+'\n---') buf3.append((i[0] + j[0], i[1] + j[1], i[2] * j[2])) # print('3: '+str(buf3)) buf1 = buf3 return buf1 # Simply iterate and recurse def f_N(Node, freq): ret = [] # N will always contain As for n in Node["Ncontents"]: ret.extend(DFSPrint(n, float(freq) * float(n["freq"]))) return ret # Leaf def f_L(Node, freq): # print('L: '+str(path)) return [(Node["val"], Node["ipa"], freq)] # Turn into reference def f_V(Node, freq): return [("{" + Node["node"] + "}", "{" + Node["node"] + "}", freq)] # Truncation -- pretend it's L but different def f_T(Node, freq): # print('T: '+str(freq)) return [("{" + Node["node"] + "}", "{" + Node["node"] + "}", freq)] switch = { 'A': f_A, 'N': f_N, 'L': f_L, 'V': f_V, 'T': f_T } return switch[Node['t']](Node, freq) def formatWord(word, opts, formatStr=None): '''Print words''' if formatStr is not None: # dbgWord = word.copy() # del dbgWord["path"] # del dbgWord["freq"] # print(dbgWord) return formatStr.format(**word) else: if not opts["HTML"]: fstr = "" first = True for ch in opts["channels"]: if first: first = False else: fstr += "\t" fstr += "{" + ch if ch == "path": word[ch] = printPath(word.get(ch, "")) elif ch == "freq": # fstr += ":.4e" pass else: word[ch] = word.get(ch, "") fstr += "}" else: fstr = "" for ch in opts["channels"]: fstr += "{" + ch + "}" if ch == "path": word[ch] = printPath(word.get(ch, "")) elif ch == "freq": pass else: word[ch] = html.escape(word.get(ch, "")) fstr += "" word["val"] = word.get("val", "") return formatWord(word, opts, fstr) def printPath(path): def recurse(path): ret = gmpy2.mpz(path[0]).digits(62) for a in path[1:]: if a: ret = ret + '[' + (recurse(a)) + ']' return ret if path: return '+' + recurse(path) else: return '+0' def readPath(pathStr): # Simple token separator function - recognizes + [ ] and alphanumerics def tokensOf(pathStr): ret = "" inInt = False for c in pathStr: if c in "[]": inInt = False if ret: yield ret ret = c elif c in "+": pass elif c in "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz": # Ints are in base 62 # Int tokens can be multiple characters if inInt: ret = ret + c else: inInt = True if ret: yield ret ret = c else: # This shouldn't be hit -- raise ValueError? pass return ret def constructPath(tokens, i=0): ret = [] while i < len(tokens): if tokens[i] == "[": tret, ti = constructPath(tokens, i + 1) ret.append(tret) if ti == len(tokens): pass # raise ValueError( # "Unterminated subpath", # i, # str(tokens), # str(tokens[i:]) # ) i = ti elif tokens[i] == "]": return ret, i else: ret.append(int(gmpy2.mpz(tokens[i], 62))) i += 1 return ret, i try: return constructPath(list(tokensOf(pathStr)))[0] except ValueError as err: # In case of error, report the full path in addition to the subpath err.args = err.args + (pathStr, ) raise def followPath(Data, node, path): # print(node) # root = [{ # "val": x["val"], # "ipa": x.get("ipa",""), # "freq":float(x.get("freq",one)) # } for x in Data[node] if x.get("freq",one) # ] sumFreq = sum( [float(x.get("freq", one)) for x in Data[node]] ) SNode = { "val": Data[node][path[0]].get("val", ""), "ipa": Data[node][path[0]].get("ipa", ""), "freq": (float(Data[node][path[0]].get("freq", one)) / sumFreq) } rets = {"val": "", "ipa": "", "freq": one} # print(SNode) for i, s in enumerate(refParse(SNode["val"])): # Recurse on reference and insert results into string if s[1]: # Throws a KeyError on invalid reference. Not caught because # the Python default error message is good enough and there's # nothing for the code to do with an error. # Fill reference tmp = {"val": "", "ipa": "", "freq": one} if i + 1 < len(path): tmp = followPath(Data, s[1], path[i + 1]) rets["val"] = rets["val"] + s[0] + tmp["val"] rets["freq"] = rets["freq"] * tmp["freq"] if s[0]: # If reference+literal text, insert rets["ipa"] = rets["ipa"] + SNode["ipa"] + tmp["ipa"] else: rets["ipa"] = rets["ipa"] + tmp["ipa"] # No reference, only literal text else: rets["val"] = rets["val"] + s[0] rets["ipa"] = rets["ipa"] + SNode["ipa"] return rets def toBNF(Data, StartDef): nodes = Data.copy() for N in set(["replace", "replaceIPA", "replacement", "channels"]): nodes.pop(N) pass def genPathCallback(option, opt, value, parser): parser.values.genFrom = True try: parser.values.genPath = readPath(value) except ValueError as err: raise OptionValueError(*err.args) from err def main(): global expansionCount # print(list(refParse("{} test"))) # Enable shorthand for decimal numbers: def dec_repr(dumper, data): return dumper.represent_scalar(u'!d', 'd' + str(data)) yaml.Dumper.add_representer(float, dec_repr) yaml.SafeDumper.add_representer(float, dec_repr) yaml.Loader.add_implicit_resolver(u'!d', re.compile(r'd\d*\.?\d+'), ['d']) yaml.SafeLoader.add_implicit_resolver(u'!d', re.compile(r'd\d*\.?\d+'), ['d']) def dec_cons(loader, node): return float(loader.construct_scalar(node)[1:]) yaml.Loader.add_constructor(u'!d', dec_cons) yaml.SafeLoader.add_constructor(u'!d', dec_cons) # Command-line options parser = optparse.OptionParser( usage="usage: %prog [options] [command]\n" " [command] may be either 'gen' [default], 'list', 'show', or 'diag'") genGroup = optparse.OptionGroup(parser, "Options for gen") listGroup = optparse.OptionGroup(parser, "Options for list") diagGroup = optparse.OptionGroup(parser, "Options for diag") showGroup = optparse.OptionGroup(parser, "Options for show") debugGroup = optparse.OptionGroup(parser, "Debugging options") parser.add_option("-c", "--channel", dest="channels", action="append", metavar="CHANNEL", default=[], help="print CHANNEL (can be used multiple times)") parser.add_option("-p", "--ipa", dest="channels", action="append_const", const="ipa", help="print IPA transcriptions (-c ipa)") parser.add_option("-d", "--depth", dest="depth", type="int", default=16, help="maximum recursion depth [default: %default]") parser.add_option("-H", "--html", dest="HTMLmode", action="store_true", default=False, help="write output as HTML table") genGroup.add_option("-n", dest="num", type="int", default=1, metavar="numWords", help="number of words to generate") genGroup.add_option("-V", dest="noVal", action="store_true", default=False, help="Suppress implicit 'val' printing") genGroup.add_option("-q", "--quiet", dest="quiet", action="store_true", default=False, help="Disable printing of the header") # Never implemented, and potentially unsafe # genGroup.add_option("-F", "--fmt", dest="fstr", # type="string", metavar="FMT_STR", default=[], # help="Format string for printing words") parser.add_option_group(genGroup) listGroup.add_option("-0", "--listZeros", dest="ignoreZeros", action="store_false", default=True, help="include 0-frequency values in list") parser.add_option_group(listGroup) showGroup.add_option("-D", "--startdepth", dest="showStartDepth", type="int", default=0, help="Starting depth") parser.add_option_group(showGroup) diagGroup.add_option("--regex", dest="dbgRE", action="store_true", default=False, help="Dump regular expressions after filtering.") diagGroup.add_option("--nodes", dest="dbgNodes", action="store_true", default=False, help="Dump switching nodes after filtering.") diagGroup.add_option("--retest", dest="dbgRETest", action="store_true", default=False, help="Apply regexes for CHANNELS to input.") diagGroup.add_option("--bnf", dest="dbgBNFExport", action="store_true", default=False, help="Export to BNF (val only).") parser.add_option_group(diagGroup) debugGroup.add_option("-P", "--path", dest="channels", action="append_const", const="path", help="print paths for generated words (-c path)") debugGroup.add_option("-K", "--keepHistory", dest="keepHistory", action="store_true", default=False, help="save every step of regex application\n" "May be hard to read.") debugGroup.add_option("--KHSep", dest="KHSep", type="string", default=" → ", metavar="SEP", help="what to insert between regex applications") debugGroup.add_option("-r", "--seed", dest="seed", action="store", default=None, help="random seed") debugGroup.add_option("-f", dest="channels", action="append_const", const="freq", help="show calculated frequencies (-c freq)") debugGroup.add_option("-G", "--generate", dest="genFrom", action="store", default=None, type="string", metavar="PATH", help="Generate a sentence from a path string.") parser.add_option_group(debugGroup) (options, args) = parser.parse_args() if len(args) < 2: parser.error("Not enough arguments") if len(args) < 3: args.append("gen") if "ipa" in options.channels: options.IPAmode = True else: options.IPAmode = False if "path" in options.channels: options.path = True else: options.path = False if "freq" in options.channels: options.showFreqs = True if not options.noVal: options.channels = ['val'] + options.channels opts = { "HTML": options.HTMLmode, "path": options.path, "depth": -1 * options.depth, "keepHistory": options.keepHistory, "keepHistorySep": options.KHSep, "ignoreZeros": options.ignoreZeros, "channels": options.channels, "genFrom": options.genFrom, } random.seed(options.seed) Data = yaml.safe_load(open(args[0], 'r', encoding="utf8")) if args[2] == "gen": Header = "" # Default some channel names for printing channels = {"val": "Words", "ipa": "IPA", "path": "Path"} if "channels" in Data: for ch, name in Data["channels"].items(): channels[ch] = name if options.HTMLmode: Header = "" for ch in options.channels: Header += "" Header += "" print(Header) else: if not options.quiet: Header += '\t'.join([ channels.get(ch, ch) for ch in options.channels ]) print(Header) print('-' * 40) if list(refParse(args[1]))[0][1] is not None: Data[":arg"] = [{"val": args[1]}] else: Data[":arg"] = [{"val": "{" + args[1] + "}"}] try: for _ in range(options.num): expansionCount = 0 if opts["genFrom"] is not None: raw_word = followPath(Data, ":arg", readPath(opts["genFrom"])) else: raw_word = chooseFrom( Data, Data[":arg"], -1 * options.depth - 1, options.depth ) word = applyRE( Data, raw_word, options.keepHistory, options.KHSep ) print(formatWord(word, opts)) finally: if options.HTMLmode: print("
" + html.escape(channels.get(ch, ch)) + "
") elif args[2] == "list": for word in listAll(Data, args[1], opts): print(word) elif args[2] == "xform": word = yaml.safe_load(args[1]) print(formatWord(applyRE( Data, word, options.keepHistory, options.KHSep ), opts)) elif args[2] == "show": showNodes(Data, args[1], options.depth + options.showStartDepth, options.depth) elif args[2] == "diag": if options.dbgRE: if "replace" in Data: for channel in Data["replace"]: print(channel + ':') for stage in Data["replace"][channel]: print(' [') for rule in stage: print( ' {' + "m: {m}, r: {r}".format( m=repr(filterRE(rule['m'])), r=repr(rule['r']) ) + '}' ) print(' ]') if options.dbgNodes: pass # G = SwitchingGraph(Data) # G.addNode(":arg", [{"val": args[1]}]) # print(repr(G)) # print(repr(G[":arg"])) if options.dbgRETest: for ch in options.channels: pass # NYI if options.dbgBNFExport: print('-' * 40) print(toBNF(Data, args[1])) print('-' * 40) if options.genFrom is not None: print(repr(readPath(options.genFrom))) else: for s in refParse(branches[stop]["val"]): pass main()