#!/bin/python3
# This file is not used while TeX is running. It's for generating unicode-math-input-table.tex file only.
# This requires pythonimmediate (not sure which version is compatible but commit 63f94476a5cb11e33db1215a9bf7c17657d9773d on Python 3.10.10 is)

from __future__ import annotations

from pythonimmediate.engine import ChildProcessEngine
from pythonimmediate.engine import default_engine
from pythonimmediate import*
import pythonimmediate
from collections import defaultdict
import os
import json
import subprocess
import re
import sys
import unicodedata
import functools
from dataclasses import dataclass

# ========

# Header line of the generated output (printed to stdout; presumably stdout is
# redirected into unicode-math-input-table.tex -- confirm against the build).
print(r'% This file is automatically generated from unicode-math-input-script.py.')


# ======== start a luatex engine
# The child LuaTeX process inherits the current environment plus hash_extra=0
# (see the link below for the tex.hashtokens() issue this setting relates to).
engine=ChildProcessEngine("luatex", env={**os.environ, "hash_extra": "0"})
# https://tex.stackexchange.com/questions/574607/tex-hashtokens-incomplete
default_engine.set_engine(engine)

# NOTE(review): the result is discarded -- presumably a smoke test that the engine responds.
Catcode.active("ａ").meaning_str()


# from the TeXbook:
#
# (INITEX starts out with
# \mathcode x = x for all characters x that are neither letters nor digits. The ten digits
# have \mathcode x = x+"7000; the 52 letters have \mathcode x = x+"7100.)
#
# (This quotation used to be a bare non-raw string literal; its "\m" sequences are
# invalid string escapes, so it is kept as a comment instead.)

# ======== reset all mathcode to 0
# With every mathcode zeroed first, any mathcode that is no longer {0, 0, 0} later on
# must have been assigned after this point.
TokenList([r"\directlua", TokenList.fstr(
r"""
for i=0, 0x10ffff do
	tex.setmathcode(i, {0, 0, 0})
end
"""
)]).execute()

# ======== load unicode-math
# Start an article document with unicode-math loaded. No engine= argument is given here
# (unlike the pdftex preamble further down), so this presumably runs on the default
# engine set above -- the luatex one.
execute(r'''
\documentclass{article}
\usepackage{unicode-math}
\begin{document}
''')

# ======== print changed mathcodes (we aim to support all of these)
# The Lua loop walks every code point 0..0x10ffff and prints one line
#   <codepoint>:<character>:<class> <family> <position>
# for each code point whose mathcode is not {0, 0, 0}. The expanded result is kept
# as one string; it is split into lines and parsed right after this statement.
changed_mathcodes = TokenList([r"\directlua", TokenList.fstr(
r"""
for i=0, 0x10ffff do
	local cls, family, pos=table.unpack(tex.getmathcode(i))
	if not (
		--(cls==0 and family==0 and pos==i) or (cls==7 and family==1 and pos==i) 
		(cls==0 and family==0 and pos==0)
	) then
		tex.print(-2, i .. ":"..utf8.char(i)..":" .. cls..' '..family..' '..pos .. "\n")
	end
end
"""
)]).expand_x().str()

# Parse the "<codepoint>:<char>:<class> <family> <position>" report lines and collect
# the characters they mention.
changed_chars: set[str] = set()
for report_line in changed_mathcodes.splitlines():
	parsed = re.fullmatch(r'(\d+):(.):(\d+) (\d+) (\d+)', report_line)
	assert parsed
	codepoint_text, unicode_char, cls_text, family_text, position_text = parsed.groups()
	# the printed character must be the one the printed code point denotes
	assert unicode_char==chr(int(codepoint_text)), parsed
	if (cls_text, family_text, position_text) == ("8", "0", "0"):
		# the triple 8/0/0 is mapped to Umathcode.active
		code = Umathcode.active
	else:
		# NOTE(review): `code` is never read afterwards; building it still runs
		# MathClass.lookup / Umathcode on the reported values.
		code = Umathcode(int(family_text), MathClass.lookup(int(cls_text)), int(position_text))
	changed_chars.add(unicode_char)

# ======== parse the unicode math table

# Locate unicode-math-table.tex with kpsewhich. check=True makes a failed lookup raise
# CalledProcessError right here instead of continuing with an empty path.
path = subprocess.run(["kpsewhich", "unicode-math-table.tex"], stdout=subprocess.PIPE, check=True).stdout
# Decode the table explicitly as UTF-8 so the result does not depend on the locale.
lines = Path(path.decode('u8').strip()).read_text(encoding='u8').splitlines()
# drop blank lines and comment lines
lines = [line for line in lines if line and not line.startswith("%")]
# character -> control sequence names (without the backslash), in table order
unicode_math_table_=defaultdict(list)
for line in lines:
	# every remaining line must look like \UnicodeMathSymbol{"<hex>}{\<csname>}{\math<...>}{<...>}%
	match = re.fullmatch(r'\\UnicodeMathSymbol{"(.*)}{\\(.*?) *}{\\math(.*)}{(.*)}%', line)
	assert match, line
	unicode_char=chr(int(match[1], 16))
	csname=match[2]
	unicode_math_table_[unicode_char].append(csname)
# freeze each list into a tuple; a character may carry more than one control sequence name
unicode_math_table={unicode_char: tuple(csnames) for unicode_char, csnames in unicode_math_table_.items()}

# ======== check how much of the table is valid on unicode-math/luatex

def getdelcode(x: str)->tuple[int, int, int, int]:
	"""Return the values of ``tex.getdelcode`` for the character ``x`` as a tuple of ints.

	The Lua snippet prints every value followed by a comma; the trailing comma is
	stripped before the text is split and converted.
	"""
	lua_source = (
		"\n\t\tfor _, v in ipairs(tex.getdelcode("
		+ str(ord(x))
		+ ')) do tex.sprint(v..",") end\n\t\t'
	)
	printed = TokenList([r"\directlua", TokenList.fstr(lua_source)]).expand_x().str()
	fields = printed.rstrip(",").split(",")
	return tuple(map(int, fields))  # type: ignore

@functools.lru_cache(maxsize=None)
def meaning(csname: str)->str:
	"""Return ``meaning_str()`` of the control sequence named ``csname`` (cached per name)."""
	token = T[csname]
	return token.meaning_str()

@functools.lru_cache(maxsize=None)
def good_delimiter(meaning: str, ch: str)->bool:
	r"""Tell whether ``meaning`` is the expected ``\Udelimiter`` definition for ``ch``.

	True only when all of these hold:

	* the mathcode of ``ch`` has family 0 and position ``ord(ch)``;
	* ``getdelcode(ch)`` is ``(0, ord(ch), 0, 0)``;
	* ``meaning`` equals the protected-macro text built from the math class of
	  ``ch`` and its code point as five upper-case hex digits.

	Results are cached per ``(meaning, ch)`` pair.
	"""
	codepoint = ord(ch)
	mathcode = umathcode[ch]
	if not (mathcode.family==0 and mathcode.position==codepoint):
		return False
	first, second, third, fourth = getdelcode(ch)
	if not (first==0 and second==codepoint and third==0 and fourth==0):
		return False
	expected = f'\\protected macro:->\\Udelimiter {mathcode.cls.value}\\symoperators "{codepoint:05X}\\scan_stop: '
	return meaning==expected

# Characters that appear as \__umi_special_handle{<char>} in unicode-math-input.sty
# (looked up relative to the current working directory). The file contains the
# characters literally, so it is decoded explicitly as UTF-8 rather than with the
# locale's default encoding.
specially_handled = {
		match[1] for match in
		re.finditer(r'\\__umi_special_handle{(.)}', Path("unicode-math-input.sty").read_text(encoding="utf-8"))
		}

# Characters that are deliberately skipped by the generator.
not_handled = {*"⎴⎵⏜⏝⏞⏟⟌\u03a2\U0001d455"}

# Maps each math-alphabet csname prefix of unicode-math to the name of the selector macro
# emitted for it; None means the symbol is emitted without a selector.
# The entries end up ordered longest prefix first, so prefix matching tries e.g. "mbfit"
# before "mbf".
math_alphabet_translate = dict(sorted(
	[
		("mup",       None),
		("mbf",       "umiMathbf"),
		("mit",       "umiMathit"),
		("mbfit",     "umiMathbfit"),    # https://tex.stackexchange.com/questions/14395/bold-italic-vectors
		("mscr",      "umiMathscr"),
		("mbfscr",    "umiMathbfscr"),   # https://tex.stackexchange.com/questions/23455/latex-calligraphic-script-bold
		("mfrak",     "umiMathfrak"),
		("Bbb",       "umiMathbb"),
		("mitBbb",    "umiMathbbit"),    # https://tex.stackexchange.com/questions/16645/blackboard-italic-font
		("mbffrak",   "umiMathbffrak"),  # https://tex.stackexchange.com/questions/610696/may-i-have-bold-mathfraktur
		("msans",     "umiMathsf"),
		("mbfsans",   "umiMathsfbf"),    # https://tex.stackexchange.com/questions/340097/bold-sans-serif-math-font
		("mitsans",   "umiMathsfit"),
		("mbfitsans", "umiMathsfbfit"),
		("mtt",       "umiMathtt"),
	],
	key=lambda entry: len(entry[0]),
	reverse=True,  # the sort stays stable, so prefixes of equal length keep the order listed above
))
# Maps what is left of a csname after its math-alphabet prefix has been removed to the
# TeX text emitted for it.
math_alphabet_csname_translation = {
	# named symbols: emitted as the control sequence of the same name
	**{
		symbol_name: "\\" + symbol_name
		for symbol_name in (
			"alpha Alpha beta Beta chi Chi delta Delta digamma Digamma "
			"epsilon Epsilon eta Eta gamma Gamma iota Iota kappa Kappa "
			"lambda Lambda mu Mu nabla nu Nu omega Omega omicron Omicron "
			"partial phi Phi pi Pi psi Psi rho Rho sigma Sigma sum "
			"tau Tau theta Theta upsilon Upsilon "
			"varepsilon varkappa varphi varpi varrho varsigma vartheta varTheta "
			"xi Xi zeta Zeta"
		).split()
	},
	# Latin letters: emitted as themselves, each lowercase letter followed by its uppercase form
	**{
		letter: letter
		for lowercase in "abcdefghijklmnopqrstuvwxyz"
		for letter in (lowercase, lowercase.upper())
	},
	# spelled-out digit names: emitted as the digit character
	**{
		digit_name: str(digit)
		for digit, digit_name in enumerate(
			"zero one two three four five six seven eight nine".split()
		)
	},
	}

# Greek control sequences that are paired with a Latin letter; the main loop emits both
# the control sequence and the letter (through \__umi_alternatives_iisafe) for these.
math_alphabet_redundant_greek = {
	"\\" + greek_name: latin_letter
	for greek_name, latin_letter in (
		("Alpha",   "A"),
		("Beta",    "B"),
		("Chi",     "X"),
		("Digamma", "F"),
		("Epsilon", "E"),
		("Eta",     "H"),
		("Iota",    "I"),
		("Kappa",   "K"),
		("Mu",      "M"),
		("Nu",      "N"),
		("omicron", "o"),
		("Omicron", "O"),
		("Rho",     "P"),
		("Tau",     "T"),
		("Zeta",    "Z"),
	)
	}

# Maps every control sequence name that belongs to a synonym group to the whole group
# (the group includes the name itself, and all members share one list object).
extra_synonyms = dict(
	(synonym, synonym_group)
	for synonym_group in [
		["adots", "iddots"],
	]
	for synonym in synonym_group
)

##

# Non-ASCII characters with a changed mathcode that are neither in the unicode-math
# table, nor specially handled, nor explicitly excluded.
remaining_chars = {
	ch for ch in changed_chars
	if ord(ch) >= 0x80
	and ch not in unicode_math_table
	and ch not in specially_handled
	and ch not in not_handled
}

# Fullwidth forms: each printable ASCII character "!".."~" has a fullwidth counterpart
# 0xfee0 code points higher (0xff00 - 0x20); define it as \char<ascii code>.
for ascii_code in range(0x21, 0x7f):
	fullwidth_char = chr(ascii_code + 0xfee0)
	# the Unicode names must agree, e.g. "FULLWIDTH EXCLAMATION MARK" for "!"
	assert unicodedata.name(fullwidth_char) == "FULLWIDTH " + unicodedata.name(chr(ascii_code))
	remaining_chars.discard(fullwidth_char)
	print(f'\\__umi_define_char{{{fullwidth_char}}}{{\\char{ascii_code} }}')

# every control sequence name that occurs anywhere in the unicode-math table
defined_csnames = set().union(*unicode_math_table.values())


# Second engine: pdftex with the AMS packages and mathrsfs loaded. The main loop queries
# it (engine=pdf_engine) to look at the meaning of control sequences under pdftex.
pdf_engine=ChildProcessEngine("pdftex")
execute(r"""
\documentclass{article}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{mathrsfs}
\begin{document}
""", engine=pdf_engine)

def _wrap_in_alphabet_selector(cs: str, selector: str|None)->str:
	r"""Return ``\selector{cs}``, or ``cs`` unchanged when ``selector`` is None."""
	if selector is None: return cs
	return "\\" + selector + "{" + cs + "}"


# Emit one \__umi_define_char... line for every supported non-ASCII character of the
# unicode-math table. Unexpected situations abort the run through assertions.
for unicode_char, csnames_ in unicode_math_table.items():
	csnames = [*csnames_]
	if unicodedata.combining(unicode_char) != 0:
		for csname in csnames:
			assert "Umathaccent" in meaning(csname), (unicode_char, unicodedata.name(unicode_char), csname, meaning(csname))
		continue # don't support combining characters

	# characters named "... COMBINING ..." that still have combining class 0
	is_combining2="COMBINING" in unicodedata.name(unicode_char).split()
	if is_combining2:
		for csname in csnames:
			assert csname in "enclosecircle enclosesquare enclosediamond enclosetriangle".split(), (unicode_char, csname)
		assert len(csnames)==1

	# such a character is written after a space inside the braces of the generated line
	optional_space=" " if is_combining2 else ""

	if ord(unicode_char) <= 0x7f: continue
	if unicode_char in specially_handled or unicode_char in not_handled: continue
	#
	bad_or_delimiter = [csname for csname in csnames if meaning(csname) != "the character " + unicode_char]
	delimiter = [csname for csname in bad_or_delimiter if good_delimiter(meaning(csname), unicode_char)]
	bad = [csname for csname in bad_or_delimiter if csname not in delimiter]
	is_delimiter = bool(delimiter) or getdelcode(unicode_char)!=(-1, 0, 0, 0)
	# in unicode-math:
	# the situation with ⟨/langle and ↑/uparrow is different
	# in both cases the character gets assigned mathcode and delcode so \left⟨ and \left↑ both work
	# in langle case the macro is defined to be \protected macro:->\Udelimiter 4\symoperators "027E8\scan_stop:
	#   this is because of @@_set_math_open logic which sets the macro like that
	#   (I don't know why it doesn't just do the thing below)
	# in uparrow case the macro is defined to be "the character ↑"
	#   then delcode is assigned in @@_assign_delcode manually
	if bad:
		good = [csname for csname in csnames if csname not in bad]
		print(
				repr(unicode_char),
				f"U+{ord(unicode_char):04X}",
				unicodedata.name(unicode_char),
				"mathcode: ", umathcode[unicode_char],
				"bad: ", {csname: meaning(csname) for csname in bad},
				f" -- good: {good}" if good else ""
		)
		# raised explicitly so that the failure is not stripped under `python -O`
		raise AssertionError("please specially handle this")

	# add the synonyms, then drop duplicates while keeping first-seen order so that the
	# generated file does not depend on string hash randomization
	for csname in csnames_:
		if csname in extra_synonyms:
			csnames+=extra_synonyms[csname]
	csnames=list(dict.fromkeys(csnames))

	items1=[]
	for csname in csnames:
		if not is_delimiter:
			assert "delimiter" not in T[csname].meaning_str(engine=pdf_engine), (unicode_char, csname)
			# that is the symbol is not a delimiter in pdf_engine either (check is not particularly reliable but okay)

		# math_alphabet_translate is ordered longest prefix first, so the first hit is the most specific one
		for prefix, replacement in math_alphabet_translate.items():
			if not csname.startswith(prefix): continue
			cs = math_alphabet_csname_translation[csname.removeprefix(prefix)]
			if cs in math_alphabet_redundant_greek:
				latin = math_alphabet_redundant_greek[cs]
				# a separating space is needed unless the second alternative is itself a control sequence
				cs = "\\__umi_alternatives_iisafe" + cs + ("" if latin.startswith("\\") else " ") + latin
			items1.append(_wrap_in_alphabet_selector(cs, replacement))
			break
		else:
			items1.append("\\" + csname)

	if len(items1)==1:
		a = items1[0]
		# \not<name> or \n<name> where <name> is itself in the table: emit both forms
		b = None
		for negation_prefix in (r"\not", r"\n"):
			if a.startswith(negation_prefix) and a.removeprefix(negation_prefix) in defined_csnames:
				b = '\\' + a.removeprefix(negation_prefix)
				break
		if b is not None:
			assert not is_delimiter
			print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{\\__umi_alternatives_not{a}{b}}}")
		elif is_delimiter:
			print(f"\\__umi_define_char_maybe_delimiter{{{optional_space}{unicode_char}}}{{{a}}}")
		else:
			print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{{a}}}")
	else:
		assert not is_delimiter, (unicode_char, delimiter)
		assert len(items1)==2, items1
		assert re.fullmatch(r'\\[a-zA-Z]+', items1[0]), items1
		assert re.fullmatch(r'\\[a-zA-Z]+', items1[1]), items1
		print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{\\__umi_alternatives{items1[0]}{items1[1]}}}")

##

# ========

# The generator ends here; nothing after this call runs when the file is executed as a script.
sys.exit()

# ========  everything below is draft / scratch code.
# The bare expressions discard their value when run as a script.
# NOTE(review): presumably these were evaluated one at a time in an interactive session.

T.longdivisionsign.meaning_str()


T.mathexclam.meaning_str()

T.symoperators.meaning_str()

T.perp.meaning_str()

umathcode[" ̅"[1]]

BalancedTokenList(r'\the\Udelcode `̅').expand_o().int()

x = BalancedTokenList(r'\the\Udelcode `!').expand_o().int()
print(hex(x))

hex(BalancedTokenList(r'\the\delcode `!').expand_o().int())


if 0:

	# dump tex.hashtokens(), one entry per NUL-terminated record (disabled)
	data = TokenList([r"\directlua", TokenList.fstr(
	r"""
	for k, v in pairs(tex.hashtokens()) do
		tex.print(-2, v .. "\0")
	end
	"""
	)]).expand_x().str()
	control_sequences = data.split("\x00")
	assert control_sequences[-1]==""
	del control_sequences[-1]


# NOTE(review): `control_sequences` is only assigned inside the disabled `if 0:` block above.
Path("/tmp/control_sequences.json").write_text(json.dumps(control_sequences))  # type: ignore
control_sequences = json.loads(Path("/tmp/control_sequences.json").read_text())  # type: ignore


Path("/tmp/control_sequences_unicode_math.json").write_text(json.dumps(control_sequences))
control_sequences = json.loads(Path("/tmp/control_sequences_unicode_math.json").read_text())
assert "mitrho" in control_sequences

if 0:
	# try some other random things

	control_sequences = data.split("\x00")
	assert control_sequences[-1]==""
	del control_sequences[-1]


	BalancedTokenList(r'\the\Umathcode `′').expand_o().int() == 0x1000000


control_sequences

BalancedTokenList(r'\the\mathcode`⨁').expand_o().int()

Catcode.active("⨁").meaning_str(engine=engine)

Catcode.active("′").meaning_str(engine=engine)

T.bigoplus_sym.meaning_str()

T.bigoplusop.meaning_str()

T.bigoplus.meaning_str()


T.rho.meaning_str()

T.mitrho.meaning_str()

T.bigoplus.meaning_str()

# last 100 captured stdout lines of the luatex engine plus its unflushed buffer
# (reads private attributes of the engine object)
engine._stdout_lines[-100:] + [bytes(engine._stdout_buffer)]


T.mscrA.meaning_str()

@functools.lru_cache(maxsize=None)
def is_defined(csname: str)->bool:
	"""Return True unless ``meaning_str()`` of the control sequence ``csname`` is ``"undefined"`` (cached per name)."""
	reported_meaning = T[csname].meaning_str()
	return reported_meaning != "undefined"


# show distinct items with math alphabet
# (draft code, unreachable when run as a script)
# a_ maps the csname with its math-alphabet prefix removed to the csnames it came from
a_=defaultdict(list)
for l in unicode_math_table.values():
	for csname in l:
		if csname.startswith(tuple(math_alphabet_translate)):
			t = csname
			# strip the longest matching prefix only
			for prefix in sorted(math_alphabet_translate, key=len, reverse=True):
				if t.startswith(prefix):
					t=t.removeprefix(prefix)
					break
			a_[t].append(csname)
a_

a_.keys()

# print the translated control sequences for which is_defined() is false
for v in math_alphabet_csname_translation.values():
	if v.startswith("\\") and not is_defined(v[1:]):
		print(v)

def is_okay(csname: str)->bool:
	"""Return True when ``csname`` is defined or starts with one of the math-alphabet prefixes."""
	return is_defined(csname) or csname.startswith(tuple(math_alphabet_translate))
#
# print bad ones
# (draft code, unreachable when run as a script)
# i.e. non-ASCII characters none of whose control sequence names passes is_okay()
for unicode_char, csnames_ in unicode_math_table.items():
	if ord(unicode_char) >= 0x80 and all( not is_okay(csname) for csname in csnames_ ):
		print(unicode_char, csnames_)


# print okay ones
# i.e. non-ASCII characters with at least one control sequence whose meaning is not "undefined"
for unicode_char, csnames_ in unicode_math_table.items():
	valid_csnames = [ csname for csname in csnames_ if T[csname].meaning_str()!="undefined" ]
	if ord(unicode_char) >= 0x80 and valid_csnames:
		print(unicode_char, valid_csnames)

T.lsime.meaning_str()

"ℝ".encode('u8')

# the next queries look at the pdftex engine: the active character "\xe2" and several UTF... macros
BalancedTokenList([T.meaning, Catcode.active("\xe2")]).expand_x(engine=pdf_engine)

T["UTFviii@three@octets"].meaning_str(engine=pdf_engine)

T["UTFviii@three@octets@combine"].meaning_str(engine=pdf_engine)

T["UTF@three@octets@noexpand"].meaning_str(engine=pdf_engine)

BalancedTokenList([T.meaning, Catcode.active("\xe2")]).expand_x()

# experiment on a fresh pdftex engine: \csname with \noexpand versus \string
test_engine=ChildProcessEngine("pdftex")
BalancedTokenList(r"\def\aa{bb}").execute(engine=test_engine)
BalancedTokenList(r"\csname\noexpand\aa\endcsname").expand_o(engine=test_engine)  # give error
BalancedTokenList(r"\csname\string\aa\endcsname").expand_o(engine=test_engine)  # \[\aa] as expected

T.iddots.meaning_str(engine=engine)

T.adots.meaning_str(engine=engine)