diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..156781e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,23 @@
+.PHONY: default fonts dump-dictionary clean
+
+EXPORT_FORMAT?=json
+
+default: tools fonts
+
+FONT_DIRS=$(wildcard fonts/*.sfdir)
+FONT_LIST=$(patsubst fonts/%.sfdir,%,$(FONT_DIRS))
+FONT_TTFS=$(patsubst %,%.ttf,$(FONT_LIST))
+
+fonts: $(FONT_TTFS)
+
+%.ttf: fonts/%.sfdir
+	fontforge -lang=ff -c 'Open($$1); Generate($$2)' $^ $@
+
+dump-dictionary:
+	@cue export --out $(EXPORT_FORMAT) -p shenikan
+
+print-%:
+	@echo '$* = $($*)'
+
+clean:
+	-rm $(FONT_TTFS)
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..0fdc677
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module labprogramming.net/shenikandata
+
+go 1.23.1
diff --git a/shenikan.go b/shenikan.go
new file mode 100644
index 0000000..7431ee7
--- /dev/null
+++ b/shenikan.go
@@ -0,0 +1,6 @@
+package shenikandata
+
+import "embed"
+
+//go:embed *.cue
+var Cues embed.FS
diff --git a/tools/esh.py b/tools/esh.py
new file mode 100755
index 0000000..5894588
--- /dev/null
+++ b/tools/esh.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+
+"""
+This loosely reproduces the behaviour of asking ChatGPT the following:
+
+Can you help me find words in your embedding space? I want to give you a basic
+arithmetic expression involving words to find relationships between words in
+your embedding model. For example king minus man plus woman should probably be
+something like queen. Please give me 10 options each time. Are you ready?
+"""
+
+import cmd
+import re
+import os
+
+from gensim import downloader
+from thefuzz import process
+
+EMBEDDING_TOKENS = [
+    ('NUMBER', r'\d+(\.\d*)?'),  # an integer or decimal number
+    ('WORD', r'\w+'),            # a word
+    ('PAREN', r'[()]'),          # a parenthesis
+    ('OP', r'[+\-*/~]'),         # an arithmetic operator
+    ('COMMA', r','),             # a comma
+    ('WS', r'\s+'),              # whitespace
+    ('ERROR', r'.'),             # anything else
+]
+EMBEDDING_TOKENIZATION_RE = re.compile('|'.join(
+    f'(?P<{x[0]}>{x[1]})' for x in EMBEDDING_TOKENS
+))
+
+
+def tokenize_embedding_expr(expr):
+    """ Yields (token_kind, token) for each token in expr. """
+    for mo in EMBEDDING_TOKENIZATION_RE.finditer(expr):
+        yield (mo.lastgroup, mo.group())
+
+
+def token_precedence(token):
+    """
+    Returns the precedence of the token.
+    Negative precedences are right-associative (no such operators exist yet).
+    """
+    if token in {'+', '-', '~'}:
+        return 1
+
+    if token in {'*', '/'}:
+        return 2
+
+    return 0
+
+
+def _goes_first(a, b):
+    """ Returns True if pending operator a should be applied before incoming operator b. """
+    ap = token_precedence(a)
+    bp = token_precedence(b)
+    aap = abs(ap)
+    abp = abs(bp)
+
+    if aap > abp:
+        return True
+
+    if aap == abp and bp > 0:
+        return True
+
+    return False
+
+
+def shunt_embedding_tokens(tokens):
+    """
+    Shunting-yard conversion to postfix order. Yields (kind, value) where kind is:
+
+    w - word to be looked up in model and converted to embedding vector
+    s - scalar value
+    o - operator
+
+    Whitespace, commas, and unrecognized characters are dropped.
+    """
+    stack = []  # operator stack, just the op itself!
+
+    for (kind, tok) in tokens:
+        if kind == 'WORD':
+            yield ('w', tok)
+
+        elif kind == 'NUMBER':
+            yield ('s', tok)
+
+        elif kind == 'OP':
+            while stack and stack[-1] != '(' and _goes_first(stack[-1], tok):
+                yield ('o', stack.pop())
+            stack.append(tok)
+
+        elif kind == 'PAREN':
+            if tok == '(':
+                stack.append(tok)
+            else:
+                while stack and stack[-1] != '(':
+                    yield ('o', stack.pop())
+
+                if stack:
+                    stack.pop()  # remove the '('
+
+    while stack:
+        yield ('o', stack.pop())
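+
+
+# For example, 'king - man + woman' shunts into postfix as (illustrative
+# trace, not a doctest):
+#
+#   >>> list(shunt_embedding_tokens(tokenize_embedding_expr('king - man + woman')))
+#   [('w', 'king'), ('w', 'man'), ('o', '-'), ('w', 'woman'), ('o', '+')]
+#
+# i.e. (king - man) + woman, applying equal-precedence operators left to right.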
""" + stack = [] + + for (kind, x) in shunt: + if kind == 'w': + if x[0] == '_': + if x[1:] in model: + stack.append(-model[x[1:]]) + else: + most_similar = process.extractOne(x[1:], model.key_to_index.keys())[0] + stack.append(-model[most_similar]) + + if x in model: + stack.append(model[x]) + else: + most_similar = process.extractOne(x, model.key_to_index.keys())[0] + stack.append(model[most_similar]) + + elif kind == 's': + stack.append(float(x)) + + elif kind == 'o': + if x == '+': + a = stack.pop() + b = stack.pop() + stack.append(a + b) + + elif x == '-': + a = stack.pop() + b = stack.pop() + stack.append(b - a) + + elif x == '*': + a = stack.pop() + b = stack.pop() + stack.append(a * b) + + elif x == '/': + a = stack.pop() + b = stack.pop() + stack.append(b / a) + + elif x == '~': + a = stack.pop() + b = stack.pop() + stack.append((a + b) / 2) + + return stack[-1] + + +class EmbeddingShell(cmd.Cmd): + """ Actual embedding shell wrapper. """ + intro = 'Welcome to the embedding shell. Enter words in an equation to see similar embeddings. Type :help for more information' + prompt = '(Ʃ) ' + + def __init__(self, *args, model='glove-wiki-gigaword-300', **kwargs): + super().__init__(completekey='tab', stdin=None, stdout=None, *args, **kwargs) + print('Loading model...', end='', flush=True) + self._model = downloader.load(model) + self._keys = self._model.key_to_index.keys() + print(' DONE') + + def do_exec(self, arg): + """ Test """ + try: + result = evaluate_embedding_shunt(shunt_embedding_tokens(tokenize_embedding_expr(arg)), self._model) + + for (word, sim) in self._model.most_similar(result, restrict_vocab=10000): + (w, _) = os.get_terminal_size() + bar = '-' * int((w - 20) * sim) + print(f'{word:10} {bar}') + except Exception as e: + print("Could not evaluate expression:", e) + + + def do_shunt(self, arg): + for x in shunt_embedding_tokens(tokenize_embedding_expr(arg)): + print(x) + + def do_quit(self, arg): + """ Exit the embedding shell. 
""" + return True + + def precmd(self, line): + if not line: + return line + if line[0] == ':': + return line[1:] + return 'exec ' + line + + + + +if __name__ == '__main__': + EmbeddingShell().cmdloop() diff --git a/tools/listencoding.sh b/tools/listencoding.sh new file mode 100755 index 0000000..dd7af84 --- /dev/null +++ b/tools/listencoding.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +shopt -s extglob + +: "${OFFSET:=0}" + +codepoint="$((0xF3A00 + OFFSET * 256))" +while read -r glyph_json; do + ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")" + + printf '%s\t=\t%x\n' "$ascii_ortho" "$codepoint" + codepoint="$((codepoint + 1))" +done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]') diff --git a/tools/mkemptyfont.sh b/tools/mkemptyfont.sh new file mode 100755 index 0000000..9cb6fb6 --- /dev/null +++ b/tools/mkemptyfont.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +shopt -s extglob + +fontdir="$1.sfdir" + +DEFAULT_GLYPH_WIDTH=555 + +read -r -p "Font name:" fontname +read -r -p "Full name:" fullname +read -r -p "Unicode page offset (/256, 0 is 0xF3Axx):" offset + +mkdir "$fontdir" + +cat < "$fontdir/font.props" +SplineFontDB: 3.2 +FontName: $fontname +FullName: $fullname +Weight: Book +Copyright: Copyright (C) $(date +%Y), $(id -un) +UComments: "$(date +%Y-%M-%d): Created with mkemptyfont.sh" +Version: 001.000 +ItalicAngle: 0 +UnderlinePosition: -150 +UnderlineWidth: 50 +Ascent: 800 +Descent: 200 +sfntRevision: 0x00010000 +LayerCount: 2 +Layer: 0 0 "Back" 1 +Layer: 1 0 "Fore" 0 +DisplaySize: -48 +AntiAlias: 1 +FitToEm: 0 +Encoding: Custom +CreationTime: $(date +%s) +ModificationTime: $(date +%s) +DEI: 91125 +Lookup: 4 0 1 "'liga' Standard Ligatures in Latin lookup 0" { "'liga' Standard Ligatures in Latin lookup 0-1" } ['liga' ('DFLT' <'dflt' > 'latn' <'dflt' > ) ] +EOF + +ord() { + LC_CTYPE=C printf '%d' "'$1" +} + +codepoint="$((0xF3A00 + offset * 256))" +encodingidx=0 + +# generate the 'raw' replacement glyphs +while read -r glyph_json; do + while read -r liga_code_point; do + glyph_file="$fontdir/raw$liga_code_point.glyph" + if [ ! -e "$glyph_file" ]; then + echo -n "Generating raw$liga_code_point..." + + cat < "$glyph_file" +StartChar: raw$liga_code_point +Encoding: $encodingidx $liga_code_point $encodingidx +Width: $DEFAULT_GLYPH_WIDTH +LayerCount: 2 +Comment: "Raw glyph for ligature replacement" +Colour: ff0000 +EndChar +EOF + encodingidx=$((encodingidx + 1)) + + echo ' DONE' + fi + done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json") +done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]') + +# generate the 'real' ligature glyphs +while read -r glyph_json; do + liga=( ) + while read -r liga_code_point; do + liga+=( "raw$liga_code_point" ) + done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json") + + ortho="$(jq -r '.ortho' <<<"$glyph_json")" + ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")" + echo -n "Generating $ortho..." 
+
+shopt -s extglob
+
+: "${OFFSET:=0}"
+
+codepoint="$((0xF3A00 + OFFSET * 256))"
+while read -r glyph_json; do
+    ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"
+
+    printf '%s\t=\t%x\n' "$ascii_ortho" "$codepoint"
+    codepoint="$((codepoint + 1))"
+done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')
diff --git a/tools/mkemptyfont.sh b/tools/mkemptyfont.sh
new file mode 100755
index 0000000..9cb6fb6
--- /dev/null
+++ b/tools/mkemptyfont.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+
+shopt -s extglob
+
+fontdir="$1.sfdir"
+
+DEFAULT_GLYPH_WIDTH=555
+
+read -r -p "Font name: " fontname
+read -r -p "Full name: " fullname
+read -r -p "Unicode page offset (/256, 0 is 0xF3Axx): " offset
+
+mkdir "$fontdir"
+
+cat > "$fontdir/font.props" <<EOF
+SplineFontDB: 3.2
+FontName: $fontname
+FullName: $fullname
+Weight: Book
+Copyright: Copyright (C) $(date +%Y), $(id -un)
+UComments: "$(date +%Y-%m-%d): Created with mkemptyfont.sh"
+Version: 001.000
+ItalicAngle: 0
+UnderlinePosition: -150
+UnderlineWidth: 50
+Ascent: 800
+Descent: 200
+sfntRevision: 0x00010000
+LayerCount: 2
+Layer: 0 0 "Back" 1
+Layer: 1 0 "Fore" 0
+DisplaySize: -48
+AntiAlias: 1
+FitToEm: 0
+Encoding: Custom
+CreationTime: $(date +%s)
+ModificationTime: $(date +%s)
+DEI: 91125
+Lookup: 4 0 1 "'liga' Standard Ligatures in Latin lookup 0" { "'liga' Standard Ligatures in Latin lookup 0-1" } ['liga' ('DFLT' <'dflt' > 'latn' <'dflt' > ) ]
+EOF
+
+ord() {
+    LC_CTYPE=C printf '%d' "'$1"
+}
+
+codepoint="$((0xF3A00 + offset * 256))"
+encodingidx=0
+
+# generate the 'raw' replacement glyphs
+while read -r glyph_json; do
+    while read -r liga_code_point; do
+        glyph_file="$fontdir/raw$liga_code_point.glyph"
+        if [ ! -e "$glyph_file" ]; then
+            echo -n "Generating raw$liga_code_point..."
+
+            cat > "$glyph_file" <<EOF
+StartChar: raw$liga_code_point
+Encoding: $encodingidx $liga_code_point $encodingidx
+Width: $DEFAULT_GLYPH_WIDTH
+LayerCount: 2
+Comment: "Raw glyph for ligature replacement"
+Colour: ff0000
+EndChar
+EOF
+            encodingidx=$((encodingidx + 1))
+
+            echo ' DONE'
+        fi
+    done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
+done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')
+
+# generate the 'real' ligature glyphs
+while read -r glyph_json; do
+    liga=( )
+    while read -r liga_code_point; do
+        liga+=( "raw$liga_code_point" )
+    done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
+
+    ortho="$(jq -r '.ortho' <<<"$glyph_json")"
+    ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"
+    echo -n "Generating $ortho..."
+
+    glyph_file="$fontdir/sh$ascii_ortho.glyph"
+    cat > "$glyph_file" <<EOF
+StartChar: sh$ascii_ortho
+Encoding: $encodingidx $codepoint $encodingidx
+Width: $DEFAULT_GLYPH_WIDTH
+LayerCount: 2
+Comment: "Shenikan $ascii_ortho glyph"
+Ligature2: "'liga' Standard Ligatures in Latin lookup 0-1" ${liga[*]}
+EOF
+
+    # multi-character orthographies also accept their reverse as a ligature
+    if [ "$(jq '.ortho | length' <<<"$glyph_json")" -gt 1 ]; then
+        liga=( )
+        while read -r liga_code_point; do
+            liga+=( "raw$liga_code_point" )
+        done < <(jq -r '.ortho | explode | reverse | implode | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
+
+        echo "Ligature2: \"'liga' Standard Ligatures in Latin lookup 0-1\" ${liga[*]}" >> "$glyph_file"
+    fi
+
+    echo 'EndChar' >> "$glyph_file"
+    encodingidx=$((encodingidx + 1))
+    codepoint=$((codepoint + 1))
+
+    echo ' DONE'
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')
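+
+# Worked example (hypothetical ortho "θa"): the ASCII form is "tha", so the
+# forward ligature is raw116 raw104 raw97 ('t' 'h' 'a'); since the ortho is
+# more than one character, a reversed ligature raw97 raw116 raw104
+# ('a' 't' 'h') is appended as well.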