Added some basic tools

This commit is contained in:
Louis Burke 2025-02-03 20:59:03 -05:00
parent d77046d404
commit 51e77feee8
6 changed files with 351 additions and 0 deletions

23
Makefile Normal file
View file

@ -0,0 +1,23 @@
# Build Shenikan font TTFs from FontForge .sfdir sources and export the
# CUE dictionary.
.PHONY: default fonts dump-dictionary clean

# Output format for `cue export` (override: make dump-dictionary EXPORT_FORMAT=yaml)
EXPORT_FORMAT?=json

# Delete a half-written target if its recipe fails, so a truncated .ttf
# never looks "up to date".
.DELETE_ON_ERROR:

# NOTE(review): no `tools` rule is visible in this file — confirm it is
# defined elsewhere, otherwise `make` fails with "No rule to make target".
default: tools fonts

# Derive the font list once at parse time (:= avoids re-globbing on every
# expansion).
FONT_DIRS:=$(wildcard fonts/*.sfdir)
FONT_LIST:=$(patsubst fonts/%.sfdir,%,$(FONT_DIRS))
FONT_TTFS:=$(patsubst %,%.ttf,$(FONT_LIST))

fonts: $(FONT_TTFS)

# Generate each TTF from its spline-font directory. `$$1`/`$$2` reach the
# fontforge interpreter as $1/$2 (make consumes one `$`); they are bound to
# the trailing arguments `$<` (the .sfdir) and `$@` (the .ttf).
%.ttf: fonts/%.sfdir
	fontforge -lang=ff -c 'Open($$1); Generate($$2)' $< $@

# Export the shenikan CUE package to stdout in $(EXPORT_FORMAT).
dump-dictionary:
	@cue export --out $(EXPORT_FORMAT) -p shenikan

# Debug helper: `make print-VAR` shows VAR's expanded value.
print-%:
	@echo '$* = $($*)'

clean:
	$(RM) $(FONT_TTFS)

3
go.mod Normal file
View file

@ -0,0 +1,3 @@
// Module housing the Shenikan CUE data files and their Go embed wrapper.
module labprogramming.net/shenikandata

go 1.23.1

6
shenikan.go Normal file
View file

@ -0,0 +1,6 @@
// Package shenikandata embeds the Shenikan CUE sources so Go consumers can
// read them without shipping separate files.
package shenikandata

import "embed"

// Cues holds every *.cue file from this module's root directory, embedded
// at build time. (The go:embed directive must stay immediately above the
// var declaration.)
//
//go:embed *.cue
var Cues embed.FS

199
tools/esh.py Executable file
View file

@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
This loosely does similar behaviour to asking ChatGPT the following:
Can you help me find words in your embedding space? I want to give you a basic
arithmetic expression involving words to find relationships between words in
your embedding model. For example king minus man plus woman should probably be
something like queen. Please give me 10 options each time. Are you ready?
"""
import cmd
import re
import os
from gensim import downloader
from thefuzz import process
# Lexer token table: (kind, regex) pairs combined below into one alternation.
# Order matters — the regex engine takes the first alternative that matches —
# so NUMBER precedes WORD (digits lex as numbers) and ERROR is the final
# one-character catch-all.
EMBEDDING_TOKENS = [
    ('NUMBER', r'\d+(\.\d*)?'),  # an integer or decimal number
    ('WORD', r'\w+'),            # a word
    ('PAREN', r'[()]'),          # a parenthesis
    ('OP', r'[+\-*/~]'),         # an arithmetic operator
    ('COMMA', r','),             # a comma
    ('WS', r'\s+'),              # whitespace
    ('ERROR', r'.'),             # anything else
]

# Single compiled regex with one named group per token kind; a match's
# .lastgroup identifies which kind fired.
EMBEDDING_TOKENIZATION_RE = re.compile('|'.join(
    f'(?P<{x[0]}>{x[1]})' for x in EMBEDDING_TOKENS
))
def tokenize_embedding_expr(expr):
    """Yield (token_kind, token_text) pairs for every lexeme in expr."""
    for match in EMBEDDING_TOKENIZATION_RE.finditer(expr):
        kind, text = match.lastgroup, match.group()
        yield (kind, text)
def token_precedence(token):
    """
    Return the binding precedence of an operator token.

    A negative precedence would mark a right-associative operator; none of
    the current operators are right-associative. Non-operators get 0.
    """
    levels = {'+': 1, '-': 1, '~': 1, '*': 2, '/': 2}
    return levels.get(token, 0)
def _goes_first(a, b):
    """Return True when stacked operator `a` should apply before incoming `b`."""
    a_prec = abs(token_precedence(a))
    b_signed = token_precedence(b)
    b_prec = abs(b_signed)
    if a_prec != b_prec:
        # Strictly tighter binding wins outright.
        return a_prec > b_prec
    # Equal precedence: pop only for left-associative (positive) operators.
    return b_signed > 0
def shunt_embedding_tokens(tokens):
    """
    Convert an infix (TOKEN_KIND, text) stream into postfix (RPN) order via
    the shunting-yard algorithm.

    Yields (kind, value) tuples where kind is:
    w - word to be looked up in model and converted to embedding vector
    s - scalar value
    o - operator

    WS/COMMA/ERROR tokens have no branch here and are silently dropped.
    """
    stack = []  # operator stack, just the op itself!
    for (kind, tok) in tokens:
        if kind == 'WORD':
            yield ('w', tok)
        elif kind == 'NUMBER':
            yield ('s', tok)
        elif kind == 'OP':
            # Pop operators that should apply before tok, stopping at '('.
            while stack and stack[-1] != '(' and _goes_first(stack[-1], tok):
                yield ('o', stack.pop())
            stack.append(tok)
        elif kind == 'PAREN':
            if tok == '(':
                stack.append(tok)
            else:
                # Closing paren: flush operators back to the matching '('.
                while stack and stack[-1] != '(':
                    yield ('o', stack.pop())
                if stack:
                    stack.pop()  # remove the '('
    # End of input: flush any remaining operators.
    while stack:
        yield ('o', stack.pop())
def evaluate_embedding_shunt(shunt, model):
    """
    Evaluate a postfix (kind, value) stream from shunt_embedding_tokens.

    Words ('w') are looked up in `model`; a word absent from the vocabulary is
    fuzzy-matched against model.key_to_index via thefuzz. A leading underscore
    negates the word's vector. Scalars ('s') become floats. Operators ('o')
    combine the top two stack entries: + - * / plus '~', which averages them.

    Returns the value left on top of the stack.
    """
    stack = []
    for (kind, x) in shunt:
        if kind == 'w':
            if x[0] == '_':
                # Leading underscore: push the NEGATED vector for the rest
                # of the name.
                if x[1:] in model:
                    stack.append(-model[x[1:]])
                else:
                    most_similar = process.extractOne(x[1:], model.key_to_index.keys())[0]
                    stack.append(-model[most_similar])
            # BUG FIX: this was a separate `if`, so '_word' fell through and
            # pushed a second (fuzzy-matched) vector, corrupting the stack.
            elif x in model:
                stack.append(model[x])
            else:
                most_similar = process.extractOne(x, model.key_to_index.keys())[0]
                stack.append(model[most_similar])
        elif kind == 's':
            stack.append(float(x))
        elif kind == 'o':
            # Binary operators: `a` is the top of stack (right operand),
            # `b` beneath it (left operand).
            a = stack.pop()
            b = stack.pop()
            if x == '+':
                stack.append(a + b)
            elif x == '-':
                stack.append(b - a)
            elif x == '*':
                stack.append(a * b)
            elif x == '/':
                stack.append(b / a)
            elif x == '~':
                stack.append((a + b) / 2)
    return stack[-1]
class EmbeddingShell(cmd.Cmd):
    """
    Interactive REPL over a gensim word-embedding model.

    Plain input lines are treated as arithmetic expressions over words
    (routed to do_exec by precmd); lines starting with ':' are cmd commands
    (:help, :quit, :shunt).
    """
    intro = 'Welcome to the embedding shell. Enter words in an equation to see similar embeddings. Type :help for more information'
    prompt = '(Ʃ) '

    def __init__(self, *args, model='glove-wiki-gigaword-300', **kwargs):
        """Download/load the named gensim model (can be slow) and init cmd.Cmd."""
        super().__init__(completekey='tab', stdin=None, stdout=None, *args, **kwargs)
        print('Loading model...', end='', flush=True)
        self._model = downloader.load(model)
        self._keys = self._model.key_to_index.keys()
        print(' DONE')

    def do_exec(self, arg):
        """Tokenize, shunt, and evaluate arg; print nearest words as a bar chart."""
        try:
            result = evaluate_embedding_shunt(shunt_embedding_tokens(tokenize_embedding_expr(arg)), self._model)
            for (word, sim) in self._model.most_similar(result, restrict_vocab=10000):
                # Scale each similarity bar to the current terminal width.
                (w, _) = os.get_terminal_size()
                bar = '-' * int((w - 20) * sim)
                print(f'{word:10} {bar}')
        except Exception as e:
            # Catch-all so a malformed expression doesn't kill the shell.
            print("Could not evaluate expression:", e)

    def do_shunt(self, arg):
        """Debug helper: print the postfix token stream for arg."""
        for x in shunt_embedding_tokens(tokenize_embedding_expr(arg)):
            print(x)

    def do_quit(self, arg):
        """ Exit the embedding shell. """
        return True

    def precmd(self, line):
        # ':'-prefixed lines are cmd commands; everything else becomes an
        # expression for do_exec.
        if not line:
            return line
        if line[0] == ':':
            return line[1:]
        return 'exec ' + line
# Run the interactive shell when invoked as a script.
if __name__ == '__main__':
    EmbeddingShell().cmdloop()

13
tools/listencoding.sh Executable file
View file

@ -0,0 +1,13 @@
#!/bin/bash
# Print a tab-separated mapping of each dictionary glyph's ASCII-safe
# orthography to its private-use codepoint (hex), starting at
# 0xF3A00 + OFFSET*256. OFFSET is taken from the environment, default 0.
shopt -s extglob
: "${OFFSET:=0}"
codepoint="$((0xF3A00 + OFFSET * 256))"
while read -r glyph_json; do
	# ASCII-fy the orthography: θ→th, ∫→sh, then %-encode anything left
	# and replace '%' with 'q' to keep the name filesystem/ASCII safe.
	ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"
	printf '%s\t=\t%x\n' "$ascii_ortho" "$codepoint"
	codepoint="$((codepoint + 1))"
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')

107
tools/mkemptyfont.sh Executable file
View file

@ -0,0 +1,107 @@
#!/bin/bash
# Interactively scaffold an empty FontForge .sfdir font for the Shenikan
# glyph set: one 'raw' placeholder glyph per ASCII character used by any
# orthography, plus one ligature glyph per dictionary glyph that substitutes
# the typed character sequence.
shopt -s extglob

fontdir="$1.sfdir"
DEFAULT_GLYPH_WIDTH=555

read -r -p "Font name:" fontname
read -r -p "Full name:" fullname
read -r -p "Unicode page offset (/256, 0 is 0xF3Axx):" offset

mkdir "$fontdir"

# Font-wide properties, including the 'liga' lookup the ligature glyphs
# reference below.
cat <<EOF > "$fontdir/font.props"
SplineFontDB: 3.2
FontName: $fontname
FullName: $fullname
Weight: Book
Copyright: Copyright (C) $(date +%Y), $(id -un)
UComments: "$(date +%Y-%m-%d): Created with mkemptyfont.sh"
Version: 001.000
ItalicAngle: 0
UnderlinePosition: -150
UnderlineWidth: 50
Ascent: 800
Descent: 200
sfntRevision: 0x00010000
LayerCount: 2
Layer: 0 0 "Back" 1
Layer: 1 0 "Fore" 0
DisplaySize: -48
AntiAlias: 1
FitToEm: 0
Encoding: Custom
CreationTime: $(date +%s)
ModificationTime: $(date +%s)
DEI: 91125
Lookup: 4 0 1 "'liga' Standard Ligatures in Latin lookup 0" { "'liga' Standard Ligatures in Latin lookup 0-1" } ['liga' ('DFLT' <'dflt' > 'latn' <'dflt' > ) ]
EOF
# BUG FIX above: the UComments date used %M (minute); %m (month) is intended.

# Print the decimal codepoint of a single character.
# NOTE(review): currently unused — kept for future use; confirm before removing.
ord() {
	LC_CTYPE=C printf '%d' "'$1"
}

codepoint="$((0xF3A00 + offset * 256))"
encodingidx=0

# generate the 'raw' replacement glyphs — one per ASCII character (after the
# θ→th / ∫→sh transliteration) appearing in any glyph's orthography.
while read -r glyph_json; do
	while read -r liga_code_point; do
		glyph_file="$fontdir/raw$liga_code_point.glyph"
		if [ ! -e "$glyph_file" ]; then
			echo -n "Generating raw$liga_code_point..."
			cat <<EOF > "$glyph_file"
StartChar: raw$liga_code_point
Encoding: $encodingidx $liga_code_point $encodingidx
Width: $DEFAULT_GLYPH_WIDTH
LayerCount: 2
Comment: "Raw glyph for ligature replacement"
Colour: ff0000
EndChar
EOF
			encodingidx=$((encodingidx + 1))
			echo ' DONE'
		fi
	done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')

# generate the 'real' ligature glyphs, one per dictionary glyph, assigned
# consecutive private-use codepoints.
while read -r glyph_json; do
	liga=( )
	while read -r liga_code_point; do
		liga+=( "raw$liga_code_point" )
	done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
	ortho="$(jq -r '.ortho' <<<"$glyph_json")"
	ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"
	echo -n "Generating $ortho..."
	glyph_file="$fontdir/sh$ascii_ortho.glyph"
	cat <<EOF > "$glyph_file"
StartChar: sh$ascii_ortho
Encoding: $encodingidx $codepoint $encodingidx
Width: $DEFAULT_GLYPH_WIDTH
LayerCount: 2
Comment: "Shenikan $ascii_ortho glyph"
Ligature2: "'liga' Standard Ligatures in Latin lookup 0-1" ${liga[*]}
EOF
	# Multi-character glyphs also get a second ligature entry matching the
	# reversed character order.
	if [ "$(jq '.ortho | length' <<<"$glyph_json")" -gt 1 ]; then
		liga=( )
		while read -r liga_code_point; do
			liga+=( "raw$liga_code_point" )
		done < <(jq -r '.ortho | explode | reverse | implode | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
		echo "Ligature2: \"'liga' Standard Ligatures in Latin lookup 0-1\" ${liga[*]}" >> "$glyph_file"
	fi
	echo 'EndChar' >> "$glyph_file"
	encodingidx=$((encodingidx + 1))
	codepoint=$((codepoint + 1))
	echo ' DONE'
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')