Added some basic tools
This commit is contained in:
parent
d77046d404
commit
51e77feee8
23
Makefile
Normal file
23
Makefile
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
# Build the Shenikan glyph fonts and export the dictionary.
# `fonts` and `dump-dictionary` are phony: a fonts/ directory exists on
# disk, so without this make could consider the `fonts` target up to date.
.PHONY: default clean fonts dump-dictionary

# Output format for `cue export` (json, yaml, ...); override on the
# command line, e.g. `make dump-dictionary EXPORT_FORMAT=yaml`.
EXPORT_FORMAT?=json

default: tools fonts

# Every FontForge spline-font directory fonts/NAME.sfdir yields NAME.ttf.
FONT_DIRS=$(wildcard fonts/*.sfdir)
FONT_LIST=$(patsubst fonts/%.sfdir,%,$(FONT_DIRS))
FONT_TTFS=$(patsubst %,%.ttf,$(FONT_LIST))

fonts: $(FONT_TTFS)

# $$1/$$2 are script arguments seen by fontforge ($ doubled for make).
%.ttf: fonts/%.sfdir
	fontforge -lang=ff -c 'Open($$1); Generate($$2)' $^ $@

dump-dictionary:
	@cue export --out $(EXPORT_FORMAT) -p shenikan

# Debugging helper: `make print-VAR` echoes the value of VAR.
print-%:
	@echo '$* = $($*)'

clean:
	-rm $(FONT_TTFS)
|
6
shenikan.go
Normal file
6
shenikan.go
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
package shenikandata
|
||||||
|
|
||||||
|
import "embed"
|
||||||
|
|
||||||
|
//go:embed *.cue
|
||||||
|
var Cues embed.FS
|
199
tools/esh.py
Executable file
199
tools/esh.py
Executable file
|
@ -0,0 +1,199 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
This loosely does similar behaviour to asking ChatGPT the following:
|
||||||
|
|
||||||
|
Can you help me find words in your embedding space? I want to give you a basic
|
||||||
|
arithmetic expression involving words to find relationships between words in
|
||||||
|
your embedding model. For example king minus man plus woman should probably be
|
||||||
|
something like queen. Please give me 10 options each time. Are you ready?
|
||||||
|
"""
|
||||||
|
|
||||||
|
import cmd
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
|
from gensim import downloader
|
||||||
|
from thefuzz import process
|
||||||
|
|
||||||
|
# Lexer specification: (token kind, regex) pairs, tried in order.
# ERROR must stay last so it only catches what nothing else matched.
EMBEDDING_TOKENS = [
    ('NUMBER', r'\d+(\.\d*)?'),  # an integer or decimal number
    ('WORD', r'\w+'),            # a word
    ('PAREN', r'[()]'),          # a parenthesis
    ('OP', r'[+\-*/~]'),         # an arithmetic operator
    ('COMMA', r','),             # a comma
    ('WS', r'\s+'),              # whitespace
    ('ERROR', r'.'),             # anything else
]

# Single alternation with one named group per kind; match.lastgroup then
# reports which kind fired.
EMBEDDING_TOKENIZATION_RE = re.compile('|'.join(
    f'(?P<{kind}>{pattern})' for (kind, pattern) in EMBEDDING_TOKENS
))
|
||||||
|
|
||||||
|
|
||||||
|
def tokenize_embedding_expr(expr):
    """ Yields a (token_kind, token_text) pair for every lexeme in expr. """
    for match in EMBEDDING_TOKENIZATION_RE.finditer(expr):
        yield (match.lastgroup, match.group())
|
||||||
|
|
||||||
|
|
||||||
|
def token_precedence(token):
    """
    Returns the precedence of the token.

    Higher magnitudes bind tighter; a negative precedence would mark a
    right-associative operator (all current operators are positive, i.e.
    left-associative).  Non-operators get precedence 0.
    """
    precedences = {'+': 1, '-': 1, '~': 1, '*': 2, '/': 2}
    return precedences.get(token, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def _goes_first(a, b):
    """
    True when stacked operator *a* should be applied before incoming *b*.

    Magnitudes are compared so that (future) right-associative operators
    keep their precedence level; on a tie, *a* wins only when *b* is
    left-associative (positive precedence).
    """
    prec_a = token_precedence(a)
    prec_b = token_precedence(b)

    if abs(prec_a) != abs(prec_b):
        return abs(prec_a) > abs(prec_b)

    return prec_b > 0
|
||||||
|
|
||||||
|
|
||||||
|
def shunt_embedding_tokens(tokens):
    """
    Rearranges infix (kind, value) lexer tokens into postfix order
    (shunting-yard).  Yields (kind, value) where kind is:

    w - word to be looked up in model and converted to embedding vector
    s - scalar value
    o - operator

    Whitespace, commas, and unrecognised characters are dropped silently.
    """
    pending = []  # operator stack, just the op itself!

    for (kind, tok) in tokens:
        if kind == 'WORD':
            yield ('w', tok)
        elif kind == 'NUMBER':
            yield ('s', tok)
        elif kind == 'OP':
            # Pop every stacked operator that should apply before tok.
            while pending and pending[-1] != '(' and _goes_first(pending[-1], tok):
                yield ('o', pending.pop())
            pending.append(tok)
        elif kind == 'PAREN':
            if tok == '(':
                pending.append(tok)
            else:
                # Closing paren: flush back to the matching '('.
                while pending and pending[-1] != '(':
                    yield ('o', pending.pop())
                if pending:
                    pending.pop()  # remove the '('

    # Flush whatever operators remain.
    while pending:
        yield ('o', pending.pop())
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_embedding_shunt(shunt, model):
    """
    Evaluates a postfix (kind, value) stream against an embedding model.

    Words are looked up in *model*; a word missing from the model falls
    back to its closest key by fuzzy match.  A leading '_' negates the
    word's vector.  Scalars become floats.  Operators: + - * / act
    elementwise ( '-' and '/' apply as earlier-operand minus/over later),
    and '~' averages its two operands.  Returns the top of the stack.

    Bug fix: the '_'-negation branch previously fell through into the
    plain lookup, pushing a second (fuzzy-matched '_word') vector and
    corrupting the evaluation stack.
    """
    stack = []

    for (kind, x) in shunt:
        if kind == 'w':
            # A leading underscore requests the negated vector.
            negate = x.startswith('_')
            word = x[1:] if negate else x

            if word in model:
                vector = model[word]
            else:
                # Unknown word: substitute the closest key by fuzzy match.
                most_similar = process.extractOne(word, model.key_to_index.keys())[0]
                vector = model[most_similar]

            stack.append(-vector if negate else vector)

        elif kind == 's':
            stack.append(float(x))

        elif kind == 'o' and x in {'+', '-', '*', '/', '~'}:
            a = stack.pop()  # later operand
            b = stack.pop()  # earlier operand

            if x == '+':
                stack.append(a + b)
            elif x == '-':
                stack.append(b - a)
            elif x == '*':
                stack.append(a * b)
            elif x == '/':
                stack.append(b / a)
            elif x == '~':
                stack.append((a + b) / 2)  # '~' averages its operands

    return stack[-1]
|
||||||
|
|
||||||
|
|
||||||
|
class EmbeddingShell(cmd.Cmd):
    """ Interactive REPL for exploring word-embedding arithmetic. """
    intro = 'Welcome to the embedding shell. Enter words in an equation to see similar embeddings. Type :help for more information'
    prompt = '(Ʃ) '

    def __init__(self, *args, model='glove-wiki-gigaword-300', **kwargs):
        """ Downloads/loads the gensim model named by *model* and prepares the shell. """
        super().__init__(completekey='tab', stdin=None, stdout=None, *args, **kwargs)
        print('Loading model...', end='', flush=True)
        self._model = downloader.load(model)
        self._keys = self._model.key_to_index.keys()
        print(' DONE')

    def do_exec(self, arg):
        """ Evaluates an embedding expression and prints the most similar words. """
        try:
            result = evaluate_embedding_shunt(shunt_embedding_tokens(tokenize_embedding_expr(arg)), self._model)

            # Terminal width is loop-invariant: fetch it once, not per row.
            (w, _) = os.get_terminal_size()
            for (word, sim) in self._model.most_similar(result, restrict_vocab=10000):
                # Similarity rendered as a proportional ASCII bar.
                bar = '-' * int((w - 20) * sim)
                print(f'{word:10} {bar}')
        except Exception as e:
            # REPL boundary: report the problem rather than crash the shell.
            print("Could not evaluate expression:", e)

    def do_shunt(self, arg):
        """ Prints the postfix token stream for an expression (debugging aid). """
        for token in shunt_embedding_tokens(tokenize_embedding_expr(arg)):
            print(token)

    def do_quit(self, arg):
        """ Exit the embedding shell. """
        return True

    def precmd(self, line):
        """ Routes ':cmd' lines to shell commands; bare input becomes an exec. """
        if not line:
            return line
        if line[0] == ':':
            return line[1:]
        return 'exec ' + line
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Run the interactive shell when executed as a script.
    EmbeddingShell().cmdloop()
|
13
tools/listencoding.sh
Executable file
13
tools/listencoding.sh
Executable file
|
@ -0,0 +1,13 @@
|
||||||
|
#!/bin/bash
# Prints an "ascii-ortho = codepoint" table for every glyph in the
# Shenikan dictionary, starting at private-use page 0xF3A00.

shopt -s extglob

# Page offset (multiples of 256) may be supplied via the environment.
: "${OFFSET:=0}"

codepoint="$((0xF3A00 + OFFSET * 256))"
while read -r glyph_json; do
	# ASCII-safe name: th/sh substitutions, then URI-encode with 'q' for '%'.
	ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"

	printf '%s\t=\t%x\n' "$ascii_ortho" "$codepoint"
	codepoint="$((codepoint + 1))"
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')
|
107
tools/mkemptyfont.sh
Executable file
107
tools/mkemptyfont.sh
Executable file
|
@ -0,0 +1,107 @@
|
||||||
|
#!/bin/bash
# Creates an empty FontForge .sfdir for the Shenikan glyph set:
#   - writes font.props with interactively-gathered metadata,
#   - generates a 'raw' placeholder glyph per ASCII character used,
#   - generates a ligature glyph per dictionary entry (both reading
#     orders for multi-character orthographies).
# Usage: mkemptyfont.sh FONTBASENAME
# Bug fix: the UComments creation date used %M (minute) where %m (month)
# was intended.

shopt -s extglob

fontdir="$1.sfdir"

DEFAULT_GLYPH_WIDTH=555

read -r -p "Font name:" fontname
read -r -p "Full name:" fullname
read -r -p "Unicode page offset (/256, 0 is 0xF3Axx):" offset

mkdir "$fontdir"

cat <<EOF > "$fontdir/font.props"
SplineFontDB: 3.2
FontName: $fontname
FullName: $fullname
Weight: Book
Copyright: Copyright (C) $(date +%Y), $(id -un)
UComments: "$(date +%Y-%m-%d): Created with mkemptyfont.sh"
Version: 001.000
ItalicAngle: 0
UnderlinePosition: -150
UnderlineWidth: 50
Ascent: 800
Descent: 200
sfntRevision: 0x00010000
LayerCount: 2
Layer: 0 0 "Back" 1
Layer: 1 0 "Fore" 0
DisplaySize: -48
AntiAlias: 1
FitToEm: 0
Encoding: Custom
CreationTime: $(date +%s)
ModificationTime: $(date +%s)
DEI: 91125
Lookup: 4 0 1 "'liga' Standard Ligatures in Latin lookup 0" { "'liga' Standard Ligatures in Latin lookup 0-1" } ['liga' ('DFLT' <'dflt' > 'latn' <'dflt' > ) ]
EOF

# NOTE(review): ord() is currently unused by this script.
ord() {
	LC_CTYPE=C printf '%d' "'$1"
}

codepoint="$((0xF3A00 + offset * 256))"
encodingidx=0

# generate the 'raw' replacement glyphs
while read -r glyph_json; do
	while read -r liga_code_point; do
		glyph_file="$fontdir/raw$liga_code_point.glyph"
		# Characters shared between orthographies get one glyph only.
		if [ ! -e "$glyph_file" ]; then
			echo -n "Generating raw$liga_code_point..."

			cat <<EOF > "$glyph_file"
StartChar: raw$liga_code_point
Encoding: $encodingidx $liga_code_point $encodingidx
Width: $DEFAULT_GLYPH_WIDTH
LayerCount: 2
Comment: "Raw glyph for ligature replacement"
Colour: ff0000
EndChar
EOF
			encodingidx=$((encodingidx + 1))

			echo ' DONE'
		fi
	done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')

# generate the 'real' ligature glyphs
while read -r glyph_json; do
	# Forward-order ligature: sequence of raw glyph names.
	liga=( )
	while read -r liga_code_point; do
		liga+=( "raw$liga_code_point" )
	done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")

	ortho="$(jq -r '.ortho' <<<"$glyph_json")"
	# ASCII-safe name: th/sh substitutions, then URI-encode with 'q' for '%'.
	ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"
	echo -n "Generating $ortho..."

	glyph_file="$fontdir/sh$ascii_ortho.glyph"
	cat <<EOF > "$glyph_file"
StartChar: sh$ascii_ortho
Encoding: $encodingidx $codepoint $encodingidx
Width: $DEFAULT_GLYPH_WIDTH
LayerCount: 2
Comment: "Shenikan $ascii_ortho glyph"
Ligature2: "'liga' Standard Ligatures in Latin lookup 0-1" ${liga[*]}
EOF

	# Multi-character orthographies also ligate in reversed order.
	if [ "$(jq '.ortho | length' <<<"$glyph_json")" -gt 1 ]; then
		liga=( )
		while read -r liga_code_point; do
			liga+=( "raw$liga_code_point" )
		done < <(jq -r '.ortho | explode | reverse | implode | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")

		echo "Ligature2: \"'liga' Standard Ligatures in Latin lookup 0-1\" ${liga[*]}" >> "$glyph_file"
	fi

	echo 'EndChar' >> "$glyph_file"
	encodingidx=$((encodingidx + 1))
	codepoint=$((codepoint + 1))

	echo ' DONE'
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')
|
Loading…
Reference in a new issue