Added some basic tools

This commit is contained in:
Louis Burke 2025-02-03 20:59:03 -05:00
parent d77046d404
commit 51e77feee8
6 changed files with 351 additions and 0 deletions

23
Makefile Normal file
View file

@ -0,0 +1,23 @@
# Build Shenikan font TTFs from FontForge .sfdir sources and export the
# CUE dictionary.
.PHONY: default fonts dump-dictionary clean

# Output format for `cue export` (override: make dump-dictionary EXPORT_FORMAT=yaml)
EXPORT_FORMAT?=json

# Delete a half-written target if its recipe fails, so a truncated .ttf
# never looks "up to date".
.DELETE_ON_ERROR:

# NOTE(review): no `tools` rule is visible in this file — confirm it is
# defined elsewhere, otherwise `make` fails with "No rule to make target".
default: tools fonts

# Derive the font list once at parse time (:= avoids re-globbing on every
# expansion).
FONT_DIRS:=$(wildcard fonts/*.sfdir)
FONT_LIST:=$(patsubst fonts/%.sfdir,%,$(FONT_DIRS))
FONT_TTFS:=$(patsubst %,%.ttf,$(FONT_LIST))

fonts: $(FONT_TTFS)

# Generate each TTF from its spline-font directory. `$$1`/`$$2` reach the
# fontforge interpreter as $1/$2 (make consumes one `$`); they are bound to
# the trailing arguments `$<` (the .sfdir) and `$@` (the .ttf).
%.ttf: fonts/%.sfdir
	fontforge -lang=ff -c 'Open($$1); Generate($$2)' $< $@

# Export the shenikan CUE package to stdout in $(EXPORT_FORMAT).
dump-dictionary:
	@cue export --out $(EXPORT_FORMAT) -p shenikan

# Debug helper: `make print-VAR` shows VAR's expanded value.
print-%:
	@echo '$* = $($*)'

clean:
	$(RM) $(FONT_TTFS)

3
go.mod Normal file
View file

@ -0,0 +1,3 @@
// Module housing the Shenikan CUE data files and their Go embed wrapper.
module labprogramming.net/shenikandata

go 1.23.1

6
shenikan.go Normal file
View file

@ -0,0 +1,6 @@
// Package shenikandata embeds the Shenikan CUE sources so Go consumers can
// read them without shipping separate files.
package shenikandata

import "embed"

// Cues holds every *.cue file from this module's root directory, embedded
// at build time. (The go:embed directive must stay immediately above the
// var declaration.)
//
//go:embed *.cue
var Cues embed.FS

199
tools/esh.py Executable file
View file

@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
This loosely does similar behaviour to asking ChatGPT the following:
Can you help me find words in your embedding space? I want to give you a basic
arithmetic expression involving words to find relationships between words in
your embedding model. For example king minus man plus woman should probably be
something like queen. Please give me 10 options each time. Are you ready?
"""
import cmd
import re
import os
from gensim import downloader
from thefuzz import process
# Lexer token table: (kind, regex) pairs combined below into one alternation.
# Order matters — the regex engine takes the first alternative that matches —
# so NUMBER precedes WORD (digits lex as numbers) and ERROR is the final
# one-character catch-all.
EMBEDDING_TOKENS = [
    ('NUMBER', r'\d+(\.\d*)?'),  # an integer or decimal number
    ('WORD', r'\w+'),            # a word
    ('PAREN', r'[()]'),          # a parenthesis
    ('OP', r'[+\-*/~]'),         # an arithmetic operator
    ('COMMA', r','),             # a comma
    ('WS', r'\s+'),              # whitespace
    ('ERROR', r'.'),             # anything else
]

# Single compiled regex with one named group per token kind; a match's
# .lastgroup identifies which kind fired.
EMBEDDING_TOKENIZATION_RE = re.compile('|'.join(
    f'(?P<{x[0]}>{x[1]})' for x in EMBEDDING_TOKENS
))
def tokenize_embedding_expr(expr):
    """Yield (token_kind, token_text) pairs for every lexeme in expr."""
    for match in EMBEDDING_TOKENIZATION_RE.finditer(expr):
        kind, text = match.lastgroup, match.group()
        yield (kind, text)
def token_precedence(token):
    """
    Return the binding precedence of an operator token.

    A negative precedence would mark a right-associative operator; none of
    the current operators are right-associative. Non-operators get 0.
    """
    levels = {'+': 1, '-': 1, '~': 1, '*': 2, '/': 2}
    return levels.get(token, 0)
def _goes_first(a, b):
    """Return True when stacked operator `a` should apply before incoming `b`."""
    a_prec = abs(token_precedence(a))
    b_signed = token_precedence(b)
    b_prec = abs(b_signed)
    if a_prec != b_prec:
        # Strictly tighter binding wins outright.
        return a_prec > b_prec
    # Equal precedence: pop only for left-associative (positive) operators.
    return b_signed > 0
def shunt_embedding_tokens(tokens):
    """
    Convert an infix (TOKEN_KIND, text) stream into postfix (RPN) order via
    the shunting-yard algorithm.

    Yields (kind, value) tuples where kind is:
    w - word to be looked up in model and converted to embedding vector
    s - scalar value
    o - operator

    WS/COMMA/ERROR tokens have no branch here and are silently dropped.
    """
    stack = []  # operator stack, just the op itself!
    for (kind, tok) in tokens:
        if kind == 'WORD':
            yield ('w', tok)
        elif kind == 'NUMBER':
            yield ('s', tok)
        elif kind == 'OP':
            # Pop operators that should apply before tok, stopping at '('.
            while stack and stack[-1] != '(' and _goes_first(stack[-1], tok):
                yield ('o', stack.pop())
            stack.append(tok)
        elif kind == 'PAREN':
            if tok == '(':
                stack.append(tok)
            else:
                # Closing paren: flush operators back to the matching '('.
                while stack and stack[-1] != '(':
                    yield ('o', stack.pop())
                if stack:
                    stack.pop()  # remove the '('
    # End of input: flush any remaining operators.
    while stack:
        yield ('o', stack.pop())
def evaluate_embedding_shunt(shunt, model):
    """
    Evaluate a postfix (kind, value) stream from shunt_embedding_tokens.

    Words ('w') are looked up in `model`; a word absent from the vocabulary is
    fuzzy-matched against model.key_to_index via thefuzz. A leading underscore
    negates the word's vector. Scalars ('s') become floats. Operators ('o')
    combine the top two stack entries: + - * / plus '~', which averages them.

    Returns the value left on top of the stack.
    """
    stack = []
    for (kind, x) in shunt:
        if kind == 'w':
            if x[0] == '_':
                # Leading underscore: push the NEGATED vector for the rest
                # of the name.
                if x[1:] in model:
                    stack.append(-model[x[1:]])
                else:
                    most_similar = process.extractOne(x[1:], model.key_to_index.keys())[0]
                    stack.append(-model[most_similar])
            # BUG FIX: this was a separate `if`, so '_word' fell through and
            # pushed a second (fuzzy-matched) vector, corrupting the stack.
            elif x in model:
                stack.append(model[x])
            else:
                most_similar = process.extractOne(x, model.key_to_index.keys())[0]
                stack.append(model[most_similar])
        elif kind == 's':
            stack.append(float(x))
        elif kind == 'o':
            # Binary operators: `a` is the top of stack (right operand),
            # `b` beneath it (left operand).
            a = stack.pop()
            b = stack.pop()
            if x == '+':
                stack.append(a + b)
            elif x == '-':
                stack.append(b - a)
            elif x == '*':
                stack.append(a * b)
            elif x == '/':
                stack.append(b / a)
            elif x == '~':
                stack.append((a + b) / 2)
    return stack[-1]
class EmbeddingShell(cmd.Cmd):
    """
    Interactive REPL over a gensim word-embedding model.

    Plain input lines are treated as arithmetic expressions over words
    (routed to do_exec by precmd); lines starting with ':' are cmd commands
    (:help, :quit, :shunt).
    """
    intro = 'Welcome to the embedding shell. Enter words in an equation to see similar embeddings. Type :help for more information'
    prompt = '(Ʃ) '

    def __init__(self, *args, model='glove-wiki-gigaword-300', **kwargs):
        """Download/load the named gensim model (can be slow) and init cmd.Cmd."""
        super().__init__(completekey='tab', stdin=None, stdout=None, *args, **kwargs)
        print('Loading model...', end='', flush=True)
        self._model = downloader.load(model)
        self._keys = self._model.key_to_index.keys()
        print(' DONE')

    def do_exec(self, arg):
        """Tokenize, shunt, and evaluate arg; print nearest words as a bar chart."""
        try:
            result = evaluate_embedding_shunt(shunt_embedding_tokens(tokenize_embedding_expr(arg)), self._model)
            for (word, sim) in self._model.most_similar(result, restrict_vocab=10000):
                # Scale each similarity bar to the current terminal width.
                (w, _) = os.get_terminal_size()
                bar = '-' * int((w - 20) * sim)
                print(f'{word:10} {bar}')
        except Exception as e:
            # Catch-all so a malformed expression doesn't kill the shell.
            print("Could not evaluate expression:", e)

    def do_shunt(self, arg):
        """Debug helper: print the postfix token stream for arg."""
        for x in shunt_embedding_tokens(tokenize_embedding_expr(arg)):
            print(x)

    def do_quit(self, arg):
        """ Exit the embedding shell. """
        return True

    def precmd(self, line):
        # ':'-prefixed lines are cmd commands; everything else becomes an
        # expression for do_exec.
        if not line:
            return line
        if line[0] == ':':
            return line[1:]
        return 'exec ' + line
# Run the interactive shell when invoked as a script.
if __name__ == '__main__':
    EmbeddingShell().cmdloop()

13
tools/listencoding.sh Executable file
View file

@ -0,0 +1,13 @@
#!/bin/bash
# Print a tab-separated mapping of each dictionary glyph's ASCII-safe
# orthography to its private-use codepoint (hex), starting at
# 0xF3A00 + OFFSET*256. OFFSET is taken from the environment, default 0.
shopt -s extglob
: "${OFFSET:=0}"
codepoint="$((0xF3A00 + OFFSET * 256))"
while read -r glyph_json; do
	# ASCII-fy the orthography: θ→th, ∫→sh, then %-encode anything left
	# and replace '%' with 'q' to keep the name filesystem/ASCII safe.
	ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"
	printf '%s\t=\t%x\n' "$ascii_ortho" "$codepoint"
	codepoint="$((codepoint + 1))"
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')

107
tools/mkemptyfont.sh Executable file
View file

@ -0,0 +1,107 @@
#!/bin/bash
# Interactively scaffold an empty FontForge .sfdir font for the Shenikan
# glyph set: one 'raw' placeholder glyph per ASCII character used by any
# orthography, plus one ligature glyph per dictionary glyph that substitutes
# the typed character sequence.
shopt -s extglob

fontdir="$1.sfdir"
DEFAULT_GLYPH_WIDTH=555

read -r -p "Font name:" fontname
read -r -p "Full name:" fullname
read -r -p "Unicode page offset (/256, 0 is 0xF3Axx):" offset

mkdir "$fontdir"

# Font-wide properties, including the 'liga' lookup the ligature glyphs
# reference below.
cat <<EOF > "$fontdir/font.props"
SplineFontDB: 3.2
FontName: $fontname
FullName: $fullname
Weight: Book
Copyright: Copyright (C) $(date +%Y), $(id -un)
UComments: "$(date +%Y-%m-%d): Created with mkemptyfont.sh"
Version: 001.000
ItalicAngle: 0
UnderlinePosition: -150
UnderlineWidth: 50
Ascent: 800
Descent: 200
sfntRevision: 0x00010000
LayerCount: 2
Layer: 0 0 "Back" 1
Layer: 1 0 "Fore" 0
DisplaySize: -48
AntiAlias: 1
FitToEm: 0
Encoding: Custom
CreationTime: $(date +%s)
ModificationTime: $(date +%s)
DEI: 91125
Lookup: 4 0 1 "'liga' Standard Ligatures in Latin lookup 0" { "'liga' Standard Ligatures in Latin lookup 0-1" } ['liga' ('DFLT' <'dflt' > 'latn' <'dflt' > ) ]
EOF
# BUG FIX above: the UComments date used %M (minute); %m (month) is intended.

# Print the decimal codepoint of a single character.
# NOTE(review): currently unused — kept for future use; confirm before removing.
ord() {
	LC_CTYPE=C printf '%d' "'$1"
}

codepoint="$((0xF3A00 + offset * 256))"
encodingidx=0

# generate the 'raw' replacement glyphs — one per ASCII character (after the
# θ→th / ∫→sh transliteration) appearing in any glyph's orthography.
while read -r glyph_json; do
	while read -r liga_code_point; do
		glyph_file="$fontdir/raw$liga_code_point.glyph"
		if [ ! -e "$glyph_file" ]; then
			echo -n "Generating raw$liga_code_point..."
			cat <<EOF > "$glyph_file"
StartChar: raw$liga_code_point
Encoding: $encodingidx $liga_code_point $encodingidx
Width: $DEFAULT_GLYPH_WIDTH
LayerCount: 2
Comment: "Raw glyph for ligature replacement"
Colour: ff0000
EndChar
EOF
			encodingidx=$((encodingidx + 1))
			echo ' DONE'
		fi
	done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')

# generate the 'real' ligature glyphs, one per dictionary glyph, assigned
# consecutive private-use codepoints.
while read -r glyph_json; do
	liga=( )
	while read -r liga_code_point; do
		liga+=( "raw$liga_code_point" )
	done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
	ortho="$(jq -r '.ortho' <<<"$glyph_json")"
	ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"
	echo -n "Generating $ortho..."
	glyph_file="$fontdir/sh$ascii_ortho.glyph"
	cat <<EOF > "$glyph_file"
StartChar: sh$ascii_ortho
Encoding: $encodingidx $codepoint $encodingidx
Width: $DEFAULT_GLYPH_WIDTH
LayerCount: 2
Comment: "Shenikan $ascii_ortho glyph"
Ligature2: "'liga' Standard Ligatures in Latin lookup 0-1" ${liga[*]}
EOF
	# Multi-character glyphs also get a second ligature entry matching the
	# reversed character order.
	if [ "$(jq '.ortho | length' <<<"$glyph_json")" -gt 1 ]; then
		liga=( )
		while read -r liga_code_point; do
			liga+=( "raw$liga_code_point" )
		done < <(jq -r '.ortho | explode | reverse | implode | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
		echo "Ligature2: \"'liga' Standard Ligatures in Latin lookup 0-1\" ${liga[*]}" >> "$glyph_file"
	fi
	echo 'EndChar' >> "$glyph_file"
	encodingidx=$((encodingidx + 1))
	codepoint=$((codepoint + 1))
	echo ' DONE'
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')