Added some basic tools
This commit is contained in:
parent
d77046d404
commit
51e77feee8
23
Makefile
Normal file
23
Makefile
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
# Build the Shenikan glyph fonts and export the dictionary.
# `fonts` and `dump-dictionary` are phony: a fonts/ directory exists on
# disk, so without this make could consider the `fonts` target up to date.
.PHONY: default clean fonts dump-dictionary

# Output format for `cue export` (json, yaml, ...); override on the
# command line, e.g. `make dump-dictionary EXPORT_FORMAT=yaml`.
EXPORT_FORMAT?=json

default: tools fonts

# Every FontForge spline-font directory fonts/NAME.sfdir yields NAME.ttf.
FONT_DIRS=$(wildcard fonts/*.sfdir)
FONT_LIST=$(patsubst fonts/%.sfdir,%,$(FONT_DIRS))
FONT_TTFS=$(patsubst %,%.ttf,$(FONT_LIST))

fonts: $(FONT_TTFS)

# $$1/$$2 are script arguments seen by fontforge ($ doubled for make).
%.ttf: fonts/%.sfdir
	fontforge -lang=ff -c 'Open($$1); Generate($$2)' $^ $@

dump-dictionary:
	@cue export --out $(EXPORT_FORMAT) -p shenikan

# Debugging helper: `make print-VAR` echoes the value of VAR.
print-%:
	@echo '$* = $($*)'

clean:
	-rm $(FONT_TTFS)
|
6
shenikan.go
Normal file
6
shenikan.go
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
package shenikandata
|
||||||
|
|
||||||
|
import "embed"
|
||||||
|
|
||||||
|
//go:embed *.cue
|
||||||
|
var Cues embed.FS
|
199
tools/esh.py
Executable file
199
tools/esh.py
Executable file
|
@ -0,0 +1,199 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
This loosely does similar behaviour to asking ChatGPT the following:
|
||||||
|
|
||||||
|
Can you help me find words in your embedding space? I want to give you a basic
|
||||||
|
arithmetic expression involving words to find relationships between words in
|
||||||
|
your embedding model. For example king minus man plus woman should probably be
|
||||||
|
something like queen. Please give me 10 options each time. Are you ready?
|
||||||
|
"""
|
||||||
|
|
||||||
|
import cmd
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
|
from gensim import downloader
|
||||||
|
from thefuzz import process
|
||||||
|
|
||||||
|
# Lexer specification: (token kind, regex) pairs, tried in order.
# ERROR must stay last so it only catches what nothing else matched.
EMBEDDING_TOKENS = [
    ('NUMBER', r'\d+(\.\d*)?'),  # an integer or decimal number
    ('WORD', r'\w+'),            # a word
    ('PAREN', r'[()]'),          # a parenthesis
    ('OP', r'[+\-*/~]'),         # an arithmetic operator
    ('COMMA', r','),             # a comma
    ('WS', r'\s+'),              # whitespace
    ('ERROR', r'.'),             # anything else
]

# Single alternation with one named group per kind; match.lastgroup then
# reports which kind fired.
EMBEDDING_TOKENIZATION_RE = re.compile('|'.join(
    f'(?P<{kind}>{pattern})' for (kind, pattern) in EMBEDDING_TOKENS
))
|
||||||
|
|
||||||
|
|
||||||
|
def tokenize_embedding_expr(expr):
    """ Yields a (token_kind, token_text) pair for every lexeme in expr. """
    for match in EMBEDDING_TOKENIZATION_RE.finditer(expr):
        yield (match.lastgroup, match.group())
|
||||||
|
|
||||||
|
|
||||||
|
def token_precedence(token):
    """
    Returns the precedence of the token.

    Higher magnitudes bind tighter; a negative precedence would mark a
    right-associative operator (all current operators are positive, i.e.
    left-associative).  Non-operators get precedence 0.
    """
    precedences = {'+': 1, '-': 1, '~': 1, '*': 2, '/': 2}
    return precedences.get(token, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def _goes_first(a, b):
    """
    True when stacked operator *a* should be applied before incoming *b*.

    Magnitudes are compared so that (future) right-associative operators
    keep their precedence level; on a tie, *a* wins only when *b* is
    left-associative (positive precedence).
    """
    prec_a = token_precedence(a)
    prec_b = token_precedence(b)

    if abs(prec_a) != abs(prec_b):
        return abs(prec_a) > abs(prec_b)

    return prec_b > 0
|
||||||
|
|
||||||
|
|
||||||
|
def shunt_embedding_tokens(tokens):
    """
    Rearranges infix (kind, value) lexer tokens into postfix order
    (shunting-yard).  Yields (kind, value) where kind is:

    w - word to be looked up in model and converted to embedding vector
    s - scalar value
    o - operator

    Whitespace, commas, and unrecognised characters are dropped silently.
    """
    pending = []  # operator stack, just the op itself!

    for (kind, tok) in tokens:
        if kind == 'WORD':
            yield ('w', tok)
        elif kind == 'NUMBER':
            yield ('s', tok)
        elif kind == 'OP':
            # Pop every stacked operator that should apply before tok.
            while pending and pending[-1] != '(' and _goes_first(pending[-1], tok):
                yield ('o', pending.pop())
            pending.append(tok)
        elif kind == 'PAREN':
            if tok == '(':
                pending.append(tok)
            else:
                # Closing paren: flush back to the matching '('.
                while pending and pending[-1] != '(':
                    yield ('o', pending.pop())
                if pending:
                    pending.pop()  # remove the '('

    # Flush whatever operators remain.
    while pending:
        yield ('o', pending.pop())
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_embedding_shunt(shunt, model):
    """
    Evaluates a postfix (kind, value) stream against an embedding model.

    Words are looked up in *model*; a word missing from the model falls
    back to its closest key by fuzzy match.  A leading '_' negates the
    word's vector.  Scalars become floats.  Operators: + - * / act
    elementwise ( '-' and '/' apply as earlier-operand minus/over later),
    and '~' averages its two operands.  Returns the top of the stack.

    Bug fix: the '_'-negation branch previously fell through into the
    plain lookup, pushing a second (fuzzy-matched '_word') vector and
    corrupting the evaluation stack.
    """
    stack = []

    for (kind, x) in shunt:
        if kind == 'w':
            # A leading underscore requests the negated vector.
            negate = x.startswith('_')
            word = x[1:] if negate else x

            if word in model:
                vector = model[word]
            else:
                # Unknown word: substitute the closest key by fuzzy match.
                most_similar = process.extractOne(word, model.key_to_index.keys())[0]
                vector = model[most_similar]

            stack.append(-vector if negate else vector)

        elif kind == 's':
            stack.append(float(x))

        elif kind == 'o' and x in {'+', '-', '*', '/', '~'}:
            a = stack.pop()  # later operand
            b = stack.pop()  # earlier operand

            if x == '+':
                stack.append(a + b)
            elif x == '-':
                stack.append(b - a)
            elif x == '*':
                stack.append(a * b)
            elif x == '/':
                stack.append(b / a)
            elif x == '~':
                stack.append((a + b) / 2)  # '~' averages its operands

    return stack[-1]
|
||||||
|
|
||||||
|
|
||||||
|
class EmbeddingShell(cmd.Cmd):
    """ Interactive REPL for exploring word-embedding arithmetic. """
    intro = 'Welcome to the embedding shell. Enter words in an equation to see similar embeddings. Type :help for more information'
    prompt = '(Ʃ) '

    def __init__(self, *args, model='glove-wiki-gigaword-300', **kwargs):
        """ Downloads/loads the gensim model named by *model* and prepares the shell. """
        super().__init__(completekey='tab', stdin=None, stdout=None, *args, **kwargs)
        print('Loading model...', end='', flush=True)
        self._model = downloader.load(model)
        self._keys = self._model.key_to_index.keys()
        print(' DONE')

    def do_exec(self, arg):
        """ Evaluates an embedding expression and prints the most similar words. """
        try:
            result = evaluate_embedding_shunt(shunt_embedding_tokens(tokenize_embedding_expr(arg)), self._model)

            # Terminal width is loop-invariant: fetch it once, not per row.
            (w, _) = os.get_terminal_size()
            for (word, sim) in self._model.most_similar(result, restrict_vocab=10000):
                # Similarity rendered as a proportional ASCII bar.
                bar = '-' * int((w - 20) * sim)
                print(f'{word:10} {bar}')
        except Exception as e:
            # REPL boundary: report the problem rather than crash the shell.
            print("Could not evaluate expression:", e)

    def do_shunt(self, arg):
        """ Prints the postfix token stream for an expression (debugging aid). """
        for token in shunt_embedding_tokens(tokenize_embedding_expr(arg)):
            print(token)

    def do_quit(self, arg):
        """ Exit the embedding shell. """
        return True

    def precmd(self, line):
        """ Routes ':cmd' lines to shell commands; bare input becomes an exec. """
        if not line:
            return line
        if line[0] == ':':
            return line[1:]
        return 'exec ' + line
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Run the interactive shell when executed as a script.
    EmbeddingShell().cmdloop()
|
13
tools/listencoding.sh
Executable file
13
tools/listencoding.sh
Executable file
|
@ -0,0 +1,13 @@
|
||||||
|
#!/bin/bash
# Prints an "ascii-ortho = codepoint" table for every glyph in the
# Shenikan dictionary, starting at private-use page 0xF3A00.

shopt -s extglob

# Page offset (multiples of 256) may be supplied via the environment.
: "${OFFSET:=0}"

codepoint="$((0xF3A00 + OFFSET * 256))"
while read -r glyph_json; do
	# ASCII-safe name: th/sh substitutions, then URI-encode with 'q' for '%'.
	ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"

	printf '%s\t=\t%x\n' "$ascii_ortho" "$codepoint"
	codepoint="$((codepoint + 1))"
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')
|
107
tools/mkemptyfont.sh
Executable file
107
tools/mkemptyfont.sh
Executable file
|
@ -0,0 +1,107 @@
|
||||||
|
#!/bin/bash
# Creates an empty FontForge .sfdir for the Shenikan glyph set:
#   - writes font.props with interactively-gathered metadata,
#   - generates a 'raw' placeholder glyph per ASCII character used,
#   - generates a ligature glyph per dictionary entry (both reading
#     orders for multi-character orthographies).
# Usage: mkemptyfont.sh FONTBASENAME
# Bug fix: the UComments creation date used %M (minute) where %m (month)
# was intended.

shopt -s extglob

fontdir="$1.sfdir"

DEFAULT_GLYPH_WIDTH=555

read -r -p "Font name:" fontname
read -r -p "Full name:" fullname
read -r -p "Unicode page offset (/256, 0 is 0xF3Axx):" offset

mkdir "$fontdir"

cat <<EOF > "$fontdir/font.props"
SplineFontDB: 3.2
FontName: $fontname
FullName: $fullname
Weight: Book
Copyright: Copyright (C) $(date +%Y), $(id -un)
UComments: "$(date +%Y-%m-%d): Created with mkemptyfont.sh"
Version: 001.000
ItalicAngle: 0
UnderlinePosition: -150
UnderlineWidth: 50
Ascent: 800
Descent: 200
sfntRevision: 0x00010000
LayerCount: 2
Layer: 0 0 "Back" 1
Layer: 1 0 "Fore" 0
DisplaySize: -48
AntiAlias: 1
FitToEm: 0
Encoding: Custom
CreationTime: $(date +%s)
ModificationTime: $(date +%s)
DEI: 91125
Lookup: 4 0 1 "'liga' Standard Ligatures in Latin lookup 0" { "'liga' Standard Ligatures in Latin lookup 0-1" } ['liga' ('DFLT' <'dflt' > 'latn' <'dflt' > ) ]
EOF

# NOTE(review): ord() is currently unused by this script.
ord() {
	LC_CTYPE=C printf '%d' "'$1"
}

codepoint="$((0xF3A00 + offset * 256))"
encodingidx=0

# generate the 'raw' replacement glyphs
while read -r glyph_json; do
	while read -r liga_code_point; do
		glyph_file="$fontdir/raw$liga_code_point.glyph"
		# Characters shared between orthographies get one glyph only.
		if [ ! -e "$glyph_file" ]; then
			echo -n "Generating raw$liga_code_point..."

			cat <<EOF > "$glyph_file"
StartChar: raw$liga_code_point
Encoding: $encodingidx $liga_code_point $encodingidx
Width: $DEFAULT_GLYPH_WIDTH
LayerCount: 2
Comment: "Raw glyph for ligature replacement"
Colour: ff0000
EndChar
EOF
			encodingidx=$((encodingidx + 1))

			echo ' DONE'
		fi
	done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')

# generate the 'real' ligature glyphs
while read -r glyph_json; do
	# Forward-order ligature: sequence of raw glyph names.
	liga=( )
	while read -r liga_code_point; do
		liga+=( "raw$liga_code_point" )
	done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")

	ortho="$(jq -r '.ortho' <<<"$glyph_json")"
	# ASCII-safe name: th/sh substitutions, then URI-encode with 'q' for '%'.
	ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"
	echo -n "Generating $ortho..."

	glyph_file="$fontdir/sh$ascii_ortho.glyph"
	cat <<EOF > "$glyph_file"
StartChar: sh$ascii_ortho
Encoding: $encodingidx $codepoint $encodingidx
Width: $DEFAULT_GLYPH_WIDTH
LayerCount: 2
Comment: "Shenikan $ascii_ortho glyph"
Ligature2: "'liga' Standard Ligatures in Latin lookup 0-1" ${liga[*]}
EOF

	# Multi-character orthographies also ligate in reversed order.
	if [ "$(jq '.ortho | length' <<<"$glyph_json")" -gt 1 ]; then
		liga=( )
		while read -r liga_code_point; do
			liga+=( "raw$liga_code_point" )
		done < <(jq -r '.ortho | explode | reverse | implode | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")

		echo "Ligature2: \"'liga' Standard Ligatures in Latin lookup 0-1\" ${liga[*]}" >> "$glyph_file"
	fi

	echo 'EndChar' >> "$glyph_file"
	encodingidx=$((encodingidx + 1))
	codepoint=$((codepoint + 1))

	echo ' DONE'
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')
|
Loading…
Reference in a new issue