Added some basic tools

parent d77046d404
commit 51e77feee8

Makefile (new file, 23 lines)
@@ -0,0 +1,23 @@
.PHONY: default clean

EXPORT_FORMAT?=json

default: tools fonts

FONT_DIRS=$(wildcard fonts/*.sfdir)
FONT_LIST=$(patsubst fonts/%.sfdir,%,$(FONT_DIRS))
FONT_TTFS=$(patsubst %,%.ttf,$(FONT_LIST))

fonts: $(FONT_TTFS)

%.ttf: fonts/%.sfdir
	fontforge -lang=ff -c 'Open($$1); Generate($$2)' $^ $@

dump-dictionary:
	@cue export --out $(EXPORT_FORMAT) -p shenikan

print-%:
	@echo '$* = $($*)'

clean:
	-rm $(FONT_TTFS)

shenikan.go (new file, 6 lines)
@@ -0,0 +1,6 @@
package shenikandata

import "embed"

//go:embed *.cue
var Cues embed.FS

tools/esh.py (new executable file, 199 lines)
@@ -0,0 +1,199 @@
#!/usr/bin/env python3

"""
This loosely mimics the behaviour of asking ChatGPT the following:

Can you help me find words in your embedding space? I want to give you a basic
arithmetic expression involving words to find relationships between words in
your embedding model. For example, king minus man plus woman should probably
be something like queen. Please give me 10 options each time. Are you ready?
"""

import cmd
import re
import os

from gensim import downloader
from thefuzz import process

EMBEDDING_TOKENS = [
    ('NUMBER', r'\d+(\.\d*)?'),  # an integer or decimal number
    ('WORD', r'\w+'),            # a word
    ('PAREN', r'[()]'),          # a parenthesis
    ('OP', r'[+\-*/~]'),         # an arithmetic operator
    ('COMMA', r','),             # a comma
    ('WS', r'\s+'),              # whitespace
    ('ERROR', r'.'),             # anything else
]
EMBEDDING_TOKENIZATION_RE = re.compile('|'.join(
    f'(?P<{x[0]}>{x[1]})' for x in EMBEDDING_TOKENS
))


def tokenize_embedding_expr(expr):
    """ Generates (token_kind, token) for each token in expr. """
    for mo in EMBEDDING_TOKENIZATION_RE.finditer(expr):
        yield (mo.lastgroup, mo.group())


def token_precedence(token):
    """
    Returns the precedence of the token.

    Negative precedences are right-associative.
    """
    if token in {'+', '-', '~'}:
        return 1

    if token in {'*', '/'}:
        return 2

    return 0


def _goes_first(a, b):
    """ Returns True if pending operator a should apply before incoming b. """
    ap = token_precedence(a)
    bp = token_precedence(b)
    aap = abs(ap)
    abp = abs(bp)

    if aap > abp:
        return True

    # equal precedence: pop the pending operator unless b is right-associative
    if aap == abp and bp > 0:
        return True

    return False


def shunt_embedding_tokens(tokens):
    """
    Tokens are (kind, value) where kind is:

    w - word to be looked up in model and converted to embedding vector
    s - scalar value
    o - operator
    """
    stack = []  # operator stack, just the op itself!

    for (kind, tok) in tokens:
        if kind == 'WORD':
            yield ('w', tok)

        elif kind == 'NUMBER':
            yield ('s', tok)

        elif kind == 'OP':
            while stack and stack[-1] != '(' and _goes_first(stack[-1], tok):
                yield ('o', stack.pop())
            stack.append(tok)

        elif kind == 'PAREN':
            if tok == '(':
                stack.append(tok)
            else:
                while stack and stack[-1] != '(':
                    yield ('o', stack.pop())

                if stack:
                    stack.pop()  # remove the '('

    while stack:
        yield ('o', stack.pop())


def evaluate_embedding_shunt(shunt, model):
    """ Evaluates shunt using model. """
    stack = []

    for (kind, x) in shunt:
        if kind == 'w':
            if x[0] == '_':
                # a leading underscore negates the word's vector
                if x[1:] in model:
                    stack.append(-model[x[1:]])
                else:
                    most_similar = process.extractOne(x[1:], model.key_to_index.keys())[0]
                    stack.append(-model[most_similar])
            elif x in model:
                stack.append(model[x])
            else:
                # fall back to the fuzzy-matched closest vocabulary key
                most_similar = process.extractOne(x, model.key_to_index.keys())[0]
                stack.append(model[most_similar])

        elif kind == 's':
            stack.append(float(x))

        elif kind == 'o':
            if x == '+':
                a = stack.pop()
                b = stack.pop()
                stack.append(a + b)

            elif x == '-':
                a = stack.pop()
                b = stack.pop()
                stack.append(b - a)

            elif x == '*':
                a = stack.pop()
                b = stack.pop()
                stack.append(a * b)

            elif x == '/':
                a = stack.pop()
                b = stack.pop()
                stack.append(b / a)

            elif x == '~':
                # '~' blends its two operands by averaging them
                a = stack.pop()
                b = stack.pop()
                stack.append((a + b) / 2)

    return stack[-1]


class EmbeddingShell(cmd.Cmd):
    """ Actual embedding shell wrapper. """
    intro = 'Welcome to the embedding shell. Enter words in an equation to see similar embeddings. Type :help for more information'
    prompt = '(Ʃ) '

    def __init__(self, *args, model='glove-wiki-gigaword-300', **kwargs):
        super().__init__(completekey='tab', stdin=None, stdout=None, *args, **kwargs)
        print('Loading model...', end='', flush=True)
        self._model = downloader.load(model)
        self._keys = self._model.key_to_index.keys()
        print(' DONE')

    def do_exec(self, arg):
        """ Evaluate an embedding expression and print the most similar words. """
        try:
            result = evaluate_embedding_shunt(shunt_embedding_tokens(tokenize_embedding_expr(arg)), self._model)

            for (word, sim) in self._model.most_similar(result, restrict_vocab=10000):
                (w, _) = os.get_terminal_size()
                bar = '-' * int((w - 20) * sim)
                print(f'{word:10} {bar}')
        except Exception as e:
            print('Could not evaluate expression:', e)

    def do_shunt(self, arg):
        """ Print the shunted (postfix) token stream for an expression. """
        for x in shunt_embedding_tokens(tokenize_embedding_expr(arg)):
            print(x)

    def do_quit(self, arg):
        """ Exit the embedding shell. """
        return True

    def precmd(self, line):
        """ Route ':command' lines to built-ins; treat anything else as exec. """
        if not line:
            return line
        if line[0] == ':':
            return line[1:]
        return 'exec ' + line


if __name__ == '__main__':
    EmbeddingShell().cmdloop()
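
A quick sanity check of the tokenize/shunt stages above. This is a hypothetical snippet, not part of the commit; it assumes esh.py is importable as esh (e.g. run alongside it in tools/):

    # Hypothetical usage sketch; assumes esh.py is on the import path as `esh`.
    from esh import tokenize_embedding_expr, shunt_embedding_tokens

    expr = 'king - man + woman'

    # The tokenizer interleaves WS tokens, which the shunter simply ignores:
    # ('WORD', 'king'), ('WS', ' '), ('OP', '-'), ('WORD', 'man'), ...
    print(list(tokenize_embedding_expr(expr)))

    # The shunted stream is postfix: '-' pops 'man' and 'king' (computing
    # king - man), then '+' adds 'woman'.
    print(list(shunt_embedding_tokens(tokenize_embedding_expr(expr))))
    # [('w', 'king'), ('w', 'man'), ('o', '-'), ('w', 'woman'), ('o', '+')]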

tools/listencoding.sh (new executable file, 13 lines)
@@ -0,0 +1,13 @@
#!/bin/bash

shopt -s extglob

: "${OFFSET:=0}"

codepoint="$((0xF3A00 + OFFSET * 256))"
while read -r glyph_json; do
	ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"

	printf '%s\t=\t%x\n' "$ascii_ortho" "$codepoint"
	codepoint="$((codepoint + 1))"
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')
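
The jq pipeline above transliterates each glyph's ortho to ASCII: θ becomes th, ∫ becomes sh, the remainder is percent-encoded with @uri, and % is swapped for q. A rough Python equivalent, for illustration only (urllib.parse.quote approximates jq's @uri; the exact escape sets may differ):

    # Rough sketch of the ascii_ortho mapping used by listencoding.sh.
    from urllib.parse import quote

    def ascii_ortho(ortho: str) -> str:
        out = ortho.replace('θ', 'th').replace('∫', 'sh')
        return quote(out, safe='').replace('%', 'q')

    print(ascii_ortho('θa'))  # tha
    print(ascii_ortho('∫i'))  # shi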

tools/mkemptyfont.sh (new executable file, 107 lines)
@@ -0,0 +1,107 @@
#!/bin/bash

shopt -s extglob

fontdir="$1.sfdir"

DEFAULT_GLYPH_WIDTH=555

read -r -p "Font name:" fontname
read -r -p "Full name:" fullname
read -r -p "Unicode page offset (/256, 0 is 0xF3Axx):" offset

mkdir "$fontdir"

cat <<EOF > "$fontdir/font.props"
SplineFontDB: 3.2
FontName: $fontname
FullName: $fullname
Weight: Book
Copyright: Copyright (C) $(date +%Y), $(id -un)
UComments: "$(date +%Y-%m-%d): Created with mkemptyfont.sh"
Version: 001.000
ItalicAngle: 0
UnderlinePosition: -150
UnderlineWidth: 50
Ascent: 800
Descent: 200
sfntRevision: 0x00010000
LayerCount: 2
Layer: 0 0 "Back" 1
Layer: 1 0 "Fore" 0
DisplaySize: -48
AntiAlias: 1
FitToEm: 0
Encoding: Custom
CreationTime: $(date +%s)
ModificationTime: $(date +%s)
DEI: 91125
Lookup: 4 0 1 "'liga' Standard Ligatures in Latin lookup 0" { "'liga' Standard Ligatures in Latin lookup 0-1" } ['liga' ('DFLT' <'dflt' > 'latn' <'dflt' > ) ]
EOF

ord() {
	LC_CTYPE=C printf '%d' "'$1"
}

codepoint="$((0xF3A00 + offset * 256))"
encodingidx=0

# generate the 'raw' replacement glyphs
while read -r glyph_json; do
	while read -r liga_code_point; do
		glyph_file="$fontdir/raw$liga_code_point.glyph"
		if [ ! -e "$glyph_file" ]; then
			echo -n "Generating raw$liga_code_point..."

			cat <<EOF > "$glyph_file"
StartChar: raw$liga_code_point
Encoding: $encodingidx $liga_code_point $encodingidx
Width: $DEFAULT_GLYPH_WIDTH
LayerCount: 2
Comment: "Raw glyph for ligature replacement"
Colour: ff0000
EndChar
EOF
			encodingidx=$((encodingidx + 1))

			echo ' DONE'
		fi
	done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')

# generate the 'real' ligature glyphs
while read -r glyph_json; do
	liga=( )
	while read -r liga_code_point; do
		liga+=( "raw$liga_code_point" )
	done < <(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")

	ortho="$(jq -r '.ortho' <<<"$glyph_json")"
	ascii_ortho="$(jq -r '.ortho | gsub("θ"; "th") | gsub("∫"; "sh") | @uri | gsub("%"; "q")' <<<"$glyph_json")"
	echo -n "Generating $ortho..."

	glyph_file="$fontdir/sh$ascii_ortho.glyph"
	cat <<EOF > "$glyph_file"
StartChar: sh$ascii_ortho
Encoding: $encodingidx $codepoint $encodingidx
Width: $DEFAULT_GLYPH_WIDTH
LayerCount: 2
Comment: "Shenikan $ascii_ortho glyph"
Ligature2: "'liga' Standard Ligatures in Latin lookup 0-1" ${liga[*]}
EOF

	if [ "$(jq '.ortho | length' <<<"$glyph_json")" -gt 1 ]; then
		liga=( )
		while read -r liga_code_point; do
			liga+=( "raw$liga_code_point" )
		done < <(jq -r '.ortho | explode | reverse | implode | gsub("θ"; "th") | gsub("∫"; "sh") | explode[]' <<<"$glyph_json")

		echo "Ligature2: \"'liga' Standard Ligatures in Latin lookup 0-1\" ${liga[*]}" >> "$glyph_file"
	fi

	echo 'EndChar' >> "$glyph_file"
	encodingidx=$((encodingidx + 1))
	codepoint=$((codepoint + 1))

	echo ' DONE'
done < <(cue export -p shenikan | jq -c '.dictionary.glyphs[]')
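
Both this script and listencoding.sh assume the same layout in the Unicode private use area: the page offset selects a 256-codepoint block starting at U+F3A00 (inside Plane 15, Private Use Area-A), and glyphs are numbered sequentially within the block. A tiny sketch of that arithmetic:

    # Sketch of the codepoint layout shared by listencoding.sh and
    # mkemptyfont.sh: glyph n on page `offset` sits at 0xF3A00 + offset*256 + n.
    def shenikan_codepoint(offset: int, n: int) -> int:
        return 0xF3A00 + offset * 256 + n

    assert shenikan_codepoint(0, 0) == 0xF3A00
    print(f'{shenikan_codepoint(1, 5):x}')  # f3b05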