diff --git a/README.md b/README.md index 905c432..6b7c064 100644 --- a/README.md +++ b/README.md @@ -206,7 +206,7 @@ server { ## Enable the spell checker -This is an experimental feature. The process of building the dictionaries takes a lot of disk space. +The process of building the dictionaries takes a lot of disk space and requires a large amount of memory (around 1GB). Install `wget` and `aspell` (to have the `word-list-compress` command), and run: @@ -215,4 +215,65 @@ cd dict-dists-extractor make ``` -Then, set `ENABLE_CHECK_SPELLING` to `true` in `public/config.js`. +If you want to limit spell checking to a subset of languages, you can define the list with the `LANGS` variable when running the `make` command, for example: + +```sh +cd dict-dists-extractor +LANGS="de en fr" make +``` + +This way, you only build the English, French and German dictionaries. + +To deactivate the spell checker at system level, you need to empty the dictionary list contained in the file `public/dict/list.js`. The following command can be used: + +```sh +echo > public/dict/list.js +``` + +## Technical appendices + +### Board extractor + +Wikipedia provides tile distributions (content of the bag and points) for various languages. Board languages are built from a Wikipedia page. + +#### General parsing + +Most board languages are extracted from the French version of the Wikipedia page, which has a simple document structure. Parsing is done by a JS script named `make_board.js`: +- each new language board is preceded by a level 2 header containing the language name, +- tile points are followed by `point(s)`, +- the tile letter and number of tiles are easily extracted by a regex which looks like `(letter) x(times)` + +#### Special languages + +Some languages are only defined in the English version, so we copy them into separate text files and parsing is done by an AWK script named `make_board.awk`. + +### Dictionary building process + +The spell checker in Trivabble is based on Aspell dictionaries. 
However, as Trivabble only uses a subset of letters (most accented characters have to be replaced by standard ones), we can't use the Aspell engine directly; we need to build lists of words that only use playable characters. + +#### Overall concept + +In order to build dictionaries for various languages, we first need to retrieve the Aspell dictionary list. Then we retrieve the latest versions of the dictionaries for every language and we build them. Thanks to two Aspell commands, we build the full list of words. Based on the distribution of tiles, we replace every derived (most of the time accented) character with the appropriate one, and finally we remove duplicate entries. + +#### Details on Makefile rules + +Rule '$(OBJ_DIR)/src.mk' retrieves the latest version of the dictionary list from the Aspell site, extracts the dictionary archive URLs and builds a sub-makefile with rules to retrieve every needed dictionary. + +Rule '%.dir' unpacks dictionary archives and builds the Aspell dictionary. The resulting directory is renamed to LANG.dir. + +Rule '%.low' expands an Aspell dictionary into a list of words (low means "List Of Words"). Phrases are split into words. Finally, the list is sorted and cleaned of duplicate words. + +Rule '$(ROOT_DICT)/%.dict': +- excludes words with numbers, dashes, apostrophes and other forbidden signs, +- translates accented characters into standard ones, +- upper-cases all words, +- and removes duplicate words. +This file can be used by Trivabble for spell checking. + +The last rule 'check-%' checks that the file %.dict contains only words made of characters from the language's tile bag. + +#### Remarks + +Some lists of words are too large (for example, Hungarian and Turkish are agglutinative languages), so Trivabble will not be able to check spelling for those languages. + +Some languages don't have any Aspell dictionary. 
diff --git a/dict-dists-extractor/Makefile b/dict-dists-extractor/Makefile index c494283..0dbe3f9 100644 --- a/dict-dists-extractor/Makefile +++ b/dict-dists-extractor/Makefile @@ -9,7 +9,7 @@ SMALL = br ca cy da de en es fr ga hr hy is lv nl no pt sv LARGE = bg el eo it pl ro ru sk sl uk # 50MB < dictionary size < 500MB VERY_LARGE = ar cs et fi -LANGS = $(SMALL) $(LARGE) +LANGS ?= $(SMALL) $(LARGE) DICT_RULE_PRELUDE = sortuniq() { cat > "$$1"; sort -u -o "$$1" "$$1"; }; \ @@ -83,8 +83,9 @@ include $(wildcard ${OBJ_DIR}/src.mk) export DICT_NAME="$$(basename "$<" ".dir")"; \ aspell --dict-dir="$$(realpath $<)" -d "$$DICT_NAME" dump master "$$DICT_NAME" | \ aspell --dict-dir="$$(realpath $<)" -l "$$DICT_NAME" expand | \ - tr -s '[:space:]' '\n' > "$@" && \ - LC_ALL=C sort -S28G -u -o "$@" "$@" + tr -s '[:space:]' '\n' > "$@~" && \ + LC_ALL=C sort -S28G -u -o "$@~" "$@~" && \ + mv "$@~" "$@" $(OBJ_DIR)/no.low: make $(OBJ_DIR)/nb.low diff --git a/l10n/po/fr/trivabble.po b/l10n/po/fr/trivabble.po index 4050bb2..33cda7f 100644 --- a/l10n/po/fr/trivabble.po +++ b/l10n/po/fr/trivabble.po @@ -228,3 +228,6 @@ msgstr "Astuce suivante" msgid "Disable the spell checker" msgstr "Désactiver la vérification orthographique" + +msgid "Spell checking is not available for this language." +msgstr "La vérification orthographique n'est pas disponible dans cette langue." diff --git a/l10n/pot/trivabble.pot b/l10n/pot/trivabble.pot index 9fb9dc9..87365a5 100644 --- a/l10n/pot/trivabble.pot +++ b/l10n/pot/trivabble.pot @@ -232,4 +232,7 @@ msgid "Next tip" msgstr "" msgid "Disable the spell checker" -msgstr "" \ No newline at end of file +msgstr "" + +msgid "Spell checking is not available for this language." +msgstr "" diff --git a/public/index.html b/public/index.html index c56bccf..27b5c76 100644 --- a/public/index.html +++ b/public/index.html @@ -66,6 +66,9 @@ Disable the spell checker

+

+ Spell checking is not available for this language. +

diff --git a/public/trivabble.js b/public/trivabble.js index 30dce5d..3249d8c 100644 --- a/public/trivabble.js +++ b/public/trivabble.js @@ -252,6 +252,7 @@ const code = document.getElementById("board-lang").value; const availableLang = Object.prototype.hasOwnProperty.call(DictionaryList, code); document.getElementById("disable-spell-checker-p").hidden = !availableLang; + document.getElementById("no-spell-checker-p").hidden = availableLang; if (availableLang && !document.getElementById("disable-spell-checker").checked) { document.getElementById("check-spelling").hidden = false;