Diffstat (limited to 'docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc')
 docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc | 57 +++++++++++++
 1 file changed, 57 insertions(+), 0 deletions(-)
diff --git a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc
new file mode 100644
index 0000000..23e6bc5
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc
@@ -0,0 +1,57 @@
+[[analysis-ngram-tokenizer]]
+=== NGram Tokenizer
+
+A tokenizer of type `nGram`.
+
+The following settings can be set for a `nGram` tokenizer type:
+
+[cols="<,<,<",options="header",]
+|=======================================================================
+|Setting |Description |Default value
+|`min_gram` |Minimum size in codepoints of a single n-gram |`1`
+
+|`max_gram` |Maximum size in codepoints of a single n-gram |`2`
+
+|`token_chars` |Character classes to keep in the
+tokens. Elasticsearch will split on characters that don't belong to any
+of these classes. |`[]` (Keep all characters)
+|=======================================================================
+
+`token_chars` accepts the following character classes:
+
+[horizontal]
+`letter`:: for example `a`, `b`, `ï` or `京`
+`digit`:: for example `3` or `7`
+`whitespace`:: for example `" "` or `"\n"`
+`punctuation`:: for example `!` or `"`
+`symbol`:: for example `$` or `√`
+
+[float]
+==== Example
+
+[source,js]
+--------------------------------------------------
+    curl -XPUT 'localhost:9200/test' -d '
+    {
+        "settings" : {
+            "analysis" : {
+                "analyzer" : {
+                    "my_ngram_analyzer" : {
+                        "tokenizer" : "my_ngram_tokenizer"
+                    }
+                },
+                "tokenizer" : {
+                    "my_ngram_tokenizer" : {
+                        "type" : "nGram",
+                        "min_gram" : "2",
+                        "max_gram" : "3",
+                        "token_chars": [ "letter", "digit" ]
+                    }
+                }
+            }
+        }
+    }'
+
+    curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_ngram_analyzer' -d 'FC Schalke 04'
+    # FC, Sc, Sch, ch, cha, ha, hal, al, alk, lk, lke, ke, 04
--------------------------------------------------
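To make the splitting and gram-emission behavior concrete, here is a small self-contained Python sketch of the same logic. It is illustrative only, not Elasticsearch code: the function name `ngram_tokenize` is made up, and the mapping of the `token_chars` classes onto Unicode categories is an approximation of what the tokenizer does internally. The idea is exactly what the table above describes: split the input on characters outside the kept classes, then emit every substring between `min_gram` and `max_gram` codepoints long from each resulting token.

[source,python]
--------------------------------------------------
import unicodedata

def ngram_tokenize(text, min_gram, max_gram, token_chars=()):
    """Illustrative mimic of the nGram tokenizer: split on characters
    outside the kept classes, then emit all n-grams of each token."""
    def kept(ch):
        if not token_chars:          # [] means keep all characters
            return True
        cat = unicodedata.category(ch)
        # Rough stand-ins for the Elasticsearch character classes.
        checks = {
            "letter": cat.startswith("L"),
            "digit": cat.startswith("N"),
            "whitespace": ch.isspace(),
            "punctuation": cat.startswith("P"),
            "symbol": cat.startswith("S"),
        }
        return any(checks[c] for c in token_chars)

    # Split the input into runs of kept characters.
    tokens, run = [], ""
    for ch in text:
        if kept(ch):
            run += ch
        elif run:
            tokens.append(run)
            run = ""
    if run:
        tokens.append(run)

    # Emit every gram of min_gram..max_gram codepoints from each token.
    grams = []
    for tok in tokens:
        for start in range(len(tok)):
            for size in range(min_gram, max_gram + 1):
                if start + size <= len(tok):
                    grams.append(tok[start:start + size])
    return grams

print(ngram_tokenize("FC Schalke 04", 2, 3, ["letter", "digit"]))
# ['FC', 'Sc', 'Sch', 'ch', 'cha', 'ha', 'hal', 'al', 'alk',
#  'lk', 'lke', 'ke', '04']
--------------------------------------------------

The output matches the comment under the second `curl` in the example. Note that the spaces are split on because `whitespace` is not in `token_chars`, and `FC` and `04` survive only as whole 2-grams since a 3-gram would exceed their length.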