author:    Hilko Bengen <bengen@debian.org>  2014-06-07 12:02:12 +0200
committer: Hilko Bengen <bengen@debian.org>  2014-06-07 12:02:12 +0200
commit:    d5ed89b946297270ec28abf44bef2371a06f1f4f
tree:      ce2d945e4dde69af90bd9905a70d8d27f4936776

    Imported Upstream version 1.0.3 (tag: upstream/1.0.3)
Diffstat (limited to 'docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc'):

 docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+), 0 deletions(-)
diff --git a/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc
new file mode 100644
index 0000000..c8b405b
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc
@@ -0,0 +1,18 @@
+[[analysis-standard-tokenizer]]
+=== Standard Tokenizer
+
+A tokenizer of type `standard` provides grammar-based tokenization that
+works well for most European-language documents. The tokenizer
+implements the Unicode Text Segmentation algorithm, as specified in
+http://unicode.org/reports/tr29/[Unicode Standard Annex #29].
+
+The following settings can be configured for a `standard` tokenizer
+type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`max_token_length` |The maximum token length. Any token that exceeds
+this length is discarded. Defaults to `255`.
+|=======================================================================
+
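To illustrate the `max_token_length` setting the new page documents, a hypothetical index-settings fragment is sketched below. The tokenizer name `my_std_tokenizer`, the analyzer name `my_std_analyzer`, and the chosen length `10` are illustrative, not taken from the commit; only the `type` and `max_token_length` keys come from the documented setting.

```json
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "my_std_tokenizer": {
          "type": "standard",
          "max_token_length": 10
        }
      },
      "analyzer": {
        "my_std_analyzer": {
          "type": "custom",
          "tokenizer": "my_std_tokenizer"
        }
      }
    }
  }
}
```

Such a fragment would typically be supplied as the request body when creating an index, after which `my_std_analyzer` could be referenced from a field mapping.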