Diffstat (limited to 'docs/reference/analysis')
-rw-r--r--  docs/reference/analysis/analyzers.asciidoc | 71
-rw-r--r--  docs/reference/analysis/analyzers/custom-analyzer.asciidoc | 52
-rw-r--r--  docs/reference/analysis/analyzers/keyword-analyzer.asciidoc | 7
-rw-r--r--  docs/reference/analysis/analyzers/lang-analyzer.asciidoc | 21
-rw-r--r--  docs/reference/analysis/analyzers/pattern-analyzer.asciidoc | 130
-rw-r--r--  docs/reference/analysis/analyzers/simple-analyzer.asciidoc | 6
-rw-r--r--  docs/reference/analysis/analyzers/snowball-analyzer.asciidoc | 64
-rw-r--r--  docs/reference/analysis/analyzers/standard-analyzer.asciidoc | 27
-rw-r--r--  docs/reference/analysis/analyzers/stop-analyzer.asciidoc | 22
-rw-r--r--  docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc | 6
-rw-r--r--  docs/reference/analysis/charfilters.asciidoc | 16
-rw-r--r--  docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc | 5
-rw-r--r--  docs/reference/analysis/charfilters/mapping-charfilter.asciidoc | 38
-rw-r--r--  docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc | 37
-rw-r--r--  docs/reference/analysis/icu-plugin.asciidoc | 220
-rw-r--r--  docs/reference/analysis/tokenfilters.asciidoc | 74
-rw-r--r--  docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc | 7
-rw-r--r--  docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc | 61
-rw-r--r--  docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc | 48
-rw-r--r--  docs/reference/analysis/tokenfilters/delimited-payload-tokenfilter.asciidoc | 16
-rw-r--r--  docs/reference/analysis/tokenfilters/edgengram-tokenfilter.asciidoc | 16
-rw-r--r--  docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc | 28
-rw-r--r--  docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc | 115
-rw-r--r--  docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc | 49
-rw-r--r--  docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc | 34
-rw-r--r--  docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc | 26
-rw-r--r--  docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc | 6
-rw-r--r--  docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc | 16
-rw-r--r--  docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc | 32
-rw-r--r--  docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc | 37
-rw-r--r--  docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc | 15
-rw-r--r--  docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc | 15
-rw-r--r--  docs/reference/analysis/tokenfilters/pattern-capture-tokenfilter.asciidoc | 134
-rw-r--r--  docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc | 9
-rw-r--r--  docs/reference/analysis/tokenfilters/phonetic-tokenfilter.asciidoc | 5
-rw-r--r--  docs/reference/analysis/tokenfilters/porterstem-tokenfilter.asciidoc | 15
-rw-r--r--  docs/reference/analysis/tokenfilters/reverse-tokenfilter.asciidoc | 4
-rw-r--r--  docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc | 36
-rw-r--r--  docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc | 33
-rw-r--r--  docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc | 7
-rw-r--r--  docs/reference/analysis/tokenfilters/stemmer-override-tokenfilter.asciidoc | 34
-rw-r--r--  docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc | 78
-rw-r--r--  docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc | 35
-rw-r--r--  docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc | 123
-rw-r--r--  docs/reference/analysis/tokenfilters/trim-tokenfilter.asciidoc | 4
-rw-r--r--  docs/reference/analysis/tokenfilters/truncate-tokenfilter.asciidoc | 10
-rw-r--r--  docs/reference/analysis/tokenfilters/unique-tokenfilter.asciidoc | 7
-rw-r--r--  docs/reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc | 80
-rw-r--r--  docs/reference/analysis/tokenizers.asciidoc | 30
-rw-r--r--  docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc | 80
-rw-r--r--  docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc | 15
-rw-r--r--  docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc | 7
-rw-r--r--  docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc | 15
-rw-r--r--  docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc | 57
-rw-r--r--  docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc | 32
-rw-r--r--  docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc | 29
-rw-r--r--  docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc | 18
-rw-r--r--  docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc | 16
-rw-r--r--  docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc | 4
59 files changed, 2234 insertions, 0 deletions
diff --git a/docs/reference/analysis/analyzers.asciidoc b/docs/reference/analysis/analyzers.asciidoc
new file mode 100644
index 0000000..b97231b
--- /dev/null
+++ b/docs/reference/analysis/analyzers.asciidoc
@@ -0,0 +1,71 @@
+[[analysis-analyzers]]
+== Analyzers
+
+Analyzers are composed of a single <<analysis-tokenizers,Tokenizer>>
+and zero or more <<analysis-tokenfilters,TokenFilters>>. The tokenizer may
+be preceded by one or more <<analysis-charfilters,CharFilters>>.
+The analysis module allows you to register `Analyzers` under logical
+names which can then be referenced either in mapping definitions or in
+certain APIs.
+
+Elasticsearch comes with a number of prebuilt analyzers which are
+ready to use. Alternatively, you can combine the built in
+character filters, tokenizers and token filters to create
+<<analysis-custom-analyzer,custom analyzers>>.
+
+[float]
+[[default-analyzers]]
+=== Default Analyzers
+
+An analyzer is registered under a logical name and can then be
+referenced from mapping definitions or certain APIs. When no analyzer
+is defined, the built-in defaults are used. You can also configure which
+analyzers will be used by default when none can be derived.
+
+The `default` logical name allows one to configure an analyzer that will
+be used both for indexing and for searching APIs. The `default_index`
+logical name can be used to configure a default analyzer that will be
+used just when indexing, and the `default_search` can be used to
+configure a default analyzer that will be used just when searching.
+
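+For example, a minimal sketch along these lines (the analyzer types chosen
+here are purely illustrative) sets one default analyzer for indexing and a
+different one for searching:
+
+[source,js]
+--------------------------------------------------
+index :
+    analysis :
+        analyzer :
+            default_index :
+                type : standard
+            default_search :
+                type : simple
+--------------------------------------------------
+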
+[float]
+[[aliasing-analyzers]]
+=== Aliasing Analyzers
+
+Analyzers can be aliased to have several registered lookup names
+associated with them. For example, the following will allow
+the `standard` analyzer to also be referenced with `alias1`
+and `alias2` values.
+
+
+[source,js]
+--------------------------------------------------
+index :
+ analysis :
+ analyzer :
+ standard :
+ alias: [alias1, alias2]
+ type : standard
+ stopwords : [test1, test2, test3]
+--------------------------------------------------
+
+Below is a list of the built in analyzers.
+
+include::analyzers/standard-analyzer.asciidoc[]
+
+include::analyzers/simple-analyzer.asciidoc[]
+
+include::analyzers/whitespace-analyzer.asciidoc[]
+
+include::analyzers/stop-analyzer.asciidoc[]
+
+include::analyzers/keyword-analyzer.asciidoc[]
+
+include::analyzers/pattern-analyzer.asciidoc[]
+
+include::analyzers/lang-analyzer.asciidoc[]
+
+include::analyzers/snowball-analyzer.asciidoc[]
+
+include::analyzers/custom-analyzer.asciidoc[]
+
diff --git a/docs/reference/analysis/analyzers/custom-analyzer.asciidoc b/docs/reference/analysis/analyzers/custom-analyzer.asciidoc
new file mode 100644
index 0000000..5c778a6
--- /dev/null
+++ b/docs/reference/analysis/analyzers/custom-analyzer.asciidoc
@@ -0,0 +1,52 @@
+[[analysis-custom-analyzer]]
+=== Custom Analyzer
+
+An analyzer of type `custom` allows you to combine a `Tokenizer` with
+zero or more `Token Filters` and zero or more `Char Filters`. The
+custom analyzer accepts the logical/registered name of the tokenizer to
+use, and lists of logical/registered names of token filters and char filters.
+
+The following are settings that can be set for a `custom` analyzer type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`tokenizer` |The logical / registered name of the tokenizer to use.
+
+|`filter` |An optional list of logical / registered names of token
+filters.
+
+|`char_filter` |An optional list of logical / registered names of char
+filters.
+|=======================================================================
+
+Here is an example:
+
+[source,js]
+--------------------------------------------------
+index :
+ analysis :
+ analyzer :
+ myAnalyzer2 :
+ type : custom
+ tokenizer : myTokenizer1
+ filter : [myTokenFilter1, myTokenFilter2]
+ char_filter : [my_html]
+ tokenizer :
+ myTokenizer1 :
+ type : standard
+ max_token_length : 900
+ filter :
+ myTokenFilter1 :
+ type : stop
+ stopwords : [stop1, stop2, stop3, stop4]
+ myTokenFilter2 :
+ type : length
+ min : 0
+ max : 2000
+ char_filter :
+ my_html :
+ type : html_strip
+ escaped_tags : [xxx, yyy]
+ read_ahead : 1024
+--------------------------------------------------
diff --git a/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc b/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc
new file mode 100644
index 0000000..7704895
--- /dev/null
+++ b/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc
@@ -0,0 +1,7 @@
+[[analysis-keyword-analyzer]]
+=== Keyword Analyzer
+
+An analyzer of type `keyword` that "tokenizes" an entire stream as a
+single token. This is useful for data like zip codes, ids and so on.
+Note, when using mapping definitions, it might make more sense to simply
+mark the field as `not_analyzed`.
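+
+For instance, a mapping sketch along these lines (the field name is purely
+illustrative) marks a field as `not_analyzed` instead of relying on the
+`keyword` analyzer:
+
+[source,js]
+--------------------------------------------------
+{
+    "properties" : {
+        "zip_code" : { "type" : "string", "index" : "not_analyzed" }
+    }
+}
+--------------------------------------------------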
diff --git a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc
new file mode 100644
index 0000000..f963e4b
--- /dev/null
+++ b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc
@@ -0,0 +1,21 @@
+[[analysis-lang-analyzer]]
+=== Language Analyzers
+
+A set of analyzers aimed at analyzing specific language text. The
+following types are supported: `arabic`, `armenian`, `basque`,
+`brazilian`, `bulgarian`, `catalan`, `chinese`, `cjk`, `czech`,
+`danish`, `dutch`, `english`, `finnish`, `french`, `galician`, `german`,
+`greek`, `hindi`, `hungarian`, `indonesian`, `italian`, `norwegian`,
+`persian`, `portuguese`, `romanian`, `russian`, `spanish`, `swedish`,
+`turkish`, `thai`.
+
+All analyzers support setting custom `stopwords` either internally in
+the config, or by using an external stopwords file by setting
+`stopwords_path`. Check <<analysis-stop-analyzer,Stop Analyzer>> for
+more details.
+
+The following analyzers support setting custom `stem_exclusion` list:
+`arabic`, `armenian`, `basque`, `brazilian`, `bulgarian`, `catalan`,
+`czech`, `danish`, `dutch`, `english`, `finnish`, `french`, `galician`,
+`german`, `hindi`, `hungarian`, `indonesian`, `italian`, `norwegian`,
+`portuguese`, `romanian`, `russian`, `spanish`, `swedish`, `turkish`.
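+
+As a sketch (the analyzer name and word lists are purely illustrative), an
+`english` analyzer with custom `stopwords` and a `stem_exclusion` list could
+be configured like this:
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_english" : {
+                    "type" : "english",
+                    "stopwords" : ["a", "an", "the"],
+                    "stem_exclusion" : ["skies", "running"]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------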
diff --git a/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc
new file mode 100644
index 0000000..424ecfa
--- /dev/null
+++ b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc
@@ -0,0 +1,130 @@
+[[analysis-pattern-analyzer]]
+=== Pattern Analyzer
+
+An analyzer of type `pattern` that can flexibly separate text into terms
+via a regular expression.
+
+The following are settings that can be set for a `pattern` analyzer
+type:
+
+[cols="<,<",options="header",]
+|===================================================================
+|Setting |Description
+|`lowercase` |Should terms be lowercased or not. Defaults to `true`.
+|`pattern` |The regular expression pattern, defaults to `\W+`.
+|`flags` |The regular expression flags.
+|`stopwords` |A list of stopwords to initialize the stop filter with.
+Defaults to an 'empty' stopword list added[1.0.0.RC1, Previously
+defaulted to the English stopwords list]. Check
+<<analysis-stop-analyzer,Stop Analyzer>> for more details.
+|===================================================================
+
+*IMPORTANT*: The regular expression should match the *token separators*,
+not the tokens themselves.
+
+Flags should be pipe-separated, eg `"CASE_INSENSITIVE|COMMENTS"`. Check
+http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html#field_summary[Java
+Pattern API] for more details about `flags` options.
+
+[float]
+==== Pattern Analyzer Examples
+
+In order to try out these examples, you should delete the `test` index
+before running each example:
+
+[source,js]
+--------------------------------------------------
+ curl -XDELETE localhost:9200/test
+--------------------------------------------------
+
+[float]
+===== Whitespace tokenizer
+
+[source,js]
+--------------------------------------------------
+ curl -XPUT 'localhost:9200/test' -d '
+ {
+ "settings":{
+ "analysis": {
+ "analyzer": {
+ "whitespace":{
+ "type": "pattern",
+ "pattern":"\\s+"
+ }
+ }
+ }
+ }
+ }'
+
+ curl 'localhost:9200/test/_analyze?pretty=1&analyzer=whitespace' -d 'foo,bar baz'
+ # "foo,bar", "baz"
+--------------------------------------------------
+
+[float]
+===== Non-word character tokenizer
+
+[source,js]
+--------------------------------------------------
+
+ curl -XPUT 'localhost:9200/test' -d '
+ {
+ "settings":{
+ "analysis": {
+ "analyzer": {
+ "nonword":{
+ "type": "pattern",
+ "pattern":"[^\\w]+"
+ }
+ }
+ }
+ }
+ }'
+
+ curl 'localhost:9200/test/_analyze?pretty=1&analyzer=nonword' -d 'foo,bar baz'
+ # "foo,bar baz" becomes "foo", "bar", "baz"
+
+ curl 'localhost:9200/test/_analyze?pretty=1&analyzer=nonword' -d 'type_1-type_4'
+ # "type_1","type_4"
+--------------------------------------------------
+
+[float]
+===== CamelCase tokenizer
+
+[source,js]
+--------------------------------------------------
+
+ curl -XPUT 'localhost:9200/test?pretty=1' -d '
+ {
+ "settings":{
+ "analysis": {
+ "analyzer": {
+ "camel":{
+ "type": "pattern",
+ "pattern":"([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
+ }
+ }
+ }
+ }
+ }'
+
+ curl 'localhost:9200/test/_analyze?pretty=1&analyzer=camel' -d '
+ MooseX::FTPClass2_beta
+ '
+ # "moose","x","ftp","class","2","beta"
+--------------------------------------------------
+
+The regex above is easier to understand as:
+
+[source,js]
+--------------------------------------------------
+
+ ([^\\p{L}\\d]+) # swallow non letters and numbers,
+ | (?<=\\D)(?=\\d) # or non-number followed by number,
+ | (?<=\\d)(?=\\D) # or number followed by non-number,
+ | (?<=[ \\p{L} && [^\\p{Lu}]]) # or lower case
+ (?=\\p{Lu}) # followed by upper case,
+ | (?<=\\p{Lu}) # or upper case
+ (?=\\p{Lu} # followed by upper case
+ [\\p{L}&&[^\\p{Lu}]] # then lower case
+ )
+--------------------------------------------------
diff --git a/docs/reference/analysis/analyzers/simple-analyzer.asciidoc b/docs/reference/analysis/analyzers/simple-analyzer.asciidoc
new file mode 100644
index 0000000..9d7a7c3
--- /dev/null
+++ b/docs/reference/analysis/analyzers/simple-analyzer.asciidoc
@@ -0,0 +1,6 @@
+[[analysis-simple-analyzer]]
+=== Simple Analyzer
+
+An analyzer of type `simple` that is built using a
+<<analysis-lowercase-tokenizer,Lower
+Case Tokenizer>>.
diff --git a/docs/reference/analysis/analyzers/snowball-analyzer.asciidoc b/docs/reference/analysis/analyzers/snowball-analyzer.asciidoc
new file mode 100644
index 0000000..64804fc
--- /dev/null
+++ b/docs/reference/analysis/analyzers/snowball-analyzer.asciidoc
@@ -0,0 +1,64 @@
+[[analysis-snowball-analyzer]]
+=== Snowball Analyzer
+
+An analyzer of type `snowball` that uses the
+<<analysis-standard-tokenizer,standard
+tokenizer>>, with
+<<analysis-standard-tokenfilter,standard
+filter>>,
+<<analysis-lowercase-tokenfilter,lowercase
+filter>>,
+<<analysis-stop-tokenfilter,stop
+filter>>, and
+<<analysis-snowball-tokenfilter,snowball
+filter>>.
+
+The Snowball Analyzer is a stemming analyzer from Lucene, originally
+based on the snowball project from
+http://snowball.tartarus.org[snowball.tartarus.org].
+
+Sample usage:
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "my_analyzer" : {
+ "type" : "snowball",
+ "language" : "English"
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
+
+The `language` parameter can have the same values as the
+<<analysis-snowball-tokenfilter,snowball
+filter>> and defaults to `English`. Note that not all the language
+analyzers have a default set of stopwords provided.
+
+The `stopwords` parameter can be used to provide stopwords for the
+languages that have no defaults, or to simply replace the default set
+with your custom list. Check <<analysis-stop-analyzer,Stop Analyzer>>
+for more details. A default set of stopwords for many of these
+languages is available, for instance,
+https://github.com/apache/lucene-solr/tree/trunk/lucene/analysis/common/src/resources/org/apache/lucene/analysis/[here]
+and
+https://github.com/apache/lucene-solr/tree/trunk/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball[here].
+
+A sample configuration (in YAML format) specifying Swedish with
+stopwords:
+
+[source,js]
+--------------------------------------------------
+index :
+ analysis :
+ analyzer :
+ my_analyzer:
+ type: snowball
+ language: Swedish
+ stopwords: "och,det,att,i,en,jag,hon,som,han,på,den,med,var,sig,för,så,till,är,men,ett,om,hade,de,av,icke,mig,du,henne,då,sin,nu,har,inte,hans,honom,skulle,hennes,där,min,man,ej,vid,kunde,något,från,ut,när,efter,upp,vi,dem,vara,vad,över,än,dig,kan,sina,här,ha,mot,alla,under,någon,allt,mycket,sedan,ju,denna,själv,detta,åt,utan,varit,hur,ingen,mitt,ni,bli,blev,oss,din,dessa,några,deras,blir,mina,samma,vilken,er,sådan,vår,blivit,dess,inom,mellan,sådant,varför,varje,vilka,ditt,vem,vilket,sitta,sådana,vart,dina,vars,vårt,våra,ert,era,vilkas"
+--------------------------------------------------
diff --git a/docs/reference/analysis/analyzers/standard-analyzer.asciidoc b/docs/reference/analysis/analyzers/standard-analyzer.asciidoc
new file mode 100644
index 0000000..4aae94a
--- /dev/null
+++ b/docs/reference/analysis/analyzers/standard-analyzer.asciidoc
@@ -0,0 +1,27 @@
+[[analysis-standard-analyzer]]
+=== Standard Analyzer
+
+An analyzer of type `standard` is built using the
+<<analysis-standard-tokenizer,Standard
+Tokenizer>> with the
+<<analysis-standard-tokenfilter,Standard
+Token Filter>>,
+<<analysis-lowercase-tokenfilter,Lower
+Case Token Filter>>, and
+<<analysis-stop-tokenfilter,Stop
+Token Filter>>.
+
+The following are settings that can be set for a `standard` analyzer
+type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`stopwords` |A list of stopwords to initialize the stop filter with.
+Defaults to an 'empty' stopword list added[1.0.0.Beta1, Previously
+defaulted to the English stopwords list]. Check
+<<analysis-stop-analyzer,Stop Analyzer>> for more details.
+|`max_token_length` |The maximum token length. If a token is seen that
+exceeds this length then it is discarded. Defaults to `255`.
+|=======================================================================
+
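+Here is a sketch of a custom `standard` analyzer configuration (the analyzer
+name and values are purely illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_standard" : {
+                    "type" : "standard",
+                    "stopwords" : ["foo", "bar"],
+                    "max_token_length" : 100
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------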
diff --git a/docs/reference/analysis/analyzers/stop-analyzer.asciidoc b/docs/reference/analysis/analyzers/stop-analyzer.asciidoc
new file mode 100644
index 0000000..9a19772
--- /dev/null
+++ b/docs/reference/analysis/analyzers/stop-analyzer.asciidoc
@@ -0,0 +1,22 @@
+[[analysis-stop-analyzer]]
+=== Stop Analyzer
+
+An analyzer of type `stop` that is built using a
+<<analysis-lowercase-tokenizer,Lower
+Case Tokenizer>>, with
+<<analysis-stop-tokenfilter,Stop
+Token Filter>>.
+
+The following are settings that can be set for a `stop` analyzer type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`stopwords` |A list of stopwords to initialize the stop filter with.
+Defaults to the English stop words.
+|`stopwords_path` |A path (either relative to `config` location, or
+absolute) to a stopwords file configuration.
+|=======================================================================
+
+Use `stopwords: _none_` to explicitly specify an 'empty' stopword list.
+
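+Here is a sketch of a custom `stop` analyzer configuration (the analyzer name
+and stopword list are purely illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_stop" : {
+                    "type" : "stop",
+                    "stopwords" : ["and", "is", "the"]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------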
diff --git a/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc b/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc
new file mode 100644
index 0000000..2095686
--- /dev/null
+++ b/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc
@@ -0,0 +1,6 @@
+[[analysis-whitespace-analyzer]]
+=== Whitespace Analyzer
+
+An analyzer of type `whitespace` that is built using a
+<<analysis-whitespace-tokenizer,Whitespace
+Tokenizer>>.
diff --git a/docs/reference/analysis/charfilters.asciidoc b/docs/reference/analysis/charfilters.asciidoc
new file mode 100644
index 0000000..a40cfff
--- /dev/null
+++ b/docs/reference/analysis/charfilters.asciidoc
@@ -0,0 +1,16 @@
+[[analysis-charfilters]]
+== Character Filters
+
+Character filters are used to preprocess the string of
+characters before it is passed to the <<analysis-tokenizers,tokenizer>>.
+A character filter may be used to strip out HTML markup, or to convert
+`"&"` characters to the word `"and"`.
+
+Elasticsearch has built in character filters which can be
+used to build <<analysis-custom-analyzer,custom analyzers>>.
+
+include::charfilters/mapping-charfilter.asciidoc[]
+
+include::charfilters/htmlstrip-charfilter.asciidoc[]
+
+include::charfilters/pattern-replace-charfilter.asciidoc[]
diff --git a/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc b/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc
new file mode 100644
index 0000000..f12238a
--- /dev/null
+++ b/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc
@@ -0,0 +1,5 @@
+[[analysis-htmlstrip-charfilter]]
+=== HTML Strip Char Filter
+
+A char filter of type `html_strip` that strips HTML elements out of the
+analyzed text.
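+
+Here is a sketch of how it might be wired into a custom analyzer (the
+char filter and analyzer names are purely illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "char_filter" : {
+                "my_html" : {
+                    "type" : "html_strip",
+                    "escaped_tags" : ["b"]
+                }
+            },
+            "analyzer" : {
+                "html_aware" : {
+                    "tokenizer" : "standard",
+                    "char_filter" : ["my_html"]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------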
diff --git a/docs/reference/analysis/charfilters/mapping-charfilter.asciidoc b/docs/reference/analysis/charfilters/mapping-charfilter.asciidoc
new file mode 100644
index 0000000..ef4df81
--- /dev/null
+++ b/docs/reference/analysis/charfilters/mapping-charfilter.asciidoc
@@ -0,0 +1,38 @@
+[[analysis-mapping-charfilter]]
+=== Mapping Char Filter
+
+A char filter of type `mapping` that replaces characters of the analyzed
+text according to the given mapping.
+
+Here is a sample configuration:
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "char_filter" : {
+ "my_mapping" : {
+ "type" : "mapping",
+ "mappings" : ["ph=>f", "qu=>q"]
+ }
+ },
+ "analyzer" : {
+ "custom_with_char_filter" : {
+ "tokenizer" : "standard",
+ "char_filter" : ["my_mapping"]
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
+
+Alternatively, the `mappings_path` setting can point to a file containing
+the list of character mappings:
+
+[source,js]
+--------------------------------------------------
+ph => f
+qu => k
+--------------------------------------------------
diff --git a/docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc b/docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc
new file mode 100644
index 0000000..5a0cf28
--- /dev/null
+++ b/docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc
@@ -0,0 +1,37 @@
+[[analysis-pattern-replace-charfilter]]
+=== Pattern Replace Char Filter
+
+The `pattern_replace` char filter allows the use of a regex to
+manipulate the characters in a string before analysis. The regular
+expression is defined using the `pattern` parameter, and the replacement
+string can be provided using the `replacement` parameter (supporting
+referencing the original text, as explained
+http://docs.oracle.com/javase/6/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)[here]).
+For more information check the
+http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.html[Lucene
+documentation].
+
+Here is a sample configuration:
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "char_filter" : {
+ "my_pattern":{
+ "type":"pattern_replace",
+ "pattern":"sample(.*)",
+ "replacement":"replacedSample $1"
+ }
+ },
+ "analyzer" : {
+ "custom_with_char_filter" : {
+ "tokenizer" : "standard",
+ "char_filter" : ["my_pattern"]
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
diff --git a/docs/reference/analysis/icu-plugin.asciidoc b/docs/reference/analysis/icu-plugin.asciidoc
new file mode 100644
index 0000000..c1be216
--- /dev/null
+++ b/docs/reference/analysis/icu-plugin.asciidoc
@@ -0,0 +1,220 @@
+[[analysis-icu-plugin]]
+== ICU Analysis Plugin
+
+The http://icu-project.org/[ICU] analysis plugin allows for unicode
+normalization, collation and folding. The plugin is called
+https://github.com/elasticsearch/elasticsearch-analysis-icu[elasticsearch-analysis-icu].
+
+The plugin includes the following analysis components:
+
+[float]
+[[icu-normalization]]
+=== ICU Normalization
+
+Normalizes characters as explained
+http://userguide.icu-project.org/transforms/normalization[here]. It
+registers itself by default under `icu_normalizer` or `icuNormalizer`
+using the default settings. It accepts a `name` parameter, which can take
+one of the following values: `nfc`, `nfkc`, and `nfkc_cf`.
+Here is a sample configuration:
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "normalization" : {
+ "tokenizer" : "keyword",
+ "filter" : ["icu_normalizer"]
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
+
+[float]
+[[icu-folding]]
+=== ICU Folding
+
+Folding of unicode characters based on `UTR#30`. It registers itself
+under `icu_folding` and `icuFolding` names.
+The filter also does lowercasing, which means the lowercase filter can
+normally be left out. Sample setting:
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "folding" : {
+ "tokenizer" : "keyword",
+ "filter" : ["icu_folding"]
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
+
+[float]
+[[icu-filtering]]
+==== Filtering
+
+The folding can be filtered by a set of unicode characters with the
+parameter `unicodeSetFilter`. This is useful for a non-internationalized
+search engine where retaining a set of national characters which are
+primary letters in a specific language is wanted. See syntax for the
+UnicodeSet
+http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[here].
+
+The following example exempts Swedish characters from folding. Note
+that the filtered characters are NOT lowercased, which is why we add
+the lowercase filter below.
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "folding" : {
+ "tokenizer" : "standard",
+ "filter" : ["my_icu_folding", "lowercase"]
+ }
+ },
+ "filter" : {
+ "my_icu_folding" : {
+ "type" : "icu_folding",
+ "unicodeSetFilter" : "[^åäöÅÄÖ]"
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
+
+[float]
+[[icu-collation]]
+=== ICU Collation
+
+Uses the collation token filter. The rules for collation (defined
+http://www.icu-project.org/userguide/Collate_Customization.html[here])
+can be specified either with the `rules` parameter (which can point to a
+file location, relative to the config location, or contain the rules
+inline in the settings), or with the `language` parameter (further
+specialized by country and variant). By default it registers under
+`icu_collation` or `icuCollation` and uses the default locale.
+
+Here is a sample configuration:
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "collation" : {
+ "tokenizer" : "keyword",
+ "filter" : ["icu_collation"]
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
+
+And here is a sample of custom collation:
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "collation" : {
+ "tokenizer" : "keyword",
+ "filter" : ["myCollator"]
+ }
+ },
+ "filter" : {
+ "myCollator" : {
+ "type" : "icu_collation",
+ "language" : "en"
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
+
+[float]
+==== Options
+
+[horizontal]
+`strength`::
+ The strength property determines the minimum level of difference considered significant during comparison.
+ The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
+ Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
+ +
+ See http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html[ICU Collation] documentation for a more detailed
+ explanation for the specific values.
+
+`decomposition`::
+ Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with
+ `canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
+ normalized. If `no` is set, it is the user's responsibility to insure that all text is already in the appropriate form
+ before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
+ faster and more complete collation behavior. Since a great many of the world's languages do not require text
+ normalization, most locales set `no` as the default decomposition mode.
+
+[float]
+==== Expert options:
+
+[horizontal]
+`alternate`::
+ Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
+ to be either shifted or non-ignorable, which basically means ignoring punctuation and whitespace.
+
+`caseLevel`::
+ Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
+ strength is set to `primary` this will ignore accent differences.
+
+`caseFirst`::
+ Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
+ for strength `tertiary`.
+
+`numeric`::
+ Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For
+ example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
+
+`variableTop`::
+ Single character or contraction. Controls what is variable for `alternate`.
+
+`hiraganaQuaternaryMode`::
+ Possible values: `true` or `false`. Defaults to `false`. Distinguishes between Katakana and
+ Hiragana characters in `quaternary` strength.
+
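+As a sketch (the filter name and the chosen option values are purely
+illustrative), several of these options can be combined on a custom
+`icu_collation` filter:
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "filter" : {
+                "my_collator" : {
+                    "type" : "icu_collation",
+                    "language" : "en",
+                    "strength" : "primary",
+                    "decomposition" : "canonical",
+                    "numeric" : true
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+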
+[float]
+=== ICU Tokenizer
+
+Breaks text into words according to UAX #29: Unicode Text Segmentation (http://www.unicode.org/reports/tr29/).
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "my_icu_analyzer" : {
+ "tokenizer" : "icu_tokenizer"
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
+
diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc
new file mode 100644
index 0000000..ad72fb7
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters.asciidoc
@@ -0,0 +1,74 @@
+[[analysis-tokenfilters]]
+== Token Filters
+
+Token filters accept a stream of tokens from a
+<<analysis-tokenizers,tokenizer>> and can modify tokens
+(eg lowercasing), delete tokens (eg remove stopwords)
+or add tokens (eg synonyms).
+
+Elasticsearch has a number of built in token filters which can be
+used to build <<analysis-custom-analyzer,custom analyzers>>.
+
+include::tokenfilters/standard-tokenfilter.asciidoc[]
+
+include::tokenfilters/asciifolding-tokenfilter.asciidoc[]
+
+include::tokenfilters/length-tokenfilter.asciidoc[]
+
+include::tokenfilters/lowercase-tokenfilter.asciidoc[]
+
+include::tokenfilters/ngram-tokenfilter.asciidoc[]
+
+include::tokenfilters/edgengram-tokenfilter.asciidoc[]
+
+include::tokenfilters/porterstem-tokenfilter.asciidoc[]
+
+include::tokenfilters/shingle-tokenfilter.asciidoc[]
+
+include::tokenfilters/stop-tokenfilter.asciidoc[]
+
+include::tokenfilters/word-delimiter-tokenfilter.asciidoc[]
+
+include::tokenfilters/stemmer-tokenfilter.asciidoc[]
+
+include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
+
+include::tokenfilters/keyword-marker-tokenfilter.asciidoc[]
+
+include::tokenfilters/keyword-repeat-tokenfilter.asciidoc[]
+
+include::tokenfilters/kstem-tokenfilter.asciidoc[]
+
+include::tokenfilters/snowball-tokenfilter.asciidoc[]
+
+include::tokenfilters/phonetic-tokenfilter.asciidoc[]
+
+include::tokenfilters/synonym-tokenfilter.asciidoc[]
+
+include::tokenfilters/compound-word-tokenfilter.asciidoc[]
+
+include::tokenfilters/reverse-tokenfilter.asciidoc[]
+
+include::tokenfilters/elision-tokenfilter.asciidoc[]
+
+include::tokenfilters/truncate-tokenfilter.asciidoc[]
+
+include::tokenfilters/unique-tokenfilter.asciidoc[]
+
+include::tokenfilters/pattern-capture-tokenfilter.asciidoc[]
+
+include::tokenfilters/pattern_replace-tokenfilter.asciidoc[]
+
+include::tokenfilters/trim-tokenfilter.asciidoc[]
+
+include::tokenfilters/limit-token-count-tokenfilter.asciidoc[]
+
+include::tokenfilters/hunspell-tokenfilter.asciidoc[]
+
+include::tokenfilters/common-grams-tokenfilter.asciidoc[]
+
+include::tokenfilters/normalization-tokenfilter.asciidoc[]
+
+include::tokenfilters/delimited-payload-tokenfilter.asciidoc[]
+
+include::tokenfilters/keep-words-tokenfilter.asciidoc[]
diff --git a/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc
new file mode 100644
index 0000000..aaca0eb
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc
@@ -0,0 +1,7 @@
+[[analysis-asciifolding-tokenfilter]]
+=== ASCII Folding Token Filter
+
+A token filter of type `asciifolding` that converts alphabetic, numeric,
+and symbolic Unicode characters which are not in the first 127 ASCII
+characters (the "Basic Latin" Unicode block) into their ASCII
+equivalents, if one exists.
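+
+Here is a sketch of a custom analyzer that applies `asciifolding` (the
+analyzer name is purely illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "folded" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["asciifolding"]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------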
diff --git a/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc
new file mode 100644
index 0000000..f0659e0
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc
@@ -0,0 +1,61 @@
+[[analysis-common-grams-tokenfilter]]
+=== Common Grams Token Filter
+
+Token filter that generates bigrams for frequently occurring terms.
+Single terms are still indexed. It can be used as an alternative to the
+<<analysis-stop-tokenfilter,Stop
+Token Filter>> when we don't want to completely ignore common terms.
+
+For example, assuming "the", "is" and "a" are common words, the text
+"the quick brown is a fox" will be tokenized as "the", "the_quick",
+"quick", "brown", "brown_is", "is_a", "a_fox", "fox".
+
+When `query_mode` is enabled, the token filter removes common words and
+single terms followed by a common word. This parameter should be enabled
+in the search analyzer.
+
+For example, the query "the quick brown is a fox" will be tokenized as
+"the_quick", "quick", "brown_is", "is_a", "a_fox", "fox".
+
+The following are settings that can be set:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`common_words` |A list of common words to use.
+
+|`common_words_path` |A path (either relative to `config` location, or
+absolute) to a list of common words. Each word should be in its own
+"line" (separated by a line break). The file must be UTF-8 encoded.
+
+|`ignore_case` |If true, common words matching will be case insensitive
+(defaults to `false`).
+
+|`query_mode` |Generates bigrams then removes common words and single
+terms followed by a common word (defaults to `false`).
+|=======================================================================
+
+Note, either the `common_words` or the `common_words_path` field is required.
+
+Here is an example:
+
+[source,js]
+--------------------------------------------------
+index :
+ analysis :
+ analyzer :
+ index_grams :
+ tokenizer : whitespace
+ filter : [common_grams]
+ search_grams :
+ tokenizer : whitespace
+ filter : [common_grams_query]
+ filter :
+ common_grams :
+ type : common_grams
+ common_words: [a, an, the]
+ common_grams_query :
+ type : common_grams
+ query_mode: true
+ common_words: [a, an, the]
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc
new file mode 100644
index 0000000..6719a9c
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc
@@ -0,0 +1,48 @@
+[[analysis-compound-word-tokenfilter]]
+=== Compound Word Token Filter
+
+Token filters that decompose compound words. There are two
+types available: `dictionary_decompounder` and
+`hyphenation_decompounder`.
+
+The following are settings that can be set for a compound word token
+filter type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`word_list` |A list of words to use.
+
+|`word_list_path` |A path (either relative to `config` location, or
+absolute) to a list of words.
+
+|`min_word_size` |Minimum word size (Integer). Defaults to `5`.
+
+|`min_subword_size` |Minimum subword size (Integer). Defaults to `2`.
+
+|`max_subword_size` |Maximum subword size (Integer). Defaults to `15`.
+
+|`only_longest_match` |Whether to include only the longest matching
+subword (Boolean). Defaults to `false`.
+|=======================================================================
+
+Here is an example:
+
+[source,js]
+--------------------------------------------------
+index :
+ analysis :
+ analyzer :
+ myAnalyzer2 :
+ type : custom
+ tokenizer : standard
+ filter : [myTokenFilter1, myTokenFilter2]
+ filter :
+ myTokenFilter1 :
+ type : dictionary_decompounder
+ word_list: [one, two, three]
+ myTokenFilter2 :
+ type : hyphenation_decompounder
+ word_list_path: path/to/words.txt
+ max_subword_size : 22
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/delimited-payload-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/delimited-payload-tokenfilter.asciidoc
new file mode 100644
index 0000000..293b51a
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/delimited-payload-tokenfilter.asciidoc
@@ -0,0 +1,16 @@
+[[analysis-delimited-payload-tokenfilter]]
+=== Delimited Payload Token Filter
+
+Named `delimited_payload_filter`. Splits each token into a token and a payload whenever a delimiter character is found.
+
+Example: "the|1 quick|2 fox|3" is split by default into the tokens `the`, `quick` and `fox` with payloads `1`, `2` and `3` respectively.
+
+
+
+Parameters:
+
+`delimiter`::
+ Character used for splitting the tokens. Default is `|`.
+
+`encoding`::
+ The type of the payload. `int` for integer, `float` for float and `identity` for characters. Default is `float`. \ No newline at end of file
diff --git a/docs/reference/analysis/tokenfilters/edgengram-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/edgengram-tokenfilter.asciidoc
new file mode 100644
index 0000000..3ba0ede
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/edgengram-tokenfilter.asciidoc
@@ -0,0 +1,16 @@
+[[analysis-edgengram-tokenfilter]]
+=== Edge NGram Token Filter
+
+A token filter of type `edgeNGram`.
+
+The following are settings that can be set for an `edgeNGram` token
+filter type:
+
+[cols="<,<",options="header",]
+|======================================================
+|Setting |Description
+|`min_gram` |Defaults to `1`.
+|`max_gram` |Defaults to `2`.
+|`side` |Either `front` or `back`. Defaults to `front`.
+|======================================================
+
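+Here is a sketch of a custom `edgeNGram` filter configuration (the filter
+name and values are purely illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "filter" : {
+                "my_edge_ngram" : {
+                    "type" : "edgeNGram",
+                    "min_gram" : 2,
+                    "max_gram" : 5,
+                    "side" : "front"
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------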
diff --git a/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
new file mode 100644
index 0000000..c44ccff
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
@@ -0,0 +1,28 @@
+[[analysis-elision-tokenfilter]]
+=== Elision Token Filter
+
+A token filter which removes elisions. For example, "l'avion" (the
+plane) will be tokenized as "avion" (plane).
+
+Accepts an `articles` setting, which is a set of stop word articles. For
+example:
+
+[source,js]
+--------------------------------------------------
+"index" : {
+ "analysis" : {
+ "analyzer" : {
+ "default" : {
+ "tokenizer" : "standard",
+ "filter" : ["standard", "elision"]
+ }
+ },
+ "filter" : {
+ "elision" : {
+ "type" : "elision",
+ "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+ }
+ }
+ }
+}
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc
new file mode 100644
index 0000000..9a235dd
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc
@@ -0,0 +1,115 @@
+[[analysis-hunspell-tokenfilter]]
+=== Hunspell Token Filter
+
+Basic support for hunspell stemming. Hunspell dictionaries will be
+picked up from a dedicated hunspell directory on the filesystem
+(defaults to `<path.conf>/hunspell`). Each dictionary is expected to
+have its own directory named after its associated locale (language).
+This dictionary directory is expected to hold both the \*.aff and \*.dic
+files (all of which will automatically be picked up). For example,
+assuming the default hunspell location is used, the following directory
+layout will define the `en_US` dictionary:
+
+[source,js]
+--------------------------------------------------
+- conf
+ |-- hunspell
+ | |-- en_US
+ | | |-- en_US.dic
+ | | |-- en_US.aff
+--------------------------------------------------
+
+The location of the hunspell directory can be configured using the
+`indices.analysis.hunspell.dictionary.location` settings in
+_elasticsearch.yml_.
+
+Each dictionary can be configured with two settings:
+
+`ignore_case`::
+ If true, dictionary matching will be case insensitive
+ (defaults to `false`)
+
+`strict_affix_parsing`::
+ Determines whether errors while reading an
+ affix rules file will cause an exception or simply be ignored (defaults to
+ `true`)
+
+These settings can be configured globally in `elasticsearch.yml` using
+
+* `indices.analysis.hunspell.dictionary.ignore_case` and
+* `indices.analysis.hunspell.dictionary.strict_affix_parsing`
+
+or for specific dictionaries:
+
+* `indices.analysis.hunspell.dictionary.en_US.ignore_case` and
+* `indices.analysis.hunspell.dictionary.en_US.strict_affix_parsing`.
+
+It is also possible to add `settings.yml` file under the dictionary
+directory which holds these settings (this will override any other
+settings defined in the `elasticsearch.yml`).
+
+One can use the hunspell stem filter by configuring it in the analysis
+settings:
+
+[source,js]
+--------------------------------------------------
+{
+ "analysis" : {
+ "analyzer" : {
+ "en" : {
+ "tokenizer" : "standard",
+ "filter" : [ "lowercase", "en_US" ]
+ }
+ },
+ "filter" : {
+ "en_US" : {
+ "type" : "hunspell",
+ "locale" : "en_US",
+ "dedup" : true
+ }
+ }
+ }
+}
+--------------------------------------------------
+
+The hunspell token filter accepts four options:
+
+`locale`::
+ A locale for this filter. If this is unset, the `lang` or
+ `language` setting is used instead - so one of these has to be set.
+
+`dictionary`::
+ The name of a dictionary. The path to your hunspell
+ dictionaries should be configured via
+ `indices.analysis.hunspell.dictionary.location` beforehand.
+
+`dedup`::
+ If only unique terms should be returned, this needs to be
+ set to `true`. Defaults to `true`.
+
+`recursion_level`::
+ Configures the recursion level a
+ stemmer can go into. Defaults to `2`. Some languages (for example Czech)
+ give better results when set to `1` or `0`, so you should test it out.
+
+NOTE: As opposed to the snowball stemmers (which are algorithm based)
+this is a dictionary lookup based stemmer and therefore the quality of
+the stemming is determined by the quality of the dictionary.
+
+[float]
+==== References
+
+Hunspell is a spell checker and morphological analyzer designed for
+languages with rich morphology and complex word compounding and
+character encoding.
+
+1. Wikipedia, http://en.wikipedia.org/wiki/Hunspell
+
+2. Source code, http://hunspell.sourceforge.net/
+
+3. Open Office Hunspell dictionaries, http://wiki.openoffice.org/wiki/Dictionaries
+
+4. Mozilla Hunspell dictionaries, https://addons.mozilla.org/en-US/firefox/language-tools/
+
+5. Chromium Hunspell dictionaries,
+ http://src.chromium.org/viewvc/chrome/trunk/deps/third_party/hunspell_dictionaries/
diff --git a/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc
new file mode 100644
index 0000000..e4abbef
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc
@@ -0,0 +1,49 @@
+[[analysis-keep-words-tokenfilter]]
+=== Keep Words Token Filter
+
+A token filter of type `keep` that only keeps tokens with text contained in a
+predefined set of words. The set of words can be defined in the settings or
+loaded from a text file containing one word per line.
+
+
+[float]
+=== Options
+[horizontal]
+keep_words:: a list of words to keep
+keep_words_path:: a path to a words file
+keep_words_case:: a boolean indicating whether to lower case the words (defaults to `false`)
+
+
+
+[float]
+=== Settings example
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "my_analyzer" : {
+ "tokenizer" : "standard",
+ "filter" : ["standard", "lowercase", "words_till_three"]
+ },
+ "my_analyzer1" : {
+ "tokenizer" : "standard",
+ "filter" : ["standard", "lowercase", "words_on_file"]
+ }
+ },
+ "filter" : {
+ "words_till_three" : {
+ "type" : "keep",
+ "keep_words" : [ "one", "two", "three"]
+ },
+ "words_on_file" : {
+ "type" : "keep",
+ "keep_words_path" : "/path/to/word/file"
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc
new file mode 100644
index 0000000..465bfd5
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc
@@ -0,0 +1,34 @@
+[[analysis-keyword-marker-tokenfilter]]
+=== Keyword Marker Token Filter
+
+Protects words from being modified by stemmers. Must be placed before
+any stemming filters.
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`keywords` |A list of words to use.
+
+|`keywords_path` |A path (either relative to `config` location, or
+absolute) to a list of words.
+
+|`ignore_case` |Set to `true` to lower case all words first. Defaults to
+`false`.
+|=======================================================================
+
+Here is an example:
+
+[source,js]
+--------------------------------------------------
+index :
+ analysis :
+ analyzer :
+ myAnalyzer :
+ type : custom
+ tokenizer : standard
+ filter : [lowercase, protwords, porter_stem]
+ filter :
+ protwords :
+ type : keyword_marker
+ keywords_path : analysis/protwords.txt
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc
new file mode 100644
index 0000000..fddab34
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc
@@ -0,0 +1,26 @@
+[[analysis-keyword-repeat-tokenfilter]]
+=== Keyword Repeat Token Filter
+
+The `keyword_repeat` token filter emits each incoming token twice, once
+as a keyword and once as a non-keyword, to allow an un-stemmed version of
+a term to be indexed side by side with the stemmed version of the term.
+Given the nature of this filter, each token that isn't transformed by a
+subsequent stemmer will be indexed twice. Therefore, consider adding a
+`unique` filter with `only_on_same_position` set to `true` to drop
+unnecessary duplicates.
+
+Here is an example:
+
+[source,js]
+--------------------------------------------------
+index :
+ analysis :
+ analyzer :
+ myAnalyzer :
+ type : custom
+ tokenizer : standard
+ filter : [lowercase, keyword_repeat, porter_stem, unique_stem]
+ filter :
+ unique_stem:
+ type: unique
+ only_on_same_position : true
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc
new file mode 100644
index 0000000..ff0695e
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc
@@ -0,0 +1,6 @@
+[[analysis-kstem-tokenfilter]]
+=== KStem Token Filter
+
+The `kstem` token filter is a high performance filter for English. All
+terms must already be lowercased (use `lowercase` filter) for this
+filter to work correctly.
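+
+Here is a sketch of an analyzer that applies `lowercase` before `kstem`
+(the analyzer name is purely illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_kstem" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["lowercase", "kstem"]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------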
diff --git a/docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc
new file mode 100644
index 0000000..2651980
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc
@@ -0,0 +1,16 @@
+[[analysis-length-tokenfilter]]
+=== Length Token Filter
+
+A token filter of type `length` that removes words that are too long or
+too short for the stream.
+
+The following are settings that can be set for a `length` token filter
+type:
+
+[cols="<,<",options="header",]
+|===========================================================
+|Setting |Description
+|`min` |The minimum token length. Defaults to `0`.
+|`max` |The maximum token length. Defaults to `Integer.MAX_VALUE`.
+|===========================================================
+
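+Here is a sketch of a custom `length` filter configuration (the filter name
+and values are purely illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "filter" : {
+                "my_length" : {
+                    "type" : "length",
+                    "min" : 2,
+                    "max" : 20
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------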
diff --git a/docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc
new file mode 100644
index 0000000..a6598be
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc
@@ -0,0 +1,32 @@
+[[analysis-limit-token-count-tokenfilter]]
+=== Limit Token Count Token Filter
+
+Limits the number of tokens that are indexed per document and field.
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`max_token_count` |The maximum number of tokens that should be indexed
+per document and field. The default is `1`.
+
+|`consume_all_tokens` |If set to `true` the filter exhausts the stream
+even if `max_token_count` tokens have been consumed already. The default
+is `false`.
+|=======================================================================
+
+Here is an example:
+
+[source,js]
+--------------------------------------------------
+index :
+ analysis :
+ analyzer :
+ myAnalyzer :
+ type : custom
+ tokenizer : standard
+ filter : [lowercase, five_token_limit]
+ filter :
+ five_token_limit :
+ type : limit
+ max_token_count : 5
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc
new file mode 100644
index 0000000..857c0d7
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc
@@ -0,0 +1,37 @@
+[[analysis-lowercase-tokenfilter]]
+=== Lowercase Token Filter
+
+A token filter of type `lowercase` that normalizes token text to lower
+case.
+
+The lowercase token filter supports Greek and Turkish lowercase token
+filters through the `language` parameter. Below is a usage example in a
+custom analyzer:
+
+[source,js]
+--------------------------------------------------
+index :
+ analysis :
+ analyzer :
+ myAnalyzer2 :
+ type : custom
+ tokenizer : myTokenizer1
+ filter : [myTokenFilter1, myGreekLowerCaseFilter]
+ char_filter : [my_html]
+ tokenizer :
+ myTokenizer1 :
+ type : standard
+ max_token_length : 900
+ filter :
+ myTokenFilter1 :
+ type : stop
+ stopwords : [stop1, stop2, stop3, stop4]
+ myGreekLowerCaseFilter :
+ type : lowercase
+ language : greek
+ char_filter :
+ my_html :
+ type : html_strip
+ escaped_tags : [xxx, yyy]
+ read_ahead : 1024
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc
new file mode 100644
index 0000000..5f91136
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc
@@ -0,0 +1,15 @@
+[[analysis-ngram-tokenfilter]]
+=== NGram Token Filter
+
+A token filter of type `nGram`.
+
+The following are settings that can be set for a `nGram` token filter
+type:
+
+[cols="<,<",options="header",]
+|============================
+|Setting |Description
+|`min_gram` |Defaults to `1`.
+|`max_gram` |Defaults to `2`.
+|============================
+
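+Here is a sketch of a custom `nGram` filter configuration (the filter name
+and values are purely illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "filter" : {
+                "my_ngram" : {
+                    "type" : "nGram",
+                    "min_gram" : 2,
+                    "max_gram" : 3
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------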
diff --git a/docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc
new file mode 100644
index 0000000..8751872
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc
@@ -0,0 +1,15 @@
+[[analysis-normalization-tokenfilter]]
+=== Normalization Token Filter
+
+There are several token filters available which try to normalize special
+characters of a certain language.
+
+You can currently choose between `arabic_normalization` and
+`persian_normalization` normalization in your token filter
+configuration. For more information check the
+http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html[ArabicNormalizer]
+or the
+http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html[PersianNormalizer]
+documentation.
+
+*Note:* These filters are available since `0.90.2`.
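+
+As a sketch (the analyzer name is purely illustrative), an analyzer using
+`arabic_normalization` could be configured like this:
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_arabic" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["lowercase", "arabic_normalization"]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------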
diff --git a/docs/reference/analysis/tokenfilters/pattern-capture-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/pattern-capture-tokenfilter.asciidoc
new file mode 100644
index 0000000..4091296
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/pattern-capture-tokenfilter.asciidoc
@@ -0,0 +1,134 @@
+[[analysis-pattern-capture-tokenfilter]]
+=== Pattern Capture Token Filter
+
+The `pattern_capture` token filter, unlike the `pattern` tokenizer,
+emits a token for every capture group in the regular expression.
+Patterns are not anchored to the beginning and end of the string, so
+each pattern can match multiple times, and matches are allowed to
+overlap.
+
+For instance a pattern like:
+
+[source,js]
+--------------------------------------------------
+"(([a-z]+)(\d*))"
+--------------------------------------------------
+
+when matched against:
+
+[source,js]
+--------------------------------------------------
+"abc123def456"
+--------------------------------------------------
+
+would produce the tokens: [ `abc123`, `abc`, `123`, `def456`, `def`,
+`456` ]
+
+If `preserve_original` is set to `true` (the default) then it would also
+emit the original token: `abc123def456`.
+
+This is particularly useful for indexing text like camel-case code, eg
+`stripHTML` where a user may search for `"strip html"` or `"striphtml"`:
+
+[source,js]
+--------------------------------------------------
+curl -XPUT localhost:9200/test/ -d '
+{
+ "settings" : {
+ "analysis" : {
+ "filter" : {
+ "code" : {
+ "type" : "pattern_capture",
+ "preserve_original" : 1,
+ "patterns" : [
+ "(\\p{Ll}+|\\p{Lu}\\p{Ll}+|\\p{Lu}+)",
+ "(\\d+)"
+ ]
+ }
+ },
+ "analyzer" : {
+ "code" : {
+ "tokenizer" : "pattern",
+ "filter" : [ "code", "lowercase" ]
+ }
+ }
+ }
+ }
+}
+'
+--------------------------------------------------
+
+When used to analyze the text
+
+[source,js]
+--------------------------------------------------
+import static org.apache.commons.lang.StringEscapeUtils.escapeHtml
+--------------------------------------------------
+
+this emits the tokens: [ `import`, `static`, `org`, `apache`, `commons`,
+`lang`, `stringescapeutils`, `string`, `escape`, `utils`, `escapehtml`,
+`escape`, `html` ]
+
+Another example is analyzing email addresses:
+
+[source,js]
+--------------------------------------------------
+curl -XPUT localhost:9200/test/ -d '
+{
+ "settings" : {
+ "analysis" : {
+ "filter" : {
+ "email" : {
+ "type" : "pattern_capture",
+ "preserve_original" : 1,
+ "patterns" : [
+ "(\\w+)",
+ "(\\p{L}+)",
+ "(\\d+)",
+ "@(.+)"
+ ]
+ }
+ },
+ "analyzer" : {
+ "email" : {
+ "tokenizer" : "uax_url_email",
+ "filter" : [ "email", "lowercase", "unique" ]
+ }
+ }
+ }
+ }
+}
+'
+--------------------------------------------------
+
+When the above analyzer is used on an email address like:
+
+[source,js]
+--------------------------------------------------
+john-smith_123@foo-bar.com
+--------------------------------------------------
+
+it would produce the following tokens: [ `john-smith_123`,
+`foo-bar.com`, `john`, `smith_123`, `smith`, `123`, `foo`,
+`foo-bar.com`, `bar`, `com` ]
+
+Multiple patterns are required to allow overlapping captures, but this
+also means that the patterns are less dense and easier to understand.
+
+*Note:* All tokens are emitted in the same position, and with the same
+character offsets, so when combined with highlighting, the whole
+original token will be highlighted, not just the matching subset. For
+instance, querying the above email address for `"smith"` would
+highlight:
+
+[source,js]
+--------------------------------------------------
+ <em>john-smith_123@foo-bar.com</em>
+--------------------------------------------------
+
+not:
+
+[source,js]
+--------------------------------------------------
+ john-<em>smith</em>_123@foo-bar.com
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc
new file mode 100644
index 0000000..54e0842
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc
@@ -0,0 +1,9 @@
+[[analysis-pattern_replace-tokenfilter]]
+=== Pattern Replace Token Filter
+
+The `pattern_replace` token filter makes it easy to handle string
+replacements based on a regular expression. The regular expression is
+defined using the `pattern` parameter, and the replacement string can be
+provided using the `replacement` parameter (supporting referencing the
+original text, as explained
+http://docs.oracle.com/javase/6/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)[here]).
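+
+As a rough sketch (the filter name and pattern are illustrative), a
+filter that replaces every run of digits with a `#` character could be
+configured like this:
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "filter" : {
+                "my_pattern_replace" : {
+                    "type" : "pattern_replace",
+                    "pattern" : "[0-9]+",
+                    "replacement" : "#"
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------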
diff --git a/docs/reference/analysis/tokenfilters/phonetic-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/phonetic-tokenfilter.asciidoc
new file mode 100644
index 0000000..b7e9334
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/phonetic-tokenfilter.asciidoc
@@ -0,0 +1,5 @@
+[[analysis-phonetic-tokenfilter]]
+=== Phonetic Token Filter
+
+The `phonetic` token filter is provided as a plugin and located
+https://github.com/elasticsearch/elasticsearch-analysis-phonetic[here].
diff --git a/docs/reference/analysis/tokenfilters/porterstem-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/porterstem-tokenfilter.asciidoc
new file mode 100644
index 0000000..fc2edf5
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/porterstem-tokenfilter.asciidoc
@@ -0,0 +1,15 @@
+[[analysis-porterstem-tokenfilter]]
+=== Porter Stem Token Filter
+
+A token filter of type `porter_stem` that transforms the token stream as
+per the Porter stemming algorithm.
+
+Note, the input to the stemming filter must already be in lower case, so
+you will need to use the
+<<analysis-lowercase-tokenfilter,Lower
+Case Token Filter>> or the
+<<analysis-lowercase-tokenizer,Lower
+Case Tokenizer>> earlier in the analyzer chain in order for this to
+work properly. For example, when using a custom analyzer, make sure the
+`lowercase` filter comes before the `porter_stem` filter in the list of
+filters.
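+
+For example, a custom analyzer along these lines (the analyzer name is
+illustrative) applies `lowercase` before `porter_stem`:
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_porter_analyzer" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["lowercase", "porter_stem"]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------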
diff --git a/docs/reference/analysis/tokenfilters/reverse-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/reverse-tokenfilter.asciidoc
new file mode 100644
index 0000000..b004998
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/reverse-tokenfilter.asciidoc
@@ -0,0 +1,4 @@
+[[analysis-reverse-tokenfilter]]
+=== Reverse Token Filter
+
+A token filter of type `reverse` that simply reverses each token.
diff --git a/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc
new file mode 100644
index 0000000..e3c6c44
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc
@@ -0,0 +1,36 @@
+[[analysis-shingle-tokenfilter]]
+=== Shingle Token Filter
+
+A token filter of type `shingle` that constructs shingles (token
+n-grams) from a token stream. In other words, it creates combinations of
+tokens as a single token. For example, the sentence "please divide this
+sentence into shingles" might be tokenized into shingles "please
+divide", "divide this", "this sentence", "sentence into", and "into
+shingles".
+
+This filter handles position increments > 1 by inserting filler tokens
+(tokens with the term text "_"). It does not handle a position
+increment of 0.
+
+The following are settings that can be set for a `shingle` token filter
+type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`max_shingle_size` |The maximum shingle size. Defaults to `2`.
+
+|`min_shingle_size` |The minimum shingle size. Defaults to `2`.
+
+|`output_unigrams` |If `true` the output will contain the input tokens
+(unigrams) as well as the shingles. Defaults to `true`.
+
+|`output_unigrams_if_no_shingles` |If `output_unigrams` is `false` the
+output will contain the input tokens (unigrams) if no shingles are
+available. Note if `output_unigrams` is set to `true` this setting has
+no effect. Defaults to `false`.
+
+|`token_separator` |The string to use when joining adjacent tokens to
+form a shingle. Defaults to `" "`.
+|=======================================================================
+
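+For example, the following sketch (the analyzer and filter names are
+illustrative) emits unigrams together with two- and three-word
+shingles:
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_shingle_analyzer" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["lowercase", "my_shingles"]
+                }
+            },
+            "filter" : {
+                "my_shingles" : {
+                    "type" : "shingle",
+                    "min_shingle_size" : 2,
+                    "max_shingle_size" : 3,
+                    "output_unigrams" : true
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+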
diff --git a/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc
new file mode 100644
index 0000000..58d8898
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc
@@ -0,0 +1,33 @@
+[[analysis-snowball-tokenfilter]]
+=== Snowball Token Filter
+
+A filter that stems words using a Snowball-generated stemmer. The
+`language` parameter controls the stemmer with the following available
+values: `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`,
+`Finnish`, `French`, `German`, `German2`, `Hungarian`, `Italian`, `Kp`,
+`Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`,
+`Spanish`, `Swedish`, `Turkish`.
+
+For example:
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "my_analyzer" : {
+ "tokenizer" : "standard",
+ "filter" : ["standard", "lowercase", "my_snow"]
+ }
+ },
+ "filter" : {
+ "my_snow" : {
+ "type" : "snowball",
+ "language" : "Lovins"
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc
new file mode 100644
index 0000000..3dd4fbf
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc
@@ -0,0 +1,7 @@
+[[analysis-standard-tokenfilter]]
+=== Standard Token Filter
+
+A token filter of type `standard` that normalizes tokens extracted with
+the
+<<analysis-standard-tokenizer,Standard
+Tokenizer>>.
diff --git a/docs/reference/analysis/tokenfilters/stemmer-override-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-override-tokenfilter.asciidoc
new file mode 100644
index 0000000..649366c
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/stemmer-override-tokenfilter.asciidoc
@@ -0,0 +1,34 @@
+[[analysis-stemmer-override-tokenfilter]]
+=== Stemmer Override Token Filter
+
+Overrides stemming algorithms by applying a custom mapping and
+protecting the mapped terms from being modified by stemmers. Must be
+placed before any stemming filters.
+
+Each rule maps a token to its override, with the two sides separated by
+"=>".
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`rules` |A list of mapping rules to use.
+
+|`rules_path` |A path (either relative to `config` location, or
+absolute) to a list of mappings.
+|=======================================================================
+
+Here is an example:
+
+[source,js]
+--------------------------------------------------
+index :
+ analysis :
+ analyzer :
+ myAnalyzer :
+ type : custom
+ tokenizer : standard
+ filter : [lowercase, custom_stems, porter_stem]
+ filter:
+ custom_stems:
+ type: stemmer_override
+ rules_path : analysis/custom_stems.txt
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
new file mode 100644
index 0000000..6526f37
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
@@ -0,0 +1,78 @@
+[[analysis-stemmer-tokenfilter]]
+=== Stemmer Token Filter
+
+A filter that stems words (similar to `snowball`, but with more
+options). The `language`/`name` parameter controls the stemmer with the
+following available values:
+
+http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Far%2FArabicStemmer.html[arabic],
+http://snowball.tartarus.org/algorithms/armenian/stemmer.html[armenian],
+http://snowball.tartarus.org/algorithms/basque/stemmer.html[basque],
+http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fbr%2FBrazilianStemmer.html[brazilian],
+http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf[bulgarian],
+http://snowball.tartarus.org/algorithms/catalan/stemmer.html[catalan],
+http://portal.acm.org/citation.cfm?id=1598600[czech],
+http://snowball.tartarus.org/algorithms/danish/stemmer.html[danish],
+http://snowball.tartarus.org/algorithms/dutch/stemmer.html[dutch],
+http://snowball.tartarus.org/algorithms/english/stemmer.html[english],
+http://snowball.tartarus.org/algorithms/finnish/stemmer.html[finnish],
+http://snowball.tartarus.org/algorithms/french/stemmer.html[french],
+http://snowball.tartarus.org/algorithms/german/stemmer.html[german],
+http://snowball.tartarus.org/algorithms/german2/stemmer.html[german2],
+http://sais.se/mthprize/2007/ntais2007.pdf[greek],
+http://snowball.tartarus.org/algorithms/hungarian/stemmer.html[hungarian],
+http://snowball.tartarus.org/algorithms/italian/stemmer.html[italian],
+http://snowball.tartarus.org/algorithms/kraaij_pohlmann/stemmer.html[kp],
+http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[kstem],
+http://snowball.tartarus.org/algorithms/lovins/stemmer.html[lovins],
+http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Flv%2FLatvianStemmer.html[latvian],
+http://snowball.tartarus.org/algorithms/norwegian/stemmer.html[norwegian],
+http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fno%2FNorwegianMinimalStemFilter.html[minimal_norwegian],
+http://snowball.tartarus.org/algorithms/porter/stemmer.html[porter],
+http://snowball.tartarus.org/algorithms/portuguese/stemmer.html[portuguese],
+http://snowball.tartarus.org/algorithms/romanian/stemmer.html[romanian],
+http://snowball.tartarus.org/algorithms/russian/stemmer.html[russian],
+http://snowball.tartarus.org/algorithms/spanish/stemmer.html[spanish],
+http://snowball.tartarus.org/algorithms/swedish/stemmer.html[swedish],
+http://snowball.tartarus.org/algorithms/turkish/stemmer.html[turkish],
+http://www.medialab.tfe.umu.se/courses/mdm0506a/material/fulltext_ID%3D10049387%26PLACEBO%3DIE.pdf[minimal_english],
+http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fen%2FEnglishPossessiveFilter.html[possessive_english],
+http://clef.isti.cnr.it/2003/WN_web/22.pdf[light_finnish],
+http://dl.acm.org/citation.cfm?id=1141523[light_french],
+http://dl.acm.org/citation.cfm?id=318984[minimal_french],
+http://dl.acm.org/citation.cfm?id=1141523[light_german],
+http://members.unine.ch/jacques.savoy/clef/morpho.pdf[minimal_german],
+http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf[hindi],
+http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[light_hungarian],
+http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf[indonesian],
+http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[light_italian],
+http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[light_portuguese],
+http://www.inf.ufrgs.br/\~buriol/papers/Orengo_CLEF07.pdf[minimal_portuguese],
+http://www.inf.ufrgs.br/\~viviane/rslp/index.htm[portuguese],
+http://doc.rero.ch/lm.php?url=1000%2C43%2C4%2C20091209094227-CA%2FDolamic_Ljiljana_-_Indexing_and_Searching_Strategies_for_the_Russian_20091209.pdf[light_russian],
+http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[light_spanish],
+http://clef.isti.cnr.it/2003/WN_web/22.pdf[light_swedish].
+
+For example:
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "my_analyzer" : {
+ "tokenizer" : "standard",
+ "filter" : ["standard", "lowercase", "my_stemmer"]
+ }
+ },
+ "filter" : {
+ "my_stemmer" : {
+ "type" : "stemmer",
+ "name" : "light_german"
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc
new file mode 100644
index 0000000..14b3a32
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc
@@ -0,0 +1,35 @@
+[[analysis-stop-tokenfilter]]
+=== Stop Token Filter
+
+A token filter of type `stop` that removes stop words from token
+streams.
+
+The following are settings that can be set for a `stop` token filter
+type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`stopwords` |A list of stop words to use. Defaults to the English stop
+words.
+
+|`stopwords_path` |A path (either relative to `config` location, or
+absolute) to a stopwords file configuration. Each stop word should be in
+its own "line" (separated by a line break). The file must be UTF-8
+encoded.
+
+|`ignore_case` |Set to `true` to lower case all words first. Defaults to
+`false`.
+
+|`remove_trailing` |Set to `false` in order to not ignore the last term of
+a search if it is a stop word. This is very useful for the completion
+suggester as a query like `green a` can be extended to `green apple` even
+though you remove stop words in general. Defaults to `true`.
+|=======================================================================
+
+The `stopwords` setting also accepts predefined, language-specific stop
+word lists using the `_lang_` notation (for example `_english_`). The
+supported languages are: arabic, armenian, basque, brazilian, bulgarian,
+catalan, czech, danish, dutch, english, finnish, french, galician,
+german, greek, hindi, hungarian, indonesian, italian, norwegian,
+persian, portuguese, romanian, russian, spanish, swedish, turkish.
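+
+For example, a custom `stop` filter with an explicit word list might be
+defined as follows (the filter name and word list are illustrative);
+the `stopwords` value could equally be one of the `_lang_` lists
+described above:
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "filter" : {
+                "my_stop" : {
+                    "type" : "stop",
+                    "stopwords" : ["and", "is", "the"],
+                    "ignore_case" : true
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------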
diff --git a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc
new file mode 100644
index 0000000..ce6e1ed
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc
@@ -0,0 +1,123 @@
+[[analysis-synonym-tokenfilter]]
+=== Synonym Token Filter
+
+The `synonym` token filter makes it easy to handle synonyms during the
+analysis process. Synonyms are configured using a configuration file.
+Here is an example:
+
+[source,js]
+--------------------------------------------------
+{
+ "index" : {
+ "analysis" : {
+ "analyzer" : {
+ "synonym" : {
+ "tokenizer" : "whitespace",
+ "filter" : ["synonym"]
+ }
+ },
+ "filter" : {
+ "synonym" : {
+ "type" : "synonym",
+ "synonyms_path" : "analysis/synonym.txt"
+ }
+ }
+ }
+ }
+}
+--------------------------------------------------
+
+The above configures a `synonym` filter, with a path of
+`analysis/synonym.txt` (relative to the `config` location). The
+`synonym` analyzer is then configured with the filter. Additional
+settings are: `ignore_case` (defaults to `false`), and `expand`
+(defaults to `true`).
+
+The `tokenizer` parameter controls the tokenizer that will be used to
+tokenize the synonyms, and defaults to the `whitespace` tokenizer.
+
+Two synonym formats are supported: Solr and WordNet.
+
+[float]
+==== Solr synonyms
+
+The following is a sample format of the file:
+
+[source,js]
+--------------------------------------------------
+# blank lines and lines starting with pound are comments.
+
+#Explicit mappings match any token sequence on the LHS of "=>"
+#and replace with all alternatives on the RHS. These types of mappings
+#ignore the expand parameter in the schema.
+#Examples:
+i-pod, i pod => ipod,
+sea biscuit, sea biscit => seabiscuit
+
+#Equivalent synonyms may be separated with commas and give
+#no explicit mapping. In this case the mapping behavior will
+#be taken from the expand parameter in the schema. This allows
+#the same synonym file to be used in different synonym handling strategies.
+#Examples:
+ipod, i-pod, i pod
+foozball , foosball
+universe , cosmos
+
+# If expand==true, "ipod, i-pod, i pod" is equivalent to the explicit mapping:
+ipod, i-pod, i pod => ipod, i-pod, i pod
+# If expand==false, "ipod, i-pod, i pod" is equivalent to the explicit mapping:
+ipod, i-pod, i pod => ipod
+
+#multiple synonym mapping entries are merged.
+foo => foo bar
+foo => baz
+#is equivalent to
+foo => foo bar, baz
+--------------------------------------------------
+
+You can also define synonyms for the filter directly in the
+configuration file (note use of `synonyms` instead of `synonyms_path`):
+
+[source,js]
+--------------------------------------------------
+{
+ "filter" : {
+ "synonym" : {
+ "type" : "synonym",
+ "synonyms" : [
+ "i-pod, i pod => ipod",
+ "universe, cosmos"
+ ]
+ }
+ }
+}
+--------------------------------------------------
+
+However, it is recommended to define large synonym sets in a file using
+`synonyms_path`.
+
+[float]
+==== WordNet synonyms
+
+Synonyms based on http://wordnet.princeton.edu/[WordNet] format can be
+declared using `format`:
+
+[source,js]
+--------------------------------------------------
+{
+ "filter" : {
+ "synonym" : {
+ "type" : "synonym",
+ "format" : "wordnet",
+ "synonyms" : [
+ "s(100000001,1,'abstain',v,1,0).",
+ "s(100000001,2,'refrain',v,1,0).",
+ "s(100000001,3,'desist',v,1,0)."
+ ]
+ }
+ }
+}
+--------------------------------------------------
+
+Using `synonyms_path` to define WordNet synonyms in a file is supported
+as well.
diff --git a/docs/reference/analysis/tokenfilters/trim-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/trim-tokenfilter.asciidoc
new file mode 100644
index 0000000..34a0e93
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/trim-tokenfilter.asciidoc
@@ -0,0 +1,4 @@
+[[analysis-trim-tokenfilter]]
+=== Trim Token Filter
+
+The `trim` token filter trims the whitespace surrounding a token.
diff --git a/docs/reference/analysis/tokenfilters/truncate-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/truncate-tokenfilter.asciidoc
new file mode 100644
index 0000000..14652f4
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/truncate-tokenfilter.asciidoc
@@ -0,0 +1,10 @@
+[[analysis-truncate-tokenfilter]]
+=== Truncate Token Filter
+
+The `truncate` token filter can be used to truncate tokens to a
+specific length. This can come in handy with keyword (single token)
+based mapped fields that are used for sorting, in order to reduce
+memory usage.
+
+It accepts a `length` parameter which controls the number of characters
+to truncate to, and defaults to `10`.
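+
+A minimal configuration sketch (the filter name is illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "filter" : {
+                "my_truncate" : {
+                    "type" : "truncate",
+                    "length" : 10
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------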
diff --git a/docs/reference/analysis/tokenfilters/unique-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/unique-tokenfilter.asciidoc
new file mode 100644
index 0000000..8b42f6b
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/unique-tokenfilter.asciidoc
@@ -0,0 +1,7 @@
+[[analysis-unique-tokenfilter]]
+=== Unique Token Filter
+
+The `unique` token filter can be used to index only unique tokens during
+analysis. By default it is applied to the whole token stream. If
+`only_on_same_position` is set to `true`, it will only remove duplicate
+tokens at the same position.
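+
+A configuration sketch (the filter name is illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "filter" : {
+                "my_unique" : {
+                    "type" : "unique",
+                    "only_on_same_position" : true
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------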
diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc
new file mode 100644
index 0000000..9ce81e1
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc
@@ -0,0 +1,80 @@
+[[analysis-word-delimiter-tokenfilter]]
+=== Word Delimiter Token Filter
+
+Named `word_delimiter`, it splits words into subwords and performs
+optional transformations on subword groups. Words are split into
+subwords with the following rules:
+
+* split on intra-word delimiters (by default, all non alpha-numeric
+characters): "Wi-Fi" -> "Wi", "Fi"
+* split on case transitions: "PowerShot" -> "Power", "Shot"
+* split on letter-number transitions: "SD500" -> "SD", "500"
+* leading and trailing intra-word delimiters on each subword are
+ignored: "//hello---there, 'dude'" -> "hello", "there", "dude"
+* trailing "'s" are removed for each subword: "O'Neil's" -> "O", "Neil"
+
+Parameters include:
+
+`generate_word_parts`::
+ If `true` causes parts of words to be
+ generated: "PowerShot" => "Power" "Shot". Defaults to `true`.
+
+`generate_number_parts`::
+ If `true` causes number subwords to be
+ generated: "500-42" => "500" "42". Defaults to `true`.
+
+`catenate_words`::
+ If `true` causes maximum runs of word parts to be
+ catenated: "wi-fi" => "wifi". Defaults to `false`.
+
+`catenate_numbers`::
+ If `true` causes maximum runs of number parts to
+ be catenated: "500-42" => "50042". Defaults to `false`.
+
+`catenate_all`::
+ If `true` causes all subword parts to be catenated:
+ "wi-fi-4000" => "wifi4000". Defaults to `false`.
+
+`split_on_case_change`::
+    If `true` causes "PowerShot" to be two tokens;
+    ("Power-Shot" remains two parts regardless). Defaults to `true`.
+
+`preserve_original`::
+ If `true` includes original words in subwords:
+ "500-42" => "500-42" "500" "42". Defaults to `false`.
+
+`split_on_numerics`::
+ If `true` causes "j2se" to be three tokens; "j"
+ "2" "se". Defaults to `true`.
+
+`stem_english_possessive`::
+ If `true` causes trailing "'s" to be
+ removed for each subword: "O'Neil's" => "O", "Neil". Defaults to `true`.
+
+Advanced settings include:
+
+`protected_words`::
+    A list of words protected from being delimited.
+    Either an array, or you can set `protected_words_path`, which is
+    resolved to a file containing the protected words (one per line).
+    The path automatically resolves relative to the `config/` location
+    if it exists there.
+
+`type_table`::
+ A custom type mapping table, for example (when configured
+ using `type_table_path`):
+
+[source,js]
+--------------------------------------------------
+ # Map the $, %, '.', and ',' characters to DIGIT
+ # This might be useful for financial data.
+ $ => DIGIT
+ % => DIGIT
+ . => DIGIT
+ \\u002C => DIGIT
+
+ # in some cases you might not want to split on ZWJ
+ # this also tests the case where we need a bigger byte[]
+ # see http://en.wikipedia.org/wiki/Zero-width_joiner
+ \\u200D => ALPHANUM
+--------------------------------------------------
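+
+Putting this together, a custom `word_delimiter` filter could be
+configured roughly as follows (the names are illustrative and only a
+few of the parameters above are shown):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_product_analyzer" : {
+                    "tokenizer" : "whitespace",
+                    "filter" : ["my_word_delimiter", "lowercase"]
+                }
+            },
+            "filter" : {
+                "my_word_delimiter" : {
+                    "type" : "word_delimiter",
+                    "generate_word_parts" : true,
+                    "catenate_words" : true,
+                    "preserve_original" : true
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------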
diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc
new file mode 100644
index 0000000..3118b0d
--- /dev/null
+++ b/docs/reference/analysis/tokenizers.asciidoc
@@ -0,0 +1,30 @@
+[[analysis-tokenizers]]
+== Tokenizers
+
+Tokenizers are used to break a string down into a stream of terms
+or tokens. A simple tokenizer might split the string up into terms
+wherever it encounters whitespace or punctuation.
+
+Elasticsearch has a number of built in tokenizers which can be
+used to build <<analysis-custom-analyzer,custom analyzers>>.
+
+include::tokenizers/standard-tokenizer.asciidoc[]
+
+include::tokenizers/edgengram-tokenizer.asciidoc[]
+
+include::tokenizers/keyword-tokenizer.asciidoc[]
+
+include::tokenizers/letter-tokenizer.asciidoc[]
+
+include::tokenizers/lowercase-tokenizer.asciidoc[]
+
+include::tokenizers/ngram-tokenizer.asciidoc[]
+
+include::tokenizers/whitespace-tokenizer.asciidoc[]
+
+include::tokenizers/pattern-tokenizer.asciidoc[]
+
+include::tokenizers/uaxurlemail-tokenizer.asciidoc[]
+
+include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
+
diff --git a/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc
new file mode 100644
index 0000000..41cc233
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc
@@ -0,0 +1,80 @@
+[[analysis-edgengram-tokenizer]]
+=== Edge NGram Tokenizer
+
+A tokenizer of type `edgeNGram`.
+
+This tokenizer is very similar to `nGram` but only keeps n-grams which
+start at the beginning of a token.
+
+The following are settings that can be set for an `edgeNGram` tokenizer
+type:
+
+[cols="<,<,<",options="header",]
+|=======================================================================
+|Setting |Description |Default value
+|`min_gram` |Minimum size in codepoints of a single n-gram |`1`.
+
+|`max_gram` |Maximum size in codepoints of a single n-gram |`2`.
+
+|`token_chars` | Characters classes to keep in the
+tokens, Elasticsearch will split on characters that don't belong to any
+of these classes. |`[]` (Keep all characters)
+|=======================================================================
+
+
+`token_chars` accepts the following character classes:
+
+[horizontal]
+`letter`:: for example `a`, `b`, `ï` or `京`
+`digit`:: for example `3` or `7`
+`whitespace`:: for example `" "` or `"\n"`
+`punctuation`:: for example `!` or `"`
+`symbol`:: for example `$` or `√`
+
+[float]
+==== Example
+
+[source,js]
+--------------------------------------------------
+ curl -XPUT 'localhost:9200/test' -d '
+ {
+ "settings" : {
+ "analysis" : {
+ "analyzer" : {
+ "my_edge_ngram_analyzer" : {
+ "tokenizer" : "my_edge_ngram_tokenizer"
+ }
+ },
+ "tokenizer" : {
+ "my_edge_ngram_tokenizer" : {
+ "type" : "edgeNGram",
+ "min_gram" : "2",
+ "max_gram" : "5",
+ "token_chars": [ "letter", "digit" ]
+ }
+ }
+ }
+ }
+ }'
+
+ curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_edge_ngram_analyzer' -d 'FC Schalke 04'
+ # FC, Sc, Sch, Scha, Schal, 04
+--------------------------------------------------
+
+[float]
+==== `side` deprecated
+
+There used to be a `side` parameter up to `0.90.1` but it is now deprecated. In
+order to emulate the behavior of `"side" : "BACK"` a
+<<analysis-reverse-tokenfilter,`reverse` token filter>> should be used together
+with the <<analysis-edgengram-tokenfilter,`edgeNGram` token filter>>. The
+`edgeNGram` filter must be enclosed in `reverse` filters like this:
+
+[source,js]
+--------------------------------------------------
+ "filter" : ["reverse", "edgeNGram", "reverse"]
+--------------------------------------------------
+
+which essentially reverses the token, builds front `EdgeNGrams` and reverses
+the ngram again. This has the same effect as the previous `"side" : "BACK"` setting.
+
diff --git a/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc
new file mode 100644
index 0000000..be75f3d
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc
@@ -0,0 +1,15 @@
+[[analysis-keyword-tokenizer]]
+=== Keyword Tokenizer
+
+A tokenizer of type `keyword` that emits the entire input as a single
+token.
+
+The following are settings that can be set for a `keyword` tokenizer
+type:
+
+[cols="<,<",options="header",]
+|=======================================================
+|Setting |Description
+|`buffer_size` |The term buffer size. Defaults to `256`.
+|=======================================================
+
diff --git a/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc
new file mode 100644
index 0000000..03025cc
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc
@@ -0,0 +1,7 @@
+[[analysis-letter-tokenizer]]
+=== Letter Tokenizer
+
+A tokenizer of type `letter` that divides text at non-letters. That's to
+say, it defines tokens as maximal strings of adjacent letters. Note,
+this does a decent job for most European languages, but does a terrible
+job for some Asian languages, where words are not separated by spaces.
diff --git a/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc
new file mode 100644
index 0000000..0cdbbc3
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc
@@ -0,0 +1,15 @@
+[[analysis-lowercase-tokenizer]]
+=== Lowercase Tokenizer
+
+A tokenizer of type `lowercase` that performs the function of
+<<analysis-letter-tokenizer,Letter
+Tokenizer>> and
+<<analysis-lowercase-tokenfilter,Lower
+Case Token Filter>> together. It divides text at non-letters and
+converts the resulting tokens to lower case. While it is functionally
+equivalent to the
+combination of
+<<analysis-letter-tokenizer,Letter
+Tokenizer>> and
+<<analysis-lowercase-tokenfilter,Lower
+Case Token Filter>>, there is a performance advantage to doing the two
+tasks at once, hence this (redundant) implementation.
diff --git a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc
new file mode 100644
index 0000000..23e6bc5
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc
@@ -0,0 +1,57 @@
+[[analysis-ngram-tokenizer]]
+=== NGram Tokenizer
+
+A tokenizer of type `nGram`.
+
+The following are settings that can be set for a `nGram` tokenizer type:
+
+[cols="<,<,<",options="header",]
+|=======================================================================
+|Setting |Description |Default value
+|`min_gram` |Minimum size in codepoints of a single n-gram |`1`.
+
+|`max_gram` |Maximum size in codepoints of a single n-gram |`2`.
+
+|`token_chars` |Characters classes to keep in the
+tokens, Elasticsearch will split on characters that don't belong to any
+of these classes. |`[]` (Keep all characters)
+|=======================================================================
+
+`token_chars` accepts the following character classes:
+
+[horizontal]
+`letter`:: for example `a`, `b`, `ï` or `京`
+`digit`:: for example `3` or `7`
+`whitespace`:: for example `" "` or `"\n"`
+`punctuation`:: for example `!` or `"`
+`symbol`:: for example `$` or `√`
+
+[float]
+==== Example
+
+[source,js]
+--------------------------------------------------
+ curl -XPUT 'localhost:9200/test' -d '
+ {
+ "settings" : {
+ "analysis" : {
+ "analyzer" : {
+ "my_ngram_analyzer" : {
+ "tokenizer" : "my_ngram_tokenizer"
+ }
+ },
+ "tokenizer" : {
+ "my_ngram_tokenizer" : {
+ "type" : "nGram",
+ "min_gram" : "2",
+ "max_gram" : "3",
+ "token_chars": [ "letter", "digit" ]
+ }
+ }
+ }
+ }
+ }'
+
+ curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_ngram_analyzer' -d 'FC Schalke 04'
+ # FC, Sc, Sch, ch, cha, ha, hal, al, alk, lk, lke, ke, 04
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc
new file mode 100644
index 0000000..e6876f5
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc
@@ -0,0 +1,32 @@
+[[analysis-pathhierarchy-tokenizer]]
+=== Path Hierarchy Tokenizer
+
+The `path_hierarchy` tokenizer takes something like this:
+
+-------------------------
+/something/something/else
+-------------------------
+
+And produces tokens:
+
+-------------------------
+/something
+/something/something
+/something/something/else
+-------------------------
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`delimiter` |The character delimiter to use, defaults to `/`.
+
+|`replacement` |An optional replacement character to use. Defaults to
+the `delimiter`.
+
+|`buffer_size` |The buffer size to use, defaults to `1024`.
+
+|`reverse` |Generates tokens in reverse order, defaults to `false`.
+
+|`skip` |Controls initial tokens to skip, defaults to `0`.
+|=======================================================================
+
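+A configuration sketch using these settings (the analyzer and tokenizer
+names are illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_path_analyzer" : {
+                    "tokenizer" : "my_path_tokenizer"
+                }
+            },
+            "tokenizer" : {
+                "my_path_tokenizer" : {
+                    "type" : "path_hierarchy",
+                    "delimiter" : "/",
+                    "reverse" : false
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+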
diff --git a/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc
new file mode 100644
index 0000000..72ca604
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc
@@ -0,0 +1,29 @@
+[[analysis-pattern-tokenizer]]
+=== Pattern Tokenizer
+
+A tokenizer of type `pattern` that can flexibly separate text into terms
+via a regular expression. Accepts the following settings:
+
+[cols="<,<",options="header",]
+|======================================================================
+|Setting |Description
+|`pattern` |The regular expression pattern, defaults to `\\W+`.
+|`flags` |The regular expression flags.
+|`group` |Which group to extract into tokens. Defaults to `-1` (split).
+|======================================================================
+
+*IMPORTANT*: The regular expression should match the *token separators*,
+not the tokens themselves.
+
+`group` set to `-1` (the default) is equivalent to "split". Using group
+>= 0 selects the matching group as the token. For example, if you have:
+
+------------------------
+pattern = \\'([^\']+)\\'
+group = 0
+input = aaa 'bbb' 'ccc'
+------------------------
+
+the output will be two tokens: 'bbb' and 'ccc' (including the ' marks).
+With the same input but using group=1, the output would be: bbb and ccc
+(no ' marks).
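+
+For instance, a tokenizer that splits comma separated values could be
+sketched as follows (the names are illustrative); the pattern matches
+the separators, so the tokens are the values between the commas:
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "tokenizer" : {
+                "my_csv_tokenizer" : {
+                    "type" : "pattern",
+                    "pattern" : ","
+                }
+            },
+            "analyzer" : {
+                "my_csv_analyzer" : {
+                    "tokenizer" : "my_csv_tokenizer"
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------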
diff --git a/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc
new file mode 100644
index 0000000..c8b405b
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc
@@ -0,0 +1,18 @@
+[[analysis-standard-tokenizer]]
+=== Standard Tokenizer
+
+A tokenizer of type `standard`, providing a grammar-based tokenizer that
+works well for most European language documents. The tokenizer
+implements the Unicode Text Segmentation algorithm, as specified in
+http://unicode.org/reports/tr29/[Unicode Standard Annex #29].
+
+The following are settings that can be set for a `standard` tokenizer
+type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`max_token_length` |The maximum token length. If a token is seen that
+exceeds this length then it is discarded. Defaults to `255`.
+|=======================================================================
+
diff --git a/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc
new file mode 100644
index 0000000..9ed28e6
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc
@@ -0,0 +1,16 @@
+[[analysis-uaxurlemail-tokenizer]]
+=== UAX Email URL Tokenizer
+
+A tokenizer of type `uax_url_email` which works exactly like the
+`standard` tokenizer, but tokenizes emails and URLs as single tokens.
+
+The following are settings that can be set for a `uax_url_email`
+tokenizer type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`max_token_length` |The maximum token length. If a token is seen that
+exceeds this length then it is discarded. Defaults to `255`.
+|=======================================================================
+
diff --git a/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc
new file mode 100644
index 0000000..f0e1ce2
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc
@@ -0,0 +1,4 @@
+[[analysis-whitespace-tokenizer]]
+=== Whitespace Tokenizer
+
+A tokenizer of type `whitespace` that divides text at whitespace.