/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.suggest.analyzing;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.*;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
/**
* Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
* based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
* you can explicitly choose classic Levenshtein by passing false
* for the transpositions
parameter.
*
* At most, this query will match terms up to * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} * edits. Higher distances are not supported. Note that the * fuzzy distance is measured in "byte space" on the bytes * returned by the {@link org.apache.lucene.analysis.TokenStream}'s {@link * org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute}, usually UTF8. By default * the analyzed bytes must be at least 3 {@link * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are * considered. Furthermore, the first 1 {@link * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be * edited. We allow up to 1 (@link * #DEFAULT_MAX_EDITS} edit. * If {@link #unicodeAware} parameter in the constructor is set to true, maxEdits, * minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code * points (actual letters) instead of bytes.* * *
* NOTE: This suggester does not boost suggestions that * required no edits over suggestions that did require * edits. This is a known limitation. * *
* Note: complex query analyzers can have a significant impact on the lookup * performance. It's recommended to not use analyzers that drop or inject terms * like synonyms to keep the complexity of the prefix intersection low for good * lookup performance. At index time, complex analyzers can safely be used. *
* * @lucene.experimental */ public final class XFuzzySuggester extends XAnalyzingSuggester { private final int maxEdits; private final boolean transpositions; private final int nonFuzzyPrefix; private final int minFuzzyLength; private final boolean unicodeAware; /** * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix * parameters in Unicode code points (actual letters) * instead of bytes. */ public static final boolean DEFAULT_UNICODE_AWARE = false; /** * The default minimum length of the key passed to {@link * #lookup} before any edits are allowed. */ public static final int DEFAULT_MIN_FUZZY_LENGTH = 3; /** * The default prefix length where edits are not allowed. */ public static final int DEFAULT_NON_FUZZY_PREFIX = 1; /** * The default maximum number of edits for fuzzy * suggestions. */ public static final int DEFAULT_MAX_EDITS = 1; /** * The default transposition value passed to {@link org.apache.lucene.util.automaton.LevenshteinAutomata} */ public static final boolean DEFAULT_TRANSPOSITIONS = true; /** * Creates a {@link FuzzySuggester} instance initialized with default values. * * @param analyzer the analyzer used for this suggester */ public XFuzzySuggester(Analyzer analyzer) { this(analyzer, analyzer); } /** * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values. * * @param indexAnalyzer * Analyzer that will be used for analyzing suggestions while building the index. * @param queryAnalyzer * Analyzer that will be used for analyzing query text during lookup */ public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER); } /** * Creates a {@link FuzzySuggester} instance. * * @param indexAnalyzer Analyzer that will be used for * analyzing suggestions while building the index. * @param queryAnalyzer Analyzer that will be used for * analyzing query text during lookup * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} * @param maxSurfaceFormsPerAnalyzedForm Maximum number of * surface forms to keep for a single analyzed form. * When there are too many surface forms we discard the * lowest weighted ones. * @param maxGraphExpansions Maximum number of graph paths * to expand from the analyzed form. Set this to -1 for * no limit. * @param maxEdits must be >= 0 and <= {@link org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} . * @param transpositionstrue
if transpositions should be treated as a primitive
* edit operation. If this is false, comparisons will implement the classic
* Levenshtein algorithm.
* @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
* @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
* @param sepLabel separation label
* @param payloadSep payload separator byte
* @param endByte end byte marker byte
*/
public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware,
FST