Diffstat (limited to 'src/main/java/org/apache/lucene')
35 files changed, 7347 insertions, 0 deletions
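The first files in this diff are small Lucene analysis extensions (CustomAnalyzerWrapper, TruncateTokenFilter, UniqueTokenFilter). As a rough illustration of how the two token filters added below might be chained inside a custom Analyzer, consider the following sketch; the analyzer class name, the whitespace tokenizer, and the truncation length of 4 are illustrative assumptions, not part of the patch.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
import org.apache.lucene.util.Version;

import java.io.Reader;

/** Illustrative analyzer wiring together the two filters added below; not part of the patch. */
public class TruncatedUniqueAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_46, reader);
        TokenStream result = new TruncateTokenFilter(source, 4); // keep at most 4 characters per token
        result = new UniqueTokenFilter(result);                  // drop tokens already emitted for this stream
        return new TokenStreamComponents(source, result);
    }
}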
diff --git a/src/main/java/org/apache/lucene/analysis/CustomAnalyzerWrapper.java b/src/main/java/org/apache/lucene/analysis/CustomAnalyzerWrapper.java new file mode 100644 index 0000000..6ea95e4 --- /dev/null +++ b/src/main/java/org/apache/lucene/analysis/CustomAnalyzerWrapper.java @@ -0,0 +1,77 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.analysis; + +import java.io.Reader; + +/** + * Similar to Lucene {@link AnalyzerWrapper} but actually allows to set the reuse strategy.... + * //TODO add to lucene the ability to set it... + */ +public abstract class CustomAnalyzerWrapper extends Analyzer { + + /** + * Creates a new CustomAnalyzerWrapper. Since the {@link Analyzer.ReuseStrategy} of + * the wrapped Analyzers are unknown, {@link Analyzer.PerFieldReuseStrategy} is assumed + */ + protected CustomAnalyzerWrapper(ReuseStrategy reuseStrategy) { + super(reuseStrategy); + } + + /** + * Retrieves the wrapped Analyzer appropriate for analyzing the field with + * the given name + * + * @param fieldName Name of the field which is to be analyzed + * @return Analyzer for the field with the given name. Assumed to be non-null + */ + protected abstract Analyzer getWrappedAnalyzer(String fieldName); + + /** + * Wraps / alters the given TokenStreamComponents, taken from the wrapped + * Analyzer, to form new components. It is through this method that new + * TokenFilters can be added by AnalyzerWrappers. + * + * @param fieldName Name of the field which is to be analyzed + * @param components TokenStreamComponents taken from the wrapped Analyzer + * @return Wrapped / altered TokenStreamComponents. 
+ */ + protected abstract TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components); + + @Override + protected final TokenStreamComponents createComponents(String fieldName, Reader aReader) { + return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName, aReader)); + } + + @Override + public int getPositionIncrementGap(String fieldName) { + return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName); + } + + @Override + public int getOffsetGap(String fieldName) { + return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName); + } + + @Override + public final Reader initReader(String fieldName, Reader reader) { + return getWrappedAnalyzer(fieldName).initReader(fieldName, reader); + } +} diff --git a/src/main/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java b/src/main/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java new file mode 100644 index 0000000..ea93850 --- /dev/null +++ b/src/main/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java @@ -0,0 +1,56 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +import java.io.IOException; + +/** + * A token filter that truncates tokens. + */ +public class TruncateTokenFilter extends TokenFilter { + + private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + + private final int size; + + public TruncateTokenFilter(TokenStream in, int size) { + super(in); + this.size = size; + } + + @Override + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final int length = termAttribute.length(); + if (length > size) { + termAttribute.setLength(size); + } + return true; + } else { + return false; + } + } +} + + diff --git a/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java b/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java new file mode 100644 index 0000000..4ccad58 --- /dev/null +++ b/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java @@ -0,0 +1,90 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.Version; + +import java.io.IOException; + +/** + * A token filter that generates unique tokens. Can remove unique tokens only on the same + * position increments as well. + */ +public class UniqueTokenFilter extends TokenFilter { + + private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); + + // use a fixed version, as we don't care about case sensitivity. + private final CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false); + private final boolean onlyOnSamePosition; + + public UniqueTokenFilter(TokenStream in) { + this(in, false); + } + + public UniqueTokenFilter(TokenStream in, boolean onlyOnSamePosition) { + super(in); + this.onlyOnSamePosition = onlyOnSamePosition; + } + + @Override + public final boolean incrementToken() throws IOException { + while (input.incrementToken()) { + final char term[] = termAttribute.buffer(); + final int length = termAttribute.length(); + + boolean duplicate; + if (onlyOnSamePosition) { + final int posIncrement = posIncAttribute.getPositionIncrement(); + if (posIncrement > 0) { + previous.clear(); + } + + duplicate = (posIncrement == 0 && previous.contains(term, 0, length)); + } else { + duplicate = previous.contains(term, 0, length); + } + + // clone the term, and add to the set of seen terms. + char saved[] = new char[length]; + System.arraycopy(term, 0, saved, 0, length); + previous.add(saved); + + if (!duplicate) { + return true; + } + } + return false; + } + + @Override + public final void reset() throws IOException { + super.reset(); + previous.clear(); + } +} + + diff --git a/src/main/java/org/apache/lucene/index/TrackingConcurrentMergeScheduler.java b/src/main/java/org/apache/lucene/index/TrackingConcurrentMergeScheduler.java new file mode 100644 index 0000000..66b3328 --- /dev/null +++ b/src/main/java/org/apache/lucene/index/TrackingConcurrentMergeScheduler.java @@ -0,0 +1,149 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.index; + +import org.elasticsearch.common.logging.ESLogger; +import org.elasticsearch.common.metrics.CounterMetric; +import org.elasticsearch.common.metrics.MeanMetric; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.common.util.concurrent.ConcurrentCollections; +import org.elasticsearch.index.merge.OnGoingMerge; + +import java.io.IOException; +import java.util.Collections; +import java.util.Set; + +/** + * An extension to the {@link ConcurrentMergeScheduler} that provides tracking on merge times, total + * and current merges. + */ +public class TrackingConcurrentMergeScheduler extends ConcurrentMergeScheduler { + + protected final ESLogger logger; + + private final MeanMetric totalMerges = new MeanMetric(); + private final CounterMetric totalMergesNumDocs = new CounterMetric(); + private final CounterMetric totalMergesSizeInBytes = new CounterMetric(); + private final CounterMetric currentMerges = new CounterMetric(); + private final CounterMetric currentMergesNumDocs = new CounterMetric(); + private final CounterMetric currentMergesSizeInBytes = new CounterMetric(); + + private final Set<OnGoingMerge> onGoingMerges = ConcurrentCollections.newConcurrentSet(); + private final Set<OnGoingMerge> readOnlyOnGoingMerges = Collections.unmodifiableSet(onGoingMerges); + + public TrackingConcurrentMergeScheduler(ESLogger logger) { + super(); + this.logger = logger; + } + + public long totalMerges() { + return totalMerges.count(); + } + + public long totalMergeTime() { + return totalMerges.sum(); + } + + public long totalMergeNumDocs() { + return totalMergesNumDocs.count(); + } + + public long totalMergeSizeInBytes() { + return totalMergesSizeInBytes.count(); + } + + public long currentMerges() { + return currentMerges.count(); + } + + public long currentMergesNumDocs() { + return currentMergesNumDocs.count(); + } + + public long currentMergesSizeInBytes() { + return currentMergesSizeInBytes.count(); + } + + public Set<OnGoingMerge> onGoingMerges() { + return readOnlyOnGoingMerges; + } + + @Override + protected void doMerge(MergePolicy.OneMerge merge) throws IOException { + int totalNumDocs = merge.totalNumDocs(); + // don't used #totalBytesSize() since need to be executed under IW lock, might be fixed in future Lucene version + long totalSizeInBytes = merge.estimatedMergeBytes; + long time = System.currentTimeMillis(); + currentMerges.inc(); + currentMergesNumDocs.inc(totalNumDocs); + currentMergesSizeInBytes.inc(totalSizeInBytes); + + OnGoingMerge onGoingMerge = new OnGoingMerge(merge); + onGoingMerges.add(onGoingMerge); + + if (logger.isTraceEnabled()) { + logger.trace("merge [{}] starting..., merging [{}] segments, [{}] docs, [{}] size, into [{}] estimated_size", merge.info == null ? 
"_na_" : merge.info.info.name, merge.segments.size(), totalNumDocs, new ByteSizeValue(totalSizeInBytes), new ByteSizeValue(merge.estimatedMergeBytes)); + } + try { + beforeMerge(onGoingMerge); + super.doMerge(merge); + } finally { + long took = System.currentTimeMillis() - time; + + onGoingMerges.remove(onGoingMerge); + afterMerge(onGoingMerge); + + currentMerges.dec(); + currentMergesNumDocs.dec(totalNumDocs); + currentMergesSizeInBytes.dec(totalSizeInBytes); + + totalMergesNumDocs.inc(totalNumDocs); + totalMergesSizeInBytes.inc(totalSizeInBytes); + totalMerges.inc(took); + if (took > 20000) { // if more than 20 seconds, DEBUG log it + logger.debug("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took)); + } else if (logger.isTraceEnabled()) { + logger.trace("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took)); + } + } + } + + /** + * A callback allowing for custom logic before an actual merge starts. + */ + protected void beforeMerge(OnGoingMerge merge) { + + } + + /** + * A callback allowing for custom logic before an actual merge starts. + */ + protected void afterMerge(OnGoingMerge merge) { + + } + + @Override + public MergeScheduler clone() { + // Lucene IW makes a clone internally but since we hold on to this instance + // the clone will just be the identity. + return this; + } +}
\ No newline at end of file diff --git a/src/main/java/org/apache/lucene/index/TrackingSerialMergeScheduler.java b/src/main/java/org/apache/lucene/index/TrackingSerialMergeScheduler.java new file mode 100644 index 0000000..bd3ed9d --- /dev/null +++ b/src/main/java/org/apache/lucene/index/TrackingSerialMergeScheduler.java @@ -0,0 +1,165 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.index; + +import org.elasticsearch.common.logging.ESLogger; +import org.elasticsearch.common.metrics.CounterMetric; +import org.elasticsearch.common.metrics.MeanMetric; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.common.util.concurrent.ConcurrentCollections; +import org.elasticsearch.index.merge.OnGoingMerge; + +import java.io.IOException; +import java.util.Collections; +import java.util.Set; + +// LUCENE MONITOR - Copied from SerialMergeScheduler +public class TrackingSerialMergeScheduler extends MergeScheduler { + + protected final ESLogger logger; + + private final MeanMetric totalMerges = new MeanMetric(); + private final CounterMetric totalMergesNumDocs = new CounterMetric(); + private final CounterMetric totalMergesSizeInBytes = new CounterMetric(); + private final CounterMetric currentMerges = new CounterMetric(); + private final CounterMetric currentMergesNumDocs = new CounterMetric(); + private final CounterMetric currentMergesSizeInBytes = new CounterMetric(); + + private final Set<OnGoingMerge> onGoingMerges = ConcurrentCollections.newConcurrentSet(); + private final Set<OnGoingMerge> readOnlyOnGoingMerges = Collections.unmodifiableSet(onGoingMerges); + + public TrackingSerialMergeScheduler(ESLogger logger) { + this.logger = logger; + } + + public long totalMerges() { + return totalMerges.count(); + } + + public long totalMergeTime() { + return totalMerges.sum(); + } + + public long totalMergeNumDocs() { + return totalMergesNumDocs.count(); + } + + public long totalMergeSizeInBytes() { + return totalMergesSizeInBytes.count(); + } + + public long currentMerges() { + return currentMerges.count(); + } + + public long currentMergesNumDocs() { + return currentMergesNumDocs.count(); + } + + public long currentMergesSizeInBytes() { + return currentMergesSizeInBytes.count(); + } + + public Set<OnGoingMerge> onGoingMerges() { + return readOnlyOnGoingMerges; + } + + /** + * Just do the merges in sequence. We do this + * "synchronized" so that even if the application is using + * multiple threads, only one merge may run at a time. 
+ */ + @Override + synchronized public void merge(IndexWriter writer) throws CorruptIndexException, IOException { + while (true) { + MergePolicy.OneMerge merge = writer.getNextMerge(); + if (merge == null) + break; + + // different from serial merge, call mergeInit here so we get the correct stats + // mergeInit can be called several times without side affects (checks on merge.info not being null) + writer.mergeInit(merge); + + int totalNumDocs = merge.totalNumDocs(); + // don't used #totalBytesSize() since need to be executed under IW lock, might be fixed in future Lucene version + long totalSizeInBytes = merge.estimatedMergeBytes; + long time = System.currentTimeMillis(); + currentMerges.inc(); + currentMergesNumDocs.inc(totalNumDocs); + currentMergesSizeInBytes.inc(totalSizeInBytes); + + OnGoingMerge onGoingMerge = new OnGoingMerge(merge); + onGoingMerges.add(onGoingMerge); + + // sadly, segment name is not available since mergeInit is called from merge itself... + if (logger.isTraceEnabled()) { + logger.trace("merge [{}] starting..., merging [{}] segments, [{}] docs, [{}] size, into [{}] estimated_size", merge.info == null ? "_na_" : merge.info.info.name, merge.segments.size(), totalNumDocs, new ByteSizeValue(totalSizeInBytes), new ByteSizeValue(merge.estimatedMergeBytes)); + } + try { + beforeMerge(onGoingMerge); + writer.merge(merge); + } finally { + long took = System.currentTimeMillis() - time; + + onGoingMerges.remove(onGoingMerge); + afterMerge(onGoingMerge); + + currentMerges.dec(); + currentMergesNumDocs.dec(totalNumDocs); + currentMergesSizeInBytes.dec(totalSizeInBytes); + + totalMergesNumDocs.inc(totalNumDocs); + totalMergesSizeInBytes.inc(totalSizeInBytes); + totalMerges.inc(took); + if (took > 20000) { // if more than 20 seconds, DEBUG log it + logger.debug("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took)); + } else if (logger.isTraceEnabled()) { + logger.trace("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took)); + } + } + } + } + + /** + * A callback allowing for custom logic before an actual merge starts. + */ + protected void beforeMerge(OnGoingMerge merge) { + + } + + /** + * A callback allowing for custom logic before an actual merge starts. + */ + protected void afterMerge(OnGoingMerge merge) { + + } + + @Override + public void close() { + } + + @Override + public MergeScheduler clone() { + // Lucene IW makes a clone internally but since we hold on to this instance + // the clone will just be the identity. + return this; + } +}
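Both tracking schedulers above expose the same beforeMerge/afterMerge callbacks and metric accessors. A minimal sketch of overriding the hooks follows, shown against the concurrent variant; the subclass name is hypothetical and not part of the patch. The scheduler would then be handed to Lucene through IndexWriterConfig#setMergeScheduler; in Elasticsearch itself that wiring happens inside the index's merge scheduler provider rather than in user code.

import org.apache.lucene.index.TrackingConcurrentMergeScheduler;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.index.merge.OnGoingMerge;

/** Hypothetical subclass demonstrating the merge callbacks; not part of the patch. */
public class LoggingTrackingMergeScheduler extends TrackingConcurrentMergeScheduler {

    public LoggingTrackingMergeScheduler(ESLogger logger) {
        super(logger);
    }

    @Override
    protected void beforeMerge(OnGoingMerge merge) {
        // invoked just before the merge runs; the current-merge counters are already incremented
        logger.debug("merge starting, [{}] merge(s) now in flight", currentMerges());
    }

    @Override
    protected void afterMerge(OnGoingMerge merge) {
        // invoked from the finally block once the merge finished (or failed);
        // the current-merge counters are decremented right after this callback returns
        logger.debug("merge done, [{}] merge(s) still in flight", currentMerges());
    }
}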
\ No newline at end of file diff --git a/src/main/java/org/apache/lucene/index/memory/ExtendedMemoryIndex.java b/src/main/java/org/apache/lucene/index/memory/ExtendedMemoryIndex.java new file mode 100644 index 0000000..5f99fce --- /dev/null +++ b/src/main/java/org/apache/lucene/index/memory/ExtendedMemoryIndex.java @@ -0,0 +1,31 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.index.memory; + +/** + * This class overwrites {@link MemoryIndex} to make the reuse constructor visible. + */ +public final class ExtendedMemoryIndex extends MemoryIndex { + + public ExtendedMemoryIndex(boolean storeOffsets, long maxReusedBytes) { + super(storeOffsets, maxReusedBytes); + } + +} diff --git a/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java b/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java new file mode 100644 index 0000000..e1dcf58 --- /dev/null +++ b/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java @@ -0,0 +1,170 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.queries; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.elasticsearch.common.lucene.search.Queries; +import org.elasticsearch.index.mapper.FieldMapper; + +import java.io.IOException; + +/** + * Extended version of {@link CommonTermsQuery} that allows to pass in a + * <tt>minimumNumberShouldMatch</tt> specification that uses the actual num of high frequent terms + * to calculate the minimum matching terms. 
+ */ +public class ExtendedCommonTermsQuery extends CommonTermsQuery { + + private final FieldMapper<?> mapper; + + public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency, boolean disableCoord, FieldMapper<?> mapper) { + super(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoord); + this.mapper = mapper; + } + + private String lowFreqMinNumShouldMatchSpec; + private String highFreqMinNumShouldMatchSpec; + + @Override + protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) { + return calcMinimumNumberShouldMatch(lowFreqMinNumShouldMatchSpec, numOptional); + } + + protected int calcMinimumNumberShouldMatch(String spec, int numOptional) { + if (spec == null) { + return 0; + } + return Queries.calculateMinShouldMatch(numOptional, spec); + } + + @Override + protected int calcHighFreqMinimumNumberShouldMatch(int numOptional) { + return calcMinimumNumberShouldMatch(highFreqMinNumShouldMatchSpec, numOptional); + } + + public void setHighFreqMinimumNumberShouldMatch(String spec) { + this.highFreqMinNumShouldMatchSpec = spec; + } + + public String getHighFreqMinimumNumberShouldMatchSpec() { + return highFreqMinNumShouldMatchSpec; + } + + public void setLowFreqMinimumNumberShouldMatch(String spec) { + this.lowFreqMinNumShouldMatchSpec = spec; + } + + public String getLowFreqMinimumNumberShouldMatchSpec() { + return lowFreqMinNumShouldMatchSpec; + } + + // LUCENE-UPGRADE: remove this method if on 4.8 + @Override + public Query rewrite(IndexReader reader) throws IOException { + if (this.terms.isEmpty()) { + return new BooleanQuery(); + } else if (this.terms.size() == 1) { + final Query tq = newTermQuery(this.terms.get(0), null); + tq.setBoost(getBoost()); + return tq; + } + return super.rewrite(reader); + } + + // LUCENE-UPGRADE: remove this method if on 4.8 + @Override + protected Query buildQuery(final int maxDoc, + final TermContext[] contextArray, final Term[] queryTerms) { + BooleanQuery lowFreq = new BooleanQuery(disableCoord); + BooleanQuery highFreq = new BooleanQuery(disableCoord); + highFreq.setBoost(highFreqBoost); + lowFreq.setBoost(lowFreqBoost); + BooleanQuery query = new BooleanQuery(true); + for (int i = 0; i < queryTerms.length; i++) { + TermContext termContext = contextArray[i]; + if (termContext == null) { + lowFreq.add(newTermQuery(queryTerms[i], null), lowFreqOccur); + } else { + if ((maxTermFrequency >= 1f && termContext.docFreq() > maxTermFrequency) + || (termContext.docFreq() > (int) Math.ceil(maxTermFrequency * (float) maxDoc))) { + highFreq.add(newTermQuery(queryTerms[i], termContext), highFreqOccur); + } else { + lowFreq.add(newTermQuery(queryTerms[i], termContext), lowFreqOccur); + } + } + + } + final int numLowFreqClauses = lowFreq.clauses().size(); + final int numHighFreqClauses = highFreq.clauses().size(); + if (lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) { + int minMustMatch = calcLowFreqMinimumNumberShouldMatch(numLowFreqClauses); + lowFreq.setMinimumNumberShouldMatch(minMustMatch); + } + if (highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) { + int minMustMatch = calcHighFreqMinimumNumberShouldMatch(numHighFreqClauses); + highFreq.setMinimumNumberShouldMatch(minMustMatch); + } + if (lowFreq.clauses().isEmpty()) { + /* + * if lowFreq is empty we rewrite the high freq terms in a conjunction to + * prevent slow queries. 
+ */ + if (highFreq.getMinimumNumberShouldMatch() == 0 && highFreqOccur != Occur.MUST) { + for (BooleanClause booleanClause : highFreq) { + booleanClause.setOccur(Occur.MUST); + } + } + highFreq.setBoost(getBoost()); + return highFreq; + } else if (highFreq.clauses().isEmpty()) { + // only do low freq terms - we don't have high freq terms + lowFreq.setBoost(getBoost()); + return lowFreq; + } else { + query.add(highFreq, Occur.SHOULD); + query.add(lowFreq, Occur.MUST); + query.setBoost(getBoost()); + return query; + } + } + + //@Override + // LUCENE-UPGRADE: remove this method if on 4.8 + protected Query newTermQuery(Term term, TermContext context) { + if (mapper == null) { + // this should be super.newTermQuery(term, context) once it's available in the super class + return context == null ? new TermQuery(term) : new TermQuery(term, context); + } + final Query query = mapper.queryStringTermQuery(term); + if (query == null) { + // this should be super.newTermQuery(term, context) once it's available in the super class + return context == null ? new TermQuery(term) : new TermQuery(term, context); + } else { + return query; + } + } +} diff --git a/src/main/java/org/apache/lucene/queries/XTermsFilter.java b/src/main/java/org/apache/lucene/queries/XTermsFilter.java new file mode 100644 index 0000000..ea11f03 --- /dev/null +++ b/src/main/java/org/apache/lucene/queries/XTermsFilter.java @@ -0,0 +1,328 @@ +package org.apache.lucene.queries; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.*; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Filter; +import org.apache.lucene.util.*; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +/** + * Constructs a filter for docs matching any of the terms added to this class. + * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in + * a sequence. An example might be a collection of primary keys from a database query result or perhaps + * a choice of "category" labels picked by the end user. As a filter, this is much faster than the + * equivalent query (a BooleanQuery with many "should" TermQueries) + */ +public final class XTermsFilter extends Filter { + + static { + assert Version.LUCENE_46 == org.elasticsearch.Version.CURRENT.luceneVersion : "Remove this once we are on LUCENE_48 - see LUCENE-5502"; + } + + /* + * this class is often used for large number of terms in a single field. 
+ * to optimize for this case and to be filter-cache friendly we + * serialize all terms into a single byte array and store offsets + * in a parallel array to keep the # of object constant and speed up + * equals / hashcode. + * + * This adds quite a bit of complexity but allows large term filters to + * be efficient for GC and cache-lookups + */ + private final int[] offsets; + private final byte[] termsBytes; + private final TermsAndField[] termsAndFields; + private final int hashCode; // cached hashcode for fast cache lookups + private static final int PRIME = 31; + + /** + * Creates a new {@link XTermsFilter} from the given list. The list + * can contain duplicate terms and multiple fields. + */ + public XTermsFilter(final List<Term> terms) { + this(new FieldAndTermEnum() { + // we need to sort for deduplication and to have a common cache key + final Iterator<Term> iter = sort(terms).iterator(); + @Override + public BytesRef next() { + if (iter.hasNext()) { + Term next = iter.next(); + field = next.field(); + return next.bytes(); + } + return null; + }}, terms.size()); + } + + /** + * Creates a new {@link XTermsFilter} from the given {@link BytesRef} list for + * a single field. + */ + public XTermsFilter(final String field, final List<BytesRef> terms) { + this(new FieldAndTermEnum(field) { + // we need to sort for deduplication and to have a common cache key + final Iterator<BytesRef> iter = sort(terms).iterator(); + @Override + public BytesRef next() { + if (iter.hasNext()) { + return iter.next(); + } + return null; + } + }, terms.size()); + } + + /** + * Creates a new {@link XTermsFilter} from the given {@link BytesRef} array for + * a single field. + */ + public XTermsFilter(final String field, final BytesRef...terms) { + // this ctor prevents unnecessary Term creations + this(field, Arrays.asList(terms)); + } + + /** + * Creates a new {@link XTermsFilter} from the given array. The array can + * contain duplicate terms and multiple fields. + */ + public XTermsFilter(final Term... terms) { + this(Arrays.asList(terms)); + } + + + private XTermsFilter(FieldAndTermEnum iter, int length) { + // TODO: maybe use oal.index.PrefixCodedTerms instead? + // If number of terms is more than a few hundred it + // should be a win + + // TODO: we also pack terms in FieldCache/DocValues + // ... maybe we can refactor to share that code + + // TODO: yet another option is to build the union of the terms in + // an automaton an call intersect on the termsenum if the density is high + + int hash = 9; + byte[] serializedTerms = new byte[0]; + this.offsets = new int[length+1]; + int lastEndOffset = 0; + int index = 0; + ArrayList<TermsAndField> termsAndFields = new ArrayList<TermsAndField>(); + TermsAndField lastTermsAndField = null; + BytesRef previousTerm = null; + String previousField = null; + BytesRef currentTerm; + String currentField; + while((currentTerm = iter.next()) != null) { + currentField = iter.field(); + if (currentField == null) { + throw new IllegalArgumentException("Field must not be null"); + } + if (previousField != null) { + // deduplicate + if (previousField.equals(currentField)) { + if (previousTerm.bytesEquals(currentTerm)){ + continue; + } + } else { + final int start = lastTermsAndField == null ? 
0 : lastTermsAndField.end; + lastTermsAndField = new TermsAndField(start, index, previousField); + termsAndFields.add(lastTermsAndField); + } + } + hash = PRIME * hash + currentField.hashCode(); + hash = PRIME * hash + currentTerm.hashCode(); + if (serializedTerms.length < lastEndOffset+currentTerm.length) { + serializedTerms = ArrayUtil.grow(serializedTerms, lastEndOffset+currentTerm.length); + } + System.arraycopy(currentTerm.bytes, currentTerm.offset, serializedTerms, lastEndOffset, currentTerm.length); + offsets[index] = lastEndOffset; + lastEndOffset += currentTerm.length; + index++; + previousTerm = currentTerm; + previousField = currentField; + } + offsets[index] = lastEndOffset; + final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end; + lastTermsAndField = new TermsAndField(start, index, previousField); + termsAndFields.add(lastTermsAndField); + this.termsBytes = ArrayUtil.shrink(serializedTerms, lastEndOffset); + this.termsAndFields = termsAndFields.toArray(new TermsAndField[termsAndFields.size()]); + this.hashCode = hash; + + } + + + @Override + public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException { + final AtomicReader reader = context.reader(); + FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time + final Fields fields = reader.fields(); + final BytesRef spare = new BytesRef(this.termsBytes); + if (fields == null) { + return result; + } + Terms terms = null; + TermsEnum termsEnum = null; + DocsEnum docs = null; + for (TermsAndField termsAndField : this.termsAndFields) { + if ((terms = fields.terms(termsAndField.field)) != null) { + termsEnum = terms.iterator(termsEnum); // this won't return null + for (int i = termsAndField.start; i < termsAndField.end; i++) { + spare.offset = offsets[i]; + spare.length = offsets[i+1] - offsets[i]; + if (termsEnum.seekExact(spare)) { + docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE); // no freq since we don't need them + if (result == null) { + if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + result = new FixedBitSet(reader.maxDoc()); + // lazy init but don't do it in the hot loop since we could read many docs + result.set(docs.docID()); + } + } + while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + result.set(docs.docID()); + } + } + } + } + } + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if ((obj == null) || (obj.getClass() != this.getClass())) { + return false; + } + + XTermsFilter test = (XTermsFilter) obj; + // first check the fields before even comparing the bytes + if (test.hashCode == hashCode && Arrays.equals(termsAndFields, test.termsAndFields)) { + int lastOffset = termsAndFields[termsAndFields.length - 1].end; + // compare offsets since we sort they must be identical + if (ArrayUtil.equals(offsets, 0, test.offsets, 0, lastOffset + 1)) { + // straight byte comparison since we sort they must be identical + return ArrayUtil.equals(termsBytes, 0, test.termsBytes, 0, offsets[lastOffset]); + } + } + return false; + } + + @Override + public int hashCode() { + return hashCode; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + BytesRef spare = new BytesRef(termsBytes); + boolean first = true; + for (int i = 0; i < termsAndFields.length; i++) { + TermsAndField current = termsAndFields[i]; + for (int j = current.start; j < current.end; j++) { + spare.offset = offsets[j]; + spare.length = offsets[j+1] - 
offsets[j]; + if (!first) { + builder.append(' '); + } + first = false; + builder.append(current.field).append(':'); + builder.append(spare.utf8ToString()); + } + } + + return builder.toString(); + } + + private static final class TermsAndField { + final int start; + final int end; + final String field; + + + TermsAndField(int start, int end, String field) { + super(); + this.start = start; + this.end = end; + this.field = field; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((field == null) ? 0 : field.hashCode()); + result = prime * result + end; + result = prime * result + start; + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; + TermsAndField other = (TermsAndField) obj; + if (field == null) { + if (other.field != null) return false; + } else if (!field.equals(other.field)) return false; + if (end != other.end) return false; + if (start != other.start) return false; + return true; + } + + } + + private static abstract class FieldAndTermEnum { + protected String field; + + public abstract BytesRef next(); + + public FieldAndTermEnum() {} + + public FieldAndTermEnum(String field) { this.field = field; } + + public String field() { + return field; + } + } + + /* + * simple utility that returns the in-place sorted list + */ + private static <T extends Comparable<? super T>> List<T> sort(List<T> toSort) { + if (toSort.isEmpty()) { + throw new IllegalArgumentException("no terms provided"); + } + Collections.sort(toSort); + return toSort; + } +} diff --git a/src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java b/src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java new file mode 100644 index 0000000..91bc21d --- /dev/null +++ b/src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java @@ -0,0 +1,521 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.queryparser; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; +import org.apache.lucene.util.QueryBuilder; +import org.apache.lucene.util.Version; +import org.elasticsearch.common.lucene.Lucene; + +import java.util.Collections; +import java.util.Map; + +/** + * XSimpleQueryParser is used to parse human readable query syntax. + * <p> + * The main idea behind this parser is that a person should be able to type + * whatever they want to represent a query, and this parser will do its best + * to interpret what to search for no matter how poorly composed the request + * may be. 
Tokens are considered to be any of a term, phrase, or subquery for the + * operations described below. Whitespace including ' ' '\n' '\r' and '\t' + * and certain operators may be used to delimit tokens ( ) + | " . + * <p> + * Any errors in query syntax will be ignored and the parser will attempt + * to decipher what it can; however, this may mean odd or unexpected results. + * <h4>Query Operators</h4> + * <ul> + * <li>'{@code +}' specifies {@code AND} operation: <tt>token1+token2</tt> + * <li>'{@code |}' specifies {@code OR} operation: <tt>token1|token2</tt> + * <li>'{@code -}' negates a single token: <tt>-token0</tt> + * <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt> + * <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt> + * <li>'{@code (}' and '{@code )}' specifies precedence: <tt>token1 + (token2 | token3)</tt> + * </ul> + * <p> + * The {@link #setDefaultOperator default operator} is {@code OR} if no other operator is specified. + * For example, the following will {@code OR} {@code token1} and {@code token2} together: + * <tt>token1 token2</tt> + * <p> + * Normal operator precedence will be simple order from right to left. + * For example, the following will evaluate {@code token1 OR token2} first, + * then {@code AND} with {@code token3}: + * <blockquote>token1 | token2 + token3</blockquote> + * <h4>Escaping</h4> + * <p> + * An individual term may contain any possible character with certain characters + * requiring escaping using a '{@code \}'. The following characters will need to be escaped in + * terms and phrases: + * {@code + | " ( ) ' \} + * <p> + * The '{@code -}' operator is a special case. On individual terms (not phrases) the first + * character of a term that is {@code -} must be escaped; however, any '{@code -}' characters + * beyond the first character do not need to be escaped. + * For example: + * <ul> + * <li>{@code -term1} -- Specifies {@code NOT} operation against {@code term1} + * <li>{@code \-term1} -- Searches for the term {@code -term1}. + * <li>{@code term-1} -- Searches for the term {@code term-1}. + * <li>{@code term\-1} -- Searches for the term {@code term-1}. + * </ul> + * <p> + * The '{@code *}' operator is a special case. On individual terms (not phrases) the last + * character of a term that is '{@code *}' must be escaped; however, any '{@code *}' characters + * before the last character do not need to be escaped: + * <ul> + * <li>{@code term1*} -- Searches for the prefix {@code term1} + * <li>{@code term1\*} -- Searches for the term {@code term1*} + * <li>{@code term*1} -- Searches for the term {@code term*1} + * <li>{@code term\*1} -- Searches for the term {@code term*1} + * </ul> + * <p> + * Note that above examples consider the terms before text processing. 
+ */ +public class XSimpleQueryParser extends QueryBuilder { + + static { + assert Version.LUCENE_46.onOrAfter(Lucene.VERSION) : "Lucene 4.7 adds SimpleQueryParser, remove me!"; + } + + /** Map of fields to query against with their weights */ + protected final Map<String,Float> weights; + /** flags to the parser (to turn features on/off) */ + protected final int flags; + + /** Enables {@code AND} operator (+) */ + public static final int AND_OPERATOR = 1<<0; + /** Enables {@code NOT} operator (-) */ + public static final int NOT_OPERATOR = 1<<1; + /** Enables {@code OR} operator (|) */ + public static final int OR_OPERATOR = 1<<2; + /** Enables {@code PREFIX} operator (*) */ + public static final int PREFIX_OPERATOR = 1<<3; + /** Enables {@code PHRASE} operator (") */ + public static final int PHRASE_OPERATOR = 1<<4; + /** Enables {@code PRECEDENCE} operators: {@code (} and {@code )} */ + public static final int PRECEDENCE_OPERATORS = 1<<5; + /** Enables {@code ESCAPE} operator (\) */ + public static final int ESCAPE_OPERATOR = 1<<6; + /** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */ + public static final int WHITESPACE_OPERATOR = 1<<7; + + private BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD; + + /** Creates a new parser searching over a single field. */ + public XSimpleQueryParser(Analyzer analyzer, String field) { + this(analyzer, Collections.singletonMap(field, 1.0F)); + } + + /** Creates a new parser searching over multiple fields with different weights. */ + public XSimpleQueryParser(Analyzer analyzer, Map<String, Float> weights) { + this(analyzer, weights, -1); + } + + /** Creates a new parser with custom flags used to enable/disable certain features. */ + public XSimpleQueryParser(Analyzer analyzer, Map<String, Float> weights, int flags) { + super(analyzer); + this.weights = weights; + this.flags = flags; + } + + /** Parses the query text and returns parsed query (or null if empty) */ + public Query parse(String queryText) { + char data[] = queryText.toCharArray(); + char buffer[] = new char[data.length]; + + State state = new State(data, buffer, 0, data.length); + parseSubQuery(state); + return state.top; + } + + private void parseSubQuery(State state) { + while (state.index < state.length) { + if (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0) { + // the beginning of a subquery has been found + consumeSubQuery(state); + } else if (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0) { + // this is an extraneous character so it is ignored + ++state.index; + } else if (state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0) { + // the beginning of a phrase has been found + consumePhrase(state); + } else if (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0) { + // an and operation has been explicitly set + // if an operation has already been set this one is ignored + // if a term (or phrase or subquery) has not been found yet the + // operation is also ignored since there is no previous + // term (or phrase or subquery) to and with + if (state.currentOperation == null && state.top != null) { + state.currentOperation = BooleanClause.Occur.MUST; + } + + ++state.index; + } else if (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0) { + // an or operation has been explicitly set + // if an operation has already been set this one is ignored + // if a term (or phrase or subquery) has not been found yet the + // operation is also ignored since there is no previous + // term (or 
phrase or subquery) to or with + if (state.currentOperation == null && state.top != null) { + state.currentOperation = BooleanClause.Occur.SHOULD; + } + + ++state.index; + } else if (state.data[state.index] == '-' && (flags & NOT_OPERATOR) != 0) { + // a not operator has been found, so increase the not count + // two not operators in a row negate each other + ++state.not; + ++state.index; + + // continue so the not operator is not reset + // before the next character is determined + continue; + } else if ((state.data[state.index] == ' ' + || state.data[state.index] == '\t' + || state.data[state.index] == '\n' + || state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0) { + // ignore any whitespace found as it may have already been + // used a delimiter across a term (or phrase or subquery) + // or is simply extraneous + ++state.index; + } else { + // the beginning of a token has been found + consumeToken(state); + } + + // reset the not operator as even whitespace is not allowed when + // specifying the not operation for a term (or phrase or subquery) + state.not = 0; + } + } + + private void consumeSubQuery(State state) { + assert (flags & PRECEDENCE_OPERATORS) != 0; + int start = ++state.index; + int precedence = 1; + boolean escaped = false; + + while (state.index < state.length) { + if (!escaped) { + if (state.data[state.index] == '\\' && (flags & ESCAPE_OPERATOR) != 0) { + // an escape character has been found so + // whatever character is next will become + // part of the subquery unless the escape + // character is the last one in the data + escaped = true; + ++state.index; + + continue; + } else if (state.data[state.index] == '(') { + // increase the precedence as there is a + // subquery in the current subquery + ++precedence; + } else if (state.data[state.index] == ')') { + --precedence; + + if (precedence == 0) { + // this should be the end of the subquery + // all characters found will used for + // creating the subquery + break; + } + } + } + + escaped = false; + ++state.index; + } + + if (state.index == state.length) { + // a closing parenthesis was never found so the opening + // parenthesis is considered extraneous and will be ignored + state.index = start; + } else if (state.index == start) { + // a closing parenthesis was found immediately after the opening + // parenthesis so the current operation is reset since it would + // have been applied to this subquery + state.currentOperation = null; + + ++state.index; + } else { + // a complete subquery has been found and is recursively parsed by + // starting over with a new state object + State subState = new State(state.data, state.buffer, start, state.index); + parseSubQuery(subState); + buildQueryTree(state, subState.top); + + ++state.index; + } + } + + private void consumePhrase(State state) { + assert (flags & PHRASE_OPERATOR) != 0; + int start = ++state.index; + int copied = 0; + boolean escaped = false; + + while (state.index < state.length) { + if (!escaped) { + if (state.data[state.index] == '\\' && (flags & ESCAPE_OPERATOR) != 0) { + // an escape character has been found so + // whatever character is next will become + // part of the phrase unless the escape + // character is the last one in the data + escaped = true; + ++state.index; + + continue; + } else if (state.data[state.index] == '"') { + // this should be the end of the phrase + // all characters found will used for + // creating the phrase query + break; + } + } + + escaped = false; + state.buffer[copied++] = state.data[state.index++]; + } 
+ + if (state.index == state.length) { + // a closing double quote was never found so the opening + // double quote is considered extraneous and will be ignored + state.index = start; + } else if (state.index == start) { + // a closing double quote was found immediately after the opening + // double quote so the current operation is reset since it would + // have been applied to this phrase + state.currentOperation = null; + + ++state.index; + } else { + // a complete phrase has been found and is parsed through + // through the analyzer from the given field + String phrase = new String(state.buffer, 0, copied); + Query branch = newPhraseQuery(phrase); + buildQueryTree(state, branch); + + ++state.index; + } + } + + private void consumeToken(State state) { + int copied = 0; + boolean escaped = false; + boolean prefix = false; + + while (state.index < state.length) { + if (!escaped) { + if (state.data[state.index] == '\\' && (flags & ESCAPE_OPERATOR) != 0) { + // an escape character has been found so + // whatever character is next will become + // part of the term unless the escape + // character is the last one in the data + escaped = true; + prefix = false; + ++state.index; + + continue; + } else if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0) + || (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0) + || (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0) + || (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0) + || (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0) + || ((state.data[state.index] == ' ' + || state.data[state.index] == '\t' + || state.data[state.index] == '\n' + || state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) { + // this should be the end of the term + // all characters found will used for + // creating the term query + break; + } + + // wildcard tracks whether or not the last character + // was a '*' operator that hasn't been escaped + // there must be at least one valid character before + // searching for a prefixed set of terms + prefix = copied > 0 && state.data[state.index] == '*' && (flags & PREFIX_OPERATOR) != 0; + } + + escaped = false; + state.buffer[copied++] = state.data[state.index++]; + } + + if (copied > 0) { + final Query branch; + + if (prefix) { + // if a term is found with a closing '*' it is considered to be a prefix query + // and will have prefix added as an option + String token = new String(state.buffer, 0, copied - 1); + branch = newPrefixQuery(token); + } else { + // a standard term has been found so it will be run through + // the entire analysis chain from the specified schema field + String token = new String(state.buffer, 0, copied); + branch = newDefaultQuery(token); + } + + buildQueryTree(state, branch); + } + } + + // buildQueryTree should be called after a term, phrase, or subquery + // is consumed to be added to our existing query tree + // this method will only add to the existing tree if the branch contained in state is not null + private void buildQueryTree(State state, Query branch) { + if (branch != null) { + // modify our branch to a BooleanQuery wrapper for not + // this is necessary any time a term, phrase, or subquery is negated + if (state.not % 2 == 1) { + BooleanQuery nq = new BooleanQuery(); + nq.add(branch, BooleanClause.Occur.MUST_NOT); + nq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD); + branch = nq; + } + + // first term (or phrase or subquery) found and will begin our query tree + if (state.top == 
null) { + state.top = branch; + } else { + // more than one term (or phrase or subquery) found + // set currentOperation to the default if no other operation is explicitly set + if (state.currentOperation == null) { + state.currentOperation = defaultOperator; + } + + // operational change requiring a new parent node + // this occurs if the previous operation is not the same as current operation + // because the previous operation must be evaluated separately to preserve + // the proper precedence and the current operation will take over as the top of the tree + if (state.previousOperation != state.currentOperation) { + BooleanQuery bq = new BooleanQuery(); + bq.add(state.top, state.currentOperation); + state.top = bq; + } + + // reset all of the state for reuse + ((BooleanQuery)state.top).add(branch, state.currentOperation); + state.previousOperation = state.currentOperation; + } + + // reset the current operation as it was intended to be applied to + // the incoming term (or phrase or subquery) even if branch was null + // due to other possible errors + state.currentOperation = null; + } + } + + /** + * Factory method to generate a standard query (no phrase or prefix operators). + */ + protected Query newDefaultQuery(String text) { + BooleanQuery bq = new BooleanQuery(true); + for (Map.Entry<String,Float> entry : weights.entrySet()) { + Query q = createBooleanQuery(entry.getKey(), text, defaultOperator); + if (q != null) { + q.setBoost(entry.getValue()); + bq.add(q, BooleanClause.Occur.SHOULD); + } + } + return simplify(bq); + } + + /** + * Factory method to generate a phrase query. + */ + protected Query newPhraseQuery(String text) { + BooleanQuery bq = new BooleanQuery(true); + for (Map.Entry<String,Float> entry : weights.entrySet()) { + Query q = createPhraseQuery(entry.getKey(), text); + if (q != null) { + q.setBoost(entry.getValue()); + bq.add(q, BooleanClause.Occur.SHOULD); + } + } + return simplify(bq); + } + + /** + * Factory method to generate a prefix query. + */ + protected Query newPrefixQuery(String text) { + BooleanQuery bq = new BooleanQuery(true); + for (Map.Entry<String,Float> entry : weights.entrySet()) { + PrefixQuery prefix = new PrefixQuery(new Term(entry.getKey(), text)); + prefix.setBoost(entry.getValue()); + bq.add(prefix, BooleanClause.Occur.SHOULD); + } + return simplify(bq); + } + + /** + * Helper to simplify boolean queries with 0 or 1 clause + */ + protected Query simplify(BooleanQuery bq) { + if (bq.clauses().isEmpty()) { + return null; + } else if (bq.clauses().size() == 1) { + return bq.clauses().get(0).getQuery(); + } else { + return bq; + } + } + + /** + * Returns the implicit operator setting, which will be + * either {@code SHOULD} or {@code MUST}. + */ + public BooleanClause.Occur getDefaultOperator() { + return defaultOperator; + } + + /** + * Sets the implicit operator setting, which must be + * either {@code SHOULD} or {@code MUST}. 
+ */ + public void setDefaultOperator(BooleanClause.Occur operator) { + if (operator != BooleanClause.Occur.SHOULD && operator != BooleanClause.Occur.MUST) { + throw new IllegalArgumentException("invalid operator: only SHOULD or MUST are allowed"); + } + this.defaultOperator = operator; + } + + static class State { + final char[] data; // the characters in the query string + final char[] buffer; // a temporary buffer used to reduce necessary allocations + int index; + int length; + + BooleanClause.Occur currentOperation; + BooleanClause.Occur previousOperation; + int not; + + Query top; + + State(char[] data, char[] buffer, int index, int length) { + this.data = data; + this.buffer = buffer; + this.index = index; + this.length = length; + } + } +} + diff --git a/src/main/java/org/apache/lucene/queryparser/classic/ExistsFieldQueryExtension.java b/src/main/java/org/apache/lucene/queryparser/classic/ExistsFieldQueryExtension.java new file mode 100644 index 0000000..470f7b8 --- /dev/null +++ b/src/main/java/org/apache/lucene/queryparser/classic/ExistsFieldQueryExtension.java @@ -0,0 +1,38 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.queryparser.classic; + +import org.apache.lucene.search.Query; +import org.elasticsearch.common.lucene.search.XConstantScoreQuery; +import org.elasticsearch.index.query.ExistsFilterParser; +import org.elasticsearch.index.query.QueryParseContext; + +/** + * + */ +public class ExistsFieldQueryExtension implements FieldQueryExtension { + + public static final String NAME = "_exists_"; + + @Override + public Query query(QueryParseContext parseContext, String queryText) { + return new XConstantScoreQuery(ExistsFilterParser.newFilter(parseContext, queryText, null)); + } +} diff --git a/src/main/java/org/apache/lucene/queryparser/classic/FieldQueryExtension.java b/src/main/java/org/apache/lucene/queryparser/classic/FieldQueryExtension.java new file mode 100644 index 0000000..003ff18 --- /dev/null +++ b/src/main/java/org/apache/lucene/queryparser/classic/FieldQueryExtension.java @@ -0,0 +1,31 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.queryparser.classic; + +import org.apache.lucene.search.Query; +import org.elasticsearch.index.query.QueryParseContext; + +/** + * + */ +public interface FieldQueryExtension { + + Query query(QueryParseContext parseContext, String queryText); +} diff --git a/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java b/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java new file mode 100644 index 0000000..eceada9 --- /dev/null +++ b/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java @@ -0,0 +1,884 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.queryparser.classic; + +import com.google.common.base.Objects; +import com.google.common.collect.ImmutableMap; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; +import org.apache.lucene.util.automaton.RegExp; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.lucene.search.MatchNoDocsQuery; +import org.elasticsearch.common.lucene.search.Queries; +import org.elasticsearch.common.lucene.search.XFilteredQuery; +import org.elasticsearch.common.unit.Fuzziness; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.query.QueryParseContext; +import org.elasticsearch.index.query.support.QueryParsers; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import static org.elasticsearch.common.lucene.search.Queries.fixNegativeQueryIfNeeded; +import static org.elasticsearch.common.lucene.search.Queries.optimizeQuery; +import static org.elasticsearch.index.query.support.QueryParsers.wrapSmartNameQuery; + +/** + * A query parser that uses the {@link MapperService} in order to build smarter + * queries based on the mapping information. + * <p/> + * <p>Also breaks fields with [type].[name] into a boolean query that must include the type + * as well as the query on the name. 
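+ * <p/>
+ * <p>Rough usage sketch (illustrative only; the field name, analyzer and parse context
+ * shown here are assumptions, not defaults):
+ * <pre>
+ * QueryParserSettings settings = new QueryParserSettings();
+ * settings.defaultField("body");           // hypothetical default field
+ * settings.defaultAnalyzer(analyzer);      // any Analyzer instance
+ * MapperQueryParser parser = new MapperQueryParser(settings, parseContext);
+ * Query query = parser.parse("title:foo OR bar~2");
+ * </pre>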
+ */ +public class MapperQueryParser extends QueryParser { + + public static final ImmutableMap<String, FieldQueryExtension> fieldQueryExtensions; + + static { + fieldQueryExtensions = ImmutableMap.<String, FieldQueryExtension>builder() + .put(ExistsFieldQueryExtension.NAME, new ExistsFieldQueryExtension()) + .put(MissingFieldQueryExtension.NAME, new MissingFieldQueryExtension()) + .build(); + } + + private final QueryParseContext parseContext; + + private QueryParserSettings settings; + + private Analyzer quoteAnalyzer; + + private boolean forcedAnalyzer; + private boolean forcedQuoteAnalyzer; + + private FieldMapper currentMapper; + + private boolean analyzeWildcard; + + private String quoteFieldSuffix; + + public MapperQueryParser(QueryParseContext parseContext) { + super(Lucene.QUERYPARSER_VERSION, null, null); + this.parseContext = parseContext; + } + + public MapperQueryParser(QueryParserSettings settings, QueryParseContext parseContext) { + super(Lucene.QUERYPARSER_VERSION, settings.defaultField(), settings.defaultAnalyzer()); + this.parseContext = parseContext; + reset(settings); + } + + public void reset(QueryParserSettings settings) { + this.settings = settings; + this.field = settings.defaultField(); + + if (settings.fields() != null) { + if (settings.fields.size() == 1) { + // just mark it as the default field + this.field = settings.fields().get(0); + } else { + // otherwise, we need to have the default field being null... + this.field = null; + } + } + + this.forcedAnalyzer = settings.forcedAnalyzer() != null; + this.setAnalyzer(forcedAnalyzer ? settings.forcedAnalyzer() : settings.defaultAnalyzer()); + if (settings.forcedQuoteAnalyzer() != null) { + this.forcedQuoteAnalyzer = true; + this.quoteAnalyzer = settings.forcedQuoteAnalyzer(); + } else if (forcedAnalyzer) { + this.forcedQuoteAnalyzer = true; + this.quoteAnalyzer = settings.forcedAnalyzer(); + } else { + this.forcedAnalyzer = false; + this.quoteAnalyzer = settings.defaultQuoteAnalyzer(); + } + this.quoteFieldSuffix = settings.quoteFieldSuffix(); + setMultiTermRewriteMethod(settings.rewriteMethod()); + setEnablePositionIncrements(settings.enablePositionIncrements()); + setAutoGeneratePhraseQueries(settings.autoGeneratePhraseQueries()); + setAllowLeadingWildcard(settings.allowLeadingWildcard()); + setLowercaseExpandedTerms(settings.lowercaseExpandedTerms()); + setPhraseSlop(settings.phraseSlop()); + setDefaultOperator(settings.defaultOperator()); + setFuzzyMinSim(settings.fuzzyMinSim()); + setFuzzyPrefixLength(settings.fuzzyPrefixLength()); + this.analyzeWildcard = settings.analyzeWildcard(); + } + + /** + * We override this one so we can get the fuzzy part to be treated as string, so people can do: "age:10~5" or "timestamp:2012-10-10~5d" + */ + @Override + Query handleBareFuzzy(String qfield, Token fuzzySlop, String termImage) throws ParseException { + if (fuzzySlop.image.length() == 1) { + return getFuzzyQuery(qfield, termImage, Float.toString(fuzzyMinSim)); + } + return getFuzzyQuery(qfield, termImage, fuzzySlop.image.substring(1)); + } + + @Override + protected Query newTermQuery(Term term) { + if (currentMapper != null) { + Query termQuery = currentMapper.queryStringTermQuery(term); + if (termQuery != null) { + return termQuery; + } + } + return super.newTermQuery(term); + } + + @Override + protected Query newMatchAllDocsQuery() { + return Queries.newMatchAllQuery(); + } + + @Override + public Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { + FieldQueryExtension 
fieldQueryExtension = fieldQueryExtensions.get(field); + if (fieldQueryExtension != null) { + return fieldQueryExtension.query(parseContext, queryText); + } + Collection<String> fields = extractMultiFields(field); + if (fields != null) { + if (fields.size() == 1) { + return getFieldQuerySingle(fields.iterator().next(), queryText, quoted); + } + if (settings.useDisMax()) { + DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker()); + boolean added = false; + for (String mField : fields) { + Query q = getFieldQuerySingle(mField, queryText, quoted); + if (q != null) { + added = true; + applyBoost(mField, q); + disMaxQuery.add(q); + } + } + if (!added) { + return null; + } + return disMaxQuery; + } else { + List<BooleanClause> clauses = new ArrayList<BooleanClause>(); + for (String mField : fields) { + Query q = getFieldQuerySingle(mField, queryText, quoted); + if (q != null) { + applyBoost(mField, q); + clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD)); + } + } + if (clauses.size() == 0) // happens for stopwords + return null; + return getBooleanQuery(clauses, true); + } + } else { + return getFieldQuerySingle(field, queryText, quoted); + } + } + + private Query getFieldQuerySingle(String field, String queryText, boolean quoted) throws ParseException { + if (!quoted && queryText.length() > 1) { + if (queryText.charAt(0) == '>') { + if (queryText.length() > 2) { + if (queryText.charAt(1) == '=') { + return getRangeQuerySingle(field, queryText.substring(2), null, true, true); + } + } + return getRangeQuerySingle(field, queryText.substring(1), null, false, true); + } else if (queryText.charAt(0) == '<') { + if (queryText.length() > 2) { + if (queryText.charAt(1) == '=') { + return getRangeQuerySingle(field, null, queryText.substring(2), true, true); + } + } + return getRangeQuerySingle(field, null, queryText.substring(1), true, false); + } + } + currentMapper = null; + Analyzer oldAnalyzer = getAnalyzer(); + try { + MapperService.SmartNameFieldMappers fieldMappers = null; + if (quoted) { + setAnalyzer(quoteAnalyzer); + if (quoteFieldSuffix != null) { + fieldMappers = parseContext.smartFieldMappers(field + quoteFieldSuffix); + } + } + if (fieldMappers == null) { + fieldMappers = parseContext.smartFieldMappers(field); + } + if (fieldMappers != null) { + if (quoted) { + if (!forcedQuoteAnalyzer) { + setAnalyzer(fieldMappers.searchQuoteAnalyzer()); + } + } else { + if (!forcedAnalyzer) { + setAnalyzer(fieldMappers.searchAnalyzer()); + } + } + currentMapper = fieldMappers.fieldMappers().mapper(); + if (currentMapper != null) { + Query query = null; + if (currentMapper.useTermQueryWithQueryString()) { + try { + if (fieldMappers.explicitTypeInNameWithDocMapper()) { + String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[]{fieldMappers.docMapper().type()}); + try { + query = currentMapper.termQuery(queryText, parseContext); + } finally { + QueryParseContext.setTypes(previousTypes); + } + } else { + query = currentMapper.termQuery(queryText, parseContext); + } + } catch (RuntimeException e) { + if (settings.lenient()) { + return null; + } else { + throw e; + } + } + } + if (query == null) { + query = super.getFieldQuery(currentMapper.names().indexName(), queryText, quoted); + } + return wrapSmartNameQuery(query, fieldMappers, parseContext); + } + } + return super.getFieldQuery(field, queryText, quoted); + } finally { + setAnalyzer(oldAnalyzer); + } + } + + @Override + protected Query getFieldQuery(String field, String queryText, int slop) throws 
ParseException { + Collection<String> fields = extractMultiFields(field); + if (fields != null) { + if (settings.useDisMax()) { + DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker()); + boolean added = false; + for (String mField : fields) { + Query q = super.getFieldQuery(mField, queryText, slop); + if (q != null) { + added = true; + applyBoost(mField, q); + applySlop(q, slop); + disMaxQuery.add(q); + } + } + if (!added) { + return null; + } + return disMaxQuery; + } else { + List<BooleanClause> clauses = new ArrayList<BooleanClause>(); + for (String mField : fields) { + Query q = super.getFieldQuery(mField, queryText, slop); + if (q != null) { + applyBoost(mField, q); + applySlop(q, slop); + clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD)); + } + } + if (clauses.size() == 0) // happens for stopwords + return null; + return getBooleanQuery(clauses, true); + } + } else { + return super.getFieldQuery(field, queryText, slop); + } + } + + @Override + protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException { + if ("*".equals(part1)) { + part1 = null; + } + if ("*".equals(part2)) { + part2 = null; + } + + Collection<String> fields = extractMultiFields(field); + + if (fields == null) { + return getRangeQuerySingle(field, part1, part2, startInclusive, endInclusive); + } + + + if (fields.size() == 1) { + return getRangeQuerySingle(fields.iterator().next(), part1, part2, startInclusive, endInclusive); + } + + if (settings.useDisMax()) { + DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker()); + boolean added = false; + for (String mField : fields) { + Query q = getRangeQuerySingle(mField, part1, part2, startInclusive, endInclusive); + if (q != null) { + added = true; + applyBoost(mField, q); + disMaxQuery.add(q); + } + } + if (!added) { + return null; + } + return disMaxQuery; + } else { + List<BooleanClause> clauses = new ArrayList<BooleanClause>(); + for (String mField : fields) { + Query q = getRangeQuerySingle(mField, part1, part2, startInclusive, endInclusive); + if (q != null) { + applyBoost(mField, q); + clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD)); + } + } + if (clauses.size() == 0) // happens for stopwords + return null; + return getBooleanQuery(clauses, true); + } + } + + private Query getRangeQuerySingle(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) { + currentMapper = null; + MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field); + if (fieldMappers != null) { + currentMapper = fieldMappers.fieldMappers().mapper(); + if (currentMapper != null) { + + if (lowercaseExpandedTerms && !currentMapper.isNumeric()) { + part1 = part1 == null ? null : part1.toLowerCase(locale); + part2 = part2 == null ? 
null : part2.toLowerCase(locale); + } + + try { + Query rangeQuery = currentMapper.rangeQuery(part1, part2, startInclusive, endInclusive, parseContext); + return wrapSmartNameQuery(rangeQuery, fieldMappers, parseContext); + } catch (RuntimeException e) { + if (settings.lenient()) { + return null; + } + throw e; + } + } + } + return newRangeQuery(field, part1, part2, startInclusive, endInclusive); + } + + protected Query getFuzzyQuery(String field, String termStr, String minSimilarity) throws ParseException { + if (lowercaseExpandedTerms) { + termStr = termStr.toLowerCase(locale); + } + Collection<String> fields = extractMultiFields(field); + if (fields != null) { + if (fields.size() == 1) { + return getFuzzyQuerySingle(fields.iterator().next(), termStr, minSimilarity); + } + if (settings.useDisMax()) { + DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker()); + boolean added = false; + for (String mField : fields) { + Query q = getFuzzyQuerySingle(mField, termStr, minSimilarity); + if (q != null) { + added = true; + applyBoost(mField, q); + disMaxQuery.add(q); + } + } + if (!added) { + return null; + } + return disMaxQuery; + } else { + List<BooleanClause> clauses = new ArrayList<BooleanClause>(); + for (String mField : fields) { + Query q = getFuzzyQuerySingle(mField, termStr, minSimilarity); + applyBoost(mField, q); + clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD)); + } + return getBooleanQuery(clauses, true); + } + } else { + return getFuzzyQuerySingle(field, termStr, minSimilarity); + } + } + + private Query getFuzzyQuerySingle(String field, String termStr, String minSimilarity) throws ParseException { + currentMapper = null; + MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field); + if (fieldMappers != null) { + currentMapper = fieldMappers.fieldMappers().mapper(); + if (currentMapper != null) { + try { + //LUCENE 4 UPGRADE I disabled transpositions here by default - maybe this needs to be changed + Query fuzzyQuery = currentMapper.fuzzyQuery(termStr, Fuzziness.build(minSimilarity), fuzzyPrefixLength, settings.fuzzyMaxExpansions(), false); + return wrapSmartNameQuery(fuzzyQuery, fieldMappers, parseContext); + } catch (RuntimeException e) { + if (settings.lenient()) { + return null; + } + throw e; + } + } + } + return super.getFuzzyQuery(field, termStr, Float.parseFloat(minSimilarity)); + } + + @Override + protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { + String text = term.text(); + int numEdits = FuzzyQuery.floatToEdits(minimumSimilarity, text.codePointCount(0, text.length())); + //LUCENE 4 UPGRADE I disabled transpositions here by default - maybe this needs to be changed + FuzzyQuery query = new FuzzyQuery(term, numEdits, prefixLength, settings.fuzzyMaxExpansions(), false); + QueryParsers.setRewriteMethod(query, settings.fuzzyRewriteMethod()); + return query; + } + + @Override + protected Query getPrefixQuery(String field, String termStr) throws ParseException { + if (lowercaseExpandedTerms) { + termStr = termStr.toLowerCase(locale); + } + Collection<String> fields = extractMultiFields(field); + if (fields != null) { + if (fields.size() == 1) { + return getPrefixQuerySingle(fields.iterator().next(), termStr); + } + if (settings.useDisMax()) { + DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker()); + boolean added = false; + for (String mField : fields) { + Query q = getPrefixQuerySingle(mField, termStr); + if (q != null) { + added = true; + 
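+ // apply any per-field boost from the settings before adding the clause to the dis-max query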
applyBoost(mField, q); + disMaxQuery.add(q); + } + } + if (!added) { + return null; + } + return disMaxQuery; + } else { + List<BooleanClause> clauses = new ArrayList<BooleanClause>(); + for (String mField : fields) { + Query q = getPrefixQuerySingle(mField, termStr); + if (q != null) { + applyBoost(mField, q); + clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD)); + } + } + if (clauses.size() == 0) // happens for stopwords + return null; + return getBooleanQuery(clauses, true); + } + } else { + return getPrefixQuerySingle(field, termStr); + } + } + + private Query getPrefixQuerySingle(String field, String termStr) throws ParseException { + currentMapper = null; + Analyzer oldAnalyzer = getAnalyzer(); + try { + MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field); + if (fieldMappers != null) { + if (!forcedAnalyzer) { + setAnalyzer(fieldMappers.searchAnalyzer()); + } + currentMapper = fieldMappers.fieldMappers().mapper(); + if (currentMapper != null) { + Query query = null; + if (currentMapper.useTermQueryWithQueryString()) { + if (fieldMappers.explicitTypeInNameWithDocMapper()) { + String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[]{fieldMappers.docMapper().type()}); + try { + query = currentMapper.prefixQuery(termStr, multiTermRewriteMethod, parseContext); + } finally { + QueryParseContext.setTypes(previousTypes); + } + } else { + query = currentMapper.prefixQuery(termStr, multiTermRewriteMethod, parseContext); + } + } + if (query == null) { + query = getPossiblyAnalyzedPrefixQuery(currentMapper.names().indexName(), termStr); + } + return wrapSmartNameQuery(query, fieldMappers, parseContext); + } + } + return getPossiblyAnalyzedPrefixQuery(field, termStr); + } catch (RuntimeException e) { + if (settings.lenient()) { + return null; + } + throw e; + } finally { + setAnalyzer(oldAnalyzer); + } + } + + private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException { + if (!analyzeWildcard) { + return super.getPrefixQuery(field, termStr); + } + // get Analyzer from superclass and tokenize the term + TokenStream source; + try { + source = getAnalyzer().tokenStream(field, termStr); + source.reset(); + } catch (IOException e) { + return super.getPrefixQuery(field, termStr); + } + List<String> tlist = new ArrayList<String>(); + CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); + + while (true) { + try { + if (!source.incrementToken()) break; + } catch (IOException e) { + break; + } + tlist.add(termAtt.toString()); + } + + try { + source.close(); + } catch (IOException e) { + // ignore + } + + if (tlist.size() == 1) { + return super.getPrefixQuery(field, tlist.get(0)); + } else { + // build a boolean query with prefix on each one... + List<BooleanClause> clauses = new ArrayList<BooleanClause>(); + for (String token : tlist) { + clauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD)); + } + return getBooleanQuery(clauses, true); + + //return super.getPrefixQuery(field, termStr); + + /* this means that the analyzer used either added or consumed +* (common for a stemmer) tokens, and we can't build a PrefixQuery */ +// throw new ParseException("Cannot build PrefixQuery with analyzer " +// + getAnalyzer().getClass() +// + (tlist.size() > 1 ? 
" - token(s) added" : " - token consumed")); + } + + } + + @Override + protected Query getWildcardQuery(String field, String termStr) throws ParseException { + if (termStr.equals("*")) { + // we want to optimize for match all query for the "*:*", and "*" cases + if ("*".equals(field) || Objects.equal(field, this.field)) { + String actualField = field; + if (actualField == null) { + actualField = this.field; + } + if (actualField == null) { + return newMatchAllDocsQuery(); + } + if ("*".equals(actualField) || "_all".equals(actualField)) { + return newMatchAllDocsQuery(); + } + // effectively, we check if a field exists or not + return fieldQueryExtensions.get(ExistsFieldQueryExtension.NAME).query(parseContext, actualField); + } + } + if (lowercaseExpandedTerms) { + termStr = termStr.toLowerCase(locale); + } + Collection<String> fields = extractMultiFields(field); + if (fields != null) { + if (fields.size() == 1) { + return getWildcardQuerySingle(fields.iterator().next(), termStr); + } + if (settings.useDisMax()) { + DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker()); + boolean added = false; + for (String mField : fields) { + Query q = getWildcardQuerySingle(mField, termStr); + if (q != null) { + added = true; + applyBoost(mField, q); + disMaxQuery.add(q); + } + } + if (!added) { + return null; + } + return disMaxQuery; + } else { + List<BooleanClause> clauses = new ArrayList<BooleanClause>(); + for (String mField : fields) { + Query q = getWildcardQuerySingle(mField, termStr); + if (q != null) { + applyBoost(mField, q); + clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD)); + } + } + if (clauses.size() == 0) // happens for stopwords + return null; + return getBooleanQuery(clauses, true); + } + } else { + return getWildcardQuerySingle(field, termStr); + } + } + + private Query getWildcardQuerySingle(String field, String termStr) throws ParseException { + String indexedNameField = field; + currentMapper = null; + Analyzer oldAnalyzer = getAnalyzer(); + try { + MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field); + if (fieldMappers != null) { + if (!forcedAnalyzer) { + setAnalyzer(fieldMappers.searchAnalyzer()); + } + currentMapper = fieldMappers.fieldMappers().mapper(); + if (currentMapper != null) { + indexedNameField = currentMapper.names().indexName(); + } + return wrapSmartNameQuery(getPossiblyAnalyzedWildcardQuery(indexedNameField, termStr), fieldMappers, parseContext); + } + return getPossiblyAnalyzedWildcardQuery(indexedNameField, termStr); + } catch (RuntimeException e) { + if (settings.lenient()) { + return null; + } + throw e; + } finally { + setAnalyzer(oldAnalyzer); + } + } + + private Query getPossiblyAnalyzedWildcardQuery(String field, String termStr) throws ParseException { + if (!analyzeWildcard) { + return super.getWildcardQuery(field, termStr); + } + boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*")); + StringBuilder aggStr = new StringBuilder(); + StringBuilder tmp = new StringBuilder(); + for (int i = 0; i < termStr.length(); i++) { + char c = termStr.charAt(i); + if (c == '?' 
|| c == '*') { + if (isWithinToken) { + try { + TokenStream source = getAnalyzer().tokenStream(field, tmp.toString()); + source.reset(); + CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); + if (source.incrementToken()) { + String term = termAtt.toString(); + if (term.length() == 0) { + // no tokens, just use what we have now + aggStr.append(tmp); + } else { + aggStr.append(term); + } + } else { + // no tokens, just use what we have now + aggStr.append(tmp); + } + source.close(); + } catch (IOException e) { + aggStr.append(tmp); + } + tmp.setLength(0); + } + isWithinToken = false; + aggStr.append(c); + } else { + tmp.append(c); + isWithinToken = true; + } + } + if (isWithinToken) { + try { + TokenStream source = getAnalyzer().tokenStream(field, tmp.toString()); + source.reset(); + CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); + if (source.incrementToken()) { + String term = termAtt.toString(); + if (term.length() == 0) { + // no tokens, just use what we have now + aggStr.append(tmp); + } else { + aggStr.append(term); + } + } else { + // no tokens, just use what we have now + aggStr.append(tmp); + } + source.close(); + } catch (IOException e) { + aggStr.append(tmp); + } + } + + return super.getWildcardQuery(field, aggStr.toString()); + } + + @Override + protected Query getRegexpQuery(String field, String termStr) throws ParseException { + if (lowercaseExpandedTerms) { + termStr = termStr.toLowerCase(locale); + } + Collection<String> fields = extractMultiFields(field); + if (fields != null) { + if (fields.size() == 1) { + return getRegexpQuerySingle(fields.iterator().next(), termStr); + } + if (settings.useDisMax()) { + DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker()); + boolean added = false; + for (String mField : fields) { + Query q = getRegexpQuerySingle(mField, termStr); + if (q != null) { + added = true; + applyBoost(mField, q); + disMaxQuery.add(q); + } + } + if (!added) { + return null; + } + return disMaxQuery; + } else { + List<BooleanClause> clauses = new ArrayList<BooleanClause>(); + for (String mField : fields) { + Query q = getRegexpQuerySingle(mField, termStr); + if (q != null) { + applyBoost(mField, q); + clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD)); + } + } + if (clauses.size() == 0) // happens for stopwords + return null; + return getBooleanQuery(clauses, true); + } + } else { + return getRegexpQuerySingle(field, termStr); + } + } + + private Query getRegexpQuerySingle(String field, String termStr) throws ParseException { + currentMapper = null; + Analyzer oldAnalyzer = getAnalyzer(); + try { + MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field); + if (fieldMappers != null) { + if (!forcedAnalyzer) { + setAnalyzer(fieldMappers.searchAnalyzer()); + } + currentMapper = fieldMappers.fieldMappers().mapper(); + if (currentMapper != null) { + Query query = null; + if (currentMapper.useTermQueryWithQueryString()) { + if (fieldMappers.explicitTypeInNameWithDocMapper()) { + String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[]{fieldMappers.docMapper().type()}); + try { + query = currentMapper.regexpQuery(termStr, RegExp.ALL, multiTermRewriteMethod, parseContext); + } finally { + QueryParseContext.setTypes(previousTypes); + } + } else { + query = currentMapper.regexpQuery(termStr, RegExp.ALL, multiTermRewriteMethod, parseContext); + } + } + if (query == null) { + query = super.getRegexpQuery(field, termStr); + } + return 
wrapSmartNameQuery(query, fieldMappers, parseContext); + } + } + return super.getRegexpQuery(field, termStr); + } catch (RuntimeException e) { + if (settings.lenient()) { + return null; + } + throw e; + } finally { + setAnalyzer(oldAnalyzer); + } + } + + @Override + protected Query getBooleanQuery(List<BooleanClause> clauses, boolean disableCoord) throws ParseException { + Query q = super.getBooleanQuery(clauses, disableCoord); + if (q == null) { + return null; + } + return optimizeQuery(fixNegativeQueryIfNeeded(q)); + } + + private void applyBoost(String field, Query q) { + if (settings.boosts() != null) { + float boost = 1f; + if (settings.boosts().containsKey(field)) { + boost = settings.boosts().lget(); + } + q.setBoost(boost); + } + } + + private void applySlop(Query q, int slop) { + if (q instanceof XFilteredQuery) { + applySlop(((XFilteredQuery)q).getQuery(), slop); + } + if (q instanceof PhraseQuery) { + ((PhraseQuery) q).setSlop(slop); + } else if (q instanceof MultiPhraseQuery) { + ((MultiPhraseQuery) q).setSlop(slop); + } + } + + private Collection<String> extractMultiFields(String field) { + Collection<String> fields = null; + if (field != null) { + fields = parseContext.simpleMatchToIndexNames(field); + } else { + fields = settings.fields(); + } + return fields; + } + + public Query parse(String query) throws ParseException { + if (query.trim().isEmpty()) { + // if the query string is empty we return no docs / empty result + // the behavior is simple to change in the client if all docs is required + // or a default query + return new MatchNoDocsQuery(); + } + return super.parse(query); + } +} diff --git a/src/main/java/org/apache/lucene/queryparser/classic/MissingFieldQueryExtension.java b/src/main/java/org/apache/lucene/queryparser/classic/MissingFieldQueryExtension.java new file mode 100644 index 0000000..ad200d4 --- /dev/null +++ b/src/main/java/org/apache/lucene/queryparser/classic/MissingFieldQueryExtension.java @@ -0,0 +1,39 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.lucene.queryparser.classic; + +import org.apache.lucene.search.Query; +import org.elasticsearch.common.lucene.search.XConstantScoreQuery; +import org.elasticsearch.index.query.MissingFilterParser; +import org.elasticsearch.index.query.QueryParseContext; + +/** + * + */ +public class MissingFieldQueryExtension implements FieldQueryExtension { + + public static final String NAME = "_missing_"; + + @Override + public Query query(QueryParseContext parseContext, String queryText) { + return new XConstantScoreQuery(MissingFilterParser.newFilter(parseContext, queryText, + MissingFilterParser.DEFAULT_EXISTENCE_VALUE, MissingFilterParser.DEFAULT_NULL_VALUE, null)); + } +} diff --git a/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java b/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java new file mode 100644 index 0000000..9b8f9aa --- /dev/null +++ b/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java @@ -0,0 +1,376 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.lucene.queryparser.classic; + +import com.carrotsearch.hppc.ObjectFloatOpenHashMap; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.MultiTermQuery; + +import java.util.Collection; +import java.util.List; + +/** + * + */ +public class QueryParserSettings { + + public static final boolean DEFAULT_ALLOW_LEADING_WILDCARD = true; + public static final boolean DEFAULT_ANALYZE_WILDCARD = false; + public static final float DEFAULT_BOOST = 1.f; + + private String queryString; + private String defaultField; + private float boost = DEFAULT_BOOST; + private MapperQueryParser.Operator defaultOperator = QueryParser.Operator.OR; + private boolean autoGeneratePhraseQueries = false; + private boolean allowLeadingWildcard = DEFAULT_ALLOW_LEADING_WILDCARD; + private boolean lowercaseExpandedTerms = true; + private boolean enablePositionIncrements = true; + private int phraseSlop = 0; + private float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity; + private int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength; + private int fuzzyMaxExpansions = FuzzyQuery.defaultMaxExpansions; + private MultiTermQuery.RewriteMethod fuzzyRewriteMethod = null; + private boolean analyzeWildcard = DEFAULT_ANALYZE_WILDCARD; + private boolean escape = false; + private Analyzer defaultAnalyzer = null; + private Analyzer defaultQuoteAnalyzer = null; + private Analyzer forcedAnalyzer = null; + private Analyzer forcedQuoteAnalyzer = null; + private String quoteFieldSuffix = null; + private MultiTermQuery.RewriteMethod rewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; + private String minimumShouldMatch; + private boolean lenient; + + + List<String> fields = null; + Collection<String> queryTypes = null; + ObjectFloatOpenHashMap<String> boosts = null; + float tieBreaker = 0.0f; + boolean useDisMax = true; + + public boolean isCacheable() { + // a hack for now :) to determine if a query string is cacheable + return !queryString.contains("now"); + } + + public String queryString() { + return queryString; + } + + public void queryString(String queryString) { + this.queryString = queryString; + } + + public String defaultField() { + return defaultField; + } + + public void defaultField(String defaultField) { + this.defaultField = defaultField; + } + + public float boost() { + return boost; + } + + public void boost(float boost) { + this.boost = boost; + } + + public QueryParser.Operator defaultOperator() { + return defaultOperator; + } + + public void defaultOperator(QueryParser.Operator defaultOperator) { + this.defaultOperator = defaultOperator; + } + + public boolean autoGeneratePhraseQueries() { + return autoGeneratePhraseQueries; + } + + public void autoGeneratePhraseQueries(boolean autoGeneratePhraseQueries) { + this.autoGeneratePhraseQueries = autoGeneratePhraseQueries; + } + + public boolean allowLeadingWildcard() { + return allowLeadingWildcard; + } + + public void allowLeadingWildcard(boolean allowLeadingWildcard) { + this.allowLeadingWildcard = allowLeadingWildcard; + } + + public boolean lowercaseExpandedTerms() { + return lowercaseExpandedTerms; + } + + public void lowercaseExpandedTerms(boolean lowercaseExpandedTerms) { + this.lowercaseExpandedTerms = lowercaseExpandedTerms; + } + + public boolean enablePositionIncrements() { + return enablePositionIncrements; + } + + public void enablePositionIncrements(boolean enablePositionIncrements) { + this.enablePositionIncrements = enablePositionIncrements; + } + + 
public int phraseSlop() { + return phraseSlop; + } + + public void phraseSlop(int phraseSlop) { + this.phraseSlop = phraseSlop; + } + + public float fuzzyMinSim() { + return fuzzyMinSim; + } + + public void fuzzyMinSim(float fuzzyMinSim) { + this.fuzzyMinSim = fuzzyMinSim; + } + + public int fuzzyPrefixLength() { + return fuzzyPrefixLength; + } + + public void fuzzyPrefixLength(int fuzzyPrefixLength) { + this.fuzzyPrefixLength = fuzzyPrefixLength; + } + + public int fuzzyMaxExpansions() { + return fuzzyMaxExpansions; + } + + public void fuzzyMaxExpansions(int fuzzyMaxExpansions) { + this.fuzzyMaxExpansions = fuzzyMaxExpansions; + } + + public MultiTermQuery.RewriteMethod fuzzyRewriteMethod() { + return fuzzyRewriteMethod; + } + + public void fuzzyRewriteMethod(MultiTermQuery.RewriteMethod fuzzyRewriteMethod) { + this.fuzzyRewriteMethod = fuzzyRewriteMethod; + } + + public boolean escape() { + return escape; + } + + public void escape(boolean escape) { + this.escape = escape; + } + + public Analyzer defaultAnalyzer() { + return defaultAnalyzer; + } + + public void defaultAnalyzer(Analyzer defaultAnalyzer) { + this.defaultAnalyzer = defaultAnalyzer; + } + + public Analyzer defaultQuoteAnalyzer() { + return defaultQuoteAnalyzer; + } + + public void defaultQuoteAnalyzer(Analyzer defaultAnalyzer) { + this.defaultQuoteAnalyzer = defaultAnalyzer; + } + + public Analyzer forcedAnalyzer() { + return forcedAnalyzer; + } + + public void forcedAnalyzer(Analyzer forcedAnalyzer) { + this.forcedAnalyzer = forcedAnalyzer; + } + + public Analyzer forcedQuoteAnalyzer() { + return forcedQuoteAnalyzer; + } + + public void forcedQuoteAnalyzer(Analyzer forcedAnalyzer) { + this.forcedQuoteAnalyzer = forcedAnalyzer; + } + + public boolean analyzeWildcard() { + return this.analyzeWildcard; + } + + public void analyzeWildcard(boolean analyzeWildcard) { + this.analyzeWildcard = analyzeWildcard; + } + + public MultiTermQuery.RewriteMethod rewriteMethod() { + return this.rewriteMethod; + } + + public void rewriteMethod(MultiTermQuery.RewriteMethod rewriteMethod) { + this.rewriteMethod = rewriteMethod; + } + + public String minimumShouldMatch() { + return this.minimumShouldMatch; + } + + public void minimumShouldMatch(String minimumShouldMatch) { + this.minimumShouldMatch = minimumShouldMatch; + } + + public void quoteFieldSuffix(String quoteFieldSuffix) { + this.quoteFieldSuffix = quoteFieldSuffix; + } + + public String quoteFieldSuffix() { + return this.quoteFieldSuffix; + } + + public void lenient(boolean lenient) { + this.lenient = lenient; + } + + public boolean lenient() { + return this.lenient; + } + + public List<String> fields() { + return fields; + } + + public void fields(List<String> fields) { + this.fields = fields; + } + + public Collection<String> queryTypes() { + return queryTypes; + } + + public void queryTypes(Collection<String> queryTypes) { + this.queryTypes = queryTypes; + } + + public ObjectFloatOpenHashMap<String> boosts() { + return boosts; + } + + public void boosts(ObjectFloatOpenHashMap<String> boosts) { + this.boosts = boosts; + } + + public float tieBreaker() { + return tieBreaker; + } + + public void tieBreaker(float tieBreaker) { + this.tieBreaker = tieBreaker; + } + + public boolean useDisMax() { + return useDisMax; + } + + public void useDisMax(boolean useDisMax) { + this.useDisMax = useDisMax; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + QueryParserSettings that = 
(QueryParserSettings) o; + + if (autoGeneratePhraseQueries != that.autoGeneratePhraseQueries()) return false; + if (allowLeadingWildcard != that.allowLeadingWildcard) return false; + if (Float.compare(that.boost, boost) != 0) return false; + if (enablePositionIncrements != that.enablePositionIncrements) return false; + if (escape != that.escape) return false; + if (analyzeWildcard != that.analyzeWildcard) return false; + if (Float.compare(that.fuzzyMinSim, fuzzyMinSim) != 0) return false; + if (fuzzyPrefixLength != that.fuzzyPrefixLength) return false; + if (fuzzyMaxExpansions != that.fuzzyMaxExpansions) return false; + if (fuzzyRewriteMethod != null ? !fuzzyRewriteMethod.equals(that.fuzzyRewriteMethod) : that.fuzzyRewriteMethod != null) + return false; + if (lowercaseExpandedTerms != that.lowercaseExpandedTerms) return false; + if (phraseSlop != that.phraseSlop) return false; + if (defaultAnalyzer != null ? !defaultAnalyzer.equals(that.defaultAnalyzer) : that.defaultAnalyzer != null) + return false; + if (defaultQuoteAnalyzer != null ? !defaultQuoteAnalyzer.equals(that.defaultQuoteAnalyzer) : that.defaultQuoteAnalyzer != null) + return false; + if (forcedAnalyzer != null ? !forcedAnalyzer.equals(that.forcedAnalyzer) : that.forcedAnalyzer != null) + return false; + if (forcedQuoteAnalyzer != null ? !forcedQuoteAnalyzer.equals(that.forcedQuoteAnalyzer) : that.forcedQuoteAnalyzer != null) + return false; + if (defaultField != null ? !defaultField.equals(that.defaultField) : that.defaultField != null) return false; + if (defaultOperator != that.defaultOperator) return false; + if (queryString != null ? !queryString.equals(that.queryString) : that.queryString != null) return false; + if (rewriteMethod != null ? !rewriteMethod.equals(that.rewriteMethod) : that.rewriteMethod != null) + return false; + if (minimumShouldMatch != null ? !minimumShouldMatch.equals(that.minimumShouldMatch) : that.minimumShouldMatch != null) + return false; + if (quoteFieldSuffix != null ? !quoteFieldSuffix.equals(that.quoteFieldSuffix) : that.quoteFieldSuffix != null) + return false; + if (lenient != that.lenient) { + return false; + } + + if (Float.compare(that.tieBreaker, tieBreaker) != 0) return false; + if (useDisMax != that.useDisMax) return false; + if (boosts != null ? !boosts.equals(that.boosts) : that.boosts != null) return false; + if (fields != null ? !fields.equals(that.fields) : that.fields != null) return false; + if (queryTypes != null ? !queryTypes.equals(that.queryTypes) : that.queryTypes != null) return false; + + return true; + } + + @Override + public int hashCode() { + int result = queryString != null ? queryString.hashCode() : 0; + result = 31 * result + (defaultField != null ? defaultField.hashCode() : 0); + result = 31 * result + (boost != +0.0f ? Float.floatToIntBits(boost) : 0); + result = 31 * result + (defaultOperator != null ? defaultOperator.hashCode() : 0); + result = 31 * result + (autoGeneratePhraseQueries ? 1 : 0); + result = 31 * result + (allowLeadingWildcard ? 1 : 0); + result = 31 * result + (lowercaseExpandedTerms ? 1 : 0); + result = 31 * result + (enablePositionIncrements ? 1 : 0); + result = 31 * result + phraseSlop; + result = 31 * result + (fuzzyMinSim != +0.0f ? Float.floatToIntBits(fuzzyMinSim) : 0); + result = 31 * result + fuzzyPrefixLength; + result = 31 * result + (escape ? 1 : 0); + result = 31 * result + (defaultAnalyzer != null ? defaultAnalyzer.hashCode() : 0); + result = 31 * result + (defaultQuoteAnalyzer != null ? 
defaultQuoteAnalyzer.hashCode() : 0); + result = 31 * result + (forcedAnalyzer != null ? forcedAnalyzer.hashCode() : 0); + result = 31 * result + (forcedQuoteAnalyzer != null ? forcedQuoteAnalyzer.hashCode() : 0); + result = 31 * result + (analyzeWildcard ? 1 : 0); + + result = 31 * result + (fields != null ? fields.hashCode() : 0); + result = 31 * result + (queryTypes != null ? queryTypes.hashCode() : 0); + result = 31 * result + (boosts != null ? boosts.hashCode() : 0); + result = 31 * result + (tieBreaker != +0.0f ? Float.floatToIntBits(tieBreaker) : 0); + result = 31 * result + (useDisMax ? 1 : 0); + return result; + } +} diff --git a/src/main/java/org/apache/lucene/search/XReferenceManager.java b/src/main/java/org/apache/lucene/search/XReferenceManager.java new file mode 100644 index 0000000..07fb066 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/XReferenceManager.java @@ -0,0 +1,326 @@ +package org.apache.lucene.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.util.Version; + +import java.io.Closeable; +import java.io.IOException; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +/** + * Utility class to safely share instances of a certain type across multiple + * threads, while periodically refreshing them. This class ensures each + * reference is closed only once all threads have finished using it. It is + * recommended to consult the documentation of {@link org.apache.lucene.search.XReferenceManager} + * implementations for their {@link #maybeRefresh()} semantics. + * + * @param <G> + * the concrete type that will be {@link #acquire() acquired} and + * {@link #release(Object) released}. + * + * @lucene.experimental + */ +public abstract class XReferenceManager<G> implements Closeable { + static { + assert Version.LUCENE_46 == org.elasticsearch.Version.CURRENT.luceneVersion : "Remove this once we are on LUCENE_47 - see LUCENE-5436"; + } + + private static final String REFERENCE_MANAGER_IS_CLOSED_MSG = "this ReferenceManager is closed"; + + protected volatile G current; + + private final Lock refreshLock = new ReentrantLock(); + + private final List<RefreshListener> refreshListeners = new CopyOnWriteArrayList<RefreshListener>(); + + private void ensureOpen() { + if (current == null) { + throw new AlreadyClosedException(REFERENCE_MANAGER_IS_CLOSED_MSG); + } + } + + private synchronized void swapReference(G newReference) throws IOException { + ensureOpen(); + final G oldReference = current; + current = newReference; + release(oldReference); + } + + /** + * Decrement reference counting on the given reference. 
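+ * Implementations typically delegate to the wrapped resource, e.g.
+ * {@code reference.getIndexReader().decRef()} in {@link XSearcherManager}.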
+ * @throws java.io.IOException if reference decrement on the given resource failed. + * */ + protected abstract void decRef(G reference) throws IOException; + + /** + * Refresh the given reference if needed. Returns {@code null} if no refresh + * was needed, otherwise a new refreshed reference. + * @throws org.apache.lucene.store.AlreadyClosedException if the reference manager has been {@link #close() closed}. + * @throws java.io.IOException if the refresh operation failed + */ + protected abstract G refreshIfNeeded(G referenceToRefresh) throws IOException; + + /** + * Try to increment reference counting on the given reference. Return true if + * the operation was successful. + * @throws org.apache.lucene.store.AlreadyClosedException if the reference manager has been {@link #close() closed}. + */ + protected abstract boolean tryIncRef(G reference) throws IOException; + + /** + * Obtain the current reference. You must match every call to acquire with one + * call to {@link #release}; it's best to do so in a finally clause, and set + * the reference to {@code null} to prevent accidental usage after it has been + * released. + * @throws org.apache.lucene.store.AlreadyClosedException if the reference manager has been {@link #close() closed}. + */ + public final G acquire() throws IOException { + G ref; + + do { + if ((ref = current) == null) { + throw new AlreadyClosedException(REFERENCE_MANAGER_IS_CLOSED_MSG); + } + if (tryIncRef(ref)) { + return ref; + } + if (getRefCount(ref) == 0 && current == ref) { + assert ref != null; + /* if we can't increment the reader but we are + still the current reference the RM is in a + illegal states since we can't make any progress + anymore. The reference is closed but the RM still + holds on to it as the actual instance. + This can only happen if somebody outside of the RM + decrements the refcount without a corresponding increment + since the RM assigns the new reference before counting down + the reference. */ + throw new IllegalStateException("The managed reference has already closed - this is likely a bug when the reference count is modified outside of the ReferenceManager"); + } + } while (true); + } + + /** + * <p> + * Closes this ReferenceManager to prevent future {@link #acquire() acquiring}. A + * reference manager should be closed if the reference to the managed resource + * should be disposed or the application using the {@link org.apache.lucene.search.XReferenceManager} + * is shutting down. The managed resource might not be released immediately, + * if the {@link org.apache.lucene.search.XReferenceManager} user is holding on to a previously + * {@link #acquire() acquired} reference. The resource will be released once + * when the last reference is {@link #release(Object) released}. Those + * references can still be used as if the manager was still active. + * </p> + * <p> + * Applications should not {@link #acquire() acquire} new references from this + * manager once this method has been called. {@link #acquire() Acquiring} a + * resource on a closed {@link org.apache.lucene.search.XReferenceManager} will throw an + * {@link org.apache.lucene.store.AlreadyClosedException}. + * </p> + * + * @throws java.io.IOException + * if the underlying reader of the current reference could not be closed + */ + @Override + public final synchronized void close() throws IOException { + if (current != null) { + // make sure we can call this more than once + // closeable javadoc says: + // if this is already closed then invoking this method has no effect. 
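+ // swapping in null releases the reference the manager itself holds and makes
+ // ensureOpen()/acquire() fail from now on; afterClose() then lets subclasses
+ // free any additional resources they hold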
+ swapReference(null); + afterClose(); + } + } + + /** + * Returns the current reference count of the given reference. + */ + protected abstract int getRefCount(G reference); + + /** + * Called after close(), so subclass can free any resources. + * @throws java.io.IOException if the after close operation in a sub-class throws an {@link java.io.IOException} + * */ + protected void afterClose() throws IOException { + } + + private void doMaybeRefresh() throws IOException { + // it's ok to call lock() here (blocking) because we're supposed to get here + // from either maybeRefreh() or maybeRefreshBlocking(), after the lock has + // already been obtained. Doing that protects us from an accidental bug + // where this method will be called outside the scope of refreshLock. + // Per ReentrantLock's javadoc, calling lock() by the same thread more than + // once is ok, as long as unlock() is called a matching number of times. + refreshLock.lock(); + boolean refreshed = false; + try { + final G reference = acquire(); + try { + notifyRefreshListenersBefore(); + G newReference = refreshIfNeeded(reference); + if (newReference != null) { + assert newReference != reference : "refreshIfNeeded should return null if refresh wasn't needed"; + try { + swapReference(newReference); + refreshed = true; + } finally { + if (!refreshed) { + release(newReference); + } + } + } + } finally { + release(reference); + notifyRefreshListenersRefreshed(refreshed); + } + afterMaybeRefresh(); + } finally { + refreshLock.unlock(); + } + } + + /** + * You must call this (or {@link #maybeRefreshBlocking()}), periodically, if + * you want that {@link #acquire()} will return refreshed instances. + * + * <p> + * <b>Threads</b>: it's fine for more than one thread to call this at once. + * Only the first thread will attempt the refresh; subsequent threads will see + * that another thread is already handling refresh and will return + * immediately. Note that this means if another thread is already refreshing + * then subsequent threads will return right away without waiting for the + * refresh to complete. + * + * <p> + * If this method returns true it means the calling thread either refreshed or + * that there were no changes to refresh. If it returns false it means another + * thread is currently refreshing. + * </p> + * @throws java.io.IOException if refreshing the resource causes an {@link java.io.IOException} + * @throws org.apache.lucene.store.AlreadyClosedException if the reference manager has been {@link #close() closed}. + */ + public final boolean maybeRefresh() throws IOException { + ensureOpen(); + + // Ensure only 1 thread does refresh at once; other threads just return immediately: + final boolean doTryRefresh = refreshLock.tryLock(); + if (doTryRefresh) { + try { + doMaybeRefresh(); + } finally { + refreshLock.unlock(); + } + } + + return doTryRefresh; + } + + /** + * You must call this (or {@link #maybeRefresh()}), periodically, if you want + * that {@link #acquire()} will return refreshed instances. + * + * <p> + * <b>Threads</b>: unlike {@link #maybeRefresh()}, if another thread is + * currently refreshing, this method blocks until that thread completes. It is + * useful if you want to guarantee that the next call to {@link #acquire()} + * will return a refreshed instance. Otherwise, consider using the + * non-blocking {@link #maybeRefresh()}. 
+ * @throws java.io.IOException if refreshing the resource causes an {@link java.io.IOException} + * @throws org.apache.lucene.store.AlreadyClosedException if the reference manager has been {@link #close() closed}. + */ + public final void maybeRefreshBlocking() throws IOException { + ensureOpen(); + + // Ensure only 1 thread does refresh at once + refreshLock.lock(); + try { + doMaybeRefresh(); + } finally { + refreshLock.unlock(); + } + } + + /** Called after a refresh was attempted, regardless of + * whether a new reference was in fact created. + * @throws java.io.IOException if a low level I/O exception occurs + **/ + protected void afterMaybeRefresh() throws IOException { + } + + /** + * Release the reference previously obtained via {@link #acquire()}. + * <p> + * <b>NOTE:</b> it's safe to call this after {@link #close()}. + * @throws java.io.IOException if the release operation on the given resource throws an {@link java.io.IOException} + */ + public final void release(G reference) throws IOException { + assert reference != null; + decRef(reference); + } + + private void notifyRefreshListenersBefore() throws IOException { + for (RefreshListener refreshListener : refreshListeners) { + refreshListener.beforeRefresh(); + } + } + + private void notifyRefreshListenersRefreshed(boolean didRefresh) throws IOException { + for (RefreshListener refreshListener : refreshListeners) { + refreshListener.afterRefresh(didRefresh); + } + } + + /** + * Adds a listener, to be notified when a reference is refreshed/swapped. + */ + public void addListener(RefreshListener listener) { + if (listener == null) { + throw new NullPointerException("Listener cannot be null"); + } + refreshListeners.add(listener); + } + + /** + * Remove a listener added with {@link #addListener(RefreshListener)}. + */ + public void removeListener(RefreshListener listener) { + if (listener == null) { + throw new NullPointerException("Listener cannot be null"); + } + refreshListeners.remove(listener); + } + + /** Use to receive notification when a refresh has + * finished. See {@link #addListener}. */ + public interface RefreshListener { + + /** Called right before a refresh attempt starts. */ + void beforeRefresh() throws IOException; + + /** Called after the attempted refresh; if the refresh + * did open a new reference then didRefresh will be true + * and {@link #acquire()} is guaranteed to return the new + * reference. */ + void afterRefresh(boolean didRefresh) throws IOException; + } +} diff --git a/src/main/java/org/apache/lucene/search/XSearcherManager.java b/src/main/java/org/apache/lucene/search/XSearcherManager.java new file mode 100644 index 0000000..36c74a0 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/XSearcherManager.java @@ -0,0 +1,177 @@ +package org.apache.lucene.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Version; + +/** + * Utility class to safely share {@link IndexSearcher} instances across multiple + * threads, while periodically reopening. This class ensures each searcher is + * closed only once all threads have finished using it. + * + * <p> + * Use {@link #acquire} to obtain the current searcher, and {@link #release} to + * release it, like this: + * + * <pre class="prettyprint"> + * IndexSearcher s = manager.acquire(); + * try { + * // Do searching, doc retrieval, etc. with s + * } finally { + * manager.release(s); + * } + * // Do not use s after this! + * s = null; + * </pre> + * + * <p> + * In addition you should periodically call {@link #maybeRefresh}. While it's + * possible to call this just before running each query, this is discouraged + * since it penalizes the unlucky queries that do the reopen. It's better to use + * a separate background thread, that periodically calls maybeReopen. Finally, + * be sure to call {@link #close} once you are done. + * + * @see SearcherFactory + * + * @lucene.experimental + */ +public final class XSearcherManager extends XReferenceManager<IndexSearcher> { + + static { + assert Version.LUCENE_46 == org.elasticsearch.Version.CURRENT.luceneVersion : "Remove this once we are on LUCENE_47 - see LUCENE-5436"; + } + + private final SearcherFactory searcherFactory; + + /** + * Creates and returns a new XSearcherManager from the given + * {@link IndexWriter}. + * + * @param writer + * the IndexWriter to open the IndexReader from. + * @param applyAllDeletes + * If <code>true</code>, all buffered deletes will be applied (made + * visible) in the {@link IndexSearcher} / {@link DirectoryReader}. + * If <code>false</code>, the deletes may or may not be applied, but + * remain buffered (in IndexWriter) so that they will be applied in + * the future. Applying deletes can be costly, so if your app can + * tolerate deleted documents being returned you might gain some + * performance by passing <code>false</code>. See + * {@link DirectoryReader#openIfChanged(DirectoryReader, IndexWriter, boolean)}. + * @param searcherFactory + * An optional {@link SearcherFactory}. Pass <code>null</code> if you + * don't require the searcher to be warmed before going live or other + * custom behavior. + * + * @throws IOException if there is a low-level I/O error + */ + public XSearcherManager(IndexWriter writer, boolean applyAllDeletes, SearcherFactory searcherFactory) throws IOException { + if (searcherFactory == null) { + searcherFactory = new SearcherFactory(); + } + this.searcherFactory = searcherFactory; + current = getSearcher(searcherFactory, DirectoryReader.open(writer, applyAllDeletes)); + } + + /** + * Creates and returns a new XSearcherManager from the given {@link Directory}. + * @param dir the directory to open the DirectoryReader on. + * @param searcherFactory An optional {@link SearcherFactory}. Pass + * <code>null</code> if you don't require the searcher to be warmed + * before going live or other custom behavior. 
+ * + * @throws IOException if there is a low-level I/O error + */ + public XSearcherManager(Directory dir, SearcherFactory searcherFactory) throws IOException { + if (searcherFactory == null) { + searcherFactory = new SearcherFactory(); + } + this.searcherFactory = searcherFactory; + current = getSearcher(searcherFactory, DirectoryReader.open(dir)); + } + + @Override + protected void decRef(IndexSearcher reference) throws IOException { + reference.getIndexReader().decRef(); + } + + @Override + protected IndexSearcher refreshIfNeeded(IndexSearcher referenceToRefresh) throws IOException { + final IndexReader r = referenceToRefresh.getIndexReader(); + assert r instanceof DirectoryReader: "searcher's IndexReader should be a DirectoryReader, but got " + r; + final IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) r); + if (newReader == null) { + return null; + } else { + return getSearcher(searcherFactory, newReader); + } + } + + @Override + protected boolean tryIncRef(IndexSearcher reference) { + return reference.getIndexReader().tryIncRef(); + } + + @Override + protected int getRefCount(IndexSearcher reference) { + return reference.getIndexReader().getRefCount(); + } + + /** + * Returns <code>true</code> if no changes have occured since this searcher + * ie. reader was opened, otherwise <code>false</code>. + * @see DirectoryReader#isCurrent() + */ + public boolean isSearcherCurrent() throws IOException { + final IndexSearcher searcher = acquire(); + try { + final IndexReader r = searcher.getIndexReader(); + assert r instanceof DirectoryReader: "searcher's IndexReader should be a DirectoryReader, but got " + r; + return ((DirectoryReader) r).isCurrent(); + } finally { + release(searcher); + } + } + + /** Expert: creates a searcher from the provided {@link + * IndexReader} using the provided {@link + * SearcherFactory}. NOTE: this decRefs incoming reader + * on throwing an exception. */ + public static IndexSearcher getSearcher(SearcherFactory searcherFactory, IndexReader reader) throws IOException { + boolean success = false; + final IndexSearcher searcher; + try { + searcher = searcherFactory.newSearcher(reader); + if (searcher.getIndexReader() != reader) { + throw new IllegalStateException("SearcherFactory must wrap exactly the provided reader (got " + searcher.getIndexReader() + " but expected " + reader + ")"); + } + success = true; + } finally { + if (!success) { + reader.decRef(); + } + } + return searcher; + } +} diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/CustomPassageFormatter.java b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPassageFormatter.java new file mode 100644 index 0000000..59ef784 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPassageFormatter.java @@ -0,0 +1,78 @@ +/* + * Licensed to Elasticsearch under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elasticsearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.lucene.search.postingshighlight; + +import org.apache.lucene.search.highlight.Encoder; +import org.elasticsearch.search.highlight.HighlightUtils; + +/** +Custom passage formatter that allows us to: +1) extract different snippets (instead of a single big string) together with their scores ({@link Snippet}) +2) use the {@link Encoder} implementations that are already used with the other highlighters + */ +public class CustomPassageFormatter extends PassageFormatter { + + private final String preTag; + private final String postTag; + private final Encoder encoder; + + public CustomPassageFormatter(String preTag, String postTag, Encoder encoder) { + this.preTag = preTag; + this.postTag = postTag; + this.encoder = encoder; + } + + @Override + public Snippet[] format(Passage[] passages, String content) { + Snippet[] snippets = new Snippet[passages.length]; + int pos; + for (int j = 0; j < passages.length; j++) { + Passage passage = passages[j]; + StringBuilder sb = new StringBuilder(); + pos = passage.startOffset; + for (int i = 0; i < passage.numMatches; i++) { + int start = passage.matchStarts[i]; + int end = passage.matchEnds[i]; + // its possible to have overlapping terms + if (start > pos) { + append(sb, content, pos, start); + } + if (end > pos) { + sb.append(preTag); + append(sb, content, Math.max(pos, start), end); + sb.append(postTag); + pos = end; + } + } + // its possible a "term" from the analyzer could span a sentence boundary. + append(sb, content, pos, Math.max(pos, passage.endOffset)); + //we remove the paragraph separator if present at the end of the snippet (we used it as separator between values) + if (sb.charAt(sb.length() - 1) == HighlightUtils.PARAGRAPH_SEPARATOR) { + sb.deleteCharAt(sb.length() - 1); + } + //and we trim the snippets too + snippets[j] = new Snippet(sb.toString().trim(), passage.score, passage.numMatches > 0); + } + return snippets; + } + + protected void append(StringBuilder dest, String content, int start, int end) { + dest.append(encoder.encodeText(content.substring(start, end))); + } +} diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java new file mode 100644 index 0000000..be5ad66 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java @@ -0,0 +1,187 @@ +/* + * Licensed to Elasticsearch under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elasticsearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package org.apache.lucene.search.postingshighlight; + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.Strings; +import org.elasticsearch.search.highlight.HighlightUtils; + +import java.io.IOException; +import java.text.BreakIterator; +import java.util.List; +import java.util.Map; + +/** + * Subclass of the {@link XPostingsHighlighter} that works for a single field in a single document. + * It receives the field values as input and it performs discrete highlighting on each single value + * calling the highlightDoc method multiple times. + * It allows to pass in the query terms to avoid calling extract terms multiple times. + * + * The use that we make of the postings highlighter is not optimal. It would be much better to + * highlight multiple docs in a single call, as we actually lose its sequential IO. But that would require: + * 1) to make our fork more complex and harder to maintain to perform discrete highlighting (needed to return + * a different snippet per value when number_of_fragments=0 and the field has multiple values) + * 2) refactoring of the elasticsearch highlight api which currently works per hit + * + */ +public final class CustomPostingsHighlighter extends XPostingsHighlighter { + + private static final Snippet[] EMPTY_SNIPPET = new Snippet[0]; + private static final Passage[] EMPTY_PASSAGE = new Passage[0]; + + private final CustomPassageFormatter passageFormatter; + private final int noMatchSize; + private final int totalContentLength; + private final String[] fieldValues; + private final int[] fieldValuesOffsets; + private int currentValueIndex = 0; + + private BreakIterator breakIterator; + + public CustomPostingsHighlighter(CustomPassageFormatter passageFormatter, List<Object> fieldValues, boolean mergeValues, int maxLength, int noMatchSize) { + super(maxLength); + this.passageFormatter = passageFormatter; + this.noMatchSize = noMatchSize; + + if (mergeValues) { + String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(getMultiValuedSeparator(""))); + String fieldValue = rawValue.substring(0, Math.min(rawValue.length(), maxLength)); + this.fieldValues = new String[]{fieldValue}; + this.fieldValuesOffsets = new int[]{0}; + this.totalContentLength = fieldValue.length(); + } else { + this.fieldValues = new String[fieldValues.size()]; + this.fieldValuesOffsets = new int[fieldValues.size()]; + int contentLength = 0; + int offset = 0; + int previousLength = -1; + for (int i = 0; i < fieldValues.size(); i++) { + String rawValue = fieldValues.get(i).toString(); + String fieldValue = rawValue.substring(0, Math.min(rawValue.length(), maxLength)); + this.fieldValues[i] = fieldValue; + contentLength += fieldValue.length(); + offset += previousLength + 1; + this.fieldValuesOffsets[i] = offset; + previousLength = fieldValue.length(); + } + this.totalContentLength = contentLength; + } + } + + /* + Our own api to highlight a single document field, passing in the query terms, and get back our own Snippet object + */ + public Snippet[] highlightDoc(String field, BytesRef[] terms, IndexSearcher searcher, int docId, int maxPassages) throws IOException { + IndexReader reader = searcher.getIndexReader(); + IndexReaderContext readerContext = reader.getContext(); + List<AtomicReaderContext> leaves = readerContext.leaves(); + + String[] 
contents = new String[]{loadCurrentFieldValue()}; + Map<Integer, Object> snippetsMap = highlightField(field, contents, getBreakIterator(field), terms, new int[]{docId}, leaves, maxPassages); + + //increment the current value index so that next time we'll highlight the next value if available + currentValueIndex++; + + Object snippetObject = snippetsMap.get(docId); + if (snippetObject != null && snippetObject instanceof Snippet[]) { + return (Snippet[]) snippetObject; + } + return EMPTY_SNIPPET; + } + + /* + Method provided through our own fork: allows to do proper scoring when doing per value discrete highlighting. + Used to provide the total length of the field (all values) for proper scoring. + */ + @Override + protected int getContentLength(String field, int docId) { + return totalContentLength; + } + + /* + Method provided through our own fork: allows to perform proper per value discrete highlighting. + Used to provide the offset for the current value. + */ + @Override + protected int getOffsetForCurrentValue(String field, int docId) { + if (currentValueIndex < fieldValuesOffsets.length) { + return fieldValuesOffsets[currentValueIndex]; + } + throw new IllegalArgumentException("No more values offsets to return"); + } + + public void setBreakIterator(BreakIterator breakIterator) { + this.breakIterator = breakIterator; + } + + @Override + protected PassageFormatter getFormatter(String field) { + return passageFormatter; + } + + @Override + protected BreakIterator getBreakIterator(String field) { + if (breakIterator == null) { + return super.getBreakIterator(field); + } + return breakIterator; + } + + @Override + protected char getMultiValuedSeparator(String field) { + //U+2029 PARAGRAPH SEPARATOR (PS): each value holds a discrete passage for highlighting + return HighlightUtils.PARAGRAPH_SEPARATOR; + } + + /* + By default the postings highlighter returns non highlighted snippet when there are no matches. + We want to return no snippets by default, unless no_match_size is greater than 0 + */ + @Override + protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { + if (noMatchSize > 0) { + //we want to return the first sentence of the first snippet only + return super.getEmptyHighlight(fieldName, bi, 1); + } + return EMPTY_PASSAGE; + } + + /* + Not needed since we call our own loadCurrentFieldValue explicitly, but we override it anyway for consistency. + */ + @Override + protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { + return new String[][]{new String[]{loadCurrentFieldValue()}}; + } + + /* + Our own method that returns the field values, which relies on the content that was provided when creating the highlighter. + Supports per value discrete highlighting calling the highlightDoc method multiple times, one per value. + */ + protected String loadCurrentFieldValue() { + if (currentValueIndex < fieldValues.length) { + return fieldValues[currentValueIndex]; + } + throw new IllegalArgumentException("No more values to return"); + } +} diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java b/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java new file mode 100644 index 0000000..bf68020 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java @@ -0,0 +1,50 @@ +/* + * Licensed to Elasticsearch under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elasticsearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.lucene.search.postingshighlight; + +/** + * Represents a scored highlighted snippet. + * It's our own arbitrary object that we get back from the postings highlighter when highlighting a document. + * Every snippet contains its formatted text and its score. + * The score is needed since we highlight every single value separately and we might want to return snippets sorted by score. + */ +public class Snippet { + + private final String text; + private final float score; + private final boolean isHighlighted; + + public Snippet(String text, float score, boolean isHighlighted) { + this.text = text; + this.score = score; + this.isHighlighted = isHighlighted; + } + + public String getText() { + return text; + } + + public float getScore() { + return score; + } + + public boolean isHighlighted() { + return isHighlighted; + } +} diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java b/src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java new file mode 100644 index 0000000..869b559 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java @@ -0,0 +1,777 @@ +/* + * Licensed to Elasticsearch under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elasticsearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package org.apache.lucene.search.postingshighlight; + +import org.apache.lucene.index.*; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.InPlaceMergeSorter; +import org.apache.lucene.util.UnicodeUtil; + +import java.io.IOException; +import java.text.BreakIterator; +import java.util.*; + +/* +FORKED from Lucene 4.5 to be able to: +1) support discrete highlighting for multiple values, so that we can return a different snippet per value when highlighting the whole text +2) call the highlightField method directly from subclasses and provide the terms by ourselves +3) Applied LUCENE-4906 to allow PassageFormatter to return arbitrary objects (LUCENE 4.6) + +All our changes start with //BEGIN EDIT + */ +public class XPostingsHighlighter { + + //BEGIN EDIT added method to override offset for current value (default 0) + //we need this to perform discrete highlighting per field + protected int getOffsetForCurrentValue(String field, int docId) { + return 0; + } + //END EDIT + + //BEGIN EDIT + //we need this to fix scoring when highlighting every single value separately, since the score depends on the total length of the field (all values rather than only the current one) + protected int getContentLength(String field, int docId) { + return -1; + } + //END EDIT + + + // TODO: maybe allow re-analysis for tiny fields? currently we require offsets, + // but if the analyzer is really fast and the field is tiny, this might really be + // unnecessary. + + /** for rewriting: we don't want slow processing from MTQs */ + private static final IndexReader EMPTY_INDEXREADER = new MultiReader(); + + /** Default maximum content size to process. Typically snippets + * closer to the beginning of the document better summarize its content */ + public static final int DEFAULT_MAX_LENGTH = 10000; + + private final int maxLength; + + /** Set the first time {@link #getFormatter} is called, + * and then reused. */ + private PassageFormatter defaultFormatter; + + /** Set the first time {@link #getScorer} is called, + * and then reused. */ + private PassageScorer defaultScorer; + + /** + * Creates a new highlighter with default parameters. + */ + public XPostingsHighlighter() { + this(DEFAULT_MAX_LENGTH); + } + + /** + * Creates a new highlighter, specifying maximum content length. + * @param maxLength maximum content size to process. + * @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code> + */ + public XPostingsHighlighter(int maxLength) { + if (maxLength < 0 || maxLength == Integer.MAX_VALUE) { + // two reasons: no overflow problems in BreakIterator.preceding(offset+1), + // our sentinel in the offsets queue uses this value to terminate. + throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE"); + } + this.maxLength = maxLength; + } + + /** Returns the {@link java.text.BreakIterator} to use for + * dividing text into passages. This returns + * {@link java.text.BreakIterator#getSentenceInstance(java.util.Locale)} by default; + * subclasses can override to customize. */ + protected BreakIterator getBreakIterator(String field) { + return BreakIterator.getSentenceInstance(Locale.ROOT); + } + + /** Returns the {@link PassageFormatter} to use for + * formatting passages into highlighted snippets. 
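+ * <p>
+ * For illustration only (the tags and encoder below are arbitrary choices), a
+ * subclass could return a custom implementation such as the
+ * {@code CustomPassageFormatter} added in this change:
+ * <pre class="prettyprint">
+ * return new CustomPassageFormatter("&lt;em&gt;", "&lt;/em&gt;", new DefaultEncoder());
+ * </pre>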
This + * returns a new {@code PassageFormatter} by default; + * subclasses can override to customize. */ + protected PassageFormatter getFormatter(String field) { + if (defaultFormatter == null) { + defaultFormatter = new DefaultPassageFormatter(); + } + return defaultFormatter; + } + + /** Returns the {@link PassageScorer} to use for + * ranking passages. This + * returns a new {@code PassageScorer} by default; + * subclasses can override to customize. */ + protected PassageScorer getScorer(String field) { + if (defaultScorer == null) { + defaultScorer = new PassageScorer(); + } + return defaultScorer; + } + + /** + * Highlights the top passages from a single field. + * + * @param field field name to highlight. + * Must have a stored string value and also be indexed with offsets. + * @param query query to highlight. + * @param searcher searcher that was previously used to execute the query. + * @param topDocs TopDocs containing the summary result documents to highlight. + * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>. + * If no highlights were found for a document, the + * first sentence for the field will be returned. + * @throws java.io.IOException if an I/O error occurred during processing + * @throws IllegalArgumentException if <code>field</code> was indexed without + * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} + */ + public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException { + return highlight(field, query, searcher, topDocs, 1); + } + + /** + * Highlights the top-N passages from a single field. + * + * @param field field name to highlight. + * Must have a stored string value and also be indexed with offsets. + * @param query query to highlight. + * @param searcher searcher that was previously used to execute the query. + * @param topDocs TopDocs containing the summary result documents to highlight. + * @param maxPassages The maximum number of top-N ranked passages used to + * form the highlighted snippets. + * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>. + * If no highlights were found for a document, the + * first {@code maxPassages} sentences from the + * field will be returned. + * @throws IOException if an I/O error occurred during processing + * @throws IllegalArgumentException if <code>field</code> was indexed without + * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} + */ + public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException { + Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, new int[] { maxPassages }); + return res.get(field); + } + + /** + * Highlights the top passages from multiple fields. + * <p> + * Conceptually, this behaves as a more efficient form of: + * <pre class="prettyprint"> + * Map m = new HashMap(); + * for (String field : fields) { + * m.put(field, highlight(field, query, searcher, topDocs)); + * } + * return m; + * </pre> + * + * @param fields field names to highlight. + * Must have a stored string value and also be indexed with offsets. + * @param query query to highlight. + * @param searcher searcher that was previously used to execute the query. + * @param topDocs TopDocs containing the summary result documents to highlight. 
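+ * <p>
+ * Usage sketch only ({@code query}, {@code searcher} and {@code topDocs} come from
+ * a prior search, and the field names assume fields stored and indexed with offsets):
+ * <pre class="prettyprint">
+ * XPostingsHighlighter highlighter = new XPostingsHighlighter();
+ * Map&lt;String,String[]&gt; snippets = highlighter.highlightFields(
+ *     new String[] {"title", "body"}, query, searcher, topDocs);
+ * String[] bodySnippets = snippets.get("body");
+ * </pre>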
+ * @return Map keyed on field name, containing the array of formatted snippets + * corresponding to the documents in <code>topDocs</code>. + * If no highlights were found for a document, the + * first sentence from the field will be returned. + * @throws IOException if an I/O error occurred during processing + * @throws IllegalArgumentException if <code>field</code> was indexed without + * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} + */ + public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException { + int maxPassages[] = new int[fields.length]; + Arrays.fill(maxPassages, 1); + return highlightFields(fields, query, searcher, topDocs, maxPassages); + } + + /** + * Highlights the top-N passages from multiple fields. + * <p> + * Conceptually, this behaves as a more efficient form of: + * <pre class="prettyprint"> + * Map m = new HashMap(); + * for (String field : fields) { + * m.put(field, highlight(field, query, searcher, topDocs, maxPassages)); + * } + * return m; + * </pre> + * + * @param fields field names to highlight. + * Must have a stored string value and also be indexed with offsets. + * @param query query to highlight. + * @param searcher searcher that was previously used to execute the query. + * @param topDocs TopDocs containing the summary result documents to highlight. + * @param maxPassages The maximum number of top-N ranked passages per-field used to + * form the highlighted snippets. + * @return Map keyed on field name, containing the array of formatted snippets + * corresponding to the documents in <code>topDocs</code>. + * If no highlights were found for a document, the + * first {@code maxPassages} sentences from the + * field will be returned. + * @throws IOException if an I/O error occurred during processing + * @throws IllegalArgumentException if <code>field</code> was indexed without + * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} + */ + public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages[]) throws IOException { + final ScoreDoc scoreDocs[] = topDocs.scoreDocs; + int docids[] = new int[scoreDocs.length]; + for (int i = 0; i < docids.length; i++) { + docids[i] = scoreDocs[i].doc; + } + + return highlightFields(fields, query, searcher, docids, maxPassages); + } + + /** + * Highlights the top-N passages from multiple fields, + * for the provided int[] docids. + * + * @param fieldsIn field names to highlight. + * Must have a stored string value and also be indexed with offsets. + * @param query query to highlight. + * @param searcher searcher that was previously used to execute the query. + * @param docidsIn containing the document IDs to highlight. + * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to + * form the highlighted snippets. + * @return Map keyed on field name, containing the array of formatted snippets + * corresponding to the documents in <code>topDocs</code>. + * If no highlights were found for a document, the + * first {@code maxPassages} from the field will + * be returned. 
+ * @throws IOException if an I/O error occurred during processing + * @throws IllegalArgumentException if <code>field</code> was indexed without + * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} + */ + public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException { + Map<String,String[]> snippets = new HashMap<String,String[]>(); + for(Map.Entry<String,Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) { + Object[] snippetObjects = ent.getValue(); + String[] snippetStrings = new String[snippetObjects.length]; + snippets.put(ent.getKey(), snippetStrings); + for(int i=0;i<snippetObjects.length;i++) { + Object snippet = snippetObjects[i]; + if (snippet != null) { + snippetStrings[i] = snippet.toString(); + } + } + } + + return snippets; + } + + public Map<String,Object[]> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException { + if (fieldsIn.length < 1) { + throw new IllegalArgumentException("fieldsIn must not be empty"); + } + if (fieldsIn.length != maxPassagesIn.length) { + throw new IllegalArgumentException("invalid number of maxPassagesIn"); + } + final IndexReader reader = searcher.getIndexReader(); + query = rewrite(query); + SortedSet<Term> queryTerms = new TreeSet<Term>(); + query.extractTerms(queryTerms); + + IndexReaderContext readerContext = reader.getContext(); + List<AtomicReaderContext> leaves = readerContext.leaves(); + + // Make our own copies because we sort in-place: + int[] docids = new int[docidsIn.length]; + System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length); + final String fields[] = new String[fieldsIn.length]; + System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length); + final int maxPassages[] = new int[maxPassagesIn.length]; + System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length); + + // sort for sequential io + Arrays.sort(docids); + new InPlaceMergeSorter() { + + @Override + protected void swap(int i, int j) { + String tmp = fields[i]; + fields[i] = fields[j]; + fields[j] = tmp; + int tmp2 = maxPassages[i]; + maxPassages[i] = maxPassages[j]; + maxPassages[j] = tmp2; + } + + @Override + protected int compare(int i, int j) { + return fields[i].compareTo(fields[j]); + } + + }.sort(0, fields.length); + + // pull stored data: + String[][] contents = loadFieldValues(searcher, fields, docids, maxLength); + + Map<String,Object[]> highlights = new HashMap<String,Object[]>(); + for (int i = 0; i < fields.length; i++) { + String field = fields[i]; + int numPassages = maxPassages[i]; + + Term floor = new Term(field, ""); + Term ceiling = new Term(field, UnicodeUtil.BIG_TERM); + SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling); + // TODO: should we have some reasonable defaults for term pruning? (e.g. 
stopwords) + + // Strip off the redundant field: + BytesRef terms[] = new BytesRef[fieldTerms.size()]; + int termUpto = 0; + for(Term term : fieldTerms) { + terms[termUpto++] = term.bytes(); + } + Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages); + + Object[] result = new Object[docids.length]; + for (int j = 0; j < docidsIn.length; j++) { + result[j] = fieldHighlights.get(docidsIn[j]); + } + highlights.put(field, result); + } + return highlights; + } + + /** Loads the String values for each field X docID to be + * highlighted. By default this loads from stored + * fields, but a subclass can change the source. This + * method should allocate the String[fields.length][docids.length] + * and fill all values. The returned Strings must be + * identical to what was indexed. */ + protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { + String contents[][] = new String[fields.length][docids.length]; + char valueSeparators[] = new char[fields.length]; + for (int i = 0; i < fields.length; i++) { + valueSeparators[i] = getMultiValuedSeparator(fields[i]); + } + LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength); + for (int i = 0; i < docids.length; i++) { + searcher.doc(docids[i], visitor); + for (int j = 0; j < fields.length; j++) { + contents[j][i] = visitor.getValue(j); + } + visitor.reset(); + } + return contents; + } + + /** + * Returns the logical separator between values for multi-valued fields. + * The default value is a space character, which means passages can span across values, + * but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)} + * if each value holds a discrete passage for highlighting. + */ + protected char getMultiValuedSeparator(String field) { + return ' '; + } + + //BEGIN EDIT: made protected so that we can call from our subclass and pass in the terms by ourselves + protected Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException { + //private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext > leaves, int maxPassages) throws IOException { + //END EDIT + + Map<Integer,Object> highlights = new HashMap<Integer,Object>(); + + // reuse in the real sense... 
for docs in same segment we just advance our old enum + DocsAndPositionsEnum postings[] = null; + TermsEnum termsEnum = null; + int lastLeaf = -1; + + PassageFormatter fieldFormatter = getFormatter(field); + if (fieldFormatter == null) { + throw new NullPointerException("PassageFormatter cannot be null"); + } + + for (int i = 0; i < docids.length; i++) { + String content = contents[i]; + if (content.length() == 0) { + continue; // nothing to do + } + bi.setText(content); + int doc = docids[i]; + int leaf = ReaderUtil.subIndex(doc, leaves); + AtomicReaderContext subContext = leaves.get(leaf); + AtomicReader r = subContext.reader(); + Terms t = r.terms(field); + if (t == null) { + continue; // nothing to do + } + if (leaf != lastLeaf) { + termsEnum = t.iterator(null); + postings = new DocsAndPositionsEnum[terms.length]; + } + Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages); + if (passages.length == 0) { + passages = getEmptyHighlight(field, bi, maxPassages); + } + if (passages.length > 0) { + // otherwise a null snippet (eg if field is missing + // entirely from the doc) + highlights.put(doc, fieldFormatter.format(passages, content)); + } + lastLeaf = leaf; + } + + return highlights; + } + + // algorithm: treat sentence snippets as miniature documents + // we can intersect these with the postings lists via BreakIterator.preceding(offset),s + // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq)) + private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc, + TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException { + + //BEGIN EDIT added call to method that returns the offset for the current value (discrete highlighting) + int valueOffset = getOffsetForCurrentValue(field, doc); + //END EDIT + + PassageScorer scorer = getScorer(field); + if (scorer == null) { + throw new NullPointerException("PassageScorer cannot be null"); + } + + + //BEGIN EDIT discrete highlighting + // the scoring needs to be based on the length of the whole field (all values rather than only the current one) + int totalContentLength = getContentLength(field, doc); + if (totalContentLength == -1) { + totalContentLength = contentLength; + } + //END EDIT + + + PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>(); + float weights[] = new float[terms.length]; + // initialize postings + for (int i = 0; i < terms.length; i++) { + DocsAndPositionsEnum de = postings[i]; + int pDoc; + if (de == EMPTY) { + continue; + } else if (de == null) { + postings[i] = EMPTY; // initially + if (!termsEnum.seekExact(terms[i])) { + continue; // term not found + } + de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS); + if (de == null) { + // no positions available + throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); + } + pDoc = de.advance(doc); + } else { + pDoc = de.docID(); + if (pDoc < doc) { + pDoc = de.advance(doc); + } + } + + if (doc == pDoc) { + //BEGIN EDIT we take into account the length of the whole field (all values) to properly score the snippets + weights[i] = scorer.weight(totalContentLength, de.freq()); + //weights[i] = scorer.weight(contentLength, de.freq()); + //END EDIT + de.nextPosition(); + pq.add(new OffsetsEnum(de, i)); + } + } + + pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination + + PriorityQueue<Passage> 
passageQueue = new PriorityQueue<Passage>(n, new Comparator<Passage>() { + @Override + public int compare(Passage left, Passage right) { + if (left.score < right.score) { + return -1; + } else if (left.score > right.score) { + return 1; + } else { + return left.startOffset - right.startOffset; + } + } + }); + Passage current = new Passage(); + + OffsetsEnum off; + while ((off = pq.poll()) != null) { + final DocsAndPositionsEnum dp = off.dp; + + int start = dp.startOffset(); + if (start == -1) { + throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); + } + int end = dp.endOffset(); + // LUCENE-5166: this hit would span the content limit... however more valid + // hits may exist (they are sorted by start). so we pretend like we never + // saw this term, it won't cause a passage to be added to passageQueue or anything. + assert EMPTY.startOffset() == Integer.MAX_VALUE; + if (start < contentLength && end > contentLength) { + continue; + } + + + //BEGIN EDIT support for discrete highlighting (added block code) + //switch to the first match in the current value if there is one + boolean seenEnough = false; + while (start < valueOffset) { + if (off.pos == dp.freq()) { + seenEnough = true; + break; + } else { + off.pos++; + dp.nextPosition(); + start = dp.startOffset(); + end = dp.endOffset(); + } + } + + //continue with next term if we've already seen the current one all the times it appears + //that means that the current value doesn't hold matches for the current term + if (seenEnough) { + continue; + } + + //we now subtract the offset of the current value to both start and end + start -= valueOffset; + end -= valueOffset; + //END EDIT + + + if (start >= current.endOffset) { + if (current.startOffset >= 0) { + // finalize current + //BEGIN EDIT we take into account the value offset when scoring the snippet based on its position + current.score *= scorer.norm(current.startOffset + valueOffset); + //current.score *= scorer.norm(current.startOffset); + //END EDIT + // new sentence: first add 'current' to queue + if (passageQueue.size() == n && current.score < passageQueue.peek().score) { + current.reset(); // can't compete, just reset it + } else { + passageQueue.offer(current); + if (passageQueue.size() > n) { + current = passageQueue.poll(); + current.reset(); + } else { + current = new Passage(); + } + } + } + // if we exceed limit, we are done + if (start >= contentLength) { + Passage passages[] = new Passage[passageQueue.size()]; + passageQueue.toArray(passages); + for (Passage p : passages) { + p.sort(); + } + // sort in ascending order + Arrays.sort(passages, new Comparator<Passage>() { + @Override + public int compare(Passage left, Passage right) { + return left.startOffset - right.startOffset; + } + }); + return passages; + } + // advance breakiterator + assert BreakIterator.DONE < 0; + current.startOffset = Math.max(bi.preceding(start+1), 0); + current.endOffset = Math.min(bi.next(), contentLength); + } + int tf = 0; + while (true) { + tf++; + current.addMatch(start, end, terms[off.id]); + if (off.pos == dp.freq()) { + break; // removed from pq + } else { + off.pos++; + dp.nextPosition(); + //BEGIN EDIT support for discrete highlighting + start = dp.startOffset() - valueOffset; + end = dp.endOffset() - valueOffset; + //start = dp.startOffset(); + //end = dp.endOffset(); + //END EDIT + } + if (start >= current.endOffset || end > contentLength) { + pq.offer(off); + break; + } + } + current.score += weights[off.id] * scorer.tf(tf, 
current.endOffset - current.startOffset); + } + + // Dead code but compiler disagrees: + assert false; + return null; + } + + /** Called to summarize a document when no hits were + * found. By default this just returns the first + * {@code maxPassages} sentences; subclasses can override + * to customize. */ + protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { + // BreakIterator should be un-next'd: + List<Passage> passages = new ArrayList<Passage>(); + int pos = bi.current(); + assert pos == 0; + while (passages.size() < maxPassages) { + int next = bi.next(); + if (next == BreakIterator.DONE) { + break; + } + Passage passage = new Passage(); + passage.score = Float.NaN; + passage.startOffset = pos; + passage.endOffset = next; + passages.add(passage); + pos = next; + } + + return passages.toArray(new Passage[passages.size()]); + } + + private static class OffsetsEnum implements Comparable<OffsetsEnum> { + DocsAndPositionsEnum dp; + int pos; + int id; + + OffsetsEnum(DocsAndPositionsEnum dp, int id) throws IOException { + this.dp = dp; + this.id = id; + this.pos = 1; + } + + @Override + public int compareTo(OffsetsEnum other) { + try { + int off = dp.startOffset(); + int otherOff = other.dp.startOffset(); + if (off == otherOff) { + return id - other.id; + } else { + return Long.signum(((long)off) - otherOff); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + private static final DocsAndPositionsEnum EMPTY = new DocsAndPositionsEnum() { + + @Override + public int nextPosition() throws IOException { return 0; } + + @Override + public int startOffset() throws IOException { return Integer.MAX_VALUE; } + + @Override + public int endOffset() throws IOException { return Integer.MAX_VALUE; } + + @Override + public BytesRef getPayload() throws IOException { return null; } + + @Override + public int freq() throws IOException { return 0; } + + @Override + public int docID() { return NO_MORE_DOCS; } + + @Override + public int nextDoc() throws IOException { return NO_MORE_DOCS; } + + @Override + public int advance(int target) throws IOException { return NO_MORE_DOCS; } + + @Override + public long cost() { return 0; } + }; + + /** + * we rewrite against an empty indexreader: as we don't want things like + * rangeQueries that don't summarize the document + */ + private static Query rewrite(Query original) throws IOException { + Query query = original; + for (Query rewrittenQuery = query.rewrite(EMPTY_INDEXREADER); rewrittenQuery != query; + rewrittenQuery = query.rewrite(EMPTY_INDEXREADER)) { + query = rewrittenQuery; + } + return query; + } + + private static class LimitedStoredFieldVisitor extends StoredFieldVisitor { + private final String fields[]; + private final char valueSeparators[]; + private final int maxLength; + private final StringBuilder builders[]; + private int currentField = -1; + + public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) { + assert fields.length == valueSeparators.length; + this.fields = fields; + this.valueSeparators = valueSeparators; + this.maxLength = maxLength; + builders = new StringBuilder[fields.length]; + for (int i = 0; i < builders.length; i++) { + builders[i] = new StringBuilder(); + } + } + + @Override + public void stringField(FieldInfo fieldInfo, String value) throws IOException { + assert currentField >= 0; + StringBuilder builder = builders[currentField]; + if (builder.length() > 0 && builder.length() < maxLength) { + 
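+        // a previous value was already appended for this field and we are still
+        // under maxLength, so insert the multi-valued separator before this value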
builder.append(valueSeparators[currentField]); + } + if (builder.length() + value.length() > maxLength) { + builder.append(value, 0, maxLength - builder.length()); + } else { + builder.append(value); + } + } + + @Override + public Status needsField(FieldInfo fieldInfo) throws IOException { + currentField = Arrays.binarySearch(fields, fieldInfo.name); + if (currentField < 0) { + return Status.NO; + } else if (builders[currentField].length() > maxLength) { + return fields.length == 1 ? Status.STOP : Status.NO; + } + return Status.YES; + } + + String getValue(int i) { + return builders[i].toString(); + } + + void reset() { + currentField = -1; + for (int i = 0; i < fields.length; i++) { + builders[i].setLength(0); + } + } + } +} diff --git a/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java b/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java new file mode 100644 index 0000000..6e8b0a1 --- /dev/null +++ b/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java @@ -0,0 +1,1052 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.lucene.search.suggest.analyzing; + +import com.carrotsearch.hppc.ObjectIntOpenHashMap; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.search.suggest.InputIterator; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.Sort; +import org.apache.lucene.store.*; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.*; +import org.apache.lucene.util.automaton.*; +import org.apache.lucene.util.fst.*; +import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.apache.lucene.util.fst.Util.MinResult; +import org.elasticsearch.common.collect.HppcMaps; + +import java.io.*; +import java.util.*; + +/** + * Suggester that first analyzes the surface form, adds the + * analyzed form to a weighted FST, and then does the same + * thing at lookup time. This means lookup is based on the + * analyzed form while suggestions are still the surface + * form(s). + * + * <p> + * This can result in powerful suggester functionality. For + * example, if you use an analyzer removing stop words, + * then the partial text "ghost chr..." could see the + * suggestion "The Ghost of Christmas Past". 
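+ * <p>
+ * As a usage sketch only ({@code analyzer} and {@code iterator} are placeholders;
+ * the lookup arguments are arbitrary):
+ * <pre class="prettyprint">
+ * XAnalyzingSuggester suggester = new XAnalyzingSuggester(analyzer); // simplest constructor; see the note on position increments below
+ * suggester.build(iterator);  // InputIterator over surface forms and weights
+ * List&lt;LookupResult&gt; results = suggester.lookup("ghost chr", false, 5);
+ * </pre>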
Note that + * position increments MUST NOT be preserved for this example + * to work, so you should call the constructor with + * <code>preservePositionIncrements</code> parameter set to + * false + * + * <p> + * If SynonymFilter is used to map wifi and wireless network to + * hotspot then the partial text "wirele..." could suggest + * "wifi router". Token normalization like stemmers, accent + * removal, etc., would allow suggestions to ignore such + * variations. + * + * <p> + * When two matching suggestions have the same weight, they + * are tie-broken by the analyzed form. If their analyzed + * form is the same then the order is undefined. + * + * <p> + * There are some limitations: + * <ul> + * + * <li> A lookup from a query like "net" in English won't + * be any different than "net " (ie, user added a + * trailing space) because analyzers don't reflect + * when they've seen a token separator and when they + * haven't. + * + * <li> If you're using {@code StopFilter}, and the user will + * type "fast apple", but so far all they've typed is + * "fast a", again because the analyzer doesn't convey whether + * it's seen a token separator after the "a", + * {@code StopFilter} will remove that "a" causing + * far more matches than you'd expect. + * + * <li> Lookups with the empty string return no results + * instead of all results. + * </ul> + * + * @lucene.experimental + */ +public class XAnalyzingSuggester extends Lookup { + + /** + * FST<Weight,Surface>: + * input is the analyzed form, with a null byte between terms + * weights are encoded as costs: (Integer.MAX_VALUE-weight) + * surface is the original, unanalyzed form. + */ + private FST<Pair<Long,BytesRef>> fst = null; + + /** + * Analyzer that will be used for analyzing suggestions at + * index time. + */ + private final Analyzer indexAnalyzer; + + /** + * Analyzer that will be used for analyzing suggestions at + * query time. + */ + private final Analyzer queryAnalyzer; + + /** + * True if exact match suggestions should always be returned first. + */ + private final boolean exactFirst; + + /** + * True if separator between tokens should be preserved. + */ + private final boolean preserveSep; + + /** Include this flag in the options parameter to {@link + * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to always + * return the exact match first, regardless of score. This + * has no performance impact but could result in + * low-quality suggestions. */ + public static final int EXACT_FIRST = 1; + + /** Include this flag in the options parameter to {@link + * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to preserve + * token separators when matching. */ + public static final int PRESERVE_SEP = 2; + + /** Represents the separation between tokens, if + * PRESERVE_SEP was specified */ + public static final int SEP_LABEL = '\u001F'; + + /** Marks end of the analyzed input and start of dedup + * byte. */ + public static final int END_BYTE = 0x0; + + /** Maximum number of dup surface forms (different surface + * forms for the same analyzed form). */ + private final int maxSurfaceFormsPerAnalyzedForm; + + /** Maximum graph paths to index for a single analyzed + * surface form. This only matters if your analyzer + * makes lots of alternate paths (e.g. contains + * SynonymFilter). */ + private final int maxGraphExpansions; + + /** Highest number of analyzed paths we saw for any single + * input surface form. 
For analyzers that never create + * graphs this will always be 1. */ + private int maxAnalyzedPathsForOneInput; + + private boolean hasPayloads; + + private final int sepLabel; + private final int payloadSep; + private final int endByte; + private final int holeCharacter; + + public static final int PAYLOAD_SEP = '\u001F'; + public static final int HOLE_CHARACTER = '\u001E'; + + /** Whether position holes should appear in the automaton. */ + private boolean preservePositionIncrements; + + /** + * Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int) + * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | + * PRESERVE_SEP, 256, -1)} + */ + public XAnalyzingSuggester(Analyzer analyzer) { + this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER); + } + + /** + * Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int) + * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | + * PRESERVE_SEP, 256, -1)} + */ + public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { + this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER); + } + + /** + * Creates a new suggester. + * + * @param indexAnalyzer Analyzer that will be used for + * analyzing suggestions while building the index. + * @param queryAnalyzer Analyzer that will be used for + * analyzing query text during lookup + * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} + * @param maxSurfaceFormsPerAnalyzedForm Maximum number of + * surface forms to keep for a single analyzed form. + * When there are too many surface forms we discard the + * lowest weighted ones. + * @param maxGraphExpansions Maximum number of graph paths + * to expand from the analyzed form. Set this to -1 for + * no limit. + */ + public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, + boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput, + int sepLabel, int payloadSep, int endByte, int holeCharacter) { + // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput + this.indexAnalyzer = indexAnalyzer; + this.queryAnalyzer = queryAnalyzer; + this.fst = fst; + this.hasPayloads = hasPayloads; + if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) { + throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options); + } + this.exactFirst = (options & EXACT_FIRST) != 0; + this.preserveSep = (options & PRESERVE_SEP) != 0; + + // NOTE: this is just an implementation limitation; if + // somehow this is a problem we could fix it by using + // more than one byte to disambiguate ... but 256 seems + // like it should be way more then enough. 
+ if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) { + throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")"); + } + this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; + + if (maxGraphExpansions < 1 && maxGraphExpansions != -1) { + throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")"); + } + this.maxGraphExpansions = maxGraphExpansions; + this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput; + this.preservePositionIncrements = preservePositionIncrements; + this.sepLabel = sepLabel; + this.payloadSep = payloadSep; + this.endByte = endByte; + this.holeCharacter = holeCharacter; + } + + /** Returns byte size of the underlying FST. */ + public long sizeInBytes() { + return fst == null ? 0 : fst.sizeInBytes(); + } + + private static void copyDestTransitions(State from, State to, List<Transition> transitions) { + if (to.isAccept()) { + from.setAccept(true); + } + for(Transition t : to.getTransitions()) { + transitions.add(t); + } + } + + // Replaces SEP with epsilon or remaps them if + // we were asked to preserve them: + private static void replaceSep(Automaton a, boolean preserveSep, int replaceSep) { + + State[] states = a.getNumberedStates(); + + // Go in reverse topo sort so we know we only have to + // make one pass: + for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) { + final State state = states[stateNumber]; + List<Transition> newTransitions = new ArrayList<Transition>(); + for(Transition t : state.getTransitions()) { + assert t.getMin() == t.getMax(); + if (t.getMin() == TokenStreamToAutomaton.POS_SEP) { + if (preserveSep) { + // Remap to SEP_LABEL: + newTransitions.add(new Transition(replaceSep, t.getDest())); + } else { + copyDestTransitions(state, t.getDest(), newTransitions); + a.setDeterministic(false); + } + } else if (t.getMin() == TokenStreamToAutomaton.HOLE) { + + // Just remove the hole: there will then be two + // SEP tokens next to each other, which will only + // match another hole at search time. Note that + // it will also match an empty-string token ... if + // that's somehow a problem we can always map HOLE + // to a dedicated byte (and escape it in the + // input). + copyDestTransitions(state, t.getDest(), newTransitions); + a.setDeterministic(false); + } else { + newTransitions.add(t); + } + } + state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()])); + } + } + + protected Automaton convertAutomaton(Automaton a) { + return a; + } + + /** Just escapes the 0xff byte (which we still for SEP). 
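+ * (Escaping works by doubling any input byte that equals the separator label, so
+ * such bytes cannot be confused with the single separator byte inserted between
+ * tokens.)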
*/ + private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { + + final BytesRef spare = new BytesRef(); + private char sepLabel; + + public EscapingTokenStreamToAutomaton(char sepLabel) { + this.sepLabel = sepLabel; + } + + @Override + protected BytesRef changeToken(BytesRef in) { + int upto = 0; + for(int i=0;i<in.length;i++) { + byte b = in.bytes[in.offset+i]; + if (b == (byte) sepLabel) { + if (spare.bytes.length == upto) { + spare.grow(upto+2); + } + spare.bytes[upto++] = (byte) sepLabel; + spare.bytes[upto++] = b; + } else { + if (spare.bytes.length == upto) { + spare.grow(upto+1); + } + spare.bytes[upto++] = b; + } + } + spare.offset = 0; + spare.length = upto; + return spare; + } + } + + public TokenStreamToAutomaton getTokenStreamToAutomaton() { + final TokenStreamToAutomaton tsta; + if (preserveSep) { + tsta = new EscapingTokenStreamToAutomaton((char) sepLabel); + } else { + // When we're not preserving sep, we don't steal 0xff + // byte, so we don't need to do any escaping: + tsta = new TokenStreamToAutomaton(); + } + tsta.setPreservePositionIncrements(preservePositionIncrements); + return tsta; + } + + private static class AnalyzingComparator implements Comparator<BytesRef> { + + private final boolean hasPayloads; + + public AnalyzingComparator(boolean hasPayloads) { + this.hasPayloads = hasPayloads; + } + + private final ByteArrayDataInput readerA = new ByteArrayDataInput(); + private final ByteArrayDataInput readerB = new ByteArrayDataInput(); + private final BytesRef scratchA = new BytesRef(); + private final BytesRef scratchB = new BytesRef(); + + @Override + public int compare(BytesRef a, BytesRef b) { + + // First by analyzed form: + readerA.reset(a.bytes, a.offset, a.length); + scratchA.length = readerA.readShort(); + scratchA.bytes = a.bytes; + scratchA.offset = readerA.getPosition(); + + readerB.reset(b.bytes, b.offset, b.length); + scratchB.bytes = b.bytes; + scratchB.length = readerB.readShort(); + scratchB.offset = readerB.getPosition(); + + int cmp = scratchA.compareTo(scratchB); + if (cmp != 0) { + return cmp; + } + readerA.skipBytes(scratchA.length); + readerB.skipBytes(scratchB.length); + // Next by cost: + long aCost = readerA.readInt(); + long bCost = readerB.readInt(); + if (aCost < bCost) { + return -1; + } else if (aCost > bCost) { + return 1; + } + + // Finally by surface form: + if (hasPayloads) { + scratchA.length = readerA.readShort(); + scratchA.offset = readerA.getPosition(); + scratchB.length = readerB.readShort(); + scratchB.offset = readerB.getPosition(); + } else { + scratchA.offset = readerA.getPosition(); + scratchA.length = a.length - scratchA.offset; + scratchB.offset = readerB.getPosition(); + scratchB.length = b.length - scratchB.offset; + } + return scratchA.compareTo(scratchB); + } + } + + @Override + public void build(InputIterator iterator) throws IOException { + String prefix = getClass().getSimpleName(); + File directory = Sort.defaultTempDir(); + File tempInput = File.createTempFile(prefix, ".input", directory); + File tempSorted = File.createTempFile(prefix, ".sorted", directory); + + hasPayloads = iterator.hasPayloads(); + + Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); + Sort.ByteSequencesReader reader = null; + BytesRef scratch = new BytesRef(); + + TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); + + boolean success = false; + byte buffer[] = new byte[8]; + try { + ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); + BytesRef surfaceForm; + + 
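+      // Each analyzed path of every surface form is written to the temp file as
+      // one record:
+      //   analyzedLength (short) | analyzed bytes | encoded weight (int)
+      //   then, with payloads:    surfaceLength (short) | surface bytes | payload bytes
+      //   without payloads:       surface bytes
+      // The records are later sorted (analyzed form, then cost, then surface form)
+      // so the FST builder receives its inputs in order.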
while ((surfaceForm = iterator.next()) != null) { + Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a); + + maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size()); + + for (IntsRef path : paths) { + + Util.toBytesRef(path, scratch); + + // length of the analyzed text (FST input) + if (scratch.length > Short.MAX_VALUE-2) { + throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")"); + } + short analyzedLength = (short) scratch.length; + + // compute the required length: + // analyzed sequence + weight (4) + surface + analyzedLength (short) + int requiredLength = analyzedLength + 4 + surfaceForm.length + 2; + + BytesRef payload; + + if (hasPayloads) { + if (surfaceForm.length > (Short.MAX_VALUE-2)) { + throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")"); + } + payload = iterator.payload(); + // payload + surfaceLength (short) + requiredLength += payload.length + 2; + } else { + payload = null; + } + + buffer = ArrayUtil.grow(buffer, requiredLength); + + output.reset(buffer); + + output.writeShort(analyzedLength); + + output.writeBytes(scratch.bytes, scratch.offset, scratch.length); + + output.writeInt(encodeWeight(iterator.weight())); + + if (hasPayloads) { + for(int i=0;i<surfaceForm.length;i++) { + if (surfaceForm.bytes[i] == payloadSep) { + throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved"); + } + } + output.writeShort((short) surfaceForm.length); + output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length); + output.writeBytes(payload.bytes, payload.offset, payload.length); + } else { + output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length); + } + + assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength; + + writer.write(buffer, 0, output.getPosition()); + } + } + writer.close(); + + // Sort all input/output pairs (required by FST.Builder): + new Sort(new AnalyzingComparator(hasPayloads)).sort(tempInput, tempSorted); + + // Free disk space: + tempInput.delete(); + + reader = new Sort.ByteSequencesReader(tempSorted); + + PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); + Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs); + + // Build FST: + BytesRef previousAnalyzed = null; + BytesRef analyzed = new BytesRef(); + BytesRef surface = new BytesRef(); + IntsRef scratchInts = new IntsRef(); + ByteArrayDataInput input = new ByteArrayDataInput(); + + // Used to remove duplicate surface forms (but we + // still index the hightest-weight one). 
We clear + // this when we see a new analyzed form, so it cannot + // grow unbounded (at most 256 entries): + Set<BytesRef> seenSurfaceForms = new HashSet<BytesRef>(); + + int dedup = 0; + while (reader.read(scratch)) { + input.reset(scratch.bytes, scratch.offset, scratch.length); + short analyzedLength = input.readShort(); + analyzed.grow(analyzedLength+2); + input.readBytes(analyzed.bytes, 0, analyzedLength); + analyzed.length = analyzedLength; + + long cost = input.readInt(); + + surface.bytes = scratch.bytes; + if (hasPayloads) { + surface.length = input.readShort(); + surface.offset = input.getPosition(); + } else { + surface.offset = input.getPosition(); + surface.length = scratch.length - surface.offset; + } + + if (previousAnalyzed == null) { + previousAnalyzed = new BytesRef(); + previousAnalyzed.copyBytes(analyzed); + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); + } else if (analyzed.equals(previousAnalyzed)) { + dedup++; + if (dedup >= maxSurfaceFormsPerAnalyzedForm) { + // More than maxSurfaceFormsPerAnalyzedForm + // dups: skip the rest: + continue; + } + if (seenSurfaceForms.contains(surface)) { + continue; + } + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); + } else { + dedup = 0; + previousAnalyzed.copyBytes(analyzed); + seenSurfaceForms.clear(); + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); + } + + // TODO: I think we can avoid the extra 2 bytes when + // there is no dup (dedup==0), but we'd have to fix + // the exactFirst logic ... which would be sort of + // hairy because we'd need to special case the two + // (dup/not dup)... + + // NOTE: must be byte 0 so we sort before whatever + // is next + analyzed.bytes[analyzed.offset+analyzed.length] = 0; + analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup; + analyzed.length += 2; + + Util.toIntsRef(analyzed, scratchInts); + //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString()); + if (!hasPayloads) { + builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface))); + } else { + int payloadOffset = input.getPosition() + surface.length; + int payloadLength = scratch.length - payloadOffset; + BytesRef br = new BytesRef(surface.length + 1 + payloadLength); + System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length); + br.bytes[surface.length] = (byte) payloadSep; + System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength); + br.length = br.bytes.length; + builder.add(scratchInts, outputs.newPair(cost, br)); + } + } + fst = builder.finish(); + + //PrintWriter pw = new PrintWriter("/tmp/out.dot"); + //Util.toDot(fst, pw, true, true); + //pw.close(); + + success = true; + } finally { + if (success) { + IOUtils.close(reader, writer); + } else { + IOUtils.closeWhileHandlingException(reader, writer); + } + + tempInput.delete(); + tempSorted.delete(); + } + } + + @Override + public boolean store(OutputStream output) throws IOException { + DataOutput dataOut = new OutputStreamDataOutput(output); + try { + if (fst == null) { + return false; + } + + fst.save(dataOut); + dataOut.writeVInt(maxAnalyzedPathsForOneInput); + dataOut.writeByte((byte) (hasPayloads ? 
1 : 0)); + } finally { + IOUtils.close(output); + } + return true; + } + + @Override + public boolean load(InputStream input) throws IOException { + DataInput dataIn = new InputStreamDataInput(input); + try { + this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); + maxAnalyzedPathsForOneInput = dataIn.readVInt(); + hasPayloads = dataIn.readByte() == 1; + } finally { + IOUtils.close(input); + } + return true; + } + + private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRef spare) { + LookupResult result; + if (hasPayloads) { + int sepIndex = -1; + for(int i=0;i<output2.length;i++) { + if (output2.bytes[output2.offset+i] == payloadSep) { + sepIndex = i; + break; + } + } + assert sepIndex != -1; + spare.grow(sepIndex); + final int payloadLen = output2.length - sepIndex - 1; + UnicodeUtil.UTF8toUTF16(output2.bytes, output2.offset, sepIndex, spare); + BytesRef payload = new BytesRef(payloadLen); + System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen); + payload.length = payloadLen; + result = new LookupResult(spare.toString(), decodeWeight(output1), payload); + } else { + spare.grow(output2.length); + UnicodeUtil.UTF8toUTF16(output2, spare); + result = new LookupResult(spare.toString(), decodeWeight(output1)); + } + + return result; + } + + private boolean sameSurfaceForm(BytesRef key, BytesRef output2) { + if (hasPayloads) { + // output2 has at least PAYLOAD_SEP byte: + if (key.length >= output2.length) { + return false; + } + for(int i=0;i<key.length;i++) { + if (key.bytes[key.offset+i] != output2.bytes[output2.offset+i]) { + return false; + } + } + return output2.bytes[output2.offset + key.length] == payloadSep; + } else { + return key.bytesEquals(output2); + } + } + + @Override + public List<LookupResult> lookup(final CharSequence key, boolean onlyMorePopular, int num) { + assert num > 0; + + if (onlyMorePopular) { + throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false"); + } + if (fst == null) { + return Collections.emptyList(); + } + + //System.out.println("lookup key=" + key + " num=" + num); + for (int i = 0; i < key.length(); i++) { + if (key.charAt(i) == holeCharacter) { + throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved"); + } + if (key.charAt(i) == sepLabel) { + throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved"); + } + } + final BytesRef utf8Key = new BytesRef(key); + try { + + Automaton lookupAutomaton = toLookupAutomaton(key); + + final CharsRef spare = new CharsRef(); + + //System.out.println(" now intersect exactFirst=" + exactFirst); + + // Intersect automaton w/ suggest wFST and get all + // prefix starting nodes & their outputs: + //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst); + + //System.out.println(" prefixPaths: " + prefixPaths.size()); + + BytesReader bytesReader = fst.getBytesReader(); + + FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>(); + + final List<LookupResult> results = new ArrayList<LookupResult>(); + + List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst); + + if (exactFirst) { + + int count = 0; + for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) { + if (fst.findTargetArc(endByte, path.fstNode, scratchArc, 
bytesReader) != null) { + // This node has END_BYTE arc leaving, meaning it's an + // "exact" match: + count++; + } + } + + // Searcher just to find the single exact only + // match, if present: + Util.TopNSearcher<Pair<Long,BytesRef>> searcher; + searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator); + + // NOTE: we could almost get away with only using + // the first start node. The only catch is if + // maxSurfaceFormsPerAnalyzedForm had kicked in and + // pruned our exact match from one of these nodes + // ...: + for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) { + if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) { + // This node has END_BYTE arc leaving, meaning it's an + // "exact" match: + searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input); + } + } + + MinResult<Pair<Long,BytesRef>> completions[] = searcher.search(); + + // NOTE: this is rather inefficient: we enumerate + // every matching "exactly the same analyzed form" + // path, and then do linear scan to see if one of + // these exactly matches the input. It should be + // possible (though hairy) to do something similar + // to getByOutput, since the surface form is encoded + // into the FST output, so we more efficiently hone + // in on the exact surface-form match. Still, I + // suspect very little time is spent in this linear + // seach: it's bounded by how many prefix start + // nodes we have and the + // maxSurfaceFormsPerAnalyzedForm: + for(MinResult<Pair<Long,BytesRef>> completion : completions) { + BytesRef output2 = completion.output.output2; + if (sameSurfaceForm(utf8Key, output2)) { + results.add(getLookupResult(completion.output.output1, output2, spare)); + break; + } + } + + if (results.size() == num) { + // That was quick: + return results; + } + } + + Util.TopNSearcher<Pair<Long,BytesRef>> searcher; + searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, + num - results.size(), + num * maxAnalyzedPathsForOneInput, + weightComparator) { + private final Set<BytesRef> seen = new HashSet<BytesRef>(); + + @Override + protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) { + + // Dedup: when the input analyzes to a graph we + // can get duplicate surface forms: + if (seen.contains(output.output2)) { + return false; + } + seen.add(output.output2); + + if (!exactFirst) { + return true; + } else { + // In exactFirst mode, don't accept any paths + // matching the surface form since that will + // create duplicate results: + if (sameSurfaceForm(utf8Key, output.output2)) { + // We found exact match, which means we should + // have already found it in the first search: + assert results.size() == 1; + return false; + } else { + return true; + } + } + } + }; + + prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst); + + for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) { + searcher.addStartPaths(path.fstNode, path.output, true, path.input); + } + + MinResult<Pair<Long,BytesRef>> completions[] = searcher.search(); + + for(MinResult<Pair<Long,BytesRef>> completion : completions) { + + LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare); + + // TODO: for fuzzy case would be nice to return + // how many edits were required + + //System.out.println(" result=" + result); + results.add(result); + + if (results.size() == num) { + // In the 
exactFirst=true case the search may + // produce one extra path + break; + } + } + + return results; + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + /** Returns all completion paths to initialize the search. */ + protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths, + Automaton lookupAutomaton, + FST<Pair<Long,BytesRef>> fst) + throws IOException { + return prefixPaths; + } + + public final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { + // Analyze surface form: + TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString()); + return toFiniteStrings(ts2a, ts); + } + public final Set<IntsRef> toFiniteStrings(final TokenStreamToAutomaton ts2a, TokenStream ts) throws IOException { + // Analyze surface form: + + // Create corresponding automaton: labels are bytes + // from each analyzed token, with byte 0 used as + // separator between tokens: + Automaton automaton = ts2a.toAutomaton(ts); + ts.close(); + + replaceSep(automaton, preserveSep, sepLabel); + + assert SpecialOperations.isFinite(automaton); + + // Get all paths from the automaton (there can be + // more than one path, eg if the analyzer created a + // graph using SynFilter or WDF): + + // TODO: we could walk & add simultaneously, so we + // don't have to alloc [possibly biggish] + // intermediate HashSet in RAM: + return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); + } + + final Automaton toLookupAutomaton(final CharSequence key) throws IOException { + // Turn tokenstream into automaton: + TokenStream ts = queryAnalyzer.tokenStream("", key.toString()); + Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts); + ts.close(); + + // TODO: we could use the end offset to "guess" + // whether the final token was a partial token; this + // would only be a heuristic ... but maybe an OK one. + // This way we could eg differentiate "net" from "net ", + // which we can't today... + + replaceSep(automaton, preserveSep, sepLabel); + + // TODO: we can optimize this somewhat by determinizing + // while we convert + BasicOperations.determinize(automaton); + return automaton; + } + + + + /** + * Returns the weight associated with an input string, + * or null if it does not exist. 
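+     * This suggester does not support this operation and simply throws an
+     * {@link UnsupportedOperationException}.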
+ */ + public Object get(CharSequence key) { + throw new UnsupportedOperationException(); + } + + /** cost -> weight */ + public static int decodeWeight(long encoded) { + return (int)(Integer.MAX_VALUE - encoded); + } + + /** weight -> cost */ + public static int encodeWeight(long value) { + if (value < 0 || value > Integer.MAX_VALUE) { + throw new UnsupportedOperationException("cannot encode value: " + value); + } + return Integer.MAX_VALUE - (int)value; + } + + static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>> () { + @Override + public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) { + return left.output1.compareTo(right.output1); + } + }; + + + public static class XBuilder { + private Builder<Pair<Long, BytesRef>> builder; + private int maxSurfaceFormsPerAnalyzedForm; + private IntsRef scratchInts = new IntsRef(); + private final PairOutputs<Long, BytesRef> outputs; + private boolean hasPayloads; + private BytesRef analyzed = new BytesRef(); + private final SurfaceFormAndPayload[] surfaceFormsAndPayload; + private int count; + private ObjectIntOpenHashMap<BytesRef> seenSurfaceForms = HppcMaps.Object.Integer.ensureNoNullKeys(256, 0.75f); + private int payloadSep; + + public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) { + this.payloadSep = payloadSep; + this.outputs = new PairOutputs<Long, BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); + this.builder = new Builder<Pair<Long, BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs); + this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; + this.hasPayloads = hasPayloads; + surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm]; + + } + public void startTerm(BytesRef analyzed) { + this.analyzed.copyBytes(analyzed); + this.analyzed.grow(analyzed.length+2); + } + + private final static class SurfaceFormAndPayload implements Comparable<SurfaceFormAndPayload> { + BytesRef payload; + long weight; + + public SurfaceFormAndPayload(BytesRef payload, long cost) { + super(); + this.payload = payload; + this.weight = cost; + } + + @Override + public int compareTo(SurfaceFormAndPayload o) { + int res = compare(weight, o.weight); + if (res == 0 ){ + return payload.compareTo(o.payload); + } + return res; + } + public static int compare(long x, long y) { + return (x < y) ? -1 : ((x == y) ? 0 : 1); + } + } + + public void addSurface(BytesRef surface, BytesRef payload, long cost) throws IOException { + int surfaceIndex = -1; + long encodedWeight = cost == -1 ? cost : encodeWeight(cost); + /* + * we need to check if we have seen this surface form, if so only use the + * the surface form with the highest weight and drop the rest no matter if + * the payload differs. 
+ */ + if (count >= maxSurfaceFormsPerAnalyzedForm) { + // More than maxSurfaceFormsPerAnalyzedForm + // dups: skip the rest: + return; + } + BytesRef surfaceCopy; + if (count > 0 && seenSurfaceForms.containsKey(surface)) { + surfaceIndex = seenSurfaceForms.lget(); + SurfaceFormAndPayload surfaceFormAndPayload = surfaceFormsAndPayload[surfaceIndex]; + if (encodedWeight >= surfaceFormAndPayload.weight) { + return; + } + surfaceCopy = BytesRef.deepCopyOf(surface); + } else { + surfaceIndex = count++; + surfaceCopy = BytesRef.deepCopyOf(surface); + seenSurfaceForms.put(surfaceCopy, surfaceIndex); + } + + BytesRef payloadRef; + if (!hasPayloads) { + payloadRef = surfaceCopy; + } else { + int len = surface.length + 1 + payload.length; + final BytesRef br = new BytesRef(len); + System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length); + br.bytes[surface.length] = (byte) payloadSep; + System.arraycopy(payload.bytes, payload.offset, br.bytes, surface.length + 1, payload.length); + br.length = len; + payloadRef = br; + } + if (surfaceFormsAndPayload[surfaceIndex] == null) { + surfaceFormsAndPayload[surfaceIndex] = new SurfaceFormAndPayload(payloadRef, encodedWeight); + } else { + surfaceFormsAndPayload[surfaceIndex].payload = payloadRef; + surfaceFormsAndPayload[surfaceIndex].weight = encodedWeight; + } + } + + public void finishTerm(long defaultWeight) throws IOException { + ArrayUtil.timSort(surfaceFormsAndPayload, 0, count); + int deduplicator = 0; + analyzed.bytes[analyzed.offset + analyzed.length] = 0; + analyzed.length += 2; + for (int i = 0; i < count; i++) { + analyzed.bytes[analyzed.offset + analyzed.length - 1 ] = (byte) deduplicator++; + Util.toIntsRef(analyzed, scratchInts); + SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i]; + long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight; + builder.add(scratchInts, outputs.newPair(cost, candiate.payload)); + } + seenSurfaceForms.clear(); + count = 0; + } + + public FST<Pair<Long, BytesRef>> build() throws IOException { + return builder.finish(); + } + + public boolean hasPayloads() { + return hasPayloads; + } + + public int maxSurfaceFormsPerAnalyzedForm() { + return maxSurfaceFormsPerAnalyzedForm; + } + + } +} diff --git a/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java b/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java new file mode 100644 index 0000000..4379cda --- /dev/null +++ b/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java @@ -0,0 +1,263 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.lucene.search.suggest.analyzing;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.*;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PairOutputs;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
+ * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
+ * you can explicitly choose classic Levenshtein by passing <code>false</code>
+ * for the <code>transpositions</code> parameter.
+ * <p>
+ * At most, this query will match terms up to
+ * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
+ * edits. Higher distances are not supported. Note that the
+ * fuzzy distance is measured in "byte space" on the bytes
+ * returned by the {@link org.apache.lucene.analysis.TokenStream}'s {@link
+ * org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute}, usually UTF8. By default
+ * the analyzed bytes must be at least 3 {@link
+ * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are
+ * considered. Furthermore, the first 1 {@link
+ * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
+ * edited. We allow up to 1 {@link
+ * #DEFAULT_MAX_EDITS} edit.
+ * If the {@link #unicodeAware} parameter in the constructor is set to true, maxEdits,
+ * minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code
+ * points (actual letters) instead of bytes.
+ *
+ * <p>
+ * NOTE: This suggester does not boost suggestions that
+ * required no edits over suggestions that did require
+ * edits. This is a known limitation.
+ *
+ * <p>
+ * Note: complex query analyzers can have a significant impact on the lookup
+ * performance. It's recommended to not use analyzers that drop or inject terms
+ * like synonyms to keep the complexity of the prefix intersection low for good
+ * lookup performance. At index time, complex analyzers can safely be used.
+ * </p>
+ *
+ * @lucene.experimental
+ */
+public final class XFuzzySuggester extends XAnalyzingSuggester {
+    private final int maxEdits;
+    private final boolean transpositions;
+    private final int nonFuzzyPrefix;
+    private final int minFuzzyLength;
+    private final boolean unicodeAware;
+
+    /**
+     * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
+     * parameters in Unicode code points (actual letters)
+     * instead of bytes.
+     */
+    public static final boolean DEFAULT_UNICODE_AWARE = false;
+
+    /**
+     * The default minimum length of the key passed to {@link
+     * #lookup} before any edits are allowed.
+     */
+    public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;
+
+    /**
+     * The default prefix length where edits are not allowed.
+     */
+    public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
+
+    /**
+     * The default maximum number of edits for fuzzy
+     * suggestions.
+     */
+    public static final int DEFAULT_MAX_EDITS = 1;
+
+    /**
+     * The default transposition value passed to {@link org.apache.lucene.util.automaton.LevenshteinAutomata}
+     */
+    public static final boolean DEFAULT_TRANSPOSITIONS = true;
+
+    /**
+     * Creates a {@link FuzzySuggester} instance initialized with default values.
+ * + * @param analyzer the analyzer used for this suggester + */ + public XFuzzySuggester(Analyzer analyzer) { + this(analyzer, analyzer); + } + + /** + * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values. + * + * @param indexAnalyzer + * Analyzer that will be used for analyzing suggestions while building the index. + * @param queryAnalyzer + * Analyzer that will be used for analyzing query text during lookup + */ + public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { + this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, + DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER); + + } + + /** + * Creates a {@link FuzzySuggester} instance. + * + * @param indexAnalyzer Analyzer that will be used for + * analyzing suggestions while building the index. + * @param queryAnalyzer Analyzer that will be used for + * analyzing query text during lookup + * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} + * @param maxSurfaceFormsPerAnalyzedForm Maximum number of + * surface forms to keep for a single analyzed form. + * When there are too many surface forms we discard the + * lowest weighted ones. + * @param maxGraphExpansions Maximum number of graph paths + * to expand from the analyzed form. Set this to -1 for + * no limit. + * @param maxEdits must be >= 0 and <= {@link org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} . + * @param transpositions <code>true</code> if transpositions should be treated as a primitive + * edit operation. If this is false, comparisons will implement the classic + * Levenshtein algorithm. 
+     * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX})
+     * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
+     * @param sepLabel separation label
+     * @param payloadSep payload separator byte
+     * @param endByte end byte marker byte
+     */
+    public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
+                           int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware,
+                           FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
+                           int sepLabel, int payloadSep, int endByte, int holeCharacter) {
+        super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, true, fst, hasPayloads, maxAnalyzedPathsForOneInput, sepLabel, payloadSep, endByte, holeCharacter);
+        if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+            throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+        }
+        if (nonFuzzyPrefix < 0) {
+            throw new IllegalArgumentException("nonFuzzyPrefix must not be < 0 (got " + nonFuzzyPrefix + ")");
+        }
+        if (minFuzzyLength < 0) {
+            throw new IllegalArgumentException("minFuzzyLength must not be < 0 (got " + minFuzzyLength + ")");
+        }
+
+        this.maxEdits = maxEdits;
+        this.transpositions = transpositions;
+        this.nonFuzzyPrefix = nonFuzzyPrefix;
+        this.minFuzzyLength = minFuzzyLength;
+        this.unicodeAware = unicodeAware;
+    }
+
+    @Override
+    protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> prefixPaths,
+                                                                                     Automaton lookupAutomaton,
+                                                                                     FST<PairOutputs.Pair<Long,BytesRef>> fst)
+            throws IOException {
+
+        // TODO: right now there's no penalty for fuzzy/edits,
+        // ie a completion whose prefix matched exactly what the
+        // user typed gets no boost over completions that
+        // required an edit, which get no boost over completions
+        // requiring two edits. I suspect a multiplicative
+        // factor is appropriate (eg, say a fuzzy match must be at
+        // least 2X better weight than the non-fuzzy match to
+        // "compete") ... in which case I think the wFST needs
+        // to be log weights or something ...
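+        // Expand the analyzed lookup key into a Levenshtein automaton and intersect
+        // it with the suggest FST to collect every fuzzily matching prefix path: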
+ + Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton)); + /* + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); + w.write(levA.toDot()); + w.close(); + System.out.println("Wrote LevA to out.dot"); + */ + return FSTUtil.intersectPrefixPaths(levA, fst); + } + + @Override + protected Automaton convertAutomaton(Automaton a) { + if (unicodeAware) { + Automaton utf8automaton = new UTF32ToUTF8().convert(a); + BasicOperations.determinize(utf8automaton); + return utf8automaton; + } else { + return a; + } + } + + @Override + public TokenStreamToAutomaton getTokenStreamToAutomaton() { + final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton(); + tsta.setUnicodeArcs(unicodeAware); + return tsta; + } + + Automaton toLevenshteinAutomata(Automaton automaton) { + final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1); + Automaton subs[] = new Automaton[ref.size()]; + int upto = 0; + for (IntsRef path : ref) { + if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) { + subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length); + upto++; + } else { + Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix); + int ints[] = new int[path.length-nonFuzzyPrefix]; + System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length); + // TODO: maybe add alphaMin to LevenshteinAutomata, + // and pass 1 instead of 0? We probably don't want + // to allow the trailing dedup bytes to be + // edited... but then 0 byte is "in general" allowed + // on input (but not in UTF8). + LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions); + Automaton levAutomaton = lev.toAutomaton(maxEdits); + Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton)); + combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already + subs[upto] = combined; + upto++; + } + } + + if (subs.length == 0) { + // automaton is empty, there is no accepted paths through it + return BasicAutomata.makeEmpty(); // matches nothing + } else if (subs.length == 1) { + // no synonyms or anything: just a single path through the tokenstream + return subs[0]; + } else { + // multiple paths: this is really scary! is it slow? + // maybe we should not do this and throw UOE? + Automaton a = BasicOperations.union(Arrays.asList(subs)); + // TODO: we could call toLevenshteinAutomata() before det? + // this only happens if you have multiple paths anyway (e.g. synonyms) + BasicOperations.determinize(a); + + return a; + } + } +} diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java new file mode 100644 index 0000000..8512e5c --- /dev/null +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java @@ -0,0 +1,161 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.search.vectorhighlight; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.FilterClause; +import org.apache.lucene.queries.TermFilter; +import org.apache.lucene.search.*; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.util.Version; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; +import org.elasticsearch.common.lucene.search.XBooleanFilter; +import org.elasticsearch.common.lucene.search.XFilteredQuery; +import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery; +import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery; + +import java.io.IOException; +import java.lang.reflect.Field; +import java.util.Collection; +import java.util.List; + +/** + * + */ +// LUCENE MONITOR +public class CustomFieldQuery extends FieldQuery { + + private static Field multiTermQueryWrapperFilterQueryField; + + static { + try { + multiTermQueryWrapperFilterQueryField = MultiTermQueryWrapperFilter.class.getDeclaredField("query"); + multiTermQueryWrapperFilterQueryField.setAccessible(true); + } catch (NoSuchFieldException e) { + // ignore + } + } + + public static final ThreadLocal<Boolean> highlightFilters = new ThreadLocal<Boolean>(); + + public CustomFieldQuery(Query query, IndexReader reader, FastVectorHighlighter highlighter) throws IOException { + this(query, reader, highlighter.isPhraseHighlight(), highlighter.isFieldMatch()); + } + + public CustomFieldQuery(Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch) throws IOException { + super(query, reader, phraseHighlight, fieldMatch); + highlightFilters.remove(); + } + + @Override + void flatten(Query sourceQuery, IndexReader reader, Collection<Query> flatQueries) throws IOException { + if (sourceQuery instanceof SpanTermQuery) { + super.flatten(new TermQuery(((SpanTermQuery) sourceQuery).getTerm()), reader, flatQueries); + } else if (sourceQuery instanceof ConstantScoreQuery) { + ConstantScoreQuery constantScoreQuery = (ConstantScoreQuery) sourceQuery; + if (constantScoreQuery.getFilter() != null) { + flatten(constantScoreQuery.getFilter(), reader, flatQueries); + } else { + flatten(constantScoreQuery.getQuery(), reader, flatQueries); + } + } else if (sourceQuery instanceof FunctionScoreQuery) { + flatten(((FunctionScoreQuery) sourceQuery).getSubQuery(), reader, flatQueries); + } else if (sourceQuery instanceof FilteredQuery) { + flatten(((FilteredQuery) sourceQuery).getQuery(), reader, flatQueries); + flatten(((FilteredQuery) sourceQuery).getFilter(), reader, flatQueries); + } else if (sourceQuery instanceof XFilteredQuery) { + flatten(((XFilteredQuery) sourceQuery).getQuery(), reader, flatQueries); + flatten(((XFilteredQuery) sourceQuery).getFilter(), reader, flatQueries); + } else if (sourceQuery instanceof MultiPhrasePrefixQuery) { + flatten(sourceQuery.rewrite(reader), reader, flatQueries); + } else if (sourceQuery instanceof FiltersFunctionScoreQuery) { + 
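+            // only the wrapped sub query contributes terms for highlighting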
flatten(((FiltersFunctionScoreQuery) sourceQuery).getSubQuery(), reader, flatQueries); + } else if (sourceQuery instanceof MultiPhraseQuery) { + MultiPhraseQuery q = ((MultiPhraseQuery) sourceQuery); + convertMultiPhraseQuery(0, new int[q.getTermArrays().size()] , q, q.getTermArrays(), q.getPositions(), reader, flatQueries); + } else { + super.flatten(sourceQuery, reader, flatQueries); + } + } + + private void convertMultiPhraseQuery(int currentPos, int[] termsIdx, MultiPhraseQuery orig, List<Term[]> terms, int[] pos, IndexReader reader, Collection<Query> flatQueries) throws IOException { + if (currentPos == 0) { + // if we have more than 16 terms + int numTerms = 0; + for (Term[] currentPosTerm : terms) { + numTerms += currentPosTerm.length; + } + if (numTerms > 16) { + for (Term[] currentPosTerm : terms) { + for (Term term : currentPosTerm) { + super.flatten(new TermQuery(term), reader, flatQueries); + } + } + return; + } + } + /* + * we walk all possible ways and for each path down the MPQ we create a PhraseQuery this is what FieldQuery supports. + * It seems expensive but most queries will pretty small. + */ + if (currentPos == terms.size()) { + PhraseQuery query = new PhraseQuery(); + query.setBoost(orig.getBoost()); + query.setSlop(orig.getSlop()); + for (int i = 0; i < termsIdx.length; i++) { + query.add(terms.get(i)[termsIdx[i]], pos[i]); + } + this.flatten(query, reader, flatQueries); + } else { + Term[] t = terms.get(currentPos); + for (int i = 0; i < t.length; i++) { + termsIdx[currentPos] = i; + convertMultiPhraseQuery(currentPos+1, termsIdx, orig, terms, pos, reader, flatQueries); + } + } + } + + void flatten(Filter sourceFilter, IndexReader reader, Collection<Query> flatQueries) throws IOException { + Boolean highlight = highlightFilters.get(); + if (highlight == null || highlight.equals(Boolean.FALSE)) { + return; + } + if (sourceFilter instanceof TermFilter) { + flatten(new TermQuery(((TermFilter) sourceFilter).getTerm()), reader, flatQueries); + } else if (sourceFilter instanceof MultiTermQueryWrapperFilter) { + if (multiTermQueryWrapperFilterQueryField != null) { + try { + flatten((Query) multiTermQueryWrapperFilterQueryField.get(sourceFilter), reader, flatQueries); + } catch (IllegalAccessException e) { + // ignore + } + } + } else if (sourceFilter instanceof XBooleanFilter) { + XBooleanFilter booleanFilter = (XBooleanFilter) sourceFilter; + for (FilterClause clause : booleanFilter.clauses()) { + if (clause.getOccur() == BooleanClause.Occur.MUST || clause.getOccur() == BooleanClause.Occur.SHOULD) { + flatten(clause.getFilter(), reader, flatQueries); + } + } + } + } +} diff --git a/src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java b/src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java new file mode 100644 index 0000000..0c64a93 --- /dev/null +++ b/src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java @@ -0,0 +1,115 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.store; + +import java.io.IOException; +import java.util.zip.Checksum; + +/** + */ +public class BufferedChecksumIndexOutput extends BufferedIndexOutput { + + private final IndexOutput delegate; + private final BufferedIndexOutput bufferedDelegate; + private final Checksum digest; + + public BufferedChecksumIndexOutput(IndexOutput delegate, Checksum digest) { + super(delegate instanceof BufferedIndexOutput ? ((BufferedIndexOutput) delegate).getBufferSize() : BufferedIndexOutput.DEFAULT_BUFFER_SIZE); + if (delegate instanceof BufferedIndexOutput) { + bufferedDelegate = (BufferedIndexOutput) delegate; + this.delegate = delegate; + } else { + this.delegate = delegate; + bufferedDelegate = null; + } + this.digest = digest; + } + + public Checksum digest() { + return digest; + } + + public IndexOutput underlying() { + return this.delegate; + } + + // don't override it, base class method simple reads from input and writes to this output +// @Override public void copyBytes(IndexInput input, long numBytes) throws IOException { +// delegate.copyBytes(input, numBytes); +// } + + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + delegate.close(); + } + + } + + @Override + protected void flushBuffer(byte[] b, int offset, int len) throws IOException { + if (bufferedDelegate != null) { + bufferedDelegate.flushBuffer(b, offset, len); + } else { + delegate.writeBytes(b, offset, len); + } + digest.update(b, offset, len); + } + + // don't override it, base class method simple reads from input and writes to this output +// @Override public void copyBytes(IndexInput input, long numBytes) throws IOException { +// delegate.copyBytes(input, numBytes); +// } + + @Override + public void flush() throws IOException { + try { + super.flush(); + } finally { + delegate.flush(); + } + } + + @Override + public void seek(long pos) throws IOException { + // seek might be called on files, which means that the checksum is not file checksum + // but a checksum of the bytes written to this stream, which is the same for each + // type of file in lucene + super.seek(pos); + delegate.seek(pos); + } + + @Override + public long length() throws IOException { + return delegate.length(); + } + + @Override + public void setLength(long length) throws IOException { + delegate.setLength(length); + } + + @Override + public String toString() { + return delegate.toString(); + } +} diff --git a/src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java b/src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java new file mode 100644 index 0000000..814a75f --- /dev/null +++ b/src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java @@ -0,0 +1,143 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.lucene.store; + +import org.apache.lucene.store.IOContext.Context; + +import java.io.IOException; + +public final class RateLimitedFSDirectory extends FilterDirectory{ + + private final StoreRateLimiting.Provider rateLimitingProvider; + + private final StoreRateLimiting.Listener rateListener; + + public RateLimitedFSDirectory(FSDirectory wrapped, StoreRateLimiting.Provider rateLimitingProvider, + StoreRateLimiting.Listener rateListener) { + super(wrapped); + this.rateLimitingProvider = rateLimitingProvider; + this.rateListener = rateListener; + } + + @Override + public IndexOutput createOutput(String name, IOContext context) throws IOException { + final IndexOutput output = in.createOutput(name, context); + + StoreRateLimiting rateLimiting = rateLimitingProvider.rateLimiting(); + StoreRateLimiting.Type type = rateLimiting.getType(); + RateLimiter limiter = rateLimiting.getRateLimiter(); + if (type == StoreRateLimiting.Type.NONE || limiter == null) { + return output; + } + if (context.context == Context.MERGE) { + // we are mering, and type is either MERGE or ALL, rate limit... + return new RateLimitedIndexOutput(limiter, rateListener, output); + } + if (type == StoreRateLimiting.Type.ALL) { + return new RateLimitedIndexOutput(limiter, rateListener, output); + } + // we shouldn't really get here... + return output; + } + + + @Override + public void close() throws IOException { + in.close(); + } + + @Override + public String toString() { + StoreRateLimiting rateLimiting = rateLimitingProvider.rateLimiting(); + StoreRateLimiting.Type type = rateLimiting.getType(); + RateLimiter limiter = rateLimiting.getRateLimiter(); + if (type == StoreRateLimiting.Type.NONE || limiter == null) { + return StoreUtils.toString(in); + } else { + return "rate_limited(" + StoreUtils.toString(in) + ", type=" + type.name() + ", rate=" + limiter.getMbPerSec() + ")"; + } + } + + + static final class RateLimitedIndexOutput extends BufferedIndexOutput { + + private final IndexOutput delegate; + private final BufferedIndexOutput bufferedDelegate; + private final RateLimiter rateLimiter; + private final StoreRateLimiting.Listener rateListener; + + RateLimitedIndexOutput(final RateLimiter rateLimiter, final StoreRateLimiting.Listener rateListener, final IndexOutput delegate) { + super(delegate instanceof BufferedIndexOutput ? 
((BufferedIndexOutput) delegate).getBufferSize() : BufferedIndexOutput.DEFAULT_BUFFER_SIZE); + if (delegate instanceof BufferedIndexOutput) { + bufferedDelegate = (BufferedIndexOutput) delegate; + this.delegate = delegate; + } else { + this.delegate = delegate; + bufferedDelegate = null; + } + this.rateLimiter = rateLimiter; + this.rateListener = rateListener; + } + + @Override + protected void flushBuffer(byte[] b, int offset, int len) throws IOException { + rateListener.onPause(rateLimiter.pause(len)); + if (bufferedDelegate != null) { + bufferedDelegate.flushBuffer(b, offset, len); + } else { + delegate.writeBytes(b, offset, len); + } + + } + + @Override + public long length() throws IOException { + return delegate.length(); + } + + @Override + public void seek(long pos) throws IOException { + flush(); + delegate.seek(pos); + } + + @Override + public void flush() throws IOException { + try { + super.flush(); + } finally { + delegate.flush(); + } + } + + @Override + public void setLength(long length) throws IOException { + delegate.setLength(length); + } + + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + delegate.close(); + } + } + } +} diff --git a/src/main/java/org/apache/lucene/store/StoreRateLimiting.java b/src/main/java/org/apache/lucene/store/StoreRateLimiting.java new file mode 100644 index 0000000..8b745ef --- /dev/null +++ b/src/main/java/org/apache/lucene/store/StoreRateLimiting.java @@ -0,0 +1,94 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.lucene.store; + +import org.apache.lucene.store.RateLimiter.SimpleRateLimiter; +import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.common.Nullable; +import org.elasticsearch.common.unit.ByteSizeValue; + +/** + */ +public class StoreRateLimiting { + + public static interface Provider { + + StoreRateLimiting rateLimiting(); + } + + public interface Listener { + + void onPause(long nanos); + } + + public static enum Type { + NONE, + MERGE, + ALL; + + public static Type fromString(String type) throws ElasticsearchIllegalArgumentException { + if ("none".equalsIgnoreCase(type)) { + return NONE; + } else if ("merge".equalsIgnoreCase(type)) { + return MERGE; + } else if ("all".equalsIgnoreCase(type)) { + return ALL; + } + throw new ElasticsearchIllegalArgumentException("rate limiting type [" + type + "] not valid, can be one of [all|merge|none]"); + } + } + + private final SimpleRateLimiter rateLimiter = new SimpleRateLimiter(0); + private volatile SimpleRateLimiter actualRateLimiter; + + private volatile Type type; + + public StoreRateLimiting() { + + } + + @Nullable + public RateLimiter getRateLimiter() { + return actualRateLimiter; + } + + public void setMaxRate(ByteSizeValue rate) { + if (rate.bytes() <= 0) { + actualRateLimiter = null; + } else if (actualRateLimiter == null) { + actualRateLimiter = rateLimiter; + actualRateLimiter.setMbPerSec(rate.mbFrac()); + } else { + assert rateLimiter == actualRateLimiter; + rateLimiter.setMbPerSec(rate.mbFrac()); + } + } + + public Type getType() { + return type; + } + + public void setType(Type type) { + this.type = type; + } + + public void setType(String type) throws ElasticsearchIllegalArgumentException { + this.type = Type.fromString(type); + } +} diff --git a/src/main/java/org/apache/lucene/store/StoreUtils.java b/src/main/java/org/apache/lucene/store/StoreUtils.java new file mode 100644 index 0000000..5364970 --- /dev/null +++ b/src/main/java/org/apache/lucene/store/StoreUtils.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.lucene.store;
+
+/**
+ */
+public final class StoreUtils {
+
+    private StoreUtils() {
+
+    }
+
+    public static String toString(Directory directory) {
+        if (directory instanceof NIOFSDirectory) {
+            NIOFSDirectory niofsDirectory = (NIOFSDirectory) directory;
+            return "niofs(" + niofsDirectory.getDirectory() + ")";
+        }
+        if (directory instanceof MMapDirectory) {
+            MMapDirectory mMapDirectory = (MMapDirectory) directory;
+            return "mmapfs(" + mMapDirectory.getDirectory() + ")";
+        }
+        if (directory instanceof SimpleFSDirectory) {
+            SimpleFSDirectory simpleFSDirectory = (SimpleFSDirectory) directory;
+            return "simplefs(" + simpleFSDirectory.getDirectory() + ")";
+        }
+        return directory.toString();
+    }
+}
diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferAllocator.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferAllocator.java
new file mode 100644
index 0000000..27b4a1a
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferAllocator.java
@@ -0,0 +1,97 @@
+package org.apache.lucene.store.bytebuffer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.lang.reflect.Method;
+import java.nio.ByteBuffer;
+
+/**
+ * A byte buffer allocator simply allocates byte buffers and handles releasing
+ * them. Implementations can include special direct buffer cleaning when releasing
+ * a buffer, as well as caching of byte buffers.
+ * <p/>
+ * <p>There are two types of buffers that can be allocated, small and big. This
+ * comes in handy when knowing in advance (more or less) the size of the buffers
+ * needed (large files or small), as well as in caching implementations.
+ */
+public interface ByteBufferAllocator {
+
+    /**
+     * Helper class for allocator implementations that allows cleaning direct buffers.
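+     * Cleaning relies on reflective access to sun.misc.Cleaner; when that is not
+     * available, {@link #CLEAN_SUPPORTED} is false and {@link #clean(ByteBuffer)}
+     * becomes a no-op.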
+ */ + public static class Cleaner { + public static final boolean CLEAN_SUPPORTED; + private static final Method directBufferCleaner; + private static final Method directBufferCleanerClean; + + static { + Method directBufferCleanerX = null; + Method directBufferCleanerCleanX = null; + boolean v; + try { + directBufferCleanerX = Class.forName("java.nio.DirectByteBuffer").getMethod("cleaner"); + directBufferCleanerX.setAccessible(true); + directBufferCleanerCleanX = Class.forName("sun.misc.Cleaner").getMethod("clean"); + directBufferCleanerCleanX.setAccessible(true); + v = true; + } catch (Exception e) { + v = false; + } + CLEAN_SUPPORTED = v; + directBufferCleaner = directBufferCleanerX; + directBufferCleanerClean = directBufferCleanerCleanX; + } + + public static void clean(ByteBuffer buffer) { + if (CLEAN_SUPPORTED && buffer.isDirect()) { + try { + Object cleaner = directBufferCleaner.invoke(buffer); + directBufferCleanerClean.invoke(cleaner); + } catch (Exception e) { + // silently ignore exception + } + } + } + } + + public static enum Type { + SMALL, + LARGE + } + + /** + * The size (in bytes) that is allocated for the provided type. + */ + int sizeInBytes(Type type); + + /** + * Allocate a byte buffer for the specific type. + */ + ByteBuffer allocate(Type type) throws IOException; + + /** + * Release the buffer. + */ + void release(ByteBuffer buffer); + + /** + * Close the allocator, releasing any cached buffers for example. + */ + void close(); +} diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferDirectory.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferDirectory.java new file mode 100644 index 0000000..9403085 --- /dev/null +++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferDirectory.java @@ -0,0 +1,182 @@ +package org.apache.lucene.store.bytebuffer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import com.google.common.collect.ImmutableSet; +import org.apache.lucene.store.*; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; + +/** + * A memory based directory that uses {@link java.nio.ByteBuffer} in order to store the directory content. + * <p/> + * <p>The benefit of using {@link java.nio.ByteBuffer} is the fact that it can be stored in "native" memory + * outside of the JVM heap, thus not incurring the GC overhead of large in memory index. + * <p/> + * <p>Each "file" is segmented into one or more byte buffers. + * <p/> + * <p>If constructed with {@link ByteBufferAllocator}, it allows to control the allocation and release of + * byte buffer. 
For example, custom implementations can include caching of byte buffers. + */ +public class ByteBufferDirectory extends BaseDirectory { + + protected final Map<String, ByteBufferFile> files = new ConcurrentHashMap<String, ByteBufferFile>(); + + private final ByteBufferAllocator allocator; + + private final boolean internalAllocator; + + final AtomicLong sizeInBytes = new AtomicLong(); + + + /** + * Constructs a new directory using {@link PlainByteBufferAllocator}. + */ + public ByteBufferDirectory() { + this.allocator = new PlainByteBufferAllocator(false, 1024, 1024 * 10); + this.internalAllocator = true; + try { + setLockFactory(new SingleInstanceLockFactory()); + } catch (IOException e) { + // will not happen + } + } + + /** + * Constructs a new byte buffer directory with a custom allocator. + */ + public ByteBufferDirectory(ByteBufferAllocator allocator) { + this.allocator = allocator; + this.internalAllocator = false; + try { + setLockFactory(new SingleInstanceLockFactory()); + } catch (IOException e) { + // will not happen + } + } + + /** + * Returns the size in bytes of the directory, chunk by buffer size. + */ + public long sizeInBytes() { + return sizeInBytes.get(); + } + + public void sync(Collection<String> names) throws IOException { + // nothing to do here + } + + @Override + public String[] listAll() throws IOException { + return files.keySet().toArray(new String[0]); + } + + @Override + public boolean fileExists(String name) throws IOException { + return files.containsKey(name); + } + + @Override + public void deleteFile(String name) throws IOException { + ByteBufferFile file = files.remove(name); + if (file == null) + throw new FileNotFoundException(name); + sizeInBytes.addAndGet(-file.sizeInBytes()); + file.delete(); + } + + @Override + public long fileLength(String name) throws IOException { + ByteBufferFile file = files.get(name); + if (file == null) + throw new FileNotFoundException(name); + return file.getLength(); + } + + private final static ImmutableSet<String> SMALL_FILES_SUFFIXES = ImmutableSet.of( + "del", // 1 bit per doc + "cfe", // compound file metadata + "si", // segment info + "fnm" // field info (metadata like omit norms etc) + ); + + private static boolean isSmallFile(String fileName) { + if (fileName.startsWith("segments")) { + return true; + } + if (fileName.lastIndexOf('.') > 0) { + String suffix = fileName.substring(fileName.lastIndexOf('.') + 1); + return SMALL_FILES_SUFFIXES.contains(suffix); + } + return false; + } + + @Override + public IndexOutput createOutput(String name, IOContext context) throws IOException { + ByteBufferAllocator.Type allocatorType = ByteBufferAllocator.Type.LARGE; + if (isSmallFile(name)) { + allocatorType = ByteBufferAllocator.Type.SMALL; + } + ByteBufferFileOutput file = new ByteBufferFileOutput(this, allocator.sizeInBytes(allocatorType)); + ByteBufferFile existing = files.put(name, file); + if (existing != null) { + sizeInBytes.addAndGet(-existing.sizeInBytes()); + existing.delete(); + } + return new ByteBufferIndexOutput(this, name, allocator, allocatorType, file); + } + + void closeOutput(String name, ByteBufferFileOutput file) { + // we replace the output file with a read only file, with no sync + files.put(name, new ByteBufferFile(file)); + } + + @Override + public IndexInput openInput(String name, IOContext context) throws IOException { + ByteBufferFile file = files.get(name); + if (file == null) + throw new FileNotFoundException(name); + return new ByteBufferIndexInput(name, file); + } + + @Override + public void 
close() throws IOException { + String[] files = listAll(); + for (String file : files) { + deleteFile(file); + } + if (internalAllocator) { + allocator.close(); + } + } + + @Override + public String toString() { + return "byte_buffer"; + } + + void releaseBuffer(ByteBuffer byteBuffer) { + allocator.release(byteBuffer); + } +} diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFile.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFile.java new file mode 100644 index 0000000..b726d77 --- /dev/null +++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFile.java @@ -0,0 +1,102 @@ +package org.apache.lucene.store.bytebuffer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** + */ +public class ByteBufferFile { + + final ByteBufferDirectory dir; + + final int bufferSize; + + final List<ByteBuffer> buffers; + + long length; + + volatile long lastModified = System.currentTimeMillis(); + + final AtomicInteger refCount; + + long sizeInBytes; + + public ByteBufferFile(ByteBufferDirectory dir, int bufferSize) { + this.dir = dir; + this.bufferSize = bufferSize; + this.buffers = new ArrayList<ByteBuffer>(); + this.refCount = new AtomicInteger(1); + } + + ByteBufferFile(ByteBufferFile file) { + this.dir = file.dir; + this.bufferSize = file.bufferSize; + this.buffers = file.buffers; + this.length = file.length; + this.lastModified = file.lastModified; + this.refCount = file.refCount; + this.sizeInBytes = file.sizeInBytes; + } + + public long getLength() { + return length; + } + + public long getLastModified() { + return lastModified; + } + + void setLastModified(long lastModified) { + this.lastModified = lastModified; + } + + long sizeInBytes() { + return sizeInBytes; + } + + ByteBuffer getBuffer(int index) { + return buffers.get(index); + } + + int numBuffers() { + return buffers.size(); + } + + void delete() { + decRef(); + } + + void incRef() { + refCount.incrementAndGet(); + } + + void decRef() { + if (refCount.decrementAndGet() == 0) { + length = 0; + for (ByteBuffer buffer : buffers) { + dir.releaseBuffer(buffer); + } + buffers.clear(); + sizeInBytes = 0; + } + } +} diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFileOutput.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFileOutput.java new file mode 100644 index 0000000..7de55bf --- /dev/null +++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFileOutput.java @@ -0,0 +1,65 @@ +package org.apache.lucene.store.bytebuffer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.nio.ByteBuffer; + +/** + */ +public class ByteBufferFileOutput extends ByteBufferFile { + + public ByteBufferFileOutput(ByteBufferDirectory dir, int bufferSize) { + super(dir, bufferSize); + } + + @Override + public synchronized long getLength() { + return super.getLength(); + } + + @Override + public synchronized long getLastModified() { + return super.getLastModified(); + } + + synchronized void setLength(long length) { + this.length = length; + } + + synchronized final void addBuffer(ByteBuffer buffer) { + buffers.add(buffer); + sizeInBytes += buffer.remaining(); + dir.sizeInBytes.addAndGet(buffer.remaining()); + } + + @Override + synchronized ByteBuffer getBuffer(int index) { + return super.getBuffer(index); + } + + @Override + synchronized int numBuffers() { + return super.numBuffers(); + } + + @Override + synchronized long sizeInBytes() { + return super.sizeInBytes(); + } +} diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexInput.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexInput.java new file mode 100644 index 0000000..aeba553 --- /dev/null +++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexInput.java @@ -0,0 +1,198 @@ +package org.apache.lucene.store.bytebuffer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.store.IndexInput; + +import java.io.EOFException; +import java.io.IOException; +import java.nio.BufferUnderflowException; +import java.nio.ByteBuffer; + +/** + */ +public class ByteBufferIndexInput extends IndexInput { + + private final static ByteBuffer EMPTY_BUFFER = ByteBuffer.allocate(0).asReadOnlyBuffer(); + + private final ByteBufferFile file; + private final long length; + + private ByteBuffer currentBuffer; + private int currentBufferIndex; + + private long bufferStart; + private final int BUFFER_SIZE; + + private volatile boolean closed = false; + + public ByteBufferIndexInput(String name, ByteBufferFile file) throws IOException { + super("BBIndexInput(name=" + name + ")"); + this.file = file; + this.file.incRef(); + this.length = file.getLength(); + this.BUFFER_SIZE = file.bufferSize; + + // make sure that we switch to the + // first needed buffer lazily + currentBufferIndex = -1; + currentBuffer = EMPTY_BUFFER; + } + + @Override + public void close() { + // we protected from double closing the index input since + // some tests do that... + if (closed) { + return; + } + closed = true; + file.decRef(); + } + + @Override + public long length() { + return length; + } + + @Override + public short readShort() throws IOException { + try { + currentBuffer.mark(); + return currentBuffer.getShort(); + } catch (BufferUnderflowException e) { + currentBuffer.reset(); + return super.readShort(); + } + } + + @Override + public int readInt() throws IOException { + try { + currentBuffer.mark(); + return currentBuffer.getInt(); + } catch (BufferUnderflowException e) { + currentBuffer.reset(); + return super.readInt(); + } + } + + @Override + public long readLong() throws IOException { + try { + currentBuffer.mark(); + return currentBuffer.getLong(); + } catch (BufferUnderflowException e) { + currentBuffer.reset(); + return super.readLong(); + } + } + + @Override + public byte readByte() throws IOException { + if (!currentBuffer.hasRemaining()) { + currentBufferIndex++; + switchCurrentBuffer(true); + } + return currentBuffer.get(); + } + + @Override + public void readBytes(byte[] b, int offset, int len) throws IOException { + while (len > 0) { + if (!currentBuffer.hasRemaining()) { + currentBufferIndex++; + switchCurrentBuffer(true); + } + + int remainInBuffer = currentBuffer.remaining(); + int bytesToCopy = len < remainInBuffer ? len : remainInBuffer; + currentBuffer.get(b, offset, bytesToCopy); + offset += bytesToCopy; + len -= bytesToCopy; + } + } + + @Override + public long getFilePointer() { + return currentBufferIndex < 0 ? 0 : bufferStart + currentBuffer.position(); + } + + @Override + public void seek(long pos) throws IOException { + if (currentBuffer == EMPTY_BUFFER || pos < bufferStart || pos >= bufferStart + BUFFER_SIZE) { + currentBufferIndex = (int) (pos / BUFFER_SIZE); + switchCurrentBuffer(false); + } + try { + currentBuffer.position((int) (pos % BUFFER_SIZE)); + // Grrr, need to wrap in IllegalArgumentException since tests (if not other places) + // expect an IOException... 
+ } catch (IllegalArgumentException e) { + IOException ioException = new IOException("seeking past position"); + ioException.initCause(e); + throw ioException; + } + } + + private void switchCurrentBuffer(boolean enforceEOF) throws IOException { + if (currentBufferIndex >= file.numBuffers()) { + // end of file reached, no more buffers left + if (enforceEOF) { + throw new EOFException("Read past EOF (resource: " + this + ")"); + } else { + // Force EOF if a read takes place at this position + currentBufferIndex--; + currentBuffer.position(currentBuffer.limit()); + } + } else { + ByteBuffer buffer = file.getBuffer(currentBufferIndex); + // we must duplicate (and make it read only while we are at it) since we need position and such to be independent + currentBuffer = buffer.asReadOnlyBuffer(); + currentBuffer.position(0); + bufferStart = (long) BUFFER_SIZE * (long) currentBufferIndex; + // if we are at the tip, limit the current buffer to only whats available to read + long buflen = length - bufferStart; + if (buflen < BUFFER_SIZE) { + currentBuffer.limit((int) buflen); + } + + // we need to enforce EOF here as well... + if (!currentBuffer.hasRemaining()) { + if (enforceEOF) { + throw new EOFException("Read past EOF (resource: " + this + ")"); + } else { + // Force EOF if a read takes place at this position + currentBufferIndex--; + currentBuffer.position(currentBuffer.limit()); + } + } + } + } + + @Override + public IndexInput clone() { + ByteBufferIndexInput cloned = (ByteBufferIndexInput) super.clone(); + cloned.file.incRef(); // inc ref on cloned one + if (currentBuffer != EMPTY_BUFFER) { + cloned.currentBuffer = currentBuffer.asReadOnlyBuffer(); + cloned.currentBuffer.position(currentBuffer.position()); + } + return cloned; + } +} diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexOutput.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexOutput.java new file mode 100644 index 0000000..a667bab --- /dev/null +++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexOutput.java @@ -0,0 +1,132 @@ +package org.apache.lucene.store.bytebuffer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import org.apache.lucene.store.IndexOutput; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + */ +public class ByteBufferIndexOutput extends IndexOutput { + + private final static ByteBuffer EMPTY_BUFFER = ByteBuffer.allocate(0).asReadOnlyBuffer(); + + private final ByteBufferDirectory dir; + private final String name; + private final ByteBufferAllocator allocator; + private final ByteBufferAllocator.Type allocatorType; + private final int BUFFER_SIZE; + private final ByteBufferFileOutput file; + + private ByteBuffer currentBuffer; + private int currentBufferIndex; + + private long bufferStart; + + public ByteBufferIndexOutput(ByteBufferDirectory dir, String name, ByteBufferAllocator allocator, ByteBufferAllocator.Type allocatorType, ByteBufferFileOutput file) throws IOException { + this.dir = dir; + this.name = name; + this.allocator = allocator; + this.allocatorType = allocatorType; + this.BUFFER_SIZE = file.bufferSize; + this.file = file; + + currentBufferIndex = -1; + currentBuffer = EMPTY_BUFFER; + } + + @Override + public void close() throws IOException { + flush(); + dir.closeOutput(name, file); + } + + @Override + public void seek(long pos) throws IOException { + // set the file length in case we seek back + // and flush() has not been called yet + setFileLength(); + if (pos < bufferStart || pos >= bufferStart + BUFFER_SIZE) { + currentBufferIndex = (int) (pos / BUFFER_SIZE); + switchCurrentBuffer(); + } + currentBuffer.position((int) (pos % BUFFER_SIZE)); + } + + @Override + public long length() { + return file.getLength(); + } + + @Override + public void writeByte(byte b) throws IOException { + if (!currentBuffer.hasRemaining()) { + currentBufferIndex++; + switchCurrentBuffer(); + } + currentBuffer.put(b); + } + + @Override + public void writeBytes(byte[] b, int offset, int len) throws IOException { + while (len > 0) { + if (!currentBuffer.hasRemaining()) { + currentBufferIndex++; + switchCurrentBuffer(); + } + + int remainInBuffer = currentBuffer.remaining(); + int bytesToCopy = len < remainInBuffer ? len : remainInBuffer; + currentBuffer.put(b, offset, bytesToCopy); + offset += bytesToCopy; + len -= bytesToCopy; + } + } + + private void switchCurrentBuffer() throws IOException { + if (currentBufferIndex == file.numBuffers()) { + currentBuffer = allocator.allocate(allocatorType); + file.addBuffer(currentBuffer); + } else { + currentBuffer = file.getBuffer(currentBufferIndex); + } + currentBuffer.position(0); + bufferStart = (long) BUFFER_SIZE * (long) currentBufferIndex; + } + + private void setFileLength() { + long pointer = bufferStart + currentBuffer.position(); + if (pointer > file.getLength()) { + file.setLength(pointer); + } + } + + @Override + public void flush() throws IOException { + file.setLastModified(System.currentTimeMillis()); + setFileLength(); + } + + @Override + public long getFilePointer() { + return currentBufferIndex < 0 ? 0 : bufferStart + currentBuffer.position(); + } +} diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/CachingByteBufferAllocator.java b/src/main/java/org/apache/lucene/store/bytebuffer/CachingByteBufferAllocator.java new file mode 100644 index 0000000..925a76c --- /dev/null +++ b/src/main/java/org/apache/lucene/store/bytebuffer/CachingByteBufferAllocator.java @@ -0,0 +1,82 @@ +package org.apache.lucene.store.bytebuffer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
+
+/**
+ * The caching byte buffer allocator allows defining a global cache size for both the
+ * small and large buffers it allocates. Released buffers are reused when possible.
+ */
+public class CachingByteBufferAllocator extends PlainByteBufferAllocator {
+
+    private final BlockingQueue<ByteBuffer> smallCache;
+    private final BlockingQueue<ByteBuffer> largeCache;
+
+    /**
+     * @param direct                 If set to true, will allocate direct buffers (off heap).
+     * @param smallBufferSizeInBytes The size (in bytes) of each small buffer allocation.
+     * @param largeBufferSizeInBytes The size (in bytes) of each large buffer allocation.
+     * @param smallCacheSizeInBytes  The total size (in bytes) of the small buffer cache.
+     * @param largeCacheSizeInBytes  The total size (in bytes) of the large buffer cache.
+     */
+    public CachingByteBufferAllocator(boolean direct, int smallBufferSizeInBytes, int largeBufferSizeInBytes,
+                                      int smallCacheSizeInBytes, int largeCacheSizeInBytes) {
+        super(direct, smallBufferSizeInBytes, largeBufferSizeInBytes);
+        this.smallCache = new LinkedBlockingQueue<ByteBuffer>(smallCacheSizeInBytes / smallBufferSizeInBytes);
+        this.largeCache = new LinkedBlockingQueue<ByteBuffer>(largeCacheSizeInBytes / largeBufferSizeInBytes);
+    }
+
+    public ByteBuffer allocate(Type type) throws IOException {
+        ByteBuffer buffer = type == Type.SMALL ? smallCache.poll() : largeCache.poll();
+        if (buffer == null) {
+            buffer = super.allocate(type);
+        }
+        return buffer;
+    }
+
+    public void release(ByteBuffer buffer) {
+        if (buffer.capacity() == smallBufferSizeInBytes) {
+            boolean success = smallCache.offer(buffer);
+            if (!success) {
+                super.release(buffer);
+            }
+        } else if (buffer.capacity() == largeBufferSizeInBytes) {
+            boolean success = largeCache.offer(buffer);
+            if (!success) {
+                super.release(buffer);
+            }
+        }
+        // otherwise the buffer was not allocated by this allocator, so just ignore it
+    }
+
+    public void close() {
+        for (ByteBuffer buffer : smallCache) {
+            super.release(buffer);
+        }
+        smallCache.clear();
+        for (ByteBuffer buffer : largeCache) {
+            super.release(buffer);
+        }
+        largeCache.clear();
+    }
+}
diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/PlainByteBufferAllocator.java b/src/main/java/org/apache/lucene/store/bytebuffer/PlainByteBufferAllocator.java
new file mode 100644
index 0000000..e800b61
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/bytebuffer/PlainByteBufferAllocator.java
@@ -0,0 +1,67 @@
+package org.apache.lucene.store.bytebuffer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * A simple byte buffer allocator that does no caching. The direct flag controls
+ * whether the byte buffers are allocated off heap (direct) or on the heap.
+ */
+public class PlainByteBufferAllocator implements ByteBufferAllocator {
+
+    protected final boolean direct;
+
+    protected final int smallBufferSizeInBytes;
+
+    protected final int largeBufferSizeInBytes;
+
+    /**
+     * Constructs a new plain byte buffer allocator that does no caching.
+     *
+     * @param direct                 If set to true, will allocate direct buffers (off heap).
+     * @param smallBufferSizeInBytes The size (in bytes) of the small buffer allocation.
+     * @param largeBufferSizeInBytes The size (in bytes) of the large buffer allocation.
+     */
+    public PlainByteBufferAllocator(boolean direct, int smallBufferSizeInBytes, int largeBufferSizeInBytes) {
+        this.direct = direct;
+        this.smallBufferSizeInBytes = smallBufferSizeInBytes;
+        this.largeBufferSizeInBytes = largeBufferSizeInBytes;
+    }
+
+    public int sizeInBytes(Type type) {
+        return type == Type.SMALL ? smallBufferSizeInBytes : largeBufferSizeInBytes;
+    }
+
+    public ByteBuffer allocate(Type type) throws IOException {
+        int sizeToAllocate = type == Type.SMALL ? smallBufferSizeInBytes : largeBufferSizeInBytes;
+        if (direct) {
+            return ByteBuffer.allocateDirect(sizeToAllocate);
+        }
+        return ByteBuffer.allocate(sizeToAllocate);
+    }
+
+    public void release(ByteBuffer buffer) {
+        Cleaner.clean(buffer);
+    }
+
+    public void close() {
+        // nothing to do here...
+    }
+}
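The classes above compose into a complete in-memory, optionally off-heap Lucene Directory: an allocator hands out fixed-size ByteBuffers, ByteBufferFile chains them into a logical file, and the IndexInput/IndexOutput pair stream bytes across buffer boundaries (buffer index = position / bufferSize, offset = position % bufferSize). The standalone sketch below shows how these pieces might be wired together; it is illustrative only and not part of the diff, the buffer and cache sizes are arbitrary, and it assumes the Lucene 4.x-era IOContext/DataOutput APIs this code compiles against.

import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.bytebuffer.ByteBufferDirectory;
import org.apache.lucene.store.bytebuffer.CachingByteBufferAllocator;

public class ByteBufferDirectoryExample {
    public static void main(String[] args) throws Exception {
        // Direct (off-heap) buffers: 1 KB small buffers, 64 KB large buffers,
        // with 1 MB of each kept in the reuse caches. All sizes are illustrative.
        CachingByteBufferAllocator allocator =
                new CachingByteBufferAllocator(true, 1024, 64 * 1024, 1024 * 1024, 1024 * 1024);
        ByteBufferDirectory dir = new ByteBufferDirectory(allocator);
        try {
            // "example.txt" has no suffix listed in SMALL_FILES_SUFFIXES, so the
            // directory backs it with LARGE buffers from the allocator.
            IndexOutput out = dir.createOutput("example.txt", IOContext.DEFAULT);
            out.writeString("hello byte buffers");
            out.close(); // flushes and swaps in a read-only ByteBufferFile

            // Reads walk the buffer chain transparently, falling back to byte-by-byte
            // reads only when a value straddles two buffers.
            IndexInput in = dir.openInput("example.txt", IOContext.DEFAULT);
            System.out.println(in.readString() + " (" + in.length() + " bytes)");
            in.close();
        } finally {
            dir.close();       // deletes all files, returning their buffers to the allocator
            allocator.close(); // the directory only closes allocators it created itself
        }
    }
}

Note that ByteBufferDirectory.close() closes the allocator only when the no-argument constructor created it internally; a caller-supplied allocator, as in this sketch, must be closed by the caller once the directory is no longer needed.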