Diffstat (limited to 'src/main/java/org/apache/lucene')
-rw-r--r--  src/main/java/org/apache/lucene/analysis/CustomAnalyzerWrapper.java  77
-rw-r--r--  src/main/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java  56
-rw-r--r--  src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java  90
-rw-r--r--  src/main/java/org/apache/lucene/index/TrackingConcurrentMergeScheduler.java  149
-rw-r--r--  src/main/java/org/apache/lucene/index/TrackingSerialMergeScheduler.java  165
-rw-r--r--  src/main/java/org/apache/lucene/index/memory/ExtendedMemoryIndex.java  31
-rw-r--r--  src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java  170
-rw-r--r--  src/main/java/org/apache/lucene/queries/XTermsFilter.java  328
-rw-r--r--  src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java  521
-rw-r--r--  src/main/java/org/apache/lucene/queryparser/classic/ExistsFieldQueryExtension.java  38
-rw-r--r--  src/main/java/org/apache/lucene/queryparser/classic/FieldQueryExtension.java  31
-rw-r--r--  src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java  884
-rw-r--r--  src/main/java/org/apache/lucene/queryparser/classic/MissingFieldQueryExtension.java  39
-rw-r--r--  src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java  376
-rw-r--r--  src/main/java/org/apache/lucene/search/XReferenceManager.java  326
-rw-r--r--  src/main/java/org/apache/lucene/search/XSearcherManager.java  177
-rw-r--r--  src/main/java/org/apache/lucene/search/postingshighlight/CustomPassageFormatter.java  78
-rw-r--r--  src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java  187
-rw-r--r--  src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java  50
-rw-r--r--  src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java  777
-rw-r--r--  src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java  1052
-rw-r--r--  src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java  263
-rw-r--r--  src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java  161
-rw-r--r--  src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java  115
-rw-r--r--  src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java  143
-rw-r--r--  src/main/java/org/apache/lucene/store/StoreRateLimiting.java  94
-rw-r--r--  src/main/java/org/apache/lucene/store/StoreUtils.java  44
-rw-r--r--  src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferAllocator.java  97
-rw-r--r--  src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferDirectory.java  182
-rw-r--r--  src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFile.java  102
-rw-r--r--  src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFileOutput.java  65
-rw-r--r--  src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexInput.java  198
-rw-r--r--  src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexOutput.java  132
-rw-r--r--  src/main/java/org/apache/lucene/store/bytebuffer/CachingByteBufferAllocator.java  82
-rw-r--r--  src/main/java/org/apache/lucene/store/bytebuffer/PlainByteBufferAllocator.java  67
35 files changed, 7347 insertions(+), 0 deletions(-)
diff --git a/src/main/java/org/apache/lucene/analysis/CustomAnalyzerWrapper.java b/src/main/java/org/apache/lucene/analysis/CustomAnalyzerWrapper.java
new file mode 100644
index 0000000..6ea95e4
--- /dev/null
+++ b/src/main/java/org/apache/lucene/analysis/CustomAnalyzerWrapper.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.Reader;
+
+/**
+ * Similar to Lucene's {@link AnalyzerWrapper}, but allows the caller to set the reuse strategy.
+ * // TODO: add the ability to set the reuse strategy to Lucene's AnalyzerWrapper
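+ * <p>
+ * A minimal sketch of a subclass (names are illustrative) that only fixes the reuse strategy and
+ * leaves the wrapped token stream untouched:
+ * <pre>{@code
+ * class FixedStrategyWrapper extends CustomAnalyzerWrapper {
+ *     private final Analyzer delegate;
+ *
+ *     FixedStrategyWrapper(Analyzer delegate, ReuseStrategy strategy) {
+ *         super(strategy);
+ *         this.delegate = delegate;
+ *     }
+ *
+ *     protected Analyzer getWrappedAnalyzer(String fieldName) {
+ *         return delegate;
+ *     }
+ *
+ *     protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+ *         return components; // add or swap token filters here if needed
+ *     }
+ * }
+ * }</pre>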
+ */
+public abstract class CustomAnalyzerWrapper extends Analyzer {
+
+ /**
+     * Creates a new CustomAnalyzerWrapper with the given {@link Analyzer.ReuseStrategy}. Unlike
+     * {@link AnalyzerWrapper}, the strategy is not assumed to be {@link Analyzer.PerFieldReuseStrategy}
+     * but is supplied by the caller.
+ */
+ protected CustomAnalyzerWrapper(ReuseStrategy reuseStrategy) {
+ super(reuseStrategy);
+ }
+
+ /**
+ * Retrieves the wrapped Analyzer appropriate for analyzing the field with
+ * the given name
+ *
+ * @param fieldName Name of the field which is to be analyzed
+ * @return Analyzer for the field with the given name. Assumed to be non-null
+ */
+ protected abstract Analyzer getWrappedAnalyzer(String fieldName);
+
+ /**
+ * Wraps / alters the given TokenStreamComponents, taken from the wrapped
+ * Analyzer, to form new components. It is through this method that new
+ * TokenFilters can be added by AnalyzerWrappers.
+ *
+ * @param fieldName Name of the field which is to be analyzed
+ * @param components TokenStreamComponents taken from the wrapped Analyzer
+ * @return Wrapped / altered TokenStreamComponents.
+ */
+ protected abstract TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components);
+
+ @Override
+ protected final TokenStreamComponents createComponents(String fieldName, Reader aReader) {
+ return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName, aReader));
+ }
+
+ @Override
+ public int getPositionIncrementGap(String fieldName) {
+ return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName);
+ }
+
+ @Override
+ public int getOffsetGap(String fieldName) {
+ return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName);
+ }
+
+ @Override
+ public final Reader initReader(String fieldName, Reader reader) {
+ return getWrappedAnalyzer(fieldName).initReader(fieldName, reader);
+ }
+}
diff --git a/src/main/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java b/src/main/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java
new file mode 100644
index 0000000..ea93850
--- /dev/null
+++ b/src/main/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.io.IOException;
+
+/**
+ * A token filter that truncates tokens to a configured maximum character length.
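+ * <p>
+ * For example, with a size of 4 (tokenizer and input are illustrative):
+ * <pre>{@code
+ * TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_46, new StringReader("elasticsearch search"));
+ * ts = new TruncateTokenFilter(ts, 4);
+ * // emits "elas" and "sear"
+ * }</pre>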
+ */
+public class TruncateTokenFilter extends TokenFilter {
+
+ private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+
+ private final int size;
+
+ public TruncateTokenFilter(TokenStream in, int size) {
+ super(in);
+ this.size = size;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ final int length = termAttribute.length();
+ if (length > size) {
+ termAttribute.setLength(size);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
+
+
diff --git a/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java b/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java
new file mode 100644
index 0000000..4ccad58
--- /dev/null
+++ b/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
+
+import java.io.IOException;
+
+/**
+ * A token filter that removes duplicate tokens. Optionally, duplicates are only removed when they
+ * occur at the same position (i.e. with a position increment of 0).
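+ * <p>
+ * For example (tokenizer and input are illustrative):
+ * <pre>{@code
+ * // "the quick the fox" yields "the", "quick", "fox"
+ * TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_46, new StringReader("the quick the fox"));
+ * ts = new UniqueTokenFilter(ts);
+ * }</pre>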
+ */
+public class UniqueTokenFilter extends TokenFilter {
+
+ private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+ private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+
+    // a fixed version is fine here: the version only affects case folding, and we create the set with ignoreCase=false
+ private final CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false);
+ private final boolean onlyOnSamePosition;
+
+ public UniqueTokenFilter(TokenStream in) {
+ this(in, false);
+ }
+
+ public UniqueTokenFilter(TokenStream in, boolean onlyOnSamePosition) {
+ super(in);
+ this.onlyOnSamePosition = onlyOnSamePosition;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ while (input.incrementToken()) {
+ final char term[] = termAttribute.buffer();
+ final int length = termAttribute.length();
+
+ boolean duplicate;
+ if (onlyOnSamePosition) {
+ final int posIncrement = posIncAttribute.getPositionIncrement();
+ if (posIncrement > 0) {
+ previous.clear();
+ }
+
+ duplicate = (posIncrement == 0 && previous.contains(term, 0, length));
+ } else {
+ duplicate = previous.contains(term, 0, length);
+ }
+
+ // clone the term, and add to the set of seen terms.
+ char saved[] = new char[length];
+ System.arraycopy(term, 0, saved, 0, length);
+ previous.add(saved);
+
+ if (!duplicate) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public final void reset() throws IOException {
+ super.reset();
+ previous.clear();
+ }
+}
+
+
diff --git a/src/main/java/org/apache/lucene/index/TrackingConcurrentMergeScheduler.java b/src/main/java/org/apache/lucene/index/TrackingConcurrentMergeScheduler.java
new file mode 100644
index 0000000..66b3328
--- /dev/null
+++ b/src/main/java/org/apache/lucene/index/TrackingConcurrentMergeScheduler.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.index;
+
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.metrics.CounterMetric;
+import org.elasticsearch.common.metrics.MeanMetric;
+import org.elasticsearch.common.unit.ByteSizeValue;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
+import org.elasticsearch.index.merge.OnGoingMerge;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * An extension to {@link ConcurrentMergeScheduler} that tracks merge times as well as the total
+ * and currently running merges.
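+ * <p>
+ * A usage sketch (the logger and analyzer variables are illustrative):
+ * <pre>{@code
+ * TrackingConcurrentMergeScheduler scheduler = new TrackingConcurrentMergeScheduler(logger);
+ * IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
+ * config.setMergeScheduler(scheduler);
+ * // later, e.g. when building merge stats:
+ * long running = scheduler.currentMerges();
+ * long totalTimeInMillis = scheduler.totalMergeTime();
+ * }</pre>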
+ */
+public class TrackingConcurrentMergeScheduler extends ConcurrentMergeScheduler {
+
+ protected final ESLogger logger;
+
+ private final MeanMetric totalMerges = new MeanMetric();
+ private final CounterMetric totalMergesNumDocs = new CounterMetric();
+ private final CounterMetric totalMergesSizeInBytes = new CounterMetric();
+ private final CounterMetric currentMerges = new CounterMetric();
+ private final CounterMetric currentMergesNumDocs = new CounterMetric();
+ private final CounterMetric currentMergesSizeInBytes = new CounterMetric();
+
+ private final Set<OnGoingMerge> onGoingMerges = ConcurrentCollections.newConcurrentSet();
+ private final Set<OnGoingMerge> readOnlyOnGoingMerges = Collections.unmodifiableSet(onGoingMerges);
+
+ public TrackingConcurrentMergeScheduler(ESLogger logger) {
+ super();
+ this.logger = logger;
+ }
+
+ public long totalMerges() {
+ return totalMerges.count();
+ }
+
+ public long totalMergeTime() {
+ return totalMerges.sum();
+ }
+
+ public long totalMergeNumDocs() {
+ return totalMergesNumDocs.count();
+ }
+
+ public long totalMergeSizeInBytes() {
+ return totalMergesSizeInBytes.count();
+ }
+
+ public long currentMerges() {
+ return currentMerges.count();
+ }
+
+ public long currentMergesNumDocs() {
+ return currentMergesNumDocs.count();
+ }
+
+ public long currentMergesSizeInBytes() {
+ return currentMergesSizeInBytes.count();
+ }
+
+ public Set<OnGoingMerge> onGoingMerges() {
+ return readOnlyOnGoingMerges;
+ }
+
+ @Override
+ protected void doMerge(MergePolicy.OneMerge merge) throws IOException {
+ int totalNumDocs = merge.totalNumDocs();
+        // don't use #totalBytesSize() since it needs to be executed under the IW lock; might be fixed in a future Lucene version
+ long totalSizeInBytes = merge.estimatedMergeBytes;
+ long time = System.currentTimeMillis();
+ currentMerges.inc();
+ currentMergesNumDocs.inc(totalNumDocs);
+ currentMergesSizeInBytes.inc(totalSizeInBytes);
+
+ OnGoingMerge onGoingMerge = new OnGoingMerge(merge);
+ onGoingMerges.add(onGoingMerge);
+
+ if (logger.isTraceEnabled()) {
+ logger.trace("merge [{}] starting..., merging [{}] segments, [{}] docs, [{}] size, into [{}] estimated_size", merge.info == null ? "_na_" : merge.info.info.name, merge.segments.size(), totalNumDocs, new ByteSizeValue(totalSizeInBytes), new ByteSizeValue(merge.estimatedMergeBytes));
+ }
+ try {
+ beforeMerge(onGoingMerge);
+ super.doMerge(merge);
+ } finally {
+ long took = System.currentTimeMillis() - time;
+
+ onGoingMerges.remove(onGoingMerge);
+ afterMerge(onGoingMerge);
+
+ currentMerges.dec();
+ currentMergesNumDocs.dec(totalNumDocs);
+ currentMergesSizeInBytes.dec(totalSizeInBytes);
+
+ totalMergesNumDocs.inc(totalNumDocs);
+ totalMergesSizeInBytes.inc(totalSizeInBytes);
+ totalMerges.inc(took);
+ if (took > 20000) { // if more than 20 seconds, DEBUG log it
+ logger.debug("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took));
+ } else if (logger.isTraceEnabled()) {
+ logger.trace("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took));
+ }
+ }
+ }
+
+ /**
+ * A callback allowing for custom logic before an actual merge starts.
+ */
+ protected void beforeMerge(OnGoingMerge merge) {
+
+ }
+
+ /**
+     * A callback allowing for custom logic after an actual merge has finished.
+ */
+ protected void afterMerge(OnGoingMerge merge) {
+
+ }
+
+ @Override
+ public MergeScheduler clone() {
+ // Lucene IW makes a clone internally but since we hold on to this instance
+ // the clone will just be the identity.
+ return this;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/apache/lucene/index/TrackingSerialMergeScheduler.java b/src/main/java/org/apache/lucene/index/TrackingSerialMergeScheduler.java
new file mode 100644
index 0000000..bd3ed9d
--- /dev/null
+++ b/src/main/java/org/apache/lucene/index/TrackingSerialMergeScheduler.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.index;
+
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.metrics.CounterMetric;
+import org.elasticsearch.common.metrics.MeanMetric;
+import org.elasticsearch.common.unit.ByteSizeValue;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
+import org.elasticsearch.index.merge.OnGoingMerge;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+
+// LUCENE MONITOR - Copied from SerialMergeScheduler
+public class TrackingSerialMergeScheduler extends MergeScheduler {
+
+ protected final ESLogger logger;
+
+ private final MeanMetric totalMerges = new MeanMetric();
+ private final CounterMetric totalMergesNumDocs = new CounterMetric();
+ private final CounterMetric totalMergesSizeInBytes = new CounterMetric();
+ private final CounterMetric currentMerges = new CounterMetric();
+ private final CounterMetric currentMergesNumDocs = new CounterMetric();
+ private final CounterMetric currentMergesSizeInBytes = new CounterMetric();
+
+ private final Set<OnGoingMerge> onGoingMerges = ConcurrentCollections.newConcurrentSet();
+ private final Set<OnGoingMerge> readOnlyOnGoingMerges = Collections.unmodifiableSet(onGoingMerges);
+
+ public TrackingSerialMergeScheduler(ESLogger logger) {
+ this.logger = logger;
+ }
+
+ public long totalMerges() {
+ return totalMerges.count();
+ }
+
+ public long totalMergeTime() {
+ return totalMerges.sum();
+ }
+
+ public long totalMergeNumDocs() {
+ return totalMergesNumDocs.count();
+ }
+
+ public long totalMergeSizeInBytes() {
+ return totalMergesSizeInBytes.count();
+ }
+
+ public long currentMerges() {
+ return currentMerges.count();
+ }
+
+ public long currentMergesNumDocs() {
+ return currentMergesNumDocs.count();
+ }
+
+ public long currentMergesSizeInBytes() {
+ return currentMergesSizeInBytes.count();
+ }
+
+ public Set<OnGoingMerge> onGoingMerges() {
+ return readOnlyOnGoingMerges;
+ }
+
+ /**
+ * Just do the merges in sequence. We do this
+ * "synchronized" so that even if the application is using
+ * multiple threads, only one merge may run at a time.
+ */
+ @Override
+ synchronized public void merge(IndexWriter writer) throws CorruptIndexException, IOException {
+ while (true) {
+ MergePolicy.OneMerge merge = writer.getNextMerge();
+ if (merge == null)
+ break;
+
+            // unlike the stock SerialMergeScheduler, call mergeInit here so we get the correct stats
+            // mergeInit can be called several times without side effects (it checks that merge.info is not null)
+ writer.mergeInit(merge);
+
+ int totalNumDocs = merge.totalNumDocs();
+            // don't use #totalBytesSize() since it needs to be executed under the IW lock; might be fixed in a future Lucene version
+ long totalSizeInBytes = merge.estimatedMergeBytes;
+ long time = System.currentTimeMillis();
+ currentMerges.inc();
+ currentMergesNumDocs.inc(totalNumDocs);
+ currentMergesSizeInBytes.inc(totalSizeInBytes);
+
+ OnGoingMerge onGoingMerge = new OnGoingMerge(merge);
+ onGoingMerges.add(onGoingMerge);
+
+ // sadly, segment name is not available since mergeInit is called from merge itself...
+ if (logger.isTraceEnabled()) {
+ logger.trace("merge [{}] starting..., merging [{}] segments, [{}] docs, [{}] size, into [{}] estimated_size", merge.info == null ? "_na_" : merge.info.info.name, merge.segments.size(), totalNumDocs, new ByteSizeValue(totalSizeInBytes), new ByteSizeValue(merge.estimatedMergeBytes));
+ }
+ try {
+ beforeMerge(onGoingMerge);
+ writer.merge(merge);
+ } finally {
+ long took = System.currentTimeMillis() - time;
+
+ onGoingMerges.remove(onGoingMerge);
+ afterMerge(onGoingMerge);
+
+ currentMerges.dec();
+ currentMergesNumDocs.dec(totalNumDocs);
+ currentMergesSizeInBytes.dec(totalSizeInBytes);
+
+ totalMergesNumDocs.inc(totalNumDocs);
+ totalMergesSizeInBytes.inc(totalSizeInBytes);
+ totalMerges.inc(took);
+ if (took > 20000) { // if more than 20 seconds, DEBUG log it
+ logger.debug("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took));
+ } else if (logger.isTraceEnabled()) {
+ logger.trace("merge [{}] done, took [{}]", merge.info == null ? "_na_" : merge.info.info.name, TimeValue.timeValueMillis(took));
+ }
+ }
+ }
+ }
+
+ /**
+ * A callback allowing for custom logic before an actual merge starts.
+ */
+ protected void beforeMerge(OnGoingMerge merge) {
+
+ }
+
+ /**
+     * A callback allowing for custom logic after an actual merge has finished.
+ */
+ protected void afterMerge(OnGoingMerge merge) {
+
+ }
+
+ @Override
+ public void close() {
+ }
+
+ @Override
+ public MergeScheduler clone() {
+ // Lucene IW makes a clone internally but since we hold on to this instance
+ // the clone will just be the identity.
+ return this;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/apache/lucene/index/memory/ExtendedMemoryIndex.java b/src/main/java/org/apache/lucene/index/memory/ExtendedMemoryIndex.java
new file mode 100644
index 0000000..5f99fce
--- /dev/null
+++ b/src/main/java/org/apache/lucene/index/memory/ExtendedMemoryIndex.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.index.memory;
+
+/**
+ * This class extends {@link MemoryIndex} to make the reuse constructor visible.
+ */
+public final class ExtendedMemoryIndex extends MemoryIndex {
+
+ public ExtendedMemoryIndex(boolean storeOffsets, long maxReusedBytes) {
+ super(storeOffsets, maxReusedBytes);
+ }
+
+}
diff --git a/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java b/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java
new file mode 100644
index 0000000..e1dcf58
--- /dev/null
+++ b/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.queries;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.elasticsearch.common.lucene.search.Queries;
+import org.elasticsearch.index.mapper.FieldMapper;
+
+import java.io.IOException;
+
+/**
+ * Extended version of {@link CommonTermsQuery} that allows passing in a
+ * <tt>minimumNumberShouldMatch</tt> specification that uses the actual number of high-frequency terms
+ * to calculate the minimum number of matching terms.
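+ * <p>
+ * A usage sketch (the field, terms, cutoff and "2&lt;75%" spec are illustrative; the mapper may be null):
+ * <pre>{@code
+ * ExtendedCommonTermsQuery query =
+ *         new ExtendedCommonTermsQuery(Occur.SHOULD, Occur.SHOULD, 0.01f, true, null);
+ * query.add(new Term("body", "the"));
+ * query.add(new Term("body", "right"));
+ * query.add(new Term("body", "way"));
+ * query.setLowFreqMinimumNumberShouldMatch("2<75%");
+ * }</pre>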
+ */
+public class ExtendedCommonTermsQuery extends CommonTermsQuery {
+
+ private final FieldMapper<?> mapper;
+
+ public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency, boolean disableCoord, FieldMapper<?> mapper) {
+ super(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoord);
+ this.mapper = mapper;
+ }
+
+ private String lowFreqMinNumShouldMatchSpec;
+ private String highFreqMinNumShouldMatchSpec;
+
+ @Override
+ protected int calcLowFreqMinimumNumberShouldMatch(int numOptional) {
+ return calcMinimumNumberShouldMatch(lowFreqMinNumShouldMatchSpec, numOptional);
+ }
+
+ protected int calcMinimumNumberShouldMatch(String spec, int numOptional) {
+ if (spec == null) {
+ return 0;
+ }
+ return Queries.calculateMinShouldMatch(numOptional, spec);
+ }
+
+ @Override
+ protected int calcHighFreqMinimumNumberShouldMatch(int numOptional) {
+ return calcMinimumNumberShouldMatch(highFreqMinNumShouldMatchSpec, numOptional);
+ }
+
+ public void setHighFreqMinimumNumberShouldMatch(String spec) {
+ this.highFreqMinNumShouldMatchSpec = spec;
+ }
+
+ public String getHighFreqMinimumNumberShouldMatchSpec() {
+ return highFreqMinNumShouldMatchSpec;
+ }
+
+ public void setLowFreqMinimumNumberShouldMatch(String spec) {
+ this.lowFreqMinNumShouldMatchSpec = spec;
+ }
+
+ public String getLowFreqMinimumNumberShouldMatchSpec() {
+ return lowFreqMinNumShouldMatchSpec;
+ }
+
+ // LUCENE-UPGRADE: remove this method if on 4.8
+ @Override
+ public Query rewrite(IndexReader reader) throws IOException {
+ if (this.terms.isEmpty()) {
+ return new BooleanQuery();
+ } else if (this.terms.size() == 1) {
+ final Query tq = newTermQuery(this.terms.get(0), null);
+ tq.setBoost(getBoost());
+ return tq;
+ }
+ return super.rewrite(reader);
+ }
+
+ // LUCENE-UPGRADE: remove this method if on 4.8
+ @Override
+ protected Query buildQuery(final int maxDoc,
+ final TermContext[] contextArray, final Term[] queryTerms) {
+ BooleanQuery lowFreq = new BooleanQuery(disableCoord);
+ BooleanQuery highFreq = new BooleanQuery(disableCoord);
+ highFreq.setBoost(highFreqBoost);
+ lowFreq.setBoost(lowFreqBoost);
+ BooleanQuery query = new BooleanQuery(true);
+ for (int i = 0; i < queryTerms.length; i++) {
+ TermContext termContext = contextArray[i];
+ if (termContext == null) {
+ lowFreq.add(newTermQuery(queryTerms[i], null), lowFreqOccur);
+ } else {
+ if ((maxTermFrequency >= 1f && termContext.docFreq() > maxTermFrequency)
+ || (termContext.docFreq() > (int) Math.ceil(maxTermFrequency * (float) maxDoc))) {
+ highFreq.add(newTermQuery(queryTerms[i], termContext), highFreqOccur);
+ } else {
+ lowFreq.add(newTermQuery(queryTerms[i], termContext), lowFreqOccur);
+ }
+ }
+
+ }
+ final int numLowFreqClauses = lowFreq.clauses().size();
+ final int numHighFreqClauses = highFreq.clauses().size();
+ if (lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) {
+ int minMustMatch = calcLowFreqMinimumNumberShouldMatch(numLowFreqClauses);
+ lowFreq.setMinimumNumberShouldMatch(minMustMatch);
+ }
+ if (highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) {
+ int minMustMatch = calcHighFreqMinimumNumberShouldMatch(numHighFreqClauses);
+ highFreq.setMinimumNumberShouldMatch(minMustMatch);
+ }
+ if (lowFreq.clauses().isEmpty()) {
+ /*
+ * if lowFreq is empty we rewrite the high freq terms in a conjunction to
+ * prevent slow queries.
+ */
+ if (highFreq.getMinimumNumberShouldMatch() == 0 && highFreqOccur != Occur.MUST) {
+ for (BooleanClause booleanClause : highFreq) {
+ booleanClause.setOccur(Occur.MUST);
+ }
+ }
+ highFreq.setBoost(getBoost());
+ return highFreq;
+ } else if (highFreq.clauses().isEmpty()) {
+ // only do low freq terms - we don't have high freq terms
+ lowFreq.setBoost(getBoost());
+ return lowFreq;
+ } else {
+ query.add(highFreq, Occur.SHOULD);
+ query.add(lowFreq, Occur.MUST);
+ query.setBoost(getBoost());
+ return query;
+ }
+ }
+
+ //@Override
+ // LUCENE-UPGRADE: remove this method if on 4.8
+ protected Query newTermQuery(Term term, TermContext context) {
+ if (mapper == null) {
+ // this should be super.newTermQuery(term, context) once it's available in the super class
+ return context == null ? new TermQuery(term) : new TermQuery(term, context);
+ }
+ final Query query = mapper.queryStringTermQuery(term);
+ if (query == null) {
+ // this should be super.newTermQuery(term, context) once it's available in the super class
+ return context == null ? new TermQuery(term) : new TermQuery(term, context);
+ } else {
+ return query;
+ }
+ }
+}
diff --git a/src/main/java/org/apache/lucene/queries/XTermsFilter.java b/src/main/java/org/apache/lucene/queries/XTermsFilter.java
new file mode 100644
index 0000000..ea11f03
--- /dev/null
+++ b/src/main/java/org/apache/lucene/queries/XTermsFilter.java
@@ -0,0 +1,328 @@
+package org.apache.lucene.queries;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.util.*;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Constructs a filter for docs matching any of the terms added to this class.
+ * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
+ * a sequence. An example might be a collection of primary keys from a database query result or perhaps
+ * a choice of "category" labels picked by the end user. As a filter, this is much faster than the
+ * equivalent query (a BooleanQuery with many "should" TermQueries)
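+ * <p>
+ * For example (field and values are illustrative):
+ * <pre>{@code
+ * Filter byIds = new XTermsFilter("id", new BytesRef("1"), new BytesRef("17"), new BytesRef("42"));
+ * }</pre>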
+ */
+public final class XTermsFilter extends Filter {
+
+ static {
+ assert Version.LUCENE_46 == org.elasticsearch.Version.CURRENT.luceneVersion : "Remove this once we are on LUCENE_48 - see LUCENE-5502";
+ }
+
+ /*
+     * this class is often used for a large number of terms in a single field.
+     * to optimize for this case and to be filter-cache friendly we
+     * serialize all terms into a single byte array and store offsets
+     * in a parallel array to keep the number of objects constant and speed up
+ * equals / hashcode.
+ *
+ * This adds quite a bit of complexity but allows large term filters to
+ * be efficient for GC and cache-lookups
+ */
+ private final int[] offsets;
+ private final byte[] termsBytes;
+ private final TermsAndField[] termsAndFields;
+ private final int hashCode; // cached hashcode for fast cache lookups
+ private static final int PRIME = 31;
+
+ /**
+ * Creates a new {@link XTermsFilter} from the given list. The list
+ * can contain duplicate terms and multiple fields.
+ */
+ public XTermsFilter(final List<Term> terms) {
+ this(new FieldAndTermEnum() {
+ // we need to sort for deduplication and to have a common cache key
+ final Iterator<Term> iter = sort(terms).iterator();
+ @Override
+ public BytesRef next() {
+ if (iter.hasNext()) {
+ Term next = iter.next();
+ field = next.field();
+ return next.bytes();
+ }
+ return null;
+ }}, terms.size());
+ }
+
+ /**
+ * Creates a new {@link XTermsFilter} from the given {@link BytesRef} list for
+ * a single field.
+ */
+ public XTermsFilter(final String field, final List<BytesRef> terms) {
+ this(new FieldAndTermEnum(field) {
+ // we need to sort for deduplication and to have a common cache key
+ final Iterator<BytesRef> iter = sort(terms).iterator();
+ @Override
+ public BytesRef next() {
+ if (iter.hasNext()) {
+ return iter.next();
+ }
+ return null;
+ }
+ }, terms.size());
+ }
+
+ /**
+ * Creates a new {@link XTermsFilter} from the given {@link BytesRef} array for
+ * a single field.
+ */
+ public XTermsFilter(final String field, final BytesRef...terms) {
+ // this ctor prevents unnecessary Term creations
+ this(field, Arrays.asList(terms));
+ }
+
+ /**
+ * Creates a new {@link XTermsFilter} from the given array. The array can
+ * contain duplicate terms and multiple fields.
+ */
+ public XTermsFilter(final Term... terms) {
+ this(Arrays.asList(terms));
+ }
+
+
+ private XTermsFilter(FieldAndTermEnum iter, int length) {
+ // TODO: maybe use oal.index.PrefixCodedTerms instead?
+ // If number of terms is more than a few hundred it
+ // should be a win
+
+ // TODO: we also pack terms in FieldCache/DocValues
+ // ... maybe we can refactor to share that code
+
+ // TODO: yet another option is to build the union of the terms in
+        // an automaton and call intersect on the TermsEnum if the density is high
+
+ int hash = 9;
+ byte[] serializedTerms = new byte[0];
+ this.offsets = new int[length+1];
+ int lastEndOffset = 0;
+ int index = 0;
+ ArrayList<TermsAndField> termsAndFields = new ArrayList<TermsAndField>();
+ TermsAndField lastTermsAndField = null;
+ BytesRef previousTerm = null;
+ String previousField = null;
+ BytesRef currentTerm;
+ String currentField;
+ while((currentTerm = iter.next()) != null) {
+ currentField = iter.field();
+ if (currentField == null) {
+ throw new IllegalArgumentException("Field must not be null");
+ }
+ if (previousField != null) {
+ // deduplicate
+ if (previousField.equals(currentField)) {
+ if (previousTerm.bytesEquals(currentTerm)){
+ continue;
+ }
+ } else {
+ final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
+ lastTermsAndField = new TermsAndField(start, index, previousField);
+ termsAndFields.add(lastTermsAndField);
+ }
+ }
+ hash = PRIME * hash + currentField.hashCode();
+ hash = PRIME * hash + currentTerm.hashCode();
+ if (serializedTerms.length < lastEndOffset+currentTerm.length) {
+ serializedTerms = ArrayUtil.grow(serializedTerms, lastEndOffset+currentTerm.length);
+ }
+ System.arraycopy(currentTerm.bytes, currentTerm.offset, serializedTerms, lastEndOffset, currentTerm.length);
+ offsets[index] = lastEndOffset;
+ lastEndOffset += currentTerm.length;
+ index++;
+ previousTerm = currentTerm;
+ previousField = currentField;
+ }
+ offsets[index] = lastEndOffset;
+ final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
+ lastTermsAndField = new TermsAndField(start, index, previousField);
+ termsAndFields.add(lastTermsAndField);
+ this.termsBytes = ArrayUtil.shrink(serializedTerms, lastEndOffset);
+ this.termsAndFields = termsAndFields.toArray(new TermsAndField[termsAndFields.size()]);
+ this.hashCode = hash;
+
+ }
+
+
+ @Override
+ public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
+ final AtomicReader reader = context.reader();
+ FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time
+ final Fields fields = reader.fields();
+ final BytesRef spare = new BytesRef(this.termsBytes);
+ if (fields == null) {
+ return result;
+ }
+ Terms terms = null;
+ TermsEnum termsEnum = null;
+ DocsEnum docs = null;
+ for (TermsAndField termsAndField : this.termsAndFields) {
+ if ((terms = fields.terms(termsAndField.field)) != null) {
+ termsEnum = terms.iterator(termsEnum); // this won't return null
+ for (int i = termsAndField.start; i < termsAndField.end; i++) {
+ spare.offset = offsets[i];
+ spare.length = offsets[i+1] - offsets[i];
+ if (termsEnum.seekExact(spare)) {
+ docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE); // no freq since we don't need them
+ if (result == null) {
+ if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+ result = new FixedBitSet(reader.maxDoc());
+ // lazy init but don't do it in the hot loop since we could read many docs
+ result.set(docs.docID());
+ }
+ }
+ while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+ result.set(docs.docID());
+ }
+ }
+ }
+ }
+ }
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if ((obj == null) || (obj.getClass() != this.getClass())) {
+ return false;
+ }
+
+ XTermsFilter test = (XTermsFilter) obj;
+ // first check the fields before even comparing the bytes
+ if (test.hashCode == hashCode && Arrays.equals(termsAndFields, test.termsAndFields)) {
+ int lastOffset = termsAndFields[termsAndFields.length - 1].end;
+            // compare offsets first; since the terms are sorted, they must be identical
+ if (ArrayUtil.equals(offsets, 0, test.offsets, 0, lastOffset + 1)) {
+                // straight byte comparison; since the terms are sorted, they must be identical
+ return ArrayUtil.equals(termsBytes, 0, test.termsBytes, 0, offsets[lastOffset]);
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public int hashCode() {
+ return hashCode;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ BytesRef spare = new BytesRef(termsBytes);
+ boolean first = true;
+ for (int i = 0; i < termsAndFields.length; i++) {
+ TermsAndField current = termsAndFields[i];
+ for (int j = current.start; j < current.end; j++) {
+ spare.offset = offsets[j];
+ spare.length = offsets[j+1] - offsets[j];
+ if (!first) {
+ builder.append(' ');
+ }
+ first = false;
+ builder.append(current.field).append(':');
+ builder.append(spare.utf8ToString());
+ }
+ }
+
+ return builder.toString();
+ }
+
+ private static final class TermsAndField {
+ final int start;
+ final int end;
+ final String field;
+
+
+ TermsAndField(int start, int end, String field) {
+ super();
+ this.start = start;
+ this.end = end;
+ this.field = field;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((field == null) ? 0 : field.hashCode());
+ result = prime * result + end;
+ result = prime * result + start;
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) return true;
+ if (obj == null) return false;
+ if (getClass() != obj.getClass()) return false;
+ TermsAndField other = (TermsAndField) obj;
+ if (field == null) {
+ if (other.field != null) return false;
+ } else if (!field.equals(other.field)) return false;
+ if (end != other.end) return false;
+ if (start != other.start) return false;
+ return true;
+ }
+
+ }
+
+ private static abstract class FieldAndTermEnum {
+ protected String field;
+
+ public abstract BytesRef next();
+
+ public FieldAndTermEnum() {}
+
+ public FieldAndTermEnum(String field) { this.field = field; }
+
+ public String field() {
+ return field;
+ }
+ }
+
+ /*
+ * simple utility that returns the in-place sorted list
+ */
+ private static <T extends Comparable<? super T>> List<T> sort(List<T> toSort) {
+ if (toSort.isEmpty()) {
+ throw new IllegalArgumentException("no terms provided");
+ }
+ Collections.sort(toSort);
+ return toSort;
+ }
+}
diff --git a/src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java b/src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java
new file mode 100644
index 0000000..91bc21d
--- /dev/null
+++ b/src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java
@@ -0,0 +1,521 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.queryparser;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.*;
+import org.apache.lucene.util.QueryBuilder;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.common.lucene.Lucene;
+
+import java.util.Collections;
+import java.util.Map;
+
+/**
+ * XSimpleQueryParser is used to parse human readable query syntax.
+ * <p>
+ * The main idea behind this parser is that a person should be able to type
+ * whatever they want to represent a query, and this parser will do its best
+ * to interpret what to search for no matter how poorly composed the request
+ * may be. Tokens are considered to be any of a term, phrase, or subquery for the
+ * operations described below. Whitespace including ' ' '\n' '\r' and '\t'
+ * and certain operators may be used to delimit tokens ( ) + | " .
+ * <p>
+ * Any errors in query syntax will be ignored and the parser will attempt
+ * to decipher what it can; however, this may mean odd or unexpected results.
+ * <h4>Query Operators</h4>
+ * <ul>
+ * <li>'{@code +}' specifies {@code AND} operation: <tt>token1+token2</tt>
+ * <li>'{@code |}' specifies {@code OR} operation: <tt>token1|token2</tt>
+ * <li>'{@code -}' negates a single token: <tt>-token0</tt>
+ * <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt>
+ * <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt>
+ * <li>'{@code (}' and '{@code )}' specifies precedence: <tt>token1 + (token2 | token3)</tt>
+ * </ul>
+ * <p>
+ * The {@link #setDefaultOperator default operator} is {@code OR} if no other operator is specified.
+ * For example, the following will {@code OR} {@code token1} and {@code token2} together:
+ * <tt>token1 token2</tt>
+ * <p>
+ * Normal operator precedence will be simple order from right to left.
+ * For example, the following will evaluate {@code token1 OR token2} first,
+ * then {@code AND} with {@code token3}:
+ * <blockquote>token1 | token2 + token3</blockquote>
+ * <h4>Escaping</h4>
+ * <p>
+ * An individual term may contain any possible character with certain characters
+ * requiring escaping using a '{@code \}'. The following characters will need to be escaped in
+ * terms and phrases:
+ * {@code + | " ( ) ' \}
+ * <p>
+ * The '{@code -}' operator is a special case. On individual terms (not phrases) the first
+ * character of a term that is {@code -} must be escaped; however, any '{@code -}' characters
+ * beyond the first character do not need to be escaped.
+ * For example:
+ * <ul>
+ * <li>{@code -term1} -- Specifies {@code NOT} operation against {@code term1}
+ * <li>{@code \-term1} -- Searches for the term {@code -term1}.
+ * <li>{@code term-1} -- Searches for the term {@code term-1}.
+ * <li>{@code term\-1} -- Searches for the term {@code term-1}.
+ * </ul>
+ * <p>
+ * The '{@code *}' operator is a special case. On individual terms (not phrases) the last
+ * character of a term that is '{@code *}' must be escaped; however, any '{@code *}' characters
+ * before the last character do not need to be escaped:
+ * <ul>
+ * <li>{@code term1*} -- Searches for the prefix {@code term1}
+ * <li>{@code term1\*} -- Searches for the term {@code term1*}
+ * <li>{@code term*1} -- Searches for the term {@code term*1}
+ * <li>{@code term\*1} -- Searches for the term {@code term*1}
+ * </ul>
+ * <p>
+ * Note that above examples consider the terms before text processing.
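+ * <p>
+ * A usage sketch (analyzer, fields and weights are illustrative):
+ * <pre>{@code
+ * Map<String, Float> weights = new HashMap<String, Float>();
+ * weights.put("title", 3.0f);
+ * weights.put("body", 1.0f);
+ * XSimpleQueryParser parser = new XSimpleQueryParser(analyzer, weights);
+ * Query query = parser.parse("\"fox jumped\" + quick | -slow");
+ * }</pre>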
+ */
+public class XSimpleQueryParser extends QueryBuilder {
+
+ static {
+ assert Version.LUCENE_46.onOrAfter(Lucene.VERSION) : "Lucene 4.7 adds SimpleQueryParser, remove me!";
+ }
+
+ /** Map of fields to query against with their weights */
+ protected final Map<String,Float> weights;
+ /** flags to the parser (to turn features on/off) */
+ protected final int flags;
+
+ /** Enables {@code AND} operator (+) */
+ public static final int AND_OPERATOR = 1<<0;
+ /** Enables {@code NOT} operator (-) */
+ public static final int NOT_OPERATOR = 1<<1;
+ /** Enables {@code OR} operator (|) */
+ public static final int OR_OPERATOR = 1<<2;
+ /** Enables {@code PREFIX} operator (*) */
+ public static final int PREFIX_OPERATOR = 1<<3;
+ /** Enables {@code PHRASE} operator (") */
+ public static final int PHRASE_OPERATOR = 1<<4;
+ /** Enables {@code PRECEDENCE} operators: {@code (} and {@code )} */
+ public static final int PRECEDENCE_OPERATORS = 1<<5;
+ /** Enables {@code ESCAPE} operator (\) */
+ public static final int ESCAPE_OPERATOR = 1<<6;
+ /** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */
+ public static final int WHITESPACE_OPERATOR = 1<<7;
+
+ private BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD;
+
+ /** Creates a new parser searching over a single field. */
+ public XSimpleQueryParser(Analyzer analyzer, String field) {
+ this(analyzer, Collections.singletonMap(field, 1.0F));
+ }
+
+ /** Creates a new parser searching over multiple fields with different weights. */
+ public XSimpleQueryParser(Analyzer analyzer, Map<String, Float> weights) {
+ this(analyzer, weights, -1);
+ }
+
+ /** Creates a new parser with custom flags used to enable/disable certain features. */
+ public XSimpleQueryParser(Analyzer analyzer, Map<String, Float> weights, int flags) {
+ super(analyzer);
+ this.weights = weights;
+ this.flags = flags;
+ }
+
+ /** Parses the query text and returns parsed query (or null if empty) */
+ public Query parse(String queryText) {
+ char data[] = queryText.toCharArray();
+ char buffer[] = new char[data.length];
+
+ State state = new State(data, buffer, 0, data.length);
+ parseSubQuery(state);
+ return state.top;
+ }
+
+ private void parseSubQuery(State state) {
+ while (state.index < state.length) {
+ if (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0) {
+ // the beginning of a subquery has been found
+ consumeSubQuery(state);
+ } else if (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0) {
+ // this is an extraneous character so it is ignored
+ ++state.index;
+ } else if (state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0) {
+ // the beginning of a phrase has been found
+ consumePhrase(state);
+ } else if (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0) {
+ // an and operation has been explicitly set
+ // if an operation has already been set this one is ignored
+ // if a term (or phrase or subquery) has not been found yet the
+ // operation is also ignored since there is no previous
+ // term (or phrase or subquery) to and with
+ if (state.currentOperation == null && state.top != null) {
+ state.currentOperation = BooleanClause.Occur.MUST;
+ }
+
+ ++state.index;
+ } else if (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0) {
+ // an or operation has been explicitly set
+ // if an operation has already been set this one is ignored
+ // if a term (or phrase or subquery) has not been found yet the
+ // operation is also ignored since there is no previous
+ // term (or phrase or subquery) to or with
+ if (state.currentOperation == null && state.top != null) {
+ state.currentOperation = BooleanClause.Occur.SHOULD;
+ }
+
+ ++state.index;
+ } else if (state.data[state.index] == '-' && (flags & NOT_OPERATOR) != 0) {
+ // a not operator has been found, so increase the not count
+ // two not operators in a row negate each other
+ ++state.not;
+ ++state.index;
+
+ // continue so the not operator is not reset
+ // before the next character is determined
+ continue;
+ } else if ((state.data[state.index] == ' '
+ || state.data[state.index] == '\t'
+ || state.data[state.index] == '\n'
+ || state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0) {
+ // ignore any whitespace found as it may have already been
+                // used as a delimiter across a term (or phrase or subquery)
+ // or is simply extraneous
+ ++state.index;
+ } else {
+ // the beginning of a token has been found
+ consumeToken(state);
+ }
+
+ // reset the not operator as even whitespace is not allowed when
+ // specifying the not operation for a term (or phrase or subquery)
+ state.not = 0;
+ }
+ }
+
+ private void consumeSubQuery(State state) {
+ assert (flags & PRECEDENCE_OPERATORS) != 0;
+ int start = ++state.index;
+ int precedence = 1;
+ boolean escaped = false;
+
+ while (state.index < state.length) {
+ if (!escaped) {
+ if (state.data[state.index] == '\\' && (flags & ESCAPE_OPERATOR) != 0) {
+ // an escape character has been found so
+ // whatever character is next will become
+ // part of the subquery unless the escape
+ // character is the last one in the data
+ escaped = true;
+ ++state.index;
+
+ continue;
+ } else if (state.data[state.index] == '(') {
+ // increase the precedence as there is a
+ // subquery in the current subquery
+ ++precedence;
+ } else if (state.data[state.index] == ')') {
+ --precedence;
+
+ if (precedence == 0) {
+ // this should be the end of the subquery
+                        // all characters found will be used for
+ // creating the subquery
+ break;
+ }
+ }
+ }
+
+ escaped = false;
+ ++state.index;
+ }
+
+ if (state.index == state.length) {
+ // a closing parenthesis was never found so the opening
+ // parenthesis is considered extraneous and will be ignored
+ state.index = start;
+ } else if (state.index == start) {
+ // a closing parenthesis was found immediately after the opening
+ // parenthesis so the current operation is reset since it would
+ // have been applied to this subquery
+ state.currentOperation = null;
+
+ ++state.index;
+ } else {
+ // a complete subquery has been found and is recursively parsed by
+ // starting over with a new state object
+ State subState = new State(state.data, state.buffer, start, state.index);
+ parseSubQuery(subState);
+ buildQueryTree(state, subState.top);
+
+ ++state.index;
+ }
+ }
+
+ private void consumePhrase(State state) {
+ assert (flags & PHRASE_OPERATOR) != 0;
+ int start = ++state.index;
+ int copied = 0;
+ boolean escaped = false;
+
+ while (state.index < state.length) {
+ if (!escaped) {
+ if (state.data[state.index] == '\\' && (flags & ESCAPE_OPERATOR) != 0) {
+ // an escape character has been found so
+ // whatever character is next will become
+ // part of the phrase unless the escape
+ // character is the last one in the data
+ escaped = true;
+ ++state.index;
+
+ continue;
+ } else if (state.data[state.index] == '"') {
+ // this should be the end of the phrase
+                    // all characters found will be used for
+ // creating the phrase query
+ break;
+ }
+ }
+
+ escaped = false;
+ state.buffer[copied++] = state.data[state.index++];
+ }
+
+ if (state.index == state.length) {
+ // a closing double quote was never found so the opening
+ // double quote is considered extraneous and will be ignored
+ state.index = start;
+ } else if (state.index == start) {
+ // a closing double quote was found immediately after the opening
+ // double quote so the current operation is reset since it would
+ // have been applied to this phrase
+ state.currentOperation = null;
+
+ ++state.index;
+ } else {
+            // a complete phrase has been found and is run through
+            // the analyzer for the given field
+ String phrase = new String(state.buffer, 0, copied);
+ Query branch = newPhraseQuery(phrase);
+ buildQueryTree(state, branch);
+
+ ++state.index;
+ }
+ }
+
+ private void consumeToken(State state) {
+ int copied = 0;
+ boolean escaped = false;
+ boolean prefix = false;
+
+ while (state.index < state.length) {
+ if (!escaped) {
+ if (state.data[state.index] == '\\' && (flags & ESCAPE_OPERATOR) != 0) {
+ // an escape character has been found so
+ // whatever character is next will become
+ // part of the term unless the escape
+ // character is the last one in the data
+ escaped = true;
+ prefix = false;
+ ++state.index;
+
+ continue;
+ } else if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0)
+ || (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0)
+ || (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0)
+ || (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0)
+ || (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0)
+ || ((state.data[state.index] == ' '
+ || state.data[state.index] == '\t'
+ || state.data[state.index] == '\n'
+ || state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) {
+ // this should be the end of the term
+                    // all characters found will be used for
+ // creating the term query
+ break;
+ }
+
+                // prefix tracks whether or not the last character
+ // was a '*' operator that hasn't been escaped
+ // there must be at least one valid character before
+ // searching for a prefixed set of terms
+ prefix = copied > 0 && state.data[state.index] == '*' && (flags & PREFIX_OPERATOR) != 0;
+ }
+
+ escaped = false;
+ state.buffer[copied++] = state.data[state.index++];
+ }
+
+ if (copied > 0) {
+ final Query branch;
+
+ if (prefix) {
+ // if a term is found with a closing '*' it is considered to be a prefix query
+ // and will have prefix added as an option
+ String token = new String(state.buffer, 0, copied - 1);
+ branch = newPrefixQuery(token);
+ } else {
+ // a standard term has been found so it will be run through
+ // the entire analysis chain from the specified schema field
+ String token = new String(state.buffer, 0, copied);
+ branch = newDefaultQuery(token);
+ }
+
+ buildQueryTree(state, branch);
+ }
+ }
+
+ // buildQueryTree should be called after a term, phrase, or subquery
+ // is consumed to be added to our existing query tree
+ // this method will only add to the existing tree if the branch contained in state is not null
+ private void buildQueryTree(State state, Query branch) {
+ if (branch != null) {
+ // modify our branch to a BooleanQuery wrapper for not
+ // this is necessary any time a term, phrase, or subquery is negated
+ if (state.not % 2 == 1) {
+ BooleanQuery nq = new BooleanQuery();
+ nq.add(branch, BooleanClause.Occur.MUST_NOT);
+ nq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
+ branch = nq;
+ }
+
+ // first term (or phrase or subquery) found and will begin our query tree
+ if (state.top == null) {
+ state.top = branch;
+ } else {
+ // more than one term (or phrase or subquery) found
+ // set currentOperation to the default if no other operation is explicitly set
+ if (state.currentOperation == null) {
+ state.currentOperation = defaultOperator;
+ }
+
+ // operational change requiring a new parent node
+ // this occurs if the previous operation is not the same as current operation
+ // because the previous operation must be evaluated separately to preserve
+ // the proper precedence and the current operation will take over as the top of the tree
+ if (state.previousOperation != state.currentOperation) {
+ BooleanQuery bq = new BooleanQuery();
+ bq.add(state.top, state.currentOperation);
+ state.top = bq;
+ }
+
+                // add the branch under the current operation and remember it for the next clause
+ ((BooleanQuery)state.top).add(branch, state.currentOperation);
+ state.previousOperation = state.currentOperation;
+ }
+
+            // reset the current operation now that it has been applied
+            // to the incoming term (or phrase or subquery)
+ state.currentOperation = null;
+ }
+ }
+
+ /**
+ * Factory method to generate a standard query (no phrase or prefix operators).
+ */
+ protected Query newDefaultQuery(String text) {
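+        // combine the per-field queries as SHOULD clauses; BooleanQuery(true) disables coord scoring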
+ BooleanQuery bq = new BooleanQuery(true);
+ for (Map.Entry<String,Float> entry : weights.entrySet()) {
+ Query q = createBooleanQuery(entry.getKey(), text, defaultOperator);
+ if (q != null) {
+ q.setBoost(entry.getValue());
+ bq.add(q, BooleanClause.Occur.SHOULD);
+ }
+ }
+ return simplify(bq);
+ }
+
+ /**
+ * Factory method to generate a phrase query.
+ */
+ protected Query newPhraseQuery(String text) {
+ BooleanQuery bq = new BooleanQuery(true);
+ for (Map.Entry<String,Float> entry : weights.entrySet()) {
+ Query q = createPhraseQuery(entry.getKey(), text);
+ if (q != null) {
+ q.setBoost(entry.getValue());
+ bq.add(q, BooleanClause.Occur.SHOULD);
+ }
+ }
+ return simplify(bq);
+ }
+
+ /**
+ * Factory method to generate a prefix query.
+ */
+ protected Query newPrefixQuery(String text) {
+ BooleanQuery bq = new BooleanQuery(true);
+ for (Map.Entry<String,Float> entry : weights.entrySet()) {
+ PrefixQuery prefix = new PrefixQuery(new Term(entry.getKey(), text));
+ prefix.setBoost(entry.getValue());
+ bq.add(prefix, BooleanClause.Occur.SHOULD);
+ }
+ return simplify(bq);
+ }
+
+ /**
+ * Helper to simplify boolean queries with 0 or 1 clause
+ */
+ protected Query simplify(BooleanQuery bq) {
+ if (bq.clauses().isEmpty()) {
+ return null;
+ } else if (bq.clauses().size() == 1) {
+ return bq.clauses().get(0).getQuery();
+ } else {
+ return bq;
+ }
+ }
+
+ /**
+ * Returns the implicit operator setting, which will be
+ * either {@code SHOULD} or {@code MUST}.
+ */
+ public BooleanClause.Occur getDefaultOperator() {
+ return defaultOperator;
+ }
+
+ /**
+ * Sets the implicit operator setting, which must be
+ * either {@code SHOULD} or {@code MUST}.
+ */
+ public void setDefaultOperator(BooleanClause.Occur operator) {
+ if (operator != BooleanClause.Occur.SHOULD && operator != BooleanClause.Occur.MUST) {
+ throw new IllegalArgumentException("invalid operator: only SHOULD or MUST are allowed");
+ }
+ this.defaultOperator = operator;
+ }
+
+ static class State {
+ final char[] data; // the characters in the query string
+ final char[] buffer; // a temporary buffer used to reduce necessary allocations
+ int index;
+ int length;
+
+ BooleanClause.Occur currentOperation;
+ BooleanClause.Occur previousOperation;
+ int not;
+
+ Query top;
+
+ State(char[] data, char[] buffer, int index, int length) {
+ this.data = data;
+ this.buffer = buffer;
+ this.index = index;
+ this.length = length;
+ }
+ }
+}
+
diff --git a/src/main/java/org/apache/lucene/queryparser/classic/ExistsFieldQueryExtension.java b/src/main/java/org/apache/lucene/queryparser/classic/ExistsFieldQueryExtension.java
new file mode 100644
index 0000000..470f7b8
--- /dev/null
+++ b/src/main/java/org/apache/lucene/queryparser/classic/ExistsFieldQueryExtension.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.queryparser.classic;
+
+import org.apache.lucene.search.Query;
+import org.elasticsearch.common.lucene.search.XConstantScoreQuery;
+import org.elasticsearch.index.query.ExistsFilterParser;
+import org.elasticsearch.index.query.QueryParseContext;
+
+/**
+ *
+ */
+public class ExistsFieldQueryExtension implements FieldQueryExtension {
+
+ public static final String NAME = "_exists_";
+
+ @Override
+ public Query query(QueryParseContext parseContext, String queryText) {
+ return new XConstantScoreQuery(ExistsFilterParser.newFilter(parseContext, queryText, null));
+ }
+}
diff --git a/src/main/java/org/apache/lucene/queryparser/classic/FieldQueryExtension.java b/src/main/java/org/apache/lucene/queryparser/classic/FieldQueryExtension.java
new file mode 100644
index 0000000..003ff18
--- /dev/null
+++ b/src/main/java/org/apache/lucene/queryparser/classic/FieldQueryExtension.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.queryparser.classic;
+
+import org.apache.lucene.search.Query;
+import org.elasticsearch.index.query.QueryParseContext;
+
+/**
+ *
+ */
+public interface FieldQueryExtension {
+
+ Query query(QueryParseContext parseContext, String queryText);
+}
diff --git a/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java b/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java
new file mode 100644
index 0000000..eceada9
--- /dev/null
+++ b/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java
@@ -0,0 +1,884 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.queryparser.classic;
+
+import com.google.common.base.Objects;
+import com.google.common.collect.ImmutableMap;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.*;
+import org.apache.lucene.util.automaton.RegExp;
+import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.common.lucene.search.MatchNoDocsQuery;
+import org.elasticsearch.common.lucene.search.Queries;
+import org.elasticsearch.common.lucene.search.XFilteredQuery;
+import org.elasticsearch.common.unit.Fuzziness;
+import org.elasticsearch.index.mapper.FieldMapper;
+import org.elasticsearch.index.mapper.MapperService;
+import org.elasticsearch.index.query.QueryParseContext;
+import org.elasticsearch.index.query.support.QueryParsers;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import static org.elasticsearch.common.lucene.search.Queries.fixNegativeQueryIfNeeded;
+import static org.elasticsearch.common.lucene.search.Queries.optimizeQuery;
+import static org.elasticsearch.index.query.support.QueryParsers.wrapSmartNameQuery;
+
+/**
+ * A query parser that uses the {@link MapperService} in order to build smarter
+ * queries based on the mapping information.
+ * <p/>
+ * <p>Also breaks fields with [type].[name] into a boolean query that must include the type
+ * as well as the query on the name.
+ */
+public class MapperQueryParser extends QueryParser {
+
+ public static final ImmutableMap<String, FieldQueryExtension> fieldQueryExtensions;
+
+ static {
+ fieldQueryExtensions = ImmutableMap.<String, FieldQueryExtension>builder()
+ .put(ExistsFieldQueryExtension.NAME, new ExistsFieldQueryExtension())
+ .put(MissingFieldQueryExtension.NAME, new MissingFieldQueryExtension())
+ .build();
+ }
+
+ private final QueryParseContext parseContext;
+
+ private QueryParserSettings settings;
+
+ private Analyzer quoteAnalyzer;
+
+ private boolean forcedAnalyzer;
+ private boolean forcedQuoteAnalyzer;
+
+ private FieldMapper currentMapper;
+
+ private boolean analyzeWildcard;
+
+ private String quoteFieldSuffix;
+
+ public MapperQueryParser(QueryParseContext parseContext) {
+ super(Lucene.QUERYPARSER_VERSION, null, null);
+ this.parseContext = parseContext;
+ }
+
+ public MapperQueryParser(QueryParserSettings settings, QueryParseContext parseContext) {
+ super(Lucene.QUERYPARSER_VERSION, settings.defaultField(), settings.defaultAnalyzer());
+ this.parseContext = parseContext;
+ reset(settings);
+ }
+
+ public void reset(QueryParserSettings settings) {
+ this.settings = settings;
+ this.field = settings.defaultField();
+
+ if (settings.fields() != null) {
+ if (settings.fields.size() == 1) {
+ // just mark it as the default field
+ this.field = settings.fields().get(0);
+ } else {
+ // otherwise, we need to have the default field being null...
+ this.field = null;
+ }
+ }
+
+ this.forcedAnalyzer = settings.forcedAnalyzer() != null;
+ this.setAnalyzer(forcedAnalyzer ? settings.forcedAnalyzer() : settings.defaultAnalyzer());
+ if (settings.forcedQuoteAnalyzer() != null) {
+ this.forcedQuoteAnalyzer = true;
+ this.quoteAnalyzer = settings.forcedQuoteAnalyzer();
+ } else if (forcedAnalyzer) {
+ this.forcedQuoteAnalyzer = true;
+ this.quoteAnalyzer = settings.forcedAnalyzer();
+ } else {
+            this.forcedQuoteAnalyzer = false;
+ this.quoteAnalyzer = settings.defaultQuoteAnalyzer();
+ }
+ this.quoteFieldSuffix = settings.quoteFieldSuffix();
+ setMultiTermRewriteMethod(settings.rewriteMethod());
+ setEnablePositionIncrements(settings.enablePositionIncrements());
+ setAutoGeneratePhraseQueries(settings.autoGeneratePhraseQueries());
+ setAllowLeadingWildcard(settings.allowLeadingWildcard());
+ setLowercaseExpandedTerms(settings.lowercaseExpandedTerms());
+ setPhraseSlop(settings.phraseSlop());
+ setDefaultOperator(settings.defaultOperator());
+ setFuzzyMinSim(settings.fuzzyMinSim());
+ setFuzzyPrefixLength(settings.fuzzyPrefixLength());
+ this.analyzeWildcard = settings.analyzeWildcard();
+ }
+
+ /**
+     * We override this one so we can get the fuzzy part to be treated as a string, so people can do: "age:10~5" or "timestamp:2012-10-10~5d"
+ */
+ @Override
+ Query handleBareFuzzy(String qfield, Token fuzzySlop, String termImage) throws ParseException {
+ if (fuzzySlop.image.length() == 1) {
+ return getFuzzyQuery(qfield, termImage, Float.toString(fuzzyMinSim));
+ }
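+        // fuzzySlop.image includes the leading '~', so everything after it is passed
+        // through as the raw fuzziness string (e.g. "5" or "5d")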
+ return getFuzzyQuery(qfield, termImage, fuzzySlop.image.substring(1));
+ }
+
+ @Override
+ protected Query newTermQuery(Term term) {
+ if (currentMapper != null) {
+ Query termQuery = currentMapper.queryStringTermQuery(term);
+ if (termQuery != null) {
+ return termQuery;
+ }
+ }
+ return super.newTermQuery(term);
+ }
+
+ @Override
+ protected Query newMatchAllDocsQuery() {
+ return Queries.newMatchAllQuery();
+ }
+
+ @Override
+ public Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException {
+ FieldQueryExtension fieldQueryExtension = fieldQueryExtensions.get(field);
+ if (fieldQueryExtension != null) {
+ return fieldQueryExtension.query(parseContext, queryText);
+ }
+ Collection<String> fields = extractMultiFields(field);
+ if (fields != null) {
+ if (fields.size() == 1) {
+ return getFieldQuerySingle(fields.iterator().next(), queryText, quoted);
+ }
+ if (settings.useDisMax()) {
+ DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker());
+ boolean added = false;
+ for (String mField : fields) {
+ Query q = getFieldQuerySingle(mField, queryText, quoted);
+ if (q != null) {
+ added = true;
+ applyBoost(mField, q);
+ disMaxQuery.add(q);
+ }
+ }
+ if (!added) {
+ return null;
+ }
+ return disMaxQuery;
+ } else {
+ List<BooleanClause> clauses = new ArrayList<BooleanClause>();
+ for (String mField : fields) {
+ Query q = getFieldQuerySingle(mField, queryText, quoted);
+ if (q != null) {
+ applyBoost(mField, q);
+ clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
+ }
+ }
+ if (clauses.size() == 0) // happens for stopwords
+ return null;
+ return getBooleanQuery(clauses, true);
+ }
+ } else {
+ return getFieldQuerySingle(field, queryText, quoted);
+ }
+ }
+
+ private Query getFieldQuerySingle(String field, String queryText, boolean quoted) throws ParseException {
+ if (!quoted && queryText.length() > 1) {
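+            // shorthand range syntax: treat "field:>value", "field:>=value",
+            // "field:<value" and "field:<=value" as single-ended range queries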
+ if (queryText.charAt(0) == '>') {
+ if (queryText.length() > 2) {
+ if (queryText.charAt(1) == '=') {
+ return getRangeQuerySingle(field, queryText.substring(2), null, true, true);
+ }
+ }
+ return getRangeQuerySingle(field, queryText.substring(1), null, false, true);
+ } else if (queryText.charAt(0) == '<') {
+ if (queryText.length() > 2) {
+ if (queryText.charAt(1) == '=') {
+ return getRangeQuerySingle(field, null, queryText.substring(2), true, true);
+ }
+ }
+ return getRangeQuerySingle(field, null, queryText.substring(1), true, false);
+ }
+ }
+ currentMapper = null;
+ Analyzer oldAnalyzer = getAnalyzer();
+ try {
+ MapperService.SmartNameFieldMappers fieldMappers = null;
+ if (quoted) {
+ setAnalyzer(quoteAnalyzer);
+ if (quoteFieldSuffix != null) {
+ fieldMappers = parseContext.smartFieldMappers(field + quoteFieldSuffix);
+ }
+ }
+ if (fieldMappers == null) {
+ fieldMappers = parseContext.smartFieldMappers(field);
+ }
+ if (fieldMappers != null) {
+ if (quoted) {
+ if (!forcedQuoteAnalyzer) {
+ setAnalyzer(fieldMappers.searchQuoteAnalyzer());
+ }
+ } else {
+ if (!forcedAnalyzer) {
+ setAnalyzer(fieldMappers.searchAnalyzer());
+ }
+ }
+ currentMapper = fieldMappers.fieldMappers().mapper();
+ if (currentMapper != null) {
+ Query query = null;
+ if (currentMapper.useTermQueryWithQueryString()) {
+ try {
+ if (fieldMappers.explicitTypeInNameWithDocMapper()) {
+ String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[]{fieldMappers.docMapper().type()});
+ try {
+ query = currentMapper.termQuery(queryText, parseContext);
+ } finally {
+ QueryParseContext.setTypes(previousTypes);
+ }
+ } else {
+ query = currentMapper.termQuery(queryText, parseContext);
+ }
+ } catch (RuntimeException e) {
+ if (settings.lenient()) {
+ return null;
+ } else {
+ throw e;
+ }
+ }
+ }
+ if (query == null) {
+ query = super.getFieldQuery(currentMapper.names().indexName(), queryText, quoted);
+ }
+ return wrapSmartNameQuery(query, fieldMappers, parseContext);
+ }
+ }
+ return super.getFieldQuery(field, queryText, quoted);
+ } finally {
+ setAnalyzer(oldAnalyzer);
+ }
+ }
+
+ @Override
+ protected Query getFieldQuery(String field, String queryText, int slop) throws ParseException {
+ Collection<String> fields = extractMultiFields(field);
+ if (fields != null) {
+ if (settings.useDisMax()) {
+ DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker());
+ boolean added = false;
+ for (String mField : fields) {
+ Query q = super.getFieldQuery(mField, queryText, slop);
+ if (q != null) {
+ added = true;
+ applyBoost(mField, q);
+ applySlop(q, slop);
+ disMaxQuery.add(q);
+ }
+ }
+ if (!added) {
+ return null;
+ }
+ return disMaxQuery;
+ } else {
+ List<BooleanClause> clauses = new ArrayList<BooleanClause>();
+ for (String mField : fields) {
+ Query q = super.getFieldQuery(mField, queryText, slop);
+ if (q != null) {
+ applyBoost(mField, q);
+ applySlop(q, slop);
+ clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
+ }
+ }
+ if (clauses.size() == 0) // happens for stopwords
+ return null;
+ return getBooleanQuery(clauses, true);
+ }
+ } else {
+ return super.getFieldQuery(field, queryText, slop);
+ }
+ }
+
+ @Override
+ protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException {
+ if ("*".equals(part1)) {
+ part1 = null;
+ }
+ if ("*".equals(part2)) {
+ part2 = null;
+ }
+
+ Collection<String> fields = extractMultiFields(field);
+
+ if (fields == null) {
+ return getRangeQuerySingle(field, part1, part2, startInclusive, endInclusive);
+ }
+
+ if (fields.size() == 1) {
+ return getRangeQuerySingle(fields.iterator().next(), part1, part2, startInclusive, endInclusive);
+ }
+
+ if (settings.useDisMax()) {
+ DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker());
+ boolean added = false;
+ for (String mField : fields) {
+ Query q = getRangeQuerySingle(mField, part1, part2, startInclusive, endInclusive);
+ if (q != null) {
+ added = true;
+ applyBoost(mField, q);
+ disMaxQuery.add(q);
+ }
+ }
+ if (!added) {
+ return null;
+ }
+ return disMaxQuery;
+ } else {
+ List<BooleanClause> clauses = new ArrayList<BooleanClause>();
+ for (String mField : fields) {
+ Query q = getRangeQuerySingle(mField, part1, part2, startInclusive, endInclusive);
+ if (q != null) {
+ applyBoost(mField, q);
+ clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
+ }
+ }
+ if (clauses.size() == 0) // happens for stopwords
+ return null;
+ return getBooleanQuery(clauses, true);
+ }
+ }
+
+ private Query getRangeQuerySingle(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) {
+ currentMapper = null;
+ MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field);
+ if (fieldMappers != null) {
+ currentMapper = fieldMappers.fieldMappers().mapper();
+ if (currentMapper != null) {
+
+ if (lowercaseExpandedTerms && !currentMapper.isNumeric()) {
+ part1 = part1 == null ? null : part1.toLowerCase(locale);
+ part2 = part2 == null ? null : part2.toLowerCase(locale);
+ }
+
+ try {
+ Query rangeQuery = currentMapper.rangeQuery(part1, part2, startInclusive, endInclusive, parseContext);
+ return wrapSmartNameQuery(rangeQuery, fieldMappers, parseContext);
+ } catch (RuntimeException e) {
+ if (settings.lenient()) {
+ return null;
+ }
+ throw e;
+ }
+ }
+ }
+ return newRangeQuery(field, part1, part2, startInclusive, endInclusive);
+ }
+
+ protected Query getFuzzyQuery(String field, String termStr, String minSimilarity) throws ParseException {
+ if (lowercaseExpandedTerms) {
+ termStr = termStr.toLowerCase(locale);
+ }
+ Collection<String> fields = extractMultiFields(field);
+ if (fields != null) {
+ if (fields.size() == 1) {
+ return getFuzzyQuerySingle(fields.iterator().next(), termStr, minSimilarity);
+ }
+ if (settings.useDisMax()) {
+ DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker());
+ boolean added = false;
+ for (String mField : fields) {
+ Query q = getFuzzyQuerySingle(mField, termStr, minSimilarity);
+ if (q != null) {
+ added = true;
+ applyBoost(mField, q);
+ disMaxQuery.add(q);
+ }
+ }
+ if (!added) {
+ return null;
+ }
+ return disMaxQuery;
+ } else {
+ List<BooleanClause> clauses = new ArrayList<BooleanClause>();
+ for (String mField : fields) {
+                    Query q = getFuzzyQuerySingle(mField, termStr, minSimilarity);
+                    if (q != null) {
+                        applyBoost(mField, q);
+                        clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
+                    }
+                }
+ return getBooleanQuery(clauses, true);
+ }
+ } else {
+ return getFuzzyQuerySingle(field, termStr, minSimilarity);
+ }
+ }
+
+ private Query getFuzzyQuerySingle(String field, String termStr, String minSimilarity) throws ParseException {
+ currentMapper = null;
+ MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field);
+ if (fieldMappers != null) {
+ currentMapper = fieldMappers.fieldMappers().mapper();
+ if (currentMapper != null) {
+ try {
+ //LUCENE 4 UPGRADE I disabled transpositions here by default - maybe this needs to be changed
+ Query fuzzyQuery = currentMapper.fuzzyQuery(termStr, Fuzziness.build(minSimilarity), fuzzyPrefixLength, settings.fuzzyMaxExpansions(), false);
+ return wrapSmartNameQuery(fuzzyQuery, fieldMappers, parseContext);
+ } catch (RuntimeException e) {
+ if (settings.lenient()) {
+ return null;
+ }
+ throw e;
+ }
+ }
+ }
+ return super.getFuzzyQuery(field, termStr, Float.parseFloat(minSimilarity));
+ }
+
+ @Override
+ protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
+ String text = term.text();
+ int numEdits = FuzzyQuery.floatToEdits(minimumSimilarity, text.codePointCount(0, text.length()));
+ //LUCENE 4 UPGRADE I disabled transpositions here by default - maybe this needs to be changed
+ FuzzyQuery query = new FuzzyQuery(term, numEdits, prefixLength, settings.fuzzyMaxExpansions(), false);
+ QueryParsers.setRewriteMethod(query, settings.fuzzyRewriteMethod());
+ return query;
+ }
+
+ @Override
+ protected Query getPrefixQuery(String field, String termStr) throws ParseException {
+ if (lowercaseExpandedTerms) {
+ termStr = termStr.toLowerCase(locale);
+ }
+ Collection<String> fields = extractMultiFields(field);
+ if (fields != null) {
+ if (fields.size() == 1) {
+ return getPrefixQuerySingle(fields.iterator().next(), termStr);
+ }
+ if (settings.useDisMax()) {
+ DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker());
+ boolean added = false;
+ for (String mField : fields) {
+ Query q = getPrefixQuerySingle(mField, termStr);
+ if (q != null) {
+ added = true;
+ applyBoost(mField, q);
+ disMaxQuery.add(q);
+ }
+ }
+ if (!added) {
+ return null;
+ }
+ return disMaxQuery;
+ } else {
+ List<BooleanClause> clauses = new ArrayList<BooleanClause>();
+ for (String mField : fields) {
+ Query q = getPrefixQuerySingle(mField, termStr);
+ if (q != null) {
+ applyBoost(mField, q);
+ clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
+ }
+ }
+ if (clauses.size() == 0) // happens for stopwords
+ return null;
+ return getBooleanQuery(clauses, true);
+ }
+ } else {
+ return getPrefixQuerySingle(field, termStr);
+ }
+ }
+
+ private Query getPrefixQuerySingle(String field, String termStr) throws ParseException {
+ currentMapper = null;
+ Analyzer oldAnalyzer = getAnalyzer();
+ try {
+ MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field);
+ if (fieldMappers != null) {
+ if (!forcedAnalyzer) {
+ setAnalyzer(fieldMappers.searchAnalyzer());
+ }
+ currentMapper = fieldMappers.fieldMappers().mapper();
+ if (currentMapper != null) {
+ Query query = null;
+ if (currentMapper.useTermQueryWithQueryString()) {
+ if (fieldMappers.explicitTypeInNameWithDocMapper()) {
+ String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[]{fieldMappers.docMapper().type()});
+ try {
+ query = currentMapper.prefixQuery(termStr, multiTermRewriteMethod, parseContext);
+ } finally {
+ QueryParseContext.setTypes(previousTypes);
+ }
+ } else {
+ query = currentMapper.prefixQuery(termStr, multiTermRewriteMethod, parseContext);
+ }
+ }
+ if (query == null) {
+ query = getPossiblyAnalyzedPrefixQuery(currentMapper.names().indexName(), termStr);
+ }
+ return wrapSmartNameQuery(query, fieldMappers, parseContext);
+ }
+ }
+ return getPossiblyAnalyzedPrefixQuery(field, termStr);
+ } catch (RuntimeException e) {
+ if (settings.lenient()) {
+ return null;
+ }
+ throw e;
+ } finally {
+ setAnalyzer(oldAnalyzer);
+ }
+ }
+
+ private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
+ if (!analyzeWildcard) {
+ return super.getPrefixQuery(field, termStr);
+ }
+ // get Analyzer from superclass and tokenize the term
+ TokenStream source;
+ try {
+ source = getAnalyzer().tokenStream(field, termStr);
+ source.reset();
+ } catch (IOException e) {
+ return super.getPrefixQuery(field, termStr);
+ }
+ List<String> tlist = new ArrayList<String>();
+ CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
+
+ while (true) {
+ try {
+ if (!source.incrementToken()) break;
+ } catch (IOException e) {
+ break;
+ }
+ tlist.add(termAtt.toString());
+ }
+
+ try {
+ source.close();
+ } catch (IOException e) {
+ // ignore
+ }
+
+ if (tlist.size() == 1) {
+ return super.getPrefixQuery(field, tlist.get(0));
+ } else {
+ // build a boolean query with prefix on each one...
+ List<BooleanClause> clauses = new ArrayList<BooleanClause>();
+ for (String token : tlist) {
+ clauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
+ }
+ return getBooleanQuery(clauses, true);
+
+ //return super.getPrefixQuery(field, termStr);
+
+            /* this means that the analyzer used either added or consumed
+             * (common for a stemmer) tokens, and we can't build a PrefixQuery */
+// throw new ParseException("Cannot build PrefixQuery with analyzer "
+// + getAnalyzer().getClass()
+// + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
+ }
+
+ }
+
+ @Override
+ protected Query getWildcardQuery(String field, String termStr) throws ParseException {
+ if (termStr.equals("*")) {
+            // we want to optimize the match-all query for the "*:*" and "*" cases
+ if ("*".equals(field) || Objects.equal(field, this.field)) {
+ String actualField = field;
+ if (actualField == null) {
+ actualField = this.field;
+ }
+ if (actualField == null) {
+ return newMatchAllDocsQuery();
+ }
+ if ("*".equals(actualField) || "_all".equals(actualField)) {
+ return newMatchAllDocsQuery();
+ }
+ // effectively, we check if a field exists or not
+ return fieldQueryExtensions.get(ExistsFieldQueryExtension.NAME).query(parseContext, actualField);
+ }
+ }
+ if (lowercaseExpandedTerms) {
+ termStr = termStr.toLowerCase(locale);
+ }
+ Collection<String> fields = extractMultiFields(field);
+ if (fields != null) {
+ if (fields.size() == 1) {
+ return getWildcardQuerySingle(fields.iterator().next(), termStr);
+ }
+ if (settings.useDisMax()) {
+ DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker());
+ boolean added = false;
+ for (String mField : fields) {
+ Query q = getWildcardQuerySingle(mField, termStr);
+ if (q != null) {
+ added = true;
+ applyBoost(mField, q);
+ disMaxQuery.add(q);
+ }
+ }
+ if (!added) {
+ return null;
+ }
+ return disMaxQuery;
+ } else {
+ List<BooleanClause> clauses = new ArrayList<BooleanClause>();
+ for (String mField : fields) {
+ Query q = getWildcardQuerySingle(mField, termStr);
+ if (q != null) {
+ applyBoost(mField, q);
+ clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
+ }
+ }
+ if (clauses.size() == 0) // happens for stopwords
+ return null;
+ return getBooleanQuery(clauses, true);
+ }
+ } else {
+ return getWildcardQuerySingle(field, termStr);
+ }
+ }
+
+ private Query getWildcardQuerySingle(String field, String termStr) throws ParseException {
+ String indexedNameField = field;
+ currentMapper = null;
+ Analyzer oldAnalyzer = getAnalyzer();
+ try {
+ MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field);
+ if (fieldMappers != null) {
+ if (!forcedAnalyzer) {
+ setAnalyzer(fieldMappers.searchAnalyzer());
+ }
+ currentMapper = fieldMappers.fieldMappers().mapper();
+ if (currentMapper != null) {
+ indexedNameField = currentMapper.names().indexName();
+ }
+ return wrapSmartNameQuery(getPossiblyAnalyzedWildcardQuery(indexedNameField, termStr), fieldMappers, parseContext);
+ }
+ return getPossiblyAnalyzedWildcardQuery(indexedNameField, termStr);
+ } catch (RuntimeException e) {
+ if (settings.lenient()) {
+ return null;
+ }
+ throw e;
+ } finally {
+ setAnalyzer(oldAnalyzer);
+ }
+ }
+
+ private Query getPossiblyAnalyzedWildcardQuery(String field, String termStr) throws ParseException {
+ if (!analyzeWildcard) {
+ return super.getWildcardQuery(field, termStr);
+ }
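+        // walk the pattern and analyze each literal chunk between wildcard characters,
+        // then re-assemble the analyzed chunks around the original '?' / '*' characters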
+ boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
+ StringBuilder aggStr = new StringBuilder();
+ StringBuilder tmp = new StringBuilder();
+ for (int i = 0; i < termStr.length(); i++) {
+ char c = termStr.charAt(i);
+ if (c == '?' || c == '*') {
+ if (isWithinToken) {
+ try {
+ TokenStream source = getAnalyzer().tokenStream(field, tmp.toString());
+ source.reset();
+ CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
+ if (source.incrementToken()) {
+ String term = termAtt.toString();
+ if (term.length() == 0) {
+ // no tokens, just use what we have now
+ aggStr.append(tmp);
+ } else {
+ aggStr.append(term);
+ }
+ } else {
+ // no tokens, just use what we have now
+ aggStr.append(tmp);
+ }
+ source.close();
+ } catch (IOException e) {
+ aggStr.append(tmp);
+ }
+ tmp.setLength(0);
+ }
+ isWithinToken = false;
+ aggStr.append(c);
+ } else {
+ tmp.append(c);
+ isWithinToken = true;
+ }
+ }
+ if (isWithinToken) {
+ try {
+ TokenStream source = getAnalyzer().tokenStream(field, tmp.toString());
+ source.reset();
+ CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
+ if (source.incrementToken()) {
+ String term = termAtt.toString();
+ if (term.length() == 0) {
+ // no tokens, just use what we have now
+ aggStr.append(tmp);
+ } else {
+ aggStr.append(term);
+ }
+ } else {
+ // no tokens, just use what we have now
+ aggStr.append(tmp);
+ }
+ source.close();
+ } catch (IOException e) {
+ aggStr.append(tmp);
+ }
+ }
+
+ return super.getWildcardQuery(field, aggStr.toString());
+ }
+
+ @Override
+ protected Query getRegexpQuery(String field, String termStr) throws ParseException {
+ if (lowercaseExpandedTerms) {
+ termStr = termStr.toLowerCase(locale);
+ }
+ Collection<String> fields = extractMultiFields(field);
+ if (fields != null) {
+ if (fields.size() == 1) {
+ return getRegexpQuerySingle(fields.iterator().next(), termStr);
+ }
+ if (settings.useDisMax()) {
+ DisjunctionMaxQuery disMaxQuery = new DisjunctionMaxQuery(settings.tieBreaker());
+ boolean added = false;
+ for (String mField : fields) {
+ Query q = getRegexpQuerySingle(mField, termStr);
+ if (q != null) {
+ added = true;
+ applyBoost(mField, q);
+ disMaxQuery.add(q);
+ }
+ }
+ if (!added) {
+ return null;
+ }
+ return disMaxQuery;
+ } else {
+ List<BooleanClause> clauses = new ArrayList<BooleanClause>();
+ for (String mField : fields) {
+ Query q = getRegexpQuerySingle(mField, termStr);
+ if (q != null) {
+ applyBoost(mField, q);
+ clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
+ }
+ }
+ if (clauses.size() == 0) // happens for stopwords
+ return null;
+ return getBooleanQuery(clauses, true);
+ }
+ } else {
+ return getRegexpQuerySingle(field, termStr);
+ }
+ }
+
+ private Query getRegexpQuerySingle(String field, String termStr) throws ParseException {
+ currentMapper = null;
+ Analyzer oldAnalyzer = getAnalyzer();
+ try {
+ MapperService.SmartNameFieldMappers fieldMappers = parseContext.smartFieldMappers(field);
+ if (fieldMappers != null) {
+ if (!forcedAnalyzer) {
+ setAnalyzer(fieldMappers.searchAnalyzer());
+ }
+ currentMapper = fieldMappers.fieldMappers().mapper();
+ if (currentMapper != null) {
+ Query query = null;
+ if (currentMapper.useTermQueryWithQueryString()) {
+ if (fieldMappers.explicitTypeInNameWithDocMapper()) {
+ String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[]{fieldMappers.docMapper().type()});
+ try {
+ query = currentMapper.regexpQuery(termStr, RegExp.ALL, multiTermRewriteMethod, parseContext);
+ } finally {
+ QueryParseContext.setTypes(previousTypes);
+ }
+ } else {
+ query = currentMapper.regexpQuery(termStr, RegExp.ALL, multiTermRewriteMethod, parseContext);
+ }
+ }
+ if (query == null) {
+ query = super.getRegexpQuery(field, termStr);
+ }
+ return wrapSmartNameQuery(query, fieldMappers, parseContext);
+ }
+ }
+ return super.getRegexpQuery(field, termStr);
+ } catch (RuntimeException e) {
+ if (settings.lenient()) {
+ return null;
+ }
+ throw e;
+ } finally {
+ setAnalyzer(oldAnalyzer);
+ }
+ }
+
+ @Override
+ protected Query getBooleanQuery(List<BooleanClause> clauses, boolean disableCoord) throws ParseException {
+ Query q = super.getBooleanQuery(clauses, disableCoord);
+ if (q == null) {
+ return null;
+ }
+ return optimizeQuery(fixNegativeQueryIfNeeded(q));
+ }
+
+ private void applyBoost(String field, Query q) {
+ if (settings.boosts() != null) {
+ float boost = 1f;
+ if (settings.boosts().containsKey(field)) {
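+                // hppc optimization: lget() returns the value located by the preceding containsKey(field) call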
+ boost = settings.boosts().lget();
+ }
+ q.setBoost(boost);
+ }
+ }
+
+ private void applySlop(Query q, int slop) {
+ if (q instanceof XFilteredQuery) {
+ applySlop(((XFilteredQuery)q).getQuery(), slop);
+ }
+ if (q instanceof PhraseQuery) {
+ ((PhraseQuery) q).setSlop(slop);
+ } else if (q instanceof MultiPhraseQuery) {
+ ((MultiPhraseQuery) q).setSlop(slop);
+ }
+ }
+
+ private Collection<String> extractMultiFields(String field) {
+ Collection<String> fields = null;
+ if (field != null) {
+ fields = parseContext.simpleMatchToIndexNames(field);
+ } else {
+ fields = settings.fields();
+ }
+ return fields;
+ }
+
+ public Query parse(String query) throws ParseException {
+ if (query.trim().isEmpty()) {
+            // if the query string is empty we return no docs / an empty result
+            // this behavior is simple to change in the client if matching all docs
+            // or a default query is required instead
+ return new MatchNoDocsQuery();
+ }
+ return super.parse(query);
+ }
+}
diff --git a/src/main/java/org/apache/lucene/queryparser/classic/MissingFieldQueryExtension.java b/src/main/java/org/apache/lucene/queryparser/classic/MissingFieldQueryExtension.java
new file mode 100644
index 0000000..ad200d4
--- /dev/null
+++ b/src/main/java/org/apache/lucene/queryparser/classic/MissingFieldQueryExtension.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.queryparser.classic;
+
+import org.apache.lucene.search.Query;
+import org.elasticsearch.common.lucene.search.XConstantScoreQuery;
+import org.elasticsearch.index.query.MissingFilterParser;
+import org.elasticsearch.index.query.QueryParseContext;
+
+/**
+ *
+ */
+public class MissingFieldQueryExtension implements FieldQueryExtension {
+
+ public static final String NAME = "_missing_";
+
+ @Override
+ public Query query(QueryParseContext parseContext, String queryText) {
+ return new XConstantScoreQuery(MissingFilterParser.newFilter(parseContext, queryText,
+ MissingFilterParser.DEFAULT_EXISTENCE_VALUE, MissingFilterParser.DEFAULT_NULL_VALUE, null));
+ }
+}
diff --git a/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java b/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java
new file mode 100644
index 0000000..9b8f9aa
--- /dev/null
+++ b/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java
@@ -0,0 +1,376 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.queryparser.classic;
+
+import com.carrotsearch.hppc.ObjectFloatOpenHashMap;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.MultiTermQuery;
+
+import java.util.Collection;
+import java.util.List;
+
+/**
+ *
+ */
+public class QueryParserSettings {
+
+ public static final boolean DEFAULT_ALLOW_LEADING_WILDCARD = true;
+ public static final boolean DEFAULT_ANALYZE_WILDCARD = false;
+ public static final float DEFAULT_BOOST = 1.f;
+
+ private String queryString;
+ private String defaultField;
+ private float boost = DEFAULT_BOOST;
+    private QueryParser.Operator defaultOperator = QueryParser.Operator.OR;
+ private boolean autoGeneratePhraseQueries = false;
+ private boolean allowLeadingWildcard = DEFAULT_ALLOW_LEADING_WILDCARD;
+ private boolean lowercaseExpandedTerms = true;
+ private boolean enablePositionIncrements = true;
+ private int phraseSlop = 0;
+ private float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
+ private int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
+ private int fuzzyMaxExpansions = FuzzyQuery.defaultMaxExpansions;
+ private MultiTermQuery.RewriteMethod fuzzyRewriteMethod = null;
+ private boolean analyzeWildcard = DEFAULT_ANALYZE_WILDCARD;
+ private boolean escape = false;
+ private Analyzer defaultAnalyzer = null;
+ private Analyzer defaultQuoteAnalyzer = null;
+ private Analyzer forcedAnalyzer = null;
+ private Analyzer forcedQuoteAnalyzer = null;
+ private String quoteFieldSuffix = null;
+ private MultiTermQuery.RewriteMethod rewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
+ private String minimumShouldMatch;
+ private boolean lenient;
+
+
+ List<String> fields = null;
+ Collection<String> queryTypes = null;
+ ObjectFloatOpenHashMap<String> boosts = null;
+ float tieBreaker = 0.0f;
+ boolean useDisMax = true;
+
+ public boolean isCacheable() {
+ // a hack for now :) to determine if a query string is cacheable
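+        // "now" presumably guards against date math expressions (e.g. "now-1d") whose results change over time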
+ return !queryString.contains("now");
+ }
+
+ public String queryString() {
+ return queryString;
+ }
+
+ public void queryString(String queryString) {
+ this.queryString = queryString;
+ }
+
+ public String defaultField() {
+ return defaultField;
+ }
+
+ public void defaultField(String defaultField) {
+ this.defaultField = defaultField;
+ }
+
+ public float boost() {
+ return boost;
+ }
+
+ public void boost(float boost) {
+ this.boost = boost;
+ }
+
+ public QueryParser.Operator defaultOperator() {
+ return defaultOperator;
+ }
+
+ public void defaultOperator(QueryParser.Operator defaultOperator) {
+ this.defaultOperator = defaultOperator;
+ }
+
+ public boolean autoGeneratePhraseQueries() {
+ return autoGeneratePhraseQueries;
+ }
+
+ public void autoGeneratePhraseQueries(boolean autoGeneratePhraseQueries) {
+ this.autoGeneratePhraseQueries = autoGeneratePhraseQueries;
+ }
+
+ public boolean allowLeadingWildcard() {
+ return allowLeadingWildcard;
+ }
+
+ public void allowLeadingWildcard(boolean allowLeadingWildcard) {
+ this.allowLeadingWildcard = allowLeadingWildcard;
+ }
+
+ public boolean lowercaseExpandedTerms() {
+ return lowercaseExpandedTerms;
+ }
+
+ public void lowercaseExpandedTerms(boolean lowercaseExpandedTerms) {
+ this.lowercaseExpandedTerms = lowercaseExpandedTerms;
+ }
+
+ public boolean enablePositionIncrements() {
+ return enablePositionIncrements;
+ }
+
+ public void enablePositionIncrements(boolean enablePositionIncrements) {
+ this.enablePositionIncrements = enablePositionIncrements;
+ }
+
+ public int phraseSlop() {
+ return phraseSlop;
+ }
+
+ public void phraseSlop(int phraseSlop) {
+ this.phraseSlop = phraseSlop;
+ }
+
+ public float fuzzyMinSim() {
+ return fuzzyMinSim;
+ }
+
+ public void fuzzyMinSim(float fuzzyMinSim) {
+ this.fuzzyMinSim = fuzzyMinSim;
+ }
+
+ public int fuzzyPrefixLength() {
+ return fuzzyPrefixLength;
+ }
+
+ public void fuzzyPrefixLength(int fuzzyPrefixLength) {
+ this.fuzzyPrefixLength = fuzzyPrefixLength;
+ }
+
+ public int fuzzyMaxExpansions() {
+ return fuzzyMaxExpansions;
+ }
+
+ public void fuzzyMaxExpansions(int fuzzyMaxExpansions) {
+ this.fuzzyMaxExpansions = fuzzyMaxExpansions;
+ }
+
+ public MultiTermQuery.RewriteMethod fuzzyRewriteMethod() {
+ return fuzzyRewriteMethod;
+ }
+
+ public void fuzzyRewriteMethod(MultiTermQuery.RewriteMethod fuzzyRewriteMethod) {
+ this.fuzzyRewriteMethod = fuzzyRewriteMethod;
+ }
+
+ public boolean escape() {
+ return escape;
+ }
+
+ public void escape(boolean escape) {
+ this.escape = escape;
+ }
+
+ public Analyzer defaultAnalyzer() {
+ return defaultAnalyzer;
+ }
+
+ public void defaultAnalyzer(Analyzer defaultAnalyzer) {
+ this.defaultAnalyzer = defaultAnalyzer;
+ }
+
+ public Analyzer defaultQuoteAnalyzer() {
+ return defaultQuoteAnalyzer;
+ }
+
+ public void defaultQuoteAnalyzer(Analyzer defaultAnalyzer) {
+ this.defaultQuoteAnalyzer = defaultAnalyzer;
+ }
+
+ public Analyzer forcedAnalyzer() {
+ return forcedAnalyzer;
+ }
+
+ public void forcedAnalyzer(Analyzer forcedAnalyzer) {
+ this.forcedAnalyzer = forcedAnalyzer;
+ }
+
+ public Analyzer forcedQuoteAnalyzer() {
+ return forcedQuoteAnalyzer;
+ }
+
+ public void forcedQuoteAnalyzer(Analyzer forcedAnalyzer) {
+ this.forcedQuoteAnalyzer = forcedAnalyzer;
+ }
+
+ public boolean analyzeWildcard() {
+ return this.analyzeWildcard;
+ }
+
+ public void analyzeWildcard(boolean analyzeWildcard) {
+ this.analyzeWildcard = analyzeWildcard;
+ }
+
+ public MultiTermQuery.RewriteMethod rewriteMethod() {
+ return this.rewriteMethod;
+ }
+
+ public void rewriteMethod(MultiTermQuery.RewriteMethod rewriteMethod) {
+ this.rewriteMethod = rewriteMethod;
+ }
+
+ public String minimumShouldMatch() {
+ return this.minimumShouldMatch;
+ }
+
+ public void minimumShouldMatch(String minimumShouldMatch) {
+ this.minimumShouldMatch = minimumShouldMatch;
+ }
+
+ public void quoteFieldSuffix(String quoteFieldSuffix) {
+ this.quoteFieldSuffix = quoteFieldSuffix;
+ }
+
+ public String quoteFieldSuffix() {
+ return this.quoteFieldSuffix;
+ }
+
+ public void lenient(boolean lenient) {
+ this.lenient = lenient;
+ }
+
+ public boolean lenient() {
+ return this.lenient;
+ }
+
+ public List<String> fields() {
+ return fields;
+ }
+
+ public void fields(List<String> fields) {
+ this.fields = fields;
+ }
+
+ public Collection<String> queryTypes() {
+ return queryTypes;
+ }
+
+ public void queryTypes(Collection<String> queryTypes) {
+ this.queryTypes = queryTypes;
+ }
+
+ public ObjectFloatOpenHashMap<String> boosts() {
+ return boosts;
+ }
+
+ public void boosts(ObjectFloatOpenHashMap<String> boosts) {
+ this.boosts = boosts;
+ }
+
+ public float tieBreaker() {
+ return tieBreaker;
+ }
+
+ public void tieBreaker(float tieBreaker) {
+ this.tieBreaker = tieBreaker;
+ }
+
+ public boolean useDisMax() {
+ return useDisMax;
+ }
+
+ public void useDisMax(boolean useDisMax) {
+ this.useDisMax = useDisMax;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ QueryParserSettings that = (QueryParserSettings) o;
+
+        if (autoGeneratePhraseQueries != that.autoGeneratePhraseQueries) return false;
+ if (allowLeadingWildcard != that.allowLeadingWildcard) return false;
+ if (Float.compare(that.boost, boost) != 0) return false;
+ if (enablePositionIncrements != that.enablePositionIncrements) return false;
+ if (escape != that.escape) return false;
+ if (analyzeWildcard != that.analyzeWildcard) return false;
+ if (Float.compare(that.fuzzyMinSim, fuzzyMinSim) != 0) return false;
+ if (fuzzyPrefixLength != that.fuzzyPrefixLength) return false;
+ if (fuzzyMaxExpansions != that.fuzzyMaxExpansions) return false;
+ if (fuzzyRewriteMethod != null ? !fuzzyRewriteMethod.equals(that.fuzzyRewriteMethod) : that.fuzzyRewriteMethod != null)
+ return false;
+ if (lowercaseExpandedTerms != that.lowercaseExpandedTerms) return false;
+ if (phraseSlop != that.phraseSlop) return false;
+ if (defaultAnalyzer != null ? !defaultAnalyzer.equals(that.defaultAnalyzer) : that.defaultAnalyzer != null)
+ return false;
+ if (defaultQuoteAnalyzer != null ? !defaultQuoteAnalyzer.equals(that.defaultQuoteAnalyzer) : that.defaultQuoteAnalyzer != null)
+ return false;
+ if (forcedAnalyzer != null ? !forcedAnalyzer.equals(that.forcedAnalyzer) : that.forcedAnalyzer != null)
+ return false;
+ if (forcedQuoteAnalyzer != null ? !forcedQuoteAnalyzer.equals(that.forcedQuoteAnalyzer) : that.forcedQuoteAnalyzer != null)
+ return false;
+ if (defaultField != null ? !defaultField.equals(that.defaultField) : that.defaultField != null) return false;
+ if (defaultOperator != that.defaultOperator) return false;
+ if (queryString != null ? !queryString.equals(that.queryString) : that.queryString != null) return false;
+ if (rewriteMethod != null ? !rewriteMethod.equals(that.rewriteMethod) : that.rewriteMethod != null)
+ return false;
+ if (minimumShouldMatch != null ? !minimumShouldMatch.equals(that.minimumShouldMatch) : that.minimumShouldMatch != null)
+ return false;
+ if (quoteFieldSuffix != null ? !quoteFieldSuffix.equals(that.quoteFieldSuffix) : that.quoteFieldSuffix != null)
+ return false;
+ if (lenient != that.lenient) {
+ return false;
+ }
+
+ if (Float.compare(that.tieBreaker, tieBreaker) != 0) return false;
+ if (useDisMax != that.useDisMax) return false;
+ if (boosts != null ? !boosts.equals(that.boosts) : that.boosts != null) return false;
+ if (fields != null ? !fields.equals(that.fields) : that.fields != null) return false;
+ if (queryTypes != null ? !queryTypes.equals(that.queryTypes) : that.queryTypes != null) return false;
+
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ int result = queryString != null ? queryString.hashCode() : 0;
+ result = 31 * result + (defaultField != null ? defaultField.hashCode() : 0);
+ result = 31 * result + (boost != +0.0f ? Float.floatToIntBits(boost) : 0);
+ result = 31 * result + (defaultOperator != null ? defaultOperator.hashCode() : 0);
+ result = 31 * result + (autoGeneratePhraseQueries ? 1 : 0);
+ result = 31 * result + (allowLeadingWildcard ? 1 : 0);
+ result = 31 * result + (lowercaseExpandedTerms ? 1 : 0);
+ result = 31 * result + (enablePositionIncrements ? 1 : 0);
+ result = 31 * result + phraseSlop;
+ result = 31 * result + (fuzzyMinSim != +0.0f ? Float.floatToIntBits(fuzzyMinSim) : 0);
+ result = 31 * result + fuzzyPrefixLength;
+ result = 31 * result + (escape ? 1 : 0);
+ result = 31 * result + (defaultAnalyzer != null ? defaultAnalyzer.hashCode() : 0);
+ result = 31 * result + (defaultQuoteAnalyzer != null ? defaultQuoteAnalyzer.hashCode() : 0);
+ result = 31 * result + (forcedAnalyzer != null ? forcedAnalyzer.hashCode() : 0);
+ result = 31 * result + (forcedQuoteAnalyzer != null ? forcedQuoteAnalyzer.hashCode() : 0);
+ result = 31 * result + (analyzeWildcard ? 1 : 0);
+
+ result = 31 * result + (fields != null ? fields.hashCode() : 0);
+ result = 31 * result + (queryTypes != null ? queryTypes.hashCode() : 0);
+ result = 31 * result + (boosts != null ? boosts.hashCode() : 0);
+ result = 31 * result + (tieBreaker != +0.0f ? Float.floatToIntBits(tieBreaker) : 0);
+ result = 31 * result + (useDisMax ? 1 : 0);
+ return result;
+ }
+}
diff --git a/src/main/java/org/apache/lucene/search/XReferenceManager.java b/src/main/java/org/apache/lucene/search/XReferenceManager.java
new file mode 100644
index 0000000..07fb066
--- /dev/null
+++ b/src/main/java/org/apache/lucene/search/XReferenceManager.java
@@ -0,0 +1,326 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.util.Version;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.List;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+/**
+ * Utility class to safely share instances of a certain type across multiple
+ * threads, while periodically refreshing them. This class ensures each
+ * reference is closed only once all threads have finished using it. It is
+ * recommended to consult the documentation of {@link org.apache.lucene.search.XReferenceManager}
+ * implementations for their {@link #maybeRefresh()} semantics.
+ *
+ * @param <G>
+ * the concrete type that will be {@link #acquire() acquired} and
+ * {@link #release(Object) released}.
+ *
+ * @lucene.experimental
+ */
+public abstract class XReferenceManager<G> implements Closeable {
+ static {
+ assert Version.LUCENE_46 == org.elasticsearch.Version.CURRENT.luceneVersion : "Remove this once we are on LUCENE_47 - see LUCENE-5436";
+ }
+
+ private static final String REFERENCE_MANAGER_IS_CLOSED_MSG = "this ReferenceManager is closed";
+
+ protected volatile G current;
+
+ private final Lock refreshLock = new ReentrantLock();
+
+ private final List<RefreshListener> refreshListeners = new CopyOnWriteArrayList<RefreshListener>();
+
+ private void ensureOpen() {
+ if (current == null) {
+ throw new AlreadyClosedException(REFERENCE_MANAGER_IS_CLOSED_MSG);
+ }
+ }
+
+ private synchronized void swapReference(G newReference) throws IOException {
+ ensureOpen();
+ final G oldReference = current;
+ current = newReference;
+ release(oldReference);
+ }
+
+ /**
+ * Decrement reference counting on the given reference.
+ * @throws java.io.IOException if reference decrement on the given resource failed.
+ * */
+ protected abstract void decRef(G reference) throws IOException;
+
+ /**
+ * Refresh the given reference if needed. Returns {@code null} if no refresh
+ * was needed, otherwise a new refreshed reference.
+ * @throws org.apache.lucene.store.AlreadyClosedException if the reference manager has been {@link #close() closed}.
+ * @throws java.io.IOException if the refresh operation failed
+ */
+ protected abstract G refreshIfNeeded(G referenceToRefresh) throws IOException;
+
+ /**
+ * Try to increment reference counting on the given reference. Return true if
+ * the operation was successful.
+ * @throws org.apache.lucene.store.AlreadyClosedException if the reference manager has been {@link #close() closed}.
+ */
+ protected abstract boolean tryIncRef(G reference) throws IOException;
+
+ /**
+ * Obtain the current reference. You must match every call to acquire with one
+ * call to {@link #release}; it's best to do so in a finally clause, and set
+ * the reference to {@code null} to prevent accidental usage after it has been
+ * released.
+ * @throws org.apache.lucene.store.AlreadyClosedException if the reference manager has been {@link #close() closed}.
+ */
+ public final G acquire() throws IOException {
+ G ref;
+
+ do {
+ if ((ref = current) == null) {
+ throw new AlreadyClosedException(REFERENCE_MANAGER_IS_CLOSED_MSG);
+ }
+ if (tryIncRef(ref)) {
+ return ref;
+ }
+ if (getRefCount(ref) == 0 && current == ref) {
+ assert ref != null;
+ /* if we can't increment the reader but we are
+                   still the current reference, the RM is in an
+                   illegal state since we can't make any progress
+ anymore. The reference is closed but the RM still
+ holds on to it as the actual instance.
+ This can only happen if somebody outside of the RM
+ decrements the refcount without a corresponding increment
+ since the RM assigns the new reference before counting down
+ the reference. */
+ throw new IllegalStateException("The managed reference has already closed - this is likely a bug when the reference count is modified outside of the ReferenceManager");
+ }
+ } while (true);
+ }
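+
+    // Minimal usage sketch: pair every acquire() with a release() in a finally clause, e.g.
+    //
+    //   G ref = manager.acquire();
+    //   try {
+    //       // ... use ref ...
+    //   } finally {
+    //       manager.release(ref);
+    //       ref = null; // guard against accidental use after release
+    //   }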
+
+ /**
+ * <p>
+ * Closes this ReferenceManager to prevent future {@link #acquire() acquiring}. A
+ * reference manager should be closed if the reference to the managed resource
+ * should be disposed or the application using the {@link org.apache.lucene.search.XReferenceManager}
+ * is shutting down. The managed resource might not be released immediately,
+ * if the {@link org.apache.lucene.search.XReferenceManager} user is holding on to a previously
+     * {@link #acquire() acquired} reference. The resource will be released once
+     * the last reference is {@link #release(Object) released}. Those
+ * references can still be used as if the manager was still active.
+ * </p>
+ * <p>
+ * Applications should not {@link #acquire() acquire} new references from this
+ * manager once this method has been called. {@link #acquire() Acquiring} a
+ * resource on a closed {@link org.apache.lucene.search.XReferenceManager} will throw an
+ * {@link org.apache.lucene.store.AlreadyClosedException}.
+ * </p>
+ *
+ * @throws java.io.IOException
+ * if the underlying reader of the current reference could not be closed
+ */
+ @Override
+ public final synchronized void close() throws IOException {
+ if (current != null) {
+ // make sure we can call this more than once
+ // closeable javadoc says:
+ // if this is already closed then invoking this method has no effect.
+ swapReference(null);
+ afterClose();
+ }
+ }
+
+ /**
+ * Returns the current reference count of the given reference.
+ */
+ protected abstract int getRefCount(G reference);
+
+ /**
+     * Called after close(), so subclasses can free any resources.
+ * @throws java.io.IOException if the after close operation in a sub-class throws an {@link java.io.IOException}
+ * */
+ protected void afterClose() throws IOException {
+ }
+
+ private void doMaybeRefresh() throws IOException {
+ // it's ok to call lock() here (blocking) because we're supposed to get here
+        // from either maybeRefresh() or maybeRefreshBlocking(), after the lock has
+ // already been obtained. Doing that protects us from an accidental bug
+ // where this method will be called outside the scope of refreshLock.
+ // Per ReentrantLock's javadoc, calling lock() by the same thread more than
+ // once is ok, as long as unlock() is called a matching number of times.
+ refreshLock.lock();
+ boolean refreshed = false;
+ try {
+ final G reference = acquire();
+ try {
+ notifyRefreshListenersBefore();
+ G newReference = refreshIfNeeded(reference);
+ if (newReference != null) {
+ assert newReference != reference : "refreshIfNeeded should return null if refresh wasn't needed";
+ try {
+ swapReference(newReference);
+ refreshed = true;
+ } finally {
+ if (!refreshed) {
+ release(newReference);
+ }
+ }
+ }
+ } finally {
+ release(reference);
+ notifyRefreshListenersRefreshed(refreshed);
+ }
+ afterMaybeRefresh();
+ } finally {
+ refreshLock.unlock();
+ }
+ }
+
+ /**
+     * You must call this (or {@link #maybeRefreshBlocking()}) periodically, if
+     * you want {@link #acquire()} to return refreshed instances.
+ *
+ * <p>
+ * <b>Threads</b>: it's fine for more than one thread to call this at once.
+ * Only the first thread will attempt the refresh; subsequent threads will see
+ * that another thread is already handling refresh and will return
+ * immediately. Note that this means if another thread is already refreshing
+ * then subsequent threads will return right away without waiting for the
+ * refresh to complete.
+ *
+ * <p>
+ * If this method returns true it means the calling thread either refreshed or
+ * found that there were no changes to refresh. If it returns false it means another
+ * thread is currently refreshing.
+ * </p>
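+ * <p>
+ * A minimal sketch of using the return value (<code>manager</code> is assumed to be
+ * a concrete subclass instance):
+ * </p>
+ * <pre class="prettyprint">
+ * if (manager.maybeRefresh() == false) {
+ *   // another thread is already refreshing; acquire() may still return the old reference
+ * }
+ * </pre>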
+ * @throws java.io.IOException if refreshing the resource causes an {@link java.io.IOException}
+ * @throws org.apache.lucene.store.AlreadyClosedException if the reference manager has been {@link #close() closed}.
+ */
+ public final boolean maybeRefresh() throws IOException {
+ ensureOpen();
+
+ // Ensure only 1 thread does refresh at once; other threads just return immediately:
+ final boolean doTryRefresh = refreshLock.tryLock();
+ if (doTryRefresh) {
+ try {
+ doMaybeRefresh();
+ } finally {
+ refreshLock.unlock();
+ }
+ }
+
+ return doTryRefresh;
+ }
+
+ /**
+ * You must call this (or {@link #maybeRefresh()}) periodically if you want
+ * {@link #acquire()} to return refreshed instances.
+ *
+ * <p>
+ * <b>Threads</b>: unlike {@link #maybeRefresh()}, if another thread is
+ * currently refreshing, this method blocks until that thread completes. It is
+ * useful if you want to guarantee that the next call to {@link #acquire()}
+ * will return a refreshed instance. Otherwise, consider using the
+ * non-blocking {@link #maybeRefresh()}.
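+ * <p>
+ * A minimal sketch of that guarantee (<code>manager</code> is assumed to be a
+ * concrete subclass instance):
+ * </p>
+ * <pre class="prettyprint">
+ * manager.maybeRefreshBlocking(); // waits for any in-flight refresh to finish
+ * G fresh = manager.acquire();    // reflects the latest refresh
+ * try {
+ *   // use fresh
+ * } finally {
+ *   manager.release(fresh);
+ * }
+ * </pre>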
+ * @throws java.io.IOException if refreshing the resource causes an {@link java.io.IOException}
+ * @throws org.apache.lucene.store.AlreadyClosedException if the reference manager has been {@link #close() closed}.
+ */
+ public final void maybeRefreshBlocking() throws IOException {
+ ensureOpen();
+
+ // Ensure only 1 thread does refresh at once
+ refreshLock.lock();
+ try {
+ doMaybeRefresh();
+ } finally {
+ refreshLock.unlock();
+ }
+ }
+
+ /** Called after a refresh was attempted, regardless of
+ * whether a new reference was in fact created.
+ * @throws java.io.IOException if a low level I/O exception occurs
+ **/
+ protected void afterMaybeRefresh() throws IOException {
+ }
+
+ /**
+ * Release the reference previously obtained via {@link #acquire()}.
+ * <p>
+ * <b>NOTE:</b> it's safe to call this after {@link #close()}.
+ * @throws java.io.IOException if the release operation on the given resource throws an {@link java.io.IOException}
+ */
+ public final void release(G reference) throws IOException {
+ assert reference != null;
+ decRef(reference);
+ }
+
+ private void notifyRefreshListenersBefore() throws IOException {
+ for (RefreshListener refreshListener : refreshListeners) {
+ refreshListener.beforeRefresh();
+ }
+ }
+
+ private void notifyRefreshListenersRefreshed(boolean didRefresh) throws IOException {
+ for (RefreshListener refreshListener : refreshListeners) {
+ refreshListener.afterRefresh(didRefresh);
+ }
+ }
+
+ /**
+ * Adds a listener, to be notified when a reference is refreshed/swapped.
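+ * <p>
+ * A minimal listener sketch (<code>manager</code> is assumed to be a concrete
+ * subclass instance; the callback bodies are illustrative only):
+ * </p>
+ * <pre class="prettyprint">
+ * manager.addListener(new RefreshListener() {
+ *   public void beforeRefresh() {
+ *     // e.g. record the refresh start time
+ *   }
+ *   public void afterRefresh(boolean didRefresh) {
+ *     // e.g. log whether a new reference was opened
+ *   }
+ * });
+ * </pre>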
+ */
+ public void addListener(RefreshListener listener) {
+ if (listener == null) {
+ throw new NullPointerException("Listener cannot be null");
+ }
+ refreshListeners.add(listener);
+ }
+
+ /**
+ * Remove a listener added with {@link #addListener(RefreshListener)}.
+ */
+ public void removeListener(RefreshListener listener) {
+ if (listener == null) {
+ throw new NullPointerException("Listener cannot be null");
+ }
+ refreshListeners.remove(listener);
+ }
+
+ /** Use to receive notification when a refresh has
+ * finished. See {@link #addListener}. */
+ public interface RefreshListener {
+
+ /** Called right before a refresh attempt starts. */
+ void beforeRefresh() throws IOException;
+
+ /** Called after the attempted refresh; if the refresh
+ * did open a new reference then didRefresh will be true
+ * and {@link #acquire()} is guaranteed to return the new
+ * reference. */
+ void afterRefresh(boolean didRefresh) throws IOException;
+ }
+}
diff --git a/src/main/java/org/apache/lucene/search/XSearcherManager.java b/src/main/java/org/apache/lucene/search/XSearcherManager.java
new file mode 100644
index 0000000..36c74a0
--- /dev/null
+++ b/src/main/java/org/apache/lucene/search/XSearcherManager.java
@@ -0,0 +1,177 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Version;
+
+/**
+ * Utility class to safely share {@link IndexSearcher} instances across multiple
+ * threads, while periodically reopening. This class ensures each searcher is
+ * closed only once all threads have finished using it.
+ *
+ * <p>
+ * Use {@link #acquire} to obtain the current searcher, and {@link #release} to
+ * release it, like this:
+ *
+ * <pre class="prettyprint">
+ * IndexSearcher s = manager.acquire();
+ * try {
+ * // Do searching, doc retrieval, etc. with s
+ * } finally {
+ * manager.release(s);
+ * }
+ * // Do not use s after this!
+ * s = null;
+ * </pre>
+ *
+ * <p>
+ * In addition you should periodically call {@link #maybeRefresh}. While it's
+ * possible to call this just before running each query, this is discouraged
+ * since it penalizes the unlucky queries that trigger the refresh. It's better
+ * to use a separate background thread that periodically calls {@link #maybeRefresh},
+ * as sketched below. Finally,
+ * be sure to call {@link #close} once you are done.
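+ *
+ * <p>
+ * A minimal background refresher sketch (the interval and error handling are
+ * illustrative only; <code>manager</code> is the XSearcherManager instance):
+ *
+ * <pre class="prettyprint">
+ * Thread refresher = new Thread(new Runnable() {
+ *   public void run() {
+ *     try {
+ *       while (!Thread.currentThread().isInterrupted()) {
+ *         manager.maybeRefresh();
+ *         Thread.sleep(1000); // roughly once per second
+ *       }
+ *     } catch (Exception e) {
+ *       // illustrative only: a real application would handle/log this
+ *     }
+ *   }
+ * });
+ * refresher.setDaemon(true);
+ * refresher.start();
+ * </pre>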
+ *
+ * @see SearcherFactory
+ *
+ * @lucene.experimental
+ */
+public final class XSearcherManager extends XReferenceManager<IndexSearcher> {
+
+ static {
+ assert Version.LUCENE_46 == org.elasticsearch.Version.CURRENT.luceneVersion : "Remove this once we are on LUCENE_47 - see LUCENE-5436";
+ }
+
+ private final SearcherFactory searcherFactory;
+
+ /**
+ * Creates and returns a new XSearcherManager from the given
+ * {@link IndexWriter}.
+ *
+ * @param writer
+ * the IndexWriter to open the IndexReader from.
+ * @param applyAllDeletes
+ * If <code>true</code>, all buffered deletes will be applied (made
+ * visible) in the {@link IndexSearcher} / {@link DirectoryReader}.
+ * If <code>false</code>, the deletes may or may not be applied, but
+ * remain buffered (in IndexWriter) so that they will be applied in
+ * the future. Applying deletes can be costly, so if your app can
+ * tolerate deleted documents being returned you might gain some
+ * performance by passing <code>false</code>. See
+ * {@link DirectoryReader#openIfChanged(DirectoryReader, IndexWriter, boolean)}.
+ * @param searcherFactory
+ * An optional {@link SearcherFactory}. Pass <code>null</code> if you
+ * don't require the searcher to be warmed before going live or other
+ * custom behavior.
+ *
+ * @throws IOException if there is a low-level I/O error
+ */
+ public XSearcherManager(IndexWriter writer, boolean applyAllDeletes, SearcherFactory searcherFactory) throws IOException {
+ if (searcherFactory == null) {
+ searcherFactory = new SearcherFactory();
+ }
+ this.searcherFactory = searcherFactory;
+ current = getSearcher(searcherFactory, DirectoryReader.open(writer, applyAllDeletes));
+ }
+
+ /**
+ * Creates and returns a new XSearcherManager from the given {@link Directory}.
+ * @param dir the directory to open the DirectoryReader on.
+ * @param searcherFactory An optional {@link SearcherFactory}. Pass
+ * <code>null</code> if you don't require the searcher to be warmed
+ * before going live or other custom behavior.
+ *
+ * @throws IOException if there is a low-level I/O error
+ */
+ public XSearcherManager(Directory dir, SearcherFactory searcherFactory) throws IOException {
+ if (searcherFactory == null) {
+ searcherFactory = new SearcherFactory();
+ }
+ this.searcherFactory = searcherFactory;
+ current = getSearcher(searcherFactory, DirectoryReader.open(dir));
+ }
+
+ @Override
+ protected void decRef(IndexSearcher reference) throws IOException {
+ reference.getIndexReader().decRef();
+ }
+
+ @Override
+ protected IndexSearcher refreshIfNeeded(IndexSearcher referenceToRefresh) throws IOException {
+ final IndexReader r = referenceToRefresh.getIndexReader();
+ assert r instanceof DirectoryReader: "searcher's IndexReader should be a DirectoryReader, but got " + r;
+ final IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) r);
+ if (newReader == null) {
+ return null;
+ } else {
+ return getSearcher(searcherFactory, newReader);
+ }
+ }
+
+ @Override
+ protected boolean tryIncRef(IndexSearcher reference) {
+ return reference.getIndexReader().tryIncRef();
+ }
+
+ @Override
+ protected int getRefCount(IndexSearcher reference) {
+ return reference.getIndexReader().getRefCount();
+ }
+
+ /**
+ * Returns <code>true</code> if no changes have occurred since this searcher
+ * (i.e. its underlying reader) was opened, otherwise <code>false</code>.
+ * @see DirectoryReader#isCurrent()
+ */
+ public boolean isSearcherCurrent() throws IOException {
+ final IndexSearcher searcher = acquire();
+ try {
+ final IndexReader r = searcher.getIndexReader();
+ assert r instanceof DirectoryReader: "searcher's IndexReader should be a DirectoryReader, but got " + r;
+ return ((DirectoryReader) r).isCurrent();
+ } finally {
+ release(searcher);
+ }
+ }
+
+ /** Expert: creates a searcher from the provided {@link
+ * IndexReader} using the provided {@link
+ * SearcherFactory}. NOTE: this decRefs the incoming reader
+ * if it throws an exception. */
+ public static IndexSearcher getSearcher(SearcherFactory searcherFactory, IndexReader reader) throws IOException {
+ boolean success = false;
+ final IndexSearcher searcher;
+ try {
+ searcher = searcherFactory.newSearcher(reader);
+ if (searcher.getIndexReader() != reader) {
+ throw new IllegalStateException("SearcherFactory must wrap exactly the provided reader (got " + searcher.getIndexReader() + " but expected " + reader + ")");
+ }
+ success = true;
+ } finally {
+ if (!success) {
+ reader.decRef();
+ }
+ }
+ return searcher;
+ }
+}
diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/CustomPassageFormatter.java b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPassageFormatter.java
new file mode 100644
index 0000000..59ef784
--- /dev/null
+++ b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPassageFormatter.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to Elasticsearch under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elasticsearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.lucene.search.postingshighlight;
+
+import org.apache.lucene.search.highlight.Encoder;
+import org.elasticsearch.search.highlight.HighlightUtils;
+
+/**
+ * Custom passage formatter that allows us to:
+ * 1) extract different snippets (instead of a single big string) together with their scores ({@link Snippet})
+ * 2) use the {@link Encoder} implementations that are already used with the other highlighters
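+ *
+ * A minimal usage sketch (the pre/post tags and {@code SimpleHTMLEncoder} are illustrative
+ * choices; passages and content come from the postings highlighter):
+ * <pre class="prettyprint">
+ * CustomPassageFormatter formatter =
+ *     new CustomPassageFormatter("&lt;em&gt;", "&lt;/em&gt;", new SimpleHTMLEncoder());
+ * Snippet[] snippets = formatter.format(passages, content);
+ * </pre>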
+ */
+public class CustomPassageFormatter extends PassageFormatter {
+
+ private final String preTag;
+ private final String postTag;
+ private final Encoder encoder;
+
+ public CustomPassageFormatter(String preTag, String postTag, Encoder encoder) {
+ this.preTag = preTag;
+ this.postTag = postTag;
+ this.encoder = encoder;
+ }
+
+ @Override
+ public Snippet[] format(Passage[] passages, String content) {
+ Snippet[] snippets = new Snippet[passages.length];
+ int pos;
+ for (int j = 0; j < passages.length; j++) {
+ Passage passage = passages[j];
+ StringBuilder sb = new StringBuilder();
+ pos = passage.startOffset;
+ for (int i = 0; i < passage.numMatches; i++) {
+ int start = passage.matchStarts[i];
+ int end = passage.matchEnds[i];
+ // it's possible to have overlapping terms
+ if (start > pos) {
+ append(sb, content, pos, start);
+ }
+ if (end > pos) {
+ sb.append(preTag);
+ append(sb, content, Math.max(pos, start), end);
+ sb.append(postTag);
+ pos = end;
+ }
+ }
+ // it's possible a "term" from the analyzer could span a sentence boundary.
+ append(sb, content, pos, Math.max(pos, passage.endOffset));
+ //we remove the paragraph separator if present at the end of the snippet (we used it as separator between values)
+ if (sb.charAt(sb.length() - 1) == HighlightUtils.PARAGRAPH_SEPARATOR) {
+ sb.deleteCharAt(sb.length() - 1);
+ }
+ //and we trim the snippets too
+ snippets[j] = new Snippet(sb.toString().trim(), passage.score, passage.numMatches > 0);
+ }
+ return snippets;
+ }
+
+ protected void append(StringBuilder dest, String content, int start, int end) {
+ dest.append(encoder.encodeText(content.substring(start, end)));
+ }
+}
diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java
new file mode 100644
index 0000000..be5ad66
--- /dev/null
+++ b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to Elasticsearch under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elasticsearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.lucene.search.postingshighlight;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReaderContext;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.search.highlight.HighlightUtils;
+
+import java.io.IOException;
+import java.text.BreakIterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Subclass of the {@link XPostingsHighlighter} that works for a single field in a single document.
+ * It receives the field values as input and performs discrete highlighting on each single value
+ * by calling the highlightDoc method multiple times.
+ * It allows the query terms to be passed in, so that they don't need to be extracted multiple times.
+ *
+ * The use that we make of the postings highlighter is not optimal. It would be much better to
+ * highlight multiple docs in a single call, since we currently lose its sequential IO benefits. But that would require:
+ * 1) making our fork more complex and harder to maintain in order to perform discrete highlighting (needed to return
+ * a different snippet per value when number_of_fragments=0 and the field has multiple values)
+ * 2) refactoring the elasticsearch highlight api, which currently works per hit
+ *
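+ * A minimal per-value usage sketch (formatter, values, terms, searcher and docId are assumed
+ * to exist, and the field name is illustrative):
+ * <pre class="prettyprint">
+ * CustomPostingsHighlighter highlighter =
+ *     new CustomPostingsHighlighter(formatter, values, false, 10000, 0);
+ * for (int i = 0; i &lt; values.size(); i++) {
+ *   // each call highlights the next value of the field
+ *   Snippet[] snippets = highlighter.highlightDoc("body", terms, searcher, docId, 1);
+ * }
+ * </pre>
+ *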
+ */
+public final class CustomPostingsHighlighter extends XPostingsHighlighter {
+
+ private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
+ private static final Passage[] EMPTY_PASSAGE = new Passage[0];
+
+ private final CustomPassageFormatter passageFormatter;
+ private final int noMatchSize;
+ private final int totalContentLength;
+ private final String[] fieldValues;
+ private final int[] fieldValuesOffsets;
+ private int currentValueIndex = 0;
+
+ private BreakIterator breakIterator;
+
+ public CustomPostingsHighlighter(CustomPassageFormatter passageFormatter, List<Object> fieldValues, boolean mergeValues, int maxLength, int noMatchSize) {
+ super(maxLength);
+ this.passageFormatter = passageFormatter;
+ this.noMatchSize = noMatchSize;
+
+ if (mergeValues) {
+ String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(getMultiValuedSeparator("")));
+ String fieldValue = rawValue.substring(0, Math.min(rawValue.length(), maxLength));
+ this.fieldValues = new String[]{fieldValue};
+ this.fieldValuesOffsets = new int[]{0};
+ this.totalContentLength = fieldValue.length();
+ } else {
+ this.fieldValues = new String[fieldValues.size()];
+ this.fieldValuesOffsets = new int[fieldValues.size()];
+ int contentLength = 0;
+ int offset = 0;
+ int previousLength = -1;
+ for (int i = 0; i < fieldValues.size(); i++) {
+ String rawValue = fieldValues.get(i).toString();
+ String fieldValue = rawValue.substring(0, Math.min(rawValue.length(), maxLength));
+ this.fieldValues[i] = fieldValue;
+ contentLength += fieldValue.length();
+ offset += previousLength + 1;
+ this.fieldValuesOffsets[i] = offset;
+ previousLength = fieldValue.length();
+ }
+ this.totalContentLength = contentLength;
+ }
+ }
+
+ /*
+ Our own api to highlight a single document field, passing in the query terms and getting back our own Snippet objects
+ */
+ public Snippet[] highlightDoc(String field, BytesRef[] terms, IndexSearcher searcher, int docId, int maxPassages) throws IOException {
+ IndexReader reader = searcher.getIndexReader();
+ IndexReaderContext readerContext = reader.getContext();
+ List<AtomicReaderContext> leaves = readerContext.leaves();
+
+ String[] contents = new String[]{loadCurrentFieldValue()};
+ Map<Integer, Object> snippetsMap = highlightField(field, contents, getBreakIterator(field), terms, new int[]{docId}, leaves, maxPassages);
+
+ //increment the current value index so that next time we'll highlight the next value if available
+ currentValueIndex++;
+
+ Object snippetObject = snippetsMap.get(docId);
+ if (snippetObject != null && snippetObject instanceof Snippet[]) {
+ return (Snippet[]) snippetObject;
+ }
+ return EMPTY_SNIPPET;
+ }
+
+ /*
+ Method provided through our own fork: allows proper scoring when doing per-value discrete highlighting.
+ Used to provide the total length of the field (all values) so that snippets are scored correctly.
+ */
+ @Override
+ protected int getContentLength(String field, int docId) {
+ return totalContentLength;
+ }
+
+ /*
+ Method provided through our own fork: allows performing proper per-value discrete highlighting.
+ Used to provide the offset for the current value.
+ */
+ @Override
+ protected int getOffsetForCurrentValue(String field, int docId) {
+ if (currentValueIndex < fieldValuesOffsets.length) {
+ return fieldValuesOffsets[currentValueIndex];
+ }
+ throw new IllegalArgumentException("No more values offsets to return");
+ }
+
+ public void setBreakIterator(BreakIterator breakIterator) {
+ this.breakIterator = breakIterator;
+ }
+
+ @Override
+ protected PassageFormatter getFormatter(String field) {
+ return passageFormatter;
+ }
+
+ @Override
+ protected BreakIterator getBreakIterator(String field) {
+ if (breakIterator == null) {
+ return super.getBreakIterator(field);
+ }
+ return breakIterator;
+ }
+
+ @Override
+ protected char getMultiValuedSeparator(String field) {
+ //U+2029 PARAGRAPH SEPARATOR (PS): each value holds a discrete passage for highlighting
+ return HighlightUtils.PARAGRAPH_SEPARATOR;
+ }
+
+ /*
+ By default the postings highlighter returns a non-highlighted snippet when there are no matches.
+ We want to return no snippets by default, unless no_match_size is greater than 0
+ */
+ @Override
+ protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+ if (noMatchSize > 0) {
+ //we want to return the first sentence of the first snippet only
+ return super.getEmptyHighlight(fieldName, bi, 1);
+ }
+ return EMPTY_PASSAGE;
+ }
+
+ /*
+ Not needed since we call our own loadCurrentFieldValue explicitly, but we override it anyway for consistency.
+ */
+ @Override
+ protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
+ return new String[][]{new String[]{loadCurrentFieldValue()}};
+ }
+
+ /*
+ Our own method that returns the field values, which relies on the content that was provided when creating the highlighter.
+ Supports per-value discrete highlighting by calling the highlightDoc method multiple times, once per value.
+ */
+ protected String loadCurrentFieldValue() {
+ if (currentValueIndex < fieldValues.length) {
+ return fieldValues[currentValueIndex];
+ }
+ throw new IllegalArgumentException("No more values to return");
+ }
+}
diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java b/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java
new file mode 100644
index 0000000..bf68020
--- /dev/null
+++ b/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to Elasticsearch under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elasticsearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.lucene.search.postingshighlight;
+
+/**
+ * Represents a scored highlighted snippet.
+ * It's our own arbitrary object that we get back from the postings highlighter when highlighting a document.
+ * Every snippet contains its formatted text and its score.
+ * The score is needed since we highlight every single value separately and we might want to return snippets sorted by score.
+ */
+public class Snippet {
+
+ private final String text;
+ private final float score;
+ private final boolean isHighlighted;
+
+ public Snippet(String text, float score, boolean isHighlighted) {
+ this.text = text;
+ this.score = score;
+ this.isHighlighted = isHighlighted;
+ }
+
+ public String getText() {
+ return text;
+ }
+
+ public float getScore() {
+ return score;
+ }
+
+ public boolean isHighlighted() {
+ return isHighlighted;
+ }
+}
diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java b/src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java
new file mode 100644
index 0000000..869b559
--- /dev/null
+++ b/src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java
@@ -0,0 +1,777 @@
+/*
+ * Licensed to Elasticsearch under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elasticsearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.lucene.search.postingshighlight;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.InPlaceMergeSorter;
+import org.apache.lucene.util.UnicodeUtil;
+
+import java.io.IOException;
+import java.text.BreakIterator;
+import java.util.*;
+
+/*
+FORKED from Lucene 4.5 to be able to:
+1) support discrete highlighting for multiple values, so that we can return a different snippet per value when highlighting the whole text
+2) call the highlightField method directly from subclasses and provide the terms by ourselves
+3) Applied LUCENE-4906 to allow PassageFormatter to return arbitrary objects (LUCENE 4.6)
+
+All our changes start with //BEGIN EDIT
+ */
+public class XPostingsHighlighter {
+
+ //BEGIN EDIT added method to override offset for current value (default 0)
+ //we need this to perform discrete highlighting per field
+ protected int getOffsetForCurrentValue(String field, int docId) {
+ return 0;
+ }
+ //END EDIT
+
+ //BEGIN EDIT
+ //we need this to fix scoring when highlighting every single value separately, since the score depends on the total length of the field (all values rather than only the current one)
+ protected int getContentLength(String field, int docId) {
+ return -1;
+ }
+ //END EDIT
+
+
+ // TODO: maybe allow re-analysis for tiny fields? currently we require offsets,
+ // but if the analyzer is really fast and the field is tiny, this might really be
+ // unnecessary.
+
+ /** for rewriting: we don't want slow processing from MTQs */
+ private static final IndexReader EMPTY_INDEXREADER = new MultiReader();
+
+ /** Default maximum content size to process. Typically snippets
+ * closer to the beginning of the document better summarize its content. */
+ public static final int DEFAULT_MAX_LENGTH = 10000;
+
+ private final int maxLength;
+
+ /** Set the first time {@link #getFormatter} is called,
+ * and then reused. */
+ private PassageFormatter defaultFormatter;
+
+ /** Set the first time {@link #getScorer} is called,
+ * and then reused. */
+ private PassageScorer defaultScorer;
+
+ /**
+ * Creates a new highlighter with default parameters.
+ */
+ public XPostingsHighlighter() {
+ this(DEFAULT_MAX_LENGTH);
+ }
+
+ /**
+ * Creates a new highlighter, specifying maximum content length.
+ * @param maxLength maximum content size to process.
+ * @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
+ */
+ public XPostingsHighlighter(int maxLength) {
+ if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
+ // two reasons: no overflow problems in BreakIterator.preceding(offset+1),
+ // our sentinel in the offsets queue uses this value to terminate.
+ throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
+ }
+ this.maxLength = maxLength;
+ }
+
+ /** Returns the {@link java.text.BreakIterator} to use for
+ * dividing text into passages. This returns
+ * {@link java.text.BreakIterator#getSentenceInstance(java.util.Locale)} by default;
+ * subclasses can override to customize. */
+ protected BreakIterator getBreakIterator(String field) {
+ return BreakIterator.getSentenceInstance(Locale.ROOT);
+ }
+
+ /** Returns the {@link PassageFormatter} to use for
+ * formatting passages into highlighted snippets. This
+ * returns a new {@code PassageFormatter} by default;
+ * subclasses can override to customize. */
+ protected PassageFormatter getFormatter(String field) {
+ if (defaultFormatter == null) {
+ defaultFormatter = new DefaultPassageFormatter();
+ }
+ return defaultFormatter;
+ }
+
+ /** Returns the {@link PassageScorer} to use for
+ * ranking passages. This
+ * returns a new {@code PassageScorer} by default;
+ * subclasses can override to customize. */
+ protected PassageScorer getScorer(String field) {
+ if (defaultScorer == null) {
+ defaultScorer = new PassageScorer();
+ }
+ return defaultScorer;
+ }
+
+ /**
+ * Highlights the top passages from a single field.
+ *
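+ * <p>
+ * A minimal usage sketch ("body" is an illustrative field that is stored and indexed
+ * with offsets; query and searcher are assumed to exist):
+ * <pre class="prettyprint">
+ * XPostingsHighlighter highlighter = new XPostingsHighlighter();
+ * TopDocs topDocs = searcher.search(query, 10);
+ * String[] snippets = highlighter.highlight("body", query, searcher, topDocs);
+ * </pre>
+ *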
+ * @param field field name to highlight.
+ * Must have a stored string value and also be indexed with offsets.
+ * @param query query to highlight.
+ * @param searcher searcher that was previously used to execute the query.
+ * @param topDocs TopDocs containing the summary result documents to highlight.
+ * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
+ * If no highlights were found for a document, the
+ * first sentence for the field will be returned.
+ * @throws java.io.IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
+ return highlight(field, query, searcher, topDocs, 1);
+ }
+
+ /**
+ * Highlights the top-N passages from a single field.
+ *
+ * @param field field name to highlight.
+ * Must have a stored string value and also be indexed with offsets.
+ * @param query query to highlight.
+ * @param searcher searcher that was previously used to execute the query.
+ * @param topDocs TopDocs containing the summary result documents to highlight.
+ * @param maxPassages The maximum number of top-N ranked passages used to
+ * form the highlighted snippets.
+ * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the
+ * field will be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
+ Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, new int[] { maxPassages });
+ return res.get(field);
+ }
+
+ /**
+ * Highlights the top passages from multiple fields.
+ * <p>
+ * Conceptually, this behaves as a more efficient form of:
+ * <pre class="prettyprint">
+ * Map m = new HashMap();
+ * for (String field : fields) {
+ * m.put(field, highlight(field, query, searcher, topDocs));
+ * }
+ * return m;
+ * </pre>
+ *
+ * @param fields field names to highlight.
+ * Must have a stored string value and also be indexed with offsets.
+ * @param query query to highlight.
+ * @param searcher searcher that was previously used to execute the query.
+ * @param topDocs TopDocs containing the summary result documents to highlight.
+ * @return Map keyed on field name, containing the array of formatted snippets
+ * corresponding to the documents in <code>topDocs</code>.
+ * If no highlights were found for a document, the
+ * first sentence from the field will be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
+ int maxPassages[] = new int[fields.length];
+ Arrays.fill(maxPassages, 1);
+ return highlightFields(fields, query, searcher, topDocs, maxPassages);
+ }
+
+ /**
+ * Highlights the top-N passages from multiple fields.
+ * <p>
+ * Conceptually, this behaves as a more efficient form of:
+ * <pre class="prettyprint">
+ * Map m = new HashMap();
+ * for (String field : fields) {
+ * m.put(field, highlight(field, query, searcher, topDocs, maxPassages));
+ * }
+ * return m;
+ * </pre>
+ *
+ * @param fields field names to highlight.
+ * Must have a stored string value and also be indexed with offsets.
+ * @param query query to highlight.
+ * @param searcher searcher that was previously used to execute the query.
+ * @param topDocs TopDocs containing the summary result documents to highlight.
+ * @param maxPassages The maximum number of top-N ranked passages per-field used to
+ * form the highlighted snippets.
+ * @return Map keyed on field name, containing the array of formatted snippets
+ * corresponding to the documents in <code>topDocs</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the
+ * field will be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages[]) throws IOException {
+ final ScoreDoc scoreDocs[] = topDocs.scoreDocs;
+ int docids[] = new int[scoreDocs.length];
+ for (int i = 0; i < docids.length; i++) {
+ docids[i] = scoreDocs[i].doc;
+ }
+
+ return highlightFields(fields, query, searcher, docids, maxPassages);
+ }
+
+ /**
+ * Highlights the top-N passages from multiple fields,
+ * for the provided int[] docids.
+ *
+ * @param fieldsIn field names to highlight.
+ * Must have a stored string value and also be indexed with offsets.
+ * @param query query to highlight.
+ * @param searcher searcher that was previously used to execute the query.
+ * @param docidsIn containing the document IDs to highlight.
+ * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
+ * form the highlighted snippets.
+ * @return Map keyed on field name, containing the array of formatted snippets
+ * corresponding to the documents in <code>topDocs</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the field will
+ * be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
+ Map<String,String[]> snippets = new HashMap<String,String[]>();
+ for(Map.Entry<String,Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) {
+ Object[] snippetObjects = ent.getValue();
+ String[] snippetStrings = new String[snippetObjects.length];
+ snippets.put(ent.getKey(), snippetStrings);
+ for(int i=0;i<snippetObjects.length;i++) {
+ Object snippet = snippetObjects[i];
+ if (snippet != null) {
+ snippetStrings[i] = snippet.toString();
+ }
+ }
+ }
+
+ return snippets;
+ }
+
+ public Map<String,Object[]> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
+ if (fieldsIn.length < 1) {
+ throw new IllegalArgumentException("fieldsIn must not be empty");
+ }
+ if (fieldsIn.length != maxPassagesIn.length) {
+ throw new IllegalArgumentException("invalid number of maxPassagesIn");
+ }
+ final IndexReader reader = searcher.getIndexReader();
+ query = rewrite(query);
+ SortedSet<Term> queryTerms = new TreeSet<Term>();
+ query.extractTerms(queryTerms);
+
+ IndexReaderContext readerContext = reader.getContext();
+ List<AtomicReaderContext> leaves = readerContext.leaves();
+
+ // Make our own copies because we sort in-place:
+ int[] docids = new int[docidsIn.length];
+ System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length);
+ final String fields[] = new String[fieldsIn.length];
+ System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
+ final int maxPassages[] = new int[maxPassagesIn.length];
+ System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);
+
+ // sort for sequential io
+ Arrays.sort(docids);
+ new InPlaceMergeSorter() {
+
+ @Override
+ protected void swap(int i, int j) {
+ String tmp = fields[i];
+ fields[i] = fields[j];
+ fields[j] = tmp;
+ int tmp2 = maxPassages[i];
+ maxPassages[i] = maxPassages[j];
+ maxPassages[j] = tmp2;
+ }
+
+ @Override
+ protected int compare(int i, int j) {
+ return fields[i].compareTo(fields[j]);
+ }
+
+ }.sort(0, fields.length);
+
+ // pull stored data:
+ String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);
+
+ Map<String,Object[]> highlights = new HashMap<String,Object[]>();
+ for (int i = 0; i < fields.length; i++) {
+ String field = fields[i];
+ int numPassages = maxPassages[i];
+
+ Term floor = new Term(field, "");
+ Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
+ SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
+ // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
+
+ // Strip off the redundant field:
+ BytesRef terms[] = new BytesRef[fieldTerms.size()];
+ int termUpto = 0;
+ for(Term term : fieldTerms) {
+ terms[termUpto++] = term.bytes();
+ }
+ Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages);
+
+ Object[] result = new Object[docids.length];
+ for (int j = 0; j < docidsIn.length; j++) {
+ result[j] = fieldHighlights.get(docidsIn[j]);
+ }
+ highlights.put(field, result);
+ }
+ return highlights;
+ }
+
+ /** Loads the String values for each field X docID to be
+ * highlighted. By default this loads from stored
+ * fields, but a subclass can change the source. This
+ * method should allocate the String[fields.length][docids.length]
+ * and fill all values. The returned Strings must be
+ * identical to what was indexed. */
+ protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
+ String contents[][] = new String[fields.length][docids.length];
+ char valueSeparators[] = new char[fields.length];
+ for (int i = 0; i < fields.length; i++) {
+ valueSeparators[i] = getMultiValuedSeparator(fields[i]);
+ }
+ LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength);
+ for (int i = 0; i < docids.length; i++) {
+ searcher.doc(docids[i], visitor);
+ for (int j = 0; j < fields.length; j++) {
+ contents[j][i] = visitor.getValue(j);
+ }
+ visitor.reset();
+ }
+ return contents;
+ }
+
+ /**
+ * Returns the logical separator between values for multi-valued fields.
+ * The default value is a space character, which means passages can span across values,
+ * but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)}
+ * if each value holds a discrete passage for highlighting.
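+ * <p>
+ * For example, a subclass override sketch that treats each value as a discrete passage:
+ * <pre class="prettyprint">
+ * protected char getMultiValuedSeparator(String field) {
+ *   return '\u2029'; // U+2029 PARAGRAPH SEPARATOR (PS)
+ * }
+ * </pre>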
+ */
+ protected char getMultiValuedSeparator(String field) {
+ return ' ';
+ }
+
+ //BEGIN EDIT: made protected so that we can call from our subclass and pass in the terms by ourselves
+ protected Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
+ //private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext > leaves, int maxPassages) throws IOException {
+ //END EDIT
+
+ Map<Integer,Object> highlights = new HashMap<Integer,Object>();
+
+ // reuse in the real sense... for docs in same segment we just advance our old enum
+ DocsAndPositionsEnum postings[] = null;
+ TermsEnum termsEnum = null;
+ int lastLeaf = -1;
+
+ PassageFormatter fieldFormatter = getFormatter(field);
+ if (fieldFormatter == null) {
+ throw new NullPointerException("PassageFormatter cannot be null");
+ }
+
+ for (int i = 0; i < docids.length; i++) {
+ String content = contents[i];
+ if (content.length() == 0) {
+ continue; // nothing to do
+ }
+ bi.setText(content);
+ int doc = docids[i];
+ int leaf = ReaderUtil.subIndex(doc, leaves);
+ AtomicReaderContext subContext = leaves.get(leaf);
+ AtomicReader r = subContext.reader();
+ Terms t = r.terms(field);
+ if (t == null) {
+ continue; // nothing to do
+ }
+ if (leaf != lastLeaf) {
+ termsEnum = t.iterator(null);
+ postings = new DocsAndPositionsEnum[terms.length];
+ }
+ Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+ if (passages.length == 0) {
+ passages = getEmptyHighlight(field, bi, maxPassages);
+ }
+ if (passages.length > 0) {
+ // otherwise a null snippet (eg if field is missing
+ // entirely from the doc)
+ highlights.put(doc, fieldFormatter.format(passages, content));
+ }
+ lastLeaf = leaf;
+ }
+
+ return highlights;
+ }
+
+ // algorithm: treat sentence snippets as miniature documents
+ // we can intersect these with the postings lists via BreakIterator.preceding(offset), then
+ // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
+ private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
+ TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
+
+ //BEGIN EDIT added call to method that returns the offset for the current value (discrete highlighting)
+ int valueOffset = getOffsetForCurrentValue(field, doc);
+ //END EDIT
+
+ PassageScorer scorer = getScorer(field);
+ if (scorer == null) {
+ throw new NullPointerException("PassageScorer cannot be null");
+ }
+
+
+ //BEGIN EDIT discrete highlighting
+ // the scoring needs to be based on the length of the whole field (all values rather than only the current one)
+ int totalContentLength = getContentLength(field, doc);
+ if (totalContentLength == -1) {
+ totalContentLength = contentLength;
+ }
+ //END EDIT
+
+
+ PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
+ float weights[] = new float[terms.length];
+ // initialize postings
+ for (int i = 0; i < terms.length; i++) {
+ DocsAndPositionsEnum de = postings[i];
+ int pDoc;
+ if (de == EMPTY) {
+ continue;
+ } else if (de == null) {
+ postings[i] = EMPTY; // initially
+ if (!termsEnum.seekExact(terms[i])) {
+ continue; // term not found
+ }
+ de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
+ if (de == null) {
+ // no positions available
+ throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
+ }
+ pDoc = de.advance(doc);
+ } else {
+ pDoc = de.docID();
+ if (pDoc < doc) {
+ pDoc = de.advance(doc);
+ }
+ }
+
+ if (doc == pDoc) {
+ //BEGIN EDIT we take into account the length of the whole field (all values) to properly score the snippets
+ weights[i] = scorer.weight(totalContentLength, de.freq());
+ //weights[i] = scorer.weight(contentLength, de.freq());
+ //END EDIT
+ de.nextPosition();
+ pq.add(new OffsetsEnum(de, i));
+ }
+ }
+
+ pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination
+
+ PriorityQueue<Passage> passageQueue = new PriorityQueue<Passage>(n, new Comparator<Passage>() {
+ @Override
+ public int compare(Passage left, Passage right) {
+ if (left.score < right.score) {
+ return -1;
+ } else if (left.score > right.score) {
+ return 1;
+ } else {
+ return left.startOffset - right.startOffset;
+ }
+ }
+ });
+ Passage current = new Passage();
+
+ OffsetsEnum off;
+ while ((off = pq.poll()) != null) {
+ final DocsAndPositionsEnum dp = off.dp;
+
+ int start = dp.startOffset();
+ if (start == -1) {
+ throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
+ }
+ int end = dp.endOffset();
+ // LUCENE-5166: this hit would span the content limit... however more valid
+ // hits may exist (they are sorted by start). so we pretend like we never
+ // saw this term, it won't cause a passage to be added to passageQueue or anything.
+ assert EMPTY.startOffset() == Integer.MAX_VALUE;
+ if (start < contentLength && end > contentLength) {
+ continue;
+ }
+
+
+ //BEGIN EDIT support for discrete highlighting (added block code)
+ //switch to the first match in the current value if there is one
+ boolean seenEnough = false;
+ while (start < valueOffset) {
+ if (off.pos == dp.freq()) {
+ seenEnough = true;
+ break;
+ } else {
+ off.pos++;
+ dp.nextPosition();
+ start = dp.startOffset();
+ end = dp.endOffset();
+ }
+ }
+
+ //continue with next term if we've already seen the current one all the times it appears
+ //that means that the current value doesn't hold matches for the current term
+ if (seenEnough) {
+ continue;
+ }
+
+ //we now subtract the offset of the current value to both start and end
+ start -= valueOffset;
+ end -= valueOffset;
+ //END EDIT
+
+
+ if (start >= current.endOffset) {
+ if (current.startOffset >= 0) {
+ // finalize current
+ //BEGIN EDIT we take into account the value offset when scoring the snippet based on its position
+ current.score *= scorer.norm(current.startOffset + valueOffset);
+ //current.score *= scorer.norm(current.startOffset);
+ //END EDIT
+ // new sentence: first add 'current' to queue
+ if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
+ current.reset(); // can't compete, just reset it
+ } else {
+ passageQueue.offer(current);
+ if (passageQueue.size() > n) {
+ current = passageQueue.poll();
+ current.reset();
+ } else {
+ current = new Passage();
+ }
+ }
+ }
+ // if we exceed limit, we are done
+ if (start >= contentLength) {
+ Passage passages[] = new Passage[passageQueue.size()];
+ passageQueue.toArray(passages);
+ for (Passage p : passages) {
+ p.sort();
+ }
+ // sort in ascending order
+ Arrays.sort(passages, new Comparator<Passage>() {
+ @Override
+ public int compare(Passage left, Passage right) {
+ return left.startOffset - right.startOffset;
+ }
+ });
+ return passages;
+ }
+ // advance breakiterator
+ assert BreakIterator.DONE < 0;
+ current.startOffset = Math.max(bi.preceding(start+1), 0);
+ current.endOffset = Math.min(bi.next(), contentLength);
+ }
+ int tf = 0;
+ while (true) {
+ tf++;
+ current.addMatch(start, end, terms[off.id]);
+ if (off.pos == dp.freq()) {
+ break; // removed from pq
+ } else {
+ off.pos++;
+ dp.nextPosition();
+ //BEGIN EDIT support for discrete highlighting
+ start = dp.startOffset() - valueOffset;
+ end = dp.endOffset() - valueOffset;
+ //start = dp.startOffset();
+ //end = dp.endOffset();
+ //END EDIT
+ }
+ if (start >= current.endOffset || end > contentLength) {
+ pq.offer(off);
+ break;
+ }
+ }
+ current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
+ }
+
+ // Dead code but compiler disagrees:
+ assert false;
+ return null;
+ }
+
+ /** Called to summarize a document when no hits were
+ * found. By default this just returns the first
+ * {@code maxPassages} sentences; subclasses can override
+ * to customize. */
+ protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+ // BreakIterator should be un-next'd:
+ List<Passage> passages = new ArrayList<Passage>();
+ int pos = bi.current();
+ assert pos == 0;
+ while (passages.size() < maxPassages) {
+ int next = bi.next();
+ if (next == BreakIterator.DONE) {
+ break;
+ }
+ Passage passage = new Passage();
+ passage.score = Float.NaN;
+ passage.startOffset = pos;
+ passage.endOffset = next;
+ passages.add(passage);
+ pos = next;
+ }
+
+ return passages.toArray(new Passage[passages.size()]);
+ }
+
+ private static class OffsetsEnum implements Comparable<OffsetsEnum> {
+ DocsAndPositionsEnum dp;
+ int pos;
+ int id;
+
+ OffsetsEnum(DocsAndPositionsEnum dp, int id) throws IOException {
+ this.dp = dp;
+ this.id = id;
+ this.pos = 1;
+ }
+
+ @Override
+ public int compareTo(OffsetsEnum other) {
+ try {
+ int off = dp.startOffset();
+ int otherOff = other.dp.startOffset();
+ if (off == otherOff) {
+ return id - other.id;
+ } else {
+ return Long.signum(((long)off) - otherOff);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ private static final DocsAndPositionsEnum EMPTY = new DocsAndPositionsEnum() {
+
+ @Override
+ public int nextPosition() throws IOException { return 0; }
+
+ @Override
+ public int startOffset() throws IOException { return Integer.MAX_VALUE; }
+
+ @Override
+ public int endOffset() throws IOException { return Integer.MAX_VALUE; }
+
+ @Override
+ public BytesRef getPayload() throws IOException { return null; }
+
+ @Override
+ public int freq() throws IOException { return 0; }
+
+ @Override
+ public int docID() { return NO_MORE_DOCS; }
+
+ @Override
+ public int nextDoc() throws IOException { return NO_MORE_DOCS; }
+
+ @Override
+ public int advance(int target) throws IOException { return NO_MORE_DOCS; }
+
+ @Override
+ public long cost() { return 0; }
+ };
+
+ /**
+ * We rewrite against an empty IndexReader because we don't want things like
+ * range queries that don't summarize the document.
+ */
+ private static Query rewrite(Query original) throws IOException {
+ Query query = original;
+ for (Query rewrittenQuery = query.rewrite(EMPTY_INDEXREADER); rewrittenQuery != query;
+ rewrittenQuery = query.rewrite(EMPTY_INDEXREADER)) {
+ query = rewrittenQuery;
+ }
+ return query;
+ }
+
+ private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
+ private final String fields[];
+ private final char valueSeparators[];
+ private final int maxLength;
+ private final StringBuilder builders[];
+ private int currentField = -1;
+
+ public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) {
+ assert fields.length == valueSeparators.length;
+ this.fields = fields;
+ this.valueSeparators = valueSeparators;
+ this.maxLength = maxLength;
+ builders = new StringBuilder[fields.length];
+ for (int i = 0; i < builders.length; i++) {
+ builders[i] = new StringBuilder();
+ }
+ }
+
+ @Override
+ public void stringField(FieldInfo fieldInfo, String value) throws IOException {
+ assert currentField >= 0;
+ StringBuilder builder = builders[currentField];
+ if (builder.length() > 0 && builder.length() < maxLength) {
+ builder.append(valueSeparators[currentField]);
+ }
+ if (builder.length() + value.length() > maxLength) {
+ builder.append(value, 0, maxLength - builder.length());
+ } else {
+ builder.append(value);
+ }
+ }
+
+ @Override
+ public Status needsField(FieldInfo fieldInfo) throws IOException {
+ currentField = Arrays.binarySearch(fields, fieldInfo.name);
+ if (currentField < 0) {
+ return Status.NO;
+ } else if (builders[currentField].length() > maxLength) {
+ return fields.length == 1 ? Status.STOP : Status.NO;
+ }
+ return Status.YES;
+ }
+
+ String getValue(int i) {
+ return builders[i].toString();
+ }
+
+ void reset() {
+ currentField = -1;
+ for (int i = 0; i < fields.length; i++) {
+ builders[i].setLength(0);
+ }
+ }
+ }
+}
diff --git a/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java b/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java
new file mode 100644
index 0000000..6e8b0a1
--- /dev/null
+++ b/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java
@@ -0,0 +1,1052 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.lucene.search.suggest.analyzing;
+
+import com.carrotsearch.hppc.ObjectIntOpenHashMap;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.search.suggest.InputIterator;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.Sort;
+import org.apache.lucene.store.*;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.*;
+import org.apache.lucene.util.automaton.*;
+import org.apache.lucene.util.fst.*;
+import org.apache.lucene.util.fst.FST.BytesReader;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.Util.MinResult;
+import org.elasticsearch.common.collect.HppcMaps;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * Suggester that first analyzes the surface form, adds the
+ * analyzed form to a weighted FST, and then does the same
+ * thing at lookup time. This means lookup is based on the
+ * analyzed form while suggestions are still the surface
+ * form(s).
+ *
+ * <p>
+ * This can result in powerful suggester functionality. For
+ * example, if you use an analyzer removing stop words,
+ * then the partial text "ghost chr..." could see the
+ * suggestion "The Ghost of Christmas Past". Note that
+ * position increments MUST NOT be preserved for this example
+ * to work, so you should call the constructor with the
+ * <code>preservePositionIncrements</code> parameter set to
+ * <code>false</code>.
+ *
+ * <p>
+ * If SynonymFilter is used to map wifi and wireless network to
+ * hotspot then the partial text "wirele..." could suggest
+ * "wifi router". Token normalization like stemmers, accent
+ * removal, etc., would allow suggestions to ignore such
+ * variations.
+ *
+ * <p>
+ * When two matching suggestions have the same weight, they
+ * are tie-broken by the analyzed form. If their analyzed
+ * form is the same then the order is undefined.
+ *
+ * <p>
+ * There are some limitations:
+ * <ul>
+ *
+ * <li> A lookup from a query like "net" in English won't
+ * be any different than "net " (ie, user added a
+ * trailing space) because analyzers don't reflect
+ * when they've seen a token separator and when they
+ * haven't.
+ *
+ * <li> If you're using {@code StopFilter}, and the user will
+ * type "fast apple", but so far all they've typed is
+ * "fast a", again because the analyzer doesn't convey whether
+ * it's seen a token separator after the "a",
+ * {@code StopFilter} will remove that "a" causing
+ * far more matches than you'd expect.
+ *
+ * <li> Lookups with the empty string return no results
+ * instead of all results.
+ * </ul>
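+ *
+ * <p>
+ * A minimal usage sketch (the analyzer and the {@link InputIterator} over suggestion
+ * inputs are assumed to be provided by the caller):
+ * <pre class="prettyprint">
+ * XAnalyzingSuggester suggester = new XAnalyzingSuggester(analyzer);
+ * suggester.build(inputIterator);
+ * List&lt;Lookup.LookupResult&gt; results = suggester.lookup("ghost chr", false, 5);
+ * </pre>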
+ *
+ * @lucene.experimental
+ */
+public class XAnalyzingSuggester extends Lookup {
+
+ /**
+ * FST<Weight,Surface>:
+ * input is the analyzed form, with a null byte between terms
+ * weights are encoded as costs: (Integer.MAX_VALUE-weight)
+ * surface is the original, unanalyzed form.
+ */
+ private FST<Pair<Long,BytesRef>> fst = null;
+
+ /**
+ * Analyzer that will be used for analyzing suggestions at
+ * index time.
+ */
+ private final Analyzer indexAnalyzer;
+
+ /**
+ * Analyzer that will be used for analyzing suggestions at
+ * query time.
+ */
+ private final Analyzer queryAnalyzer;
+
+ /**
+ * True if exact match suggestions should always be returned first.
+ */
+ private final boolean exactFirst;
+
+ /**
+ * True if separator between tokens should be preserved.
+ */
+ private final boolean preserveSep;
+
+ /** Include this flag in the options parameter to {@link
+ * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to always
+ * return the exact match first, regardless of score. This
+ * has no performance impact but could result in
+ * low-quality suggestions. */
+ public static final int EXACT_FIRST = 1;
+
+ /** Include this flag in the options parameter to {@link
+ * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to preserve
+ * token separators when matching. */
+ public static final int PRESERVE_SEP = 2;
+
+ /** Represents the separation between tokens, if
+ * PRESERVE_SEP was specified */
+ public static final int SEP_LABEL = '\u001F';
+
+ /** Marks end of the analyzed input and start of dedup
+ * byte. */
+ public static final int END_BYTE = 0x0;
+
+ /** Maximum number of dup surface forms (different surface
+ * forms for the same analyzed form). */
+ private final int maxSurfaceFormsPerAnalyzedForm;
+
+ /** Maximum graph paths to index for a single analyzed
+ * surface form. This only matters if your analyzer
+ * makes lots of alternate paths (e.g. contains
+ * SynonymFilter). */
+ private final int maxGraphExpansions;
+
+ /** Highest number of analyzed paths we saw for any single
+ * input surface form. For analyzers that never create
+ * graphs this will always be 1. */
+ private int maxAnalyzedPathsForOneInput;
+
+ private boolean hasPayloads;
+
+ private final int sepLabel;
+ private final int payloadSep;
+ private final int endByte;
+ private final int holeCharacter;
+
+ public static final int PAYLOAD_SEP = '\u001F';
+ public static final int HOLE_CHARACTER = '\u001E';
+
+ /** Whether position holes should appear in the automaton. */
+ private boolean preservePositionIncrements;
+
+ /**
+ * Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)
+ * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
+ * PRESERVE_SEP, 256, -1)}
+ */
+ public XAnalyzingSuggester(Analyzer analyzer) {
+ this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
+ }
+
+ /**
+ * Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)
+ * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |
+ * PRESERVE_SEP, 256, -1)}
+ */
+ public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
+ this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
+ }
+
+ /**
+ * Creates a new suggester.
+ *
+ * @param indexAnalyzer Analyzer that will be used for
+ * analyzing suggestions while building the index.
+ * @param queryAnalyzer Analyzer that will be used for
+ * analyzing query text during lookup
+ * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
+ * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
+ * surface forms to keep for a single analyzed form.
+ * When there are too many surface forms we discard the
+ * lowest weighted ones.
+ * @param maxGraphExpansions Maximum number of graph paths
+ * to expand from the analyzed form. Set this to -1 for
+ * no limit.
+ */
+ public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
+ boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
+ int sepLabel, int payloadSep, int endByte, int holeCharacter) {
+ // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
+ this.indexAnalyzer = indexAnalyzer;
+ this.queryAnalyzer = queryAnalyzer;
+ this.fst = fst;
+ this.hasPayloads = hasPayloads;
+ if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
+ throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
+ }
+ this.exactFirst = (options & EXACT_FIRST) != 0;
+ this.preserveSep = (options & PRESERVE_SEP) != 0;
+
+ // NOTE: this is just an implementation limitation; if
+ // somehow this is a problem we could fix it by using
+ // more than one byte to disambiguate ... but 256 seems
+ // like it should be way more than enough.
+ if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
+ throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
+ }
+ this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
+
+ if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
+ throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
+ }
+ this.maxGraphExpansions = maxGraphExpansions;
+ this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
+ this.preservePositionIncrements = preservePositionIncrements;
+ this.sepLabel = sepLabel;
+ this.payloadSep = payloadSep;
+ this.endByte = endByte;
+ this.holeCharacter = holeCharacter;
+ }
+
+ /** Returns byte size of the underlying FST. */
+ public long sizeInBytes() {
+ return fst == null ? 0 : fst.sizeInBytes();
+ }
+
+ private static void copyDestTransitions(State from, State to, List<Transition> transitions) {
+ if (to.isAccept()) {
+ from.setAccept(true);
+ }
+ for(Transition t : to.getTransitions()) {
+ transitions.add(t);
+ }
+ }
+
+ // Replaces SEP with epsilon or remaps them if
+ // we were asked to preserve them:
+ private static void replaceSep(Automaton a, boolean preserveSep, int replaceSep) {
+
+ State[] states = a.getNumberedStates();
+
+ // Go in reverse topo sort so we know we only have to
+ // make one pass:
+ for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) {
+ final State state = states[stateNumber];
+ List<Transition> newTransitions = new ArrayList<Transition>();
+ for(Transition t : state.getTransitions()) {
+ assert t.getMin() == t.getMax();
+ if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
+ if (preserveSep) {
+ // Remap to SEP_LABEL:
+ newTransitions.add(new Transition(replaceSep, t.getDest()));
+ } else {
+ copyDestTransitions(state, t.getDest(), newTransitions);
+ a.setDeterministic(false);
+ }
+ } else if (t.getMin() == TokenStreamToAutomaton.HOLE) {
+
+ // Just remove the hole: there will then be two
+ // SEP tokens next to each other, which will only
+ // match another hole at search time. Note that
+ // it will also match an empty-string token ... if
+ // that's somehow a problem we can always map HOLE
+ // to a dedicated byte (and escape it in the
+ // input).
+ copyDestTransitions(state, t.getDest(), newTransitions);
+ a.setDeterministic(false);
+ } else {
+ newTransitions.add(t);
+ }
+ }
+ state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()]));
+ }
+ }
+
+ protected Automaton convertAutomaton(Automaton a) {
+ return a;
+ }
+
+ /** Just escapes the sepLabel byte (which we still use for SEP). */
+ private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
+
+ final BytesRef spare = new BytesRef();
+ private char sepLabel;
+
+ public EscapingTokenStreamToAutomaton(char sepLabel) {
+ this.sepLabel = sepLabel;
+ }
+
+ @Override
+ protected BytesRef changeToken(BytesRef in) {
+ int upto = 0;
+ for(int i=0;i<in.length;i++) {
+ byte b = in.bytes[in.offset+i];
+ if (b == (byte) sepLabel) {
+ if (spare.bytes.length == upto) {
+ spare.grow(upto+2);
+ }
+ spare.bytes[upto++] = (byte) sepLabel;
+ spare.bytes[upto++] = b;
+ } else {
+ if (spare.bytes.length == upto) {
+ spare.grow(upto+1);
+ }
+ spare.bytes[upto++] = b;
+ }
+ }
+ spare.offset = 0;
+ spare.length = upto;
+ return spare;
+ }
+ }
+
+ public TokenStreamToAutomaton getTokenStreamToAutomaton() {
+ final TokenStreamToAutomaton tsta;
+ if (preserveSep) {
+ tsta = new EscapingTokenStreamToAutomaton((char) sepLabel);
+ } else {
+ // When we're not preserving sep, we don't steal 0xff
+ // byte, so we don't need to do any escaping:
+ tsta = new TokenStreamToAutomaton();
+ }
+ tsta.setPreservePositionIncrements(preservePositionIncrements);
+ return tsta;
+ }
+
+ private static class AnalyzingComparator implements Comparator<BytesRef> {
+
+ private final boolean hasPayloads;
+
+ public AnalyzingComparator(boolean hasPayloads) {
+ this.hasPayloads = hasPayloads;
+ }
+
+ private final ByteArrayDataInput readerA = new ByteArrayDataInput();
+ private final ByteArrayDataInput readerB = new ByteArrayDataInput();
+ private final BytesRef scratchA = new BytesRef();
+ private final BytesRef scratchB = new BytesRef();
+
+ @Override
+ public int compare(BytesRef a, BytesRef b) {
+
+ // First by analyzed form:
+ readerA.reset(a.bytes, a.offset, a.length);
+ scratchA.length = readerA.readShort();
+ scratchA.bytes = a.bytes;
+ scratchA.offset = readerA.getPosition();
+
+ readerB.reset(b.bytes, b.offset, b.length);
+ scratchB.bytes = b.bytes;
+ scratchB.length = readerB.readShort();
+ scratchB.offset = readerB.getPosition();
+
+ int cmp = scratchA.compareTo(scratchB);
+ if (cmp != 0) {
+ return cmp;
+ }
+ readerA.skipBytes(scratchA.length);
+ readerB.skipBytes(scratchB.length);
+ // Next by cost:
+ long aCost = readerA.readInt();
+ long bCost = readerB.readInt();
+ if (aCost < bCost) {
+ return -1;
+ } else if (aCost > bCost) {
+ return 1;
+ }
+
+ // Finally by surface form:
+ if (hasPayloads) {
+ scratchA.length = readerA.readShort();
+ scratchA.offset = readerA.getPosition();
+ scratchB.length = readerB.readShort();
+ scratchB.offset = readerB.getPosition();
+ } else {
+ scratchA.offset = readerA.getPosition();
+ scratchA.length = a.length - scratchA.offset;
+ scratchB.offset = readerB.getPosition();
+ scratchB.length = b.length - scratchB.offset;
+ }
+ return scratchA.compareTo(scratchB);
+ }
+ }
+
+ @Override
+ public void build(InputIterator iterator) throws IOException {
+ String prefix = getClass().getSimpleName();
+ File directory = Sort.defaultTempDir();
+ File tempInput = File.createTempFile(prefix, ".input", directory);
+ File tempSorted = File.createTempFile(prefix, ".sorted", directory);
+
+ hasPayloads = iterator.hasPayloads();
+
+ Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
+ Sort.ByteSequencesReader reader = null;
+ BytesRef scratch = new BytesRef();
+
+ TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
+
+ boolean success = false;
+ byte buffer[] = new byte[8];
+ try {
+ ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
+ BytesRef surfaceForm;
+
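+ // Each record written to the temp file below has the layout:
+ //   analyzedLength (short) | analyzed bytes | cost (int) |
+ //   [surfaceLength (short), only if hasPayloads] | surface bytes | [payload bytes, only if hasPayloads]
+ // AnalyzingComparator later sorts these records by analyzed form, then cost, then surface form.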
+ while ((surfaceForm = iterator.next()) != null) {
+ Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
+
+ maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
+
+ for (IntsRef path : paths) {
+
+ Util.toBytesRef(path, scratch);
+
+ // length of the analyzed text (FST input)
+ if (scratch.length > Short.MAX_VALUE-2) {
+ throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")");
+ }
+ short analyzedLength = (short) scratch.length;
+
+ // compute the required length:
+ // analyzed sequence + weight (4) + surface + analyzedLength (short)
+ int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
+
+ BytesRef payload;
+
+ if (hasPayloads) {
+ if (surfaceForm.length > (Short.MAX_VALUE-2)) {
+ throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")");
+ }
+ payload = iterator.payload();
+ // payload + surfaceLength (short)
+ requiredLength += payload.length + 2;
+ } else {
+ payload = null;
+ }
+
+ buffer = ArrayUtil.grow(buffer, requiredLength);
+
+ output.reset(buffer);
+
+ output.writeShort(analyzedLength);
+
+ output.writeBytes(scratch.bytes, scratch.offset, scratch.length);
+
+ output.writeInt(encodeWeight(iterator.weight()));
+
+ if (hasPayloads) {
+ for(int i=0;i<surfaceForm.length;i++) {
+ if (surfaceForm.bytes[i] == payloadSep) {
+ throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
+ }
+ }
+ output.writeShort((short) surfaceForm.length);
+ output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
+ output.writeBytes(payload.bytes, payload.offset, payload.length);
+ } else {
+ output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
+ }
+
+ assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;
+
+ writer.write(buffer, 0, output.getPosition());
+ }
+ }
+ writer.close();
+
+ // Sort all input/output pairs (required by FST.Builder):
+ new Sort(new AnalyzingComparator(hasPayloads)).sort(tempInput, tempSorted);
+
+ // Free disk space:
+ tempInput.delete();
+
+ reader = new Sort.ByteSequencesReader(tempSorted);
+
+ PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
+ Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
+
+ // Build FST:
+ BytesRef previousAnalyzed = null;
+ BytesRef analyzed = new BytesRef();
+ BytesRef surface = new BytesRef();
+ IntsRef scratchInts = new IntsRef();
+ ByteArrayDataInput input = new ByteArrayDataInput();
+
+ // Used to remove duplicate surface forms (but we
+ // still index the highest-weight one). We clear
+ // this when we see a new analyzed form, so it cannot
+ // grow unbounded (at most 256 entries):
+ Set<BytesRef> seenSurfaceForms = new HashSet<BytesRef>();
+
+ int dedup = 0;
+ while (reader.read(scratch)) {
+ input.reset(scratch.bytes, scratch.offset, scratch.length);
+ short analyzedLength = input.readShort();
+ analyzed.grow(analyzedLength+2);
+ input.readBytes(analyzed.bytes, 0, analyzedLength);
+ analyzed.length = analyzedLength;
+
+ long cost = input.readInt();
+
+ surface.bytes = scratch.bytes;
+ if (hasPayloads) {
+ surface.length = input.readShort();
+ surface.offset = input.getPosition();
+ } else {
+ surface.offset = input.getPosition();
+ surface.length = scratch.length - surface.offset;
+ }
+
+ if (previousAnalyzed == null) {
+ previousAnalyzed = new BytesRef();
+ previousAnalyzed.copyBytes(analyzed);
+ seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
+ } else if (analyzed.equals(previousAnalyzed)) {
+ dedup++;
+ if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
+ // More than maxSurfaceFormsPerAnalyzedForm
+ // dups: skip the rest:
+ continue;
+ }
+ if (seenSurfaceForms.contains(surface)) {
+ continue;
+ }
+ seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
+ } else {
+ dedup = 0;
+ previousAnalyzed.copyBytes(analyzed);
+ seenSurfaceForms.clear();
+ seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
+ }
+
+ // TODO: I think we can avoid the extra 2 bytes when
+ // there is no dup (dedup==0), but we'd have to fix
+ // the exactFirst logic ... which would be sort of
+ // hairy because we'd need to special case the two
+ // (dup/not dup)...
+
+ // NOTE: must be byte 0 so we sort before whatever
+ // is next
+ analyzed.bytes[analyzed.offset+analyzed.length] = 0;
+ analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup;
+ analyzed.length += 2;
+
+ Util.toIntsRef(analyzed, scratchInts);
+ //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
+ if (!hasPayloads) {
+ builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
+ } else {
+ int payloadOffset = input.getPosition() + surface.length;
+ int payloadLength = scratch.length - payloadOffset;
+ BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
+ System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
+ br.bytes[surface.length] = (byte) payloadSep;
+ System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
+ br.length = br.bytes.length;
+ builder.add(scratchInts, outputs.newPair(cost, br));
+ }
+ }
+ fst = builder.finish();
+
+ //PrintWriter pw = new PrintWriter("/tmp/out.dot");
+ //Util.toDot(fst, pw, true, true);
+ //pw.close();
+
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(reader, writer);
+ } else {
+ IOUtils.closeWhileHandlingException(reader, writer);
+ }
+
+ tempInput.delete();
+ tempSorted.delete();
+ }
+ }
+
+ @Override
+ public boolean store(OutputStream output) throws IOException {
+ DataOutput dataOut = new OutputStreamDataOutput(output);
+ try {
+ if (fst == null) {
+ return false;
+ }
+
+ fst.save(dataOut);
+ dataOut.writeVInt(maxAnalyzedPathsForOneInput);
+ dataOut.writeByte((byte) (hasPayloads ? 1 : 0));
+ } finally {
+ IOUtils.close(output);
+ }
+ return true;
+ }
+
+ @Override
+ public boolean load(InputStream input) throws IOException {
+ DataInput dataIn = new InputStreamDataInput(input);
+ try {
+ this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
+ maxAnalyzedPathsForOneInput = dataIn.readVInt();
+ hasPayloads = dataIn.readByte() == 1;
+ } finally {
+ IOUtils.close(input);
+ }
+ return true;
+ }
+
+ private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRef spare) {
+ LookupResult result;
+ if (hasPayloads) {
+ int sepIndex = -1;
+ for(int i=0;i<output2.length;i++) {
+ if (output2.bytes[output2.offset+i] == payloadSep) {
+ sepIndex = i;
+ break;
+ }
+ }
+ assert sepIndex != -1;
+ spare.grow(sepIndex);
+ final int payloadLen = output2.length - sepIndex - 1;
+ UnicodeUtil.UTF8toUTF16(output2.bytes, output2.offset, sepIndex, spare);
+ BytesRef payload = new BytesRef(payloadLen);
+ System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen);
+ payload.length = payloadLen;
+ result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
+ } else {
+ spare.grow(output2.length);
+ UnicodeUtil.UTF8toUTF16(output2, spare);
+ result = new LookupResult(spare.toString(), decodeWeight(output1));
+ }
+
+ return result;
+ }
+
+ private boolean sameSurfaceForm(BytesRef key, BytesRef output2) {
+ if (hasPayloads) {
+ // output2 has at least PAYLOAD_SEP byte:
+ if (key.length >= output2.length) {
+ return false;
+ }
+ for(int i=0;i<key.length;i++) {
+ if (key.bytes[key.offset+i] != output2.bytes[output2.offset+i]) {
+ return false;
+ }
+ }
+ return output2.bytes[output2.offset + key.length] == payloadSep;
+ } else {
+ return key.bytesEquals(output2);
+ }
+ }
+
+ @Override
+ public List<LookupResult> lookup(final CharSequence key, boolean onlyMorePopular, int num) {
+ assert num > 0;
+
+ if (onlyMorePopular) {
+ throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
+ }
+ if (fst == null) {
+ return Collections.emptyList();
+ }
+
+ //System.out.println("lookup key=" + key + " num=" + num);
+ for (int i = 0; i < key.length(); i++) {
+ if (key.charAt(i) == holeCharacter) {
+ throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
+ }
+ if (key.charAt(i) == sepLabel) {
+ throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
+ }
+ }
+ final BytesRef utf8Key = new BytesRef(key);
+ try {
+
+ Automaton lookupAutomaton = toLookupAutomaton(key);
+
+ final CharsRef spare = new CharsRef();
+
+ //System.out.println(" now intersect exactFirst=" + exactFirst);
+
+ // Intersect automaton w/ suggest wFST and get all
+ // prefix starting nodes & their outputs:
+ //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
+
+ //System.out.println(" prefixPaths: " + prefixPaths.size());
+
+ BytesReader bytesReader = fst.getBytesReader();
+
+ FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>();
+
+ final List<LookupResult> results = new ArrayList<LookupResult>();
+
+ List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
+
+ if (exactFirst) {
+
+ int count = 0;
+ for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
+ if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
+ // This node has END_BYTE arc leaving, meaning it's an
+ // "exact" match:
+ count++;
+ }
+ }
+
+ // Searcher just to find the single exact only
+ // match, if present:
+ Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
+ searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
+
+ // NOTE: we could almost get away with only using
+ // the first start node. The only catch is if
+ // maxSurfaceFormsPerAnalyzedForm had kicked in and
+ // pruned our exact match from one of these nodes
+ // ...:
+ for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
+ if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
+ // This node has END_BYTE arc leaving, meaning it's an
+ // "exact" match:
+ searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
+ }
+ }
+
+ MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();
+
+ // NOTE: this is rather inefficient: we enumerate
+ // every matching "exactly the same analyzed form"
+ // path, and then do linear scan to see if one of
+ // these exactly matches the input. It should be
+ // possible (though hairy) to do something similar
+ // to getByOutput, since the surface form is encoded
+ // into the FST output, so we more efficiently hone
+ // in on the exact surface-form match. Still, I
+ // suspect very little time is spent in this linear
+ // search: it's bounded by how many prefix start
+ // nodes we have and the
+ // maxSurfaceFormsPerAnalyzedForm:
+ for(MinResult<Pair<Long,BytesRef>> completion : completions) {
+ BytesRef output2 = completion.output.output2;
+ if (sameSurfaceForm(utf8Key, output2)) {
+ results.add(getLookupResult(completion.output.output1, output2, spare));
+ break;
+ }
+ }
+
+ if (results.size() == num) {
+ // That was quick:
+ return results;
+ }
+ }
+
+ Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
+ searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
+ num - results.size(),
+ num * maxAnalyzedPathsForOneInput,
+ weightComparator) {
+ private final Set<BytesRef> seen = new HashSet<BytesRef>();
+
+ @Override
+ protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {
+
+ // Dedup: when the input analyzes to a graph we
+ // can get duplicate surface forms:
+ if (seen.contains(output.output2)) {
+ return false;
+ }
+ seen.add(output.output2);
+
+ if (!exactFirst) {
+ return true;
+ } else {
+ // In exactFirst mode, don't accept any paths
+ // matching the surface form since that will
+ // create duplicate results:
+ if (sameSurfaceForm(utf8Key, output.output2)) {
+ // We found exact match, which means we should
+ // have already found it in the first search:
+ assert results.size() == 1;
+ return false;
+ } else {
+ return true;
+ }
+ }
+ }
+ };
+
+ prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
+
+ for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
+ searcher.addStartPaths(path.fstNode, path.output, true, path.input);
+ }
+
+ MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();
+
+ for(MinResult<Pair<Long,BytesRef>> completion : completions) {
+
+ LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
+
+ // TODO: for fuzzy case would be nice to return
+ // how many edits were required
+
+ //System.out.println(" result=" + result);
+ results.add(result);
+
+ if (results.size() == num) {
+ // In the exactFirst=true case the search may
+ // produce one extra path
+ break;
+ }
+ }
+
+ return results;
+ } catch (IOException bogus) {
+ throw new RuntimeException(bogus);
+ }
+ }
+
+ /** Returns all completion paths to initialize the search. */
+ protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
+ Automaton lookupAutomaton,
+ FST<Pair<Long,BytesRef>> fst)
+ throws IOException {
+ return prefixPaths;
+ }
+
+ public final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
+ // Analyze surface form:
+ TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
+ return toFiniteStrings(ts2a, ts);
+ }
+ public final Set<IntsRef> toFiniteStrings(final TokenStreamToAutomaton ts2a, TokenStream ts) throws IOException {
+ // Analyze surface form:
+
+ // Create corresponding automaton: labels are bytes
+ // from each analyzed token, with byte 0 used as
+ // separator between tokens:
+ Automaton automaton = ts2a.toAutomaton(ts);
+ ts.close();
+
+ replaceSep(automaton, preserveSep, sepLabel);
+
+ assert SpecialOperations.isFinite(automaton);
+
+ // Get all paths from the automaton (there can be
+ // more than one path, eg if the analyzer created a
+ // graph using SynFilter or WDF):
+
+ // TODO: we could walk & add simultaneously, so we
+ // don't have to alloc [possibly biggish]
+ // intermediate HashSet in RAM:
+ return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+ }
+
+ final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
+ // Turn tokenstream into automaton:
+ TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
+ Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
+ ts.close();
+
+ // TODO: we could use the end offset to "guess"
+ // whether the final token was a partial token; this
+ // would only be a heuristic ... but maybe an OK one.
+ // This way we could eg differentiate "net" from "net ",
+ // which we can't today...
+
+ replaceSep(automaton, preserveSep, sepLabel);
+
+ // TODO: we can optimize this somewhat by determinizing
+ // while we convert
+ BasicOperations.determinize(automaton);
+ return automaton;
+ }
+
+
+
+ /**
+ * Returns the weight associated with an input string,
+ * or null if it does not exist.
+ */
+ public Object get(CharSequence key) {
+ throw new UnsupportedOperationException();
+ }
+
+ /** cost -> weight */
+ public static int decodeWeight(long encoded) {
+ return (int)(Integer.MAX_VALUE - encoded);
+ }
+
+ /** weight -> cost */
+ public static int encodeWeight(long value) {
+ if (value < 0 || value > Integer.MAX_VALUE) {
+ throw new UnsupportedOperationException("cannot encode value: " + value);
+ }
+ return Integer.MAX_VALUE - (int)value;
+ }
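+
+ // Example round trip: encodeWeight(3) == Integer.MAX_VALUE - 3 and
+ // decodeWeight(encodeWeight(3)) == 3; lower costs sort first in the FST,
+ // so higher weights are returned first.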
+
+ static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>> () {
+ @Override
+ public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) {
+ return left.output1.compareTo(right.output1);
+ }
+ };
+
+
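+ /**
+ * Incrementally builds the suggester FST. Expected call sequence, per analyzed
+ * form: {@link #startTerm}, then one {@link #addSurface} per surface form, then
+ * {@link #finishTerm}; finally call {@link #build} once to obtain the FST.
+ */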
+ public static class XBuilder {
+ private Builder<Pair<Long, BytesRef>> builder;
+ private int maxSurfaceFormsPerAnalyzedForm;
+ private IntsRef scratchInts = new IntsRef();
+ private final PairOutputs<Long, BytesRef> outputs;
+ private boolean hasPayloads;
+ private BytesRef analyzed = new BytesRef();
+ private final SurfaceFormAndPayload[] surfaceFormsAndPayload;
+ private int count;
+ private ObjectIntOpenHashMap<BytesRef> seenSurfaceForms = HppcMaps.Object.Integer.ensureNoNullKeys(256, 0.75f);
+ private int payloadSep;
+
+ public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
+ this.payloadSep = payloadSep;
+ this.outputs = new PairOutputs<Long, BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
+ this.builder = new Builder<Pair<Long, BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
+ this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
+ this.hasPayloads = hasPayloads;
+ surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm];
+
+ }
+ public void startTerm(BytesRef analyzed) {
+ this.analyzed.copyBytes(analyzed);
+ this.analyzed.grow(analyzed.length+2);
+ }
+
+ private final static class SurfaceFormAndPayload implements Comparable<SurfaceFormAndPayload> {
+ BytesRef payload;
+ long weight;
+
+ public SurfaceFormAndPayload(BytesRef payload, long cost) {
+ super();
+ this.payload = payload;
+ this.weight = cost;
+ }
+
+ @Override
+ public int compareTo(SurfaceFormAndPayload o) {
+ int res = compare(weight, o.weight);
+ if (res == 0 ){
+ return payload.compareTo(o.payload);
+ }
+ return res;
+ }
+ public static int compare(long x, long y) {
+ return (x < y) ? -1 : ((x == y) ? 0 : 1);
+ }
+ }
+
+ public void addSurface(BytesRef surface, BytesRef payload, long cost) throws IOException {
+ int surfaceIndex = -1;
+ long encodedWeight = cost == -1 ? cost : encodeWeight(cost);
+ /*
+ * we need to check if we have seen this surface form before; if so, we only keep
+ * the surface form with the highest weight and drop the rest, no matter whether
+ * the payload differs.
+ */
+ if (count >= maxSurfaceFormsPerAnalyzedForm) {
+ // More than maxSurfaceFormsPerAnalyzedForm
+ // dups: skip the rest:
+ return;
+ }
+ BytesRef surfaceCopy;
+ if (count > 0 && seenSurfaceForms.containsKey(surface)) {
+ surfaceIndex = seenSurfaceForms.lget();
+ SurfaceFormAndPayload surfaceFormAndPayload = surfaceFormsAndPayload[surfaceIndex];
+ if (encodedWeight >= surfaceFormAndPayload.weight) {
+ return;
+ }
+ surfaceCopy = BytesRef.deepCopyOf(surface);
+ } else {
+ surfaceIndex = count++;
+ surfaceCopy = BytesRef.deepCopyOf(surface);
+ seenSurfaceForms.put(surfaceCopy, surfaceIndex);
+ }
+
+ BytesRef payloadRef;
+ if (!hasPayloads) {
+ payloadRef = surfaceCopy;
+ } else {
+ int len = surface.length + 1 + payload.length;
+ final BytesRef br = new BytesRef(len);
+ System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
+ br.bytes[surface.length] = (byte) payloadSep;
+ System.arraycopy(payload.bytes, payload.offset, br.bytes, surface.length + 1, payload.length);
+ br.length = len;
+ payloadRef = br;
+ }
+ if (surfaceFormsAndPayload[surfaceIndex] == null) {
+ surfaceFormsAndPayload[surfaceIndex] = new SurfaceFormAndPayload(payloadRef, encodedWeight);
+ } else {
+ surfaceFormsAndPayload[surfaceIndex].payload = payloadRef;
+ surfaceFormsAndPayload[surfaceIndex].weight = encodedWeight;
+ }
+ }
+
+ public void finishTerm(long defaultWeight) throws IOException {
+ ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);
+ int deduplicator = 0;
+ analyzed.bytes[analyzed.offset + analyzed.length] = 0;
+ analyzed.length += 2;
+ for (int i = 0; i < count; i++) {
+ analyzed.bytes[analyzed.offset + analyzed.length - 1 ] = (byte) deduplicator++;
+ Util.toIntsRef(analyzed, scratchInts);
+ SurfaceFormAndPayload candidate = surfaceFormsAndPayload[i];
+ long cost = candidate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candidate.weight;
+ builder.add(scratchInts, outputs.newPair(cost, candidate.payload));
+ }
+ seenSurfaceForms.clear();
+ count = 0;
+ }
+
+ public FST<Pair<Long, BytesRef>> build() throws IOException {
+ return builder.finish();
+ }
+
+ public boolean hasPayloads() {
+ return hasPayloads;
+ }
+
+ public int maxSurfaceFormsPerAnalyzedForm() {
+ return maxSurfaceFormsPerAnalyzedForm;
+ }
+
+ }
+}
diff --git a/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java b/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java
new file mode 100644
index 0000000..4379cda
--- /dev/null
+++ b/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java
@@ -0,0 +1,263 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.lucene.search.suggest.analyzing;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.*;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PairOutputs;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
+ * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
+ * you can explicitly choose classic Levenshtein by passing <code>false</code>
+ * for the <code>transpositions</code> parameter.
+ * <p>
+ * At most, this query will match terms up to
+ * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
+ * edits. Higher distances are not supported. Note that the
+ * fuzzy distance is measured in "byte space" on the bytes
+ * returned by the {@link org.apache.lucene.analysis.TokenStream}'s {@link
+ * org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute}, usually UTF8. By default
+ * the analyzed bytes must be at least 3 {@link
+ * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are
+ * considered. Furthermore, the first 1 {@link
+ * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
+ * edited. We allow up to 1 {@link
+ * #DEFAULT_MAX_EDITS} edit.
+ * If the {@link #unicodeAware} parameter in the constructor is set to true, maxEdits,
+ * minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code
+ * points (actual letters) instead of bytes.
+ *
+ * <p>
+ * NOTE: This suggester does not boost suggestions that
+ * required no edits over suggestions that did require
+ * edits. This is a known limitation.
+ *
+ * <p>
+ * Note: complex query analyzers can have a significant impact on the lookup
+ * performance. It's recommended to not use analyzers that drop or inject terms
+ * like synonyms to keep the complexity of the prefix intersection low for good
+ * lookup performance. At index time, complex analyzers can safely be used.
+ * </p>
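+ *
+ * <p>
+ * A minimal usage sketch (the <code>analyzer</code> and <code>inputIterator</code>
+ * are placeholders; by default edits are only allowed once the key is at least
+ * {@link #DEFAULT_MIN_FUZZY_LENGTH} bytes long):
+ * <pre>
+ * XFuzzySuggester suggester = new XFuzzySuggester(analyzer);
+ * suggester.build(inputIterator);
+ * List<LookupResult> results = suggester.lookup("wirele", false, 5);
+ * </pre>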
+ *
+ * @lucene.experimental
+ */
+public final class XFuzzySuggester extends XAnalyzingSuggester {
+ private final int maxEdits;
+ private final boolean transpositions;
+ private final int nonFuzzyPrefix;
+ private final int minFuzzyLength;
+ private final boolean unicodeAware;
+
+ /**
+ * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
+ * parameters in Unicode code points (actual letters)
+ * instead of bytes.
+ */
+ public static final boolean DEFAULT_UNICODE_AWARE = false;
+
+ /**
+ * The default minimum length of the key passed to {@link
+ * #lookup} before any edits are allowed.
+ */
+ public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;
+
+ /**
+ * The default prefix length where edits are not allowed.
+ */
+ public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
+
+ /**
+ * The default maximum number of edits for fuzzy
+ * suggestions.
+ */
+ public static final int DEFAULT_MAX_EDITS = 1;
+
+ /**
+ * The default transposition value passed to {@link org.apache.lucene.util.automaton.LevenshteinAutomata}
+ */
+ public static final boolean DEFAULT_TRANSPOSITIONS = true;
+
+ /**
+ * Creates an {@link XFuzzySuggester} instance initialized with default values.
+ *
+ * @param analyzer the analyzer used for this suggester
+ */
+ public XFuzzySuggester(Analyzer analyzer) {
+ this(analyzer, analyzer);
+ }
+
+ /**
+ * Creates an {@link XFuzzySuggester} instance with an index and a query analyzer, initialized with default values.
+ *
+ * @param indexAnalyzer
+ * Analyzer that will be used for analyzing suggestions while building the index.
+ * @param queryAnalyzer
+ * Analyzer that will be used for analyzing query text during lookup
+ */
+ public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
+ this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
+ DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
+
+ }
+
+ /**
+ * Creates an {@link XFuzzySuggester} instance.
+ *
+ * @param indexAnalyzer Analyzer that will be used for
+ * analyzing suggestions while building the index.
+ * @param queryAnalyzer Analyzer that will be used for
+ * analyzing query text during lookup
+ * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
+ * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
+ * surface forms to keep for a single analyzed form.
+ * When there are too many surface forms we discard the
+ * lowest weighted ones.
+ * @param maxGraphExpansions Maximum number of graph paths
+ * to expand from the analyzed form. Set this to -1 for
+ * no limit.
+ * @param maxEdits must be >= 0 and <= {@link org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .
+ * @param transpositions <code>true</code> if transpositions should be treated as a primitive
+ * edit operation. If this is false, comparisons will implement the classic
+ * Levenshtein algorithm.
+ * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
+ * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
+ * @param sepLabel separation label
+ * @param payloadSep payload separator byte
+ * @param endByte end byte marker byte
+ */
+ public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
+ int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware,
+ FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
+ int sepLabel, int payloadSep, int endByte, int holeCharacter) {
+ super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, true, fst, hasPayloads, maxAnalyzedPathsForOneInput, sepLabel, payloadSep, endByte, holeCharacter);
+ if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+ throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+ }
+ if (nonFuzzyPrefix < 0) {
+ throw new IllegalArgumentException("nonFuzzyPrefix must not be >= 0 (got " + nonFuzzyPrefix + ")");
+ }
+ if (minFuzzyLength < 0) {
+ throw new IllegalArgumentException("minFuzzyLength must not be >= 0 (got " + minFuzzyLength + ")");
+ }
+
+ this.maxEdits = maxEdits;
+ this.transpositions = transpositions;
+ this.nonFuzzyPrefix = nonFuzzyPrefix;
+ this.minFuzzyLength = minFuzzyLength;
+ this.unicodeAware = unicodeAware;
+ }
+
+ @Override
+ protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> prefixPaths,
+ Automaton lookupAutomaton,
+ FST<PairOutputs.Pair<Long,BytesRef>> fst)
+ throws IOException {
+
+ // TODO: right now there's no penalty for fuzzy/edits,
+ // ie a completion whose prefix matched exactly what the
+ // user typed gets no boost over completions that
+ // required an edit, which get no boost over completions
+ // requiring two edits. I suspect a multiplicative
+ // factor is appropriate (eg, say a fuzzy match must be at
+ // least 2X better weight than the non-fuzzy match to
+ // "compete") ... in which case I think the wFST needs
+ // to be log weights or something ...
+
+ Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
+ /*
+ Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+ w.write(levA.toDot());
+ w.close();
+ System.out.println("Wrote LevA to out.dot");
+ */
+ return FSTUtil.intersectPrefixPaths(levA, fst);
+ }
+
+ @Override
+ protected Automaton convertAutomaton(Automaton a) {
+ if (unicodeAware) {
+ Automaton utf8automaton = new UTF32ToUTF8().convert(a);
+ BasicOperations.determinize(utf8automaton);
+ return utf8automaton;
+ } else {
+ return a;
+ }
+ }
+
+ @Override
+ public TokenStreamToAutomaton getTokenStreamToAutomaton() {
+ final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton();
+ tsta.setUnicodeArcs(unicodeAware);
+ return tsta;
+ }
+
+ Automaton toLevenshteinAutomata(Automaton automaton) {
+ final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
+ Automaton subs[] = new Automaton[ref.size()];
+ int upto = 0;
+ for (IntsRef path : ref) {
+ if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
+ subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
+ upto++;
+ } else {
+ Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
+ int ints[] = new int[path.length-nonFuzzyPrefix];
+ System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
+ // TODO: maybe add alphaMin to LevenshteinAutomata,
+ // and pass 1 instead of 0? We probably don't want
+ // to allow the trailing dedup bytes to be
+ // edited... but then 0 byte is "in general" allowed
+ // on input (but not in UTF8).
+ LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
+ Automaton levAutomaton = lev.toAutomaton(maxEdits);
+ Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
+ combined.setDeterministic(true); // it's like the special case in concatenate itself, except we cloneExpanded already
+ subs[upto] = combined;
+ upto++;
+ }
+ }
+
+ if (subs.length == 0) {
+ // automaton is empty, there are no accepted paths through it
+ return BasicAutomata.makeEmpty(); // matches nothing
+ } else if (subs.length == 1) {
+ // no synonyms or anything: just a single path through the tokenstream
+ return subs[0];
+ } else {
+ // multiple paths: this is really scary! is it slow?
+ // maybe we should not do this and throw UOE?
+ Automaton a = BasicOperations.union(Arrays.asList(subs));
+ // TODO: we could call toLevenshteinAutomata() before det?
+ // this only happens if you have multiple paths anyway (e.g. synonyms)
+ BasicOperations.determinize(a);
+
+ return a;
+ }
+ }
+}
diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java
new file mode 100644
index 0000000..8512e5c
--- /dev/null
+++ b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.search.vectorhighlight;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.FilterClause;
+import org.apache.lucene.queries.TermFilter;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
+import org.elasticsearch.common.lucene.search.XBooleanFilter;
+import org.elasticsearch.common.lucene.search.XFilteredQuery;
+import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery;
+import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
+
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.util.Collection;
+import java.util.List;
+
+/**
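+ * Subclass of {@link FieldQuery} that knows how to flatten the additional query and
+ * filter types used by Elasticsearch (filtered queries, function score queries,
+ * multi phrase prefix queries, term and boolean filters) so they can be highlighted.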
+ *
+ */
+// LUCENE MONITOR
+public class CustomFieldQuery extends FieldQuery {
+
+ private static Field multiTermQueryWrapperFilterQueryField;
+
+ static {
+ try {
+ multiTermQueryWrapperFilterQueryField = MultiTermQueryWrapperFilter.class.getDeclaredField("query");
+ multiTermQueryWrapperFilterQueryField.setAccessible(true);
+ } catch (NoSuchFieldException e) {
+ // ignore
+ }
+ }
+
+ public static final ThreadLocal<Boolean> highlightFilters = new ThreadLocal<Boolean>();
+
+ public CustomFieldQuery(Query query, IndexReader reader, FastVectorHighlighter highlighter) throws IOException {
+ this(query, reader, highlighter.isPhraseHighlight(), highlighter.isFieldMatch());
+ }
+
+ public CustomFieldQuery(Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch) throws IOException {
+ super(query, reader, phraseHighlight, fieldMatch);
+ highlightFilters.remove();
+ }
+
+ @Override
+ void flatten(Query sourceQuery, IndexReader reader, Collection<Query> flatQueries) throws IOException {
+ if (sourceQuery instanceof SpanTermQuery) {
+ super.flatten(new TermQuery(((SpanTermQuery) sourceQuery).getTerm()), reader, flatQueries);
+ } else if (sourceQuery instanceof ConstantScoreQuery) {
+ ConstantScoreQuery constantScoreQuery = (ConstantScoreQuery) sourceQuery;
+ if (constantScoreQuery.getFilter() != null) {
+ flatten(constantScoreQuery.getFilter(), reader, flatQueries);
+ } else {
+ flatten(constantScoreQuery.getQuery(), reader, flatQueries);
+ }
+ } else if (sourceQuery instanceof FunctionScoreQuery) {
+ flatten(((FunctionScoreQuery) sourceQuery).getSubQuery(), reader, flatQueries);
+ } else if (sourceQuery instanceof FilteredQuery) {
+ flatten(((FilteredQuery) sourceQuery).getQuery(), reader, flatQueries);
+ flatten(((FilteredQuery) sourceQuery).getFilter(), reader, flatQueries);
+ } else if (sourceQuery instanceof XFilteredQuery) {
+ flatten(((XFilteredQuery) sourceQuery).getQuery(), reader, flatQueries);
+ flatten(((XFilteredQuery) sourceQuery).getFilter(), reader, flatQueries);
+ } else if (sourceQuery instanceof MultiPhrasePrefixQuery) {
+ flatten(sourceQuery.rewrite(reader), reader, flatQueries);
+ } else if (sourceQuery instanceof FiltersFunctionScoreQuery) {
+ flatten(((FiltersFunctionScoreQuery) sourceQuery).getSubQuery(), reader, flatQueries);
+ } else if (sourceQuery instanceof MultiPhraseQuery) {
+ MultiPhraseQuery q = ((MultiPhraseQuery) sourceQuery);
+ convertMultiPhraseQuery(0, new int[q.getTermArrays().size()] , q, q.getTermArrays(), q.getPositions(), reader, flatQueries);
+ } else {
+ super.flatten(sourceQuery, reader, flatQueries);
+ }
+ }
+
+ private void convertMultiPhraseQuery(int currentPos, int[] termsIdx, MultiPhraseQuery orig, List<Term[]> terms, int[] pos, IndexReader reader, Collection<Query> flatQueries) throws IOException {
+ if (currentPos == 0) {
+ // if we have more than 16 terms, fall back to flattening each term into its own TermQuery
+ int numTerms = 0;
+ for (Term[] currentPosTerm : terms) {
+ numTerms += currentPosTerm.length;
+ }
+ if (numTerms > 16) {
+ for (Term[] currentPosTerm : terms) {
+ for (Term term : currentPosTerm) {
+ super.flatten(new TermQuery(term), reader, flatQueries);
+ }
+ }
+ return;
+ }
+ }
+ /*
+ * we walk all possible paths and for each path down the MPQ we create a PhraseQuery; this is what FieldQuery supports.
+ * It seems expensive but most queries will be pretty small.
+ */
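+ // For example, a MultiPhraseQuery with {quick, fast} at position 0 and {fox} at
+ // position 1 is flattened into the two PhraseQueries "quick fox" and "fast fox".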
+ if (currentPos == terms.size()) {
+ PhraseQuery query = new PhraseQuery();
+ query.setBoost(orig.getBoost());
+ query.setSlop(orig.getSlop());
+ for (int i = 0; i < termsIdx.length; i++) {
+ query.add(terms.get(i)[termsIdx[i]], pos[i]);
+ }
+ this.flatten(query, reader, flatQueries);
+ } else {
+ Term[] t = terms.get(currentPos);
+ for (int i = 0; i < t.length; i++) {
+ termsIdx[currentPos] = i;
+ convertMultiPhraseQuery(currentPos+1, termsIdx, orig, terms, pos, reader, flatQueries);
+ }
+ }
+ }
+
+ void flatten(Filter sourceFilter, IndexReader reader, Collection<Query> flatQueries) throws IOException {
+ Boolean highlight = highlightFilters.get();
+ if (highlight == null || highlight.equals(Boolean.FALSE)) {
+ return;
+ }
+ if (sourceFilter instanceof TermFilter) {
+ flatten(new TermQuery(((TermFilter) sourceFilter).getTerm()), reader, flatQueries);
+ } else if (sourceFilter instanceof MultiTermQueryWrapperFilter) {
+ if (multiTermQueryWrapperFilterQueryField != null) {
+ try {
+ flatten((Query) multiTermQueryWrapperFilterQueryField.get(sourceFilter), reader, flatQueries);
+ } catch (IllegalAccessException e) {
+ // ignore
+ }
+ }
+ } else if (sourceFilter instanceof XBooleanFilter) {
+ XBooleanFilter booleanFilter = (XBooleanFilter) sourceFilter;
+ for (FilterClause clause : booleanFilter.clauses()) {
+ if (clause.getOccur() == BooleanClause.Occur.MUST || clause.getOccur() == BooleanClause.Occur.SHOULD) {
+ flatten(clause.getFilter(), reader, flatQueries);
+ }
+ }
+ }
+ }
+}
diff --git a/src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java b/src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java
new file mode 100644
index 0000000..0c64a93
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.store;
+
+import java.io.IOException;
+import java.util.zip.Checksum;
+
+/**
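+ * An {@link IndexOutput} wrapper that updates the given {@link Checksum} with
+ * every byte flushed to the wrapped output.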
+ */
+public class BufferedChecksumIndexOutput extends BufferedIndexOutput {
+
+ private final IndexOutput delegate;
+ private final BufferedIndexOutput bufferedDelegate;
+ private final Checksum digest;
+
+ public BufferedChecksumIndexOutput(IndexOutput delegate, Checksum digest) {
+ super(delegate instanceof BufferedIndexOutput ? ((BufferedIndexOutput) delegate).getBufferSize() : BufferedIndexOutput.DEFAULT_BUFFER_SIZE);
+ if (delegate instanceof BufferedIndexOutput) {
+ bufferedDelegate = (BufferedIndexOutput) delegate;
+ this.delegate = delegate;
+ } else {
+ this.delegate = delegate;
+ bufferedDelegate = null;
+ }
+ this.digest = digest;
+ }
+
+ public Checksum digest() {
+ return digest;
+ }
+
+ public IndexOutput underlying() {
+ return this.delegate;
+ }
+
+ // don't override it, the base class method simply reads from the input and writes to this output
+// @Override public void copyBytes(IndexInput input, long numBytes) throws IOException {
+// delegate.copyBytes(input, numBytes);
+// }
+
+ @Override
+ public void close() throws IOException {
+ try {
+ super.close();
+ } finally {
+ delegate.close();
+ }
+
+ }
+
+ @Override
+ protected void flushBuffer(byte[] b, int offset, int len) throws IOException {
+ if (bufferedDelegate != null) {
+ bufferedDelegate.flushBuffer(b, offset, len);
+ } else {
+ delegate.writeBytes(b, offset, len);
+ }
+ digest.update(b, offset, len);
+ }
+
+ @Override
+ public void flush() throws IOException {
+ try {
+ super.flush();
+ } finally {
+ delegate.flush();
+ }
+ }
+
+ @Override
+ public void seek(long pos) throws IOException {
+ // seek might be called on files, which means that the checksum is not a file checksum
+ // but a checksum of the bytes written to this stream, which is the same for each
+ // type of file in Lucene
+ super.seek(pos);
+ delegate.seek(pos);
+ }
+
+ @Override
+ public long length() throws IOException {
+ return delegate.length();
+ }
+
+ @Override
+ public void setLength(long length) throws IOException {
+ delegate.setLength(length);
+ }
+
+ @Override
+ public String toString() {
+ return delegate.toString();
+ }
+}
diff --git a/src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java b/src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java
new file mode 100644
index 0000000..814a75f
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.lucene.store;
+
+import org.apache.lucene.store.IOContext.Context;
+
+import java.io.IOException;
+
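+/**
+ * A {@link FilterDirectory} that rate limits writes: depending on the configured
+ * {@link StoreRateLimiting.Type}, index outputs created for merges (or for all
+ * contexts) are wrapped so that their flushes pause via the provided {@link RateLimiter}.
+ */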
+public final class RateLimitedFSDirectory extends FilterDirectory {
+
+ private final StoreRateLimiting.Provider rateLimitingProvider;
+
+ private final StoreRateLimiting.Listener rateListener;
+
+ public RateLimitedFSDirectory(FSDirectory wrapped, StoreRateLimiting.Provider rateLimitingProvider,
+ StoreRateLimiting.Listener rateListener) {
+ super(wrapped);
+ this.rateLimitingProvider = rateLimitingProvider;
+ this.rateListener = rateListener;
+ }
+
+ @Override
+ public IndexOutput createOutput(String name, IOContext context) throws IOException {
+ final IndexOutput output = in.createOutput(name, context);
+
+ StoreRateLimiting rateLimiting = rateLimitingProvider.rateLimiting();
+ StoreRateLimiting.Type type = rateLimiting.getType();
+ RateLimiter limiter = rateLimiting.getRateLimiter();
+ if (type == StoreRateLimiting.Type.NONE || limiter == null) {
+ return output;
+ }
+ if (context.context == Context.MERGE) {
+ // we are merging, and type is either MERGE or ALL; rate limit...
+ return new RateLimitedIndexOutput(limiter, rateListener, output);
+ }
+ if (type == StoreRateLimiting.Type.ALL) {
+ return new RateLimitedIndexOutput(limiter, rateListener, output);
+ }
+ // we shouldn't really get here...
+ return output;
+ }
+
+
+ @Override
+ public void close() throws IOException {
+ in.close();
+ }
+
+ @Override
+ public String toString() {
+ StoreRateLimiting rateLimiting = rateLimitingProvider.rateLimiting();
+ StoreRateLimiting.Type type = rateLimiting.getType();
+ RateLimiter limiter = rateLimiting.getRateLimiter();
+ if (type == StoreRateLimiting.Type.NONE || limiter == null) {
+ return StoreUtils.toString(in);
+ } else {
+ return "rate_limited(" + StoreUtils.toString(in) + ", type=" + type.name() + ", rate=" + limiter.getMbPerSec() + ")";
+ }
+ }
+
+
+ static final class RateLimitedIndexOutput extends BufferedIndexOutput {
+
+ private final IndexOutput delegate;
+ private final BufferedIndexOutput bufferedDelegate;
+ private final RateLimiter rateLimiter;
+ private final StoreRateLimiting.Listener rateListener;
+
+ RateLimitedIndexOutput(final RateLimiter rateLimiter, final StoreRateLimiting.Listener rateListener, final IndexOutput delegate) {
+ super(delegate instanceof BufferedIndexOutput ? ((BufferedIndexOutput) delegate).getBufferSize() : BufferedIndexOutput.DEFAULT_BUFFER_SIZE);
+ if (delegate instanceof BufferedIndexOutput) {
+ bufferedDelegate = (BufferedIndexOutput) delegate;
+ this.delegate = delegate;
+ } else {
+ this.delegate = delegate;
+ bufferedDelegate = null;
+ }
+ this.rateLimiter = rateLimiter;
+ this.rateListener = rateListener;
+ }
+
+ @Override
+ protected void flushBuffer(byte[] b, int offset, int len) throws IOException {
+ rateListener.onPause(rateLimiter.pause(len));
+ if (bufferedDelegate != null) {
+ bufferedDelegate.flushBuffer(b, offset, len);
+ } else {
+ delegate.writeBytes(b, offset, len);
+ }
+
+ }
+
+ @Override
+ public long length() throws IOException {
+ return delegate.length();
+ }
+
+ @Override
+ public void seek(long pos) throws IOException {
+ flush();
+ delegate.seek(pos);
+ }
+
+ @Override
+ public void flush() throws IOException {
+ try {
+ super.flush();
+ } finally {
+ delegate.flush();
+ }
+ }
+
+ @Override
+ public void setLength(long length) throws IOException {
+ delegate.setLength(length);
+ }
+
+ @Override
+ public void close() throws IOException {
+ try {
+ super.close();
+ } finally {
+ delegate.close();
+ }
+ }
+ }
+}
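
For context, a minimal usage sketch of the directory above: wrap an FSDirectory and throttle merge writes. This is not part of the patch; the 20 MB/s rate, the on-disk path, the AtomicLong pause counter and the org.elasticsearch ByteSizeValue constructor are illustrative assumptions.

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RateLimitedFSDirectory;
import org.apache.lucene.store.StoreRateLimiting;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;

import java.io.File;
import java.util.concurrent.atomic.AtomicLong;

public class RateLimitedDirectoryExample {
    public static void main(String[] args) throws Exception {
        // shared, mutable rate limiting settings: throttle merge writes to ~20 MB/s
        final StoreRateLimiting rateLimiting = new StoreRateLimiting();
        rateLimiting.setType(StoreRateLimiting.Type.MERGE);
        rateLimiting.setMaxRate(new ByteSizeValue(20, ByteSizeUnit.MB));

        // accumulate the total time (in nanos) that writes were paused by the limiter
        final AtomicLong totalPauseNanos = new AtomicLong();

        FSDirectory wrapped = FSDirectory.open(new File("/tmp/rate-limited-index"));
        Directory dir = new RateLimitedFSDirectory(wrapped,
                new StoreRateLimiting.Provider() {
                    @Override
                    public StoreRateLimiting rateLimiting() {
                        return rateLimiting;
                    }
                },
                new StoreRateLimiting.Listener() {
                    @Override
                    public void onPause(long nanos) {
                        totalPauseNanos.addAndGet(nanos);
                    }
                });

        // outputs opened with IOContext.Context.MERGE are now wrapped in RateLimitedIndexOutput
        System.out.println(dir); // rate_limited(..., type=MERGE, rate=20.0)
        dir.close();
    }
}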
diff --git a/src/main/java/org/apache/lucene/store/StoreRateLimiting.java b/src/main/java/org/apache/lucene/store/StoreRateLimiting.java
new file mode 100644
index 0000000..8b745ef
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/StoreRateLimiting.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.lucene.store;
+
+import org.apache.lucene.store.RateLimiter.SimpleRateLimiter;
+import org.elasticsearch.ElasticsearchIllegalArgumentException;
+import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.unit.ByteSizeValue;
+
+/**
+ */
+public class StoreRateLimiting {
+
+ public static interface Provider {
+
+ StoreRateLimiting rateLimiting();
+ }
+
+ public interface Listener {
+
+ void onPause(long nanos);
+ }
+
+ public static enum Type {
+ NONE,
+ MERGE,
+ ALL;
+
+ public static Type fromString(String type) throws ElasticsearchIllegalArgumentException {
+ if ("none".equalsIgnoreCase(type)) {
+ return NONE;
+ } else if ("merge".equalsIgnoreCase(type)) {
+ return MERGE;
+ } else if ("all".equalsIgnoreCase(type)) {
+ return ALL;
+ }
+ throw new ElasticsearchIllegalArgumentException("rate limiting type [" + type + "] not valid, can be one of [all|merge|none]");
+ }
+ }
+
+ private final SimpleRateLimiter rateLimiter = new SimpleRateLimiter(0);
+ private volatile SimpleRateLimiter actualRateLimiter;
+
+ private volatile Type type;
+
+ public StoreRateLimiting() {
+
+ }
+
+ @Nullable
+ public RateLimiter getRateLimiter() {
+ return actualRateLimiter;
+ }
+
+ public void setMaxRate(ByteSizeValue rate) {
+ if (rate.bytes() <= 0) {
+ actualRateLimiter = null;
+ } else if (actualRateLimiter == null) {
+ actualRateLimiter = rateLimiter;
+ actualRateLimiter.setMbPerSec(rate.mbFrac());
+ } else {
+ assert rateLimiter == actualRateLimiter;
+ rateLimiter.setMbPerSec(rate.mbFrac());
+ }
+ }
+
+ public Type getType() {
+ return type;
+ }
+
+ public void setType(Type type) {
+ this.type = type;
+ }
+
+ public void setType(String type) throws ElasticsearchIllegalArgumentException {
+ this.type = Type.fromString(type);
+ }
+}
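
A short sketch of the limiter's lifecycle on its own: because wrapping directories re-read getType() and getRateLimiter() on every createOutput call, retuning takes effect for outputs opened afterwards, and a rate of zero or less disables limiting. The concrete values below are assumptions for illustration only.

import org.apache.lucene.store.StoreRateLimiting;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;

public class StoreRateLimitingExample {
    public static void main(String[] args) throws Exception {
        StoreRateLimiting limiting = new StoreRateLimiting();
        limiting.setType("merge");                                   // parsed via Type.fromString
        limiting.setMaxRate(new ByteSizeValue(20, ByteSizeUnit.MB)); // ~20 MB/s for merges

        System.out.println(limiting.getType());                      // MERGE
        System.out.println(limiting.getRateLimiter().getMbPerSec()); // 20.0

        limiting.setMaxRate(new ByteSizeValue(0, ByteSizeUnit.MB));  // rate <= 0 disables the limiter
        System.out.println(limiting.getRateLimiter());               // null
    }
}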
diff --git a/src/main/java/org/apache/lucene/store/StoreUtils.java b/src/main/java/org/apache/lucene/store/StoreUtils.java
new file mode 100644
index 0000000..5364970
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/StoreUtils.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.lucene.store;
+
+/**
+ */
+public final class StoreUtils {
+
+ private StoreUtils() {
+
+ }
+
+ public static String toString(Directory directory) {
+ if (directory instanceof NIOFSDirectory) {
+ NIOFSDirectory niofsDirectory = (NIOFSDirectory)directory;
+ return "niofs(" + niofsDirectory.getDirectory() + ")";
+ }
+ if (directory instanceof MMapDirectory) {
+ MMapDirectory mMapDirectory = (MMapDirectory)directory;
+ return "mmapfs(" + mMapDirectory.getDirectory() + ")";
+ }
+ if (directory instanceof SimpleFSDirectory) {
+ SimpleFSDirectory simpleFSDirectory = (SimpleFSDirectory)directory;
+ return "simplefs(" + simpleFSDirectory.getDirectory() + ")";
+ }
+ return directory.toString();
+ }
+}
diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferAllocator.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferAllocator.java
new file mode 100644
index 0000000..27b4a1a
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferAllocator.java
@@ -0,0 +1,97 @@
+package org.apache.lucene.store.bytebuffer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.lang.reflect.Method;
+import java.nio.ByteBuffer;
+
+/**
+ * A byte buffer allocator simply allocates byte buffers and handles releasing
+ * them. Implementations can include special direct buffer cleaning when releasing
+ * a buffer, as well as caching of byte buffers.
+ * <p/>
+ * <p>There are two types of buffers that can be allocated, small and large. This
+ * comes in handy when the size of the buffers needed (large files or small) is known
+ * more or less in advance, as well as in caching implementations.
+ */
+public interface ByteBufferAllocator {
+
+ /**
+     * Helper class for allocator implementations that need to clean (free) direct buffers.
+ */
+ public static class Cleaner {
+ public static final boolean CLEAN_SUPPORTED;
+ private static final Method directBufferCleaner;
+ private static final Method directBufferCleanerClean;
+
+ static {
+ Method directBufferCleanerX = null;
+ Method directBufferCleanerCleanX = null;
+ boolean v;
+ try {
+ directBufferCleanerX = Class.forName("java.nio.DirectByteBuffer").getMethod("cleaner");
+ directBufferCleanerX.setAccessible(true);
+ directBufferCleanerCleanX = Class.forName("sun.misc.Cleaner").getMethod("clean");
+ directBufferCleanerCleanX.setAccessible(true);
+ v = true;
+ } catch (Exception e) {
+ v = false;
+ }
+ CLEAN_SUPPORTED = v;
+ directBufferCleaner = directBufferCleanerX;
+ directBufferCleanerClean = directBufferCleanerCleanX;
+ }
+
+ public static void clean(ByteBuffer buffer) {
+ if (CLEAN_SUPPORTED && buffer.isDirect()) {
+ try {
+ Object cleaner = directBufferCleaner.invoke(buffer);
+ directBufferCleanerClean.invoke(cleaner);
+ } catch (Exception e) {
+ // silently ignore exception
+ }
+ }
+ }
+ }
+
+ public static enum Type {
+ SMALL,
+ LARGE
+ }
+
+ /**
+ * The size (in bytes) that is allocated for the provided type.
+ */
+ int sizeInBytes(Type type);
+
+ /**
+ * Allocate a byte buffer for the specific type.
+ */
+ ByteBuffer allocate(Type type) throws IOException;
+
+ /**
+ * Release the buffer.
+ */
+ void release(ByteBuffer buffer);
+
+ /**
+ * Close the allocator, releasing any cached buffers for example.
+ */
+ void close();
+}
diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferDirectory.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferDirectory.java
new file mode 100644
index 0000000..9403085
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferDirectory.java
@@ -0,0 +1,182 @@
+package org.apache.lucene.store.bytebuffer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import com.google.common.collect.ImmutableSet;
+import org.apache.lucene.store.*;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+
+/**
+ * A memory based directory that uses {@link java.nio.ByteBuffer} in order to store the directory content.
+ * <p/>
+ * <p>The benefit of using {@link java.nio.ByteBuffer} is the fact that it can be stored in "native" memory
+ * outside of the JVM heap, thus not incurring the GC overhead of a large in-memory index.
+ * <p/>
+ * <p>Each "file" is segmented into one or more byte buffers.
+ * <p/>
+ * <p>If constructed with a {@link ByteBufferAllocator}, it allows control over the allocation and release of
+ * byte buffers. For example, custom implementations can include caching of byte buffers.
+ */
+public class ByteBufferDirectory extends BaseDirectory {
+
+ protected final Map<String, ByteBufferFile> files = new ConcurrentHashMap<String, ByteBufferFile>();
+
+ private final ByteBufferAllocator allocator;
+
+ private final boolean internalAllocator;
+
+ final AtomicLong sizeInBytes = new AtomicLong();
+
+
+ /**
+ * Constructs a new directory using {@link PlainByteBufferAllocator}.
+ */
+ public ByteBufferDirectory() {
+ this.allocator = new PlainByteBufferAllocator(false, 1024, 1024 * 10);
+ this.internalAllocator = true;
+ try {
+ setLockFactory(new SingleInstanceLockFactory());
+ } catch (IOException e) {
+ // will not happen
+ }
+ }
+
+ /**
+ * Constructs a new byte buffer directory with a custom allocator.
+ */
+ public ByteBufferDirectory(ByteBufferAllocator allocator) {
+ this.allocator = allocator;
+ this.internalAllocator = false;
+ try {
+ setLockFactory(new SingleInstanceLockFactory());
+ } catch (IOException e) {
+ // will not happen
+ }
+ }
+
+ /**
+     * Returns the size in bytes of the directory, counted in whole buffer increments.
+ */
+ public long sizeInBytes() {
+ return sizeInBytes.get();
+ }
+
+ public void sync(Collection<String> names) throws IOException {
+ // nothing to do here
+ }
+
+ @Override
+ public String[] listAll() throws IOException {
+ return files.keySet().toArray(new String[0]);
+ }
+
+ @Override
+ public boolean fileExists(String name) throws IOException {
+ return files.containsKey(name);
+ }
+
+ @Override
+ public void deleteFile(String name) throws IOException {
+ ByteBufferFile file = files.remove(name);
+ if (file == null)
+ throw new FileNotFoundException(name);
+ sizeInBytes.addAndGet(-file.sizeInBytes());
+ file.delete();
+ }
+
+ @Override
+ public long fileLength(String name) throws IOException {
+ ByteBufferFile file = files.get(name);
+ if (file == null)
+ throw new FileNotFoundException(name);
+ return file.getLength();
+ }
+
+ private final static ImmutableSet<String> SMALL_FILES_SUFFIXES = ImmutableSet.of(
+ "del", // 1 bit per doc
+ "cfe", // compound file metadata
+ "si", // segment info
+ "fnm" // field info (metadata like omit norms etc)
+ );
+
+ private static boolean isSmallFile(String fileName) {
+ if (fileName.startsWith("segments")) {
+ return true;
+ }
+ if (fileName.lastIndexOf('.') > 0) {
+ String suffix = fileName.substring(fileName.lastIndexOf('.') + 1);
+ return SMALL_FILES_SUFFIXES.contains(suffix);
+ }
+ return false;
+ }
+
+ @Override
+ public IndexOutput createOutput(String name, IOContext context) throws IOException {
+ ByteBufferAllocator.Type allocatorType = ByteBufferAllocator.Type.LARGE;
+ if (isSmallFile(name)) {
+ allocatorType = ByteBufferAllocator.Type.SMALL;
+ }
+ ByteBufferFileOutput file = new ByteBufferFileOutput(this, allocator.sizeInBytes(allocatorType));
+ ByteBufferFile existing = files.put(name, file);
+ if (existing != null) {
+ sizeInBytes.addAndGet(-existing.sizeInBytes());
+ existing.delete();
+ }
+ return new ByteBufferIndexOutput(this, name, allocator, allocatorType, file);
+ }
+
+ void closeOutput(String name, ByteBufferFileOutput file) {
+        // we replace the output file with a read-only file, with no sync
+ files.put(name, new ByteBufferFile(file));
+ }
+
+ @Override
+ public IndexInput openInput(String name, IOContext context) throws IOException {
+ ByteBufferFile file = files.get(name);
+ if (file == null)
+ throw new FileNotFoundException(name);
+ return new ByteBufferIndexInput(name, file);
+ }
+
+ @Override
+ public void close() throws IOException {
+ String[] files = listAll();
+ for (String file : files) {
+ deleteFile(file);
+ }
+ if (internalAllocator) {
+ allocator.close();
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "byte_buffer";
+ }
+
+ void releaseBuffer(ByteBuffer byteBuffer) {
+ allocator.release(byteBuffer);
+ }
+}
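
For context, a minimal round trip through the directory above using its default (heap based) allocator. This is a sketch, not part of the patch; the file name and payload are arbitrary.

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.bytebuffer.ByteBufferDirectory;

public class ByteBufferDirectoryExample {
    public static void main(String[] args) throws Exception {
        // default constructor: heap buffers via PlainByteBufferAllocator(false, 1024, 10 * 1024)
        Directory dir = new ByteBufferDirectory();

        byte[] payload = "hello byte buffers".getBytes("UTF-8");
        IndexOutput out = dir.createOutput("_0.dat", IOContext.DEFAULT);
        out.writeBytes(payload, 0, payload.length);
        out.close(); // replaces the writable file with a read-only ByteBufferFile

        IndexInput in = dir.openInput("_0.dat", IOContext.DEFAULT);
        byte[] read = new byte[(int) in.length()];
        in.readBytes(read, 0, read.length);
        in.close();

        System.out.println(new String(read, "UTF-8")); // hello byte buffers
        dir.close(); // deletes all files and releases their buffers
    }
}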
diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFile.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFile.java
new file mode 100644
index 0000000..b726d77
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFile.java
@@ -0,0 +1,102 @@
+package org.apache.lucene.store.bytebuffer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ */
+public class ByteBufferFile {
+
+ final ByteBufferDirectory dir;
+
+ final int bufferSize;
+
+ final List<ByteBuffer> buffers;
+
+ long length;
+
+ volatile long lastModified = System.currentTimeMillis();
+
+ final AtomicInteger refCount;
+
+ long sizeInBytes;
+
+ public ByteBufferFile(ByteBufferDirectory dir, int bufferSize) {
+ this.dir = dir;
+ this.bufferSize = bufferSize;
+ this.buffers = new ArrayList<ByteBuffer>();
+ this.refCount = new AtomicInteger(1);
+ }
+
+ ByteBufferFile(ByteBufferFile file) {
+ this.dir = file.dir;
+ this.bufferSize = file.bufferSize;
+ this.buffers = file.buffers;
+ this.length = file.length;
+ this.lastModified = file.lastModified;
+ this.refCount = file.refCount;
+ this.sizeInBytes = file.sizeInBytes;
+ }
+
+ public long getLength() {
+ return length;
+ }
+
+ public long getLastModified() {
+ return lastModified;
+ }
+
+ void setLastModified(long lastModified) {
+ this.lastModified = lastModified;
+ }
+
+ long sizeInBytes() {
+ return sizeInBytes;
+ }
+
+ ByteBuffer getBuffer(int index) {
+ return buffers.get(index);
+ }
+
+ int numBuffers() {
+ return buffers.size();
+ }
+
+ void delete() {
+ decRef();
+ }
+
+ void incRef() {
+ refCount.incrementAndGet();
+ }
+
+ void decRef() {
+ if (refCount.decrementAndGet() == 0) {
+ length = 0;
+ for (ByteBuffer buffer : buffers) {
+ dir.releaseBuffer(buffer);
+ }
+ buffers.clear();
+ sizeInBytes = 0;
+ }
+ }
+}
diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFileOutput.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFileOutput.java
new file mode 100644
index 0000000..7de55bf
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferFileOutput.java
@@ -0,0 +1,65 @@
+package org.apache.lucene.store.bytebuffer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.nio.ByteBuffer;
+
+/**
+ */
+public class ByteBufferFileOutput extends ByteBufferFile {
+
+ public ByteBufferFileOutput(ByteBufferDirectory dir, int bufferSize) {
+ super(dir, bufferSize);
+ }
+
+ @Override
+ public synchronized long getLength() {
+ return super.getLength();
+ }
+
+ @Override
+ public synchronized long getLastModified() {
+ return super.getLastModified();
+ }
+
+ synchronized void setLength(long length) {
+ this.length = length;
+ }
+
+ synchronized final void addBuffer(ByteBuffer buffer) {
+ buffers.add(buffer);
+ sizeInBytes += buffer.remaining();
+ dir.sizeInBytes.addAndGet(buffer.remaining());
+ }
+
+ @Override
+ synchronized ByteBuffer getBuffer(int index) {
+ return super.getBuffer(index);
+ }
+
+ @Override
+ synchronized int numBuffers() {
+ return super.numBuffers();
+ }
+
+ @Override
+ synchronized long sizeInBytes() {
+ return super.sizeInBytes();
+ }
+}
diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexInput.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexInput.java
new file mode 100644
index 0000000..aeba553
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexInput.java
@@ -0,0 +1,198 @@
+package org.apache.lucene.store.bytebuffer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.IndexInput;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.nio.BufferUnderflowException;
+import java.nio.ByteBuffer;
+
+/**
+ */
+public class ByteBufferIndexInput extends IndexInput {
+
+ private final static ByteBuffer EMPTY_BUFFER = ByteBuffer.allocate(0).asReadOnlyBuffer();
+
+ private final ByteBufferFile file;
+ private final long length;
+
+ private ByteBuffer currentBuffer;
+ private int currentBufferIndex;
+
+ private long bufferStart;
+ private final int BUFFER_SIZE;
+
+ private volatile boolean closed = false;
+
+ public ByteBufferIndexInput(String name, ByteBufferFile file) throws IOException {
+ super("BBIndexInput(name=" + name + ")");
+ this.file = file;
+ this.file.incRef();
+ this.length = file.getLength();
+ this.BUFFER_SIZE = file.bufferSize;
+
+ // make sure that we switch to the
+ // first needed buffer lazily
+ currentBufferIndex = -1;
+ currentBuffer = EMPTY_BUFFER;
+ }
+
+ @Override
+ public void close() {
+        // protect against double-closing the index input, since
+        // some tests do that...
+ if (closed) {
+ return;
+ }
+ closed = true;
+ file.decRef();
+ }
+
+ @Override
+ public long length() {
+ return length;
+ }
+
+ @Override
+ public short readShort() throws IOException {
+ try {
+ currentBuffer.mark();
+ return currentBuffer.getShort();
+ } catch (BufferUnderflowException e) {
+ currentBuffer.reset();
+ return super.readShort();
+ }
+ }
+
+ @Override
+ public int readInt() throws IOException {
+ try {
+ currentBuffer.mark();
+ return currentBuffer.getInt();
+ } catch (BufferUnderflowException e) {
+ currentBuffer.reset();
+ return super.readInt();
+ }
+ }
+
+ @Override
+ public long readLong() throws IOException {
+ try {
+ currentBuffer.mark();
+ return currentBuffer.getLong();
+ } catch (BufferUnderflowException e) {
+ currentBuffer.reset();
+ return super.readLong();
+ }
+ }
+
+ @Override
+ public byte readByte() throws IOException {
+ if (!currentBuffer.hasRemaining()) {
+ currentBufferIndex++;
+ switchCurrentBuffer(true);
+ }
+ return currentBuffer.get();
+ }
+
+ @Override
+ public void readBytes(byte[] b, int offset, int len) throws IOException {
+ while (len > 0) {
+ if (!currentBuffer.hasRemaining()) {
+ currentBufferIndex++;
+ switchCurrentBuffer(true);
+ }
+
+ int remainInBuffer = currentBuffer.remaining();
+ int bytesToCopy = len < remainInBuffer ? len : remainInBuffer;
+ currentBuffer.get(b, offset, bytesToCopy);
+ offset += bytesToCopy;
+ len -= bytesToCopy;
+ }
+ }
+
+ @Override
+ public long getFilePointer() {
+ return currentBufferIndex < 0 ? 0 : bufferStart + currentBuffer.position();
+ }
+
+ @Override
+ public void seek(long pos) throws IOException {
+ if (currentBuffer == EMPTY_BUFFER || pos < bufferStart || pos >= bufferStart + BUFFER_SIZE) {
+ currentBufferIndex = (int) (pos / BUFFER_SIZE);
+ switchCurrentBuffer(false);
+ }
+ try {
+ currentBuffer.position((int) (pos % BUFFER_SIZE));
+            // Grrr, need to wrap the IllegalArgumentException in an IOException since tests
+            // (if not other places) expect an IOException...
+ } catch (IllegalArgumentException e) {
+ IOException ioException = new IOException("seeking past position");
+ ioException.initCause(e);
+ throw ioException;
+ }
+ }
+
+ private void switchCurrentBuffer(boolean enforceEOF) throws IOException {
+ if (currentBufferIndex >= file.numBuffers()) {
+ // end of file reached, no more buffers left
+ if (enforceEOF) {
+ throw new EOFException("Read past EOF (resource: " + this + ")");
+ } else {
+ // Force EOF if a read takes place at this position
+ currentBufferIndex--;
+ currentBuffer.position(currentBuffer.limit());
+ }
+ } else {
+ ByteBuffer buffer = file.getBuffer(currentBufferIndex);
+ // we must duplicate (and make it read only while we are at it) since we need position and such to be independent
+ currentBuffer = buffer.asReadOnlyBuffer();
+ currentBuffer.position(0);
+ bufferStart = (long) BUFFER_SIZE * (long) currentBufferIndex;
+                // if we are at the tip, limit the current buffer to only what's available to read
+ long buflen = length - bufferStart;
+ if (buflen < BUFFER_SIZE) {
+ currentBuffer.limit((int) buflen);
+ }
+
+ // we need to enforce EOF here as well...
+ if (!currentBuffer.hasRemaining()) {
+ if (enforceEOF) {
+ throw new EOFException("Read past EOF (resource: " + this + ")");
+ } else {
+ // Force EOF if a read takes place at this position
+ currentBufferIndex--;
+ currentBuffer.position(currentBuffer.limit());
+ }
+ }
+ }
+ }
+
+ @Override
+ public IndexInput clone() {
+ ByteBufferIndexInput cloned = (ByteBufferIndexInput) super.clone();
+ cloned.file.incRef(); // inc ref on cloned one
+ if (currentBuffer != EMPTY_BUFFER) {
+ cloned.currentBuffer = currentBuffer.asReadOnlyBuffer();
+ cloned.currentBuffer.position(currentBuffer.position());
+ }
+ return cloned;
+ }
+}
diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexOutput.java b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexOutput.java
new file mode 100644
index 0000000..a667bab
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/bytebuffer/ByteBufferIndexOutput.java
@@ -0,0 +1,132 @@
+package org.apache.lucene.store.bytebuffer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.store.IndexOutput;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ */
+public class ByteBufferIndexOutput extends IndexOutput {
+
+ private final static ByteBuffer EMPTY_BUFFER = ByteBuffer.allocate(0).asReadOnlyBuffer();
+
+ private final ByteBufferDirectory dir;
+ private final String name;
+ private final ByteBufferAllocator allocator;
+ private final ByteBufferAllocator.Type allocatorType;
+ private final int BUFFER_SIZE;
+ private final ByteBufferFileOutput file;
+
+ private ByteBuffer currentBuffer;
+ private int currentBufferIndex;
+
+ private long bufferStart;
+
+ public ByteBufferIndexOutput(ByteBufferDirectory dir, String name, ByteBufferAllocator allocator, ByteBufferAllocator.Type allocatorType, ByteBufferFileOutput file) throws IOException {
+ this.dir = dir;
+ this.name = name;
+ this.allocator = allocator;
+ this.allocatorType = allocatorType;
+ this.BUFFER_SIZE = file.bufferSize;
+ this.file = file;
+
+ currentBufferIndex = -1;
+ currentBuffer = EMPTY_BUFFER;
+ }
+
+ @Override
+ public void close() throws IOException {
+ flush();
+ dir.closeOutput(name, file);
+ }
+
+ @Override
+ public void seek(long pos) throws IOException {
+ // set the file length in case we seek back
+ // and flush() has not been called yet
+ setFileLength();
+ if (pos < bufferStart || pos >= bufferStart + BUFFER_SIZE) {
+ currentBufferIndex = (int) (pos / BUFFER_SIZE);
+ switchCurrentBuffer();
+ }
+ currentBuffer.position((int) (pos % BUFFER_SIZE));
+ }
+
+ @Override
+ public long length() {
+ return file.getLength();
+ }
+
+ @Override
+ public void writeByte(byte b) throws IOException {
+ if (!currentBuffer.hasRemaining()) {
+ currentBufferIndex++;
+ switchCurrentBuffer();
+ }
+ currentBuffer.put(b);
+ }
+
+ @Override
+ public void writeBytes(byte[] b, int offset, int len) throws IOException {
+ while (len > 0) {
+ if (!currentBuffer.hasRemaining()) {
+ currentBufferIndex++;
+ switchCurrentBuffer();
+ }
+
+ int remainInBuffer = currentBuffer.remaining();
+ int bytesToCopy = len < remainInBuffer ? len : remainInBuffer;
+ currentBuffer.put(b, offset, bytesToCopy);
+ offset += bytesToCopy;
+ len -= bytesToCopy;
+ }
+ }
+
+ private void switchCurrentBuffer() throws IOException {
+ if (currentBufferIndex == file.numBuffers()) {
+ currentBuffer = allocator.allocate(allocatorType);
+ file.addBuffer(currentBuffer);
+ } else {
+ currentBuffer = file.getBuffer(currentBufferIndex);
+ }
+ currentBuffer.position(0);
+ bufferStart = (long) BUFFER_SIZE * (long) currentBufferIndex;
+ }
+
+ private void setFileLength() {
+ long pointer = bufferStart + currentBuffer.position();
+ if (pointer > file.getLength()) {
+ file.setLength(pointer);
+ }
+ }
+
+ @Override
+ public void flush() throws IOException {
+ file.setLastModified(System.currentTimeMillis());
+ setFileLength();
+ }
+
+ @Override
+ public long getFilePointer() {
+ return currentBufferIndex < 0 ? 0 : bufferStart + currentBuffer.position();
+ }
+}
diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/CachingByteBufferAllocator.java b/src/main/java/org/apache/lucene/store/bytebuffer/CachingByteBufferAllocator.java
new file mode 100644
index 0000000..925a76c
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/bytebuffer/CachingByteBufferAllocator.java
@@ -0,0 +1,82 @@
+package org.apache.lucene.store.bytebuffer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
+
+/**
+ * A byte buffer allocator that keeps a bounded cache of both small and large
+ * buffers, reusing released buffers when possible.
+ */
+public class CachingByteBufferAllocator extends PlainByteBufferAllocator {
+
+ private final BlockingQueue<ByteBuffer> smallCache;
+ private final BlockingQueue<ByteBuffer> largeCache;
+
+ /**
+ * @param direct If set to true, will allocate direct buffers (off heap).
+ * @param smallBufferSizeInBytes The size (in bytes) of the small buffer allocation.
+ * @param largeBufferSizeInBytes The size (in bytes) of the large buffer allocation.
+     * @param smallCacheSizeInBytes The total size (in bytes) of the small buffer cache.
+     * @param largeCacheSizeInBytes The total size (in bytes) of the large buffer cache.
+ */
+ public CachingByteBufferAllocator(boolean direct, int smallBufferSizeInBytes, int largeBufferSizeInBytes,
+ int smallCacheSizeInBytes, int largeCacheSizeInBytes) {
+ super(direct, smallBufferSizeInBytes, largeBufferSizeInBytes);
+ this.smallCache = new LinkedBlockingQueue<ByteBuffer>(smallCacheSizeInBytes / smallBufferSizeInBytes);
+ this.largeCache = new LinkedBlockingQueue<ByteBuffer>(largeCacheSizeInBytes / largeBufferSizeInBytes);
+ }
+
+
+ public ByteBuffer allocate(Type type) throws IOException {
+ ByteBuffer buffer = type == Type.SMALL ? smallCache.poll() : largeCache.poll();
+ if (buffer == null) {
+ buffer = super.allocate(type);
+ }
+ return buffer;
+ }
+
+ public void release(ByteBuffer buffer) {
+ if (buffer.capacity() == smallBufferSizeInBytes) {
+ boolean success = smallCache.offer(buffer);
+ if (!success) {
+ super.release(buffer);
+ }
+ } else if (buffer.capacity() == largeBufferSizeInBytes) {
+ boolean success = largeCache.offer(buffer);
+ if (!success) {
+ super.release(buffer);
+ }
+ }
+        // otherwise, just ignore it; it is not our allocation...
+ }
+
+ public void close() {
+ for (ByteBuffer buffer : smallCache) {
+ super.release(buffer);
+ }
+ smallCache.clear();
+ for (ByteBuffer buffer : largeCache) {
+ super.release(buffer);
+ }
+ largeCache.clear();
+ }
+}
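
A small sketch of the caching behaviour above: a released buffer goes back into the bounded cache and is handed out again by the next matching allocation. The buffer and cache sizes are arbitrary example values, not part of the patch.

import java.nio.ByteBuffer;

import org.apache.lucene.store.bytebuffer.ByteBufferAllocator;
import org.apache.lucene.store.bytebuffer.CachingByteBufferAllocator;

public class CachingAllocatorExample {
    public static void main(String[] args) throws Exception {
        // 1 KB small buffers, 64 KB large buffers; cache up to 16 of each
        CachingByteBufferAllocator allocator =
                new CachingByteBufferAllocator(false, 1024, 64 * 1024, 16 * 1024, 16 * 64 * 1024);

        ByteBuffer first = allocator.allocate(ByteBufferAllocator.Type.LARGE);
        allocator.release(first);                     // goes into the large cache, not cleaned
        ByteBuffer second = allocator.allocate(ByteBufferAllocator.Type.LARGE);

        System.out.println(first == second);          // true: the cached buffer was reused
        allocator.close();                            // releases anything still cached
    }
}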
diff --git a/src/main/java/org/apache/lucene/store/bytebuffer/PlainByteBufferAllocator.java b/src/main/java/org/apache/lucene/store/bytebuffer/PlainByteBufferAllocator.java
new file mode 100644
index 0000000..e800b61
--- /dev/null
+++ b/src/main/java/org/apache/lucene/store/bytebuffer/PlainByteBufferAllocator.java
@@ -0,0 +1,67 @@
+package org.apache.lucene.store.bytebuffer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * A simple byte buffer allocator that does no caching. The direct flag
+ * controls whether the byte buffers will be allocated off heap or on the heap.
+ */
+public class PlainByteBufferAllocator implements ByteBufferAllocator {
+
+ protected final boolean direct;
+
+ protected final int smallBufferSizeInBytes;
+
+ protected final int largeBufferSizeInBytes;
+
+ /**
+ * Constructs a new plain byte buffer allocator that does no caching.
+ *
+ * @param direct If set to true, will allocate direct buffers (off heap).
+ * @param smallBufferSizeInBytes The size (in bytes) of the small buffer allocation.
+ * @param largeBufferSizeInBytes The size (in bytes) of the large buffer allocation.
+ */
+ public PlainByteBufferAllocator(boolean direct, int smallBufferSizeInBytes, int largeBufferSizeInBytes) {
+ this.direct = direct;
+ this.smallBufferSizeInBytes = smallBufferSizeInBytes;
+ this.largeBufferSizeInBytes = largeBufferSizeInBytes;
+ }
+
+ public int sizeInBytes(Type type) {
+ return type == Type.SMALL ? smallBufferSizeInBytes : largeBufferSizeInBytes;
+ }
+
+ public ByteBuffer allocate(Type type) throws IOException {
+ int sizeToAllocate = type == Type.SMALL ? smallBufferSizeInBytes : largeBufferSizeInBytes;
+ if (direct) {
+ return ByteBuffer.allocateDirect(sizeToAllocate);
+ }
+ return ByteBuffer.allocate(sizeToAllocate);
+ }
+
+ public void release(ByteBuffer buffer) {
+ Cleaner.clean(buffer);
+ }
+
+ public void close() {
+ // nothing to do here...
+ }
+}
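
Finally, a sketch of the plain allocator with direct (off-heap) buffers: releasing a buffer invokes the reflective direct-buffer cleaner when the JVM supports it. The sizes below are illustrative assumptions.

import java.nio.ByteBuffer;

import org.apache.lucene.store.bytebuffer.ByteBufferAllocator;
import org.apache.lucene.store.bytebuffer.PlainByteBufferAllocator;

public class PlainAllocatorExample {
    public static void main(String[] args) throws Exception {
        PlainByteBufferAllocator allocator = new PlainByteBufferAllocator(true, 1024, 64 * 1024);

        ByteBuffer buffer = allocator.allocate(ByteBufferAllocator.Type.SMALL);
        System.out.println(buffer.isDirect());   // true: allocated off heap
        System.out.println(buffer.capacity());   // 1024, i.e. sizeInBytes(Type.SMALL)

        allocator.release(buffer);               // Cleaner.clean frees the native memory eagerly
        allocator.close();                       // no-op for the plain allocator
    }
}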