summaryrefslogtreecommitdiff
path: root/src/main/java/org/elasticsearch/indices/analysis/HunspellService.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/elasticsearch/indices/analysis/HunspellService.java')
-rw-r--r--src/main/java/org/elasticsearch/indices/analysis/HunspellService.java257
1 files changed, 257 insertions, 0 deletions
diff --git a/src/main/java/org/elasticsearch/indices/analysis/HunspellService.java b/src/main/java/org/elasticsearch/indices/analysis/HunspellService.java
new file mode 100644
index 0000000..26e02c6
--- /dev/null
+++ b/src/main/java/org/elasticsearch/indices/analysis/HunspellService.java
@@ -0,0 +1,257 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.indices.analysis;
+
+import com.google.common.cache.CacheBuilder;
+import com.google.common.cache.CacheLoader;
+import com.google.common.cache.LoadingCache;
+import org.apache.lucene.analysis.hunspell.HunspellDictionary;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.common.component.AbstractComponent;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+
+import java.io.*;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Serves as a node level registry for hunspell dictionaries. This services expects all dictionaries to be located under
+ * the {@code <path.conf>/hunspell} directory, where each locale has its dedicated sub-directory which holds the dictionary
+ * files. For example, the dictionary files for {@code en_US} locale must be placed under {@code <path.conf>/hunspell/en_US}
+ * directory.
+ * <p/>
+ * The following settings can be set for each dictionary:
+ * <ul>
+ * <li>{@code ignore_case} - If true, dictionary matching will be case insensitive (defaults to {@code false})</li>
+ * <li>{@code strict_affix_parsing} - Determines whether errors while reading a affix rules file will cause exception or simple be ignored (defaults to {@code true})</li>
+ * </ul>
+ * <p/>
+ * These settings can either be configured as node level configuration, such as:
+ * <br/><br/>
+ * <pre><code>
+ * indices.analysis.hunspell.dictionary.en_US.ignore_case: true
+ * indices.analysis.hunspell.dictionary.en_US.strict_affix_parsing: false
+ * </code></pre>
+ * <p/>
+ * or, as dedicated configuration per dictionary, placed in a {@code settings.yml} file under the dictionary directory. For
+ * example, the following can be the content of the {@code <path.config>/hunspell/en_US/settings.yml} file:
+ * <br/><br/>
+ * <pre><code>
+ * ignore_case: true
+ * strict_affix_parsing: false
+ * </code></pre>
+ *
+ * @see org.elasticsearch.index.analysis.HunspellTokenFilterFactory
+ */
+public class HunspellService extends AbstractComponent {
+
+ private final static DictionaryFileFilter DIC_FILE_FILTER = new DictionaryFileFilter();
+ private final static AffixFileFilter AFFIX_FILE_FILTER = new AffixFileFilter();
+
+ private final LoadingCache<String, HunspellDictionary> dictionaries;
+ private final Map<String, HunspellDictionary> knownDictionaries;
+
+ private final boolean defaultIgnoreCase;
+ private final boolean defaultStrictAffixParsing;
+ private final File hunspellDir;
+
+ public HunspellService(final Settings settings, final Environment env) {
+ this(settings, env, Collections.<String, HunspellDictionary>emptyMap());
+ }
+
+ @Inject
+ public HunspellService(final Settings settings, final Environment env, final Map<String, HunspellDictionary> knownDictionaries) {
+ super(settings);
+ this.knownDictionaries = knownDictionaries;
+ this.hunspellDir = resolveHunspellDirectory(settings, env);
+ this.defaultIgnoreCase = settings.getAsBoolean("indices.analysis.hunspell.dictionary.ignore_case", false);
+ this.defaultStrictAffixParsing = settings.getAsBoolean("indices.analysis.hunspell.dictionary.strict_affix_parsing", false);
+ final Version version = Lucene.parseVersion(settings.get("indices.analysis.hunspell.version"), Lucene.ANALYZER_VERSION, logger);
+ dictionaries = CacheBuilder.newBuilder().build(new CacheLoader<String, HunspellDictionary>() {
+ @Override
+ public HunspellDictionary load(String locale) throws Exception {
+ HunspellDictionary dictionary = knownDictionaries.get(locale);
+ if (dictionary == null) {
+ dictionary = loadDictionary(locale, settings, env, version);
+ }
+ return dictionary;
+ }
+ });
+ scanAndLoadDictionaries();
+ }
+
+ /**
+ * Returns the hunspell dictionary for the given locale.
+ *
+ * @param locale The name of the locale
+ */
+ public HunspellDictionary getDictionary(String locale) {
+ return dictionaries.getUnchecked(locale);
+ }
+
+ private File resolveHunspellDirectory(Settings settings, Environment env) {
+ String location = settings.get("indices.analysis.hunspell.dictionary.location", null);
+ if (location != null) {
+ return new File(location);
+ }
+ return new File(env.configFile(), "hunspell");
+ }
+
+ /**
+ * Scans the hunspell directory and loads all found dictionaries
+ */
+ private void scanAndLoadDictionaries() {
+ if (hunspellDir.exists() && hunspellDir.isDirectory()) {
+ for (File file : hunspellDir.listFiles()) {
+ if (file.isDirectory()) {
+ if (file.list(AFFIX_FILE_FILTER).length > 0) { // just making sure it's indeed a dictionary dir
+ dictionaries.getUnchecked(file.getName());
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Loads the hunspell dictionary for the given local.
+ *
+ * @param locale The locale of the hunspell dictionary to be loaded.
+ * @param nodeSettings The node level settings
+ * @param env The node environment (from which the conf path will be resolved)
+ * @param version The lucene version
+ * @return The loaded Hunspell dictionary
+ * @throws Exception when loading fails (due to IO errors or malformed dictionary files)
+ */
+ private HunspellDictionary loadDictionary(String locale, Settings nodeSettings, Environment env, Version version) throws Exception {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Loading huspell dictionary [{}]...", locale);
+ }
+ File dicDir = new File(hunspellDir, locale);
+ if (!dicDir.exists() || !dicDir.isDirectory()) {
+ throw new ElasticsearchException(String.format(Locale.ROOT, "Could not find hunspell dictionary [%s]", locale));
+ }
+
+ // merging node settings with hunspell dictionary specific settings
+ nodeSettings = loadDictionarySettings(dicDir, nodeSettings.getByPrefix("indices.analysis.hunspell.dictionary." + locale + "."));
+
+ boolean ignoreCase = nodeSettings.getAsBoolean("ignore_case", defaultIgnoreCase);
+ boolean strictAffixParsing = nodeSettings.getAsBoolean("strict_affix_parsing", defaultStrictAffixParsing);
+
+ File[] affixFiles = dicDir.listFiles(AFFIX_FILE_FILTER);
+ if (affixFiles.length != 1) {
+ throw new ElasticsearchException(String.format(Locale.ROOT, "Missing affix file for hunspell dictionary [%s]", locale));
+ }
+ InputStream affixStream = null;
+
+ File[] dicFiles = dicDir.listFiles(DIC_FILE_FILTER);
+ List<InputStream> dicStreams = new ArrayList<InputStream>(dicFiles.length);
+ try {
+
+ for (int i = 0; i < dicFiles.length; i++) {
+ dicStreams.add(new FileInputStream(dicFiles[i]));
+ }
+
+ affixStream = new FileInputStream(affixFiles[0]);
+
+ return new HunspellDictionary(affixStream, dicStreams, version, ignoreCase, strictAffixParsing);
+
+ } catch (Exception e) {
+ logger.error("Could not load hunspell dictionary [{}]", e, locale);
+ throw e;
+ } finally {
+ if (affixStream != null) {
+ try {
+ affixStream.close();
+ } catch (IOException e) {
+ // nothing much we can do here
+ }
+ }
+ for (InputStream in : dicStreams) {
+ if (in != null) {
+ try {
+ in.close();
+ } catch (IOException e) {
+ // nothing much we can do here
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Each hunspell dictionary directory may contain a {@code settings.yml} which holds dictionary specific settings. Default
+ * values for these settings are defined in the given default settings.
+ *
+ * @param dir The directory of the dictionary
+ * @param defaults The default settings for this dictionary
+ * @return The resolved settings.
+ */
+ private static Settings loadDictionarySettings(File dir, Settings defaults) {
+ File file = new File(dir, "settings.yml");
+ if (file.exists()) {
+ try {
+ return ImmutableSettings.settingsBuilder().loadFromUrl(file.toURI().toURL()).put(defaults).build();
+ } catch (MalformedURLException e) {
+ throw new ElasticsearchException(String.format(Locale.ROOT, "Could not load hunspell dictionary settings from [%s]", file.getAbsolutePath()), e);
+ }
+ }
+
+ file = new File(dir, "settings.json");
+ if (file.exists()) {
+ try {
+ return ImmutableSettings.settingsBuilder().loadFromUrl(file.toURI().toURL()).put(defaults).build();
+ } catch (MalformedURLException e) {
+ throw new ElasticsearchException(String.format(Locale.ROOT, "Could not load hunspell dictionary settings from [%s]", file.getAbsolutePath()), e);
+ }
+ }
+
+ return defaults;
+ }
+
+ /**
+ * Only accepts {@code *.dic} files
+ */
+ static class DictionaryFileFilter implements FilenameFilter {
+ @Override
+ public boolean accept(File dir, String name) {
+ return name.toLowerCase(Locale.ROOT).endsWith(".dic");
+ }
+ }
+
+ /**
+ * Only accepts {@code *.aff} files
+ */
+ static class AffixFileFilter implements FilenameFilter {
+ @Override
+ public boolean accept(File dir, String name) {
+ return name.toLowerCase(Locale.ROOT).endsWith(".aff");
+ }
+ }
+
+}
+