summaryrefslogtreecommitdiff
path: root/ept
diff options
context:
space:
mode:
authorEnrico Zini <enrico@enricozini.org>2010-05-10 17:50:28 +0100
committerEnrico Zini <enrico@enricozini.org>2010-05-10 17:50:28 +0100
commit1b5ac4e66699cacde7123da3b111a0a16ee55db4 (patch)
tree63d345bf5d93f5ed3ab639571d60ccd8c2b2b331 /ept
parent80d13236ef7014f077a5793a9cb3e9a0573ffbe1 (diff)
downloadlibept-1b5ac4e66699cacde7123da3b111a0a16ee55db4.tar.gz
Replaced textindex with axi, which contains just a handful of helper functions to handle apt-xapian-index
Diffstat (limited to 'ept')
-rw-r--r--ept/CMakeLists.txt13
-rw-r--r--ept/axi/axi.cc (renamed from ept/textsearch/extraindexers.cc)53
-rw-r--r--ept/axi/axi.h89
-rw-r--r--ept/axi/axi.test.h (renamed from ept/textsearch/extraindexers.h)55
-rw-r--r--ept/textsearch/maint/path.cc93
-rw-r--r--ept/textsearch/maint/path.h74
-rw-r--r--ept/textsearch/textsearch.cc256
-rw-r--r--ept/textsearch/textsearch.h219
-rw-r--r--ept/textsearch/textsearch.test.h166
9 files changed, 160 insertions, 858 deletions
diff --git a/ept/CMakeLists.txt b/ept/CMakeLists.txt
index e24d6ef..ed9b6c7 100644
--- a/ept/CMakeLists.txt
+++ b/ept/CMakeLists.txt
@@ -2,8 +2,7 @@ project( ept )
include( ${WIBBLE_TEST_CMAKE} )
file( GLOB src *.cpp debtags/*.cc debtags/maint/*.cc
- popcon/*.cc popcon/maint/*.cc apt/*.cc textsearch/*.cc
- textsearch/maint/*.cc )
+ popcon/*.cc popcon/maint/*.cc apt/*.cc axi/*.cc )
file( GLOB h_top *.h )
file( GLOB h_apt apt/*.h )
@@ -11,15 +10,14 @@ file( GLOB h_debtags debtags/*.h debtags/*.tcc )
file( GLOB h_debtags_maint debtags/maint/*.h debtags/maint/*.tcc )
file( GLOB h_popcon popcon/*.h )
file( GLOB h_popcon_maint popcon/maint/*.h )
-file( GLOB h_textsearch textsearch/*.h )
-file( GLOB h_textsearch_maint textsearch/maint/*.h )
+file( GLOB h_axi axi/*.h )
file( GLOB debtagstesth debtags/*.test.h debtags/maint/*.test.h )
file( GLOB popcontesth popcon/*.test.h )
file( GLOB apttesth apt/*.test.h )
-file( GLOB textsearchtesth textsearch/*.test.h )
+file( GLOB axitesth axi/*.test.h )
set( testh ${debtagstesth} ${popcontesth}
- ${apttesth} ${textsearchtesth} )
+ ${apttesth} ${axitesth} )
include_directories( ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR}
${TAGCOLL_INCLUDE_DIRS} ${WIBBLE_INCLUDE_DIRS} )
@@ -90,5 +88,4 @@ install( FILES ${h_debtags} DESTINATION include/ept/debtags )
install( FILES ${h_debtags_maint} DESTINATION include/ept/debtags/maint )
install( FILES ${h_popcon} DESTINATION include/ept/popcon )
install( FILES ${h_popcon_maint} DESTINATION include/ept/popcon/maint )
-install( FILES ${h_textsearch} DESTINATION include/ept/textsearch )
-install( FILES ${h_textsearch_maint} DESTINATION include/ept/textsearch/maint )
+install( FILES ${h_axi} DESTINATION include/ept/axi )
diff --git a/ept/textsearch/extraindexers.cc b/ept/axi/axi.cc
index 179688f..a9b3203 100644
--- a/ept/textsearch/extraindexers.cc
+++ b/ept/axi/axi.cc
@@ -22,37 +22,54 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <ept/textsearch/extraindexers.h>
-#include <ept/apt/packagerecord.h>
-#include <ept/debtags/debtags.h>
+#include <ept/config.h>
+#include <ept/axi/axi.h>
+
+#include <wibble/exception.h>
+#include <wibble/string.h>
+#include <wibble/sys/fs.h>
+#include <memory>
using namespace std;
-using namespace ept::debtags;
+using namespace wibble;
namespace ept {
-namespace textsearch {
+namespace axi {
+
+static std::string m_index_dir = TEXTSEARCH_DB_DIR;
-void AptTagsExtraIndexer::operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const
+std::string path_dir()
{
- // Index tags as well
- set<string> tags = rec.tag();
- for (set<string>::const_iterator ti = tags.begin();
- ti != tags.end(); ++ti)
- doc.add_term("XT"+*ti);
+ return m_index_dir;
}
-void DebtagsExtraIndexer::operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const
+std::string path_db()
{
- // Index tags as well
- set<std::string> tags = debtags.getTagsOfItem(doc.get_data());
- for (set<std::string>::const_iterator ti = tags.begin();
- ti != tags.end(); ++ti)
- doc.add_term("XT"+*ti);
+ return str::joinpath(m_index_dir, "/index");
}
+time_t timestamp()
+{
+ string tsfile = str::joinpath(m_index_dir, "update-timestamp");
+ std::auto_ptr<struct stat> st = sys::fs::stat(tsfile);
+ if (st.get())
+ return st->st_mtime;
+ else
+ return 0;
}
+
+
+OverrideIndexDir::OverrideIndexDir(const std::string& path) : old(m_index_dir)
+{
+ m_index_dir = path;
}
-#include <ept/debtags/debtags.tcc>
+OverrideIndexDir::~OverrideIndexDir()
+{
+ m_index_dir = old;
+}
+
+}
+}
// vim:set ts=4 sw=4:
diff --git a/ept/axi/axi.h b/ept/axi/axi.h
new file mode 100644
index 0000000..29a9686
--- /dev/null
+++ b/ept/axi/axi.h
@@ -0,0 +1,89 @@
+#ifndef EPT_TEXTSEARCH_TEXTSEARCH_H
+#define EPT_TEXTSEARCH_TEXTSEARCH_H
+
+/** @file
+ * @author Enrico Zini <enrico@enricozini.org>
+ * Helper functions to handle apt-xapian-index
+ */
+
+/*
+ * Copyright (C) 2007 Enrico Zini <enrico@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <xapian.h>
+#include <string>
+
+namespace ept {
+
+/**
+ * Helpers to locate an existing apt-xapian-index and read its update timestamp.
+ *
+ * Contrary to Debtags and Popcon, axi does not attempt to create the
+ * index in the home directory if no system index is found and it is not
+ * running as root: this is to avoid secretly building large indexes (>50Mb)
+ * in the home directory of users.
+ *
+ * The idea then is to have root keep the index up to date, possibly running a
+ * reindexing tool once a day, or after an apt-get update.
+ *
+ * This works because the full text search index is useful even if it is
+ * slightly out of date.
+ */
+namespace axi {
+
+// Allocate value indexes for known values
+const Xapian::valueno VAL_APT_INSTALLED_SIZE = 1;
+const Xapian::valueno VAL_APT_PACKAGE_SIZE = 2;
+const Xapian::valueno VAL_POPCON = 10;
+const Xapian::valueno VAL_ITERATING_RATING = 20;
+const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21;
+const Xapian::valueno VAL_ITERATING_USABILITY = 22;
+const Xapian::valueno VAL_ITERATING_SECURITY = 23;
+const Xapian::valueno VAL_ITERATING_PERFORMANCE = 24;
+const Xapian::valueno VAL_ITERATING_QUALITY = 25;
+const Xapian::valueno VAL_ITERATING_SUPPORT = 26;
+const Xapian::valueno VAL_ITERATING_ADOPTION = 27;
+// If you need to index a value and cannot edit this file, feel free to use any
+// value starting from 1000000
+
+/// Return the path to the Apt Xapian index root directory
+std::string path_dir();
+
+/// Return the path to the Apt Xapian index Xapian database
+std::string path_db();
+
+/// Return the last update timestamp of the index, or 0 if it has no update-timestamp file
+time_t timestamp();
+
+
+/**
+ * RAII temporary override of the location of the index root
+ * directory, used for tests
+ */
+class OverrideIndexDir
+{
+ std::string old;
+public:
+ OverrideIndexDir(const std::string& path);
+ ~OverrideIndexDir();
+};
+
+}
+}
+
+// vim:set ts=4 sw=4:
+#endif
diff --git a/ept/textsearch/extraindexers.h b/ept/axi/axi.test.h
index ce1d042..5481bd8 100644
--- a/ept/textsearch/extraindexers.h
+++ b/ept/axi/axi.test.h
@@ -1,12 +1,7 @@
-#ifndef EPT_TEXTSEARCH_EXTRAINDEXERS_H
-#define EPT_TEXTSEARCH_EXTRAINDEXERS_H
-
-/** @file
- * @author Enrico Zini <enrico@enricozini.org>
- * Fast full-text search
- */
-
+// -*- mode: c++; tab-width: 4; indent-tabs-mode: t -*-
/*
+ * axi test
+ *
* Copyright (C) 2007 Enrico Zini <enrico@debian.org>
*
* This program is free software; you can redistribute it and/or modify
@@ -24,28 +19,40 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <ept/textsearch/textsearch.h>
+#include <ept/test.h>
+#include <ept/axi/axi.h>
+#include <ept/apt/apt.h>
+#include <wibble/sys/fs.h>
+#include <set>
-namespace ept {
-namespace debtags {
-class Debtags;
-}
-namespace textsearch {
+using namespace std;
+using namespace ept;
-struct AptTagsExtraIndexer : public TextSearch::ExtraIndexer
+struct DirMaker
{
- virtual void operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const;
+ DirMaker(const std::string& name)
+ {
+ wibble::sys::fs::mkdirIfMissing(name, 0755);
+ }
};
-struct DebtagsExtraIndexer : public TextSearch::ExtraIndexer
+struct TestAxi : AptTestEnvironment
{
- const debtags::Debtags& debtags;
- DebtagsExtraIndexer(const debtags::Debtags& debtags) : debtags(debtags) {}
- virtual void operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const;
+ DirMaker md;
+ axi::OverrideIndexDir oid;
+ apt::Apt apt;
+
+ TestAxi()
+ : md( TEST_ENV_DIR "xapian"), oid( TEST_ENV_DIR "xapian")
+ {
+ }
+
+// Access an empty index
+ Test empty()
+ {
+ axi::OverrideIndexDir oid("./empty");
+ assert_eq(axi::timestamp(), 0);
+ }
};
-}
-}
-
// vim:set ts=4 sw=4:
-#endif
diff --git a/ept/textsearch/maint/path.cc b/ept/textsearch/maint/path.cc
deleted file mode 100644
index 4b82ba1..0000000
--- a/ept/textsearch/maint/path.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-// -*- mode: c++; indent-tabs-mode: t -*-
-
-/** \file
- * popcon paths
- */
-
-/*
- * Copyright (C) 2005,2006,2007 Enrico Zini <enrico@debian.org>, Peter Rockai <me@mornfall.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <ept/config.h>
-#include <ept/textsearch/maint/path.h>
-
-#include <wibble/exception.h>
-#include <wibble/sys/fs.h>
-#include <wibble/string.h>
-
-#include <cstdio>
-#include <cerrno>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-using namespace std;
-using namespace wibble;
-
-namespace ept {
-namespace textsearch {
-
-Path &Path::instance() {
- if (!s_instance) {
- s_instance = new Path;
- instance().m_indexDir = TEXTSEARCH_DB_DIR;
- }
- return *s_instance;
-}
-
-int Path::access( const std::string &s, int m )
-{
- return ::access( s.c_str(), m );
-}
-
-time_t Path::indexTimestamp()
-{
- string tsfile = str::joinpath(instance().indexDir(), "update-timestamp");
- std::auto_ptr<struct stat> st = wibble::sys::fs::stat(tsfile);
- if (st.get())
- return st->st_mtime;
- else
- return 0;
-}
-
-void Path::setTimestamp(time_t ts)
-{
- string tsfile = str::joinpath(instance().indexDir(), "/update-timestamp");
- FILE* out = fopen(tsfile.c_str(), "wt");
- if (!out)
- throw wibble::exception::File(tsfile, "opening file for truncate/writing");
- if (fprintf(out, "%ld\n", ts) < 0)
- throw wibble::exception::File(tsfile, "writing the modification time");
- if (fclose(out) < 0)
- throw wibble::exception::File(tsfile, "closing the file");
-}
-
-void Path::setIndexDir( const std::string &s )
-{
- instance().m_indexDir = s;
-}
-
-std::string Path::indexDir() { return instance().m_indexDir; }
-std::string Path::index() { return str::joinpath(instance().m_indexDir, "/index"); }
-
-Path *Path::s_instance = 0;
-
-}
-}
-
-// vim:set ts=4 sw=4:
diff --git a/ept/textsearch/maint/path.h b/ept/textsearch/maint/path.h
deleted file mode 100644
index 468d271..0000000
--- a/ept/textsearch/maint/path.h
+++ /dev/null
@@ -1,74 +0,0 @@
-// -*- mode: c++; indent-tabs-mode: t -*-
-/** \file
- * popcon paths
- */
-
-/*
- * Copyright (C) 2005,2006,2007 Enrico Zini <enrico@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef EPT_TEXTSEARCH_PATH_H
-#define EPT_TEXTSEARCH_PATH_H
-
-#include <string>
-
-struct stat;
-
-namespace ept {
-namespace textsearch {
-
-/**
- * Singleton class to configure and access the various Popcon paths
- */
-class Path
-{
-public:
- static std::string indexDir();
- static std::string index();
-
- // Directory where Popcon source data is found
- static void setIndexDir( const std::string &s );
-
- static int access( const std::string &, int );
- static time_t indexTimestamp();
- static void setTimestamp(time_t ts);
-
- // RAII-style classes to temporarily override directories
- class OverrideIndexDir
- {
- std::string old;
- public:
- OverrideIndexDir(const std::string& path) : old(Path::indexDir())
- {
- Path::setIndexDir(path);
- }
- ~OverrideIndexDir() { Path::setIndexDir(old); }
- };
-
-protected:
- static Path *s_instance;
- static Path &instance();
-
- // Directory where Popcon source data is found
- std::string m_indexDir;
-};
-
-}
-}
-
-// vim:set ts=4 sw=4:
-#endif
diff --git a/ept/textsearch/textsearch.cc b/ept/textsearch/textsearch.cc
deleted file mode 100644
index adbe235..0000000
--- a/ept/textsearch/textsearch.cc
+++ /dev/null
@@ -1,256 +0,0 @@
-
-/** @file
- * @author Enrico Zini <enrico@enricozini.org>
- * Fast full-text search
- */
-
-/*
- * Copyright (C) 2007 Enrico Zini <enrico@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <ept/textsearch/textsearch.h>
-#include <ept/textsearch/maint/path.h>
-#include <ept/apt/apt.h>
-#include <ept/apt/packagerecord.h>
-//#include <ept/debtags/debtags.h>
-
-#include <wibble/regexp.h>
-#include <cctype>
-#include <cmath>
-
-#include <xapian/queryparser.h>
-
-#include <algorithm>
-
-#include <iostream>
-
-using namespace std;
-using namespace ept::apt;
-using namespace ept::debtags;
-
-namespace ept {
-namespace textsearch {
-
-size_t max_index = 0;
-
-TextSearch::TextSearch()
- : m_timestamp(0), m_stem("en")
-{
- m_timestamp = Path::indexTimestamp();
- if (m_timestamp)
- m_db.add_database(Xapian::Database(Path::index()));
-}
-
-std::string TextSearch::toLower(const std::string& str)
-{
- std::string res;
- res.reserve(str.size());
- for (std::string::const_iterator i = str.begin(); i != str.end(); ++i)
- res += tolower(*i);
- return res;
-}
-
-bool TextSearch::needsRebuild(apt::Apt& apt)
-{
- return apt.timestamp() > m_timestamp;
-}
-
-void TextSearch::normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const
-{
- string t = TextSearch::toLower(term);
- string s = m_stem(t);
- doc.add_term(t);
- if (s != t)
- doc.add_term(s);
-}
-
-bool TextSearch::rebuildIfNeeded(apt::Apt& apt, const std::vector<const TextSearch::ExtraIndexer*>& extraIndexers)
-{
- // Check if a rebuild is needed, and keep a copy of the APT timestamp for
- // saving later
- time_t aptts = apt.timestamp();
- if (aptts <= m_timestamp)
- return false;
-
- // Reindex
- Xapian::WritableDatabase database(Xapian::Flint::open(Path::index(), Xapian::DB_CREATE_OR_OPEN));
- Xapian::TermGenerator termgen;
- termgen.set_stemmer(m_stem);
- //database.begin_transaction();
- PackageRecord rec;
- size_t count = 0;
- for (Apt::record_iterator i = apt.recordBegin();
- i != apt.recordEnd(); ++i, ++count)
- {
- // If we are testing, we can set a limit to how many packages we index,
- // to avoid it taking too much time
- if (max_index != 0 && count > max_index)
- break;
-
- rec.scan(*i);
-
- Xapian::Document doc;
- doc.set_data(rec.package());
-
- string pkgid = "XP" + rec.package();
- //std::cerr << "Add " << pkgid << ": " << idx << std::endl;
- doc.add_term(pkgid);
-
- // Index tags as well
- set<string> tags = rec.tag();
- for (set<string>::const_iterator ti = tags.begin();
- ti != tags.end(); ++ti)
- doc.add_term("XT"+*ti);
-
- termgen.set_document(doc);
- termgen.index_text_without_positions(rec.package());
- termgen.index_text_without_positions(rec.description());
-
- // Add the values
- doc.add_value(VAL_APT_INSTALLED_SIZE, Xapian::sortable_serialise(rec.installedSize()));
- doc.add_value(VAL_APT_PACKAGE_SIZE, Xapian::sortable_serialise(rec.packageSize()));
-
- if (m_timestamp)
- database.replace_document(pkgid, doc);
- else
- database.add_document(doc);
- }
-
- //database.commit_transaction();
-
- if (!m_timestamp)
- m_db.add_database(Xapian::Database(Path::index()));
- else
- m_db.reopen();
-
- m_timestamp = aptts;
-
- Path::setTimestamp(aptts);
-
- return true;
-}
-
-Xapian::Query TextSearch::makeORQuery(const std::string& keywords) const
-{
- wibble::Tokenizer tok(keywords, "[A-Za-z0-9_-]+", REG_EXTENDED);
- return makeORQuery(tok.begin(), tok.end());
-}
-
-Xapian::Query TextSearch::makePartialORQuery(const std::string& keywords) const
-{
- wibble::Tokenizer tok(keywords, "[A-Za-z0-9_-]+", REG_EXTENDED);
- vector<string> tokens;
- // FIXME: make the Tokenizer iterators properly iterable
- for (wibble::Tokenizer::const_iterator i = tok.begin();
- i != tok.end(); ++i)
- tokens.push_back(*i);
- // Add all the terms starting with 'last'
- if (!tokens.empty())
- {
- string& last = *tokens.rbegin();
- if (last.size() == 1)
- // Ignore one-letter partial terms: they make the query uselessly
- // large and slow, and it's worth just to wait for more characters
- // to come
- tokens.resize(tokens.size() - 1);
- else
- copy(m_db.allterms_begin(last), m_db.allterms_end(last), back_inserter(tokens));
- /*
- for (Xapian::TermIterator t = m_db.allterms_begin(last);
- t != m_db.allterms_end(last); ++t)
- tokens.push_back(*t);
- */
- }
- return makeORQuery(tokens.begin(), tokens.end());
-}
-
-Xapian::docid TextSearch::docidByName(const std::string& pkgname) const
-{
- Xapian::PostingIterator i = m_db.postlist_begin("XP"+pkgname);
- if (i == m_db.postlist_end("XP"+pkgname))
- return 0;
- else
- return *i;
-}
-
-struct TagFilter : public Xapian::ExpandDecider
-{
- virtual bool operator()(const std::string &term) const { return term[0] == 'T'; }
-};
-
-static TagFilter tagFilter;
-
-vector<string> TextSearch::expand(Xapian::Enquire& enq) const
-{
- Xapian::RSet rset;
- // Get the top 5 results as 'good ones' to compute the search expansion
- Xapian::MSet mset = enq.get_mset(0, 5);
- for (Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i)
- rset.add_document(i);
- // Get the expanded set, only expanding the query with tag names
- Xapian::ESet eset = enq.get_eset(5, rset, &tagFilter);
- vector<string> res;
- for (Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i)
- res.push_back(*i);
- return res;
-}
-
-Xapian::Query TextSearch::makeRelatedQuery(const std::string& pkgname) const
-{
- Xapian::Enquire enquire(db());
-
- // Retrieve the document for the given package
- enquire.set_query(Xapian::Query("XP"+pkgname));
- Xapian::MSet matches = enquire.get_mset(0, 1);
- Xapian::MSetIterator mi = matches.begin();
- if (mi == matches.end()) return Xapian::Query();
- Xapian::Document doc = mi.get_document();
-
- // Return the query to get the list of similar documents
- return Xapian::Query(Xapian::Query::OP_OR, doc.termlist_begin(), doc.termlist_end());
-}
-
-double TextSearch::getDoubleValue(const std::string& pkgname, Xapian::valueno val_id) const
-{
- Xapian::docid id = docidByName(pkgname);
- if (id == 0)
- return 0.0;
- Xapian::Document doc = db().get_document(id);
- string val = doc.get_value(val_id);
- if (val.empty())
- return 0.0;
- else
- return Xapian::sortable_unserialise(val);
-}
-
-int TextSearch::getIntValue(const std::string& pkgname, Xapian::valueno val_id) const
-{
- Xapian::docid id = docidByName(pkgname);
- if (id == 0)
- return 0;
- Xapian::Document doc = db().get_document(id);
- string val = doc.get_value(val_id);
- if (val.empty())
- return 0;
- else
- return (int)nearbyint(Xapian::sortable_unserialise(val));
-}
-
-}
-}
-
-// vim:set ts=4 sw=4:
diff --git a/ept/textsearch/textsearch.h b/ept/textsearch/textsearch.h
deleted file mode 100644
index 6f61e02..0000000
--- a/ept/textsearch/textsearch.h
+++ /dev/null
@@ -1,219 +0,0 @@
-#ifndef EPT_TEXTSEARCH_TEXTSEARCH_H
-#define EPT_TEXTSEARCH_TEXTSEARCH_H
-
-/** @file
- * @author Enrico Zini <enrico@enricozini.org>
- * Fast full-text search
- */
-
-/*
- * Copyright (C) 2007 Enrico Zini <enrico@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <xapian.h>
-#include <vector>
-#include <string>
-
-namespace ept {
-namespace apt {
-class Apt;
-class PackageRecord;
-}
-namespace debtags {
-class Debtags;
-}
-namespace textsearch {
-
-// Allocate value indexes for known values
-const Xapian::valueno VAL_APT_INSTALLED_SIZE = 1;
-const Xapian::valueno VAL_APT_PACKAGE_SIZE = 2;
-const Xapian::valueno VAL_POPCON = 10;
-const Xapian::valueno VAL_ITERATING_RATING = 20;
-const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21;
-const Xapian::valueno VAL_ITERATING_USABILITY = 22;
-const Xapian::valueno VAL_ITERATING_SECURITY = 23;
-const Xapian::valueno VAL_ITERATING_PERFORMANCE = 24;
-const Xapian::valueno VAL_ITERATING_QUALITY = 25;
-const Xapian::valueno VAL_ITERATING_SUPPORT = 26;
-const Xapian::valueno VAL_ITERATING_ADOPTION = 27;
-// If you need to index a value and cannot edit this file, feel free to use any
-// value starting from 1000000
-
-
-/*
-Fallback on apt scan searches when index is not present
-
-Explicitly decide at instantiation (or at any other time) if a rebuild should
-be performed. Just adding a 'rebuildIfNeeded' method would be enough.
-
-17:14 #xapian < enrico> Hello. I'm finally in a position of writing a library to maintain
- a xapian index with Debian package descriptions in a Debian system
-17:14 #xapian < enrico> I have a question, though
-17:14 #xapian < enrico> The descriptions change regularly as people run 'apt-get update'
-17:15 #xapian < enrico> I'd need to have a way to update the description index after
- apt-get update, without rebuilding it from scratch
-17:15 #xapian < enrico> Is there some documentation on how to do that? I can't exactly
- tell Xapian "the new description for package foo is this" because
- I'd need the xapian id
-19:11 #xapian < omega> you can add a unique term with a boolean prefix?
-19:11 #xapian < omega> like Qpackage-name
-19:11 #xapian < omega> then you search for it and replace_document
-19:24 #xapian < richardb> Or indeed, you use the "replace_document()" form which takes a
- unique_id term.
-19:25 #xapian < richardb> Xapian::docid replace_document(const std::string &
- unique_term,
-19:25 #xapian < richardb> const Xapian::Document &
- document);
-19:43 #xapian < enrico> unique term
-19:43 #xapian < enrico> nice!
-19:44 #xapian < enrico> can I use a non-alpha prefix, like :package-name ?
-19:45 #xapian < enrico> or pkg:package-name
-19:45 #xapian < enrico> I suppose I can
-*/
-
-/**
- * Maintains and accesses a Xapian index of package descriptions.
- *
- * Contrarily to Debtags and Popcon, TextSearch does not attempt to create the
- * index in the home directory if no system index is found and it is not
- * running as root: this is to avoid secretly building large indexes (>50Mb)
- * in the home directory of users.
- *
- * The idea then is to have root keep the index up to date, possibly running a
- * reindexing tool once a day, or after an apt-get update.
- *
- * This works because the full text search index is useful even if it is
- * slightly out of date.
- */
-class TextSearch
-{
-protected:
- time_t m_timestamp;
- Xapian::Database m_db;
- Xapian::Stem m_stem;
-
- /// Return a lowercased copy of the string
- static std::string toLower(const std::string& str);
-
- /**
- * Add normalised tokens computed from the string to the document doc.
- *
- * pos is used as a sequence generator for entering the token position in
- * the document.
- */
- void normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const;
-
-public:
- struct ExtraIndexer
- {
- virtual ~ExtraIndexer() {}
- virtual void operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const = 0;
- };
-
- TextSearch();
-
- /// Access the Xapian database
- Xapian::Database& db() { return m_db; }
-
- /// Access the Xapian database
- const Xapian::Database& db() const { return m_db; }
-
- /// Timestamp of when the Xapian database was last updated
- time_t timestamp() const { return m_timestamp; }
-
- /// Returns true if the index has data
- bool hasData() const { return m_timestamp > 0; }
-
- /// Returns true if the index is older than the Apt database information
- bool needsRebuild(apt::Apt& apt);
-
- /**
- * Rebuild the index if needed.
- *
- * Allow to specify functors that contribute to the indexing.
- *
- * @note This requires write access to the index directory.
- * @note This is not the main way to update the index: it is provided here
- * only as a way to build a draft index for the library tests
- */
- bool rebuildIfNeeded(
- apt::Apt& apt,
- const std::vector<const ExtraIndexer*>& extraIndexers = std::vector<const ExtraIndexer*>());
-
- /**
- * Retrieve a Xapian docid by package name
- */
- Xapian::docid docidByName(const std::string& pkgname) const;
-
- /**
- * Tokenize the string and build an OR query with the resulting keywords
- */
- Xapian::Query makeORQuery(const std::string& keywords) const;
-
- /**
- * Tokenize the string and build an OR query with the resulting keywords.
- *
- * The last token in keywords is considered to be typed only partially, to
- * implement proper search-as-you-type.
- */
- Xapian::Query makePartialORQuery(const std::string& keywords) const;
-
- /**
- * Build a query with the given keywords, specified as iterators of strings
- */
- template<typename ITER>
- Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const
- {
- std::vector<std::string> terms;
- // Insert both the lowercased and the stemmed lowercased query terms
- for (ITER i = begin; i != end; ++i)
- {
- std::string t = toLower(*i);
- std::string s = m_stem(t);
- terms.push_back(t);
- if (s != t)
- terms.push_back("Z" + s);
- }
- return Xapian::Query(Xapian::Query::OP_OR, terms.begin(), terms.end());
- }
-
- /// Return a list of tag-based terms that can be used to expand an OR query
- std::vector<std::string> expand(Xapian::Enquire& enq) const;
-
-// std::vector<std::string> similar(const std::string& pkg);
-
- /**
- * Create a query to look for packages similar to the given one
- */
- Xapian::Query makeRelatedQuery(const std::string& pkgname) const;
-
- /**
- * Get the integer value for
- */
- double getDoubleValue(const std::string& pkgname, Xapian::valueno val_id) const;
-
- /**
- * Get the integer value for
- */
- int getIntValue(const std::string& pkgname, Xapian::valueno val_id) const;
-};
-
-}
-}
-
-// vim:set ts=4 sw=4:
-#endif
diff --git a/ept/textsearch/textsearch.test.h b/ept/textsearch/textsearch.test.h
deleted file mode 100644
index 39516c3..0000000
--- a/ept/textsearch/textsearch.test.h
+++ /dev/null
@@ -1,166 +0,0 @@
-// -*- mode: c++; tab-width: 4; indent-tabs-mode: t -*-
-/*
- * popcon test
- *
- * Copyright (C) 2007 Enrico Zini <enrico@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <ept/test.h>
-#include <ept/textsearch/textsearch.h>
-#include <ept/textsearch/maint/path.h>
-#include <ept/apt/apt.h>
-#include <wibble/sys/fs.h>
-#include <set>
-
-namespace ept {
-namespace textsearch {
-extern size_t max_index;
-}
-}
-
-using namespace std;
-using namespace ept;
-using namespace ept::textsearch;
-using namespace ept::apt;
-
-struct DirMaker
-{
- DirMaker(const std::string& name)
- {
- wibble::sys::fs::mkdirIfMissing(name, 0755);
- }
-};
-
-struct TestTextsearch : AptTestEnvironment
-{
- DirMaker md;
- Path::OverrideIndexDir oid;
- Apt apt;
- TextSearch textsearch;
-
- TestTextsearch()
- : md( TEST_ENV_DIR "xapian"), oid( TEST_ENV_DIR "xapian")
- {
- try {
- ept::textsearch::max_index = 1000;
- textsearch.rebuildIfNeeded(apt);
- } catch (Xapian::Error& e) {
- cerr << e.get_type() << " " << e.get_msg() << " " << e.get_context() << endl;
- throw;
- }
- }
-
-// Access an empty index
- Test empty()
- {
- Path::OverrideIndexDir oid("./empty");
- TextSearch empty;
- assert_eq(empty.timestamp(), 0);
- assert(!empty.hasData());
- assert(empty.needsRebuild(apt));
- /*
- Xapian::Enquire enq(empty.db());
- empty.search(enq, "apt");
- Xapian::MSet matches = enq.get_mset(0, 100);
- assert_eq(matches.size(), 0u);
- */
- }
-
-// Very basic access
- Test basicAccess()
- {
- assert(textsearch.hasData());
- assert(textsearch.timestamp() > 0);
- assert(!textsearch.needsRebuild(apt));
-
- Xapian::Enquire enq(textsearch.db());
- enq.set_query(textsearch.makeORQuery("sgml"));
- Xapian::MSet matches = enq.get_mset(0, 100);
- assert(matches.size() > 0);
-
- // See if the apt package is among the results
- set<string> results;
- for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i)
- results.insert(i.get_document().get_data());
- assert(results.find("sp") != results.end());
- }
-
-// Alternate access using intermediate Xapian::Query objects
- Test queryAccess()
- {
- Xapian::Enquire enq(textsearch.db());
- enq.set_query(textsearch.makeORQuery("sgml"));
- Xapian::MSet matches = enq.get_mset(0, 100);
- assert(matches.size() > 0);
-
- // See if the apt package is among the results
- set<string> results;
- for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i)
- results.insert(i.get_document().get_data());
- assert(results.find("sp") != results.end());
- }
-
-// Try makePartialORQuery
- Test partialOrQuery()
- {
- Xapian::Enquire enq(textsearch.db());
- enq.set_query(textsearch.makePartialORQuery("sgml"));
- Xapian::MSet matches = enq.get_mset(0, 100);
- assert(matches.size() > 0);
-
- // See if the apt package is among the results
- set<string> results;
- for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i)
- results.insert(i.get_document().get_data());
- assert(results.find("sp") != results.end());
- }
-
-// Try docidByName
- Test docidByName()
- {
- assert(textsearch.docidByName("sp") != 0);
- assert_eq(textsearch.docidByName("thereisnopackagewiththisname"), 0u);
- }
-
-// Access values
- Test values()
- {
- assert(textsearch.hasData());
- assert(textsearch.timestamp() > 0);
- assert(!textsearch.needsRebuild(apt));
-
- double dval;
- dval = textsearch.getDoubleValue("autoconf", VAL_APT_INSTALLED_SIZE);
- assert(dval == 2408);
- dval = textsearch.getDoubleValue("autoconf", VAL_APT_PACKAGE_SIZE);
- assert(dval == 741486);
- assert_eq(textsearch.getDoubleValue("thereisnopackagewiththisname", VAL_APT_INSTALLED_SIZE), 0.0);
- assert_eq(textsearch.getDoubleValue("thereisnopackagewiththisname", VAL_APT_PACKAGE_SIZE), 0.0);
-
- int val;
- val = textsearch.getIntValue("autoconf", VAL_APT_INSTALLED_SIZE);
- assert(val == 2408);
- val = textsearch.getIntValue("autoconf", VAL_APT_PACKAGE_SIZE);
- assert(val == 741486);
- cout << val;
- assert_eq(textsearch.getIntValue("thereisnopackagewiththisname", VAL_APT_INSTALLED_SIZE), 0);
- assert_eq(textsearch.getIntValue("thereisnopackagewiththisname", VAL_APT_PACKAGE_SIZE), 0);
- }
-
-};
-
-// vim:set ts=4 sw=4: