diff options
author | Enrico Zini <enrico@enricozini.org> | 2010-05-10 17:50:28 +0100 |
---|---|---|
committer | Enrico Zini <enrico@enricozini.org> | 2010-05-10 17:50:28 +0100 |
commit | 1b5ac4e66699cacde7123da3b111a0a16ee55db4 (patch) | |
tree | 63d345bf5d93f5ed3ab639571d60ccd8c2b2b331 /ept | |
parent | 80d13236ef7014f077a5793a9cb3e9a0573ffbe1 (diff) | |
download | libept-1b5ac4e66699cacde7123da3b111a0a16ee55db4.tar.gz |
Replaced textindex with axi, which contains just a handful of helper functions to handle apt-xapian-index
Diffstat (limited to 'ept')
-rw-r--r-- | ept/CMakeLists.txt | 13 | ||||
-rw-r--r-- | ept/axi/axi.cc (renamed from ept/textsearch/extraindexers.cc) | 53 | ||||
-rw-r--r-- | ept/axi/axi.h | 89 | ||||
-rw-r--r-- | ept/axi/axi.test.h (renamed from ept/textsearch/extraindexers.h) | 55 | ||||
-rw-r--r-- | ept/textsearch/maint/path.cc | 93 | ||||
-rw-r--r-- | ept/textsearch/maint/path.h | 74 | ||||
-rw-r--r-- | ept/textsearch/textsearch.cc | 256 | ||||
-rw-r--r-- | ept/textsearch/textsearch.h | 219 | ||||
-rw-r--r-- | ept/textsearch/textsearch.test.h | 166 |
9 files changed, 160 insertions, 858 deletions
diff --git a/ept/CMakeLists.txt b/ept/CMakeLists.txt index e24d6ef..ed9b6c7 100644 --- a/ept/CMakeLists.txt +++ b/ept/CMakeLists.txt @@ -2,8 +2,7 @@ project( ept ) include( ${WIBBLE_TEST_CMAKE} ) file( GLOB src *.cpp debtags/*.cc debtags/maint/*.cc - popcon/*.cc popcon/maint/*.cc apt/*.cc textsearch/*.cc - textsearch/maint/*.cc ) + popcon/*.cc popcon/maint/*.cc apt/*.cc axi/*.cc ) file( GLOB h_top *.h ) file( GLOB h_apt apt/*.h ) @@ -11,15 +10,14 @@ file( GLOB h_debtags debtags/*.h debtags/*.tcc ) file( GLOB h_debtags_maint debtags/maint/*.h debtags/maint/*.tcc ) file( GLOB h_popcon popcon/*.h ) file( GLOB h_popcon_maint popcon/maint/*.h ) -file( GLOB h_textsearch textsearch/*.h ) -file( GLOB h_textsearch_maint textsearch/maint/*.h ) +file( GLOB h_axi axi/*.h ) file( GLOB debtagstesth debtags/*.test.h debtags/maint/*.test.h ) file( GLOB popcontesth popcon/*.test.h ) file( GLOB apttesth apt/*.test.h ) -file( GLOB textsearchtesth textsearch/*.test.h ) +file( GLOB axitesth axi/*.test.h ) set( testh ${debtagstesth} ${popcontesth} - ${apttesth} ${textsearchtesth} ) + ${apttesth} ${axitesth} ) include_directories( ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} ${TAGCOLL_INCLUDE_DIRS} ${WIBBLE_INCLUDE_DIRS} ) @@ -90,5 +88,4 @@ install( FILES ${h_debtags} DESTINATION include/ept/debtags ) install( FILES ${h_debtags_maint} DESTINATION include/ept/debtags/maint ) install( FILES ${h_popcon} DESTINATION include/ept/popcon ) install( FILES ${h_popcon_maint} DESTINATION include/ept/popcon/maint ) -install( FILES ${h_textsearch} DESTINATION include/ept/textsearch ) -install( FILES ${h_textsearch_maint} DESTINATION include/ept/textsearch/maint ) +install( FILES ${h_axi} DESTINATION include/ept/axi ) diff --git a/ept/textsearch/extraindexers.cc b/ept/axi/axi.cc index 179688f..a9b3203 100644 --- a/ept/textsearch/extraindexers.cc +++ b/ept/axi/axi.cc @@ -22,37 +22,54 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <ept/textsearch/extraindexers.h> -#include <ept/apt/packagerecord.h> -#include <ept/debtags/debtags.h> +#include <ept/config.h> +#include <ept/axi/axi.h> + +#include <wibble/exception.h> +#include <wibble/string.h> +#include <wibble/sys/fs.h> +#include <memory> using namespace std; -using namespace ept::debtags; +using namespace wibble; namespace ept { -namespace textsearch { +namespace axi { + +static std::string m_index_dir = TEXTSEARCH_DB_DIR; -void AptTagsExtraIndexer::operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const +std::string path_dir() { - // Index tags as well - set<string> tags = rec.tag(); - for (set<string>::const_iterator ti = tags.begin(); - ti != tags.end(); ++ti) - doc.add_term("XT"+*ti); + return m_index_dir; } -void DebtagsExtraIndexer::operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const +std::string path_db() { - // Index tags as well - set<std::string> tags = debtags.getTagsOfItem(doc.get_data()); - for (set<std::string>::const_iterator ti = tags.begin(); - ti != tags.end(); ++ti) - doc.add_term("XT"+*ti); + return str::joinpath(m_index_dir, "/index"); } +time_t timestamp() +{ + string tsfile = str::joinpath(m_index_dir, "update-timestamp"); + std::auto_ptr<struct stat> st = sys::fs::stat(tsfile); + if (st.get()) + return st->st_mtime; + else + return 0; } + + +OverrideIndexDir::OverrideIndexDir(const std::string& path) : old(m_index_dir) +{ + m_index_dir = path; } -#include <ept/debtags/debtags.tcc> +OverrideIndexDir::~OverrideIndexDir() +{ + m_index_dir = old; +} + +} +} // vim:set ts=4 sw=4: diff --git a/ept/axi/axi.h b/ept/axi/axi.h new file mode 100644 index 0000000..29a9686 --- /dev/null +++ b/ept/axi/axi.h @@ -0,0 +1,89 @@ +#ifndef EPT_TEXTSEARCH_TEXTSEARCH_H +#define EPT_TEXTSEARCH_TEXTSEARCH_H + +/** @file + * @author Enrico Zini <enrico@enricozini.org> + * Fast full-text search + */ + +/* + * Copyright (C) 2007 Enrico Zini <enrico@debian.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <xapian.h> +#include <string> + +namespace ept { + +/** + * Maintains and accesses a Xapian index of package descriptions. + * + * Contrarily to Debtags and Popcon, TextSearch does not attempt to create the + * index in the home directory if no system index is found and it is not + * running as root: this is to avoid secretly building large indexes (>50Mb) + * in the home directory of users. + * + * The idea then is to have root keep the index up to date, possibly running a + * reindexing tool once a day, or after an apt-get update. + * + * This works because the full text search index is useful even if it is + * slightly out of date. + */ +namespace axi { + +// Allocate value indexes for known values +const Xapian::valueno VAL_APT_INSTALLED_SIZE = 1; +const Xapian::valueno VAL_APT_PACKAGE_SIZE = 2; +const Xapian::valueno VAL_POPCON = 10; +const Xapian::valueno VAL_ITERATING_RATING = 20; +const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21; +const Xapian::valueno VAL_ITERATING_USABILITY = 22; +const Xapian::valueno VAL_ITERATING_SECURITY = 23; +const Xapian::valueno VAL_ITERATING_PERFORMANCE = 24; +const Xapian::valueno VAL_ITERATING_QUALITY = 25; +const Xapian::valueno VAL_ITERATING_SUPPORT = 26; +const Xapian::valueno VAL_ITERATING_ADOPTION = 27; +// If you need to index a value and cannot edit this file, feel free to use any +// value starting from 1000000 + +/// Return the path to the Apt Xapian index root directory +std::string path_dir(); + +/// Return the path to the Apt Xapian index Xapian database +std::string path_db(); + +/// Return the last update timestamp of the index +time_t timestamp(); + + +/** + * RAII temporary override of the location of the index root + * directory, used for tests + */ +class OverrideIndexDir +{ + std::string old; +public: + OverrideIndexDir(const std::string& path); + ~OverrideIndexDir(); +}; + +} +} + +// vim:set ts=4 sw=4: +#endif diff --git a/ept/textsearch/extraindexers.h b/ept/axi/axi.test.h index ce1d042..5481bd8 100644 --- a/ept/textsearch/extraindexers.h +++ b/ept/axi/axi.test.h @@ -1,12 +1,7 @@ -#ifndef EPT_TEXTSEARCH_EXTRAINDEXERS_H -#define EPT_TEXTSEARCH_EXTRAINDEXERS_H - -/** @file - * @author Enrico Zini <enrico@enricozini.org> - * Fast full-text search - */ - +// -*- mode: c++; tab-width: 4; indent-tabs-mode: t -*- /* + * popcon test + * * Copyright (C) 2007 Enrico Zini <enrico@debian.org> * * This program is free software; you can redistribute it and/or modify @@ -24,28 +19,40 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <ept/textsearch/textsearch.h> +#include <ept/test.h> +#include <ept/axi/axi.h> +#include <ept/apt/apt.h> +#include <wibble/sys/fs.h> +#include <set> -namespace ept { -namespace debtags { -class Debtags; -} -namespace textsearch { +using namespace std; +using namespace ept; -struct AptTagsExtraIndexer : public TextSearch::ExtraIndexer +struct DirMaker { - virtual void operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const; + DirMaker(const std::string& name) + { + wibble::sys::fs::mkdirIfMissing(name, 0755); + } }; -struct DebtagsExtraIndexer : public TextSearch::ExtraIndexer +struct TestAxi : AptTestEnvironment { - const debtags::Debtags& debtags; - DebtagsExtraIndexer(const debtags::Debtags& debtags) : debtags(debtags) {} - virtual void operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const; + DirMaker md; + axi::OverrideIndexDir oid; + apt::Apt apt; + + TestAxi() + : md( TEST_ENV_DIR "xapian"), oid( TEST_ENV_DIR "xapian") + { + } + +// Access an empty index + Test empty() + { + axi::OverrideIndexDir oid("./empty"); + assert_eq(axi::timestamp(), 0); + } }; -} -} - // vim:set ts=4 sw=4: -#endif diff --git a/ept/textsearch/maint/path.cc b/ept/textsearch/maint/path.cc deleted file mode 100644 index 4b82ba1..0000000 --- a/ept/textsearch/maint/path.cc +++ /dev/null @@ -1,93 +0,0 @@ -// -*- mode: c++; indent-tabs-mode: t -*- - -/** \file - * popcon paths - */ - -/* - * Copyright (C) 2005,2006,2007 Enrico Zini <enrico@debian.org>, Peter Rockai <me@mornfall.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <ept/config.h> -#include <ept/textsearch/maint/path.h> - -#include <wibble/exception.h> -#include <wibble/sys/fs.h> -#include <wibble/string.h> - -#include <cstdio> -#include <cerrno> - -#include <sys/types.h> -#include <sys/stat.h> -#include <unistd.h> - -using namespace std; -using namespace wibble; - -namespace ept { -namespace textsearch { - -Path &Path::instance() { - if (!s_instance) { - s_instance = new Path; - instance().m_indexDir = TEXTSEARCH_DB_DIR; - } - return *s_instance; -} - -int Path::access( const std::string &s, int m ) -{ - return ::access( s.c_str(), m ); -} - -time_t Path::indexTimestamp() -{ - string tsfile = str::joinpath(instance().indexDir(), "update-timestamp"); - std::auto_ptr<struct stat> st = wibble::sys::fs::stat(tsfile); - if (st.get()) - return st->st_mtime; - else - return 0; -} - -void Path::setTimestamp(time_t ts) -{ - string tsfile = str::joinpath(instance().indexDir(), "/update-timestamp"); - FILE* out = fopen(tsfile.c_str(), "wt"); - if (!out) - throw wibble::exception::File(tsfile, "opening file for truncate/writing"); - if (fprintf(out, "%ld\n", ts) < 0) - throw wibble::exception::File(tsfile, "writing the modification time"); - if (fclose(out) < 0) - throw wibble::exception::File(tsfile, "closing the file"); -} - -void Path::setIndexDir( const std::string &s ) -{ - instance().m_indexDir = s; -} - -std::string Path::indexDir() { return instance().m_indexDir; } -std::string Path::index() { return str::joinpath(instance().m_indexDir, "/index"); } - -Path *Path::s_instance = 0; - -} -} - -// vim:set ts=4 sw=4: diff --git a/ept/textsearch/maint/path.h b/ept/textsearch/maint/path.h deleted file mode 100644 index 468d271..0000000 --- a/ept/textsearch/maint/path.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode: c++; indent-tabs-mode: t -*- -/** \file - * popcon paths - */ - -/* - * Copyright (C) 2005,2006,2007 Enrico Zini <enrico@debian.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef EPT_TEXTSEARCH_PATH_H -#define EPT_TEXTSEARCH_PATH_H - -#include <string> - -struct stat; - -namespace ept { -namespace textsearch { - -/** - * Singleton class to configure and access the various Popcon paths - */ -class Path -{ -public: - static std::string indexDir(); - static std::string index(); - - // Directory where Popcon source data is found - static void setIndexDir( const std::string &s ); - - static int access( const std::string &, int ); - static time_t indexTimestamp(); - static void setTimestamp(time_t ts); - - // RAII-style classes to temporarily override directories - class OverrideIndexDir - { - std::string old; - public: - OverrideIndexDir(const std::string& path) : old(Path::indexDir()) - { - Path::setIndexDir(path); - } - ~OverrideIndexDir() { Path::setIndexDir(old); } - }; - -protected: - static Path *s_instance; - static Path &instance(); - - // Directory where Popcon source data is found - std::string m_indexDir; -}; - -} -} - -// vim:set ts=4 sw=4: -#endif diff --git a/ept/textsearch/textsearch.cc b/ept/textsearch/textsearch.cc deleted file mode 100644 index adbe235..0000000 --- a/ept/textsearch/textsearch.cc +++ /dev/null @@ -1,256 +0,0 @@ - -/** @file - * @author Enrico Zini <enrico@enricozini.org> - * Fast full-text search - */ - -/* - * Copyright (C) 2007 Enrico Zini <enrico@debian.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <ept/textsearch/textsearch.h> -#include <ept/textsearch/maint/path.h> -#include <ept/apt/apt.h> -#include <ept/apt/packagerecord.h> -//#include <ept/debtags/debtags.h> - -#include <wibble/regexp.h> -#include <cctype> -#include <cmath> - -#include <xapian/queryparser.h> - -#include <algorithm> - -#include <iostream> - -using namespace std; -using namespace ept::apt; -using namespace ept::debtags; - -namespace ept { -namespace textsearch { - -size_t max_index = 0; - -TextSearch::TextSearch() - : m_timestamp(0), m_stem("en") -{ - m_timestamp = Path::indexTimestamp(); - if (m_timestamp) - m_db.add_database(Xapian::Database(Path::index())); -} - -std::string TextSearch::toLower(const std::string& str) -{ - std::string res; - res.reserve(str.size()); - for (std::string::const_iterator i = str.begin(); i != str.end(); ++i) - res += tolower(*i); - return res; -} - -bool TextSearch::needsRebuild(apt::Apt& apt) -{ - return apt.timestamp() > m_timestamp; -} - -void TextSearch::normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const -{ - string t = TextSearch::toLower(term); - string s = m_stem(t); - doc.add_term(t); - if (s != t) - doc.add_term(s); -} - -bool TextSearch::rebuildIfNeeded(apt::Apt& apt, const std::vector<const TextSearch::ExtraIndexer*>& extraIndexers) -{ - // Check if a rebuild is needed, and keep a copy of the APT timestamp for - // saving later - time_t aptts = apt.timestamp(); - if (aptts <= m_timestamp) - return false; - - // Reindex - Xapian::WritableDatabase database(Xapian::Flint::open(Path::index(), Xapian::DB_CREATE_OR_OPEN)); - Xapian::TermGenerator termgen; - termgen.set_stemmer(m_stem); - //database.begin_transaction(); - PackageRecord rec; - size_t count = 0; - for (Apt::record_iterator i = apt.recordBegin(); - i != apt.recordEnd(); ++i, ++count) - { - // If we are testing, we can set a limit to how many packages we index, - // to avoid it taking too much time - if (max_index != 0 && count > max_index) - break; - - rec.scan(*i); - - Xapian::Document doc; - doc.set_data(rec.package()); - - string pkgid = "XP" + rec.package(); - //std::cerr << "Add " << pkgid << ": " << idx << std::endl; - doc.add_term(pkgid); - - // Index tags as well - set<string> tags = rec.tag(); - for (set<string>::const_iterator ti = tags.begin(); - ti != tags.end(); ++ti) - doc.add_term("XT"+*ti); - - termgen.set_document(doc); - termgen.index_text_without_positions(rec.package()); - termgen.index_text_without_positions(rec.description()); - - // Add the values - doc.add_value(VAL_APT_INSTALLED_SIZE, Xapian::sortable_serialise(rec.installedSize())); - doc.add_value(VAL_APT_PACKAGE_SIZE, Xapian::sortable_serialise(rec.packageSize())); - - if (m_timestamp) - database.replace_document(pkgid, doc); - else - database.add_document(doc); - } - - //database.commit_transaction(); - - if (!m_timestamp) - m_db.add_database(Xapian::Database(Path::index())); - else - m_db.reopen(); - - m_timestamp = aptts; - - Path::setTimestamp(aptts); - - return true; -} - -Xapian::Query TextSearch::makeORQuery(const std::string& keywords) const -{ - wibble::Tokenizer tok(keywords, "[A-Za-z0-9_-]+", REG_EXTENDED); - return makeORQuery(tok.begin(), tok.end()); -} - -Xapian::Query TextSearch::makePartialORQuery(const std::string& keywords) const -{ - wibble::Tokenizer tok(keywords, "[A-Za-z0-9_-]+", REG_EXTENDED); - vector<string> tokens; - // FIXME: make the Tokenizer iterators properly iterable - for (wibble::Tokenizer::const_iterator i = tok.begin(); - i != tok.end(); ++i) - tokens.push_back(*i); - // Add all the terms starting with 'last' - if (!tokens.empty()) - { - string& last = *tokens.rbegin(); - if (last.size() == 1) - // Ignore one-letter partial terms: they make the query uselessly - // large and slow, and it's worth just to wait for more characters - // to come - tokens.resize(tokens.size() - 1); - else - copy(m_db.allterms_begin(last), m_db.allterms_end(last), back_inserter(tokens)); - /* - for (Xapian::TermIterator t = m_db.allterms_begin(last); - t != m_db.allterms_end(last); ++t) - tokens.push_back(*t); - */ - } - return makeORQuery(tokens.begin(), tokens.end()); -} - -Xapian::docid TextSearch::docidByName(const std::string& pkgname) const -{ - Xapian::PostingIterator i = m_db.postlist_begin("XP"+pkgname); - if (i == m_db.postlist_end("XP"+pkgname)) - return 0; - else - return *i; -} - -struct TagFilter : public Xapian::ExpandDecider -{ - virtual bool operator()(const std::string &term) const { return term[0] == 'T'; } -}; - -static TagFilter tagFilter; - -vector<string> TextSearch::expand(Xapian::Enquire& enq) const -{ - Xapian::RSet rset; - // Get the top 5 results as 'good ones' to compute the search expansion - Xapian::MSet mset = enq.get_mset(0, 5); - for (Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i) - rset.add_document(i); - // Get the expanded set, only expanding the query with tag names - Xapian::ESet eset = enq.get_eset(5, rset, &tagFilter); - vector<string> res; - for (Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i) - res.push_back(*i); - return res; -} - -Xapian::Query TextSearch::makeRelatedQuery(const std::string& pkgname) const -{ - Xapian::Enquire enquire(db()); - - // Retrieve the document for the given package - enquire.set_query(Xapian::Query("XP"+pkgname)); - Xapian::MSet matches = enquire.get_mset(0, 1); - Xapian::MSetIterator mi = matches.begin(); - if (mi == matches.end()) return Xapian::Query(); - Xapian::Document doc = mi.get_document(); - - // Return the query to get the list of similar documents - return Xapian::Query(Xapian::Query::OP_OR, doc.termlist_begin(), doc.termlist_end()); -} - -double TextSearch::getDoubleValue(const std::string& pkgname, Xapian::valueno val_id) const -{ - Xapian::docid id = docidByName(pkgname); - if (id == 0) - return 0.0; - Xapian::Document doc = db().get_document(id); - string val = doc.get_value(val_id); - if (val.empty()) - return 0.0; - else - return Xapian::sortable_unserialise(val); -} - -int TextSearch::getIntValue(const std::string& pkgname, Xapian::valueno val_id) const -{ - Xapian::docid id = docidByName(pkgname); - if (id == 0) - return 0; - Xapian::Document doc = db().get_document(id); - string val = doc.get_value(val_id); - if (val.empty()) - return 0; - else - return (int)nearbyint(Xapian::sortable_unserialise(val)); -} - -} -} - -// vim:set ts=4 sw=4: diff --git a/ept/textsearch/textsearch.h b/ept/textsearch/textsearch.h deleted file mode 100644 index 6f61e02..0000000 --- a/ept/textsearch/textsearch.h +++ /dev/null @@ -1,219 +0,0 @@ -#ifndef EPT_TEXTSEARCH_TEXTSEARCH_H -#define EPT_TEXTSEARCH_TEXTSEARCH_H - -/** @file - * @author Enrico Zini <enrico@enricozini.org> - * Fast full-text search - */ - -/* - * Copyright (C) 2007 Enrico Zini <enrico@debian.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <xapian.h> -#include <vector> -#include <string> - -namespace ept { -namespace apt { -class Apt; -class PackageRecord; -} -namespace debtags { -class Debtags; -} -namespace textsearch { - -// Allocate value indexes for known values -const Xapian::valueno VAL_APT_INSTALLED_SIZE = 1; -const Xapian::valueno VAL_APT_PACKAGE_SIZE = 2; -const Xapian::valueno VAL_POPCON = 10; -const Xapian::valueno VAL_ITERATING_RATING = 20; -const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21; -const Xapian::valueno VAL_ITERATING_USABILITY = 22; -const Xapian::valueno VAL_ITERATING_SECURITY = 23; -const Xapian::valueno VAL_ITERATING_PERFORMANCE = 24; -const Xapian::valueno VAL_ITERATING_QUALITY = 25; -const Xapian::valueno VAL_ITERATING_SUPPORT = 26; -const Xapian::valueno VAL_ITERATING_ADOPTION = 27; -// If you need to index a value and cannot edit this file, feel free to use any -// value starting from 1000000 - - -/* -Fallback on apt scan searches when index is not present - -Explicitly decide at instantiation (or at any other time) if a rebuild should -be performed. Just adding a 'rebuildIfNeeded' method would be enough. - -17:14 #xapian < enrico> Hello. I'm finally in a position of writing a library to maintain - a xapian index with Debian package descriptions in a Debian system -17:14 #xapian < enrico> I have a question, though -17:14 #xapian < enrico> The descriptions change regularly as people run 'apt-get update' -17:15 #xapian < enrico> I'd need to have a way to update the description index after - apt-get update, without rebuilding it from scratch -17:15 #xapian < enrico> Is there some documentation on how to do that? I can't exactly - tell Xapian "the new description for package foo is this" because - I'd need the xapian id -19:11 #xapian < omega> you can add a unique term with a boolean prefix? -19:11 #xapian < omega> like Qpackage-name -19:11 #xapian < omega> then you search for it and replace_document -19:24 #xapian < richardb> Or indeed, you use the "replace_document()" form which takes a - unique_id term. -19:25 #xapian < richardb> Xapian::docid replace_document(const std::string & - unique_term, -19:25 #xapian < richardb> const Xapian::Document & - document); -19:43 #xapian < enrico> unique term -19:43 #xapian < enrico> nice! -19:44 #xapian < enrico> can I use a non-alpha prefix, like :package-name ? -19:45 #xapian < enrico> or pkg:package-name -19:45 #xapian < enrico> I suppose I can -*/ - -/** - * Maintains and accesses a Xapian index of package descriptions. - * - * Contrarily to Debtags and Popcon, TextSearch does not attempt to create the - * index in the home directory if no system index is found and it is not - * running as root: this is to avoid secretly building large indexes (>50Mb) - * in the home directory of users. - * - * The idea then is to have root keep the index up to date, possibly running a - * reindexing tool once a day, or after an apt-get update. - * - * This works because the full text search index is useful even if it is - * slightly out of date. - */ -class TextSearch -{ -protected: - time_t m_timestamp; - Xapian::Database m_db; - Xapian::Stem m_stem; - - /// Return a lowercased copy of the string - static std::string toLower(const std::string& str); - - /** - * Add normalised tokens computed from the string to the document doc. - * - * pos is used as a sequence generator for entering the token position in - * the document. - */ - void normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const; - -public: - struct ExtraIndexer - { - virtual ~ExtraIndexer() {} - virtual void operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const = 0; - }; - - TextSearch(); - - /// Access the Xapian database - Xapian::Database& db() { return m_db; } - - /// Access the Xapian database - const Xapian::Database& db() const { return m_db; } - - /// Timestamp of when the Xapian database was last updated - time_t timestamp() const { return m_timestamp; } - - /// Returns true if the index has data - bool hasData() const { return m_timestamp > 0; } - - /// Returns true if the index is older than the Apt database information - bool needsRebuild(apt::Apt& apt); - - /** - * Rebuild the index if needed. - * - * Allow to specify functors that contribute to the indexing. - * - * @note This requires write access to the index directory. - * @note This is not the main way to update the index: it is provided here - * only as a way to build a draft index for the library tests - */ - bool rebuildIfNeeded( - apt::Apt& apt, - const std::vector<const ExtraIndexer*>& extraIndexers = std::vector<const ExtraIndexer*>()); - - /** - * Retrieve a Xapian docid by package name - */ - Xapian::docid docidByName(const std::string& pkgname) const; - - /** - * Tokenize the string and build an OR query with the resulting keywords - */ - Xapian::Query makeORQuery(const std::string& keywords) const; - - /** - * Tokenize the string and build an OR query with the resulting keywords. - * - * The last token in keywords is considered to be typed only partially, to - * implement proper search-as-you-type. - */ - Xapian::Query makePartialORQuery(const std::string& keywords) const; - - /** - * Build a query with the given keywords, specified as iterators of strings - */ - template<typename ITER> - Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const - { - std::vector<std::string> terms; - // Insert both the lowercased and the stemmed lowercased query terms - for (ITER i = begin; i != end; ++i) - { - std::string t = toLower(*i); - std::string s = m_stem(t); - terms.push_back(t); - if (s != t) - terms.push_back("Z" + s); - } - return Xapian::Query(Xapian::Query::OP_OR, terms.begin(), terms.end()); - } - - /// Return a list of tag-based terms that can be used to expand an OR query - std::vector<std::string> expand(Xapian::Enquire& enq) const; - -// std::vector<std::string> similar(const std::string& pkg); - - /** - * Create a query to look for packages similar to the given one - */ - Xapian::Query makeRelatedQuery(const std::string& pkgname) const; - - /** - * Get the integer value for - */ - double getDoubleValue(const std::string& pkgname, Xapian::valueno val_id) const; - - /** - * Get the integer value for - */ - int getIntValue(const std::string& pkgname, Xapian::valueno val_id) const; -}; - -} -} - -// vim:set ts=4 sw=4: -#endif diff --git a/ept/textsearch/textsearch.test.h b/ept/textsearch/textsearch.test.h deleted file mode 100644 index 39516c3..0000000 --- a/ept/textsearch/textsearch.test.h +++ /dev/null @@ -1,166 +0,0 @@ -// -*- mode: c++; tab-width: 4; indent-tabs-mode: t -*- -/* - * popcon test - * - * Copyright (C) 2007 Enrico Zini <enrico@debian.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <ept/test.h> -#include <ept/textsearch/textsearch.h> -#include <ept/textsearch/maint/path.h> -#include <ept/apt/apt.h> -#include <wibble/sys/fs.h> -#include <set> - -namespace ept { -namespace textsearch { -extern size_t max_index; -} -} - -using namespace std; -using namespace ept; -using namespace ept::textsearch; -using namespace ept::apt; - -struct DirMaker -{ - DirMaker(const std::string& name) - { - wibble::sys::fs::mkdirIfMissing(name, 0755); - } -}; - -struct TestTextsearch : AptTestEnvironment -{ - DirMaker md; - Path::OverrideIndexDir oid; - Apt apt; - TextSearch textsearch; - - TestTextsearch() - : md( TEST_ENV_DIR "xapian"), oid( TEST_ENV_DIR "xapian") - { - try { - ept::textsearch::max_index = 1000; - textsearch.rebuildIfNeeded(apt); - } catch (Xapian::Error& e) { - cerr << e.get_type() << " " << e.get_msg() << " " << e.get_context() << endl; - throw; - } - } - -// Access an empty index - Test empty() - { - Path::OverrideIndexDir oid("./empty"); - TextSearch empty; - assert_eq(empty.timestamp(), 0); - assert(!empty.hasData()); - assert(empty.needsRebuild(apt)); - /* - Xapian::Enquire enq(empty.db()); - empty.search(enq, "apt"); - Xapian::MSet matches = enq.get_mset(0, 100); - assert_eq(matches.size(), 0u); - */ - } - -// Very basic access - Test basicAccess() - { - assert(textsearch.hasData()); - assert(textsearch.timestamp() > 0); - assert(!textsearch.needsRebuild(apt)); - - Xapian::Enquire enq(textsearch.db()); - enq.set_query(textsearch.makeORQuery("sgml")); - Xapian::MSet matches = enq.get_mset(0, 100); - assert(matches.size() > 0); - - // See if the apt package is among the results - set<string> results; - for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) - results.insert(i.get_document().get_data()); - assert(results.find("sp") != results.end()); - } - -// Alternate access using intermediate Xapian::Query objects - Test queryAccess() - { - Xapian::Enquire enq(textsearch.db()); - enq.set_query(textsearch.makeORQuery("sgml")); - Xapian::MSet matches = enq.get_mset(0, 100); - assert(matches.size() > 0); - - // See if the apt package is among the results - set<string> results; - for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) - results.insert(i.get_document().get_data()); - assert(results.find("sp") != results.end()); - } - -// Try makePartialORQuery - Test partialOrQuery() - { - Xapian::Enquire enq(textsearch.db()); - enq.set_query(textsearch.makePartialORQuery("sgml")); - Xapian::MSet matches = enq.get_mset(0, 100); - assert(matches.size() > 0); - - // See if the apt package is among the results - set<string> results; - for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) - results.insert(i.get_document().get_data()); - assert(results.find("sp") != results.end()); - } - -// Try docidByName - Test docidByName() - { - assert(textsearch.docidByName("sp") != 0); - assert_eq(textsearch.docidByName("thereisnopackagewiththisname"), 0u); - } - -// Access values - Test values() - { - assert(textsearch.hasData()); - assert(textsearch.timestamp() > 0); - assert(!textsearch.needsRebuild(apt)); - - double dval; - dval = textsearch.getDoubleValue("autoconf", VAL_APT_INSTALLED_SIZE); - assert(dval == 2408); - dval = textsearch.getDoubleValue("autoconf", VAL_APT_PACKAGE_SIZE); - assert(dval == 741486); - assert_eq(textsearch.getDoubleValue("thereisnopackagewiththisname", VAL_APT_INSTALLED_SIZE), 0.0); - assert_eq(textsearch.getDoubleValue("thereisnopackagewiththisname", VAL_APT_PACKAGE_SIZE), 0.0); - - int val; - val = textsearch.getIntValue("autoconf", VAL_APT_INSTALLED_SIZE); - assert(val == 2408); - val = textsearch.getIntValue("autoconf", VAL_APT_PACKAGE_SIZE); - assert(val == 741486); - cout << val; - assert_eq(textsearch.getIntValue("thereisnopackagewiththisname", VAL_APT_INSTALLED_SIZE), 0); - assert_eq(textsearch.getIntValue("thereisnopackagewiththisname", VAL_APT_PACKAGE_SIZE), 0); - } - -}; - -// vim:set ts=4 sw=4: |