1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>LCOV - lcov.info - ept/textsearch/textsearch.h</title>
<link rel="stylesheet" type="text/css" href="../../gcov.css">
</head>
<body>
<table width="100%" border=0 cellspacing=0 cellpadding=0>
<tr><td class="title">LTP GCOV extension - code coverage report</td></tr>
<tr><td class="ruler"><img src="../../glass.png" width=3 height=3 alt=""></td></tr>
<tr>
<td width="100%">
<table cellpadding=1 border=0 width="100%">
<tr>
<td class="headerItem" width="20%">Current view:</td>
<td class="headerValue" width="80%" colspan=4><a href="../../index.html">directory</a> - <a href="index.html">ept/textsearch</a> - textsearch.h</td>
</tr>
<tr>
<td class="headerItem" width="20%">Test:</td>
<td class="headerValue" width="80%" colspan=4>lcov.info</td>
</tr>
<tr>
<td class="headerItem" width="20%">Date:</td>
<td class="headerValue" width="20%">2008-08-14</td>
<td width="20%"></td>
<td class="headerItem" width="20%">Instrumented lines:</td>
<td class="headerValue" width="20%">15</td>
</tr>
<tr>
<td class="headerItem" width="20%">Code covered:</td>
<td class="headerValue" width="20%">86.7 %</td>
<td width="20%"></td>
<td class="headerItem" width="20%">Executed lines:</td>
<td class="headerValue" width="20%">13</td>
</tr>
</table>
</td>
</tr>
<tr><td class="ruler"><img src="../../glass.png" width=3 height=3 alt=""></td></tr>
</table>
<table cellpadding=0 cellspacing=0 border=0>
<tr>
<td><br></td>
</tr>
<tr>
<td><pre class="source">
<span class="lineNum"> 1 </span> : #ifndef EPT_TEXTSEARCH_TEXTSEARCH_H
<span class="lineNum"> 2 </span> : #define EPT_TEXTSEARCH_TEXTSEARCH_H
<span class="lineNum"> 3 </span> :
<span class="lineNum"> 4 </span> : /** @file
<span class="lineNum"> 5 </span> : * @author Enrico Zini &lt;enrico@enricozini.org&gt;
<span class="lineNum"> 6 </span> : * Fast full-text search
<span class="lineNum"> 7 </span> : */
<span class="lineNum"> 8 </span> :
<span class="lineNum"> 9 </span> : /*
<span class="lineNum"> 10 </span> : * Copyright (C) 2007 Enrico Zini &lt;enrico@debian.org&gt;
<span class="lineNum"> 11 </span> : *
<span class="lineNum"> 12 </span> : * This program is free software; you can redistribute it and/or modify
<span class="lineNum"> 13 </span> : * it under the terms of the GNU General Public License as published by
<span class="lineNum"> 14 </span> : * the Free Software Foundation; either version 2 of the License, or
<span class="lineNum"> 15 </span> : * (at your option) any later version.
<span class="lineNum"> 16 </span> : *
<span class="lineNum"> 17 </span> : * This program is distributed in the hope that it will be useful,
<span class="lineNum"> 18 </span> : * but WITHOUT ANY WARRANTY; without even the implied warranty of
<span class="lineNum"> 19 </span> : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
<span class="lineNum"> 20 </span> : * GNU General Public License for more details.
<span class="lineNum"> 21 </span> : *
<span class="lineNum"> 22 </span> : * You should have received a copy of the GNU General Public License
<span class="lineNum"> 23 </span> : * along with this program; if not, write to the Free Software
<span class="lineNum"> 24 </span> : * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
<span class="lineNum"> 25 </span> : */
<span class="lineNum"> 26 </span> :
<span class="lineNum"> 27 </span> : #include &lt;xapian.h&gt;
<span class="lineNum"> 28 </span> : #include &lt;vector&gt;
<span class="lineNum"> 29 </span> : #include &lt;string&gt;
<span class="lineNum"> 30 </span> :
<span class="lineNum"> 31 </span> : namespace ept {
<span class="lineNum"> 32 </span> : namespace apt {
<span class="lineNum"> 33 </span> : class Apt;
<span class="lineNum"> 34 </span> : class PackageRecord;
<span class="lineNum"> 35 </span> : }
<span class="lineNum"> 36 </span> : namespace debtags {
<span class="lineNum"> 37 </span> : class Debtags;
<span class="lineNum"> 38 </span> : }
<span class="lineNum"> 39 </span> : namespace textsearch {
<span class="lineNum"> 40 </span> :
<span class="lineNum"> 41 </span> : // Allocate value indexes for known values
<span class="lineNum"> 42 </span> : const Xapian::valueno VAL_APT_INSTALLED_SIZE = 1;
<span class="lineNum"> 43 </span> : const Xapian::valueno VAL_APT_PACKAGE_SIZE = 2;
<span class="lineNum"> 44 </span> : const Xapian::valueno VAL_POPCON = 10;
<span class="lineNum"> 45 </span> : const Xapian::valueno VAL_ITERATING_RATING = 20;
<span class="lineNum"> 46 </span> : const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21;
<span class="lineNum"> 47 </span> : const Xapian::valueno VAL_ITERATING_USABILITY = 22;
<span class="lineNum"> 48 </span> : const Xapian::valueno VAL_ITERATING_SECURITY = 23;
<span class="lineNum"> 49 </span> : const Xapian::valueno VAL_ITERATING_PERFORMANCE = 24;
<span class="lineNum"> 50 </span> : const Xapian::valueno VAL_ITERATING_QUALITY = 25;
<span class="lineNum"> 51 </span> : const Xapian::valueno VAL_ITERATING_SUPPORT = 26;
<span class="lineNum"> 52 </span> : const Xapian::valueno VAL_ITERATING_ADOPTION = 27;
<span class="lineNum"> 53 </span> : // If you need to index a value and cannot edit this file, feel free to use any
<span class="lineNum"> 54 </span> : // value starting from 1000000
<span class="lineNum"> 55 </span> :
<span class="lineNum"> 56 </span> :
<span class="lineNum"> 57 </span> : /*
<span class="lineNum"> 58 </span> : Fallback on apt scan searches when index is not present
<span class="lineNum"> 59 </span> :
<span class="lineNum"> 60 </span> : Explicitly decide at instantiation (or at any other time) if a rebuild should
<span class="lineNum"> 61 </span> : be performed. Just adding a 'rebuildIfNeeded' method would be enough.
<span class="lineNum"> 62 </span> :
<span class="lineNum"> 63 </span> : 17:14 #xapian < enrico> Hello. I'm finally in a position of writing a library to maintain
<span class="lineNum"> 64 </span> : a xapian index with Debian package descriptions in a Debian system
<span class="lineNum"> 65 </span> : 17:14 #xapian < enrico> I have a question, though
<span class="lineNum"> 66 </span> : 17:14 #xapian < enrico> The descriptions change regularly as people run 'apt-get update'
<span class="lineNum"> 67 </span> : 17:15 #xapian < enrico> I'd need to have a way to update the description index after
<span class="lineNum"> 68 </span> : apt-get update, without rebuilding it from scratch
<span class="lineNum"> 69 </span> : 17:15 #xapian < enrico> Is there some documentation on how to do that? I can't exactly
<span class="lineNum"> 70 </span> : tell Xapian "the new description for package foo is this" because
<span class="lineNum"> 71 </span> : I'd need the xapian id
<span class="lineNum"> 72 </span> : 19:11 #xapian < omega> you can add a unique term with a boolean prefix?
<span class="lineNum"> 73 </span> : 19:11 #xapian < omega> like Qpackage-name
<span class="lineNum"> 74 </span> : 19:11 #xapian < omega> then you search for it and replace_document
<span class="lineNum"> 75 </span> : 19:24 #xapian < richardb> Or indeed, you use the "replace_document()" form which takes a
<span class="lineNum"> 76 </span> : unique_id term.
<span class="lineNum"> 77 </span> : 19:25 #xapian < richardb> Xapian::docid replace_document(const std::string &
<span class="lineNum"> 78 </span> : unique_term,
<span class="lineNum"> 79 </span> : 19:25 #xapian < richardb> const Xapian::Document &
<span class="lineNum"> 80 </span> : document);
<span class="lineNum"> 81 </span> : 19:43 #xapian < enrico> unique term
<span class="lineNum"> 82 </span> : 19:43 #xapian < enrico> nice!
<span class="lineNum"> 83 </span> : 19:44 #xapian < enrico> can I use a non-alpha prefix, like :package-name ?
<span class="lineNum"> 84 </span> : 19:45 #xapian < enrico> or pkg:package-name
<span class="lineNum"> 85 </span> : 19:45 #xapian < enrico> I suppose I can
<span class="lineNum"> 86 </span> : */
<span class="lineNum"> 87 </span> :
<span class="lineNum"> 88 </span> : /**
<span class="lineNum"> 89 </span> : * Maintains and accesses a Xapian index of package descriptions.
<span class="lineNum"> 90 </span> : *
<span class="lineNum"> 91 </span> : * Unlike Debtags and Popcon, TextSearch does not attempt to create the
<span class="lineNum"> 92 </span> : * index in the home directory if no system index is found and it is not
<span class="lineNum"> 93 </span> : * running as root: this is to avoid secretly building large indexes (>50Mb)
<span class="lineNum"> 94 </span> : * in the home directory of users.
<span class="lineNum"> 95 </span> : *
<span class="lineNum"> 96 </span> : * The idea then is to have root keep the index up to date, possibly running a
<span class="lineNum"> 97 </span> : * reindexing tool once a day, or after an apt-get update.
<span class="lineNum"> 98 </span> : *
<span class="lineNum"> 99 </span> : * This works because the full text search index is useful even if it is
<span class="lineNum"> 100 </span> : * slightly out of date.
<span class="lineNum"> 101 </span> : */
<span class="lineNum"> 102 </span> : class TextSearch
<span class="lineNum"> 103 </span><span class="lineCov"> 11 : {</span>
<span class="lineNum"> 104 </span> : protected:
<span class="lineNum"> 105 </span> : time_t m_timestamp;
<span class="lineNum"> 106 </span> : Xapian::Database m_db;
<span class="lineNum"> 107 </span> : Xapian::Stem m_stem;
<span class="lineNum"> 108 </span> :
<span class="lineNum"> 109 </span> : /// Return a lowercased copy of the string
<span class="lineNum"> 110 </span> : static std::string toLower(const std::string& str);
<span class="lineNum"> 111 </span> :
<span class="lineNum"> 112 </span> : /**
<span class="lineNum"> 113 </span> : * Add normalised tokens computed from the string to the document doc.
<span class="lineNum"> 114 </span> : *
<span class="lineNum"> 115 </span> : * pos is used as a sequence generator for entering the token position in
<span class="lineNum"> 116 </span> : * the document.
<span class="lineNum"> 117 </span> : */
<span class="lineNum"> 118 </span> : void normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const;
<span class="lineNum"> 119 </span> :
<span class="lineNum"> 120 </span> : public:
<span class="lineNum"> 121 </span> : struct ExtraIndexer
<span class="lineNum"> 122 </span> : {
<span class="lineNum"> 123 </span><span class="lineNoCov"> 0 : virtual ~ExtraIndexer() {}</span>
<span class="lineNum"> 124 </span> : virtual void operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const = 0;
<span class="lineNum"> 125 </span> : };
<span class="lineNum"> 126 </span> :
<span class="lineNum"> 127 </span> : TextSearch();
<span class="lineNum"> 128 </span> :
<span class="lineNum"> 129 </span> : /// Access the Xapian database
<span class="lineNum"> 130 </span><span class="lineCov"> 3 : Xapian::Database& db() { return m_db; }</span>
<span class="lineNum"> 131 </span> :
<span class="lineNum"> 132 </span> : /// Access the Xapian database
<span class="lineNum"> 133 </span><span class="lineCov"> 4 : const Xapian::Database& db() const { return m_db; }</span>
<span class="lineNum"> 134 </span> :
<span class="lineNum"> 135 </span> : /// Timestamp of when the Xapian database was last updated
<span class="lineNum"> 136 </span><span class="lineCov"> 3 : time_t timestamp() const { return m_timestamp; }</span>
<span class="lineNum"> 137 </span> :
<span class="lineNum"> 138 </span> : /// Returns true if the index has data
<span class="lineNum"> 139 </span><span class="lineCov"> 3 : bool hasData() const { return m_timestamp > 0; }</span>
<span class="lineNum"> 140 </span> :
<span class="lineNum"> 141 </span> : /// Returns true if the index is older than the Apt database information
<span class="lineNum"> 142 </span> : bool needsRebuild(apt::Apt& apt);
<span class="lineNum"> 143 </span> :
<span class="lineNum"> 144 </span> : /**
<span class="lineNum"> 145 </span> : * Rebuild the index if needed.
<span class="lineNum"> 146 </span> : *
<span class="lineNum"> 147 </span> : * Allows specifying functors that contribute to the indexing.
<span class="lineNum"> 148 </span> : *
<span class="lineNum"> 149 </span> : * @note This requires write access to the index directory.
<span class="lineNum"> 150 </span> : * @note This is not the main way to update the index: it is provided here
<span class="lineNum"> 151 </span> : * only as a way to build a draft index for the library tests
<span class="lineNum"> 152 </span> : */
<span class="lineNum"> 153 </span> : bool rebuildIfNeeded(
<span class="lineNum"> 154 </span> : apt::Apt& apt,
<span class="lineNum"> 155 </span> : const std::vector&lt;const ExtraIndexer*&gt;&amp; extraIndexers = std::vector&lt;const ExtraIndexer*&gt;());
<span class="lineNum"> 156 </span> :
<span class="lineNum"> 157 </span> : /**
<span class="lineNum"> 158 </span> : * Retrieve a Xapian docid by package name
<span class="lineNum"> 159 </span> : */
<span class="lineNum"> 160 </span> : Xapian::docid docidByName(const std::string& pkgname) const;
<span class="lineNum"> 161 </span> :
<span class="lineNum"> 162 </span> : /**
<span class="lineNum"> 163 </span> : * Tokenize the string and build an OR query with the resulting keywords
<span class="lineNum"> 164 </span> : */
<span class="lineNum"> 165 </span> : Xapian::Query makeORQuery(const std::string& keywords) const;
<span class="lineNum"> 166 </span> :
<span class="lineNum"> 167 </span> : /**
<span class="lineNum"> 168 </span> : * Tokenize the string and build an OR query with the resulting keywords.
<span class="lineNum"> 169 </span> : *
<span class="lineNum"> 170 </span> : * The last token in keywords is considered to be typed only partially, to
<span class="lineNum"> 171 </span> : * implement proper search-as-you-type.
<span class="lineNum"> 172 </span> : */
<span class="lineNum"> 173 </span> : Xapian::Query makePartialORQuery(const std::string& keywords) const;
<span class="lineNum"> 174 </span> :
<span class="lineNum"> 175 </span> : /**
<span class="lineNum"> 176 </span> : * Build a query with the given keywords, specified as iterators of strings
<span class="lineNum"> 177 </span> : */
<span class="lineNum"> 178 </span> : template&lt;typename ITER&gt;
<span class="lineNum"> 179 </span><span class="lineCov"> 3 : Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const</span>
<span class="lineNum"> 180 </span> : {
<span class="lineNum"> 181 </span><span class="lineCov"> 3 : std::vector&lt;std::string&gt; terms;</span>
<span class="lineNum"> 182 </span> : // Insert both the lowercased and the stemmed lowercased query terms
<span class="lineNum"> 183 </span><span class="lineCov"> 10 : for (ITER i = begin; i != end; ++i)</span>
<span class="lineNum"> 184 </span> : {
<span class="lineNum"> 185 </span><span class="lineCov"> 7 : std::string t = toLower(*i);</span>
<span class="lineNum"> 186 </span><span class="lineCov"> 7 : std::string s = m_stem(t);</span>
<span class="lineNum"> 187 </span><span class="lineCov"> 7 : terms.push_back(t);</span>
<span class="lineNum"> 188 </span><span class="lineCov"> 7 : if (s != t)</span>
<span class="lineNum"> 189 </span><span class="lineNoCov"> 0 : terms.push_back("Z" + s);</span>
<span class="lineNum"> 190 </span> : }
<span class="lineNum"> 191 </span><span class="lineCov"> 3 : return Xapian::Query(Xapian::Query::OP_OR, terms.begin(), terms.end());</span>
<span class="lineNum"> 192 </span> : }
<span class="lineNum"> 193 </span> :
<span class="lineNum"> 194 </span> : /// Return a list of tag-based terms that can be used to expand an OR query
<span class="lineNum"> 195 </span> : std::vector&lt;std::string&gt; expand(Xapian::Enquire&amp; enq) const;
<span class="lineNum"> 196 </span> :
<span class="lineNum"> 197 </span> : // std::vector&lt;std::string&gt; similar(const std::string&amp; pkg);
<span class="lineNum"> 198 </span> :
<span class="lineNum"> 199 </span> : /**
<span class="lineNum"> 200 </span> : * Create a query to look for packages similar to the given one
<span class="lineNum"> 201 </span> : */
<span class="lineNum"> 202 </span> : Xapian::Query makeRelatedQuery(const std::string& pkgname) const;
<span class="lineNum"> 203 </span> :
<span class="lineNum"> 204 </span> : /**
<span class="lineNum"> 205 </span> : * Get the double value identified by val_id for the package pkgname
<span class="lineNum"> 206 </span> : */
<span class="lineNum"> 207 </span> : double getDoubleValue(const std::string& pkgname, Xapian::valueno val_id) const;
<span class="lineNum"> 208 </span> :
<span class="lineNum"> 209 </span> : /**
<span class="lineNum"> 210 </span> : * Get the integer value identified by val_id for the package pkgname
<span class="lineNum"> 211 </span> : */
<span class="lineNum"> 212 </span> : int getIntValue(const std::string& pkgname, Xapian::valueno val_id) const;
<span class="lineNum"> 213 </span> : };
<span class="lineNum"> 214 </span> :
<span class="lineNum"> 215 </span> : }
<span class="lineNum"> 216 </span> : }
<span class="lineNum"> 217 </span> :
<span class="lineNum"> 218 </span> : // vim:set ts=4 sw=4:
<span class="lineNum"> 219 </span> : #endif
</pre>
</td>
</tr>
</table>
<br>
<table width="100%" border=0 cellspacing=0 cellpadding=0>
<tr><td class="ruler"><img src="../../glass.png" width=3 height=3 alt=""></td></tr>
<tr><td class="versionInfo">Generated by: <a href="http://ltp.sourceforge.net/coverage/lcov.php" target="_parent">LTP GCOV extension version 1.6</a></td></tr>
</table>
<br>
</body>
</html>
|