dan | 91f0ce3 | 2009-11-14 11:41:00 +0000 | [diff] [blame] | 1 | # 2009 November 04 |
| 2 | # |
| 3 | # The author disclaims copyright to this source code. In place of |
| 4 | # a legal notice, here is a blessing: |
| 5 | # |
| 6 | # May you do good and not evil. |
| 7 | # May you find forgiveness for yourself and forgive others. |
| 8 | # May you share freely, never taking more than you give. |
| 9 | # |
| 10 | #*********************************************************************** |
| 11 | # |
| 12 | # This file contains common code used the fts3 tests. At one point |
| 13 | # equivalent functionality was implemented in C code. But it is easier |
| 14 | # to use Tcl. |
| 15 | # |
| 16 | |
| 17 | #------------------------------------------------------------------------- |
| 18 | # USAGE: fts3_integrity_check TBL |
| 19 | # |
| 20 | # This proc is used to verify that the full-text index is consistent with |
| 21 | # the contents of the fts3 table. In other words, it checks that the |
| 22 | # data in the %_contents table matches that in the %_segdir and %_segments |
| 23 | # tables. |
| 24 | # |
| 25 | # This is not an efficient procedure. It uses a lot of memory and a lot |
| 26 | # of CPU. But it is better than not checking at all. |
| 27 | # |
| 28 | # The procedure is: |
| 29 | # |
| 30 | # 1) Read the entire full-text index from the %_segdir and %_segments |
| 31 | # tables into memory. For each entry in the index, the following is |
| 32 | # done: |
| 33 | # |
| 34 | # set C($iDocid,$iCol,$iPosition) $zTerm |
| 35 | # |
| 36 | # 2) Iterate through each column of each row of the %_content table. |
| 37 | # Tokenize all documents, and check that for each token there is |
| 38 | # a corresponding entry in the $C array. After checking a token, |
| 39 | # [unset] the $C array entry. |
| 40 | # |
| 41 | # 3) Check that array $C is now empty. |
| 42 | # |
| 43 | # |
| 44 | proc fts3_integrity_check {tbl} { |
| 45 | |
| 46 | fts3_read2 $tbl 1 A |
| 47 | |
| 48 | foreach zTerm [array names A] { |
| 49 | foreach doclist $A($zTerm) { |
| 50 | set docid 0 |
| 51 | while {[string length $doclist]>0} { |
| 52 | set iCol 0 |
| 53 | set iPos 0 |
| 54 | set lPos [list] |
| 55 | set lCol [list] |
| 56 | |
| 57 | # First varint of a doclist-entry is the docid. Delta-compressed |
| 58 | # with respect to the docid of the previous entry. |
| 59 | # |
| 60 | incr docid [gobble_varint doclist] |
| 61 | if {[info exists D($zTerm,$docid)]} { |
| 62 | while {[set iDelta [gobble_varint doclist]] != 0} {} |
| 63 | continue |
| 64 | } |
| 65 | set D($zTerm,$docid) 1 |
| 66 | |
| 67 | # Gobble varints until the 0x00 that terminates the doclist-entry |
| 68 | # is found. |
| 69 | while {[set iDelta [gobble_varint doclist]] > 0} { |
| 70 | if {$iDelta == 1} { |
| 71 | set iCol [gobble_varint doclist] |
| 72 | set iPos 0 |
| 73 | } else { |
| 74 | incr iPos $iDelta |
| 75 | incr iPos -2 |
| 76 | set C($docid,$iCol,$iPos) $zTerm |
| 77 | } |
| 78 | } |
| 79 | } |
| 80 | } |
| 81 | } |
| 82 | |
dan | 4b4d446 | 2009-11-14 23:50:11 +0000 | [diff] [blame] | 83 | foreach key [array names C] { |
| 84 | #puts "$key -> $C($key)" |
| 85 | } |
| 86 | |
dan | 91f0ce3 | 2009-11-14 11:41:00 +0000 | [diff] [blame] | 87 | |
| 88 | db eval "SELECT * FROM ${tbl}_content" E { |
| 89 | set iCol 0 |
| 90 | set iDoc $E(docid) |
| 91 | foreach col [lrange $E(*) 1 end] { |
| 92 | set c $E($col) |
| 93 | set sql {SELECT fts3_tokenizer_test('simple', $c)} |
| 94 | |
| 95 | foreach {pos term dummy} [db one $sql] { |
dan | 4b4d446 | 2009-11-14 23:50:11 +0000 | [diff] [blame] | 96 | if {![info exists C($iDoc,$iCol,$pos)]} { |
| 97 | set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing" |
dan | 91f0ce3 | 2009-11-14 11:41:00 +0000 | [diff] [blame] | 98 | lappend errors $es |
dan | 4b4d446 | 2009-11-14 23:50:11 +0000 | [diff] [blame] | 99 | } else { |
| 100 | if {$C($iDoc,$iCol,$pos) != "$term"} { |
| 101 | set es "Error at docid=$iDoc col=$iCol pos=$pos. Index " |
| 102 | append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\"" |
| 103 | lappend errors $es |
| 104 | } |
| 105 | unset C($iDoc,$iCol,$pos) |
dan | 91f0ce3 | 2009-11-14 11:41:00 +0000 | [diff] [blame] | 106 | } |
dan | 91f0ce3 | 2009-11-14 11:41:00 +0000 | [diff] [blame] | 107 | } |
| 108 | incr iCol |
| 109 | } |
| 110 | } |
| 111 | |
| 112 | foreach c [array names C] { |
| 113 | lappend errors "Bad index entry: $c -> $C($c)" |
| 114 | } |
| 115 | |
| 116 | if {[info exists errors]} { return [join $errors "\n"] } |
| 117 | return "ok" |
| 118 | } |
| 119 | |
| 120 | # USAGE: fts3_terms TBL WHERE |
| 121 | # |
| 122 | # Argument TBL must be the name of an FTS3 table. Argument WHERE is an |
| 123 | # SQL expression that will be used as the WHERE clause when scanning |
| 124 | # the %_segdir table. As in the following query: |
| 125 | # |
| 126 | # "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}" |
| 127 | # |
| 128 | # This function returns a list of all terms present in the segments |
| 129 | # selected by the statement above. |
| 130 | # |
| 131 | proc fts3_terms {tbl where} { |
| 132 | fts3_read $tbl $where a |
| 133 | return [lsort [array names a]] |
| 134 | } |
| 135 | |
| 136 | |
| 137 | # USAGE: fts3_doclist TBL TERM WHERE |
| 138 | # |
| 139 | # Argument TBL must be the name of an FTS3 table. TERM is a term that may |
| 140 | # or may not be present in the table. Argument WHERE is used to select a |
| 141 | # subset of the b-tree segments in the associated full-text index as |
| 142 | # described above for [fts3_terms]. |
| 143 | # |
| 144 | # This function returns the results of merging the doclists associated |
| 145 | # with TERM in the selected segments. Each doclist is an element of the |
| 146 | # returned list. Each doclist is formatted as follows: |
| 147 | # |
| 148 | # [$docid ?$col[$off1 $off2...]?...] |
| 149 | # |
| 150 | # The formatting is odd for a Tcl command in order to be compatible with |
| 151 | # the original C-language implementation. If argument WHERE is "1", then |
| 152 | # any empty doclists are omitted from the returned list. |
| 153 | # |
| 154 | proc fts3_doclist {tbl term where} { |
| 155 | fts3_read $tbl $where a |
| 156 | |
| 157 | |
| 158 | foreach doclist $a($term) { |
| 159 | set docid 0 |
| 160 | |
| 161 | while {[string length $doclist]>0} { |
| 162 | set iCol 0 |
| 163 | set iPos 0 |
| 164 | set lPos [list] |
| 165 | set lCol [list] |
| 166 | incr docid [gobble_varint doclist] |
| 167 | |
| 168 | while {[set iDelta [gobble_varint doclist]] > 0} { |
| 169 | if {$iDelta == 1} { |
| 170 | lappend lCol [list $iCol $lPos] |
| 171 | set iPos 0 |
| 172 | set lPos [list] |
| 173 | set iCol [gobble_varint doclist] |
| 174 | } else { |
| 175 | incr iPos $iDelta |
| 176 | incr iPos -2 |
| 177 | lappend lPos $iPos |
| 178 | } |
| 179 | } |
| 180 | |
| 181 | if {[llength $lPos]>0} { |
| 182 | lappend lCol [list $iCol $lPos] |
| 183 | } |
| 184 | |
| 185 | if {$where != "1" || [llength $lCol]>0} { |
| 186 | set ret($docid) $lCol |
| 187 | } else { |
| 188 | unset -nocomplain ret($docid) |
| 189 | } |
| 190 | } |
| 191 | } |
| 192 | |
| 193 | set lDoc [list] |
| 194 | foreach docid [lsort -integer [array names ret]] { |
| 195 | set lCol [list] |
| 196 | set cols "" |
| 197 | foreach col $ret($docid) { |
| 198 | foreach {iCol lPos} $col {} |
| 199 | append cols " $iCol\[[join $lPos { }]\]" |
| 200 | } |
| 201 | lappend lDoc "\[${docid}${cols}\]" |
| 202 | } |
| 203 | |
| 204 | join $lDoc " " |
| 205 | } |
| 206 | |
| 207 | ########################################################################### |
| 208 | |
| 209 | proc gobble_varint {varname} { |
| 210 | upvar $varname blob |
dan | f5fff2a | 2009-12-12 09:51:25 +0000 | [diff] [blame] | 211 | set n [read_fts3varint $blob ret] |
dan | 91f0ce3 | 2009-11-14 11:41:00 +0000 | [diff] [blame] | 212 | set blob [string range $blob $n end] |
| 213 | return $ret |
| 214 | } |
| 215 | proc gobble_string {varname nLength} { |
| 216 | upvar $varname blob |
| 217 | set ret [string range $blob 0 [expr $nLength-1]] |
| 218 | set blob [string range $blob $nLength end] |
| 219 | return $ret |
| 220 | } |
| 221 | |
| 222 | # The argument is a blob of data representing an FTS3 segment leaf. |
| 223 | # Return a list consisting of alternating terms (strings) and doclists |
| 224 | # (blobs of data). |
| 225 | # |
| 226 | proc fts3_readleaf {blob} { |
| 227 | set zPrev "" |
| 228 | set terms [list] |
| 229 | |
| 230 | while {[string length $blob] > 0} { |
| 231 | set nPrefix [gobble_varint blob] |
| 232 | set nSuffix [gobble_varint blob] |
| 233 | |
| 234 | set zTerm [string range $zPrev 0 [expr $nPrefix-1]] |
| 235 | append zTerm [gobble_string blob $nSuffix] |
| 236 | set doclist [gobble_string blob [gobble_varint blob]] |
| 237 | |
| 238 | lappend terms $zTerm $doclist |
| 239 | set zPrev $zTerm |
| 240 | } |
| 241 | |
| 242 | return $terms |
| 243 | } |
| 244 | |
| 245 | proc fts3_read2 {tbl where varname} { |
| 246 | upvar $varname a |
| 247 | array unset a |
| 248 | db eval " SELECT start_block, leaves_end_block, root |
| 249 | FROM ${tbl}_segdir WHERE $where |
| 250 | ORDER BY level ASC, idx DESC |
| 251 | " { |
| 252 | if {$start_block == 0} { |
| 253 | foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } |
| 254 | } else { |
| 255 | db eval " SELECT block |
| 256 | FROM ${tbl}_segments |
dan | 4b4d446 | 2009-11-14 23:50:11 +0000 | [diff] [blame] | 257 | WHERE blockid>=$start_block AND blockid<=$leaves_end_block |
dan | 91f0ce3 | 2009-11-14 11:41:00 +0000 | [diff] [blame] | 258 | ORDER BY blockid |
| 259 | " { |
| 260 | foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } |
| 261 | |
| 262 | } |
| 263 | } |
| 264 | } |
| 265 | } |
| 266 | |
| 267 | proc fts3_read {tbl where varname} { |
| 268 | upvar $varname a |
| 269 | array unset a |
| 270 | db eval " SELECT start_block, leaves_end_block, root |
| 271 | FROM ${tbl}_segdir WHERE $where |
| 272 | ORDER BY level DESC, idx ASC |
| 273 | " { |
| 274 | if {$start_block == 0} { |
| 275 | foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } |
| 276 | } else { |
| 277 | db eval " SELECT block |
| 278 | FROM ${tbl}_segments |
| 279 | WHERE blockid>=$start_block AND blockid<$leaves_end_block |
| 280 | ORDER BY blockid |
| 281 | " { |
| 282 | foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } |
| 283 | |
| 284 | } |
| 285 | } |
| 286 | } |
| 287 | } |
| 288 | |
dan | 81fa6dc | 2009-11-28 12:40:32 +0000 | [diff] [blame] | 289 | ########################################################################## |
| 290 | |