blob: 97af54925f3953e866468ddac7cc3afa8736b471 [file] [log] [blame]
dane2e51452009-12-03 17:36:22 +00001# 2009 December 03
2#
3# May you do good and not evil.
4# May you find forgiveness for yourself and forgive others.
5# May you share freely, never taking more than you give.
6#
7#***********************************************************************
8#
9# Brute force (random data) tests for FTS3.
10#
11
dan1548f212009-12-11 07:07:36 +000012#-------------------------------------------------------------------------
13#
14# The FTS3 tests implemented in this file focus on testing that FTS3
15# returns the correct set of documents for various types of full-text
16# query. This is done using pseudo-randomly generated data and queries.
17# The expected result of each query is calculated using Tcl code.
18#
19# 1. The database is initialized to contain a single table with three
20# columns. 100 rows are inserted into the table. Each of the three
21# values in each row is a document consisting of between 0 and 100
22# terms. Terms are selected from a vocabulary of $G(nVocab) terms.
23#
24# 2. The following is performed 100 times:
25#
26# a. A row is inserted into the database. The row contents are
27# generated as in step 1. The docid is a pseudo-randomly selected
28# value between 0 and 1000000.
29#
30# b. A psuedo-randomly selected row is updated. One of its columns is
31# set to contain a new document generated in the same way as the
32# documents in step 1.
33#
34# c. A psuedo-randomly selected row is deleted.
35#
36# d. For each of several types of fts3 queries, 10 SELECT queries
37# of the form:
38#
39# SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
40#
41# are evaluated. The results are compared to those calculated by
42# Tcl code in this file. The patterns used for the different query
43# types are:
44#
45# 1. query = <term>
46# 2. query = <prefix>
47# 3. query = "<term> <term>"
48# 4. query = "<term> <term> <term>"
49# 5. query = "<prefix> <prefix> <prefix>"
50# 6. query = <term> NEAR <term>
51# 7. query = <term> NEAR/11 <term> NEAR/11 <term>
52# 8. query = <term> OR <term>
53# 9. query = <term> NOT <term>
54# 10. query = <term> AND <term>
55# 11. query = <term> NEAR <term> OR <term> NEAR <term>
56# 12. query = <term> NEAR <term> NOT <term> NEAR <term>
57# 13. query = <term> NEAR <term> AND <term> NEAR <term>
58#
59# where <term> is a term psuedo-randomly selected from the vocabulary
60# and prefix is the first 2 characters of such a term followed by
61# a "*" character.
62#
63# Every second iteration, steps (a) through (d) above are performed
64# within a single transaction. This forces the queries in (d) to
65# read data from both the database and the in-memory hash table
66# that caches the full-text index entries created by steps (a), (b)
67# and (c) until the transaction is committed.
68#
69# The procedure above is run 5 times, using advisory fts3 node sizes of 50,
70# 500, 1000 and 2000 bytes.
71#
72# After the test using an advisory node-size of 50, an OOM test is run using
73# the database. This test is similar to step (d) above, except that it tests
74# the effects of transient and persistent OOM conditions encountered while
75# executing each query.
76#
77
dane2e51452009-12-03 17:36:22 +000078set testdir [file dirname $argv0]
79source $testdir/tester.tcl
80
81# If this build does not include FTS3, skip the tests in this file.
82#
83ifcapable !fts3 { finish_test ; return }
84source $testdir/fts3_common.tcl
danef378022010-05-04 11:06:03 +000085source $testdir/malloc_common.tcl
dane2e51452009-12-03 17:36:22 +000086
dan1548f212009-12-11 07:07:36 +000087set G(nVocab) 100
88
dane2e51452009-12-03 17:36:22 +000089set nVocab 100
90set lVocab [list]
91
danb8937212009-12-10 16:04:25 +000092expr srand(0)
93
dane2e51452009-12-03 17:36:22 +000094# Generate a vocabulary of nVocab words. Each word is 3 characters long.
95#
96set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
97for {set i 0} {$i < $nVocab} {incr i} {
dan45bcd6c2009-12-12 13:16:09 +000098 set len [expr int(rand()*3)+2]
dane2e51452009-12-03 17:36:22 +000099 set word [lindex $lChar [expr int(rand()*26)]]
100 append word [lindex $lChar [expr int(rand()*26)]]
dan45bcd6c2009-12-12 13:16:09 +0000101 if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
102 if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
dane2e51452009-12-03 17:36:22 +0000103 lappend lVocab $word
104}
105
106proc random_term {} {
107 lindex $::lVocab [expr {int(rand()*$::nVocab)}]
108}
109
110# Return a document consisting of $nWord arbitrarily selected terms
111# from the $::lVocab list.
112#
113proc generate_doc {nWord} {
114 set doc [list]
115 for {set i 0} {$i < $nWord} {incr i} {
116 lappend doc [random_term]
117 }
118 return $doc
119}
120
121
122
123# Primitives to update the table.
124#
drha43be912009-12-04 01:44:42 +0000125unset -nocomplain t1
dane2e51452009-12-03 17:36:22 +0000126proc insert_row {rowid} {
127 set a [generate_doc [expr int((rand()*100))]]
128 set b [generate_doc [expr int((rand()*100))]]
129 set c [generate_doc [expr int((rand()*100))]]
130 execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
131 set ::t1($rowid) [list $a $b $c]
132}
133proc delete_row {rowid} {
134 execsql { DELETE FROM t1 WHERE rowid = $rowid }
135 catch {unset ::t1($rowid)}
136}
137proc update_row {rowid} {
138 set cols {a b c}
139 set iCol [expr int(rand()*3)]
140 set doc [generate_doc [expr int((rand()*100))]]
141 lset ::t1($rowid) $iCol $doc
142 execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
143}
144
danff32e392009-12-07 12:34:51 +0000145proc simple_phrase {zPrefix} {
dane2e51452009-12-03 17:36:22 +0000146 set ret [list]
dan45bcd6c2009-12-12 13:16:09 +0000147
148 set reg [string map {* {[^ ]*}} $zPrefix]
149 set reg " $reg "
150
dan3540c1f2009-12-22 18:56:19 +0000151 foreach key [lsort -integer [array names ::t1]] {
152 set value $::t1($key)
153 set cnt [list]
dan45bcd6c2009-12-12 13:16:09 +0000154 foreach col $value {
dan3540c1f2009-12-22 18:56:19 +0000155 if {[regexp $reg " $col "]} { lappend ret $key ; break }
dan45bcd6c2009-12-12 13:16:09 +0000156 }
danacf28fb2009-12-04 14:11:33 +0000157 }
dan45bcd6c2009-12-12 13:16:09 +0000158
dan3540c1f2009-12-22 18:56:19 +0000159 #lsort -uniq -integer $ret
160 set ret
danacf28fb2009-12-04 14:11:33 +0000161}
dan45bcd6c2009-12-12 13:16:09 +0000162
danceafa472010-01-14 11:17:05 +0000163# This [proc] is used to test the FTS3 matchinfo() function.
164#
dan5289b012011-06-06 14:51:50 +0000165proc simple_token_matchinfo {zToken bDesc} {
dana98af172010-01-02 19:02:02 +0000166
danceafa472010-01-14 11:17:05 +0000167 set nDoc(0) 0
168 set nDoc(1) 0
169 set nDoc(2) 0
170 set nHit(0) 0
171 set nHit(1) 0
172 set nHit(2) 0
173
dan5289b012011-06-06 14:51:50 +0000174 set dir -inc
175 if {$bDesc} { set dir -dec }
danceafa472010-01-14 11:17:05 +0000176
177 foreach key [array names ::t1] {
dan3540c1f2009-12-22 18:56:19 +0000178 set value $::t1($key)
danceafa472010-01-14 11:17:05 +0000179 set a($key) [list]
dana98af172010-01-02 19:02:02 +0000180 foreach i {0 1 2} col $value {
danceafa472010-01-14 11:17:05 +0000181 set hit [llength [lsearch -all $col $zToken]]
182 lappend a($key) $hit
183 incr nHit($i) $hit
184 if {$hit>0} { incr nDoc($i) }
dan3540c1f2009-12-22 18:56:19 +0000185 }
186 }
danceafa472010-01-14 11:17:05 +0000187
188 set ret [list]
dan5289b012011-06-06 14:51:50 +0000189 foreach docid [lsort -integer $dir [array names a]] {
danceafa472010-01-14 11:17:05 +0000190 if { [lindex [lsort -integer $a($docid)] end] } {
191 set matchinfo [list 1 3]
192 foreach i {0 1 2} hit $a($docid) {
193 lappend matchinfo $hit $nHit($i) $nDoc($i)
194 }
195 lappend ret $docid $matchinfo
196 }
197 }
198
199 set ret
dan3540c1f2009-12-22 18:56:19 +0000200}
201
danacf28fb2009-12-04 14:11:33 +0000202proc simple_near {termlist nNear} {
dan165b67c2009-12-04 19:07:24 +0000203 set ret [list]
204
dan165b67c2009-12-04 19:07:24 +0000205 foreach {key value} [array get ::t1] {
206 foreach v $value {
dan165b67c2009-12-04 19:07:24 +0000207
208 set l [lsearch -exact -all $v [lindex $termlist 0]]
209 foreach T [lrange $termlist 1 end] {
210 set l2 [list]
211 foreach i $l {
212 set iStart [expr $i - $nNear - 1]
213 set iEnd [expr $i + $nNear + 1]
dan28f372f2009-12-05 14:29:22 +0000214 if {$iStart < 0} {set iStart 0}
dan165b67c2009-12-04 19:07:24 +0000215 foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
216 incr i2 $iStart
217 if {$i2 != $i} { lappend l2 $i2 }
218 }
219 }
dan28f372f2009-12-05 14:29:22 +0000220 set l [lsort -uniq -integer $l2]
dan165b67c2009-12-04 19:07:24 +0000221 }
222
223 if {[llength $l]} {
dan28f372f2009-12-05 14:29:22 +0000224#puts "MATCH($key): $v"
dan165b67c2009-12-04 19:07:24 +0000225 lappend ret $key
226 }
227 }
228 }
229
230 lsort -unique -integer $ret
danacf28fb2009-12-04 14:11:33 +0000231}
232
danff32e392009-12-07 12:34:51 +0000233# The following three procs:
234#
235# setup_not A B
236# setup_or A B
237# setup_and A B
238#
239# each take two arguments. Both arguments must be lists of integer values
240# sorted by value. The return value is the list produced by evaluating
241# the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
242# AND.
243#
244proc setop_not {A B} {
245 foreach b $B { set n($b) {} }
246 set ret [list]
247 foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
248 return $ret
249}
250proc setop_or {A B} {
251 lsort -integer -uniq [concat $A $B]
252}
253proc setop_and {A B} {
254 foreach b $B { set n($b) {} }
255 set ret [list]
256 foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
257 return $ret
258}
259
dan3540c1f2009-12-22 18:56:19 +0000260proc mit {blob} {
261 set scan(littleEndian) i*
262 set scan(bigEndian) I*
263 binary scan $blob $scan($::tcl_platform(byteOrder)) r
264 return $r
265}
266db func mit mit
danff32e392009-12-07 12:34:51 +0000267set sqlite_fts3_enable_parentheses 1
268
dan5289b012011-06-06 14:51:50 +0000269proc do_orderbydocid_test {tn sql res} {
270 uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res]
271 uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \
272 [lsort -int -dec $res]
273 ]
274}
275
dan786b0682011-06-09 10:48:02 +0000276set NUM_TRIALS 100
277
dan5289b012011-06-06 14:51:50 +0000278foreach {nodesize order} {
279 50 DESC
280 50 ASC
281 500 ASC
282 1000 DESC
283 2000 ASC
284} {
dane2e51452009-12-03 17:36:22 +0000285 catch { array unset ::t1 }
dan5289b012011-06-06 14:51:50 +0000286 set testname "$nodesize/$order"
dane2e51452009-12-03 17:36:22 +0000287
288 # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
289 #
290 db transaction {
291 catchsql { DROP TABLE t1 }
dan5289b012011-06-06 14:51:50 +0000292 execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)"
dan601cd9a2009-12-11 16:03:45 +0000293 execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
dane2e51452009-12-03 17:36:22 +0000294 for {set i 0} {$i < 100} {incr i} { insert_row $i }
295 }
296
dan786b0682011-06-09 10:48:02 +0000297 for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} {
danb8937212009-12-10 16:04:25 +0000298 catchsql COMMIT
dan18ff7fa2009-12-09 14:39:41 +0000299
300 set DO_MALLOC_TEST 0
301 set nRep 10
302 if {$iTest==100 && $nodesize==50} {
303 set DO_MALLOC_TEST 1
304 set nRep 2
305 }
dan5289b012011-06-06 14:51:50 +0000306
307 set ::testprefix fts3rnd-1.$testname.$iTest
dane2e51452009-12-03 17:36:22 +0000308
309 # Delete one row, update one row and insert one row.
310 #
311 set rows [array names ::t1]
312 set nRow [llength $rows]
313 set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
314 set iDelete $iUpdate
315 while {$iDelete == $iUpdate} {
316 set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
317 }
318 set iInsert $iUpdate
319 while {[info exists ::t1($iInsert)]} {
320 set iInsert [expr {int(rand()*1000000)}]
321 }
danb8937212009-12-10 16:04:25 +0000322 execsql BEGIN
dane2e51452009-12-03 17:36:22 +0000323 insert_row $iInsert
324 update_row $iUpdate
325 delete_row $iDelete
danb8937212009-12-10 16:04:25 +0000326 if {0==($iTest%2)} { execsql COMMIT }
dan28f372f2009-12-05 14:29:22 +0000327
dan4d8d2782010-12-02 17:39:26 +0000328 if {0==($iTest%2)} {
dan5289b012011-06-06 14:51:50 +0000329 #do_test 0 { fts3_integrity_check t1 } ok
dan4d8d2782010-12-02 17:39:26 +0000330 }
331
dane2e51452009-12-03 17:36:22 +0000332 # Pick 10 terms from the vocabulary. Check that the results of querying
333 # the database for the set of documents containing each of these terms
334 # is the same as the result obtained by scanning the contents of the Tcl
335 # array for each term.
336 #
danacf28fb2009-12-04 14:11:33 +0000337 for {set i 0} {$i < 10} {incr i} {
338 set term [random_term]
dan786b0682011-06-09 10:48:02 +0000339 do_select_test 1.$i.asc {
dan3540c1f2009-12-22 18:56:19 +0000340 SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
dan5289b012011-06-06 14:51:50 +0000341 ORDER BY docid ASC
342 } [simple_token_matchinfo $term 0]
dan786b0682011-06-09 10:48:02 +0000343 do_select_test 1.$i.desc {
dan5289b012011-06-06 14:51:50 +0000344 SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
345 ORDER BY docid DESC
346 } [simple_token_matchinfo $term 1]
dane2e51452009-12-03 17:36:22 +0000347 }
348
danacf28fb2009-12-04 14:11:33 +0000349 # This time, use the first two characters of each term as a term prefix
350 # to query for. Test that querying the Tcl array produces the same results
351 # as querying the FTS3 table for the prefix.
352 #
dan18ff7fa2009-12-09 14:39:41 +0000353 for {set i 0} {$i < $nRep} {incr i} {
dan45bcd6c2009-12-12 13:16:09 +0000354 set prefix [string range [random_term] 0 end-1]
danacf28fb2009-12-04 14:11:33 +0000355 set match "${prefix}*"
dan5289b012011-06-06 14:51:50 +0000356 do_orderbydocid_test 2.$i {
dan18ff7fa2009-12-09 14:39:41 +0000357 SELECT docid FROM t1 WHERE t1 MATCH $match
danff32e392009-12-07 12:34:51 +0000358 } [simple_phrase $match]
danacf28fb2009-12-04 14:11:33 +0000359 }
360
dane2e51452009-12-03 17:36:22 +0000361 # Similar to the above, except for phrase queries.
362 #
dan18ff7fa2009-12-09 14:39:41 +0000363 for {set i 0} {$i < $nRep} {incr i} {
dane2e51452009-12-03 17:36:22 +0000364 set term [list [random_term] [random_term]]
365 set match "\"$term\""
dan5289b012011-06-06 14:51:50 +0000366 do_orderbydocid_test 3.$i {
dan18ff7fa2009-12-09 14:39:41 +0000367 SELECT docid FROM t1 WHERE t1 MATCH $match
danff32e392009-12-07 12:34:51 +0000368 } [simple_phrase $term]
dane2e51452009-12-03 17:36:22 +0000369 }
danff32e392009-12-07 12:34:51 +0000370
dane2e51452009-12-03 17:36:22 +0000371 # Three word phrases.
372 #
dan18ff7fa2009-12-09 14:39:41 +0000373 for {set i 0} {$i < $nRep} {incr i} {
dane2e51452009-12-03 17:36:22 +0000374 set term [list [random_term] [random_term] [random_term]]
375 set match "\"$term\""
dan5289b012011-06-06 14:51:50 +0000376 do_orderbydocid_test 4.$i {
dan18ff7fa2009-12-09 14:39:41 +0000377 SELECT docid FROM t1 WHERE t1 MATCH $match
danff32e392009-12-07 12:34:51 +0000378 } [simple_phrase $term]
379 }
380
381 # Three word phrases made up of term-prefixes.
382 #
dan18ff7fa2009-12-09 14:39:41 +0000383 for {set i 0} {$i < $nRep} {incr i} {
dan45bcd6c2009-12-12 13:16:09 +0000384 set query "[string range [random_term] 0 end-1]* "
385 append query "[string range [random_term] 0 end-1]* "
386 append query "[string range [random_term] 0 end-1]*"
danff32e392009-12-07 12:34:51 +0000387
388 set match "\"$query\""
dan5289b012011-06-06 14:51:50 +0000389 do_orderbydocid_test 5.$i {
dan18ff7fa2009-12-09 14:39:41 +0000390 SELECT docid FROM t1 WHERE t1 MATCH $match
danff32e392009-12-07 12:34:51 +0000391 } [simple_phrase $query]
dane2e51452009-12-03 17:36:22 +0000392 }
danacf28fb2009-12-04 14:11:33 +0000393
dan5289b012011-06-06 14:51:50 +0000394 # A NEAR query with terms as the arguments:
395 #
396 # ... MATCH '$term1 NEAR $term2' ...
danacf28fb2009-12-04 14:11:33 +0000397 #
dan18ff7fa2009-12-09 14:39:41 +0000398 for {set i 0} {$i < $nRep} {incr i} {
danacf28fb2009-12-04 14:11:33 +0000399 set terms [list [random_term] [random_term]]
400 set match [join $terms " NEAR "]
dan5289b012011-06-06 14:51:50 +0000401 do_orderbydocid_test 6.$i {
dan18ff7fa2009-12-09 14:39:41 +0000402 SELECT docid FROM t1 WHERE t1 MATCH $match
dan165b67c2009-12-04 19:07:24 +0000403 } [simple_near $terms 10]
404 }
danff32e392009-12-07 12:34:51 +0000405
dan165b67c2009-12-04 19:07:24 +0000406 # A 3-way NEAR query with terms as the arguments.
407 #
dan18ff7fa2009-12-09 14:39:41 +0000408 for {set i 0} {$i < $nRep} {incr i} {
dan165b67c2009-12-04 19:07:24 +0000409 set terms [list [random_term] [random_term] [random_term]]
dan28f372f2009-12-05 14:29:22 +0000410 set nNear 11
411 set match [join $terms " NEAR/$nNear "]
dan5289b012011-06-06 14:51:50 +0000412 do_orderbydocid_test 7.$i {
dan18ff7fa2009-12-09 14:39:41 +0000413 SELECT docid FROM t1 WHERE t1 MATCH $match
dan28f372f2009-12-05 14:29:22 +0000414 } [simple_near $terms $nNear]
danacf28fb2009-12-04 14:11:33 +0000415 }
danff32e392009-12-07 12:34:51 +0000416
417 # Set operations on simple term queries.
418 #
419 foreach {tn op proc} {
420 8 OR setop_or
421 9 NOT setop_not
422 10 AND setop_and
423 } {
dan18ff7fa2009-12-09 14:39:41 +0000424 for {set i 0} {$i < $nRep} {incr i} {
danff32e392009-12-07 12:34:51 +0000425 set term1 [random_term]
426 set term2 [random_term]
427 set match "$term1 $op $term2"
dan5289b012011-06-06 14:51:50 +0000428 do_orderbydocid_test $tn.$i {
dan18ff7fa2009-12-09 14:39:41 +0000429 SELECT docid FROM t1 WHERE t1 MATCH $match
danff32e392009-12-07 12:34:51 +0000430 } [$proc [simple_phrase $term1] [simple_phrase $term2]]
431 }
432 }
433
434 # Set operations on NEAR queries.
435 #
436 foreach {tn op proc} {
dan786b0682011-06-09 10:48:02 +0000437 11 OR setop_or
438 12 NOT setop_not
439 13 AND setop_and
danff32e392009-12-07 12:34:51 +0000440 } {
dan18ff7fa2009-12-09 14:39:41 +0000441 for {set i 0} {$i < $nRep} {incr i} {
danff32e392009-12-07 12:34:51 +0000442 set term1 [random_term]
443 set term2 [random_term]
444 set term3 [random_term]
445 set term4 [random_term]
446 set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
dan5289b012011-06-06 14:51:50 +0000447 do_orderbydocid_test $tn.$i {
dan18ff7fa2009-12-09 14:39:41 +0000448 SELECT docid FROM t1 WHERE t1 MATCH $match
danff32e392009-12-07 12:34:51 +0000449 } [$proc \
450 [simple_near [list $term1 $term2] 10] \
451 [simple_near [list $term3 $term4] 10]
452 ]
453 }
454 }
danb8937212009-12-10 16:04:25 +0000455
456 catchsql COMMIT
dane2e51452009-12-03 17:36:22 +0000457 }
458}
459
460finish_test