Blame - test/fts4unicode.test - chromium.googlesource.com/chromium/deps/sqlite

blob: 8bd83f6d9eb0780e1c87cca1b7201ab97aba6511 [file] [log] [blame]

dan	3d403c7	2012-05-25 17:50:19 +0000	[diff] [blame]	1	# 2012 May 25
				2	#
				3	# The author disclaims copyright to this source code. In place of
				4	# a legal notice, here is a blessing:
				5	#
				6	# May you do good and not evil.
				7	# May you find forgiveness for yourself and forgive others.
				8	# May you share freely, never taking more than you give.
				9	#
				10	#*************************************************************************
				11	#
				12	# The tests in this file focus on testing the "unicode" FTS tokenizer.
				13	#
				14
				15	set testdir [file dirname $argv0]
				16	source $testdir/tester.tcl
dan	7946c53	2012-05-26 18:28:14 +0000	[diff] [blame]	17	ifcapable !fts3_unicode { finish_test ; return }
dan	3d403c7	2012-05-25 17:50:19 +0000	[diff] [blame]	18	set ::testprefix fts4unicode
				19
				20	proc do_unicode_token_test {tn input res} { # Test $tn: tokenize $input with unicode61 and remove_diacritics=0, expecting $res.
				21	set input [string map {' ''} $input] ;# Double each single quote so $input can sit inside the SQL '...' literal.
				22	uplevel [list do_execsql_test $tn "
dan	754d3ad	2012-06-06 19:30:38 +0000	[diff] [blame]	23	SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
				24	" [list [list {*}$res]]] ;# $res is re-packed as a list and wrapped as the single expected value; uplevel runs the test in the scope of the caller.
				25	}
				26
				27	proc do_unicode_token_test2 {tn input res} { # Test $tn: tokenize $input with unicode61 and no tokenizer options, expecting $res.
				28	set input [string map {' ''} $input] ;# Double each single quote so $input can sit inside the SQL '...' literal.
				29	uplevel [list do_execsql_test $tn "
dan	ab322bd	2012-05-26 14:54:50 +0000	[diff] [blame]	30	SELECT fts3_tokenizer_test('unicode61', '$input');
dan	3d403c7	2012-05-25 17:50:19 +0000	[diff] [blame]	31	" [list [list {*}$res]]] ;# $res is re-packed as a list and wrapped as the single expected value.
				32	}
				33
dan	25cdf46	2012-06-07 15:53:48 +0000	[diff] [blame]	34	proc do_unicode_token_test3 {tn args} { # Test $tn: every argument but the last is passed to fts3_tokenizer_test after 'unicode61'; the last is the expected result.
				35	set res [lindex $args end] ;# Final argument: the expected token list.
				36	set sql "SELECT fts3_tokenizer_test('unicode61'"
				37	foreach a [lrange $args 0 end-1] { # Each remaining argument becomes one quoted SQL string argument.
				38	append sql ", '"
				39	append sql [string map {' ''} $a] ;# Double embedded single quotes for the SQL literal.
				40	append sql "'"
				41	}
				42	append sql ")"
				43	uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]] ;# $res is re-packed as a list and wrapped as the single expected value.
				44	}
				45
dan	3d403c7	2012-05-25 17:50:19 +0000	[diff] [blame]	46	do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
drh	7c37e2f	2013-01-26 19:31:42 +0000	[diff] [blame]	47	do_unicode_token_test 1.1 {Ä Ö Ü} {0 ä Ä 1 ö Ö 2 ü Ü}
				48	do_unicode_token_test 1.2 {xÄx xÖx xÜx} {0 xäx xÄx 1 xöx xÖx 2 xüx xÜx}
dan	3d403c7	2012-05-25 17:50:19 +0000	[diff] [blame]	49
				50	# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
				51	do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
drh	7c37e2f	2013-01-26 19:31:42 +0000	[diff] [blame]	52	do_unicode_token_test 1.4 "\u1E9E" "0 ß \u1E9E"
dan	3d403c7	2012-05-25 17:50:19 +0000	[diff] [blame]	53	do_unicode_token_test 1.5 "\u1E9E" "0 \uDF \u1E9E"
				54
				55	do_unicode_token_test 1.6 "The quick brown fox" {
				56	0 the The 1 quick quick 2 brown brown 3 fox fox
				57	}
				58	do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
				59	0 the The 1 quick quick 2 brown brown 3 fox fox
				60	}
				61
dan	754d3ad	2012-06-06 19:30:38 +0000	[diff] [blame]	62	do_unicode_token_test2 1.8 {a B c D} {0 a a 1 b B 2 c c 3 d D}
drh	7c37e2f	2013-01-26 19:31:42 +0000	[diff] [blame]	63	do_unicode_token_test2 1.9 {Ä Ö Ü} {0 a Ä 1 o Ö 2 u Ü}
				64	do_unicode_token_test2 1.10 {xÄx xÖx xÜx} {0 xax xÄx 1 xox xÖx 2 xux xÜx}
dan	754d3ad	2012-06-06 19:30:38 +0000	[diff] [blame]	65
				66	# Check that diacritics are removed when no remove_diacritics=0 option is
				67	# passed (test 1.11 passes no options), and that they do not break tokens.
drh	7c37e2f	2013-01-26 19:31:42 +0000	[diff] [blame]	68	do_unicode_token_test2 1.11 "xx\u0301xx" "0 xxxx xx\u301xx"
				69
				70	# Title-case mappings work
				71	do_unicode_token_test 1.12 "\u01c5" "0 \u01c6 \u01c5"
dan	754d3ad	2012-06-06 19:30:38 +0000	[diff] [blame]	72
dan	ab322bd	2012-05-26 14:54:50 +0000	[diff] [blame]	73	#-------------------------------------------------------------------------
				74	#
				75	set docs [list {
				76	Enhance the INSERT syntax to allow multiple rows to be inserted via the
				77	VALUES clause.
				78	} {
				79	Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
				80	} {
				81	Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
				82	} {
				83	Added the sqlite3_db_readonly() interface.
				84	} {
				85	Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
				86	ability to add new PRAGMA statements or to override built-in PRAGMAs.
				87	} {
				88	Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
				89	the same row that contains the maximum x value.
				90	} {
				91	Added support for the FTS4 languageid option.
				92	} {
				93	Documented support for the FTS4 content option. This feature has actually
				94	been in the code since version 3.7.9 but is only now considered to be
				95	officially supported.
				96	} {
				97	Pending statements no longer block ROLLBACK. Instead, the pending statement
				98	will return SQLITE_ABORT upon next access after the ROLLBACK.
				99	} {
				100	Improvements to the handling of CSV inputs in the command-line shell
				101	} {
				102	Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
				103	incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
				104	connected by OR.
				105	}]
				106
				107	set map(a) [list "\u00C4" "\u00E4"] ; # LATIN LETTER A WITH DIAERESIS
				108	set map(e) [list "\u00CB" "\u00EB"] ; # LATIN LETTER E WITH DIAERESIS
				109	set map(i) [list "\u00CF" "\u00EF"] ; # LATIN LETTER I WITH DIAERESIS
				110	set map(o) [list "\u00D6" "\u00F6"] ; # LATIN LETTER O WITH DIAERESIS
				111	set map(u) [list "\u00DC" "\u00FC"] ; # LATIN LETTER U WITH DIAERESIS
				112	set map(y) [list "\u0178" "\u00FF"] ; # LATIN LETTER Y WITH DIAERESIS
				113	set map(h) [list "\u1E26" "\u1E27"] ; # LATIN LETTER H WITH DIAERESIS
				114	set map(w) [list "\u1E84" "\u1E85"] ; # LATIN LETTER W WITH DIAERESIS
				115	set map(x) [list "\u1E8C" "\u1E8D"] ; # LATIN LETTER X WITH DIAERESIS
				116	foreach k [array names map] {
				117	lappend mappings [string toupper $k] [lindex $map($k) 0]
				118	lappend mappings $k [lindex $map($k) 1]
				119	}
				120	proc mapdoc {doc} { # Normalize whitespace in $doc, then apply the substitutions held in the global list ::mappings.
				121	set doc [regsub -all {[[:space:]]+} $doc " "] ;# Collapse every run of whitespace to one space.
				122	string map $::mappings [string trim $doc] ;# Trim the ends first; the mapped string is the return value of the proc.
				123	}
				124
				125	do_test 2.0 {
				126	execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
				127	foreach doc $docs {
				128	set d [mapdoc $doc]
				129	execsql { INSERT INTO t2 VALUES($d) }
				130	}
				131	} {}
				132
				133	do_test 2.1 {
				134	set q [mapdoc "row"]
				135	execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
				136	} [list [mapdoc {
				137	Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
				138	the same row that contains the maximum x value.
				139	}]]
				140
				141	foreach {tn query snippet} {
				142	2 "row" {
				143	...returns the value of y on the same [row] that contains
				144	the maximum x value.
				145	}
				146	3 "ROW" {
				147	...returns the value of y on the same [row] that contains
				148	the maximum x value.
				149	}
				150	4 "rollback" {
				151	...[ROLLBACK]. Instead, the pending statement
				152	will return SQLITE_ABORT upon next access after the [ROLLBACK].
				153	}
				154	5 "rOllback" {
				155	...[ROLLBACK]. Instead, the pending statement
				156	will return SQLITE_ABORT upon next access after the [ROLLBACK].
				157	}
				158	6 "lang*" {
				159	Added support for the FTS4 [languageid] option.
				160	}
				161	} {
				162	do_test 2.$tn {
				163	set q [mapdoc $query]
				164	execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
				165	} [list [mapdoc $snippet]]
				166	}
				167
dan	7a79673	2012-05-26 16:22:56 +0000	[diff] [blame]	168	#-------------------------------------------------------------------------
				169	# Make sure the unicode61 tokenizer does not crash if it is passed a
				170	# NULL pointer.
				171	reset_db
				172	do_execsql_test 3.1 {
				173	CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
				174	INSERT INTO t1 VALUES(NULL, 'a b c');
				175	}
				176
				177	do_execsql_test 3.2 {
				178	SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
				179	} {{a [b] c}}
				180
				181	do_execsql_test 3.3 {
				182	BEGIN;
				183	DELETE FROM t1;
				184	INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
				185	INSERT INTO t1 SELECT * FROM t1;
				186	INSERT INTO t1 SELECT * FROM t1;
				187	INSERT INTO t1 SELECT * FROM t1;
				188	INSERT INTO t1 SELECT * FROM t1;
				189	INSERT INTO t1 SELECT * FROM t1;
				190	INSERT INTO t1 SELECT * FROM t1;
				191	INSERT INTO t1 SELECT * FROM t1;
				192	INSERT INTO t1 SELECT * FROM t1;
				193	INSERT INTO t1 SELECT * FROM t1;
				194	INSERT INTO t1 SELECT * FROM t1;
				195	INSERT INTO t1 SELECT * FROM t1;
				196	INSERT INTO t1 SELECT * FROM t1;
				197	INSERT INTO t1 SELECT * FROM t1;
				198	INSERT INTO t1 SELECT * FROM t1;
				199	INSERT INTO t1 SELECT * FROM t1;
				200	INSERT INTO t1 SELECT * FROM t1;
				201	INSERT INTO t1 VALUES('a b c', NULL);
				202	INSERT INTO t1 VALUES('a x c', NULL);
				203	COMMIT;
				204	}
				205
				206	do_execsql_test 3.4 {
				207	SELECT * FROM t1 WHERE t1 MATCH 'a b';
				208	} {{a b c} {}}
				209
				210	#-------------------------------------------------------------------------
				211	#
				212	reset_db
				213
				214	do_test 4.1 {
				215	set a "abc\uFFFEdef"
				216	set b "abc\uD800def"
				217	set c "\uFFFEdef"
				218	set d "\uD800def"
				219	execsql {
				220	CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
				221	INSERT INTO t1 VALUES($a);
				222	INSERT INTO t1 VALUES($b);
				223	INSERT INTO t1 VALUES($c);
				224	INSERT INTO t1 VALUES($d);
				225	}
				226	} {}
				227
				228	do_test 4.2 {
				229	set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
				230	set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
				231	set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
				232	set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
				233	execsql {
				234	INSERT INTO t1 VALUES($a);
				235	INSERT INTO t1 VALUES($b);
				236	INSERT INTO t1 VALUES($c);
				237	INSERT INTO t1 VALUES($d);
				238	}
				239	} {}
				240
				241	do_test 4.3 {
				242	set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
				243	set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
				244	set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
				245	set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
				246	execsql {
				247	INSERT INTO t1 VALUES($a);
				248	INSERT INTO t1 VALUES($b);
				249	INSERT INTO t1 VALUES($c);
				250	INSERT INTO t1 VALUES($d);
				251	}
				252	} {}
				253
dan	25cdf46	2012-06-07 15:53:48 +0000	[diff] [blame]	254	#-------------------------------------------------------------------------
				255
				256	do_unicode_token_test3 5.1 {tokenchars=} {
				257	sqlite3_reset sqlite3_column_int
				258	} {
				259	0 sqlite3 sqlite3
				260	1 reset reset
				261	2 sqlite3 sqlite3
				262	3 column column
				263	4 int int
				264	}
				265
				266	do_unicode_token_test3 5.2 {tokenchars=_} {
				267	sqlite3_reset sqlite3_column_int
				268	} {
				269	0 sqlite3_reset sqlite3_reset
				270	1 sqlite3_column_int sqlite3_column_int
				271	}
				272
				273	do_unicode_token_test3 5.3 {separators=xyz} {
				274	Laotianxhorseyrunszfast
				275	} {
				276	0 laotian Laotian
				277	1 horse horse
				278	2 runs runs
				279	3 fast fast
				280	}
				281
				282	do_unicode_token_test3 5.4 {tokenchars=xyz} {
				283	Laotianxhorseyrunszfast
				284	} {
				285	0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
				286	}
				287
				288	do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
				289	sqlite3_resetxsqlite3_column_intyhonda_phantom
				290	} {
				291	0 sqlite3_reset sqlite3_reset
				292	1 sqlite3_column_int sqlite3_column_int
				293	2 honda_phantom honda_phantom
				294	}
				295
				296	do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
				297	0 abc abc 1 def def
				298	}
				299
				300	do_unicode_token_test3 5.7 \
				301	"tokenchars=\u2444\u2445" \
				302	"separators=\u05D0\u05D1\u05D2" \
				303	"\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
				304	[list \
				305	0 \u2444fre\u2445sh \u2444fre\u2445sh \
				306	1 water water \
				307	2 fish fish \
				308	3 \u2445timer \u2445timer \
				309	]
				310
				311	# Check that it is not possible to add a standalone diacritic codepoint
				312	# to either separators or tokenchars.
				313	do_unicode_token_test3 5.8 "separators=\u0301" \
				314	"hello\u0301world \u0301helloworld" \
				315	"0 helloworld hello\u0301world 1 helloworld helloworld"
				316
				317	do_unicode_token_test3 5.9 "tokenchars=\u0301" \
				318	"hello\u0301world \u0301helloworld" \
				319	"0 helloworld hello\u0301world 1 helloworld helloworld"
				320
				321	do_unicode_token_test3 5.10 "separators=\u0301" \
				322	"remove_diacritics=0" \
				323	"hello\u0301world \u0301helloworld" \
				324	"0 hello\u0301world hello\u0301world 1 helloworld helloworld"
				325
				326	do_unicode_token_test3 5.11 "tokenchars=\u0301" \
				327	"remove_diacritics=0" \
				328	"hello\u0301world \u0301helloworld" \
				329	"0 hello\u0301world hello\u0301world 1 helloworld helloworld"
dan	7a79673	2012-05-26 16:22:56 +0000	[diff] [blame]	330
				331
dan	3aaa4cd	2012-06-19 06:35:39 +0000	[diff] [blame]	332	#-------------------------------------------------------------------------
				333
				334	proc do_tokenize {tokenizer txt} { # Return the middle element of every triple that fts3_tokenizer_test yields for $txt under $tokenizer.
				335	set res [list]
				336	foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] { # Walk the result three elements at a time; $a and $c are not used.
				337	lappend res $b
				338	}
				339	set res ;# A bare "set res" yields the accumulated list as the return value.
				340	}
				341
				342	# Argument $lCp must be a list of codepoints (integers) that
				343	# correspond to whitespace characters. This command creates a string
				344	# $W from the codepoints, then tokenizes "${W}hello${W}world${W}"
				345	# using tokenizer $tokenizer. The test named $tn passes if the token
				346	# list returned by do_tokenize is exactly {hello world}.
				347	#
				348	proc do_isspace_test {tn tokenizer lCp} {
				349	set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp] ;# One %c per codepoint; this is the string called $W above.
				350	set txt "${whitespace}hello${whitespace}world${whitespace}"
				351	uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
				352	}
				353
				354	set tokenizers [list unicode61]
				355	ifcapable icu { lappend tokenizers icu }
				356
				357	# Some tests to check that each tokenizer can identify white-space
				358	# codepoints. All codepoints tested below are of type "Zs" in the
				359	# UnicodeData.txt file. Tests 1-18 use one codepoint; 19-23 use runs.
				360	foreach T $tokenizers {
				361	do_isspace_test 6.$T.1 $T 32
				362	do_isspace_test 6.$T.2 $T 160
				363	do_isspace_test 6.$T.3 $T 5760
				364	do_isspace_test 6.$T.4 $T 6158
				365	do_isspace_test 6.$T.5 $T 8192
				366	do_isspace_test 6.$T.6 $T 8193
				367	do_isspace_test 6.$T.7 $T 8194
				368	do_isspace_test 6.$T.8 $T 8195
				369	do_isspace_test 6.$T.9 $T 8196
				370	do_isspace_test 6.$T.10 $T 8197
				371	do_isspace_test 6.$T.11 $T 8198
				372	do_isspace_test 6.$T.12 $T 8199
				373	do_isspace_test 6.$T.13 $T 8200
				374	do_isspace_test 6.$T.14 $T 8201
				375	do_isspace_test 6.$T.15 $T 8202
				376	do_isspace_test 6.$T.16 $T 8239
				377	do_isspace_test 6.$T.17 $T 8287
				378	do_isspace_test 6.$T.18 $T 12288
				379
				380	do_isspace_test 6.$T.19 $T {32 160 5760 6158}
				381	do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}
				382	do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}
				383	do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}
				384	do_isspace_test 6.$T.23 $T {8287 12288}
				385	}
				386
				387
dan	3d403c7	2012-05-25 17:50:19 +0000	[diff] [blame]	388	finish_test