Blame - test/fts4unicode.test - chromium.googlesource.com/chromium/deps/sqlite

blob: 500cfcdcaa7e74cdeb2c2186337a809ccb9cd8b4 [file] [log] [blame]

dan	3d403c7	2012-05-25 17:50:19 +0000	[diff] [blame]	1	# 2012 May 25
				2	#
				3	# The author disclaims copyright to this source code. In place of
				4	# a legal notice, here is a blessing:
				5	#
				6	# May you do good and not evil.
				7	# May you find forgiveness for yourself and forgive others.
				8	# May you share freely, never taking more than you give.
				9	#
				10	#*************************************************************************
				11	#
				12	# The tests in this file focus on testing the "unicode" FTS tokenizer.
				13	#
				14
				15	set testdir [file dirname $argv0]
				16	source $testdir/tester.tcl
dan	7946c53	2012-05-26 18:28:14 +0000	[diff] [blame]	17	ifcapable !fts3_unicode { finish_test ; return }
dan	3d403c7	2012-05-25 17:50:19 +0000	[diff] [blame]	18	set ::testprefix fts4unicode
				19
				20	proc do_unicode_token_test {tn input res} {
				21	set input [string map {' ''} $input]
				22	uplevel [list do_execsql_test $tn "
dan	754d3ad	2012-06-06 19:30:38 +0000	[diff] [blame]	23	SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
				24	" [list [list {*}$res]]]
				25	}
				26
				27	proc do_unicode_token_test2 {tn input res} {
				28	set input [string map {' ''} $input]
				29	uplevel [list do_execsql_test $tn "
dan	ab322bd	2012-05-26 14:54:50 +0000	[diff] [blame]	30	SELECT fts3_tokenizer_test('unicode61', '$input');
dan	3d403c7	2012-05-25 17:50:19 +0000	[diff] [blame]	31	" [list [list {*}$res]]]
				32	}
				33
# Usage: do_unicode_token_test3 TN ?ARG ...? RES
#
# The last argument is the expected result list. Every argument before it
# is passed, in order, as a quoted SQL string argument to
# fts3_tokenizer_test() after the 'unicode61' tokenizer name.
#
proc do_unicode_token_test3 {tn args} {
  # Canonical list form of the trailing RES argument, wrapped as the
  # single value the query returns.
  set expect [list [list {*}[lindex $args end]]]

  # Build the comma-separated list of SQL string literals.
  set literals [list 'unicode61']
  foreach param [lrange $args 0 end-1] {
    lappend literals "'[string map {' ''} $param]'"
  }
  set sql "SELECT fts3_tokenizer_test([join $literals {, }])"

  uplevel [list do_execsql_test $tn $sql $expect]
}
				45
# Tests 1.0 - 1.6 use remove_diacritics=0. Judging by the data, each triple
# in an expected value is <position> <folded token> <original text>.
do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}

# Upper-case A, O and U with diaeresis fold to their lower-case forms,
# both as whole tokens (1.1) and in the middle of a token (1.2).
do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
    "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"
do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
    "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"

# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
do_unicode_token_test 1.3 "\uDF"   "0 \uDF \uDF"
do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"

# The same four words, separated by spaces (1.5) and by the codepoints
# U+00BF, U+224E and U+2263 (1.6).
do_unicode_token_test 1.5 "The quick brown fox" \
    {0 the The 1 quick quick 2 brown brown 3 fox fox}
do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" \
    {0 the The 1 quick quick 2 brown brown 3 fox fox}

# do_unicode_token_test2 passes no remove_diacritics option, so these
# exercise the tokenizer's default: the folded tokens lose their diaeresis.
do_unicode_token_test2 1.7 {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"
do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
    "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"

# A combining acute accent (U+0301) inside a word is dropped from the
# folded token and does not break the token in two.
do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"

# Title-case mappings work
do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"
dan	754d3ad	2012-06-06 19:30:38 +0000	[diff] [blame]	77
#-------------------------------------------------------------------------
# Fixture for the 2.* tests: a handful of documents, plus a character map
# that swaps selected ASCII letters for their "with diaeresis" forms.
#
set docs [list]
lappend docs {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
}
lappend docs {
  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
}
lappend docs {
  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
}
lappend docs {
  Added the sqlite3_db_readonly() interface.
}
lappend docs {
  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
  ability to add new PRAGMA statements or to override built-in PRAGMAs.
}
lappend docs {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
}
lappend docs {
  Added support for the FTS4 languageid option.
}
lappend docs {
  Documented support for the FTS4 content option. This feature has actually
  been in the code since version 3.7.9 but is only now considered to be
  officially supported.
}
lappend docs {
  Pending statements no longer block ROLLBACK. Instead, the pending statement
  will return SQLITE_ABORT upon next access after the ROLLBACK.
}
lappend docs {
  Improvements to the handling of CSV inputs in the command-line shell
}
lappend docs {
  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
  incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
  connected by OR.
}

# map(<ascii letter>) is the pair {upper-case form, lower-case form} of
# that letter with a diaeresis.
set map(a) [list "\u00C4" "\u00E4"]  ;# LATIN LETTER A WITH DIAERESIS
set map(e) [list "\u00CB" "\u00EB"]  ;# LATIN LETTER E WITH DIAERESIS
set map(i) [list "\u00CF" "\u00EF"]  ;# LATIN LETTER I WITH DIAERESIS
set map(o) [list "\u00D6" "\u00F6"]  ;# LATIN LETTER O WITH DIAERESIS
set map(u) [list "\u00DC" "\u00FC"]  ;# LATIN LETTER U WITH DIAERESIS
set map(y) [list "\u0178" "\u00FF"]  ;# LATIN LETTER Y WITH DIAERESIS
set map(h) [list "\u1E26" "\u1E27"]  ;# LATIN LETTER H WITH DIAERESIS
set map(w) [list "\u1E84" "\u1E85"]  ;# LATIN LETTER W WITH DIAERESIS
set map(x) [list "\u1E8C" "\u1E8D"]  ;# LATIN LETTER X WITH DIAERESIS

# Flatten the array into the key/value list that [string map] expects:
# "A" -> upper-case form, "a" -> lower-case form, and so on.
foreach k [array names map] {
  lappend mappings \
      [string toupper $k] [lindex $map($k) 0] \
      $k                  [lindex $map($k) 1]
}
				125	proc mapdoc {doc} {
				126	set doc [regsub -all {[[:space:]]+} $doc " "]
				127	string map $::mappings [string trim $doc]
				128	}
				129
				130	do_test 2.0 {
				131	execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
				132	foreach doc $docs {
				133	set d [mapdoc $doc]
				134	execsql { INSERT INTO t2 VALUES($d) }
				135	}
				136	} {}
				137
				138	do_test 2.1 {
				139	set q [mapdoc "row"]
				140	execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
				141	} [list [mapdoc {
				142	Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
				143	the same row that contains the maximum x value.
				144	}]]
				145
				146	foreach {tn query snippet} {
				147	2 "row" {
				148	...returns the value of y on the same [row] that contains
				149	the maximum x value.
				150	}
				151	3 "ROW" {
				152	...returns the value of y on the same [row] that contains
				153	the maximum x value.
				154	}
				155	4 "rollback" {
				156	...[ROLLBACK]. Instead, the pending statement
				157	will return SQLITE_ABORT upon next access after the [ROLLBACK].
				158	}
				159	5 "rOllback" {
				160	...[ROLLBACK]. Instead, the pending statement
				161	will return SQLITE_ABORT upon next access after the [ROLLBACK].
				162	}
				163	6 "lang*" {
				164	Added support for the FTS4 [languageid] option.
				165	}
				166	} {
				167	do_test 2.$tn {
				168	set q [mapdoc $query]
				169	execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
				170	} [list [mapdoc $snippet]]
				171	}
				172
#-------------------------------------------------------------------------
# Make sure the unicode61 tokenizer does not crash if it is passed a
# NULL pointer.
#
reset_db

# A row whose first column is NULL.
do_execsql_test 3.1 {
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
  INSERT INTO t1 VALUES(NULL, 'a b c');
}

do_execsql_test 3.2 {
  SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
} {{a [b] c}}

# Start from one row of 'b' tokens and copy the table onto itself 16 times,
# then add two rows whose second column is NULL.
do_execsql_test 3.3 {
  BEGIN;
    DELETE FROM t1;
    INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');

    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;

    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;

    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;

    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;
    INSERT INTO t1 SELECT * FROM t1;

    INSERT INTO t1 VALUES('a b c', NULL);
    INSERT INTO t1 VALUES('a x c', NULL);
  COMMIT;
}

# Only the 'a b c' row contains both terms; its NULL column reads back as
# an empty value.
do_execsql_test 3.4 {
  SELECT * FROM t1 WHERE t1 MATCH 'a b';
} {{a b c} {}}
				214
				215	#-------------------------------------------------------------------------
				216	#
				217	reset_db
				218
				219	do_test 4.1 {
				220	set a "abc\uFFFEdef"
				221	set b "abc\uD800def"
				222	set c "\uFFFEdef"
				223	set d "\uD800def"
				224	execsql {
				225	CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
				226	INSERT INTO t1 VALUES($a);
				227	INSERT INTO t1 VALUES($b);
				228	INSERT INTO t1 VALUES($c);
				229	INSERT INTO t1 VALUES($d);
				230	}
				231	} {}
				232
				233	do_test 4.2 {
				234	set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
				235	set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
				236	set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
				237	set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
				238	execsql {
				239	INSERT INTO t1 VALUES($a);
				240	INSERT INTO t1 VALUES($b);
				241	INSERT INTO t1 VALUES($c);
				242	INSERT INTO t1 VALUES($d);
				243	}
				244	} {}
				245
				246	do_test 4.3 {
				247	set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
				248	set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
				249	set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
				250	set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
				251	execsql {
				252	INSERT INTO t1 VALUES($a);
				253	INSERT INTO t1 VALUES($b);
				254	INSERT INTO t1 VALUES($c);
				255	INSERT INTO t1 VALUES($d);
				256	}
				257	} {}
				258
#-------------------------------------------------------------------------
# The "tokenchars=" and "separators=" options.
#

# With an empty tokenchars list "_" splits tokens ...
do_unicode_token_test3 5.1 {tokenchars=} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3 sqlite3
  1 reset reset
  2 sqlite3 sqlite3
  3 column column
  4 int int
}

# ... and once listed in tokenchars it becomes part of the token.
do_unicode_token_test3 5.2 {tokenchars=_} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
}

# Letters named in separators split tokens.
do_unicode_token_test3 5.3 {separators=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotian Laotian
  1 horse horse
  2 runs runs
  3 fast fast
}

# Naming the same letters in tokenchars leaves a single token.
do_unicode_token_test3 5.4 {tokenchars=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
}

# Both options at once.
do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
  sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
  2 honda_phantom honda_phantom
}

# Non-ASCII codepoints in either option.
do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" \
    {0 abc abc 1 def def}

do_unicode_token_test3 5.7 \
    "tokenchars=\u2444\u2445" \
    "separators=\u05D0\u05D1\u05D2" \
    "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
    [list \
      0 \u2444fre\u2445sh \u2444fre\u2445sh \
      1 water water \
      2 fish fish \
      3 \u2445timer \u2445timer \
    ]

# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars. Tests 5.8 and 5.9 use the default
# diacritic handling; 5.10 and 5.11 add remove_diacritics=0.
do_unicode_token_test3 5.8 "separators=\u0301" \
    "hello\u0301world \u0301helloworld" \
    "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.9 "tokenchars=\u0301" \
    "hello\u0301world \u0301helloworld" \
    "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.10 "separators=\u0301" \
    "remove_diacritics=0" \
    "hello\u0301world \u0301helloworld" \
    "0 hello\u0301world hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.11 "tokenchars=\u0301" \
    "remove_diacritics=0" \
    "hello\u0301world \u0301helloworld" \
    "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
dan	7a79673	2012-05-26 16:22:56 +0000	[diff] [blame]	335
				336
dan	3aaa4cd	2012-06-19 06:35:39 +0000	[diff] [blame]	337	#-------------------------------------------------------------------------
				338
				339	proc do_tokenize {tokenizer txt} {
				340	set res [list]
				341	foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
				342	lappend res $b
				343	}
				344	set res
				345	}
				346
				347	# Argument $lCodepoint must be a list of codepoints (integers) that
				348	# correspond to whitespace characters. This command creates a string
				349	# $W from the codepoints, then tokenizes "${W}hello{$W}world${W}"
				350	# using tokenizer $tokenizer. The test passes if the tokenizer successfully
				351	# extracts the two 5 character tokens.
				352	#
				353	proc do_isspace_test {tn tokenizer lCp} {
				354	set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]
				355	set txt "${whitespace}hello${whitespace}world${whitespace}"
				356	uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
				357	}
				358
				359	set tokenizers [list unicode61]
				360	ifcapable icu { lappend tokenizers icu }
				361
				362	# Some tests to check that the tokenizers can both identify white-space
				363	# codepoints. All codepoints tested below are of type "Zs" in the
				364	# UnicodeData.txt file.
drh	07d694c	2015-06-15 16:40:38 +0000	[diff] [blame]	365	#
				366	# Note that codepoint 6158 has changed from Zs to Cf in recent versions
				367	# of UnicodeData.txt. So take that into account for the "icu" tests.
				368	#
dan	3aaa4cd	2012-06-19 06:35:39 +0000	[diff] [blame]	369	foreach T $tokenizers {
				370	do_isspace_test 6.$T.1 $T 32
				371	do_isspace_test 6.$T.2 $T 160
				372	do_isspace_test 6.$T.3 $T 5760
drh	07d694c	2015-06-15 16:40:38 +0000	[diff] [blame]	373	if {$T!="icu"} {
				374	do_isspace_test 6.$T.4 $T 6158
				375	}
dan	3aaa4cd	2012-06-19 06:35:39 +0000	[diff] [blame]	376	do_isspace_test 6.$T.5 $T 8192
				377	do_isspace_test 6.$T.6 $T 8193
				378	do_isspace_test 6.$T.7 $T 8194
				379	do_isspace_test 6.$T.8 $T 8195
				380	do_isspace_test 6.$T.9 $T 8196
				381	do_isspace_test 6.$T.10 $T 8197
				382	do_isspace_test 6.$T.11 $T 8198
				383	do_isspace_test 6.$T.12 $T 8199
				384	do_isspace_test 6.$T.13 $T 8200
				385	do_isspace_test 6.$T.14 $T 8201
				386	do_isspace_test 6.$T.15 $T 8202
				387	do_isspace_test 6.$T.16 $T 8239
				388	do_isspace_test 6.$T.17 $T 8287
				389	do_isspace_test 6.$T.18 $T 12288
				390
drh	07d694c	2015-06-15 16:40:38 +0000	[diff] [blame]	391	if {$T!="icu"} {
				392	do_isspace_test 6.$T.19 $T {32 160 5760 6158}
				393	} else {
				394	do_isspace_test 6.$T.19 $T {32 160 5760 8192}
				395	}
mistachkin	cbc53fe	2013-10-11 22:17:39 +0000	[diff] [blame]	396	do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}
				397	do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}
				398	do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}
				399	do_isspace_test 6.$T.23 $T {8287 12288}
dan	3aaa4cd	2012-06-19 06:35:39 +0000	[diff] [blame]	400	}
				401
#-------------------------------------------------------------------------
# Test that the private use ranges are treated as alphanumeric.
#
# Each "*" in the templates below is replaced by the codepoint under test.
# With no options the codepoint stays inside the token (7.N.1); named as
# a separator it splits the token (7.N.2).
#
foreach {tn1 c} {
  1 \ue000
  2 \ue001
  3 \uf000
  4 \uf8fe
  5 \uf8ff
} {
  foreach {tn2 config res} {
    1 ""             "0 helloworld helloworld"
    2 "separators=*" "0 hello hello 1 world world"
  } {
    set config [string map [list * $c] $config]
    set input  [string map [list * $c] "hello*world"]
    set output [string map [list * $c] $res]

    # {*}$config contributes no argument at all when the config is empty.
    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
  }
}
				418
#-------------------------------------------------------------------------
# Cursory test of the remove_diacritics option, with both settings.
#
#   00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
#   00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
#   00E4;LATIN SMALL LETTER A WITH DIAERESIS
#   00F6;LATIN SMALL LETTER O WITH DIAERESIS
#

# The script is in double quotes so that Tcl expands the \x escapes
# before the SQL is run.
do_execsql_test 8.1.1 "
  CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');
  INSERT INTO t3 VALUES('o');
  INSERT INTO t3 VALUES('a');
  INSERT INTO t3 VALUES('O');
  INSERT INTO t3 VALUES('A');
  INSERT INTO t3 VALUES('\xD6');
  INSERT INTO t3 VALUES('\xC4');
  INSERT INTO t3 VALUES('\xF6');
  INSERT INTO t3 VALUES('\xE4');
"

# remove_diacritics=1: the plain letter matches the diaeresis rows too.
do_execsql_test 8.1.2 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'o';
} {1 3 5 7}
do_execsql_test 8.1.3 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'a';
} {2 4 6 8}

# remove_diacritics=0: only the plain-letter rows match.
do_execsql_test 8.2.1 {
  CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");
  INSERT INTO t4 SELECT * FROM t3;
}
do_execsql_test 8.2.2 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'o';
} {1 3}
do_execsql_test 8.2.3 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'a';
} {2 4}
dan	3aaa4cd	2012-06-19 06:35:39 +0000	[diff] [blame]	454
#-------------------------------------------------------------------------
# Create the same three tables four times over, each time writing the
# tokenizer options with a different quoting style: [..], "..", '..' and
# `..`. The terms listed by fts4aux must be the same in all four cases.
#
foreach {tn sql} {
  1 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
    CREATE VIRTUAL TABLE t6 USING fts4(
        tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
  }
  2 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 "separators=x\xC4");
  }
  3 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 'tokenchars= .');
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 'tokenchars=="[]');
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 'separators=x\xC4');
  }
  4 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 `tokenchars= .`);
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 `tokenchars=[="]`);
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 `separators=x\xC4`);
  }
} {
  # Drop whatever the previous iteration left behind.
  do_execsql_test 9.$tn.0 {
    DROP TABLE IF EXISTS t5;
    DROP TABLE IF EXISTS t5aux;
    DROP TABLE IF EXISTS t6;
    DROP TABLE IF EXISTS t6aux;
    DROP TABLE IF EXISTS t7;
    DROP TABLE IF EXISTS t7aux;
  }

  do_execsql_test 9.$tn.1 $sql

  # t5: space and "." are token characters, so only "/" splits the text.
  do_execsql_test 9.$tn.2 {
    CREATE VIRTUAL TABLE t5aux USING fts4aux(t5);
    INSERT INTO t5 VALUES('one two three/four.five.six');
    SELECT * FROM t5aux;
  } {
    four.five.six   * 1 1 four.five.six   0 1 1
    {one two three} * 1 1 {one two three} 0 1 1
  }

  # t6: "=", double quote, "[" and "]" are token characters.
  do_execsql_test 9.$tn.3 {
    CREATE VIRTUAL TABLE t6aux USING fts4aux(t6);
    INSERT INTO t6 VALUES('alpha=beta"gamma/delta[epsilon]zeta');
    SELECT * FROM t6aux;
  } {
    {alpha=beta"gamma}   * 1 1 {alpha=beta"gamma}   0 1 1
    {delta[epsilon]zeta} * 1 1 {delta[epsilon]zeta} 0 1 1
  }

  # t7: the text splits at "x" and at the "\xC4" sequence.
  do_execsql_test 9.$tn.4 {
    CREATE VIRTUAL TABLE t7aux USING fts4aux(t7);
    INSERT INTO t7 VALUES('alephxbeth\xC4gimel');
    SELECT * FROM t7aux;
  } {
    aleph * 1 1 aleph 0 1 1
    beth  * 1 1 beth  0 1 1
    gimel * 1 1 gimel 0 1 1
  }
}
				518
				519	# Check that multiple options are handled correctly.
				520	#
				521	do_execsql_test 10.1 {
				522	DROP TABLE IF EXISTS t1;
				523	CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61
				524	"tokenchars=xyz" "tokenchars=.=" "separators=.=" "separators=xy"
				525	"separators=a" "separators=a" "tokenchars=a" "tokenchars=a"
				526	);
				527
				528	INSERT INTO t1 VALUES('oneatwoxthreeyfour');
				529	INSERT INTO t1 VALUES('a.single=word');
				530	CREATE VIRTUAL TABLE t1aux USING fts4aux(t1);
				531	SELECT * FROM t1aux;
				532	} {
				533	.single=word * 1 1 .single=word 0 1 1
				534	four * 1 1 four 0 1 1
				535	one * 1 1 one 0 1 1
				536	three * 1 1 three 0 1 1
				537	two * 1 1 two 0 1 1
				538	}
				539
				540	# Test that case folding happens after tokenization, not before.
				541	#
				542	do_execsql_test 10.2 {
				543	DROP TABLE IF EXISTS t2;
				544	CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61 "separators=aB");
				545	INSERT INTO t2 VALUES('oneatwoBthree');
				546	INSERT INTO t2 VALUES('onebtwoAthree');
				547	CREATE VIRTUAL TABLE t2aux USING fts4aux(t2);
				548	SELECT * FROM t2aux;
				549	} {
				550	one * 1 1 one 0 1 1
				551	onebtwoathree * 1 1 onebtwoathree 0 1 1
				552	three * 1 1 three 0 1 1
				553	two * 1 1 two 0 1 1
				554	}
				555
# Test that the tokenchars and separators options work with the
# fts3tokenize table: "@" and "." stay inside tokens while the digits
# split them.
#
do_execsql_test 11.1 {
  CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
      "unicode61", "tokenchars=@.", "separators=1234567890"
  );
  SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
} {berlin@street sydney.road}

finish_test