Add code to get the tokenizer character-class logic working for EBCDIC. FossilOrigin-Name: 04f7da77c13925c1f1e287f4579bb85518297d81

commit: 34dcee65442abd1f48f19ea855189f5b642da6dd [log] [tgz]
author: drh <drh@noemail.net> Mon Feb 08 19:15:48 2016 +0000
committer: drh <drh@noemail.net> Mon Feb 08 19:15:48 2016 +0000
tree: c7af5c06a27140b43aa7844e34b67353a8f02e69
parent: 41aab89b3123f9b7ffcb59d3e57528c88d2f50eb [diff]
diff --git a/manifest b/manifest
index a3d1184..180d6f2 100644
--- a/manifest
+++ b/manifest

@@ -1,5 +1,5 @@
-C Faster\skeywordCode()\simplementation\sby\staking\sadvantage\sof\sthe\sfact\sthat\nthe\sinput\sis\salways\spure\sASCII\salphabetic\sand\sunderscore\sand\sthat\sthe\skeyword\ntable\sis\salways\supper-case.
-D 2016-02-08T03:23:46.173
+C Add\scode\sto\sget\sthe\stokenizer\scharacter-class\slogic\sworking\sfor\sEBCDIC.
+D 2016-02-08T19:15:48.295
 F Makefile.in 0a957a57243a3d55e96b1514e22ffae5db9ea116
 F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434
 F Makefile.msc a3f8092763bb5d0057f0f4feb6b7fcc19713e107
@@ -406,7 +406,7 @@
 F src/test_windirent.h b12055cab6227f7be10f5c19296f67c60cc5e2a5
 F src/test_wsd.c 41cadfd9d97fe8e3e4e44f61a4a8ccd6f7ca8fe9
 F src/threads.c bbfb74450643cb5372a43ad4f6cffd7e9dfcecb0
-F src/tokenize.c b3cfc123d65a5bf7ba615f74f28737ae2135620a
+F src/tokenize.c 5019666f8705e9f7135c6f1c1ffac95a1af76fa6
 F src/treeview.c dc39ccf04e9331237388b9cb73289c9d87ea050b
 F src/trigger.c e14840ee0c3e549e758ec9bf3e4146e166002280
 F src/update.c 310ca7adb86a7d1f2afae46905b21c83580f3e17
@@ -1383,7 +1383,7 @@
 F tool/loadfts.c c3c64e4d5e90e8ba41159232c2189dba4be7b862
 F tool/logest.c eef612f8adf4d0993dafed0416064cf50d5d33c6
 F tool/mkautoconfamal.sh a29b14d54302b33fd892958f6895582ea90e4a45
-F tool/mkkeywordhash.c 4451824f4f68f8e8d89eba080e0c1a9cf83f7b62
+F tool/mkkeywordhash.c f7f3b342211ac6a14258b9726d5b97cf4f548f22
 F tool/mkmsvcmin.tcl d57e6efc9428605f5418d0b235721ddf7b5d9c0b
 F tool/mkopcodec.tcl d1b6362bd3aa80d5520d4d6f3765badf01f6c43c
 F tool/mkopcodeh.tcl 385c62d78c38b2d92146dcb5abd319dbbc33506d
@@ -1427,7 +1427,7 @@
 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
 F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b
 F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
-P 9115baa1919584dc8ca25bbff54d3b65748a9631
-R b013689cd0826a67d194ddf9700064de
+P ff406b9701ebe3a01834837f380641c6f0c495bc
+R f322e625886c02a1fcd0df28b2c76f16
 U drh
-Z b5dc027d5e497c867b371327464bbd96
+Z 8351e730f91ff26fa5b93d74f8f175a3

diff --git a/manifest.uuid b/manifest.uuid
index 290fa63..96aac98 100644
--- a/manifest.uuid
+++ b/manifest.uuid

@@ -1 +1 @@
-ff406b9701ebe3a01834837f380641c6f0c495bc
\ No newline at end of file
+04f7da77c13925c1f1e287f4579bb85518297d81
\ No newline at end of file

diff --git a/src/tokenize.c b/src/tokenize.c
index c4b36c4..68e7b45 100644
--- a/src/tokenize.c
+++ b/src/tokenize.c

@@ -18,7 +18,14 @@
 #include "sqliteInt.h"
 #include <stdlib.h>
 
-/* Character classes for tokenizing */
+/* Character classes for tokenizing
+**
+** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
+** using a lookup table, whereas a switch() directly on c uses a binary search.
+** The lookup table is much faster.  To maximize speed, and to ensure that
+** a lookup table is used, all of the classes need to be small integers and
+** all of them need to be used within the switch.
+*/
 #define CC_X          0    /* The letter 'x' or 'X'.  Start of x'01234fed' */
 #define CC_KYWD       1    /* Alphabetics or '_'.  Usable in a keyword */
 #define CC_ID         2    /* unicode characters usable in IDs */
@@ -49,6 +56,7 @@
 #define CC_ILLEGAL   27    /* Illegal character */
 
 static const unsigned char aiClass[] = {
+#ifdef SQLITE_ASCII
 /*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
 /* 0x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  7,  7, 27,  7,  7, 27, 27,
 /* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
@@ -66,14 +74,36 @@
 /* Dx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 /* Ex */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 /* Fx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
+#endif
+#ifdef SQLITE_EBCDIC
+/*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
+/* 0x */   27, 27, 27, 27, 27,  7, 27, 27, 27, 27, 27, 27,  7,  7, 27, 27,
+/* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+/* 2x */   27, 27, 27, 27, 27,  7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+/* 3x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+/* 4x */    7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 12, 17, 20, 10,
+/* 5x */   24, 27, 27, 27, 27, 27, 27, 27, 27, 27, 15,  4, 21, 18, 19, 27,
+/* 6x */   11, 16, 27, 27, 27, 27, 27, 27, 27, 27, 27, 23, 22,  1, 13,  7,
+/* 7x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  8,  5,  5,  5,  8, 14,  8,
+/* 8x */   27,  1,  1,  1,  1,  1,  1,  1,  1,  1, 27, 27, 27, 27, 27, 27,
+/* 9x */   27,  1,  1,  1,  1,  1,  1,  1,  1,  1, 27, 27, 27, 27, 27, 27,
+/* Ax */   25,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 27, 27, 27, 27, 27,
+/* Bx */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27,  9, 27, 27, 27, 27, 27,
+/* Cx */   27,  1,  1,  1,  1,  1,  1,  1,  1,  1, 27, 27, 27, 27, 27, 27,
+/* Dx */   27,  1,  1,  1,  1,  1,  1,  1,  1,  1, 27, 27, 27, 27, 27, 27,
+/* Ex */   27, 27,  1,  1,  1,  1,  1,  0,  1,  1, 27, 27, 27, 27, 27, 27,
+/* Fx */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3, 27, 27, 27, 27, 27, 27,
+#endif
 };
 
 /*
-** The charMap() macro maps alphabetic characters into their
+** The charMap() macro maps alphabetic characters (only) into their
 ** lower-case ASCII equivalent.  On ASCII machines, this is just
 ** an upper-to-lower case map.  On EBCDIC machines we also need
-** to adjust the encoding.  Only alphabetic characters and underscores
-** need to be translated.
+** to adjust the encoding.  The mapping is only valid for alphabetics,
+** which are the only characters for which this feature is used.
+**
+** Used by keywordhash.h
 */
 #ifdef SQLITE_ASCII
 # define charMap(X) sqlite3UpperToLower[(unsigned char)X]
@@ -410,7 +440,7 @@
 #endif
     case CC_KYWD: {
       for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
-      if( aiClass[z[i]]<=CC_DOLLAR ){ i++; break; }
+      if( IdChar(z[i]) ){ i++; break; }
       *tokenType = TK_ID;
       return keywordCode((char*)z, i, tokenType);
     }
@@ -423,7 +453,7 @@
       return 1;
     }
   }
-  while( aiClass[z[i]]<=CC_DOLLAR ){ i++; }
+  while( IdChar(z[i]) ){ i++; }
   *tokenType = TK_ID;
   return i;
 }

diff --git a/tool/mkkeywordhash.c b/tool/mkkeywordhash.c
index 43455ef..7e5287e 100644
--- a/tool/mkkeywordhash.c
+++ b/tool/mkkeywordhash.c

@@ -277,7 +277,10 @@
 /* Number of keywords */
 static int nKeyword = (sizeof(aKeywordTable)/sizeof(aKeywordTable[0]));
 
-/* Map all alphabetic characters into the same case */
+/* Map all alphabetic characters into lower-case for hashing.  This is
+** only valid for alphabetics.  In particular it does not work for '_'
+** and so the hash cannot be on a keyword position that might be an '_'.
+*/
 #define charMap(X)   (0x20|(X))
 
 /*
@@ -565,16 +568,21 @@
   }
   printf("%s  };\n", j==0 ? "" : "\n");
 
-  printf("  int h, i, j;\n");
+  printf("  int i, j;\n");
   printf("  const char *zKW;\n");
   printf("  if( n>=2 ){\n");
-  printf("    h = ((charMap(z[0])*4) ^ (charMap(z[n-1])*3) ^ n) %% %d;\n",
+  printf("    i = ((charMap(z[0])*4) ^ (charMap(z[n-1])*3) ^ n) %% %d;\n",
           bestSize);
-  printf("    for(i=((int)aHash[h])-1; i>=0; i=((int)aNext[i])-1){\n");
+  printf("    for(i=((int)aHash[i])-1; i>=0; i=((int)aNext[i])-1){\n");
   printf("      if( aLen[i]!=n ) continue;\n");
   printf("      j = 0;\n");
   printf("      zKW = &zText[aOffset[i]];\n");
+  printf("#ifdef SQLITE_ASCII\n");
   printf("      while( j<n && (z[j]&~0x20)==zKW[j] ){ j++; }\n");
+  printf("#endif\n");
+  printf("#ifdef SQLITE_EBCDIC\n");
+  printf("      while( j<n && toupper(z[j])==zKW[j] ){ j++; }\n");
+  printf("#endif\n");
   printf("      if( j<n ) continue;\n");
   for(i=0; i<nKeyword; i++){
     printf("      testcase( i==%d ); /* %s */\n",
commit	34dcee65442abd1f48f19ea855189f5b642da6dd	[log] [tgz]
author	drh <drh@noemail.net>	Mon Feb 08 19:15:48 2016 +0000
committer	drh <drh@noemail.net>	Mon Feb 08 19:15:48 2016 +0000
tree	c7af5c06a27140b43aa7844e34b67353a8f02e69
parent	41aab89b3123f9b7ffcb59d3e57528c88d2f50eb [diff]