add PIRegularExpression
This commit is contained in:
1027
3rd/pcre2/maint/ucptest.c
Normal file
1027
3rd/pcre2/maint/ucptest.c
Normal file
@@ -0,0 +1,1027 @@
|
||||
/***************************************************
|
||||
* A program for testing the Unicode property table *
|
||||
***************************************************/
|
||||
|
||||
/* Copyright (c) University of Cambridge 2008-2023 */
|
||||
|
||||
/* Compile thus:
|
||||
|
||||
gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 \
|
||||
-fvisibility=hidden -o ucptest ucptest.c \
|
||||
../src/pcre2_ord2utf.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
|
||||
|
||||
Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
|
||||
support in pcre2test.
|
||||
*/
|
||||
|
||||
/* This is a hacked-up program for testing the Unicode properties tables of
|
||||
PCRE2. It can also be used for finding characters with certain properties. I
|
||||
wrote it to help with debugging, and have added things that I found useful, in
|
||||
a rather haphazard way. The code has never been seriously tidied or checked for
|
||||
robustness, but it shouldn't now give compiler warnings.
|
||||
|
||||
There is only one option: "-s". If given, it applies only to the "findprop"
|
||||
command. It causes the UTF-8 sequence of bytes that encode the character to be
|
||||
output between angle brackets at the end of the line. On a UTF-8 terminal, this
|
||||
will show the appropriate graphic for the code point.
|
||||
|
||||
If the command has arguments, they are concatenated into a buffer, separated by
|
||||
spaces. If the first argument starts "U+" or consists entirely of hexadecimal
|
||||
digits, "findprop" is inserted at the start. The buffer is then processed as a
|
||||
single line file, after which the program exits. If there are no arguments, the
|
||||
program reads commands line by line on stdin and writes output to stdout. The
|
||||
return code is always zero.
|
||||
|
||||
There are three commands:
|
||||
|
||||
The command "findprop" must be followed by a space-separated list of Unicode
|
||||
code points as hex numbers, either without any prefix or starting with "U+", or
|
||||
as individual UTF-8 characters preceded by '+'. For example:
|
||||
|
||||
findprop U+1234 5Abc +?
|
||||
|
||||
The output is one long line per character, listing Unicode properties that have
|
||||
values, followed by its other case or cases if one or more exist, followed by
|
||||
its Script Extension list if there is one. This list is in square brackets. A
|
||||
second list in square brackets gives all the Boolean properties of the
|
||||
character. The properties that come first are:
|
||||
|
||||
Bidi class e.g. NSM (most common is L)
|
||||
General type e.g. Letter
|
||||
Specific type e.g. Upper case letter
|
||||
Script e.g. Medefaidrin
|
||||
Grapheme break type e.g. Extend (most common is Other)
|
||||
|
||||
Script names and Boolean property names are all in lower case, with underscores
|
||||
and hyphens removed, because that's how they are stored for "loose" matching.
|
||||
|
||||
The command "find" must be followed by a list of property types and their
|
||||
values. The values are case-sensitive, except for bidi class. This finds
|
||||
characters that have those properties. If multiple properties are listed, they
|
||||
must all be matched. Currently supported:
|
||||
|
||||
script <name> The character must have this script property. Only one
|
||||
such script may be given.
|
||||
scriptx <name> This script must be in the character's Script Extension
|
||||
property list. If this is used many times, all the given
|
||||
scripts must be present.
|
||||
type <abbrev> The character's specific type (e.g. Lu or Nd) must match.
|
||||
gbreak <name> The grapheme break property must match.
|
||||
bidi <class> The character's bidi class must match.
|
||||
bool <name> The character's Boolean property list must contain this
|
||||
property.
|
||||
|
||||
If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
|
||||
Script Extensions and Boolean properties, there may be a mixture of positive
|
||||
and negative requirements. All must be satisfied.
|
||||
|
||||
Sequences of two or more characters are shown as ranges, for example
|
||||
U+0041..U+004A. No more than 100 lines are output. If there are more
|
||||
characters, the list ends with ...
|
||||
|
||||
The command "list" must be followed by one of property names script, bool,
|
||||
type, gbreak or bidi. The defined values for that property are listed. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "../src/config.h"
|
||||
#endif
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
#error "Unicode support not enabled"
|
||||
#endif
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "../src/pcre2_internal.h"
|
||||
#include "../src/pcre2_ucp.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
||||
#if defined(SUPPORT_LIBREADLINE)
|
||||
#include <readline/readline.h>
|
||||
#include <readline/history.h>
|
||||
#else
|
||||
#if defined(HAVE_EDITLINE_READLINE_H)
|
||||
#include <editline/readline.h>
|
||||
#else
|
||||
#include <readline/readline.h>
|
||||
#ifdef RL_VERSION_MAJOR
|
||||
#include <readline/history.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/* -------------------------------------------------------------------*/
|
||||
|
||||
#define CS (char *)
|
||||
#define CCS (const char *)
|
||||
#define CSS (char **)
|
||||
#define US (unsigned char *)
|
||||
#define CUS (const unsigned char *)
|
||||
|
||||
/* -------------------------------------------------------------------*/
|
||||
|
||||
static BOOL show_character = FALSE;
|
||||
|
||||
static const unsigned char *type_names[] = {
|
||||
US"Cc", US"Control",
|
||||
US"Cf", US"Format",
|
||||
US"Cn", US"Unassigned",
|
||||
US"Co", US"Private use",
|
||||
US"Cs", US"Surrogate",
|
||||
US"Ll", US"Lower case letter",
|
||||
US"Lm", US"Modifier letter",
|
||||
US"Lo", US"Other letter",
|
||||
US"Lt", US"Title case letter",
|
||||
US"Lu", US"Upper case letter",
|
||||
US"Mc", US"Spacing mark",
|
||||
US"Me", US"Enclosing mark",
|
||||
US"Mn", US"Non-spacing mark",
|
||||
US"Nd", US"Decimal number",
|
||||
US"Nl", US"Letter number",
|
||||
US"No", US"Other number",
|
||||
US"Pc", US"Connector punctuation",
|
||||
US"Pd", US"Dash punctuation",
|
||||
US"Pe", US"Close punctuation",
|
||||
US"Pf", US"Final punctuation",
|
||||
US"Pi", US"Initial punctuation",
|
||||
US"Po", US"Other punctuation",
|
||||
US"Ps", US"Open punctuation",
|
||||
US"Sc", US"Currency symbol",
|
||||
US"Sk", US"Modifier symbol",
|
||||
US"Sm", US"Mathematical symbol",
|
||||
US"So", US"Other symbol",
|
||||
US"Zl", US"Line separator",
|
||||
US"Zp", US"Paragraph separator",
|
||||
US"Zs", US"Space separator"
|
||||
};
|
||||
|
||||
static const unsigned char *gb_names[] = {
|
||||
US"CR", US"carriage return",
|
||||
US"LF", US"linefeed",
|
||||
US"Control", US"",
|
||||
US"Extend", US"",
|
||||
US"Prepend", US"",
|
||||
US"SpacingMark", US"",
|
||||
US"L", US"Hangul syllable type L",
|
||||
US"V", US"Hangul syllable type V",
|
||||
US"T", US"Hangul syllable type T",
|
||||
US"LV", US"Hangul syllable type LV",
|
||||
US"LVT", US"Hangul syllable type LVT",
|
||||
US"Regional_Indicator", US"",
|
||||
US"Other", US"",
|
||||
US"ZWJ", US"zero width joiner",
|
||||
US"Extended_Pictographic", US""
|
||||
};
|
||||
|
||||
static const unsigned char *bd_names[] = {
|
||||
US"AL", US"ArabicLetter",
|
||||
US"AN", US"ArabicNumber",
|
||||
US"B", US"ParagraphSeparator",
|
||||
US"BN", US"BoundaryNeutral",
|
||||
US"CS", US"CommonSeparator",
|
||||
US"EN", US"EuropeanNumber",
|
||||
US"ES", US"EuropeanSeparator",
|
||||
US"ET", US"EuropeanTerminator",
|
||||
US"FSI", US"FirstStrongIsolate",
|
||||
US"L", US"LeftToRight",
|
||||
US"LRE", US"LeftToRightEmbedding",
|
||||
US"LRI", US"LeftToRightIsolate",
|
||||
US"LRO", US"LeftToRightOverride",
|
||||
US"NSM", US"NonspacingMark",
|
||||
US"ON", US"OtherNeutral",
|
||||
US"PDF", US"PopDirectionalFormat",
|
||||
US"PDI", US"PopDirectionalIsolate",
|
||||
US"R", US"RightToLeft",
|
||||
US"RLE", US"RightToLeftEmbedding",
|
||||
US"RLI", US"RightToLeftIsolate",
|
||||
US"RLO", US"RightToLeftOverride",
|
||||
US"S", US"SegmentSeparator",
|
||||
US"WS", US"WhiteSpace"
|
||||
};
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Test for interaction *
|
||||
*************************************************/
|
||||
|
||||
static BOOL
|
||||
is_stdin_tty(void)
|
||||
{
|
||||
#if defined WIN32
|
||||
return _isatty(_fileno(stdin));
|
||||
#else
|
||||
return isatty(fileno(stdin));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get name from ucp ident *
|
||||
*************************************************/
|
||||
|
||||
/* The utt table contains both full names and abbreviations. So search for both
|
||||
and use the longer if two are found, unless the first one is only 3 characters
|
||||
and we are looking for a script (some scripts have 3-character names). If this
|
||||
were not just a test program it might be worth making some kind of reverse
|
||||
index. */
|
||||
|
||||
static const char *
|
||||
get_propname(int prop, int type)
|
||||
{
|
||||
size_t i, j, len;
|
||||
size_t foundlist[2];
|
||||
const char *yield;
|
||||
int typex = (type == PT_SC)? PT_SCX : type;
|
||||
|
||||
j = 0;
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if ((u->type == type || u->type == typex) && u->value == prop)
|
||||
{
|
||||
foundlist[j++] = i;
|
||||
if (j >= 2) break;
|
||||
}
|
||||
}
|
||||
|
||||
if (j == 0) return "??";
|
||||
|
||||
yield = NULL;
|
||||
len = 0;
|
||||
|
||||
for (i = 0; i < j; i++)
|
||||
{
|
||||
const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
|
||||
size_t sl = strlen(s);
|
||||
|
||||
if (sl > len)
|
||||
{
|
||||
yield = s;
|
||||
if (sl == 3 && type == PT_SC) break;
|
||||
len = sl;
|
||||
}
|
||||
}
|
||||
|
||||
return yield;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print Unicode property info for a char *
|
||||
*************************************************/
|
||||
|
||||
static void
|
||||
print_prop(unsigned int c, BOOL is_just_one)
|
||||
{
|
||||
unsigned int type = UCD_CATEGORY(c);
|
||||
int fulltype = UCD_CHARTYPE(c);
|
||||
int script = UCD_SCRIPT(c);
|
||||
int scriptx = UCD_SCRIPTX(c);
|
||||
int gbprop = UCD_GRAPHBREAK(c);
|
||||
int bidi = UCD_BIDICLASS(c);
|
||||
unsigned int othercase = UCD_OTHERCASE(c);
|
||||
int caseset = UCD_CASESET(c);
|
||||
int bprops = UCD_BPROPS(c);
|
||||
|
||||
const unsigned char *fulltypename = US"??";
|
||||
const unsigned char *typename = US"??";
|
||||
const unsigned char *graphbreak = US"??";
|
||||
const unsigned char *bidiclass = US"??";
|
||||
const unsigned char *scriptname = CUS get_propname(script, PT_SC);
|
||||
|
||||
switch (type)
|
||||
{
|
||||
case ucp_C: typename = US"Control"; break;
|
||||
case ucp_L: typename = US"Letter"; break;
|
||||
case ucp_M: typename = US"Mark"; break;
|
||||
case ucp_N: typename = US"Number"; break;
|
||||
case ucp_P: typename = US"Punctuation"; break;
|
||||
case ucp_S: typename = US"Symbol"; break;
|
||||
case ucp_Z: typename = US"Separator"; break;
|
||||
}
|
||||
|
||||
switch (fulltype)
|
||||
{
|
||||
case ucp_Cc: fulltypename = US"Control"; break;
|
||||
case ucp_Cf: fulltypename = US"Format"; break;
|
||||
case ucp_Cn: fulltypename = US"Unassigned"; break;
|
||||
case ucp_Co: fulltypename = US"Private use"; break;
|
||||
case ucp_Cs: fulltypename = US"Surrogate"; break;
|
||||
case ucp_Ll: fulltypename = US"Lower case letter"; break;
|
||||
case ucp_Lm: fulltypename = US"Modifier letter"; break;
|
||||
case ucp_Lo: fulltypename = US"Other letter"; break;
|
||||
case ucp_Lt: fulltypename = US"Title case letter"; break;
|
||||
case ucp_Lu: fulltypename = US"Upper case letter"; break;
|
||||
case ucp_Mc: fulltypename = US"Spacing mark"; break;
|
||||
case ucp_Me: fulltypename = US"Enclosing mark"; break;
|
||||
case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
|
||||
case ucp_Nd: fulltypename = US"Decimal number"; break;
|
||||
case ucp_Nl: fulltypename = US"Letter number"; break;
|
||||
case ucp_No: fulltypename = US"Other number"; break;
|
||||
case ucp_Pc: fulltypename = US"Connector punctuation"; break;
|
||||
case ucp_Pd: fulltypename = US"Dash punctuation"; break;
|
||||
case ucp_Pe: fulltypename = US"Close punctuation"; break;
|
||||
case ucp_Pf: fulltypename = US"Final punctuation"; break;
|
||||
case ucp_Pi: fulltypename = US"Initial punctuation"; break;
|
||||
case ucp_Po: fulltypename = US"Other punctuation"; break;
|
||||
case ucp_Ps: fulltypename = US"Open punctuation"; break;
|
||||
case ucp_Sc: fulltypename = US"Currency symbol"; break;
|
||||
case ucp_Sk: fulltypename = US"Modifier symbol"; break;
|
||||
case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
|
||||
case ucp_So: fulltypename = US"Other symbol"; break;
|
||||
case ucp_Zl: fulltypename = US"Line separator"; break;
|
||||
case ucp_Zp: fulltypename = US"Paragraph separator"; break;
|
||||
case ucp_Zs: fulltypename = US"Space separator"; break;
|
||||
}
|
||||
|
||||
switch(gbprop)
|
||||
{
|
||||
case ucp_gbCR: graphbreak = US"CR"; break;
|
||||
case ucp_gbLF: graphbreak = US"LF"; break;
|
||||
case ucp_gbControl: graphbreak = US"Control"; break;
|
||||
case ucp_gbExtend: graphbreak = US"Extend"; break;
|
||||
case ucp_gbPrepend: graphbreak = US"Prepend"; break;
|
||||
case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break;
|
||||
case ucp_gbL: graphbreak = US"Hangul syllable type L"; break;
|
||||
case ucp_gbV: graphbreak = US"Hangul syllable type V"; break;
|
||||
case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
|
||||
case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
|
||||
case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
|
||||
case ucp_gbRegional_Indicator:
|
||||
graphbreak = US"Regional Indicator"; break;
|
||||
case ucp_gbOther: graphbreak = US"Other"; break;
|
||||
case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break;
|
||||
case ucp_gbExtended_Pictographic:
|
||||
graphbreak = US"Extended Pictographic"; break;
|
||||
default: graphbreak = US"Unknown"; break;
|
||||
}
|
||||
|
||||
switch(bidi)
|
||||
{
|
||||
case ucp_bidiAL: bidiclass = US"AL "; break;
|
||||
case ucp_bidiFSI: bidiclass = US"FSI"; break;
|
||||
case ucp_bidiL: bidiclass = US"L "; break;
|
||||
case ucp_bidiLRE: bidiclass = US"LRE"; break;
|
||||
case ucp_bidiLRI: bidiclass = US"LRI"; break;
|
||||
case ucp_bidiLRO: bidiclass = US"LRO"; break;
|
||||
case ucp_bidiPDF: bidiclass = US"PDF"; break;
|
||||
case ucp_bidiPDI: bidiclass = US"PDI"; break;
|
||||
case ucp_bidiR: bidiclass = US"R "; break;
|
||||
case ucp_bidiRLE: bidiclass = US"RLE"; break;
|
||||
case ucp_bidiRLI: bidiclass = US"RLI"; break;
|
||||
case ucp_bidiRLO: bidiclass = US"RLO"; break;
|
||||
case ucp_bidiAN: bidiclass = US"AN "; break;
|
||||
case ucp_bidiB: bidiclass = US"B "; break;
|
||||
case ucp_bidiBN: bidiclass = US"BN "; break;
|
||||
case ucp_bidiCS: bidiclass = US"CS "; break;
|
||||
case ucp_bidiEN: bidiclass = US"EN "; break;
|
||||
case ucp_bidiES: bidiclass = US"ES "; break;
|
||||
case ucp_bidiET: bidiclass = US"ET "; break;
|
||||
case ucp_bidiNSM: bidiclass = US"NSM"; break;
|
||||
case ucp_bidiON: bidiclass = US"ON "; break;
|
||||
case ucp_bidiS: bidiclass = US"S "; break;
|
||||
case ucp_bidiWS: bidiclass = US"WS "; break;
|
||||
default: bidiclass = US"???"; break;
|
||||
}
|
||||
|
||||
printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
|
||||
scriptname, graphbreak);
|
||||
|
||||
if (is_just_one && (othercase != c || caseset != 0))
|
||||
{
|
||||
if (othercase != c) printf(", U+%04X", othercase);
|
||||
if (caseset != 0)
|
||||
{
|
||||
const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
|
||||
while (*(++p) < NOTACHAR)
|
||||
{
|
||||
unsigned int d = *p;
|
||||
if (d != othercase && d != c) printf(", U+%04X", d);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scriptx != 0)
|
||||
{
|
||||
const char *sep = "";
|
||||
const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
|
||||
printf(", [");
|
||||
for (int i = 0; i < ucp_Unknown; i++)
|
||||
if (MAPBIT(p, i) != 0)
|
||||
{
|
||||
printf("%s%s", sep, get_propname(i, PT_SC));
|
||||
sep = ", ";
|
||||
}
|
||||
printf("]");
|
||||
}
|
||||
|
||||
if (bprops != 0)
|
||||
{
|
||||
const char *sep = "";
|
||||
const uint32_t *p = PRIV(ucd_boolprop_sets) + bprops;
|
||||
printf(", [");
|
||||
for (int i = 0; i < ucp_Bprop_Count; i++)
|
||||
if (MAPBIT(p, i) != 0)
|
||||
{
|
||||
printf("%s%s", sep, get_propname(i, PT_BOOL));
|
||||
sep = ", ";
|
||||
}
|
||||
printf("]");
|
||||
}
|
||||
|
||||
if (show_character && is_just_one)
|
||||
{
|
||||
unsigned char buffer[8];
|
||||
int len = (int)PRIV(ord2utf_8)(c, buffer);
|
||||
printf(", >%.*s<", len, buffer);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find character(s) with given property/ies *
|
||||
*************************************************/
|
||||
|
||||
static void
|
||||
find_chars(unsigned char *s)
|
||||
{
|
||||
unsigned char name[128];
|
||||
unsigned char value[128];
|
||||
unsigned char *t;
|
||||
unsigned int count= 0;
|
||||
int scriptx_list[128];
|
||||
unsigned int scriptx_count = 0;
|
||||
int bprop_list[128];
|
||||
unsigned int bprop_count = 0;
|
||||
uint32_t i, c;
|
||||
int script = -1;
|
||||
int type = -1;
|
||||
int gbreak = -1;
|
||||
int bidiclass = -1;
|
||||
BOOL script_not = FALSE;
|
||||
BOOL type_not = FALSE;
|
||||
BOOL gbreak_not = FALSE;
|
||||
BOOL bidiclass_not = FALSE;
|
||||
BOOL hadrange = FALSE;
|
||||
const ucd_record *ucd, *next_ucd;
|
||||
const char *pad = " ";
|
||||
|
||||
while (*s != 0)
|
||||
{
|
||||
unsigned int offset = 0;
|
||||
|
||||
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
for (t = value; *s != 0 && !isspace(*s); s++)
|
||||
{
|
||||
if (*s != '_' && *s != '-') *t++ = *s;
|
||||
}
|
||||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
if (strcmp(CS name, "script") == 0 ||
|
||||
strcmp(CS name, "scriptx") == 0)
|
||||
{
|
||||
BOOL x = (name[6] == 'x');
|
||||
BOOL scriptx_not = FALSE;
|
||||
for (t = value; *t != 0; t++) *t = tolower(*t);
|
||||
|
||||
if (value[0] == '!')
|
||||
{
|
||||
if (x) scriptx_not = TRUE; else script_not = TRUE;
|
||||
offset = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
|
||||
PRIV(utt_names) + u->name_offset) == 0)
|
||||
{
|
||||
c = u->value;
|
||||
if (x && !scriptx_not && u->type == PT_SC)
|
||||
{
|
||||
if (script < 0)
|
||||
{
|
||||
x = FALSE;
|
||||
script = -1;
|
||||
script_not = scriptx_not;
|
||||
}
|
||||
else if (!script_not)
|
||||
{
|
||||
printf("No characters found\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (x)
|
||||
{
|
||||
scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (script < 0) script = c; else
|
||||
{
|
||||
printf("** Only 1 script value allowed\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i >= PRIV(utt_size))
|
||||
{
|
||||
printf("** Unrecognized script name \"%s\"\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "bool") == 0)
|
||||
{
|
||||
int not = 1;
|
||||
if (value[0] == '!')
|
||||
{
|
||||
not = -1;
|
||||
offset = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
{
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
if (u->type == PT_BOOL && strcmp(CS(value + offset),
|
||||
PRIV(utt_names) + u->name_offset) == 0)
|
||||
{
|
||||
bprop_list[bprop_count++] = u->value * not;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i >= PRIV(utt_size))
|
||||
{
|
||||
printf("** Unrecognized property name \"%s\"\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "type") == 0)
|
||||
{
|
||||
if (type >= 0)
|
||||
{
|
||||
printf("** Only 1 type value allowed\n");
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (value[0] == '!')
|
||||
{
|
||||
type_not = TRUE;
|
||||
offset = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
|
||||
{
|
||||
if (strcmp(CS (value + offset), CCS type_names[i]) == 0)
|
||||
{
|
||||
type = i/2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i >= sizeof(type_names)/sizeof(char *))
|
||||
{
|
||||
printf("** Unrecognized type name \"%s\"\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "gbreak") == 0)
|
||||
{
|
||||
if (gbreak >= 0)
|
||||
{
|
||||
printf("** Only 1 grapheme break value allowed\n");
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (value[0] == '!')
|
||||
{
|
||||
gbreak_not = TRUE;
|
||||
offset = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
|
||||
{
|
||||
if (strcmp(CS (value + offset), CCS gb_names[i]) == 0)
|
||||
{
|
||||
gbreak = i/2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i >= sizeof(gb_names)/sizeof(char *))
|
||||
{
|
||||
printf("** Unrecognized gbreak name \"%s\"\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "bidi") == 0 ||
|
||||
strcmp(CS name, "bidiclass") == 0 ||
|
||||
strcmp(CS name, "bidi_class") == 0 )
|
||||
{
|
||||
if (bidiclass >= 0)
|
||||
{
|
||||
printf("** Only 1 bidi class value allowed\n");
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (value[0] == '!')
|
||||
{
|
||||
bidiclass_not = TRUE;
|
||||
offset = 1;
|
||||
}
|
||||
for (i = 0; i < sizeof(bd_names)/sizeof(char *); i++)
|
||||
{
|
||||
if (strcasecmp(CS (value + offset), CCS bd_names[i]) == 0)
|
||||
{
|
||||
bidiclass = i/2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i >= sizeof(bd_names)/sizeof(char *))
|
||||
{
|
||||
printf("** Unrecognized bidi class name \"%s\"\n", value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
printf("** Unrecognized property name \"%s\"\n", name);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
|
||||
gbreak < 0 && bidiclass < 0)
|
||||
{
|
||||
printf("** No properties specified\n");
|
||||
return;
|
||||
}
|
||||
|
||||
for (c = 0; c <= 0x10ffff; c++)
|
||||
{
|
||||
if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
|
||||
|
||||
if (scriptx_count > 0)
|
||||
{
|
||||
const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
|
||||
unsigned int found = 0;
|
||||
|
||||
for (i = 0; i < scriptx_count; i++)
|
||||
{
|
||||
int x = scriptx_list[i]/32;
|
||||
int y = scriptx_list[i]%32;
|
||||
|
||||
/* Positive requirment */
|
||||
if (scriptx_list[i] >= 0)
|
||||
{
|
||||
if (scriptx_list[i] == UCD_SCRIPT(c) ||
|
||||
((scriptx_list[i] < ucp_Unknown) &&
|
||||
(bits_scriptx[x] & (1u<<y)) != 0)) found++;
|
||||
}
|
||||
/* Negative requirement */
|
||||
else
|
||||
{
|
||||
if ((-(scriptx_list[i]) < ucp_Unknown) &&
|
||||
(bits_scriptx[x] & (1u<<y)) == 0) found++;
|
||||
}
|
||||
}
|
||||
|
||||
if (found != scriptx_count) continue;
|
||||
}
|
||||
|
||||
if (bprop_count > 0)
|
||||
{
|
||||
const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) + UCD_BPROPS(c);
|
||||
unsigned int found = 0;
|
||||
|
||||
for (i = 0; i < bprop_count; i++)
|
||||
{
|
||||
int x = bprop_list[i]/32;
|
||||
int y = bprop_list[i]%32;
|
||||
|
||||
/* Positive requirement */
|
||||
if (bprop_list[i] >= 0)
|
||||
{
|
||||
if ((bits_bprop[x] & (1u<<y)) != 0) found++;
|
||||
}
|
||||
/* Negative requirement */
|
||||
else
|
||||
{
|
||||
if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++;
|
||||
}
|
||||
}
|
||||
|
||||
if (found != bprop_count) continue;
|
||||
}
|
||||
|
||||
if (type >= 0)
|
||||
{
|
||||
if (type_not)
|
||||
{
|
||||
if (type == UCD_CHARTYPE(c)) continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (type != UCD_CHARTYPE(c)) continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (gbreak >= 0)
|
||||
{
|
||||
if (gbreak_not)
|
||||
{
|
||||
if (gbreak == UCD_GRAPHBREAK(c)) continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (gbreak != UCD_GRAPHBREAK(c)) continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (bidiclass >= 0)
|
||||
{
|
||||
if (bidiclass_not)
|
||||
{
|
||||
if (bidiclass == UCD_BIDICLASS(c)) continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (bidiclass != UCD_BIDICLASS(c)) continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* All conditions are met. Look for runs. */
|
||||
|
||||
ucd = GET_UCD(c);
|
||||
|
||||
for (i = c + 1; i < 0x10ffff; i++)
|
||||
{
|
||||
next_ucd = GET_UCD(i);
|
||||
if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
|
||||
}
|
||||
|
||||
if (--i > c)
|
||||
{
|
||||
printf("U+%04X..", c);
|
||||
c = i;
|
||||
hadrange = TRUE;
|
||||
}
|
||||
else if (hadrange) printf("%s", pad);
|
||||
|
||||
print_prop(c, FALSE);
|
||||
if (c >= 0x100000) pad = " ";
|
||||
else if (c >= 0x10000) pad = " ";
|
||||
count++;
|
||||
if (count >= 100)
|
||||
{
|
||||
printf("...\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (count == 0) printf("No characters found\n");
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Process command line *
|
||||
*************************************************/
|
||||
|
||||
static void
|
||||
process_command_line(unsigned char *buffer)
|
||||
{
|
||||
unsigned char *s, *t;
|
||||
unsigned char name[24];
|
||||
|
||||
s = buffer;
|
||||
while (isspace(*s)) s++;
|
||||
if (*s == 0) return;
|
||||
|
||||
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
if (strcmp(CS name, "findprop") == 0)
|
||||
{
|
||||
while (*s != 0)
|
||||
{
|
||||
unsigned int c;
|
||||
unsigned char *endptr;
|
||||
t = s;
|
||||
|
||||
if (*t == '+')
|
||||
{
|
||||
c = *(++t);
|
||||
if (c > 0x7fu)
|
||||
{
|
||||
GETCHARINC(c, t);
|
||||
endptr = t;
|
||||
}
|
||||
else endptr = t+1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (memcmp(t, "U+", 2) == 0) t += 2;
|
||||
c = (uint32_t)strtoul(CS t, CSS(&endptr), 16);
|
||||
}
|
||||
|
||||
if (*endptr != 0 && !isspace(*endptr))
|
||||
{
|
||||
while (*endptr != 0 && !isspace(*endptr)) endptr++;
|
||||
printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (c > 0x10ffff)
|
||||
printf("** U+%x is too big for a Unicode code point\n", c);
|
||||
else
|
||||
print_prop(c, TRUE);
|
||||
}
|
||||
s = endptr;
|
||||
while (isspace(*s)) s++;
|
||||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "find") == 0)
|
||||
{
|
||||
find_chars(s);
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "list") == 0)
|
||||
{
|
||||
while (*s != 0)
|
||||
{
|
||||
size_t i;
|
||||
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
|
||||
{
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
|
||||
printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "bool") == 0)
|
||||
{
|
||||
for (i = 0; i < PRIV(utt_size); i++)
|
||||
if (PRIV(utt)[i].type == PT_BOOL)
|
||||
printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
|
||||
{
|
||||
for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
|
||||
printf("%s %s\n", type_names[i], type_names[i+1]);
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
|
||||
{
|
||||
for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
|
||||
{
|
||||
if (gb_names[i+1][0] != 0)
|
||||
printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
|
||||
else
|
||||
printf("%s\n", gb_names[i]);
|
||||
}
|
||||
}
|
||||
|
||||
else if (strcmp(CS name, "bidi") == 0 ||
|
||||
strcmp(CS name, "bidiclasses") == 0)
|
||||
{
|
||||
for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
|
||||
printf("%3s %s\n", bd_names[i], bd_names[i+1]);
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
printf("** Unknown property \"%s\"\n", name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else printf("** Unknown test command \"%s\"\n", name);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Main program *
|
||||
*************************************************/
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
BOOL interactive;
|
||||
int first_arg = 1;
|
||||
unsigned char buffer[1024];
|
||||
|
||||
if (argc > 1 && strcmp(argv[1], "-s") == 0)
|
||||
{
|
||||
show_character = TRUE;
|
||||
first_arg++;
|
||||
}
|
||||
|
||||
if (argc > first_arg)
|
||||
{
|
||||
int i;
|
||||
BOOL datafirst = TRUE;
|
||||
char *arg = argv[first_arg];
|
||||
unsigned char *s = buffer;
|
||||
|
||||
if (*arg != '+' && memcmp(arg, "U+", 2) != 0 && !isdigit(*arg))
|
||||
{
|
||||
while (*arg != 0)
|
||||
{
|
||||
if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
|
||||
}
|
||||
}
|
||||
|
||||
if (datafirst)
|
||||
{
|
||||
strcpy(CS s, "findprop ");
|
||||
s += 9;
|
||||
}
|
||||
|
||||
for (i = first_arg; i < argc; i++)
|
||||
{
|
||||
s += sprintf(CS s, "%s ", argv[i]);
|
||||
}
|
||||
|
||||
process_command_line(buffer);
|
||||
return 0;
|
||||
}
|
||||
|
||||
interactive = is_stdin_tty();
|
||||
|
||||
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
||||
if (interactive) using_history();
|
||||
#endif
|
||||
|
||||
for(;;)
|
||||
{
|
||||
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
||||
if (interactive)
|
||||
{
|
||||
size_t len;
|
||||
unsigned char *s = US readline("> ");
|
||||
if (s == NULL) break;
|
||||
len = strlen(CS s);
|
||||
if (len > 0) add_history(CS s);
|
||||
memcpy(buffer, s, len);
|
||||
buffer[len] = '\n';
|
||||
buffer[len+1] = 0;
|
||||
free(s);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
|
||||
{
|
||||
if (interactive) printf("> ");
|
||||
if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
|
||||
if (!interactive) printf("%s", buffer);
|
||||
}
|
||||
|
||||
process_command_line(buffer);
|
||||
}
|
||||
|
||||
if (interactive) printf("\n");
|
||||
|
||||
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
|
||||
if (interactive) clear_history();
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* End */
|
||||
Reference in New Issue
Block a user