add PIRegularExpression
This commit is contained in:
186
3rd/pcre2/maint/GenerateTest.py
Normal file
186
3rd/pcre2/maint/GenerateTest.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#! /usr/bin/env python3
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
#
|
||||
# This file auto-generates Unicode property tests and their expected output.
|
||||
# It is recommended to re-run this generator after the Unicode files are
|
||||
# updated. The names of the generated files are `testinput` and `testoutput`
|
||||
# and should be copied over to replace either test26 or test27 files.
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
from GenerateCommon import \
|
||||
script_names, \
|
||||
script_abbrevs
|
||||
|
||||
def write_both(text):
    """Write *text* to the module-level input_file and then to output_file."""
    for stream in (input_file, output_file):
        stream.write(text)
|
||||
|
||||
def to_string_char(ch_idx):
    """Return the text used for code point *ch_idx* in the generated test files.

    Printable ASCII (0x20-0x7e) is returned as the literal character. Every
    other code point is returned as a ``\\x{...}`` escape holding its lowercase
    hexadecimal value, zero-padded to at least two digits (``\\x{05}``,
    ``\\x{7f}``, ``\\x{1f600}``).

    NOTE(review): a backslash (0x5c) is still returned literally; confirm that
    no generated subject line can consist of that character.
    """
    # DEL (0x7f) is a control character, so it is escaped like the controls
    # below 0x20 instead of being written raw into the generated files.
    if 0x20 <= ch_idx < 0x7f:
        return chr(ch_idx)
    # %02x pads values below 0x10 to two digits and leaves longer values as-is.
    return "\\x{%02x}" % ch_idx
|
||||
|
||||
# Create both generated files up front; give up if either cannot be created.
try:
    input_file, output_file = [
        open(file_name, "w") for file_name in ("testinput", "testoutput")
    ]
except IOError:
    print("** Couldn't create output files")
    sys.exit(1)
|
||||
|
||||
# Banner placed at the top of both generated files.
write_both(
    "# These tests were generated by maint/GenerateTest.py using PCRE2's UCP\n"
    "# data, do not edit unless that data has changed and they are reflecting\n"
    "# a previous version.\n\n"
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# UNICODE SCRIPT EXTENSION TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _write_property_test(prop, ch_idx, matches=True):
    """Emit one test: the pattern ``/^\\p{prop}/utf`` with subject *ch_idx*.

    The pattern line, the subject line and the trailing blank line go to both
    generated files. The expected result goes to the output file only: the
    matched character when *matches* is true, otherwise ``No match``.
    """
    subject = to_string_char(ch_idx)
    write_both("/^\\p{%s}/utf\n" % prop)
    write_both(" %s\n" % subject)
    if matches:
        output_file.write(" 0: %s\n" % subject)
    else:
        output_file.write("No match\n")
    write_both("\n")


def gen_script_tests():
    """Generate the Script / Script_Extensions property tests.

    Reads Unicode.tables/Scripts.txt and Unicode.tables/ScriptExtensions.txt
    and builds one record per entry of script_names:

        [base_low, base_high, ext_low, ext_high, ext_only_char]

    base_low is the start of the first range listed for the script and
    base_high the end of the last one; ext_low/ext_high are the smallest and
    largest code points listed for the script in ScriptExtensions.txt;
    ext_only_char is the first extension code point found whose base script
    is a different one. Fields that were never seen stay None.

    The tests derived from these records are written through write_both()
    and output_file. The process exits with status 1 when the version header
    of Scripts.txt cannot be parsed or a script has no Scripts.txt record.
    """
    script_data = [None] * len(script_names)
    # Base script name of every code point (None where Scripts.txt lists none).
    char_data = [None] * 0x110000

    property_re = re.compile(r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+[A-Za-z]) +#")
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        version_pat = r"^# Scripts-(\d+\.\d+\.\d+)\.txt$"
        v = re.match(version_pat, f.readline())
        if v is None:
            # Without this check a bad header surfaced as an AttributeError.
            print("** Couldn't find the Unicode version in Unicode.tables/Scripts.txt")
            sys.exit(1)
        unicode_version = v.group(1)

        write_both("# Unicode Script Extension tests for version " + unicode_version + "\n\n")
        write_both("#perltest\n\n")

        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            name = match_obj.group(3)
            # Consecutive lines usually share a script; skip the list search then.
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low = int(match_obj.group(1), 16)
            high = low
            char_data[low] = name

            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)
                for ch in range(low + 1, high + 1):
                    char_data[ch] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            # Overwritten on every line, so the last listed range wins.
            script_data[script_idx][1] = high

    # Cache of abbreviation -> index for the scripts already seen below.
    ext_script_indices = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for abbrev in match_obj.group(3).split(" "):
                first_seen = abbrev not in ext_script_indices
                if first_seen:
                    ext_script_indices[abbrev] = script_abbrevs.index(abbrev)
                idx = ext_script_indices[abbrev]

                rec = script_data[idx]
                if rec is None:
                    print("** No Scripts.txt entry found for %s" % script_names[idx])
                    sys.exit(1)

                if first_seen:
                    rec[2] = low
                    rec[3] = high
                else:
                    if rec[2] > low:
                        rec[2] = low
                    if rec[3] < high:
                        rec[3] = high

                if rec[4] is None:
                    name = script_names[idx]
                    # First code point of this range whose base script differs;
                    # stays None when the whole range belongs to the script.
                    rec[4] = next(
                        (ch for ch in range(low, high + 1) if char_data[ch] != name),
                        None)

    # Alternates between the short and long spelling of the property name.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]

        if script_name == "Unknown":
            continue

        if rec is None:
            print("** No Scripts.txt entry found for %s" % script_name)
            sys.exit(1)

        script_abbrev = script_abbrevs[idx]
        base_low, base_high, ext_low, ext_high, ext_only_char = rec

        write_both("# Base script check\n")
        _write_property_test("sc=%s" % script_name, base_low)
        _write_property_test("Script=%s" % script_abbrev, base_high)

        if ext_low is not None:
            property_name = "scx"
            if long_property_name:
                property_name = "Script_Extensions"

            write_both("# Script extension check\n")
            _write_property_test(script_name, ext_low)
            _write_property_test("%s=%s" % (property_name, script_abbrev), ext_high)

            long_property_name = not long_property_name

            if ext_only_char is not None:
                write_both("# Script extension only character\n")
                _write_property_test(script_name, ext_only_char)
                _write_property_test("sc=%s" % script_name, ext_only_char, matches=False)
            else:
                print("External character has not found for %s" % script_name)

        # One past the highest code point recorded for the script.
        high = base_high
        if ext_high is not None and ext_high > base_high:
            high = ext_high
        write_both("# Character not in script\n")
        _write_property_test(script_name, high + 1, matches=False)
|
||||
|
||||
gen_script_tests()

write_both("# End of test\n")

# Close both files explicitly so their contents are flushed at a defined
# point instead of whenever the interpreter finalizes the file objects.
input_file.close()
output_file.close()
|
||||
Reference in New Issue
Block a user